Diffstat (limited to 'lib')
-rw-r--r--  lib/Kconfig.debug               |  16
-rw-r--r--  lib/clz_ctz.c                   |   8
-rw-r--r--  lib/crypto/arm64/sha256-ce.S    | 284
-rw-r--r--  lib/crypto/arm64/sha256.h       |  37
-rw-r--r--  lib/crypto/sha256.c             |  71
-rw-r--r--  lib/crypto/tests/sha256_kunit.c | 184
-rw-r--r--  lib/crypto/x86/sha256-ni-asm.S  | 368
-rw-r--r--  lib/crypto/x86/sha256.h         |  39
-rw-r--r--  lib/raid6/recov_rvv.c           |   2
-rw-r--r--  lib/raid6/rvv.c                 |  63
-rw-r--r--  lib/tests/Makefile              |   1
-rw-r--r--  lib/tests/ffs_kunit.c           | 566
12 files changed, 1588 insertions, 51 deletions
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index dc0e0c6ed075..24939b8553e6 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2479,6 +2479,20 @@ config STRING_HELPERS_KUNIT_TEST
depends on KUNIT
default KUNIT_ALL_TESTS
+config FFS_KUNIT_TEST
+ tristate "KUnit test ffs-family functions at runtime" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS
+ help
+ This builds KUnit tests for ffs-family bit manipulation functions
+ including ffs(), __ffs(), fls(), __fls(), fls64(), and __ffs64().
+
+ These tests validate mathematical correctness, edge case handling,
+ and cross-architecture consistency of bit scanning functions.
+
+ For more information on KUnit and unit tests in general,
+ please refer to Documentation/dev-tools/kunit/.
+
config TEST_KSTRTOX
tristate "Test kstrto*() family of functions at runtime"
@@ -2894,7 +2908,7 @@ config FORTIFY_KUNIT_TEST
config LONGEST_SYM_KUNIT_TEST
tristate "Test the longest symbol possible" if !KUNIT_ALL_TESTS
depends on KUNIT && KPROBES
- depends on !PREFIX_SYMBOLS && !CFI_CLANG && !GCOV_KERNEL
+ depends on !PREFIX_SYMBOLS && !CFI && !GCOV_KERNEL
default KUNIT_ALL_TESTS
help
Tests the longest symbol possible
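
For context on the FFS_KUNIT_TEST entry added above, the conventions the new test validates can be summarized in a short sketch (illustrative only, not part of the patch; the helpers come from <linux/bitops.h>):

    #include <linux/bug.h>
    #include <linux/bitops.h>

    static void ffs_family_conventions(void)
    {
            unsigned long x = 0x90; /* binary 1001 0000: bits 4 and 7 set */

            WARN_ON(ffs(x) != 5);           /* lowest set bit, 1-based  */
            WARN_ON(__ffs(x) != 4);         /* lowest set bit, 0-based  */
            WARN_ON(fls(x) != 8);           /* highest set bit, 1-based */
            WARN_ON(__fls(x) != 7);         /* highest set bit, 0-based */

            /* Only the 1-based variants define the zero case. */
            WARN_ON(ffs(0) != 0);
            WARN_ON(fls(0) != 0);
    }
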
diff --git a/lib/clz_ctz.c b/lib/clz_ctz.c
index fb8c0c5c2bd2..8778ec44bf63 100644
--- a/lib/clz_ctz.c
+++ b/lib/clz_ctz.c
@@ -15,28 +15,28 @@
#include <linux/kernel.h>
int __weak __ctzsi2(int val);
-int __weak __ctzsi2(int val)
+int __weak __attribute_const__ __ctzsi2(int val)
{
return __ffs(val);
}
EXPORT_SYMBOL(__ctzsi2);
int __weak __clzsi2(int val);
-int __weak __clzsi2(int val)
+int __weak __attribute_const__ __clzsi2(int val)
{
return 32 - fls(val);
}
EXPORT_SYMBOL(__clzsi2);
int __weak __clzdi2(u64 val);
-int __weak __clzdi2(u64 val)
+int __weak __attribute_const__ __clzdi2(u64 val)
{
return 64 - fls64(val);
}
EXPORT_SYMBOL(__clzdi2);
int __weak __ctzdi2(u64 val);
-int __weak __ctzdi2(u64 val)
+int __weak __attribute_const__ __ctzdi2(u64 val)
{
return __ffs64(val);
}
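
The identities these weak helpers rely on, shown as a minimal sketch (illustrative only): count-trailing-zeros is __ffs()/__ffs64(), and count-leading-zeros is the word width minus fls()/fls64().

    #include <linux/bug.h>
    #include <linux/bitops.h>

    static void clz_ctz_identities(void)
    {
            /* 0x10 has only bit 4 set: 4 trailing zeros, 32 - 5 = 27 leading zeros. */
            WARN_ON(__ffs(0x10) != 4);              /* __ctzsi2(0x10) */
            WARN_ON(32 - fls(0x10) != 27);          /* __clzsi2(0x10) */

            /* The same pattern holds for the 64-bit variants. */
            WARN_ON(__ffs64(1ULL << 40) != 40);     /* __ctzdi2 */
            WARN_ON(64 - fls64(1ULL << 40) != 23);  /* __clzdi2: 64 - 41 */
    }
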
diff --git a/lib/crypto/arm64/sha256-ce.S b/lib/crypto/arm64/sha256-ce.S
index b99d9589c421..410174ba5237 100644
--- a/lib/crypto/arm64/sha256-ce.S
+++ b/lib/crypto/arm64/sha256-ce.S
@@ -70,18 +70,22 @@
.word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+ .macro load_round_constants tmp
+ adr_l \tmp, .Lsha2_rcon
+ ld1 { v0.4s- v3.4s}, [\tmp], #64
+ ld1 { v4.4s- v7.4s}, [\tmp], #64
+ ld1 { v8.4s-v11.4s}, [\tmp], #64
+ ld1 {v12.4s-v15.4s}, [\tmp]
+ .endm
+
/*
* size_t __sha256_ce_transform(struct sha256_block_state *state,
* const u8 *data, size_t nblocks);
*/
.text
SYM_FUNC_START(__sha256_ce_transform)
- /* load round constants */
- adr_l x8, .Lsha2_rcon
- ld1 { v0.4s- v3.4s}, [x8], #64
- ld1 { v4.4s- v7.4s}, [x8], #64
- ld1 { v8.4s-v11.4s}, [x8], #64
- ld1 {v12.4s-v15.4s}, [x8]
+
+ load_round_constants x8
/* load state */
ld1 {dgav.4s, dgbv.4s}, [x0]
@@ -134,3 +138,271 @@ CPU_LE( rev32 v19.16b, v19.16b )
mov x0, x2
ret
SYM_FUNC_END(__sha256_ce_transform)
+
+ .unreq dga
+ .unreq dgav
+ .unreq dgb
+ .unreq dgbv
+ .unreq t0
+ .unreq t1
+ .unreq dg0q
+ .unreq dg0v
+ .unreq dg1q
+ .unreq dg1v
+ .unreq dg2q
+ .unreq dg2v
+
+ // parameters for sha256_ce_finup2x()
+ ctx .req x0
+ data1 .req x1
+ data2 .req x2
+ len .req w3
+ out1 .req x4
+ out2 .req x5
+
+ // other scalar variables
+ count .req x6
+ final_step .req w7
+
+ // x8-x9 are used as temporaries.
+
+ // v0-v15 are used to cache the SHA-256 round constants.
+ // v16-v19 are used for the message schedule for the first message.
+ // v20-v23 are used for the message schedule for the second message.
+ // v24-v31 are used for the state and temporaries as given below.
+ // *_a are for the first message and *_b for the second.
+ state0_a_q .req q24
+ state0_a .req v24
+ state1_a_q .req q25
+ state1_a .req v25
+ state0_b_q .req q26
+ state0_b .req v26
+ state1_b_q .req q27
+ state1_b .req v27
+ t0_a .req v28
+ t0_b .req v29
+ t1_a_q .req q30
+ t1_a .req v30
+ t1_b_q .req q31
+ t1_b .req v31
+
+#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount)
+#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf)
+// offsetof(struct __sha256_ctx, state) is assumed to be 0.
+
+ // Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a
+ // and m0_b contain the current 4 message schedule words for the first
+ // and second message respectively.
+ //
+ // If not all the message schedule words have been computed yet, then
+ // this also computes 4 more message schedule words for each message.
+ // m1_a-m3_a contain the next 3 groups of 4 message schedule words for
+ // the first message, and likewise m1_b-m3_b for the second. After
+ // consuming the current value of m0_a, this macro computes the group
+ // after m3_a and writes it to m0_a, and likewise for *_b. This means
+ // that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a,
+ // m3_a, m0_a), and likewise for *_b, so the caller must cycle through
+ // the registers accordingly.
+ .macro do_4rounds_2x i, k, m0_a, m1_a, m2_a, m3_a, \
+ m0_b, m1_b, m2_b, m3_b
+ add t0_a\().4s, \m0_a\().4s, \k\().4s
+ add t0_b\().4s, \m0_b\().4s, \k\().4s
+ .if \i < 48
+ sha256su0 \m0_a\().4s, \m1_a\().4s
+ sha256su0 \m0_b\().4s, \m1_b\().4s
+ sha256su1 \m0_a\().4s, \m2_a\().4s, \m3_a\().4s
+ sha256su1 \m0_b\().4s, \m2_b\().4s, \m3_b\().4s
+ .endif
+ mov t1_a.16b, state0_a.16b
+ mov t1_b.16b, state0_b.16b
+ sha256h state0_a_q, state1_a_q, t0_a\().4s
+ sha256h state0_b_q, state1_b_q, t0_b\().4s
+ sha256h2 state1_a_q, t1_a_q, t0_a\().4s
+ sha256h2 state1_b_q, t1_b_q, t0_b\().4s
+ .endm
+
+ .macro do_16rounds_2x i, k0, k1, k2, k3
+ do_4rounds_2x \i + 0, \k0, v16, v17, v18, v19, v20, v21, v22, v23
+ do_4rounds_2x \i + 4, \k1, v17, v18, v19, v16, v21, v22, v23, v20
+ do_4rounds_2x \i + 8, \k2, v18, v19, v16, v17, v22, v23, v20, v21
+ do_4rounds_2x \i + 12, \k3, v19, v16, v17, v18, v23, v20, v21, v22
+ .endm
+
+//
+// void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
+// const u8 *data1, const u8 *data2, int len,
+// u8 out1[SHA256_DIGEST_SIZE],
+// u8 out2[SHA256_DIGEST_SIZE]);
+//
+// This function computes the SHA-256 digests of two messages |data1| and
+// |data2| that are both |len| bytes long, starting from the initial context
+// |ctx|. |len| must be at least SHA256_BLOCK_SIZE.
+//
+// The instructions for the two SHA-256 operations are interleaved. On many
+// CPUs, this is almost twice as fast as hashing each message individually due
+// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
+//
+SYM_FUNC_START(sha256_ce_finup2x)
+ sub sp, sp, #128
+ mov final_step, #0
+ load_round_constants x8
+
+ // Load the initial state from ctx->state.
+ ld1 {state0_a.4s-state1_a.4s}, [ctx]
+
+ // Load ctx->bytecount. Take the mod 64 of it to get the number of
+ // bytes that are buffered in ctx->buf. Also save it in a register with
+ // len added to it.
+ ldr x8, [ctx, #OFFSETOF_BYTECOUNT]
+ add count, x8, len, sxtw
+ and x8, x8, #63
+ cbz x8, .Lfinup2x_enter_loop // No bytes buffered?
+
+ // x8 bytes (1 to 63) are currently buffered in ctx->buf. Load them
+ // followed by the first 64 - x8 bytes of data. Since len >= 64, we
+ // just load 64 bytes from each of ctx->buf, data1, and data2
+ // unconditionally and rearrange the data as needed.
+ add x9, ctx, #OFFSETOF_BUF
+ ld1 {v16.16b-v19.16b}, [x9]
+ st1 {v16.16b-v19.16b}, [sp]
+
+ ld1 {v16.16b-v19.16b}, [data1], #64
+ add x9, sp, x8
+ st1 {v16.16b-v19.16b}, [x9]
+ ld1 {v16.4s-v19.4s}, [sp]
+
+ ld1 {v20.16b-v23.16b}, [data2], #64
+ st1 {v20.16b-v23.16b}, [x9]
+ ld1 {v20.4s-v23.4s}, [sp]
+
+ sub len, len, #64
+ sub data1, data1, x8
+ sub data2, data2, x8
+ add len, len, w8
+ mov state0_b.16b, state0_a.16b
+ mov state1_b.16b, state1_a.16b
+ b .Lfinup2x_loop_have_data
+
+.Lfinup2x_enter_loop:
+ sub len, len, #64
+ mov state0_b.16b, state0_a.16b
+ mov state1_b.16b, state1_a.16b
+.Lfinup2x_loop:
+ // Load the next two data blocks.
+ ld1 {v16.4s-v19.4s}, [data1], #64
+ ld1 {v20.4s-v23.4s}, [data2], #64
+.Lfinup2x_loop_have_data:
+ // Convert the words of the data blocks from big endian.
+CPU_LE( rev32 v16.16b, v16.16b )
+CPU_LE( rev32 v17.16b, v17.16b )
+CPU_LE( rev32 v18.16b, v18.16b )
+CPU_LE( rev32 v19.16b, v19.16b )
+CPU_LE( rev32 v20.16b, v20.16b )
+CPU_LE( rev32 v21.16b, v21.16b )
+CPU_LE( rev32 v22.16b, v22.16b )
+CPU_LE( rev32 v23.16b, v23.16b )
+.Lfinup2x_loop_have_bswapped_data:
+
+ // Save the original state for each block.
+ st1 {state0_a.4s-state1_b.4s}, [sp]
+
+ // Do the SHA-256 rounds on each block.
+ do_16rounds_2x 0, v0, v1, v2, v3
+ do_16rounds_2x 16, v4, v5, v6, v7
+ do_16rounds_2x 32, v8, v9, v10, v11
+ do_16rounds_2x 48, v12, v13, v14, v15
+
+ // Add the original state for each block.
+ ld1 {v16.4s-v19.4s}, [sp]
+ add state0_a.4s, state0_a.4s, v16.4s
+ add state1_a.4s, state1_a.4s, v17.4s
+ add state0_b.4s, state0_b.4s, v18.4s
+ add state1_b.4s, state1_b.4s, v19.4s
+
+ // Update len and loop back if more blocks remain.
+ sub len, len, #64
+ tbz len, #31, .Lfinup2x_loop // len >= 0?
+
+ // Check if any final blocks need to be handled.
+ // final_step = 2: all done
+ // final_step = 1: need to do count-only padding block
+ // final_step = 0: need to do the block with 0x80 padding byte
+ tbnz final_step, #1, .Lfinup2x_done
+ tbnz final_step, #0, .Lfinup2x_finalize_countonly
+ add len, len, #64
+ cbz len, .Lfinup2x_finalize_blockaligned
+
+ // Not block-aligned; 1 <= len <= 63 data bytes remain. Pad the block.
+ // To do this, write the padding starting with the 0x80 byte to
+ // &sp[64]. Then for each message, copy the last 64 data bytes to sp
+ // and load from &sp[64 - len] to get the needed padding block. This
+ // code relies on the data buffers being >= 64 bytes in length.
+ sub w8, len, #64 // w8 = len - 64
+ add data1, data1, w8, sxtw // data1 += len - 64
+ add data2, data2, w8, sxtw // data2 += len - 64
+CPU_LE( mov x9, #0x80 )
+CPU_LE( fmov d16, x9 )
+CPU_BE( movi v16.16b, #0 )
+CPU_BE( mov x9, #0x8000000000000000 )
+CPU_BE( mov v16.d[1], x9 )
+ movi v17.16b, #0
+ stp q16, q17, [sp, #64]
+ stp q17, q17, [sp, #96]
+ sub x9, sp, w8, sxtw // x9 = &sp[64 - len]
+ cmp len, #56
+ b.ge 1f // will count spill into its own block?
+ lsl count, count, #3
+CPU_LE( rev count, count )
+ str count, [x9, #56]
+ mov final_step, #2 // won't need count-only block
+ b 2f
+1:
+ mov final_step, #1 // will need count-only block
+2:
+ ld1 {v16.16b-v19.16b}, [data1]
+ st1 {v16.16b-v19.16b}, [sp]
+ ld1 {v16.4s-v19.4s}, [x9]
+ ld1 {v20.16b-v23.16b}, [data2]
+ st1 {v20.16b-v23.16b}, [sp]
+ ld1 {v20.4s-v23.4s}, [x9]
+ b .Lfinup2x_loop_have_data
+
+ // Prepare a padding block, either:
+ //
+ // {0x80, 0, 0, 0, ..., count (as __be64)}
+ // This is for a block aligned message.
+ //
+ // { 0, 0, 0, 0, ..., count (as __be64)}
+ // This is for a message whose length mod 64 is >= 56.
+ //
+ // Pre-swap the endianness of the words.
+.Lfinup2x_finalize_countonly:
+ movi v16.2d, #0
+ b 1f
+.Lfinup2x_finalize_blockaligned:
+ mov x8, #0x80000000
+ fmov d16, x8
+1:
+ movi v17.2d, #0
+ movi v18.2d, #0
+ ror count, count, #29 // ror(lsl(count, 3), 32)
+ mov v19.d[0], xzr
+ mov v19.d[1], count
+ mov v20.16b, v16.16b
+ movi v21.2d, #0
+ movi v22.2d, #0
+ mov v23.16b, v19.16b
+ mov final_step, #2
+ b .Lfinup2x_loop_have_bswapped_data
+
+.Lfinup2x_done:
+ // Write the two digests with all bytes in the correct order.
+CPU_LE( rev32 state0_a.16b, state0_a.16b )
+CPU_LE( rev32 state1_a.16b, state1_a.16b )
+CPU_LE( rev32 state0_b.16b, state0_b.16b )
+CPU_LE( rev32 state1_b.16b, state1_b.16b )
+ st1 {state0_a.4s-state1_a.4s}, [out1]
+ st1 {state0_b.4s-state1_b.4s}, [out2]
+ add sp, sp, #128
+ ret
+SYM_FUNC_END(sha256_ce_finup2x)
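
For reference, the padding that the .Lfinup2x finalization paths above assemble is the standard SHA-256 scheme. Below is a hedged C sketch of the single-block case, assuming bytecount % 64 <= 55 so the 0x80 byte and the length both fit; otherwise, as the comments above note, a count-only block follows. It is illustrative only, not how the assembly builds the block in registers.

    #include <linux/string.h>
    #include <linux/types.h>
    #include <asm/byteorder.h>

    static void sha256_pad_last_block(u8 block[64], size_t used, u64 total_bytes)
    {
            __be64 bitcount = cpu_to_be64(total_bytes << 3);

            block[used] = 0x80;                     /* mandatory 1 bit, then zeros */
            memset(&block[used + 1], 0, 64 - 8 - (used + 1));
            memcpy(&block[56], &bitcount, sizeof(bitcount)); /* length in bits, big endian */
    }
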
diff --git a/lib/crypto/arm64/sha256.h b/lib/crypto/arm64/sha256.h
index be4aeda9d0e6..80d06df27d3a 100644
--- a/lib/crypto/arm64/sha256.h
+++ b/lib/crypto/arm64/sha256.h
@@ -44,6 +44,43 @@ static void sha256_blocks(struct sha256_block_state *state,
}
}
+static_assert(offsetof(struct __sha256_ctx, state) == 0);
+static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
+static_assert(offsetof(struct __sha256_ctx, buf) == 40);
+asmlinkage void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, int len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE]);
+
+#define sha256_finup_2x_arch sha256_finup_2x_arch
+static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, size_t len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE])
+{
+ /*
+ * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX.
+ * Further limit len to 65536 to avoid spending too long with preemption
+ * disabled. (Of course, in practice len is nearly always 4096 anyway.)
+ */
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+ static_branch_likely(&have_ce) && len >= SHA256_BLOCK_SIZE &&
+ len <= 65536 && likely(may_use_simd())) {
+ kernel_neon_begin();
+ sha256_ce_finup2x(ctx, data1, data2, len, out1, out2);
+ kernel_neon_end();
+ kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
+ kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
+ return true;
+ }
+ return false;
+}
+
+static bool sha256_finup_2x_is_optimized_arch(void)
+{
+ return static_key_enabled(&have_ce);
+}
+
#ifdef CONFIG_KERNEL_MODE_NEON
#define sha256_mod_init_arch sha256_mod_init_arch
static void sha256_mod_init_arch(void)
diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c
index 8fa15165d23e..881b935418ce 100644
--- a/lib/crypto/sha256.c
+++ b/lib/crypto/sha256.c
@@ -25,13 +25,20 @@ static const struct sha256_block_state sha224_iv = {
},
};
-static const struct sha256_block_state sha256_iv = {
- .h = {
- SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
- SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7,
+static const struct sha256_ctx initial_sha256_ctx = {
+ .ctx = {
+ .state = {
+ .h = {
+ SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
+ SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7,
+ },
+ },
+ .bytecount = 0,
},
};
+#define sha256_iv (initial_sha256_ctx.ctx.state)
+
static const u32 sha256_K[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@@ -261,8 +268,62 @@ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE])
}
EXPORT_SYMBOL(sha256);
-/* pre-boot environment (as indicated by __DISABLE_EXPORTS) doesn't need HMAC */
+/*
+ * Pre-boot environment (as indicated by __DISABLE_EXPORTS being defined)
+ * doesn't need either HMAC support or interleaved hashing support
+ */
#ifndef __DISABLE_EXPORTS
+
+#ifndef sha256_finup_2x_arch
+static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, size_t len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE])
+{
+ return false;
+}
+static bool sha256_finup_2x_is_optimized_arch(void)
+{
+ return false;
+}
+#endif
+
+/* Sequential fallback implementation of sha256_finup_2x() */
+static noinline_for_stack void sha256_finup_2x_sequential(
+ const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2,
+ size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE])
+{
+ struct __sha256_ctx mut_ctx;
+
+ mut_ctx = *ctx;
+ __sha256_update(&mut_ctx, data1, len);
+ __sha256_final(&mut_ctx, out1, SHA256_DIGEST_SIZE);
+
+ mut_ctx = *ctx;
+ __sha256_update(&mut_ctx, data2, len);
+ __sha256_final(&mut_ctx, out2, SHA256_DIGEST_SIZE);
+}
+
+void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1,
+ const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE])
+{
+ if (ctx == NULL)
+ ctx = &initial_sha256_ctx;
+
+ if (likely(sha256_finup_2x_arch(&ctx->ctx, data1, data2, len, out1,
+ out2)))
+ return;
+ sha256_finup_2x_sequential(&ctx->ctx, data1, data2, len, out1, out2);
+}
+EXPORT_SYMBOL_GPL(sha256_finup_2x);
+
+bool sha256_finup_2x_is_optimized(void)
+{
+ return sha256_finup_2x_is_optimized_arch();
+}
+EXPORT_SYMBOL_GPL(sha256_finup_2x_is_optimized);
+
static void __hmac_sha256_preparekey(struct sha256_block_state *istate,
struct sha256_block_state *ostate,
const u8 *raw_key, size_t raw_key_len,
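
A usage sketch of the new sha256_finup_2x() API (the caller and buffer names here are hypothetical, chosen only to illustrate the calling convention established by this patch):

    #include <crypto/sha2.h>

    static void hash_two_blocks(const u8 *salt, size_t salt_len,
                                const u8 *blk_a, const u8 *blk_b, size_t len)
    {
            struct sha256_ctx ctx;
            u8 digest_a[SHA256_DIGEST_SIZE];
            u8 digest_b[SHA256_DIGEST_SIZE];

            /* Hash a common salted prefix once; both messages share it. */
            sha256_init(&ctx);
            sha256_update(&ctx, salt, salt_len);

            /* Finish both equal-length messages; ctx is not modified. */
            sha256_finup_2x(&ctx, blk_a, blk_b, len, digest_a, digest_b);

            /* Without a prefix, ctx may be NULL to start from the standard IV. */
            sha256_finup_2x(NULL, blk_a, blk_b, len, digest_a, digest_b);
    }
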
diff --git a/lib/crypto/tests/sha256_kunit.c b/lib/crypto/tests/sha256_kunit.c
index 1cd4caee6010..dcedfca06df6 100644
--- a/lib/crypto/tests/sha256_kunit.c
+++ b/lib/crypto/tests/sha256_kunit.c
@@ -5,6 +5,7 @@
#include <crypto/sha2.h>
#include "sha256-testvecs.h"
+/* Generate the HASH_KUNIT_CASES using hash-test-template.h. */
#define HASH sha256
#define HASH_CTX sha256_ctx
#define HASH_SIZE SHA256_DIGEST_SIZE
@@ -21,9 +22,192 @@
#define HMAC_USINGRAWKEY hmac_sha256_usingrawkey
#include "hash-test-template.h"
+static void free_guarded_buf(void *buf)
+{
+ vfree(buf);
+}
+
+/*
+ * Allocate a KUnit-managed buffer that has length @len bytes immediately
+ * followed by an unmapped page, and assert that the allocation succeeds.
+ */
+static void *alloc_guarded_buf(struct kunit *test, size_t len)
+{
+ size_t full_len = round_up(len, PAGE_SIZE);
+ void *buf = vmalloc(full_len);
+
+ KUNIT_ASSERT_NOT_NULL(test, buf);
+ KUNIT_ASSERT_EQ(test, 0,
+ kunit_add_action_or_reset(test, free_guarded_buf, buf));
+ return buf + full_len - len;
+}
+
+/*
+ * Test for sha256_finup_2x(). Specifically, choose various data lengths and
+ * salt lengths, and for each one, verify that sha256_finup_2x() produces the
+ * same results as sha256_update() and sha256_final().
+ *
+ * Use guarded buffers for all inputs and outputs to reliably detect any
+ * out-of-bounds reads or writes, even if they occur in assembly code.
+ */
+static void test_sha256_finup_2x(struct kunit *test)
+{
+ const size_t max_data_len = 16384;
+ u8 *data1_buf, *data2_buf, *hash1, *hash2;
+ u8 expected_hash1[SHA256_DIGEST_SIZE];
+ u8 expected_hash2[SHA256_DIGEST_SIZE];
+ u8 salt[SHA256_BLOCK_SIZE];
+ struct sha256_ctx *ctx;
+
+ data1_buf = alloc_guarded_buf(test, max_data_len);
+ data2_buf = alloc_guarded_buf(test, max_data_len);
+ hash1 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE);
+ hash2 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE);
+ ctx = alloc_guarded_buf(test, sizeof(*ctx));
+
+ rand_bytes(data1_buf, max_data_len);
+ rand_bytes(data2_buf, max_data_len);
+ rand_bytes(salt, sizeof(salt));
+
+ for (size_t i = 0; i < 500; i++) {
+ size_t salt_len = rand_length(sizeof(salt));
+ size_t data_len = rand_length(max_data_len);
+ const u8 *data1 = data1_buf + max_data_len - data_len;
+ const u8 *data2 = data2_buf + max_data_len - data_len;
+ struct sha256_ctx orig_ctx;
+
+ sha256_init(ctx);
+ sha256_update(ctx, salt, salt_len);
+ orig_ctx = *ctx;
+
+ sha256_finup_2x(ctx, data1, data2, data_len, hash1, hash2);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, ctx, &orig_ctx, sizeof(*ctx),
+ "sha256_finup_2x() modified its ctx argument");
+
+ sha256_update(ctx, data1, data_len);
+ sha256_final(ctx, expected_hash1);
+ sha256_update(&orig_ctx, data2, data_len);
+ sha256_final(&orig_ctx, expected_hash2);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, hash1, expected_hash1, SHA256_DIGEST_SIZE,
+ "Wrong hash1 with salt_len=%zu data_len=%zu", salt_len,
+ data_len);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, hash2, expected_hash2, SHA256_DIGEST_SIZE,
+ "Wrong hash2 with salt_len=%zu data_len=%zu", salt_len,
+ data_len);
+ }
+}
+
+/* Test sha256_finup_2x() with ctx == NULL */
+static void test_sha256_finup_2x_defaultctx(struct kunit *test)
+{
+ const size_t data_len = 128;
+ struct sha256_ctx ctx;
+ u8 hash1_a[SHA256_DIGEST_SIZE];
+ u8 hash2_a[SHA256_DIGEST_SIZE];
+ u8 hash1_b[SHA256_DIGEST_SIZE];
+ u8 hash2_b[SHA256_DIGEST_SIZE];
+
+ rand_bytes(test_buf, 2 * data_len);
+
+ sha256_init(&ctx);
+ sha256_finup_2x(&ctx, test_buf, &test_buf[data_len], data_len, hash1_a,
+ hash2_a);
+
+ sha256_finup_2x(NULL, test_buf, &test_buf[data_len], data_len, hash1_b,
+ hash2_b);
+
+ KUNIT_ASSERT_MEMEQ(test, hash1_a, hash1_b, SHA256_DIGEST_SIZE);
+ KUNIT_ASSERT_MEMEQ(test, hash2_a, hash2_b, SHA256_DIGEST_SIZE);
+}
+
+/*
+ * Test that sha256_finup_2x() and sha256_update/final() produce consistent
+ * results with total message lengths that require more than 32 bits.
+ */
+static void test_sha256_finup_2x_hugelen(struct kunit *test)
+{
+ const size_t data_len = 4 * SHA256_BLOCK_SIZE;
+ struct sha256_ctx ctx = {};
+ u8 expected_hash[SHA256_DIGEST_SIZE];
+ u8 hash[SHA256_DIGEST_SIZE];
+
+ rand_bytes(test_buf, data_len);
+ for (size_t align = 0; align < SHA256_BLOCK_SIZE; align++) {
+ sha256_init(&ctx);
+ ctx.ctx.bytecount = 0x123456789abcd00 + align;
+
+ sha256_finup_2x(&ctx, test_buf, test_buf, data_len, hash, hash);
+
+ sha256_update(&ctx, test_buf, data_len);
+ sha256_final(&ctx, expected_hash);
+
+ KUNIT_ASSERT_MEMEQ(test, hash, expected_hash,
+ SHA256_DIGEST_SIZE);
+ }
+}
+
+/* Benchmark for sha256_finup_2x() */
+static void benchmark_sha256_finup_2x(struct kunit *test)
+{
+ /*
+ * Try a few different salt lengths, since sha256_finup_2x() performance
+ * may vary slightly for the same data_len depending on how many bytes
+ * were already processed in the initial context.
+ */
+ static const size_t salt_lens_to_test[] = { 0, 32, 64 };
+ const size_t data_len = 4096;
+ const size_t num_iters = 4096;
+ struct sha256_ctx ctx;
+ u8 hash1[SHA256_DIGEST_SIZE];
+ u8 hash2[SHA256_DIGEST_SIZE];
+
+ if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK))
+ kunit_skip(test, "not enabled");
+ if (!sha256_finup_2x_is_optimized())
+ kunit_skip(test, "not relevant");
+
+ rand_bytes(test_buf, data_len * 2);
+
+ /* Warm-up */
+ for (size_t i = 0; i < num_iters; i++)
+ sha256_finup_2x(NULL, &test_buf[0], &test_buf[data_len],
+ data_len, hash1, hash2);
+
+ for (size_t i = 0; i < ARRAY_SIZE(salt_lens_to_test); i++) {
+ size_t salt_len = salt_lens_to_test[i];
+ u64 t0, t1;
+
+ /*
+ * Prepare the initial context. The time to process the salt is
+ * not measured; we're just interested in sha256_finup_2x().
+ */
+ sha256_init(&ctx);
+ sha256_update(&ctx, test_buf, salt_len);
+
+ preempt_disable();
+ t0 = ktime_get_ns();
+ for (size_t j = 0; j < num_iters; j++)
+ sha256_finup_2x(&ctx, &test_buf[0], &test_buf[data_len],
+ data_len, hash1, hash2);
+ t1 = ktime_get_ns();
+ preempt_enable();
+ kunit_info(test, "data_len=%zu salt_len=%zu: %llu MB/s",
+ data_len, salt_len,
+ div64_u64((u64)data_len * 2 * num_iters * 1000,
+ t1 - t0 ?: 1));
+ }
+}
+
static struct kunit_case hash_test_cases[] = {
HASH_KUNIT_CASES,
+ KUNIT_CASE(test_sha256_finup_2x),
+ KUNIT_CASE(test_sha256_finup_2x_defaultctx),
+ KUNIT_CASE(test_sha256_finup_2x_hugelen),
KUNIT_CASE(benchmark_hash),
+ KUNIT_CASE(benchmark_sha256_finup_2x),
{},
};
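
A note on the figure benchmark_sha256_finup_2x() prints: multiplying the bytes hashed by 1000 and dividing by the elapsed nanoseconds yields MB/s with 1 MB = 10^6 bytes. As a worked example, each measured loop hashes 2 * 4096 * 4096 = 33,554,432 bytes; if that takes 10 ms (10^7 ns), the reported rate is 33,554,432 * 1000 / 10^7 ≈ 3355 MB/s.
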
diff --git a/lib/crypto/x86/sha256-ni-asm.S b/lib/crypto/x86/sha256-ni-asm.S
index 4bd9490ffc66..de5f707e7ef7 100644
--- a/lib/crypto/x86/sha256-ni-asm.S
+++ b/lib/crypto/x86/sha256-ni-asm.S
@@ -165,6 +165,374 @@ SYM_FUNC_START(sha256_ni_transform)
RET
SYM_FUNC_END(sha256_ni_transform)
+#undef DIGEST_PTR
+#undef DATA_PTR
+#undef NUM_BLKS
+#undef SHA256CONSTANTS
+#undef MSG
+#undef STATE0
+#undef STATE1
+#undef MSG0
+#undef MSG1
+#undef MSG2
+#undef MSG3
+#undef TMP
+#undef SHUF_MASK
+#undef ABEF_SAVE
+#undef CDGH_SAVE
+
+// parameters for sha256_ni_finup2x()
+#define CTX %rdi
+#define DATA1 %rsi
+#define DATA2 %rdx
+#define LEN %ecx
+#define LEN8 %cl
+#define LEN64 %rcx
+#define OUT1 %r8
+#define OUT2 %r9
+
+// other scalar variables
+#define SHA256CONSTANTS %rax
+#define COUNT %r10
+#define COUNT32 %r10d
+#define FINAL_STEP %r11d
+
+// rbx is used as a temporary.
+
+#define MSG %xmm0 // sha256rnds2 implicit operand
+#define STATE0_A %xmm1
+#define STATE1_A %xmm2
+#define STATE0_B %xmm3
+#define STATE1_B %xmm4
+#define TMP_A %xmm5
+#define TMP_B %xmm6
+#define MSG0_A %xmm7
+#define MSG1_A %xmm8
+#define MSG2_A %xmm9
+#define MSG3_A %xmm10
+#define MSG0_B %xmm11
+#define MSG1_B %xmm12
+#define MSG2_B %xmm13
+#define MSG3_B %xmm14
+#define SHUF_MASK %xmm15
+
+#define OFFSETOF_STATE 0 // offsetof(struct __sha256_ctx, state)
+#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount)
+#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf)
+
+// Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a and m0_b
+// contain the current 4 message schedule words for the first and second message
+// respectively.
+//
+// If not all the message schedule words have been computed yet, then this also
+// computes 4 more message schedule words for each message. m1_a-m3_a contain
+// the next 3 groups of 4 message schedule words for the first message, and
+// likewise m1_b-m3_b for the second. After consuming the current value of
+// m0_a, this macro computes the group after m3_a and writes it to m0_a, and
+// likewise for *_b. This means that the next (m0_a, m1_a, m2_a, m3_a) is the
+// current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must
+// cycle through the registers accordingly.
+.macro do_4rounds_2x i, m0_a, m1_a, m2_a, m3_a, m0_b, m1_b, m2_b, m3_b
+ movdqa (\i-32)*4(SHA256CONSTANTS), TMP_A
+ movdqa TMP_A, TMP_B
+ paddd \m0_a, TMP_A
+ paddd \m0_b, TMP_B
+.if \i < 48
+ sha256msg1 \m1_a, \m0_a
+ sha256msg1 \m1_b, \m0_b
+.endif
+ movdqa TMP_A, MSG
+ sha256rnds2 STATE0_A, STATE1_A
+ movdqa TMP_B, MSG
+ sha256rnds2 STATE0_B, STATE1_B
+ pshufd $0x0E, TMP_A, MSG
+ sha256rnds2 STATE1_A, STATE0_A
+ pshufd $0x0E, TMP_B, MSG
+ sha256rnds2 STATE1_B, STATE0_B
+.if \i < 48
+ movdqa \m3_a, TMP_A
+ movdqa \m3_b, TMP_B
+ palignr $4, \m2_a, TMP_A
+ palignr $4, \m2_b, TMP_B
+ paddd TMP_A, \m0_a
+ paddd TMP_B, \m0_b
+ sha256msg2 \m3_a, \m0_a
+ sha256msg2 \m3_b, \m0_b
+.endif
+.endm
+
+//
+// void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
+// const u8 *data1, const u8 *data2, int len,
+// u8 out1[SHA256_DIGEST_SIZE],
+// u8 out2[SHA256_DIGEST_SIZE]);
+//
+// This function computes the SHA-256 digests of two messages |data1| and
+// |data2| that are both |len| bytes long, starting from the initial context
+// |ctx|. |len| must be at least SHA256_BLOCK_SIZE.
+//
+// The instructions for the two SHA-256 operations are interleaved. On many
+// CPUs, this is almost twice as fast as hashing each message individually due
+// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
+//
+SYM_FUNC_START(sha256_ni_finup2x)
+ // Allocate 128 bytes of stack space, 16-byte aligned.
+ push %rbx
+ push %rbp
+ mov %rsp, %rbp
+ sub $128, %rsp
+ and $~15, %rsp
+
+ // Load the shuffle mask for swapping the endianness of 32-bit words.
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+
+ // Set up pointer to the round constants.
+ lea K256+32*4(%rip), SHA256CONSTANTS
+
+ // Initially we're not processing the final blocks.
+ xor FINAL_STEP, FINAL_STEP
+
+ // Load the initial state from ctx->state.
+ movdqu OFFSETOF_STATE+0*16(CTX), STATE0_A // DCBA
+ movdqu OFFSETOF_STATE+1*16(CTX), STATE1_A // HGFE
+ movdqa STATE0_A, TMP_A
+ punpcklqdq STATE1_A, STATE0_A // FEBA
+ punpckhqdq TMP_A, STATE1_A // DCHG
+ pshufd $0x1B, STATE0_A, STATE0_A // ABEF
+ pshufd $0xB1, STATE1_A, STATE1_A // CDGH
+
+ // Load ctx->bytecount. Take the mod 64 of it to get the number of
+ // bytes that are buffered in ctx->buf. Also save it in a register with
+ // LEN added to it.
+ mov LEN, LEN
+ mov OFFSETOF_BYTECOUNT(CTX), %rbx
+ lea (%rbx, LEN64, 1), COUNT
+ and $63, %ebx
+ jz .Lfinup2x_enter_loop // No bytes buffered?
+
+ // %ebx bytes (1 to 63) are currently buffered in ctx->buf. Load them
+ // followed by the first 64 - %ebx bytes of data. Since LEN >= 64, we
+ // just load 64 bytes from each of ctx->buf, DATA1, and DATA2
+ // unconditionally and rearrange the data as needed.
+
+ movdqu OFFSETOF_BUF+0*16(CTX), MSG0_A
+ movdqu OFFSETOF_BUF+1*16(CTX), MSG1_A
+ movdqu OFFSETOF_BUF+2*16(CTX), MSG2_A
+ movdqu OFFSETOF_BUF+3*16(CTX), MSG3_A
+ movdqa MSG0_A, 0*16(%rsp)
+ movdqa MSG1_A, 1*16(%rsp)
+ movdqa MSG2_A, 2*16(%rsp)
+ movdqa MSG3_A, 3*16(%rsp)
+
+ movdqu 0*16(DATA1), MSG0_A
+ movdqu 1*16(DATA1), MSG1_A
+ movdqu 2*16(DATA1), MSG2_A
+ movdqu 3*16(DATA1), MSG3_A
+ movdqu MSG0_A, 0*16(%rsp,%rbx)
+ movdqu MSG1_A, 1*16(%rsp,%rbx)
+ movdqu MSG2_A, 2*16(%rsp,%rbx)
+ movdqu MSG3_A, 3*16(%rsp,%rbx)
+ movdqa 0*16(%rsp), MSG0_A
+ movdqa 1*16(%rsp), MSG1_A
+ movdqa 2*16(%rsp), MSG2_A
+ movdqa 3*16(%rsp), MSG3_A
+
+ movdqu 0*16(DATA2), MSG0_B
+ movdqu 1*16(DATA2), MSG1_B
+ movdqu 2*16(DATA2), MSG2_B
+ movdqu 3*16(DATA2), MSG3_B
+ movdqu MSG0_B, 0*16(%rsp,%rbx)
+ movdqu MSG1_B, 1*16(%rsp,%rbx)
+ movdqu MSG2_B, 2*16(%rsp,%rbx)
+ movdqu MSG3_B, 3*16(%rsp,%rbx)
+ movdqa 0*16(%rsp), MSG0_B
+ movdqa 1*16(%rsp), MSG1_B
+ movdqa 2*16(%rsp), MSG2_B
+ movdqa 3*16(%rsp), MSG3_B
+
+ sub $64, %rbx // rbx = buffered - 64
+ sub %rbx, DATA1 // DATA1 += 64 - buffered
+ sub %rbx, DATA2 // DATA2 += 64 - buffered
+ add %ebx, LEN // LEN += buffered - 64
+ movdqa STATE0_A, STATE0_B
+ movdqa STATE1_A, STATE1_B
+ jmp .Lfinup2x_loop_have_data
+
+.Lfinup2x_enter_loop:
+ sub $64, LEN
+ movdqa STATE0_A, STATE0_B
+ movdqa STATE1_A, STATE1_B
+.Lfinup2x_loop:
+ // Load the next two data blocks.
+ movdqu 0*16(DATA1), MSG0_A
+ movdqu 0*16(DATA2), MSG0_B
+ movdqu 1*16(DATA1), MSG1_A
+ movdqu 1*16(DATA2), MSG1_B
+ movdqu 2*16(DATA1), MSG2_A
+ movdqu 2*16(DATA2), MSG2_B
+ movdqu 3*16(DATA1), MSG3_A
+ movdqu 3*16(DATA2), MSG3_B
+ add $64, DATA1
+ add $64, DATA2
+.Lfinup2x_loop_have_data:
+ // Convert the words of the data blocks from big endian.
+ pshufb SHUF_MASK, MSG0_A
+ pshufb SHUF_MASK, MSG0_B
+ pshufb SHUF_MASK, MSG1_A
+ pshufb SHUF_MASK, MSG1_B
+ pshufb SHUF_MASK, MSG2_A
+ pshufb SHUF_MASK, MSG2_B
+ pshufb SHUF_MASK, MSG3_A
+ pshufb SHUF_MASK, MSG3_B
+.Lfinup2x_loop_have_bswapped_data:
+
+ // Save the original state for each block.
+ movdqa STATE0_A, 0*16(%rsp)
+ movdqa STATE0_B, 1*16(%rsp)
+ movdqa STATE1_A, 2*16(%rsp)
+ movdqa STATE1_B, 3*16(%rsp)
+
+ // Do the SHA-256 rounds on each block.
+.irp i, 0, 16, 32, 48
+ do_4rounds_2x (\i + 0), MSG0_A, MSG1_A, MSG2_A, MSG3_A, \
+ MSG0_B, MSG1_B, MSG2_B, MSG3_B
+ do_4rounds_2x (\i + 4), MSG1_A, MSG2_A, MSG3_A, MSG0_A, \
+ MSG1_B, MSG2_B, MSG3_B, MSG0_B
+ do_4rounds_2x (\i + 8), MSG2_A, MSG3_A, MSG0_A, MSG1_A, \
+ MSG2_B, MSG3_B, MSG0_B, MSG1_B
+ do_4rounds_2x (\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \
+ MSG3_B, MSG0_B, MSG1_B, MSG2_B
+.endr
+
+ // Add the original state for each block.
+ paddd 0*16(%rsp), STATE0_A
+ paddd 1*16(%rsp), STATE0_B
+ paddd 2*16(%rsp), STATE1_A
+ paddd 3*16(%rsp), STATE1_B
+
+ // Update LEN and loop back if more blocks remain.
+ sub $64, LEN
+ jge .Lfinup2x_loop
+
+ // Check if any final blocks need to be handled.
+ // FINAL_STEP = 2: all done
+ // FINAL_STEP = 1: need to do count-only padding block
+ // FINAL_STEP = 0: need to do the block with 0x80 padding byte
+ cmp $1, FINAL_STEP
+ jg .Lfinup2x_done
+ je .Lfinup2x_finalize_countonly
+ add $64, LEN
+ jz .Lfinup2x_finalize_blockaligned
+
+ // Not block-aligned; 1 <= LEN <= 63 data bytes remain. Pad the block.
+ // To do this, write the padding starting with the 0x80 byte to
+ // &sp[64]. Then for each message, copy the last 64 data bytes to sp
+ // and load from &sp[64 - LEN] to get the needed padding block. This
+ // code relies on the data buffers being >= 64 bytes in length.
+ mov $64, %ebx
+ sub LEN, %ebx // ebx = 64 - LEN
+ sub %rbx, DATA1 // DATA1 -= 64 - LEN
+ sub %rbx, DATA2 // DATA2 -= 64 - LEN
+ mov $0x80, FINAL_STEP // using FINAL_STEP as a temporary
+ movd FINAL_STEP, MSG0_A
+ pxor MSG1_A, MSG1_A
+ movdqa MSG0_A, 4*16(%rsp)
+ movdqa MSG1_A, 5*16(%rsp)
+ movdqa MSG1_A, 6*16(%rsp)
+ movdqa MSG1_A, 7*16(%rsp)
+ cmp $56, LEN
+ jge 1f // will COUNT spill into its own block?
+ shl $3, COUNT
+ bswap COUNT
+ mov COUNT, 56(%rsp,%rbx)
+ mov $2, FINAL_STEP // won't need count-only block
+ jmp 2f
+1:
+ mov $1, FINAL_STEP // will need count-only block
+2:
+ movdqu 0*16(DATA1), MSG0_A
+ movdqu 1*16(DATA1), MSG1_A
+ movdqu 2*16(DATA1), MSG2_A
+ movdqu 3*16(DATA1), MSG3_A
+ movdqa MSG0_A, 0*16(%rsp)
+ movdqa MSG1_A, 1*16(%rsp)
+ movdqa MSG2_A, 2*16(%rsp)
+ movdqa MSG3_A, 3*16(%rsp)
+ movdqu 0*16(%rsp,%rbx), MSG0_A
+ movdqu 1*16(%rsp,%rbx), MSG1_A
+ movdqu 2*16(%rsp,%rbx), MSG2_A
+ movdqu 3*16(%rsp,%rbx), MSG3_A
+
+ movdqu 0*16(DATA2), MSG0_B
+ movdqu 1*16(DATA2), MSG1_B
+ movdqu 2*16(DATA2), MSG2_B
+ movdqu 3*16(DATA2), MSG3_B
+ movdqa MSG0_B, 0*16(%rsp)
+ movdqa MSG1_B, 1*16(%rsp)
+ movdqa MSG2_B, 2*16(%rsp)
+ movdqa MSG3_B, 3*16(%rsp)
+ movdqu 0*16(%rsp,%rbx), MSG0_B
+ movdqu 1*16(%rsp,%rbx), MSG1_B
+ movdqu 2*16(%rsp,%rbx), MSG2_B
+ movdqu 3*16(%rsp,%rbx), MSG3_B
+ jmp .Lfinup2x_loop_have_data
+
+ // Prepare a padding block, either:
+ //
+ // {0x80, 0, 0, 0, ..., count (as __be64)}
+ // This is for a block aligned message.
+ //
+ // { 0, 0, 0, 0, ..., count (as __be64)}
+ // This is for a message whose length mod 64 is >= 56.
+ //
+ // Pre-swap the endianness of the words.
+.Lfinup2x_finalize_countonly:
+ pxor MSG0_A, MSG0_A
+ jmp 1f
+
+.Lfinup2x_finalize_blockaligned:
+ mov $0x80000000, %ebx
+ movd %ebx, MSG0_A
+1:
+ pxor MSG1_A, MSG1_A
+ pxor MSG2_A, MSG2_A
+ ror $29, COUNT
+ movq COUNT, MSG3_A
+ pslldq $8, MSG3_A
+ movdqa MSG0_A, MSG0_B
+ pxor MSG1_B, MSG1_B
+ pxor MSG2_B, MSG2_B
+ movdqa MSG3_A, MSG3_B
+ mov $2, FINAL_STEP
+ jmp .Lfinup2x_loop_have_bswapped_data
+
+.Lfinup2x_done:
+ // Write the two digests with all bytes in the correct order.
+ movdqa STATE0_A, TMP_A
+ movdqa STATE0_B, TMP_B
+ punpcklqdq STATE1_A, STATE0_A // GHEF
+ punpcklqdq STATE1_B, STATE0_B
+ punpckhqdq TMP_A, STATE1_A // ABCD
+ punpckhqdq TMP_B, STATE1_B
+ pshufd $0xB1, STATE0_A, STATE0_A // HGFE
+ pshufd $0xB1, STATE0_B, STATE0_B
+ pshufd $0x1B, STATE1_A, STATE1_A // DCBA
+ pshufd $0x1B, STATE1_B, STATE1_B
+ pshufb SHUF_MASK, STATE0_A
+ pshufb SHUF_MASK, STATE0_B
+ pshufb SHUF_MASK, STATE1_A
+ pshufb SHUF_MASK, STATE1_B
+ movdqu STATE0_A, 1*16(OUT1)
+ movdqu STATE0_B, 1*16(OUT2)
+ movdqu STATE1_A, 0*16(OUT1)
+ movdqu STATE1_B, 0*16(OUT2)
+
+ mov %rbp, %rsp
+ pop %rbp
+ pop %rbx
+ RET
+SYM_FUNC_END(sha256_ni_finup2x)
+
.section .rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
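
The DCBA/HGFE -> ABEF/CDGH shuffles commented in sha256_ni_finup2x exist because sha256rnds2 expects one register holding the A, B, E, F state words and the other holding C, D, G, H. A C sketch of the equivalent repacking (illustrative only; lane 0 is the least significant 32 bits, so the leftmost letter in the comments is lane 3):

    #include <linux/types.h>

    static void repack_state_for_sha_ni(const u32 h[8], u32 abef[4], u32 cdgh[4])
    {
            /* h[0..7] hold A,B,C,D,E,F,G,H in the usual SHA-256 order. */
            abef[0] = h[5];  /* F */
            abef[1] = h[4];  /* E */
            abef[2] = h[1];  /* B */
            abef[3] = h[0];  /* A */

            cdgh[0] = h[7];  /* H */
            cdgh[1] = h[6];  /* G */
            cdgh[2] = h[3];  /* D */
            cdgh[3] = h[2];  /* C */
    }
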
diff --git a/lib/crypto/x86/sha256.h b/lib/crypto/x86/sha256.h
index 41fa95fbc3bf..38e33b22a092 100644
--- a/lib/crypto/x86/sha256.h
+++ b/lib/crypto/x86/sha256.h
@@ -7,6 +7,8 @@
#include <asm/fpu/api.h>
#include <linux/static_call.h>
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha_ni);
+
DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic);
#define DEFINE_X86_SHA256_FN(c_fn, asm_fn) \
@@ -35,11 +37,48 @@ static void sha256_blocks(struct sha256_block_state *state,
static_call(sha256_blocks_x86)(state, data, nblocks);
}
+static_assert(offsetof(struct __sha256_ctx, state) == 0);
+static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
+static_assert(offsetof(struct __sha256_ctx, buf) == 40);
+asmlinkage void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, int len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE]);
+
+#define sha256_finup_2x_arch sha256_finup_2x_arch
+static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, size_t len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE])
+{
+ /*
+ * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX.
+ * Further limit len to 65536 to avoid spending too long with preemption
+ * disabled. (Of course, in practice len is nearly always 4096 anyway.)
+ */
+ if (static_branch_likely(&have_sha_ni) && len >= SHA256_BLOCK_SIZE &&
+ len <= 65536 && likely(irq_fpu_usable())) {
+ kernel_fpu_begin();
+ sha256_ni_finup2x(ctx, data1, data2, len, out1, out2);
+ kernel_fpu_end();
+ kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
+ kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
+ return true;
+ }
+ return false;
+}
+
+static bool sha256_finup_2x_is_optimized_arch(void)
+{
+ return static_key_enabled(&have_sha_ni);
+}
+
#define sha256_mod_init_arch sha256_mod_init_arch
static void sha256_mod_init_arch(void)
{
if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
static_call_update(sha256_blocks_x86, sha256_blocks_ni);
+ static_branch_enable(&have_sha_ni);
} else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
NULL) &&
boot_cpu_has(X86_FEATURE_AVX)) {
diff --git a/lib/raid6/recov_rvv.c b/lib/raid6/recov_rvv.c
index 5d54c4b437df..5f779719c3d3 100644
--- a/lib/raid6/recov_rvv.c
+++ b/lib/raid6/recov_rvv.c
@@ -4,9 +4,7 @@
* Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
*/
-#include <asm/simd.h>
#include <asm/vector.h>
-#include <crypto/internal/simd.h>
#include <linux/raid/pq.h>
static int rvv_has_vector(void)
diff --git a/lib/raid6/rvv.c b/lib/raid6/rvv.c
index 7d82efa5b14f..89da5fc247aa 100644
--- a/lib/raid6/rvv.c
+++ b/lib/raid6/rvv.c
@@ -9,11 +9,8 @@
* Copyright 2002-2004 H. Peter Anvin
*/
-#include <asm/simd.h>
#include <asm/vector.h>
-#include <crypto/internal/simd.h>
#include <linux/raid/pq.h>
-#include <linux/types.h>
#include "rvv.h"
#define NSIZE (riscv_v_vsize / 32) /* NSIZE = vlenb */
@@ -47,7 +44,7 @@ static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE])
@@ -120,7 +117,7 @@ static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE])
@@ -221,9 +218,9 @@ static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
"vle8.v v4, (%[wp1])\n"
- "vle8.v v5, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
@@ -313,9 +310,9 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
"vle8.v v4, (%[wp1])\n"
- "vle8.v v5, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
@@ -443,13 +440,13 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
"vle8.v v4, (%[wp1])\n"
- "vle8.v v5, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
"vle8.v v8, (%[wp2])\n"
- "vle8.v v9, (%[wp2])\n"
+ "vmv.v.v v9, v8\n"
"vle8.v v12, (%[wp3])\n"
- "vle8.v v13, (%[wp3])\n"
+ "vmv.v.v v13, v12\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
@@ -569,13 +566,13 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
"vle8.v v4, (%[wp1])\n"
- "vle8.v v5, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
"vle8.v v8, (%[wp2])\n"
- "vle8.v v9, (%[wp2])\n"
+ "vmv.v.v v9, v8\n"
"vle8.v v12, (%[wp3])\n"
- "vle8.v v13, (%[wp3])\n"
+ "vmv.v.v v13, v12\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
@@ -757,21 +754,21 @@ static void raid6_rvv8_gen_syndrome_real(int disks, unsigned long bytes, void **
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
"vle8.v v4, (%[wp1])\n"
- "vle8.v v5, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
"vle8.v v8, (%[wp2])\n"
- "vle8.v v9, (%[wp2])\n"
+ "vmv.v.v v9, v8\n"
"vle8.v v12, (%[wp3])\n"
- "vle8.v v13, (%[wp3])\n"
+ "vmv.v.v v13, v12\n"
"vle8.v v16, (%[wp4])\n"
- "vle8.v v17, (%[wp4])\n"
+ "vmv.v.v v17, v16\n"
"vle8.v v20, (%[wp5])\n"
- "vle8.v v21, (%[wp5])\n"
+ "vmv.v.v v21, v20\n"
"vle8.v v24, (%[wp6])\n"
- "vle8.v v25, (%[wp6])\n"
+ "vmv.v.v v25, v24\n"
"vle8.v v28, (%[wp7])\n"
- "vle8.v v29, (%[wp7])\n"
+ "vmv.v.v v29, v28\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
@@ -951,21 +948,21 @@ static void raid6_rvv8_xor_syndrome_real(int disks, int start, int stop,
asm volatile (".option push\n"
".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
- "vle8.v v1, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
"vle8.v v4, (%[wp1])\n"
- "vle8.v v5, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
"vle8.v v8, (%[wp2])\n"
- "vle8.v v9, (%[wp2])\n"
+ "vmv.v.v v9, v8\n"
"vle8.v v12, (%[wp3])\n"
- "vle8.v v13, (%[wp3])\n"
+ "vmv.v.v v13, v12\n"
"vle8.v v16, (%[wp4])\n"
- "vle8.v v17, (%[wp4])\n"
+ "vmv.v.v v17, v16\n"
"vle8.v v20, (%[wp5])\n"
- "vle8.v v21, (%[wp5])\n"
+ "vmv.v.v v21, v20\n"
"vle8.v v24, (%[wp6])\n"
- "vle8.v v25, (%[wp6])\n"
+ "vmv.v.v v25, v24\n"
"vle8.v v28, (%[wp7])\n"
- "vle8.v v29, (%[wp7])\n"
+ "vmv.v.v v29, v28\n"
".option pop\n"
: :
[wp0]"r"(&dptr[z0][d + 0 * NSIZE]),
diff --git a/lib/tests/Makefile b/lib/tests/Makefile
index fa6d728a8b5b..f7460831cfdd 100644
--- a/lib/tests/Makefile
+++ b/lib/tests/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_BLACKHOLE_DEV_KUNIT_TEST) += blackhole_dev_kunit.o
obj-$(CONFIG_CHECKSUM_KUNIT) += checksum_kunit.o
obj-$(CONFIG_CMDLINE_KUNIT_TEST) += cmdline_kunit.o
obj-$(CONFIG_CPUMASK_KUNIT_TEST) += cpumask_kunit.o
+obj-$(CONFIG_FFS_KUNIT_TEST) += ffs_kunit.o
CFLAGS_fortify_kunit.o += $(call cc-disable-warning, unsequenced)
CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-overread)
CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-truncation)
diff --git a/lib/tests/ffs_kunit.c b/lib/tests/ffs_kunit.c
new file mode 100644
index 000000000000..9a329cdc09c2
--- /dev/null
+++ b/lib/tests/ffs_kunit.c
@@ -0,0 +1,566 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KUnit tests for ffs()-family functions
+ */
+#include <kunit/test.h>
+#include <linux/bitops.h>
+
+/*
+ * Test data structures
+ */
+struct ffs_test_case {
+ unsigned long input;
+ int expected_ffs; /* ffs() result (1-based) */
+ int expected_fls; /* fls() result (1-based) */
+ const char *description;
+};
+
+struct ffs64_test_case {
+ u64 input;
+ int expected_fls64; /* fls64() result (1-based) */
+ unsigned int expected_ffs64_0based; /* __ffs64() result (0-based) */
+ const char *description;
+};
+
+/*
+ * Basic edge cases - core functionality validation
+ */
+static const struct ffs_test_case basic_test_cases[] = {
+ /* Zero case - special handling */
+ {0x00000000, 0, 0, "zero value"},
+
+ /* Single bit patterns - powers of 2 */
+ {0x00000001, 1, 1, "bit 0 set"},
+ {0x00000002, 2, 2, "bit 1 set"},
+ {0x00000004, 3, 3, "bit 2 set"},
+ {0x00000008, 4, 4, "bit 3 set"},
+ {0x00000010, 5, 5, "bit 4 set"},
+ {0x00000020, 6, 6, "bit 5 set"},
+ {0x00000040, 7, 7, "bit 6 set"},
+ {0x00000080, 8, 8, "bit 7 set"},
+ {0x00000100, 9, 9, "bit 8 set"},
+ {0x00008000, 16, 16, "bit 15 set"},
+ {0x00010000, 17, 17, "bit 16 set"},
+ {0x40000000, 31, 31, "bit 30 set"},
+ {0x80000000, 32, 32, "bit 31 set (sign bit)"},
+
+ /* Maximum values */
+ {0xFFFFFFFF, 1, 32, "all bits set"},
+
+ /* Multiple bit patterns */
+ {0x00000003, 1, 2, "bits 0-1 set"},
+ {0x00000007, 1, 3, "bits 0-2 set"},
+ {0x0000000F, 1, 4, "bits 0-3 set"},
+ {0x000000FF, 1, 8, "bits 0-7 set"},
+ {0x0000FFFF, 1, 16, "bits 0-15 set"},
+ {0x7FFFFFFF, 1, 31, "bits 0-30 set"},
+
+ /* Sparse patterns */
+ {0x00000101, 1, 9, "bits 0,8 set"},
+ {0x00001001, 1, 13, "bits 0,12 set"},
+ {0x80000001, 1, 32, "bits 0,31 set"},
+ {0x40000002, 2, 31, "bits 1,30 set"},
+};
+
+/*
+ * 64-bit test cases
+ */
+static const struct ffs64_test_case ffs64_test_cases[] = {
+ /* Zero case */
+ {0x0000000000000000ULL, 0, 0, "zero value"},
+
+ /* Single bit patterns */
+ {0x0000000000000001ULL, 1, 0, "bit 0 set"},
+ {0x0000000000000002ULL, 2, 1, "bit 1 set"},
+ {0x0000000000000004ULL, 3, 2, "bit 2 set"},
+ {0x0000000000000008ULL, 4, 3, "bit 3 set"},
+ {0x0000000000008000ULL, 16, 15, "bit 15 set"},
+ {0x0000000000010000ULL, 17, 16, "bit 16 set"},
+ {0x0000000080000000ULL, 32, 31, "bit 31 set"},
+ {0x0000000100000000ULL, 33, 32, "bit 32 set"},
+ {0x0000000200000000ULL, 34, 33, "bit 33 set"},
+ {0x4000000000000000ULL, 63, 62, "bit 62 set"},
+ {0x8000000000000000ULL, 64, 63, "bit 63 set (sign bit)"},
+
+ /* Maximum values */
+ {0xFFFFFFFFFFFFFFFFULL, 64, 0, "all bits set"},
+
+ /* Cross 32-bit boundary patterns */
+ {0x00000000FFFFFFFFULL, 32, 0, "lower 32 bits set"},
+ {0xFFFFFFFF00000000ULL, 64, 32, "upper 32 bits set"},
+ {0x8000000000000001ULL, 64, 0, "bits 0,63 set"},
+ {0x4000000000000002ULL, 63, 1, "bits 1,62 set"},
+
+ /* Mixed patterns */
+ {0x00000001FFFFFFFFULL, 33, 0, "bit 32 + lower 32 bits"},
+ {0xFFFFFFFF80000000ULL, 64, 31, "upper 32 bits + bit 31"},
+};
+
+/*
+ * Helper function to validate ffs results with detailed error messages
+ */
+static void validate_ffs_result(struct kunit *test, unsigned long input,
+ int actual, int expected, const char *func_name,
+ const char *description)
+{
+ KUNIT_EXPECT_EQ_MSG(test, actual, expected,
+ "%s(0x%08lx) [%s]: expected %d, got %d",
+ func_name, input, description, expected, actual);
+}
+
+/*
+ * Helper function to validate 64-bit ffs results
+ */
+static void validate_ffs64_result(struct kunit *test, u64 input,
+ int actual, int expected, const char *func_name,
+ const char *description)
+{
+ KUNIT_EXPECT_EQ_MSG(test, actual, expected,
+ "%s(0x%016llx) [%s]: expected %d, got %d",
+ func_name, input, description, expected, actual);
+}
+
+/*
+ * Helper function to validate mathematical relationships between functions
+ */
+static void validate_ffs_relationships(struct kunit *test, unsigned long input)
+{
+ int ffs_result;
+ int fls_result;
+ unsigned int ffs_0based;
+ unsigned int fls_0based;
+
+ if (input == 0) {
+ /* Special case: zero input */
+ KUNIT_EXPECT_EQ(test, ffs(input), 0);
+ KUNIT_EXPECT_EQ(test, fls(input), 0);
+ /* __ffs and __fls are undefined for 0, but often return specific values */
+ return;
+ }
+
+ ffs_result = ffs(input);
+ fls_result = fls(input);
+ ffs_0based = __ffs(input);
+ fls_0based = __fls(input);
+
+ /* Relationship: ffs(x) == __ffs(x) + 1 for x != 0 */
+ KUNIT_EXPECT_EQ_MSG(test, ffs_result, ffs_0based + 1,
+ "ffs(0x%08lx) != __ffs(0x%08lx) + 1: %d != %u + 1",
+ input, input, ffs_result, ffs_0based);
+
+ /* Relationship: fls(x) == __fls(x) + 1 for x != 0 */
+ KUNIT_EXPECT_EQ_MSG(test, fls_result, fls_0based + 1,
+ "fls(0x%08lx) != __fls(0x%08lx) + 1: %d != %u + 1",
+ input, input, fls_result, fls_0based);
+
+ /* Range validation */
+ KUNIT_EXPECT_GE(test, ffs_result, 1);
+ KUNIT_EXPECT_LE(test, ffs_result, BITS_PER_LONG);
+ KUNIT_EXPECT_GE(test, fls_result, 1);
+ KUNIT_EXPECT_LE(test, fls_result, BITS_PER_LONG);
+}
+
+/*
+ * Helper function to validate 64-bit relationships
+ */
+static void validate_ffs64_relationships(struct kunit *test, u64 input)
+{
+ int fls64_result;
+ unsigned int ffs64_0based;
+
+ if (input == 0) {
+ KUNIT_EXPECT_EQ(test, fls64(input), 0);
+ return;
+ }
+
+ fls64_result = fls64(input);
+ ffs64_0based = __ffs64(input);
+
+ /* Range validation */
+ KUNIT_EXPECT_GE(test, fls64_result, 1);
+ KUNIT_EXPECT_LE(test, fls64_result, 64);
+ KUNIT_EXPECT_LT(test, ffs64_0based, 64);
+
+ /*
+ * Relationships with 32-bit functions should hold for small values
+ * on all architectures.
+ */
+ if (input <= 0xFFFFFFFFULL) {
+ unsigned long input_32 = (unsigned long)input;
+ KUNIT_EXPECT_EQ_MSG(test, fls64(input), fls(input_32),
+ "fls64(0x%llx) != fls(0x%lx): %d != %d",
+ input, input_32, fls64(input), fls(input_32));
+
+ if (input != 0) {
+ KUNIT_EXPECT_EQ_MSG(test, __ffs64(input), __ffs(input_32),
+ "__ffs64(0x%llx) != __ffs(0x%lx): %lu != %lu",
+ input, input_32,
+ (unsigned long)__ffs64(input),
+ (unsigned long)__ffs(input_32));
+ }
+ }
+}
+
+/*
+ * Test basic correctness of all ffs-family functions
+ */
+static void ffs_basic_correctness_test(struct kunit *test)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(basic_test_cases); i++) {
+ const struct ffs_test_case *tc = &basic_test_cases[i];
+
+ /* Test ffs() */
+ validate_ffs_result(test, tc->input, ffs(tc->input),
+ tc->expected_ffs, "ffs", tc->description);
+
+ /* Test fls() */
+ validate_ffs_result(test, tc->input, fls(tc->input),
+ tc->expected_fls, "fls", tc->description);
+
+ /* Test __ffs() - skip zero case as it's undefined */
+ if (tc->input != 0) {
+ /* Calculate expected __ffs() result: __ffs(x) == ffs(x) - 1 */
+ unsigned int expected_ffs_0based = tc->expected_ffs - 1;
+ validate_ffs_result(test, tc->input, __ffs(tc->input),
+ expected_ffs_0based, "__ffs", tc->description);
+ }
+
+ /* Test __fls() - skip zero case as it's undefined */
+ if (tc->input != 0) {
+ /* Calculate expected __fls() result: __fls(x) == fls(x) - 1 */
+ unsigned int expected_fls_0based = tc->expected_fls - 1;
+ validate_ffs_result(test, tc->input, __fls(tc->input),
+ expected_fls_0based, "__fls", tc->description);
+ }
+ }
+}
+
+/*
+ * Test 64-bit function correctness
+ */
+static void ffs64_correctness_test(struct kunit *test)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ffs64_test_cases); i++) {
+ const struct ffs64_test_case *tc = &ffs64_test_cases[i];
+
+ /* Test fls64() */
+ validate_ffs64_result(test, tc->input, fls64(tc->input),
+ tc->expected_fls64, "fls64", tc->description);
+
+ /* Test __ffs64() - skip zero case as it's undefined */
+ if (tc->input != 0) {
+ validate_ffs64_result(test, tc->input, __ffs64(tc->input),
+ tc->expected_ffs64_0based, "__ffs64",
+ tc->description);
+ }
+ }
+}
+
+/*
+ * Test mathematical relationships between functions
+ */
+static void ffs_mathematical_relationships_test(struct kunit *test)
+{
+ int i;
+
+ /* Test basic cases */
+ for (i = 0; i < ARRAY_SIZE(basic_test_cases); i++) {
+ validate_ffs_relationships(test, basic_test_cases[i].input);
+ }
+
+ /* Test 64-bit cases */
+ for (i = 0; i < ARRAY_SIZE(ffs64_test_cases); i++) {
+ validate_ffs64_relationships(test, ffs64_test_cases[i].input);
+ }
+}
+
+/*
+ * Test edge cases and boundary conditions
+ */
+static void ffs_edge_cases_test(struct kunit *test)
+{
+ unsigned long test_patterns[] = {
+ /* Powers of 2 */
+ 1UL, 2UL, 4UL, 8UL, 16UL, 32UL, 64UL, 128UL,
+ 256UL, 512UL, 1024UL, 2048UL, 4096UL, 8192UL,
+
+ /* Powers of 2 minus 1 */
+ 1UL, 3UL, 7UL, 15UL, 31UL, 63UL, 127UL, 255UL,
+ 511UL, 1023UL, 2047UL, 4095UL, 8191UL,
+
+ /* Boundary values */
+ 0x7FFFFFFFUL, /* Maximum positive 32-bit */
+ 0x80000000UL, /* Minimum negative 32-bit */
+ 0xFFFFFFFFUL, /* Maximum 32-bit unsigned */
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(test_patterns); i++) {
+ validate_ffs_relationships(test, test_patterns[i]);
+ }
+}
+
+/*
+ * Test 64-bit edge cases
+ */
+static void ffs64_edge_cases_test(struct kunit *test)
+{
+ u64 test_patterns_64[] = {
+ /* 64-bit powers of 2 */
+ 0x0000000100000000ULL, /* 2^32 */
+ 0x0000000200000000ULL, /* 2^33 */
+ 0x0000000400000000ULL, /* 2^34 */
+ 0x0000001000000000ULL, /* 2^36 */
+ 0x0000010000000000ULL, /* 2^40 */
+ 0x0001000000000000ULL, /* 2^48 */
+ 0x0100000000000000ULL, /* 2^56 */
+ 0x4000000000000000ULL, /* 2^62 */
+ 0x8000000000000000ULL, /* 2^63 */
+
+ /* Cross-boundary patterns */
+ 0x00000000FFFFFFFFULL, /* Lower 32 bits */
+ 0xFFFFFFFF00000000ULL, /* Upper 32 bits */
+ 0x7FFFFFFFFFFFFFFFULL, /* Maximum positive 64-bit */
+ 0xFFFFFFFFFFFFFFFFULL, /* Maximum 64-bit unsigned */
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(test_patterns_64); i++) {
+ validate_ffs64_relationships(test, test_patterns_64[i]);
+ }
+}
+
+/*
+ * ffz() test data - Find First Zero bit test cases
+ */
+struct ffz_test_case {
+ unsigned long input;
+ unsigned long expected_ffz;
+ const char *description;
+};
+
+static const struct ffz_test_case ffz_test_cases[] = {
+ /* Zero bits in specific positions */
+ {0xFFFFFFFE, 0, "bit 0 is zero"}, /* ...11111110 */
+ {0xFFFFFFFD, 1, "bit 1 is zero"}, /* ...11111101 */
+ {0xFFFFFFFB, 2, "bit 2 is zero"}, /* ...11111011 */
+ {0xFFFFFFF7, 3, "bit 3 is zero"}, /* ...11110111 */
+ {0xFFFFFFEF, 4, "bit 4 is zero"}, /* ...11101111 */
+ {0xFFFFFFDF, 5, "bit 5 is zero"}, /* ...11011111 */
+ {0xFFFFFFBF, 6, "bit 6 is zero"}, /* ...10111111 */
+ {0xFFFFFF7F, 7, "bit 7 is zero"}, /* ...01111111 */
+ {0xFFFFFEFF, 8, "bit 8 is zero"}, /* Gap in bit 8 */
+ {0xFFFF7FFF, 15, "bit 15 is zero"}, /* Gap in bit 15 */
+ {0xFFFEFFFF, 16, "bit 16 is zero"}, /* Gap in bit 16 */
+ {0xBFFFFFFF, 30, "bit 30 is zero"}, /* Gap in bit 30 */
+ {0x7FFFFFFF, 31, "bit 31 is zero"}, /* 01111111... */
+
+ /* Multiple zero patterns */
+ {0xFFFFFFFC, 0, "bits 0-1 are zero"}, /* ...11111100 */
+ {0xFFFFFFF8, 0, "bits 0-2 are zero"}, /* ...11111000 */
+ {0xFFFFFFF0, 0, "bits 0-3 are zero"}, /* ...11110000 */
+ {0xFFFFFF00, 0, "bits 0-7 are zero"}, /* ...00000000 */
+ {0xFFFF0000, 0, "bits 0-15 are zero"}, /* Lower 16 bits zero */
+
+ /* All zeros (special case) */
+ {0x00000000, 0, "all bits zero"},
+
+ /* Complex patterns */
+ {0xFFFDFFFF, 17, "bit 17 is zero"}, /* Gap in bit 17 */
+ {0xFFF7FFFF, 19, "bit 19 is zero"}, /* Gap in bit 19 */
+ {0xF7FFFFFF, 27, "bit 27 is zero"}, /* Gap in bit 27 */
+ {0xDFFFFFFF, 29, "bit 29 is zero"}, /* Gap in bit 29 */
+};
+
+/*
+ * Test basic correctness of ffz() function
+ */
+static void ffz_basic_correctness_test(struct kunit *test)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ffz_test_cases); i++) {
+ const struct ffz_test_case *tc = &ffz_test_cases[i];
+ unsigned long result = ffz(tc->input);
+
+ KUNIT_EXPECT_EQ_MSG(test, result, tc->expected_ffz,
+ "ffz(0x%08lx) [%s]: expected %lu, got %lu",
+ tc->input, tc->description, tc->expected_ffz, result);
+ }
+}
+
+/*
+ * Test mathematical relationships between ffz() and other functions
+ */
+static void validate_ffz_relationships(struct kunit *test, unsigned long input)
+{
+ unsigned long ffz_result;
+
+ if (input == 0) {
+ /* ffz(0) should return 0 (first zero bit is at position 0) */
+ KUNIT_EXPECT_EQ(test, ffz(input), 0);
+ return;
+ }
+
+ if (input == ~0UL) {
+ /* ffz(~0) is undefined (no zero bits) - just verify it doesn't crash */
+ ffz_result = ffz(input);
+ /* Implementation-defined behavior, just ensure it completes */
+ return;
+ }
+
+ ffz_result = ffz(input);
+
+ /* Range validation - result should be within valid bit range */
+ KUNIT_EXPECT_LT(test, ffz_result, BITS_PER_LONG);
+
+ /* Verify the bit at ffz_result position is actually zero */
+ KUNIT_EXPECT_EQ_MSG(test, (input >> ffz_result) & 1, 0,
+ "ffz(0x%08lx) = %lu, but bit %lu is not zero",
+ input, ffz_result, ffz_result);
+
+ /* Core relationship: if we set the ffz bit, ffz should find a different bit */
+ if (ffz_result < BITS_PER_LONG - 1) {
+ unsigned long modified = input | (1UL << ffz_result);
+ if (modified != ~0UL) { /* Skip if all bits would be set */
+ unsigned long new_ffz = ffz(modified);
+ KUNIT_EXPECT_NE_MSG(test, new_ffz, ffz_result,
+ "ffz(0x%08lx) = %lu, but setting that bit doesn't change ffz result",
+ input, ffz_result);
+ }
+ }
+}
+
+static void ffz_mathematical_relationships_test(struct kunit *test)
+{
+ unsigned long test_patterns[] = {
+ /* Powers of 2 with one bit clear */
+ 0xFFFFFFFE, 0xFFFFFFFD, 0xFFFFFFFB, 0xFFFFFFF7,
+ 0xFFFFFFEF, 0xFFFFFFDF, 0xFFFFFFBF, 0xFFFFFF7F,
+
+ /* Multiple patterns */
+ 0xFFFFFF00, 0xFFFFF000, 0xFFFF0000, 0xFFF00000,
+ 0x7FFFFFFF, 0x3FFFFFFF, 0x1FFFFFFF, 0x0FFFFFFF,
+
+ /* Complex bit patterns */
+ 0xAAAAAAAA, 0x55555555, 0xCCCCCCCC, 0x33333333,
+ 0xF0F0F0F0, 0x0F0F0F0F, 0xFF00FF00, 0x00FF00FF,
+ };
+ int i;
+
+ /* Test basic test cases */
+ for (i = 0; i < ARRAY_SIZE(ffz_test_cases); i++) {
+ validate_ffz_relationships(test, ffz_test_cases[i].input);
+ }
+
+ /* Test additional patterns */
+ for (i = 0; i < ARRAY_SIZE(test_patterns); i++) {
+ validate_ffz_relationships(test, test_patterns[i]);
+ }
+}
+
+/*
+ * Test edge cases and boundary conditions for ffz()
+ */
+static void ffz_edge_cases_test(struct kunit *test)
+{
+ unsigned long edge_patterns[] = {
+ /* Boundary values */
+ 0x00000000, /* All zeros */
+ 0x80000000, /* Only MSB set */
+ 0x00000001, /* Only LSB set */
+ 0x7FFFFFFF, /* MSB clear */
+ 0xFFFFFFFE, /* LSB clear */
+
+ /* Powers of 2 complement patterns (one zero bit each) */
+ ~(1UL << 0), ~(1UL << 1), ~(1UL << 2), ~(1UL << 3),
+ ~(1UL << 4), ~(1UL << 8), ~(1UL << 16), ~(1UL << 31),
+
+ /* Walking zero patterns */
+ 0xFFFFFFFE, 0xFFFFFFFD, 0xFFFFFFFB, 0xFFFFFFF7,
+ 0xFFFFFFEF, 0xFFFFFFDF, 0xFFFFFFBF, 0xFFFFFF7F,
+ 0xFFFFFEFF, 0xFFFFFDFF, 0xFFFFFBFF, 0xFFFFF7FF,
+
+ /* Multiple zeros */
+ 0xFFFFFF00, 0xFFFFF000, 0xFFFF0000, 0xFFF00000,
+ 0xFF000000, 0xF0000000, 0x00000000,
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(edge_patterns); i++) {
+ validate_ffz_relationships(test, edge_patterns[i]);
+ }
+}
+
+/*
+ * To have useful build error output, split the tests into separate
+ * functions so it's clear which are missing __attribute_const__.
+ */
+#define CREATE_WRAPPER(func) \
+static noinline bool build_test_##func(void) \
+{ \
+ int init_##func = 32; \
+ int result_##func = func(6); \
+ \
+ /* Does the static initializer vanish after calling func? */ \
+ BUILD_BUG_ON(init_##func < 32); \
+ \
+ /* "Consume" the results so optimizer doesn't drop them. */ \
+ barrier_data(&init_##func); \
+ barrier_data(&result_##func); \
+ \
+ return true; \
+}
+CREATE_WRAPPER(ffs)
+CREATE_WRAPPER(fls)
+CREATE_WRAPPER(__ffs)
+CREATE_WRAPPER(__fls)
+CREATE_WRAPPER(ffz)
+#undef CREATE_WRAPPER
+
+/*
+ * Make sure that __attribute_const__ has been applied to all the
+ * functions. This is a regression test for:
+ * https://github.com/KSPP/linux/issues/364
+ */
+static void ffs_attribute_const_test(struct kunit *test)
+{
+ KUNIT_EXPECT_TRUE(test, build_test_ffs());
+ KUNIT_EXPECT_TRUE(test, build_test_fls());
+ KUNIT_EXPECT_TRUE(test, build_test___ffs());
+ KUNIT_EXPECT_TRUE(test, build_test___fls());
+ KUNIT_EXPECT_TRUE(test, build_test_ffz());
+}
+
+/*
+ * KUnit test case definitions
+ */
+static struct kunit_case ffs_test_cases[] = {
+ KUNIT_CASE(ffs_basic_correctness_test),
+ KUNIT_CASE(ffs64_correctness_test),
+ KUNIT_CASE(ffs_mathematical_relationships_test),
+ KUNIT_CASE(ffs_edge_cases_test),
+ KUNIT_CASE(ffs64_edge_cases_test),
+ KUNIT_CASE(ffz_basic_correctness_test),
+ KUNIT_CASE(ffz_mathematical_relationships_test),
+ KUNIT_CASE(ffz_edge_cases_test),
+ KUNIT_CASE(ffs_attribute_const_test),
+ {}
+};
+
+/*
+ * KUnit test suite definition
+ */
+static struct kunit_suite ffs_test_suite = {
+ .name = "ffs",
+ .test_cases = ffs_test_cases,
+};
+
+kunit_test_suites(&ffs_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests for ffs()-family functions");
+MODULE_LICENSE("GPL");