85 files changed, 2369 insertions, 1456 deletions
diff --git a/arch/x86/Kconfig.cpufeatures b/arch/x86/Kconfig.cpufeatures
index 250c10627ab3..733d5aff2456 100644
--- a/arch/x86/Kconfig.cpufeatures
+++ b/arch/x86/Kconfig.cpufeatures
@@ -124,6 +124,10 @@ config X86_DISABLED_FEATURE_PCID
 	def_bool y
 	depends on !X86_64
 
+config X86_DISABLED_FEATURE_LASS
+	def_bool y
+	depends on X86_32
+
 config X86_DISABLED_FEATURE_PKU
 	def_bool y
 	depends on !X86_INTEL_MEMORY_PROTECTION_KEYS
diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
index a2b6b428922a..bda042933a05 100644
--- a/arch/x86/boot/a20.c
+++ b/arch/x86/boot/a20.c
@@ -135,29 +135,29 @@ int enable_a20(void)
 		  (legacy free, etc.) */
 	       if (a20_test_short())
 		       return 0;
-	       
+
 	       /* Next, try the BIOS (INT 0x15, AX=0x2401) */
 	       enable_a20_bios();
 	       if (a20_test_short())
 		       return 0;
-	       
+
 	       /* Try enabling A20 through the keyboard controller */
 	       kbc_err = empty_8042();
 
 	       if (a20_test_short())
 		       return 0; /* BIOS worked, but with delayed reaction */
-	
+
 	       if (!kbc_err) {
 		       enable_a20_kbc();
 		       if (a20_test_long())
 			       return 0;
 	       }
-	       
+
 	       /* Finally, try enabling the "fast A20 gate" */
 	       enable_a20_fast();
 	       if (a20_test_long())
 		       return 0;
        }
-       
+
        return -1;
 }
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index a3c58ebe3662..8e3eab34dff4 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -193,8 +193,6 @@ static inline bool heap_free(size_t n)
 
 void copy_to_fs(addr_t dst, void *src, size_t len);
 void *copy_from_fs(void *dst, addr_t src, size_t len);
-void copy_to_gs(addr_t dst, void *src, size_t len);
-void *copy_from_gs(void *dst, addr_t src, size_t len);
 
 /* a20.c */
 int enable_a20(void);
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index db1048621ea2..fd855e32c9b9 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -152,17 +152,6 @@ bool insn_has_rep_prefix(struct insn *insn);
 void sev_insn_decode_init(void);
 bool early_setup_ghcb(void);
 #else
-static inline void sev_enable(struct boot_params *bp)
-{
-	/*
-	 * bp->cc_blob_address should only be set by boot/compressed kernel.
-	 * Initialize it to 0 unconditionally (thus here in this stub too) to
-	 * ensure that uninitialized values from buggy bootloaders aren't
-	 * propagated.
-	 */
-	if (bp)
-		bp->cc_blob_address = 0;
-}
 static inline void snp_check_features(void) { }
 static inline void sev_es_shutdown_ghcb(void) { }
 static inline bool sev_es_check_ghcb_fault(unsigned long address)
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index bdd26050dff7..0e89e197e112 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -3,6 +3,7 @@
 #include <asm/bootparam.h>
 #include <asm/bootparam_utils.h>
 #include <asm/e820/types.h>
+#include <asm/pgtable.h>
 #include <asm/processor.h>
 #include "../string.h"
 #include "efi.h"
@@ -168,9 +169,10 @@ asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable)
 		 * For 4- to 5-level paging transition, set up current CR3 as
 		 * the first and the only entry in a new top-level page table.
 		 */
-		*trampoline_32bit = __native_read_cr3() | _PAGE_TABLE_NOENC;
+		*trampoline_32bit = native_read_cr3_pa() | _PAGE_TABLE_NOENC;
 	} else {
-		unsigned long src;
+		u64 *new_cr3;
+		pgd_t *pgdp;
 
 		/*
 		 * For 5- to 4-level paging transition, copy page table pointed
@@ -180,8 +182,9 @@ asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable)
 		 * We cannot just point to the page table from trampoline as it
 		 * may be above 4G.
 		 */
-		src = *(unsigned long *)__native_read_cr3() & PAGE_MASK;
-		memcpy(trampoline_32bit, (void *)src, PAGE_SIZE);
+		pgdp = (pgd_t *)native_read_cr3_pa();
+		new_cr3 = (u64 *)(native_pgd_val(pgdp[0]) & PTE_PFN_MASK);
+		memcpy(trampoline_32bit, new_cr3, PAGE_SIZE);
 	}
 
 	toggle_la57(trampoline_32bit);
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 6e5c32a53d03..c8c1464b3a56 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -14,6 +14,7 @@
 
 #include <asm/bootparam.h>
 #include <asm/pgtable_types.h>
+#include <asm/shared/msr.h>
 #include <asm/sev.h>
 #include <asm/trapnr.h>
 #include <asm/trap_pf.h>
@@ -397,7 +398,7 @@ void sev_enable(struct boot_params *bp)
 	}
 
 	/* Set the SME mask if this is an SEV guest. */
-	boot_rdmsr(MSR_AMD64_SEV, &m);
+	raw_rdmsr(MSR_AMD64_SEV, &m);
 	sev_status = m.q;
 	if (!(sev_status & MSR_AMD64_SEV_ENABLED))
 		return;
@@ -446,7 +447,7 @@ u64 sev_get_status(void)
 	if (sev_check_cpu_support() < 0)
 		return 0;
 
-	boot_rdmsr(MSR_AMD64_SEV, &m);
+	raw_rdmsr(MSR_AMD64_SEV, &m);
 	return m.q;
 }
 
@@ -496,7 +497,7 @@ bool early_is_sevsnp_guest(void)
 			struct msr m;
 
 			/* Obtain the address of the calling area to use */
-			boot_rdmsr(MSR_SVSM_CAA, &m);
+			raw_rdmsr(MSR_SVSM_CAA, &m);
 			boot_svsm_caa_pa = m.q;
 
 			/*
diff --git a/arch/x86/boot/compressed/sev.h b/arch/x86/boot/compressed/sev.h
index 92f79c21939c..22637b416b46 100644
--- a/arch/x86/boot/compressed/sev.h
+++ b/arch/x86/boot/compressed/sev.h
@@ -10,7 +10,7 @@
 
 #ifdef CONFIG_AMD_MEM_ENCRYPT
 
-#include "../msr.h"
+#include <asm/shared/msr.h>
 
 void snp_accept_memory(phys_addr_t start, phys_addr_t end);
 u64 sev_get_status(void);
@@ -20,7 +20,7 @@ static inline u64 sev_es_rd_ghcb_msr(void)
 {
 	struct msr m;
 
-	boot_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m);
+	raw_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m);
 
 	return m.q;
 }
@@ -30,7 +30,7 @@ static inline void sev_es_wr_ghcb_msr(u64 val)
 	struct msr m;
 
 	m.q = val;
-	boot_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m);
+	raw_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m);
 }
 
 #else
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index f82de8de5dc6..2e1bb936cba2 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -26,9 +26,9 @@
 #include <asm/intel-family.h>
 #include <asm/processor-flags.h>
 #include <asm/msr-index.h>
+#include <asm/shared/msr.h>
 
 #include "string.h"
-#include "msr.h"
 
 static u32 err_flags[NCAPINTS];
 
@@ -134,9 +134,9 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
 
 		struct msr m;
 
-		boot_rdmsr(MSR_K7_HWCR, &m);
+		raw_rdmsr(MSR_K7_HWCR, &m);
 		m.l &= ~(1 << 15);
-		boot_wrmsr(MSR_K7_HWCR, &m);
+		raw_wrmsr(MSR_K7_HWCR, &m);
 
 		get_cpuflags();	/* Make sure it really did something */
 		err = check_cpuflags();
@@ -148,9 +148,9 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
 
 		struct msr m;
 
-		boot_rdmsr(MSR_VIA_FCR, &m);
+		raw_rdmsr(MSR_VIA_FCR, &m);
 		m.l |= (1 << 1) | (1 << 7);
-		boot_wrmsr(MSR_VIA_FCR, &m);
+		raw_wrmsr(MSR_VIA_FCR, &m);
 
 		set_bit(X86_FEATURE_CX8, cpu.flags);
 		err = check_cpuflags();
@@ -160,14 +160,14 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
 		struct msr m, m_tmp;
 		u32 level = 1;
 
-		boot_rdmsr(0x80860004, &m);
+		raw_rdmsr(0x80860004, &m);
 		m_tmp = m;
 		m_tmp.l = ~0;
-		boot_wrmsr(0x80860004, &m_tmp);
+		raw_wrmsr(0x80860004, &m_tmp);
 		asm("cpuid"
 		    : "+a" (level), "=d" (cpu.flags[0])
 		    : : "ecx", "ebx");
-		boot_wrmsr(0x80860004, &m);
+		raw_wrmsr(0x80860004, &m);
 
 		err = check_cpuflags();
 	} else if (err == 0x01 &&
diff --git a/arch/x86/boot/msr.h b/arch/x86/boot/msr.h
deleted file mode 100644
index aed66f7ae199..000000000000
--- a/arch/x86/boot/msr.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Helpers/definitions related to MSR access.
- */
-
-#ifndef BOOT_MSR_H
-#define BOOT_MSR_H
-
-#include <asm/shared/msr.h>
-
-/*
- * The kernel proper already defines rdmsr()/wrmsr(), but they are not for the
- * boot kernel since they rely on tracepoint/exception handling infrastructure
- * that's not available here.
- */
-static inline void boot_rdmsr(unsigned int reg, struct msr *m)
-{
-	asm volatile("rdmsr" : "=a" (m->l), "=d" (m->h) : "c" (reg));
-}
-
-static inline void boot_wrmsr(unsigned int reg, const struct msr *m)
-{
-	asm volatile("wrmsr" : : "c" (reg), "a"(m->l), "d" (m->h) : "memory");
-}
-
-#endif /* BOOT_MSR_H */
diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c
index 4e22ffd73516..a0fa8bb2b945 100644
--- a/arch/x86/boot/startup/sev-shared.c
+++ b/arch/x86/boot/startup/sev-shared.c
@@ -12,7 +12,7 @@
 #include <asm/setup_data.h>
 
 #ifndef __BOOT_COMPRESSED
-#define has_cpuflag(f)			boot_cpu_has(f)
+#define has_cpuflag(f)			cpu_feature_enabled(f)
 #else
 #undef WARN
 #define WARN(condition, format...) (!!(condition))
diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c
index 7fc136a35334..f08c7505ed82 100644
--- a/arch/x86/coco/sev/vc-handle.c
+++ b/arch/x86/coco/sev/vc-handle.c
@@ -352,7 +352,6 @@ fault:
 
 #define sev_printk(fmt, ...)		printk(fmt, ##__VA_ARGS__)
 #define error(v)
-#define has_cpuflag(f)			boot_cpu_has(f)
 
 #include "vc-shared.c"
 
diff --git a/arch/x86/coco/sev/vc-shared.c b/arch/x86/coco/sev/vc-shared.c
index 9b01c9ad81be..58b2f985d546 100644
--- a/arch/x86/coco/sev/vc-shared.c
+++ b/arch/x86/coco/sev/vc-shared.c
@@ -1,5 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#ifndef __BOOT_COMPRESSED
+#define has_cpuflag(f)                  cpu_feature_enabled(f)
+#endif
+
 static enum es_result vc_check_opcode_bytes(struct es_em_ctxt *ctxt,
 					    unsigned long exit_code)
 {
@@ -546,6 +550,13 @@ static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
 		/* xgetbv will cause #GP - use reset value for xcr0 */
 		ghcb_set_xcr0(ghcb, 1);
 
+	if (has_cpuflag(X86_FEATURE_SHSTK) && regs->ax == 0xd && regs->cx == 1) {
+		struct msr m;
+
+		raw_rdmsr(MSR_IA32_XSS, &m);
+		ghcb_set_xss(ghcb, m.q);
+	}
+
 	ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0);
 	if (ret != ES_OK)
 		return ret;
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index 48d3076b6053..3fd2423d3cf8 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -353,16 +353,6 @@ config CRYPTO_NHPOLY1305_AVX2
 	  Architecture: x86_64 using:
 	  - AVX2 (Advanced Vector Extensions 2)
 
-config CRYPTO_POLYVAL_CLMUL_NI
-	tristate "Hash functions: POLYVAL (CLMUL-NI)"
-	depends on 64BIT
-	select CRYPTO_POLYVAL
-	help
-	  POLYVAL hash function for HCTR2
-
-	  Architecture: x86_64 using:
-	  - CLMUL-NI (carry-less multiplication new instructions)
-
 config CRYPTO_SM3_AVX_X86_64
 	tristate "Hash functions: SM3 (AVX)"
 	depends on 64BIT
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 2d30d5d36145..5f2fb4f148fe 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -46,15 +46,13 @@ obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
 aesni-intel-$(CONFIG_64BIT) += aes-ctr-avx-x86_64.o \
 			       aes-gcm-aesni-x86_64.o \
-			       aes-xts-avx-x86_64.o \
-			       aes-gcm-avx10-x86_64.o
+			       aes-gcm-vaes-avx2.o \
+			       aes-gcm-vaes-avx512.o \
+			       aes-xts-avx-x86_64.o
 
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 
-obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o
-polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o
-
 obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
 nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
 obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
diff --git a/arch/x86/crypto/aes-gcm-aesni-x86_64.S b/arch/x86/crypto/aes-gcm-aesni-x86_64.S
index 45940e2883a0..7c8a8a32bd3c 100644
--- a/arch/x86/crypto/aes-gcm-aesni-x86_64.S
+++ b/arch/x86/crypto/aes-gcm-aesni-x86_64.S
@@ -61,15 +61,15 @@
 // for the *_aesni functions or AVX for the *_aesni_avx ones.  (But it seems
 // there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.)
 //
-// The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is
+// The design generally follows that of aes-gcm-vaes-avx512.S, and that file is
 // more thoroughly commented.  This file has the following notable changes:
 //
 //    - The vector length is fixed at 128-bit, i.e. xmm registers.  This means
 //      there is only one AES block (and GHASH block) per register.
 //
-//    - Without AVX512 / AVX10, only 16 SIMD registers are available instead of
-//      32.  We work around this by being much more careful about using
-//      registers, relying heavily on loads to load values as they are needed.
+//    - Without AVX512, only 16 SIMD registers are available instead of 32.  We
+//      work around this by being much more careful about using registers,
+//      relying heavily on loads to load values as they are needed.
 //
 //    - Masking is not available either.  We work around this by implementing
 //      partial block loads and stores using overlapping scalar loads and stores
@@ -90,8 +90,8 @@
 //      multiplication instead of schoolbook multiplication.  This saves one
 //      pclmulqdq instruction per block, at the cost of one 64-bit load, one
 //      pshufd, and 0.25 pxors per block.  (This is without the three-argument
-//      XOR support that would be provided by AVX512 / AVX10, which would be
-//      more beneficial to schoolbook than Karatsuba.)
+//      XOR support that would be provided by AVX512, which would be more
+//      beneficial to schoolbook than Karatsuba.)
 //
 //      As a rough approximation, we can assume that Karatsuba multiplication is
 //      faster than schoolbook multiplication in this context if one pshufd and
diff --git a/arch/x86/crypto/aes-gcm-vaes-avx2.S b/arch/x86/crypto/aes-gcm-vaes-avx2.S
new file mode 100644
index 000000000000..93c9504a488f
--- /dev/null
+++ b/arch/x86/crypto/aes-gcm-vaes-avx2.S
@@ -0,0 +1,1146 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// AES-GCM implementation for x86_64 CPUs that support the following CPU
+// features: VAES && VPCLMULQDQ && AVX2
+//
+// Copyright 2025 Google LLC
+//
+// Author: Eric Biggers <ebiggers@google.com>
+//
+//------------------------------------------------------------------------------
+//
+// This file is dual-licensed, meaning that you can use it under your choice of
+// either of the following two licenses:
+//
+// Licensed under the Apache License 2.0 (the "License").  You may obtain a copy
+// of the License at
+//
+//	http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// or
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// -----------------------------------------------------------------------------
+//
+// This is similar to aes-gcm-vaes-avx512.S, but it uses AVX2 instead of AVX512.
+// This means it can only use 16 vector registers instead of 32, the maximum
+// vector length is 32 bytes, and some instructions such as vpternlogd and
+// masked loads/stores are unavailable.  However, it is able to run on CPUs that
+// have VAES without AVX512, namely AMD Zen 3 (including "Milan" server CPUs),
+// various Intel client CPUs such as Alder Lake, and Intel Sierra Forest.
+//
+// This implementation also uses Karatsuba multiplication instead of schoolbook
+// multiplication for GHASH in its main loop.  This does not help much on Intel,
+// but it improves performance by ~5% on AMD Zen 3.  Other factors weighing
+// slightly in favor of Karatsuba multiplication in this implementation are the
+// lower maximum vector length (which means there are fewer key powers, so we
+// can cache the halves of each key power XOR'd together and still use less
+// memory than the AVX512 implementation), and the unavailability of the
+// vpternlogd instruction (which helped schoolbook a bit more than Karatsuba).
+
+#include <linux/linkage.h>
+
+.section .rodata
+.p2align 4
+
+	// The below three 16-byte values must be in the order that they are, as
+	// they are really two 32-byte tables and a 16-byte value that overlap:
+	//
+	// - The first 32-byte table begins at .Lselect_high_bytes_table.
+	//   For 0 <= len <= 16, the 16-byte value at
+	//   '.Lselect_high_bytes_table + len' selects the high 'len' bytes of
+	//   another 16-byte value when AND'ed with it.
+	//
+	// - The second 32-byte table begins at .Lrshift_and_bswap_table.
+	//   For 0 <= len <= 16, the 16-byte value at
+	//   '.Lrshift_and_bswap_table + len' is a vpshufb mask that does the
+	//   following operation: right-shift by '16 - len' bytes (shifting in
+	//   zeroes), then reflect all 16 bytes.
+	//
+	// - The 16-byte value at .Lbswap_mask is a vpshufb mask that reflects
+	//   all 16 bytes.
+.Lselect_high_bytes_table:
+	.octa	0
+.Lrshift_and_bswap_table:
+	.octa	0xffffffffffffffffffffffffffffffff
+.Lbswap_mask:
+	.octa	0x000102030405060708090a0b0c0d0e0f
+
+	// Sixteen 0x0f bytes.  By XOR'ing an entry of .Lrshift_and_bswap_table
+	// with this, we get a mask that left-shifts by '16 - len' bytes.
+.Lfifteens:
+	.octa	0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+
+	// This is the GHASH reducing polynomial without its constant term, i.e.
+	// x^128 + x^7 + x^2 + x, represented using the backwards mapping
+	// between bits and polynomial coefficients.
+	//
+	// Alternatively, it can be interpreted as the naturally-ordered
+	// representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the
+	// "reversed" GHASH reducing polynomial without its x^128 term.
+.Lgfpoly:
+	.octa	0xc2000000000000000000000000000001
+
+	// Same as above, but with the (1 << 64) bit set.
+.Lgfpoly_and_internal_carrybit:
+	.octa	0xc2000000000000010000000000000001
+
+	// Values needed to prepare the initial vector of counter blocks.
+.Lctr_pattern:
+	.octa	0
+	.octa	1
+
+	// The number of AES blocks per vector, as a 128-bit value.
+.Linc_2blocks:
+	.octa	2
+
+// Offsets in struct aes_gcm_key_vaes_avx2
+#define OFFSETOF_AESKEYLEN	480
+#define OFFSETOF_H_POWERS	512
+#define NUM_H_POWERS		8
+#define OFFSETOFEND_H_POWERS    (OFFSETOF_H_POWERS + (NUM_H_POWERS * 16))
+#define OFFSETOF_H_POWERS_XORED	OFFSETOFEND_H_POWERS
+
+.text
+
+// Do one step of GHASH-multiplying the 128-bit lanes of \a by the 128-bit lanes
+// of \b and storing the reduced products in \dst.  Uses schoolbook
+// multiplication.
+.macro	_ghash_mul_step	i, a, b, dst, gfpoly, t0, t1, t2
+.if \i == 0
+	vpclmulqdq	$0x00, \a, \b, \t0	  // LO = a_L * b_L
+	vpclmulqdq	$0x01, \a, \b, \t1	  // MI_0 = a_L * b_H
+.elseif \i == 1
+	vpclmulqdq	$0x10, \a, \b, \t2	  // MI_1 = a_H * b_L
+.elseif \i == 2
+	vpxor		\t2, \t1, \t1		  // MI = MI_0 + MI_1
+.elseif \i == 3
+	vpclmulqdq	$0x01, \t0, \gfpoly, \t2  // LO_L*(x^63 + x^62 + x^57)
+.elseif \i == 4
+	vpshufd		$0x4e, \t0, \t0		  // Swap halves of LO
+.elseif \i == 5
+	vpxor		\t0, \t1, \t1		  // Fold LO into MI (part 1)
+	vpxor		\t2, \t1, \t1		  // Fold LO into MI (part 2)
+.elseif \i == 6
+	vpclmulqdq	$0x11, \a, \b, \dst	  // HI = a_H * b_H
+.elseif \i == 7
+	vpclmulqdq	$0x01, \t1, \gfpoly, \t0  // MI_L*(x^63 + x^62 + x^57)
+.elseif \i == 8
+	vpshufd		$0x4e, \t1, \t1		  // Swap halves of MI
+.elseif \i == 9
+	vpxor		\t1, \dst, \dst		  // Fold MI into HI (part 1)
+	vpxor		\t0, \dst, \dst		  // Fold MI into HI (part 2)
+.endif
+.endm
+
+// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store
+// the reduced products in \dst.  See _ghash_mul_step for full explanation.
+.macro	_ghash_mul	a, b, dst, gfpoly, t0, t1, t2
+.irp i, 0,1,2,3,4,5,6,7,8,9
+	_ghash_mul_step	\i, \a, \b, \dst, \gfpoly, \t0, \t1, \t2
+.endr
+.endm
+
+// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the
+// *unreduced* products to \lo, \mi, and \hi.
+.macro	_ghash_mul_noreduce	a, b, lo, mi, hi, t0
+	vpclmulqdq	$0x00, \a, \b, \t0	// a_L * b_L
+	vpxor		\t0, \lo, \lo
+	vpclmulqdq	$0x01, \a, \b, \t0	// a_L * b_H
+	vpxor		\t0, \mi, \mi
+	vpclmulqdq	$0x10, \a, \b, \t0	// a_H * b_L
+	vpxor		\t0, \mi, \mi
+	vpclmulqdq	$0x11, \a, \b, \t0	// a_H * b_H
+	vpxor		\t0, \hi, \hi
+.endm
+
+// Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit
+// reduced products in \hi.  See _ghash_mul_step for explanation of reduction.
+.macro	_ghash_reduce	lo, mi, hi, gfpoly, t0
+	vpclmulqdq	$0x01, \lo, \gfpoly, \t0
+	vpshufd		$0x4e, \lo, \lo
+	vpxor		\lo, \mi, \mi
+	vpxor		\t0, \mi, \mi
+	vpclmulqdq	$0x01, \mi, \gfpoly, \t0
+	vpshufd		$0x4e, \mi, \mi
+	vpxor		\mi, \hi, \hi
+	vpxor		\t0, \hi, \hi
+.endm
+
+// This is a specialized version of _ghash_mul that computes \a * \a, i.e. it
+// squares \a.  It skips computing MI = (a_L * a_H) + (a_H * a_L) = 0.
+.macro	_ghash_square	a, dst, gfpoly, t0, t1
+	vpclmulqdq	$0x00, \a, \a, \t0	  // LO = a_L * a_L
+	vpclmulqdq	$0x11, \a, \a, \dst	  // HI = a_H * a_H
+	vpclmulqdq	$0x01, \t0, \gfpoly, \t1  // LO_L*(x^63 + x^62 + x^57)
+	vpshufd		$0x4e, \t0, \t0		  // Swap halves of LO
+	vpxor		\t0, \t1, \t1		  // Fold LO into MI
+	vpclmulqdq	$0x01, \t1, \gfpoly, \t0  // MI_L*(x^63 + x^62 + x^57)
+	vpshufd		$0x4e, \t1, \t1		  // Swap halves of MI
+	vpxor		\t1, \dst, \dst		  // Fold MI into HI (part 1)
+	vpxor		\t0, \dst, \dst		  // Fold MI into HI (part 2)
+.endm
+
+// void aes_gcm_precompute_vaes_avx2(struct aes_gcm_key_vaes_avx2 *key);
+//
+// Given the expanded AES key |key->base.aes_key|, derive the GHASH subkey and
+// initialize |key->h_powers| and |key->h_powers_xored|.
+//
+// We use h_powers[0..7] to store H^8 through H^1, and h_powers_xored[0..7] to
+// store the 64-bit halves of the key powers XOR'd together (for Karatsuba
+// multiplication) in the order 8,6,7,5,4,2,3,1.
+SYM_FUNC_START(aes_gcm_precompute_vaes_avx2)
+
+	// Function arguments
+	.set	KEY,		%rdi
+
+	// Additional local variables
+	.set	POWERS_PTR,	%rsi
+	.set	RNDKEYLAST_PTR,	%rdx
+	.set	TMP0,		%ymm0
+	.set	TMP0_XMM,	%xmm0
+	.set	TMP1,		%ymm1
+	.set	TMP1_XMM,	%xmm1
+	.set	TMP2,		%ymm2
+	.set	TMP2_XMM,	%xmm2
+	.set	H_CUR,		%ymm3
+	.set	H_CUR_XMM,	%xmm3
+	.set	H_CUR2,		%ymm4
+	.set	H_INC,		%ymm5
+	.set	H_INC_XMM,	%xmm5
+	.set	GFPOLY,		%ymm6
+	.set	GFPOLY_XMM,	%xmm6
+
+	// Encrypt an all-zeroes block to get the raw hash subkey.
+	movl		OFFSETOF_AESKEYLEN(KEY), %eax
+	lea		6*16(KEY,%rax,4), RNDKEYLAST_PTR
+	vmovdqu		(KEY), H_CUR_XMM  // Zero-th round key XOR all-zeroes block
+	lea		16(KEY), %rax
+1:
+	vaesenc		(%rax), H_CUR_XMM, H_CUR_XMM
+	add		$16, %rax
+	cmp		%rax, RNDKEYLAST_PTR
+	jne		1b
+	vaesenclast	(RNDKEYLAST_PTR), H_CUR_XMM, H_CUR_XMM
+
+	// Reflect the bytes of the raw hash subkey.
+	vpshufb		.Lbswap_mask(%rip), H_CUR_XMM, H_CUR_XMM
+
+	// Finish preprocessing the byte-reflected hash subkey by multiplying it
+	// by x^-1 ("standard" interpretation of polynomial coefficients) or
+	// equivalently x^1 (natural interpretation).  This gets the key into a
+	// format that avoids having to bit-reflect the data blocks later.
+	vpshufd		$0xd3, H_CUR_XMM, TMP0_XMM
+	vpsrad		$31, TMP0_XMM, TMP0_XMM
+	vpaddq		H_CUR_XMM, H_CUR_XMM, H_CUR_XMM
+	vpand		.Lgfpoly_and_internal_carrybit(%rip), TMP0_XMM, TMP0_XMM
+	vpxor		TMP0_XMM, H_CUR_XMM, H_CUR_XMM
+
+	// Load the gfpoly constant.
+	vbroadcasti128	.Lgfpoly(%rip), GFPOLY
+
+	// Square H^1 to get H^2.
+	_ghash_square	H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, TMP0_XMM, TMP1_XMM
+
+	// Create H_CUR = [H^2, H^1] and H_INC = [H^2, H^2].
+	vinserti128	$1, H_CUR_XMM, H_INC, H_CUR
+	vinserti128	$1, H_INC_XMM, H_INC, H_INC
+
+	// Compute H_CUR2 = [H^4, H^3].
+	_ghash_mul	H_INC, H_CUR, H_CUR2, GFPOLY, TMP0, TMP1, TMP2
+
+	// Store [H^2, H^1] and [H^4, H^3].
+	vmovdqu		H_CUR, OFFSETOF_H_POWERS+3*32(KEY)
+	vmovdqu		H_CUR2, OFFSETOF_H_POWERS+2*32(KEY)
+
+	// For Karatsuba multiplication: compute and store the two 64-bit halves
+	// of each key power XOR'd together.  Order is 4,2,3,1.
+	vpunpcklqdq	H_CUR, H_CUR2, TMP0
+	vpunpckhqdq	H_CUR, H_CUR2, TMP1
+	vpxor		TMP1, TMP0, TMP0
+	vmovdqu		TMP0, OFFSETOF_H_POWERS_XORED+32(KEY)
+
+	// Compute and store H_CUR = [H^6, H^5] and H_CUR2 = [H^8, H^7].
+	_ghash_mul	H_INC, H_CUR2, H_CUR, GFPOLY, TMP0, TMP1, TMP2
+	_ghash_mul	H_INC, H_CUR, H_CUR2, GFPOLY, TMP0, TMP1, TMP2
+	vmovdqu		H_CUR, OFFSETOF_H_POWERS+1*32(KEY)
+	vmovdqu		H_CUR2, OFFSETOF_H_POWERS+0*32(KEY)
+
+	// Again, compute and store the two 64-bit halves of each key power
+	// XOR'd together.  Order is 8,6,7,5.
+	vpunpcklqdq	H_CUR, H_CUR2, TMP0
+	vpunpckhqdq	H_CUR, H_CUR2, TMP1
+	vpxor		TMP1, TMP0, TMP0
+	vmovdqu		TMP0, OFFSETOF_H_POWERS_XORED(KEY)
+
+	vzeroupper
+	RET
+SYM_FUNC_END(aes_gcm_precompute_vaes_avx2)
+
+// Do one step of the GHASH update of four vectors of data blocks.
+//   \i: the step to do, 0 through 9
+//   \ghashdata_ptr: pointer to the data blocks (ciphertext or AAD)
+//   KEY: pointer to struct aes_gcm_key_vaes_avx2
+//   BSWAP_MASK: mask for reflecting the bytes of blocks
+//   H_POW[2-1]_XORED: cached values from KEY->h_powers_xored
+//   TMP[0-2]: temporary registers.  TMP[1-2] must be preserved across steps.
+//   LO, MI: working state for this macro that must be preserved across steps
+//   GHASH_ACC: the GHASH accumulator (input/output)
+.macro	_ghash_step_4x	i, ghashdata_ptr
+	.set		HI, GHASH_ACC # alias
+	.set		HI_XMM, GHASH_ACC_XMM
+.if \i == 0
+	// First vector
+	vmovdqu		0*32(\ghashdata_ptr), TMP1
+	vpshufb		BSWAP_MASK, TMP1, TMP1
+	vmovdqu		OFFSETOF_H_POWERS+0*32(KEY), TMP2
+	vpxor		GHASH_ACC, TMP1, TMP1
+	vpclmulqdq	$0x00, TMP2, TMP1, LO
+	vpclmulqdq	$0x11, TMP2, TMP1, HI
+	vpunpckhqdq	TMP1, TMP1, TMP0
+	vpxor		TMP1, TMP0, TMP0
+	vpclmulqdq	$0x00, H_POW2_XORED, TMP0, MI
+.elseif \i == 1
+.elseif \i == 2
+	// Second vector
+	vmovdqu		1*32(\ghashdata_ptr), TMP1
+	vpshufb		BSWAP_MASK, TMP1, TMP1
+	vmovdqu		OFFSETOF_H_POWERS+1*32(KEY), TMP2
+	vpclmulqdq	$0x00, TMP2, TMP1, TMP0
+	vpxor		TMP0, LO, LO
+	vpclmulqdq	$0x11, TMP2, TMP1, TMP0
+	vpxor		TMP0, HI, HI
+	vpunpckhqdq	TMP1, TMP1, TMP0
+	vpxor		TMP1, TMP0, TMP0
+	vpclmulqdq	$0x10, H_POW2_XORED, TMP0, TMP0
+	vpxor		TMP0, MI, MI
+.elseif \i == 3
+	// Third vector
+	vmovdqu		2*32(\ghashdata_ptr), TMP1
+	vpshufb		BSWAP_MASK, TMP1, TMP1
+	vmovdqu		OFFSETOF_H_POWERS+2*32(KEY), TMP2
+.elseif \i == 4
+	vpclmulqdq	$0x00, TMP2, TMP1, TMP0
+	vpxor		TMP0, LO, LO
+	vpclmulqdq	$0x11, TMP2, TMP1, TMP0
+	vpxor		TMP0, HI, HI
+.elseif \i == 5
+	vpunpckhqdq	TMP1, TMP1, TMP0
+	vpxor		TMP1, TMP0, TMP0
+	vpclmulqdq	$0x00, H_POW1_XORED, TMP0, TMP0
+	vpxor		TMP0, MI, MI
+
+	// Fourth vector
+	vmovdqu		3*32(\ghashdata_ptr), TMP1
+	vpshufb		BSWAP_MASK, TMP1, TMP1
+.elseif \i == 6
+	vmovdqu		OFFSETOF_H_POWERS+3*32(KEY), TMP2
+	vpclmulqdq	$0x00, TMP2, TMP1, TMP0
+	vpxor		TMP0, LO, LO
+	vpclmulqdq	$0x11, TMP2, TMP1, TMP0
+	vpxor		TMP0, HI, HI
+	vpunpckhqdq	TMP1, TMP1, TMP0
+	vpxor		TMP1, TMP0, TMP0
+	vpclmulqdq	$0x10, H_POW1_XORED, TMP0, TMP0
+	vpxor		TMP0, MI, MI
+.elseif \i == 7
+	// Finalize 'mi' following Karatsuba multiplication.
+	vpxor		LO, MI, MI
+	vpxor		HI, MI, MI
+
+	// Fold lo into mi.
+	vbroadcasti128	.Lgfpoly(%rip), TMP2
+	vpclmulqdq	$0x01, LO, TMP2, TMP0
+	vpshufd		$0x4e, LO, LO
+	vpxor		LO, MI, MI
+	vpxor		TMP0, MI, MI
+.elseif \i == 8
+	// Fold mi into hi.
+	vpclmulqdq	$0x01, MI, TMP2, TMP0
+	vpshufd		$0x4e, MI, MI
+	vpxor		MI, HI, HI
+	vpxor		TMP0, HI, HI
+.elseif \i == 9
+	vextracti128	$1, HI, TMP0_XMM
+	vpxor		TMP0_XMM, HI_XMM, GHASH_ACC_XMM
+.endif
+.endm
+
+// Update GHASH with four vectors of data blocks.  See _ghash_step_4x for full
+// explanation.
+.macro	_ghash_4x	ghashdata_ptr
+.irp i, 0,1,2,3,4,5,6,7,8,9
+	_ghash_step_4x	\i, \ghashdata_ptr
+.endr
+.endm
+
+// Load 1 <= %ecx <= 16 bytes from the pointer \src into the xmm register \dst
+// and zeroize any remaining bytes.  Clobbers %rax, %rcx, and \tmp{64,32}.
+.macro	_load_partial_block	src, dst, tmp64, tmp32
+	sub		$8, %ecx		// LEN - 8
+	jle		.Lle8\@
+
+	// Load 9 <= LEN <= 16 bytes.
+	vmovq		(\src), \dst		// Load first 8 bytes
+	mov		(\src, %rcx), %rax	// Load last 8 bytes
+	neg		%ecx
+	shl		$3, %ecx
+	shr		%cl, %rax		// Discard overlapping bytes
+	vpinsrq		$1, %rax, \dst, \dst
+	jmp		.Ldone\@
+
+.Lle8\@:
+	add		$4, %ecx		// LEN - 4
+	jl		.Llt4\@
+
+	// Load 4 <= LEN <= 8 bytes.
+	mov		(\src), %eax		// Load first 4 bytes
+	mov		(\src, %rcx), \tmp32	// Load last 4 bytes
+	jmp		.Lcombine\@
+
+.Llt4\@:
+	// Load 1 <= LEN <= 3 bytes.
+	add		$2, %ecx		// LEN - 2
+	movzbl		(\src), %eax		// Load first byte
+	jl		.Lmovq\@
+	movzwl		(\src, %rcx), \tmp32	// Load last 2 bytes
+.Lcombine\@:
+	shl		$3, %ecx
+	shl		%cl, \tmp64
+	or		\tmp64, %rax		// Combine the two parts
+.Lmovq\@:
+	vmovq		%rax, \dst
+.Ldone\@:
+.endm
+
+// Store 1 <= %ecx <= 16 bytes from the xmm register \src to the pointer \dst.
+// Clobbers %rax, %rcx, and \tmp{64,32}.
+.macro	_store_partial_block	src, dst, tmp64, tmp32
+	sub		$8, %ecx		// LEN - 8
+	jl		.Llt8\@
+
+	// Store 8 <= LEN <= 16 bytes.
+	vpextrq		$1, \src, %rax
+	mov		%ecx, \tmp32
+	shl		$3, %ecx
+	ror		%cl, %rax
+	mov		%rax, (\dst, \tmp64)	// Store last LEN - 8 bytes
+	vmovq		\src, (\dst)		// Store first 8 bytes
+	jmp		.Ldone\@
+
+.Llt8\@:
+	add		$4, %ecx		// LEN - 4
+	jl		.Llt4\@
+
+	// Store 4 <= LEN <= 7 bytes.
+	vpextrd		$1, \src, %eax
+	mov		%ecx, \tmp32
+	shl		$3, %ecx
+	ror		%cl, %eax
+	mov		%eax, (\dst, \tmp64)	// Store last LEN - 4 bytes
+	vmovd		\src, (\dst)		// Store first 4 bytes
+	jmp		.Ldone\@
+
+.Llt4\@:
+	// Store 1 <= LEN <= 3 bytes.
+	vpextrb		$0, \src, 0(\dst)
+	cmp		$-2, %ecx		// LEN - 4 == -2, i.e. LEN == 2?
+	jl		.Ldone\@
+	vpextrb		$1, \src, 1(\dst)
+	je		.Ldone\@
+	vpextrb		$2, \src, 2(\dst)
+.Ldone\@:
+.endm
+
+// void aes_gcm_aad_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+//				     u8 ghash_acc[16],
+//				     const u8 *aad, int aadlen);
+//
+// This function processes the AAD (Additional Authenticated Data) in GCM.
+// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
+// data given by |aad| and |aadlen|.  On the first call, |ghash_acc| must be all
+// zeroes.  |aadlen| must be a multiple of 16, except on the last call where it
+// can be any length.  The caller must do any buffering needed to ensure this.
+//
+// This handles large amounts of AAD efficiently, while also keeping overhead
+// low for small amounts which is the common case.  TLS and IPsec use less than
+// one block of AAD, but (uncommonly) other use cases may use much more.
+SYM_FUNC_START(aes_gcm_aad_update_vaes_avx2)
+
+	// Function arguments
+	.set	KEY,		%rdi
+	.set	GHASH_ACC_PTR,	%rsi
+	.set	AAD,		%rdx
+	.set	AADLEN,		%ecx	// Must be %ecx for _load_partial_block
+	.set	AADLEN64,	%rcx	// Zero-extend AADLEN before using!
+
+	// Additional local variables.
+	// %rax and %r8 are used as temporary registers.
+	.set	TMP0,		%ymm0
+	.set	TMP0_XMM,	%xmm0
+	.set	TMP1,		%ymm1
+	.set	TMP1_XMM,	%xmm1
+	.set	TMP2,		%ymm2
+	.set	TMP2_XMM,	%xmm2
+	.set	LO,		%ymm3
+	.set	LO_XMM,		%xmm3
+	.set	MI,		%ymm4
+	.set	MI_XMM,		%xmm4
+	.set	GHASH_ACC,	%ymm5
+	.set	GHASH_ACC_XMM,	%xmm5
+	.set	BSWAP_MASK,	%ymm6
+	.set	BSWAP_MASK_XMM,	%xmm6
+	.set	GFPOLY,		%ymm7
+	.set	GFPOLY_XMM,	%xmm7
+	.set	H_POW2_XORED,	%ymm8
+	.set	H_POW1_XORED,	%ymm9
+
+	// Load the bswap_mask and gfpoly constants.  Since AADLEN is usually
+	// small, usually only 128-bit vectors will be used.  So as an
+	// optimization, don't broadcast these constants to both 128-bit lanes
+	// quite yet.
+	vmovdqu		.Lbswap_mask(%rip), BSWAP_MASK_XMM
+	vmovdqu		.Lgfpoly(%rip), GFPOLY_XMM
+
+	// Load the GHASH accumulator.
+	vmovdqu		(GHASH_ACC_PTR), GHASH_ACC_XMM
+
+	// Check for the common case of AADLEN <= 16, as well as AADLEN == 0.
+	test		AADLEN, AADLEN
+	jz		.Laad_done
+	cmp		$16, AADLEN
+	jle		.Laad_lastblock
+
+	// AADLEN > 16, so we'll operate on full vectors.  Broadcast bswap_mask
+	// and gfpoly to both 128-bit lanes.
+	vinserti128	$1, BSWAP_MASK_XMM, BSWAP_MASK, BSWAP_MASK
+	vinserti128	$1, GFPOLY_XMM, GFPOLY, GFPOLY
+
+	// If AADLEN >= 128, update GHASH with 128 bytes of AAD at a time.
+	add		$-128, AADLEN	// 128 is 4 bytes, -128 is 1 byte
+	jl		.Laad_loop_4x_done
+	vmovdqu		OFFSETOF_H_POWERS_XORED(KEY), H_POW2_XORED
+	vmovdqu		OFFSETOF_H_POWERS_XORED+32(KEY), H_POW1_XORED
+.Laad_loop_4x:
+	_ghash_4x	AAD
+	sub		$-128, AAD
+	add		$-128, AADLEN
+	jge		.Laad_loop_4x
+.Laad_loop_4x_done:
+
+	// If AADLEN >= 32, update GHASH with 32 bytes of AAD at a time.
+	add		$96, AADLEN
+	jl		.Laad_loop_1x_done
+.Laad_loop_1x:
+	vmovdqu		(AAD), TMP0
+	vpshufb		BSWAP_MASK, TMP0, TMP0
+	vpxor		TMP0, GHASH_ACC, GHASH_ACC
+	vmovdqu		OFFSETOFEND_H_POWERS-32(KEY), TMP0
+	_ghash_mul	TMP0, GHASH_ACC, GHASH_ACC, GFPOLY, TMP1, TMP2, LO
+	vextracti128	$1, GHASH_ACC, TMP0_XMM
+	vpxor		TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+	add		$32, AAD
+	sub		$32, AADLEN
+	jge		.Laad_loop_1x
+.Laad_loop_1x_done:
+	add		$32, AADLEN
+	// Now 0 <= AADLEN < 32.
+
+	jz		.Laad_done
+	cmp		$16, AADLEN
+	jle		.Laad_lastblock
+
+	// Update GHASH with the remaining 17 <= AADLEN <= 31 bytes of AAD.
+	mov		AADLEN, AADLEN	// Zero-extend AADLEN to AADLEN64.
+	vmovdqu		(AAD), TMP0_XMM
+	vmovdqu		-16(AAD, AADLEN64), TMP1_XMM
+	vpshufb		BSWAP_MASK_XMM, TMP0_XMM, TMP0_XMM
+	vpxor		TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+	lea		.Lrshift_and_bswap_table(%rip), %rax
+	vpshufb		-16(%rax, AADLEN64), TMP1_XMM, TMP1_XMM
+	vinserti128	$1, TMP1_XMM, GHASH_ACC, GHASH_ACC
+	vmovdqu		OFFSETOFEND_H_POWERS-32(KEY), TMP0
+	_ghash_mul	TMP0, GHASH_ACC, GHASH_ACC, GFPOLY, TMP1, TMP2, LO
+	vextracti128	$1, GHASH_ACC, TMP0_XMM
+	vpxor		TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+	jmp		.Laad_done
+
+.Laad_lastblock:
+	// Update GHASH with the remaining 1 <= AADLEN <= 16 bytes of AAD.
+	_load_partial_block	AAD, TMP0_XMM, %r8, %r8d
+	vpshufb		BSWAP_MASK_XMM, TMP0_XMM, TMP0_XMM
+	vpxor		TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+	vmovdqu		OFFSETOFEND_H_POWERS-16(KEY), TMP0_XMM
+	_ghash_mul	TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM, GFPOLY_XMM, \
+			TMP1_XMM, TMP2_XMM, LO_XMM
+
+.Laad_done:
+	// Store the updated GHASH accumulator back to memory.
+	vmovdqu		GHASH_ACC_XMM, (GHASH_ACC_PTR)
+
+	vzeroupper
+	RET
+SYM_FUNC_END(aes_gcm_aad_update_vaes_avx2)
+
+// Do one non-last round of AES encryption on the blocks in the given AESDATA
+// vectors using the round key that has been broadcast to all 128-bit lanes of
+// \round_key.
+.macro	_vaesenc	round_key, vecs:vararg
+.irp i, \vecs
+	vaesenc		\round_key, AESDATA\i, AESDATA\i
+.endr
+.endm
+
+// Generate counter blocks in the given AESDATA vectors, then do the zero-th AES
+// round on them.  Clobbers TMP0.
+.macro	_ctr_begin	vecs:vararg
+	vbroadcasti128	.Linc_2blocks(%rip), TMP0
+.irp i, \vecs
+	vpshufb		BSWAP_MASK, LE_CTR, AESDATA\i
+	vpaddd		TMP0, LE_CTR, LE_CTR
+.endr
+.irp i, \vecs
+	vpxor		RNDKEY0, AESDATA\i, AESDATA\i
+.endr
+.endm
+
+// Generate and encrypt counter blocks in the given AESDATA vectors, excluding
+// the last AES round.  Clobbers %rax and TMP0.
+.macro	_aesenc_loop	vecs:vararg
+	_ctr_begin	\vecs
+	lea		16(KEY), %rax
+.Laesenc_loop\@:
+	vbroadcasti128	(%rax), TMP0
+	_vaesenc	TMP0, \vecs
+	add		$16, %rax
+	cmp		%rax, RNDKEYLAST_PTR
+	jne		.Laesenc_loop\@
+.endm
+
+// Finalize the keystream blocks in the given AESDATA vectors by doing the last
+// AES round, then XOR those keystream blocks with the corresponding data.
+// Reduce latency by doing the XOR before the vaesenclast, utilizing the
+// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).  Clobbers TMP0.
+.macro	_aesenclast_and_xor	vecs:vararg
+.irp i, \vecs
+	vpxor		\i*32(SRC), RNDKEYLAST, TMP0
+	vaesenclast	TMP0, AESDATA\i, AESDATA\i
+.endr
+.irp i, \vecs
+	vmovdqu		AESDATA\i, \i*32(DST)
+.endr
+.endm
+
+// void aes_gcm_{enc,dec}_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+//					   const u32 le_ctr[4], u8 ghash_acc[16],
+//					   const u8 *src, u8 *dst, int datalen);
+//
+// This macro generates a GCM encryption or decryption update function with the
+// above prototype (with \enc selecting which one).  The function computes the
+// next portion of the CTR keystream, XOR's it with |datalen| bytes from |src|,
+// and writes the resulting encrypted or decrypted data to |dst|.  It also
+// updates the GHASH accumulator |ghash_acc| using the next |datalen| ciphertext
+// bytes.
+//
+// |datalen| must be a multiple of 16, except on the last call where it can be
+// any length.  The caller must do any buffering needed to ensure this.  Both
+// in-place and out-of-place en/decryption are supported.
+//
+// |le_ctr| must give the current counter in little-endian format.  This
+// function loads the counter from |le_ctr| and increments the loaded counter as
+// needed, but it does *not* store the updated counter back to |le_ctr|.  The
+// caller must update |le_ctr| if any more data segments follow.  Internally,
+// only the low 32-bit word of the counter is incremented, following the GCM
+// standard.
+.macro	_aes_gcm_update	enc
+
+	// Function arguments
+	.set	KEY,		%rdi
+	.set	LE_CTR_PTR,	%rsi
+	.set	LE_CTR_PTR32,	%esi
+	.set	GHASH_ACC_PTR,	%rdx
+	.set	SRC,		%rcx	// Assumed to be %rcx.
+					// See .Ltail_xor_and_ghash_1to16bytes
+	.set	DST,		%r8
+	.set	DATALEN,	%r9d
+	.set	DATALEN64,	%r9	// Zero-extend DATALEN before using!
+
+	// Additional local variables
+
+	// %rax is used as a temporary register.  LE_CTR_PTR is also available
+	// as a temporary register after the counter is loaded.
+
+	// AES key length in bytes
+	.set	AESKEYLEN,	%r10d
+	.set	AESKEYLEN64,	%r10
+
+	// Pointer to the last AES round key for the chosen AES variant
+	.set	RNDKEYLAST_PTR,	%r11
+
+	// BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
+	// using vpshufb, copied to all 128-bit lanes.
+	.set	BSWAP_MASK,	%ymm0
+	.set	BSWAP_MASK_XMM,	%xmm0
+
+	// GHASH_ACC is the accumulator variable for GHASH.  When fully reduced,
+	// only the lowest 128-bit lane can be nonzero.  When not fully reduced,
+	// more than one lane may be used, and they need to be XOR'd together.
+	.set	GHASH_ACC,	%ymm1
+	.set	GHASH_ACC_XMM,	%xmm1
+
+	// TMP[0-2] are temporary registers.
+	.set	TMP0,		%ymm2
+	.set	TMP0_XMM,	%xmm2
+	.set	TMP1,		%ymm3
+	.set	TMP1_XMM,	%xmm3
+	.set	TMP2,		%ymm4
+	.set	TMP2_XMM,	%xmm4
+
+	// LO and MI are used to accumulate unreduced GHASH products.
+	.set	LO,		%ymm5
+	.set	LO_XMM,		%xmm5
+	.set	MI,		%ymm6
+	.set	MI_XMM,		%xmm6
+
+	// H_POW[2-1]_XORED contain cached values from KEY->h_powers_xored.  The
+	// descending numbering reflects the order of the key powers.
+	.set	H_POW2_XORED,	%ymm7
+	.set	H_POW2_XORED_XMM, %xmm7
+	.set	H_POW1_XORED,	%ymm8
+
+	// RNDKEY0 caches the zero-th round key, and RNDKEYLAST the last one.
+	.set	RNDKEY0,	%ymm9
+	.set	RNDKEYLAST,	%ymm10
+
+	// LE_CTR contains the next set of little-endian counter blocks.
+	.set	LE_CTR,		%ymm11
+
+	// AESDATA[0-3] hold the counter blocks that are being encrypted by AES.
+	.set	AESDATA0,	%ymm12
+	.set	AESDATA0_XMM,	%xmm12
+	.set	AESDATA1,	%ymm13
+	.set	AESDATA1_XMM,	%xmm13
+	.set	AESDATA2,	%ymm14
+	.set	AESDATA3,	%ymm15
+
+.if \enc
+	.set	GHASHDATA_PTR,	DST
+.else
+	.set	GHASHDATA_PTR,	SRC
+.endif
+
+	vbroadcasti128	.Lbswap_mask(%rip), BSWAP_MASK
+
+	// Load the GHASH accumulator and the starting counter.
+	vmovdqu		(GHASH_ACC_PTR), GHASH_ACC_XMM
+	vbroadcasti128	(LE_CTR_PTR), LE_CTR
+
+	// Load the AES key length in bytes.
+	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
+
+	// Make RNDKEYLAST_PTR point to the last AES round key.  This is the
+	// round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256
+	// respectively.  Then load the zero-th and last round keys.
+	lea		6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR
+	vbroadcasti128	(KEY), RNDKEY0
+	vbroadcasti128	(RNDKEYLAST_PTR), RNDKEYLAST
+
+	// Finish initializing LE_CTR by adding 1 to the second block.
+	vpaddd		.Lctr_pattern(%rip), LE_CTR, LE_CTR
+
+	// If there are at least 128 bytes of data, then continue into the loop
+	// that processes 128 bytes of data at a time.  Otherwise skip it.
+	add		$-128, DATALEN	// 128 is 4 bytes, -128 is 1 byte
+	jl		.Lcrypt_loop_4x_done\@
+
+	vmovdqu		OFFSETOF_H_POWERS_XORED(KEY), H_POW2_XORED
+	vmovdqu		OFFSETOF_H_POWERS_XORED+32(KEY), H_POW1_XORED
+
+	// Main loop: en/decrypt and hash 4 vectors (128 bytes) at a time.
+
+.if \enc
+	// Encrypt the first 4 vectors of plaintext blocks.
+	_aesenc_loop	0,1,2,3
+	_aesenclast_and_xor	0,1,2,3
+	sub		$-128, SRC	// 128 is 4 bytes, -128 is 1 byte
+	add		$-128, DATALEN
+	jl		.Lghash_last_ciphertext_4x\@
+.endif
+
+.align 16
+.Lcrypt_loop_4x\@:
+
+	// Start the AES encryption of the counter blocks.
+	_ctr_begin	0,1,2,3
+	cmp		$24, AESKEYLEN
+	jl		128f	// AES-128?
+	je		192f	// AES-192?
+	// AES-256
+	vbroadcasti128	-13*16(RNDKEYLAST_PTR), TMP0
+	_vaesenc	TMP0, 0,1,2,3
+	vbroadcasti128	-12*16(RNDKEYLAST_PTR), TMP0
+	_vaesenc	TMP0, 0,1,2,3
+192:
+	vbroadcasti128	-11*16(RNDKEYLAST_PTR), TMP0
+	_vaesenc	TMP0, 0,1,2,3
+	vbroadcasti128	-10*16(RNDKEYLAST_PTR), TMP0
+	_vaesenc	TMP0, 0,1,2,3
+128:
+
+	// Finish the AES encryption of the counter blocks in AESDATA[0-3],
+	// interleaved with the GHASH update of the ciphertext blocks.
+.irp i, 9,8,7,6,5,4,3,2,1
+	_ghash_step_4x  (9 - \i), GHASHDATA_PTR
+	vbroadcasti128	-\i*16(RNDKEYLAST_PTR), TMP0
+	_vaesenc	TMP0, 0,1,2,3
+.endr
+	_ghash_step_4x	9, GHASHDATA_PTR
+.if \enc
+	sub		$-128, DST	// 128 is 4 bytes, -128 is 1 byte
+.endif
+	_aesenclast_and_xor	0,1,2,3
+	sub		$-128, SRC
+.if !\enc
+	sub		$-128, DST
+.endif
+	add		$-128, DATALEN
+	jge		.Lcrypt_loop_4x\@
+
+.if \enc
+.Lghash_last_ciphertext_4x\@:
+	// Update GHASH with the last set of ciphertext blocks.
+	_ghash_4x	DST
+	sub		$-128, DST
+.endif
+
+.Lcrypt_loop_4x_done\@:
+
+	// Undo the extra subtraction by 128 and check whether data remains.
+	sub		$-128, DATALEN	// 128 is 4 bytes, -128 is 1 byte
+	jz		.Ldone\@
+
+	// The data length isn't a multiple of 128 bytes.  Process the remaining
+	// data of length 1 <= DATALEN < 128.
+	//
+	// Since there are enough key powers available for all remaining data,
+	// there is no need to do a GHASH reduction after each iteration.
+	// Instead, multiply each remaining block by its own key power, and only
+	// do a GHASH reduction at the very end.
+
+	// Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N
+	// is the number of blocks that remain.
+	.set		POWERS_PTR, LE_CTR_PTR	// LE_CTR_PTR is free to be reused.
+	.set		POWERS_PTR32, LE_CTR_PTR32
+	mov		DATALEN, %eax
+	neg		%rax
+	and		$~15, %rax  // -round_up(DATALEN, 16)
+	lea		OFFSETOFEND_H_POWERS(KEY,%rax), POWERS_PTR
+
+	// Start collecting the unreduced GHASH intermediate value LO, MI, HI.
+	.set		HI, H_POW2_XORED	// H_POW2_XORED is free to be reused.
+	.set		HI_XMM, H_POW2_XORED_XMM
+	vpxor		LO_XMM, LO_XMM, LO_XMM
+	vpxor		MI_XMM, MI_XMM, MI_XMM
+	vpxor		HI_XMM, HI_XMM, HI_XMM
+
+	// 1 <= DATALEN < 128.  Generate 2 or 4 more vectors of keystream blocks
+	// excluding the last AES round, depending on the remaining DATALEN.
+	cmp		$64, DATALEN
+	jg		.Ltail_gen_4_keystream_vecs\@
+	_aesenc_loop	0,1
+	cmp		$32, DATALEN
+	jge		.Ltail_xor_and_ghash_full_vec_loop\@
+	jmp		.Ltail_xor_and_ghash_partial_vec\@
+.Ltail_gen_4_keystream_vecs\@:
+	_aesenc_loop	0,1,2,3
+
+	// XOR the remaining data and accumulate the unreduced GHASH products
+	// for DATALEN >= 32, starting with one full 32-byte vector at a time.
+.Ltail_xor_and_ghash_full_vec_loop\@:
+.if \enc
+	_aesenclast_and_xor	0
+	vpshufb		BSWAP_MASK, AESDATA0, AESDATA0
+.else
+	vmovdqu		(SRC), TMP1
+	vpxor		TMP1, RNDKEYLAST, TMP0
+	vaesenclast	TMP0, AESDATA0, AESDATA0
+	vmovdqu		AESDATA0, (DST)
+	vpshufb		BSWAP_MASK, TMP1, AESDATA0
+.endif
+	// The ciphertext blocks (i.e. GHASH input data) are now in AESDATA0.
+	vpxor		GHASH_ACC, AESDATA0, AESDATA0
+	vmovdqu		(POWERS_PTR), TMP2
+	_ghash_mul_noreduce	TMP2, AESDATA0, LO, MI, HI, TMP0
+	vmovdqa		AESDATA1, AESDATA0
+	vmovdqa		AESDATA2, AESDATA1
+	vmovdqa		AESDATA3, AESDATA2
+	vpxor		GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+	add		$32, SRC
+	add		$32, DST
+	add		$32, POWERS_PTR
+	sub		$32, DATALEN
+	cmp		$32, DATALEN
+	jge		.Ltail_xor_and_ghash_full_vec_loop\@
+	test		DATALEN, DATALEN
+	jz		.Ltail_ghash_reduce\@
+
+.Ltail_xor_and_ghash_partial_vec\@:
+	// XOR the remaining data and accumulate the unreduced GHASH products,
+	// for 1 <= DATALEN < 32.
+	vaesenclast	RNDKEYLAST, AESDATA0, AESDATA0
+	cmp		$16, DATALEN
+	jle		.Ltail_xor_and_ghash_1to16bytes\@
+
+	// Handle 17 <= DATALEN < 32.
+
+	// Load a vpshufb mask that will right-shift by '32 - DATALEN' bytes
+	// (shifting in zeroes), then reflect all 16 bytes.
+	lea		.Lrshift_and_bswap_table(%rip), %rax
+	vmovdqu		-16(%rax, DATALEN64), TMP2_XMM
+
+	// Move the second keystream block to its own register and left-align it
+	vextracti128	$1, AESDATA0, AESDATA1_XMM
+	vpxor		.Lfifteens(%rip), TMP2_XMM, TMP0_XMM
+	vpshufb		TMP0_XMM, AESDATA1_XMM, AESDATA1_XMM
+
+	// Using overlapping loads and stores, XOR the source data with the
+	// keystream and write the destination data.  Then prepare the GHASH
+	// input data: the full ciphertext block and the zero-padded partial
+	// ciphertext block, both byte-reflected, in AESDATA0.
+.if \enc
+	vpxor		-16(SRC, DATALEN64), AESDATA1_XMM, AESDATA1_XMM
+	vpxor		(SRC), AESDATA0_XMM, AESDATA0_XMM
+	vmovdqu		AESDATA1_XMM, -16(DST, DATALEN64)
+	vmovdqu		AESDATA0_XMM, (DST)
+	vpshufb		TMP2_XMM, AESDATA1_XMM, AESDATA1_XMM
+	vpshufb		BSWAP_MASK_XMM, AESDATA0_XMM, AESDATA0_XMM
+.else
+	vmovdqu		-16(SRC, DATALEN64), TMP1_XMM
+	vmovdqu		(SRC), TMP0_XMM
+	vpxor		TMP1_XMM, AESDATA1_XMM, AESDATA1_XMM
+	vpxor		TMP0_XMM, AESDATA0_XMM, AESDATA0_XMM
+	vmovdqu		AESDATA1_XMM, -16(DST, DATALEN64)
+	vmovdqu		AESDATA0_XMM, (DST)
+	vpshufb		TMP2_XMM, TMP1_XMM, AESDATA1_XMM
+	vpshufb		BSWAP_MASK_XMM, TMP0_XMM, AESDATA0_XMM
+.endif
+	vpxor		GHASH_ACC_XMM, AESDATA0_XMM, AESDATA0_XMM
+	vinserti128	$1, AESDATA1_XMM, AESDATA0, AESDATA0
+	vmovdqu		(POWERS_PTR), TMP2
+	jmp		.Ltail_ghash_last_vec\@
+
+.Ltail_xor_and_ghash_1to16bytes\@:
+	// Handle 1 <= DATALEN <= 16.  Carefully load and store the
+	// possibly-partial block, which we mustn't access out of bounds.
+	vmovdqu		(POWERS_PTR), TMP2_XMM
+	mov		SRC, KEY	// Free up %rcx, assuming SRC == %rcx
+	mov		DATALEN, %ecx
+	_load_partial_block	KEY, TMP0_XMM, POWERS_PTR, POWERS_PTR32
+	vpxor		TMP0_XMM, AESDATA0_XMM, AESDATA0_XMM
+	mov		DATALEN, %ecx
+	_store_partial_block	AESDATA0_XMM, DST, POWERS_PTR, POWERS_PTR32
+.if \enc
+	lea		.Lselect_high_bytes_table(%rip), %rax
+	vpshufb		BSWAP_MASK_XMM, AESDATA0_XMM, AESDATA0_XMM
+	vpand		(%rax, DATALEN64), AESDATA0_XMM, AESDATA0_XMM
+.else
+	vpshufb		BSWAP_MASK_XMM, TMP0_XMM, AESDATA0_XMM
+.endif
+	vpxor		GHASH_ACC_XMM, AESDATA0_XMM, AESDATA0_XMM
+
+.Ltail_ghash_last_vec\@:
+	// Accumulate the unreduced GHASH products for the last 1-2 blocks.  The
+	// GHASH input data is in AESDATA0.  If only one block remains, then the
+	// second block in AESDATA0 is zero and does not affect the result.
+	_ghash_mul_noreduce	TMP2, AESDATA0, LO, MI, HI, TMP0
+
+.Ltail_ghash_reduce\@:
+	// Finally, do the GHASH reduction.
+	vbroadcasti128	.Lgfpoly(%rip), TMP0
+	_ghash_reduce	LO, MI, HI, TMP0, TMP1
+	vextracti128	$1, HI, GHASH_ACC_XMM
+	vpxor		HI_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+
+.Ldone\@:
+	// Store the updated GHASH accumulator back to memory.
+	vmovdqu		GHASH_ACC_XMM, (GHASH_ACC_PTR)
+
+	vzeroupper
+	RET
+.endm
+
+// void aes_gcm_enc_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+//				    const u32 le_ctr[4], u8 ghash_acc[16],
+//				    u64 total_aadlen, u64 total_datalen);
+// bool aes_gcm_dec_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+//				    const u32 le_ctr[4], const u8 ghash_acc[16],
+//				    u64 total_aadlen, u64 total_datalen,
+//				    const u8 tag[16], int taglen);
+//
+// This macro generates one of the above two functions (with \enc selecting
+// which one).  Both functions finish computing the GCM authentication tag by
+// updating GHASH with the lengths block and encrypting the GHASH accumulator.
+// |total_aadlen| and |total_datalen| must be the total length of the additional
+// authenticated data and the en/decrypted data in bytes, respectively.
+//
+// The encryption function then stores the full-length (16-byte) computed
+// authentication tag to |ghash_acc|.  The decryption function instead loads the
+// expected authentication tag (the one that was transmitted) from the 16-byte
+// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
+// computed tag in constant time, and returns true if and only if they match.
+.macro	_aes_gcm_final	enc
+
+	// Function arguments
+	.set	KEY,		%rdi
+	.set	LE_CTR_PTR,	%rsi
+	.set	GHASH_ACC_PTR,	%rdx
+	.set	TOTAL_AADLEN,	%rcx
+	.set	TOTAL_DATALEN,	%r8
+	.set	TAG,		%r9
+	.set	TAGLEN,		%r10d	// Originally at 8(%rsp)
+	.set	TAGLEN64,	%r10
+
+	// Additional local variables.
+	// %rax and %xmm0-%xmm3 are used as temporary registers.
+	.set	AESKEYLEN,	%r11d
+	.set	AESKEYLEN64,	%r11
+	.set	GFPOLY,		%xmm4
+	.set	BSWAP_MASK,	%xmm5
+	.set	LE_CTR,		%xmm6
+	.set	GHASH_ACC,	%xmm7
+	.set	H_POW1,		%xmm8
+
+	// Load some constants.
+	vmovdqa		.Lgfpoly(%rip), GFPOLY
+	vmovdqa		.Lbswap_mask(%rip), BSWAP_MASK
+
+	// Load the AES key length in bytes.
+	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
+
+	// Set up a counter block with 1 in the low 32-bit word.  This is the
+	// counter that produces the ciphertext needed to encrypt the auth tag.
+	// GFPOLY has 1 in the low word, so grab the 1 from there using a blend.
+	vpblendd	$0xe, (LE_CTR_PTR), GFPOLY, LE_CTR
+
+	// Build the lengths block and XOR it with the GHASH accumulator.
+	// Although the lengths block is defined as the AAD length followed by
+	// the en/decrypted data length, both in big-endian byte order, a byte
+	// reflection of the full block is needed because of the way we compute
+	// GHASH (see _ghash_mul_step).  By using little-endian values in the
+	// opposite order, we avoid having to reflect any bytes here.
+	vmovq		TOTAL_DATALEN, %xmm0
+	vpinsrq		$1, TOTAL_AADLEN, %xmm0, %xmm0
+	vpsllq		$3, %xmm0, %xmm0	// Bytes to bits
+	vpxor		(GHASH_ACC_PTR), %xmm0, GHASH_ACC
+
+	// Load the first hash key power (H^1), which is stored last.
+	vmovdqu		OFFSETOFEND_H_POWERS-16(KEY), H_POW1
+
+	// Load TAGLEN if decrypting.
+.if !\enc
+	movl		8(%rsp), TAGLEN
+.endif
+
+	// Make %rax point to the last AES round key for the chosen AES variant.
+	lea		6*16(KEY,AESKEYLEN64,4), %rax
+
+	// Start the AES encryption of the counter block by swapping the counter
+	// block to big-endian and XOR-ing it with the zero-th AES round key.
+	vpshufb		BSWAP_MASK, LE_CTR, %xmm0
+	vpxor		(KEY), %xmm0, %xmm0
+
+	// Complete the AES encryption and multiply GHASH_ACC by H^1.
+	// Interleave the AES and GHASH instructions to improve performance.
+	cmp		$24, AESKEYLEN
+	jl		128f	// AES-128?
+	je		192f	// AES-192?
+	// AES-256
+	vaesenc		-13*16(%rax), %xmm0, %xmm0
+	vaesenc		-12*16(%rax), %xmm0, %xmm0
+192:
+	vaesenc		-11*16(%rax), %xmm0, %xmm0
+	vaesenc		-10*16(%rax), %xmm0, %xmm0
+128:
+.irp i, 0,1,2,3,4,5,6,7,8
+	_ghash_mul_step	\i, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+			%xmm1, %xmm2, %xmm3
+	vaesenc		(\i-9)*16(%rax), %xmm0, %xmm0
+.endr
+	_ghash_mul_step	9, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+			%xmm1, %xmm2, %xmm3
+
+	// Undo the byte reflection of the GHASH accumulator.
+	vpshufb		BSWAP_MASK, GHASH_ACC, GHASH_ACC
+
+	// Do the last AES round and XOR the resulting keystream block with the
+	// GHASH accumulator to produce the full computed authentication tag.
+	//
+	// Reduce latency by taking advantage of the property vaesenclast(key,
+	// a) ^ b == vaesenclast(key ^ b, a).  I.e., XOR GHASH_ACC into the last
+	// round key, instead of XOR'ing the final AES output with GHASH_ACC.
+	//
+	// enc_final then returns the computed auth tag, while dec_final
+	// compares it with the transmitted one and returns a bool.  To compare
+	// the tags, dec_final XORs them together and uses vptest to check
+	// whether the result is all-zeroes.  This should be constant-time.
+	// dec_final applies the vaesenclast optimization to this additional
+	// value XOR'd too.
+.if \enc
+	vpxor		(%rax), GHASH_ACC, %xmm1
+	vaesenclast	%xmm1, %xmm0, GHASH_ACC
+	vmovdqu		GHASH_ACC, (GHASH_ACC_PTR)
+.else
+	vpxor		(TAG), GHASH_ACC, GHASH_ACC
+	vpxor		(%rax), GHASH_ACC, GHASH_ACC
+	vaesenclast	GHASH_ACC, %xmm0, %xmm0
+	lea		.Lselect_high_bytes_table(%rip), %rax
+	vmovdqu		(%rax, TAGLEN64), %xmm1
+	vpshufb		BSWAP_MASK, %xmm1, %xmm1 // select low bytes, not high
+	xor		%eax, %eax
+	vptest		%xmm1, %xmm0
+	sete		%al
+.endif
+	// No need for vzeroupper here, since only used xmm registers were used.
+	RET
+.endm
+
+SYM_FUNC_START(aes_gcm_enc_update_vaes_avx2)
+	_aes_gcm_update	1
+SYM_FUNC_END(aes_gcm_enc_update_vaes_avx2)
+SYM_FUNC_START(aes_gcm_dec_update_vaes_avx2)
+	_aes_gcm_update	0
+SYM_FUNC_END(aes_gcm_dec_update_vaes_avx2)
+
+SYM_FUNC_START(aes_gcm_enc_final_vaes_avx2)
+	_aes_gcm_final	1
+SYM_FUNC_END(aes_gcm_enc_final_vaes_avx2)
+SYM_FUNC_START(aes_gcm_dec_final_vaes_avx2)
+	_aes_gcm_final	0
+SYM_FUNC_END(aes_gcm_dec_final_vaes_avx2)
diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-vaes-avx512.S
index 02ee11083d4f..06b71314d65c 100644
--- a/arch/x86/crypto/aes-gcm-avx10-x86_64.S
+++ b/arch/x86/crypto/aes-gcm-vaes-avx512.S
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
 //
-// VAES and VPCLMULQDQ optimized AES-GCM for x86_64
+// AES-GCM implementation for x86_64 CPUs that support the following CPU
+// features: VAES && VPCLMULQDQ && AVX512BW && AVX512VL && BMI2
 //
 // Copyright 2024 Google LLC
 //
@@ -45,41 +46,6 @@
 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 // POSSIBILITY OF SUCH DAMAGE.
-//
-//------------------------------------------------------------------------------
-//
-// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
-// support VAES (vector AES), VPCLMULQDQ (vector carryless multiplication), and
-// either AVX512 or AVX10.  Some of the functions, notably the encryption and
-// decryption update functions which are the most performance-critical, are
-// provided in two variants generated from a macro: one using 256-bit vectors
-// (suffix: vaes_avx10_256) and one using 512-bit vectors (vaes_avx10_512).  The
-// other, "shared" functions (vaes_avx10) use at most 256-bit vectors.
-//
-// The functions that use 512-bit vectors are intended for CPUs that support
-// 512-bit vectors *and* where using them doesn't cause significant
-// downclocking.  They require the following CPU features:
-//
-//	VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/512)
-//
-// The other functions require the following CPU features:
-//
-//	VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/256)
-//
-// All functions use the "System V" ABI.  The Windows ABI is not supported.
-//
-// Note that we use "avx10" in the names of the functions as a shorthand to
-// really mean "AVX10 or a certain set of AVX512 features".  Due to Intel's
-// introduction of AVX512 and then its replacement by AVX10, there doesn't seem
-// to be a simple way to name things that makes sense on all CPUs.
-//
-// Note that the macros that support both 256-bit and 512-bit vectors could
-// fairly easily be changed to support 128-bit too.  However, this would *not*
-// be sufficient to allow the code to run on CPUs without AVX512 or AVX10,
-// because the code heavily uses several features of these extensions other than
-// the vector length: the increase in the number of SIMD registers from 16 to
-// 32, masking support, and new instructions such as vpternlogd (which can do a
-// three-argument XOR).  These features are very useful for AES-GCM.
 
 #include <linux/linkage.h>
 
@@ -104,16 +70,14 @@
 .Lgfpoly_and_internal_carrybit:
 	.octa	0xc2000000000000010000000000000001
 
-	// The below constants are used for incrementing the counter blocks.
-	// ctr_pattern points to the four 128-bit values [0, 1, 2, 3].
-	// inc_2blocks and inc_4blocks point to the single 128-bit values 2 and
-	// 4.  Note that the same '2' is reused in ctr_pattern and inc_2blocks.
+	// Values needed to prepare the initial vector of counter blocks.
 .Lctr_pattern:
 	.octa	0
 	.octa	1
-.Linc_2blocks:
 	.octa	2
 	.octa	3
+
+	// The number of AES blocks per vector, as a 128-bit value.
 .Linc_4blocks:
 	.octa	4
 
@@ -130,29 +94,13 @@
 // Offset to end of hash key powers array in the key struct.
 //
 // This is immediately followed by three zeroized padding blocks, which are
-// included so that partial vectors can be handled more easily.  E.g. if VL=64
-// and two blocks remain, we load the 4 values [H^2, H^1, 0, 0].  The most
-// padding blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded.
+// included so that partial vectors can be handled more easily.  E.g. if two
+// blocks remain, we load the 4 values [H^2, H^1, 0, 0].  The most padding
+// blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded.
 #define OFFSETOFEND_H_POWERS	(OFFSETOF_H_POWERS + (NUM_H_POWERS * 16))
 
 .text
 
-// Set the vector length in bytes.  This sets the VL variable and defines
-// register aliases V0-V31 that map to the ymm or zmm registers.
-.macro	_set_veclen	vl
-	.set	VL,	\vl
-.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
-	16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
-.if VL == 32
-	.set	V\i,	%ymm\i
-.elseif VL == 64
-	.set	V\i,	%zmm\i
-.else
-	.error "Unsupported vector length"
-.endif
-.endr
-.endm
-
 // The _ghash_mul_step macro does one step of GHASH multiplication of the
 // 128-bit lanes of \a by the corresponding 128-bit lanes of \b and storing the
 // reduced products in \dst.  \t0, \t1, and \t2 are temporary registers of the
@@ -312,39 +260,44 @@
 	vpternlogd	$0x96, \t0, \mi, \hi
 .endm
 
-// void aes_gcm_precompute_##suffix(struct aes_gcm_key_avx10 *key);
-//
-// Given the expanded AES key |key->aes_key|, this function derives the GHASH
-// subkey and initializes |key->ghash_key_powers| with powers of it.
-//
-// The number of key powers initialized is NUM_H_POWERS, and they are stored in
-// the order H^NUM_H_POWERS to H^1.  The zeroized padding blocks after the key
-// powers themselves are also initialized.
+// This is a specialized version of _ghash_mul that computes \a * \a, i.e. it
+// squares \a.  It skips computing MI = (a_L * a_H) + (a_H * a_L) = 0.
+.macro	_ghash_square	a, dst, gfpoly, t0, t1
+	vpclmulqdq	$0x00, \a, \a, \t0	  // LO = a_L * a_L
+	vpclmulqdq	$0x11, \a, \a, \dst	  // HI = a_H * a_H
+	vpclmulqdq	$0x01, \t0, \gfpoly, \t1  // LO_L*(x^63 + x^62 + x^57)
+	vpshufd		$0x4e, \t0, \t0		  // Swap halves of LO
+	vpxord		\t0, \t1, \t1		  // Fold LO into MI
+	vpclmulqdq	$0x01, \t1, \gfpoly, \t0  // MI_L*(x^63 + x^62 + x^57)
+	vpshufd		$0x4e, \t1, \t1		  // Swap halves of MI
+	vpternlogd	$0x96, \t0, \t1, \dst	  // Fold MI into HI
+.endm
+
+// void aes_gcm_precompute_vaes_avx512(struct aes_gcm_key_vaes_avx512 *key);
 //
-// This macro supports both VL=32 and VL=64.  _set_veclen must have been invoked
-// with the desired length.  In the VL=32 case, the function computes twice as
-// many key powers than are actually used by the VL=32 GCM update functions.
-// This is done to keep the key format the same regardless of vector length.
-.macro	_aes_gcm_precompute
+// Given the expanded AES key |key->base.aes_key|, derive the GHASH subkey and
+// initialize |key->h_powers| and |key->padding|.
+SYM_FUNC_START(aes_gcm_precompute_vaes_avx512)
 
 	// Function arguments
 	.set	KEY,		%rdi
 
-	// Additional local variables.  V0-V2 and %rax are used as temporaries.
+	// Additional local variables.
+	// %zmm[0-2] and %rax are used as temporaries.
 	.set	POWERS_PTR,	%rsi
 	.set	RNDKEYLAST_PTR,	%rdx
-	.set	H_CUR,		V3
+	.set	H_CUR,		%zmm3
 	.set	H_CUR_YMM,	%ymm3
 	.set	H_CUR_XMM,	%xmm3
-	.set	H_INC,		V4
+	.set	H_INC,		%zmm4
 	.set	H_INC_YMM,	%ymm4
 	.set	H_INC_XMM,	%xmm4
-	.set	GFPOLY,		V5
+	.set	GFPOLY,		%zmm5
 	.set	GFPOLY_YMM,	%ymm5
 	.set	GFPOLY_XMM,	%xmm5
 
 	// Get pointer to lowest set of key powers (located at end of array).
-	lea		OFFSETOFEND_H_POWERS-VL(KEY), POWERS_PTR
+	lea		OFFSETOFEND_H_POWERS-64(KEY), POWERS_PTR
 
 	// Encrypt an all-zeroes block to get the raw hash subkey.
 	movl		OFFSETOF_AESKEYLEN(KEY), %eax
@@ -363,8 +316,8 @@
 
 	// Zeroize the padding blocks.
 	vpxor		%xmm0, %xmm0, %xmm0
-	vmovdqu		%ymm0, VL(POWERS_PTR)
-	vmovdqu		%xmm0, VL+2*16(POWERS_PTR)
+	vmovdqu		%ymm0, 64(POWERS_PTR)
+	vmovdqu		%xmm0, 64+2*16(POWERS_PTR)
 
 	// Finish preprocessing the first key power, H^1.  Since this GHASH
 	// implementation operates directly on values with the backwards bit
@@ -397,54 +350,44 @@
 	// special needs to be done to make this happen, though: H^1 * H^1 would
 	// end up with two factors of x^-1, but the multiplication consumes one.
 	// So the product H^2 ends up with the desired one factor of x^-1.
-	_ghash_mul	H_CUR_XMM, H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, \
-			%xmm0, %xmm1, %xmm2
+	_ghash_square	H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, %xmm0, %xmm1
 
 	// Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2].
 	vinserti128	$1, H_CUR_XMM, H_INC_YMM, H_CUR_YMM
 	vinserti128	$1, H_INC_XMM, H_INC_YMM, H_INC_YMM
 
-.if VL == 64
 	// Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4].
 	_ghash_mul	H_INC_YMM, H_CUR_YMM, H_INC_YMM, GFPOLY_YMM, \
 			%ymm0, %ymm1, %ymm2
 	vinserti64x4	$1, H_CUR_YMM, H_INC, H_CUR
 	vshufi64x2	$0, H_INC, H_INC, H_INC
-.endif
 
 	// Store the lowest set of key powers.
 	vmovdqu8	H_CUR, (POWERS_PTR)
 
-	// Compute and store the remaining key powers.  With VL=32, repeatedly
-	// multiply [H^(i+1), H^i] by [H^2, H^2] to get [H^(i+3), H^(i+2)].
-	// With VL=64, repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by
+	// Compute and store the remaining key powers.
+	// Repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by
 	// [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)].
-	mov		$(NUM_H_POWERS*16/VL) - 1, %eax
-.Lprecompute_next\@:
-	sub		$VL, POWERS_PTR
-	_ghash_mul	H_INC, H_CUR, H_CUR, GFPOLY, V0, V1, V2
+	mov		$3, %eax
+.Lprecompute_next:
+	sub		$64, POWERS_PTR
+	_ghash_mul	H_INC, H_CUR, H_CUR, GFPOLY, %zmm0, %zmm1, %zmm2
 	vmovdqu8	H_CUR, (POWERS_PTR)
 	dec		%eax
-	jnz		.Lprecompute_next\@
+	jnz		.Lprecompute_next
 
 	vzeroupper	// This is needed after using ymm or zmm registers.
 	RET
-.endm
+SYM_FUNC_END(aes_gcm_precompute_vaes_avx512)
 
 // XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store
 // the result in \dst_xmm.  This implicitly zeroizes the other lanes of dst.
 .macro	_horizontal_xor	src, src_xmm, dst_xmm, t0_xmm, t1_xmm, t2_xmm
 	vextracti32x4	$1, \src, \t0_xmm
-.if VL == 32
-	vpxord		\t0_xmm, \src_xmm, \dst_xmm
-.elseif VL == 64
 	vextracti32x4	$2, \src, \t1_xmm
 	vextracti32x4	$3, \src, \t2_xmm
 	vpxord		\t0_xmm, \src_xmm, \dst_xmm
 	vpternlogd	$0x96, \t1_xmm, \t2_xmm, \dst_xmm
-.else
-	.error "Unsupported vector length"
-.endif
 .endm
 
 // Do one step of the GHASH update of the data blocks given in the vector
@@ -458,25 +401,21 @@
 //
 // The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) +
 // H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the
-// operations are vectorized operations on vectors of 16-byte blocks.  E.g.,
-// with VL=32 there are 2 blocks per vector and the vectorized terms correspond
-// to the following non-vectorized terms:
-//
-//	H_POW4*(GHASHDATA0 + GHASH_ACC) => H^8*(blk0 + GHASH_ACC_XMM) and H^7*(blk1 + 0)
-//	H_POW3*GHASHDATA1 => H^6*blk2 and H^5*blk3
-//	H_POW2*GHASHDATA2 => H^4*blk4 and H^3*blk5
-//	H_POW1*GHASHDATA3 => H^2*blk6 and H^1*blk7
+// operations are vectorized operations on 512-bit vectors of 128-bit blocks.
+// The vectorized terms correspond to the following non-vectorized terms:
 //
-// With VL=64, we use 4 blocks/vector, H^16 through H^1, and blk0 through blk15.
+//       H_POW4*(GHASHDATA0 + GHASH_ACC) => H^16*(blk0 + GHASH_ACC_XMM),
+//              H^15*(blk1 + 0), H^14*(blk2 + 0), and H^13*(blk3 + 0)
+//       H_POW3*GHASHDATA1 => H^12*blk4, H^11*blk5, H^10*blk6, and H^9*blk7
+//       H_POW2*GHASHDATA2 => H^8*blk8,  H^7*blk9,  H^6*blk10, and H^5*blk11
+//       H_POW1*GHASHDATA3 => H^4*blk12, H^3*blk13, H^2*blk14, and H^1*blk15
 //
 // More concretely, this code does:
 //   - Do vectorized "schoolbook" multiplications to compute the intermediate
 //     256-bit product of each block and its corresponding hash key power.
-//     There are 4*VL/16 of these intermediate products.
-//   - Sum (XOR) the intermediate 256-bit products across vectors.  This leaves
-//     VL/16 256-bit intermediate values.
+//   - Sum (XOR) the intermediate 256-bit products across vectors.
 //   - Do a vectorized reduction of these 256-bit intermediate values to
-//     128-bits each.  This leaves VL/16 128-bit intermediate values.
+//     128-bits each.
 //   - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM.
 //
 // See _ghash_mul_step for the full explanation of the operations performed for
@@ -532,85 +471,224 @@
 .endif
 .endm
 
-// Do one non-last round of AES encryption on the counter blocks in V0-V3 using
-// the round key that has been broadcast to all 128-bit lanes of \round_key.
+// Update GHASH with four vectors of data blocks.  See _ghash_step_4x for full
+// explanation.
+.macro	_ghash_4x
+.irp i, 0,1,2,3,4,5,6,7,8,9
+	_ghash_step_4x	\i
+.endr
+.endm
+
+// void aes_gcm_aad_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+//				       u8 ghash_acc[16],
+//				       const u8 *aad, int aadlen);
+//
+// This function processes the AAD (Additional Authenticated Data) in GCM.
+// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
+// data given by |aad| and |aadlen|.  On the first call, |ghash_acc| must be all
+// zeroes.  |aadlen| must be a multiple of 16, except on the last call where it
+// can be any length.  The caller must do any buffering needed to ensure this.
+//
+// This handles large amounts of AAD efficiently, while also keeping overhead
+// low for small amounts which is the common case.  TLS and IPsec use less than
+// one block of AAD, but (uncommonly) other use cases may use much more.
+SYM_FUNC_START(aes_gcm_aad_update_vaes_avx512)
+
+	// Function arguments
+	.set	KEY,		%rdi
+	.set	GHASH_ACC_PTR,	%rsi
+	.set	AAD,		%rdx
+	.set	AADLEN,		%ecx
+	.set	AADLEN64,	%rcx	// Zero-extend AADLEN before using!
+
+	// Additional local variables.
+	// %rax and %k1 are used as temporary registers.
+	.set	GHASHDATA0,	%zmm0
+	.set	GHASHDATA0_XMM,	%xmm0
+	.set	GHASHDATA1,	%zmm1
+	.set	GHASHDATA1_XMM,	%xmm1
+	.set	GHASHDATA2,	%zmm2
+	.set	GHASHDATA2_XMM,	%xmm2
+	.set	GHASHDATA3,	%zmm3
+	.set	BSWAP_MASK,	%zmm4
+	.set	BSWAP_MASK_XMM,	%xmm4
+	.set	GHASH_ACC,	%zmm5
+	.set	GHASH_ACC_XMM,	%xmm5
+	.set	H_POW4,		%zmm6
+	.set	H_POW3,		%zmm7
+	.set	H_POW2,		%zmm8
+	.set	H_POW1,		%zmm9
+	.set	H_POW1_XMM,	%xmm9
+	.set	GFPOLY,		%zmm10
+	.set	GFPOLY_XMM,	%xmm10
+	.set	GHASHTMP0,	%zmm11
+	.set	GHASHTMP1,	%zmm12
+	.set	GHASHTMP2,	%zmm13
+
+	// Load the GHASH accumulator.
+	vmovdqu		(GHASH_ACC_PTR), GHASH_ACC_XMM
+
+	// Check for the common case of AADLEN <= 16, as well as AADLEN == 0.
+	cmp		$16, AADLEN
+	jg		.Laad_more_than_16bytes
+	test		AADLEN, AADLEN
+	jz		.Laad_done
+
+	// Fast path: update GHASH with 1 <= AADLEN <= 16 bytes of AAD.
+	vmovdqu		.Lbswap_mask(%rip), BSWAP_MASK_XMM
+	vmovdqu		.Lgfpoly(%rip), GFPOLY_XMM
+	mov		$-1, %eax
+	bzhi		AADLEN, %eax, %eax
+	kmovd		%eax, %k1
+	vmovdqu8	(AAD), GHASHDATA0_XMM{%k1}{z}
+	vmovdqu		OFFSETOFEND_H_POWERS-16(KEY), H_POW1_XMM
+	vpshufb		BSWAP_MASK_XMM, GHASHDATA0_XMM, GHASHDATA0_XMM
+	vpxor		GHASHDATA0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+	_ghash_mul	H_POW1_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM, GFPOLY_XMM, \
+			GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM
+	jmp		.Laad_done
+
+.Laad_more_than_16bytes:
+	vbroadcasti32x4	.Lbswap_mask(%rip), BSWAP_MASK
+	vbroadcasti32x4	.Lgfpoly(%rip), GFPOLY
+
+	// If AADLEN >= 256, update GHASH with 256 bytes of AAD at a time.
+	sub		$256, AADLEN
+	jl		.Laad_loop_4x_done
+	vmovdqu8	OFFSETOFEND_H_POWERS-4*64(KEY), H_POW4
+	vmovdqu8	OFFSETOFEND_H_POWERS-3*64(KEY), H_POW3
+	vmovdqu8	OFFSETOFEND_H_POWERS-2*64(KEY), H_POW2
+	vmovdqu8	OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1
+.Laad_loop_4x:
+	vmovdqu8	0*64(AAD), GHASHDATA0
+	vmovdqu8	1*64(AAD), GHASHDATA1
+	vmovdqu8	2*64(AAD), GHASHDATA2
+	vmovdqu8	3*64(AAD), GHASHDATA3
+	_ghash_4x
+	add		$256, AAD
+	sub		$256, AADLEN
+	jge		.Laad_loop_4x
+.Laad_loop_4x_done:
+
+	// If AADLEN >= 64, update GHASH with 64 bytes of AAD at a time.
+	add		$192, AADLEN
+	jl		.Laad_loop_1x_done
+	vmovdqu8	OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1
+.Laad_loop_1x:
+	vmovdqu8	(AAD), GHASHDATA0
+	vpshufb		BSWAP_MASK, GHASHDATA0, GHASHDATA0
+	vpxord		GHASHDATA0, GHASH_ACC, GHASH_ACC
+	_ghash_mul	H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+			GHASHDATA0, GHASHDATA1, GHASHDATA2
+	_horizontal_xor GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \
+			GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM
+	add		$64, AAD
+	sub		$64, AADLEN
+	jge		.Laad_loop_1x
+.Laad_loop_1x_done:
+
+	// Update GHASH with the remaining 0 <= AADLEN < 64 bytes of AAD.
+	add		$64, AADLEN
+	jz		.Laad_done
+	mov		$-1, %rax
+	bzhi		AADLEN64, %rax, %rax
+	kmovq		%rax, %k1
+	vmovdqu8	(AAD), GHASHDATA0{%k1}{z}
+	neg		AADLEN64
+	and		$~15, AADLEN64  // -round_up(AADLEN, 16)
+	vmovdqu8	OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1
+	vpshufb		BSWAP_MASK, GHASHDATA0, GHASHDATA0
+	vpxord		GHASHDATA0, GHASH_ACC, GHASH_ACC
+	_ghash_mul	H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+			GHASHDATA0, GHASHDATA1, GHASHDATA2
+	_horizontal_xor GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \
+			GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM
+
+.Laad_done:
+	// Store the updated GHASH accumulator back to memory.
+	vmovdqu		GHASH_ACC_XMM, (GHASH_ACC_PTR)
+
+	vzeroupper	// This is needed after using ymm or zmm registers.
+	RET
+SYM_FUNC_END(aes_gcm_aad_update_vaes_avx512)
+
+// Do one non-last round of AES encryption on the blocks in %zmm[0-3] using the
+// round key that has been broadcast to all 128-bit lanes of \round_key.
 .macro	_vaesenc_4x	round_key
-	vaesenc		\round_key, V0, V0
-	vaesenc		\round_key, V1, V1
-	vaesenc		\round_key, V2, V2
-	vaesenc		\round_key, V3, V3
+	vaesenc		\round_key, %zmm0, %zmm0
+	vaesenc		\round_key, %zmm1, %zmm1
+	vaesenc		\round_key, %zmm2, %zmm2
+	vaesenc		\round_key, %zmm3, %zmm3
 .endm
 
 // Start the AES encryption of four vectors of counter blocks.
 .macro	_ctr_begin_4x
 
 	// Increment LE_CTR four times to generate four vectors of little-endian
-	// counter blocks, swap each to big-endian, and store them in V0-V3.
-	vpshufb		BSWAP_MASK, LE_CTR, V0
+	// counter blocks, swap each to big-endian, and store them in %zmm[0-3].
+	vpshufb		BSWAP_MASK, LE_CTR, %zmm0
 	vpaddd		LE_CTR_INC, LE_CTR, LE_CTR
-	vpshufb		BSWAP_MASK, LE_CTR, V1
+	vpshufb		BSWAP_MASK, LE_CTR, %zmm1
 	vpaddd		LE_CTR_INC, LE_CTR, LE_CTR
-	vpshufb		BSWAP_MASK, LE_CTR, V2
+	vpshufb		BSWAP_MASK, LE_CTR, %zmm2
 	vpaddd		LE_CTR_INC, LE_CTR, LE_CTR
-	vpshufb		BSWAP_MASK, LE_CTR, V3
+	vpshufb		BSWAP_MASK, LE_CTR, %zmm3
 	vpaddd		LE_CTR_INC, LE_CTR, LE_CTR
 
 	// AES "round zero": XOR in the zero-th round key.
-	vpxord		RNDKEY0, V0, V0
-	vpxord		RNDKEY0, V1, V1
-	vpxord		RNDKEY0, V2, V2
-	vpxord		RNDKEY0, V3, V3
+	vpxord		RNDKEY0, %zmm0, %zmm0
+	vpxord		RNDKEY0, %zmm1, %zmm1
+	vpxord		RNDKEY0, %zmm2, %zmm2
+	vpxord		RNDKEY0, %zmm3, %zmm3
 .endm
 
-// Do the last AES round for four vectors of counter blocks V0-V3, XOR source
-// data with the resulting keystream, and write the result to DST and
+// Do the last AES round for four vectors of counter blocks %zmm[0-3], XOR
+// source data with the resulting keystream, and write the result to DST and
 // GHASHDATA[0-3].  (Implementation differs slightly, but has the same effect.)
 .macro	_aesenclast_and_xor_4x
 	// XOR the source data with the last round key, saving the result in
 	// GHASHDATA[0-3].  This reduces latency by taking advantage of the
 	// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
-	vpxord		0*VL(SRC), RNDKEYLAST, GHASHDATA0
-	vpxord		1*VL(SRC), RNDKEYLAST, GHASHDATA1
-	vpxord		2*VL(SRC), RNDKEYLAST, GHASHDATA2
-	vpxord		3*VL(SRC), RNDKEYLAST, GHASHDATA3
+	vpxord		0*64(SRC), RNDKEYLAST, GHASHDATA0
+	vpxord		1*64(SRC), RNDKEYLAST, GHASHDATA1
+	vpxord		2*64(SRC), RNDKEYLAST, GHASHDATA2
+	vpxord		3*64(SRC), RNDKEYLAST, GHASHDATA3
 
 	// Do the last AES round.  This handles the XOR with the source data
 	// too, as per the optimization described above.
-	vaesenclast	GHASHDATA0, V0, GHASHDATA0
-	vaesenclast	GHASHDATA1, V1, GHASHDATA1
-	vaesenclast	GHASHDATA2, V2, GHASHDATA2
-	vaesenclast	GHASHDATA3, V3, GHASHDATA3
+	vaesenclast	GHASHDATA0, %zmm0, GHASHDATA0
+	vaesenclast	GHASHDATA1, %zmm1, GHASHDATA1
+	vaesenclast	GHASHDATA2, %zmm2, GHASHDATA2
+	vaesenclast	GHASHDATA3, %zmm3, GHASHDATA3
 
 	// Store the en/decrypted data to DST.
-	vmovdqu8	GHASHDATA0, 0*VL(DST)
-	vmovdqu8	GHASHDATA1, 1*VL(DST)
-	vmovdqu8	GHASHDATA2, 2*VL(DST)
-	vmovdqu8	GHASHDATA3, 3*VL(DST)
+	vmovdqu8	GHASHDATA0, 0*64(DST)
+	vmovdqu8	GHASHDATA1, 1*64(DST)
+	vmovdqu8	GHASHDATA2, 2*64(DST)
+	vmovdqu8	GHASHDATA3, 3*64(DST)
 .endm
 
-// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key,
-//					  const u32 le_ctr[4], u8 ghash_acc[16],
-//					  const u8 *src, u8 *dst, int datalen);
+// void aes_gcm_{enc,dec}_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+//					     const u32 le_ctr[4], u8 ghash_acc[16],
+//					     const u8 *src, u8 *dst, int datalen);
 //
 // This macro generates a GCM encryption or decryption update function with the
-// above prototype (with \enc selecting which one).  This macro supports both
-// VL=32 and VL=64.  _set_veclen must have been invoked with the desired length.
-//
-// This function computes the next portion of the CTR keystream, XOR's it with
-// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted
-// data to |dst|.  It also updates the GHASH accumulator |ghash_acc| using the
-// next |datalen| ciphertext bytes.
+// above prototype (with \enc selecting which one).  The function computes the
+// next portion of the CTR keystream, XOR's it with |datalen| bytes from |src|,
+// and writes the resulting encrypted or decrypted data to |dst|.  It also
+// updates the GHASH accumulator |ghash_acc| using the next |datalen| ciphertext
+// bytes.
 //
 // |datalen| must be a multiple of 16, except on the last call where it can be
 // any length.  The caller must do any buffering needed to ensure this.  Both
 // in-place and out-of-place en/decryption are supported.
 //
-// |le_ctr| must give the current counter in little-endian format.  For a new
-// message, the low word of the counter must be 2.  This function loads the
-// counter from |le_ctr| and increments the loaded counter as needed, but it
-// does *not* store the updated counter back to |le_ctr|.  The caller must
-// update |le_ctr| if any more data segments follow.  Internally, only the low
-// 32-bit word of the counter is incremented, following the GCM standard.
+// |le_ctr| must give the current counter in little-endian format.  This
+// function loads the counter from |le_ctr| and increments the loaded counter as
+// needed, but it does *not* store the updated counter back to |le_ctr|.  The
+// caller must update |le_ctr| if any more data segments follow.  Internally,
+// only the low 32-bit word of the counter is incremented, following the GCM
+// standard.
 .macro	_aes_gcm_update	enc
 
 	// Function arguments
@@ -634,69 +712,69 @@
 	// Pointer to the last AES round key for the chosen AES variant
 	.set	RNDKEYLAST_PTR,	%r11
 
-	// In the main loop, V0-V3 are used as AES input and output.  Elsewhere
-	// they are used as temporary registers.
+	// In the main loop, %zmm[0-3] are used as AES input and output.
+	// Elsewhere they are used as temporary registers.
 
 	// GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data.
-	.set	GHASHDATA0,	V4
+	.set	GHASHDATA0,	%zmm4
 	.set	GHASHDATA0_XMM,	%xmm4
-	.set	GHASHDATA1,	V5
+	.set	GHASHDATA1,	%zmm5
 	.set	GHASHDATA1_XMM,	%xmm5
-	.set	GHASHDATA2,	V6
+	.set	GHASHDATA2,	%zmm6
 	.set	GHASHDATA2_XMM,	%xmm6
-	.set	GHASHDATA3,	V7
+	.set	GHASHDATA3,	%zmm7
 
 	// BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
 	// using vpshufb, copied to all 128-bit lanes.
-	.set	BSWAP_MASK,	V8
+	.set	BSWAP_MASK,	%zmm8
 
 	// RNDKEY temporarily holds the next AES round key.
-	.set	RNDKEY,		V9
+	.set	RNDKEY,		%zmm9
 
 	// GHASH_ACC is the accumulator variable for GHASH.  When fully reduced,
 	// only the lowest 128-bit lane can be nonzero.  When not fully reduced,
 	// more than one lane may be used, and they need to be XOR'd together.
-	.set	GHASH_ACC,	V10
+	.set	GHASH_ACC,	%zmm10
 	.set	GHASH_ACC_XMM,	%xmm10
 
 	// LE_CTR_INC is the vector of 32-bit words that need to be added to a
 	// vector of little-endian counter blocks to advance it forwards.
-	.set	LE_CTR_INC,	V11
+	.set	LE_CTR_INC,	%zmm11
 
 	// LE_CTR contains the next set of little-endian counter blocks.
-	.set	LE_CTR,		V12
+	.set	LE_CTR,		%zmm12
 
 	// RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys,
 	// copied to all 128-bit lanes.  RNDKEY0 is the zero-th round key,
 	// RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last.
-	.set	RNDKEY0,	V13
-	.set	RNDKEYLAST,	V14
-	.set	RNDKEY_M9,	V15
-	.set	RNDKEY_M8,	V16
-	.set	RNDKEY_M7,	V17
-	.set	RNDKEY_M6,	V18
-	.set	RNDKEY_M5,	V19
-	.set	RNDKEY_M4,	V20
-	.set	RNDKEY_M3,	V21
-	.set	RNDKEY_M2,	V22
-	.set	RNDKEY_M1,	V23
+	.set	RNDKEY0,	%zmm13
+	.set	RNDKEYLAST,	%zmm14
+	.set	RNDKEY_M9,	%zmm15
+	.set	RNDKEY_M8,	%zmm16
+	.set	RNDKEY_M7,	%zmm17
+	.set	RNDKEY_M6,	%zmm18
+	.set	RNDKEY_M5,	%zmm19
+	.set	RNDKEY_M4,	%zmm20
+	.set	RNDKEY_M3,	%zmm21
+	.set	RNDKEY_M2,	%zmm22
+	.set	RNDKEY_M1,	%zmm23
 
 	// GHASHTMP[0-2] are temporary variables used by _ghash_step_4x.  These
 	// cannot coincide with anything used for AES encryption, since for
 	// performance reasons GHASH and AES encryption are interleaved.
-	.set	GHASHTMP0,	V24
-	.set	GHASHTMP1,	V25
-	.set	GHASHTMP2,	V26
+	.set	GHASHTMP0,	%zmm24
+	.set	GHASHTMP1,	%zmm25
+	.set	GHASHTMP2,	%zmm26
 
-	// H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1.  The
+	// H_POW[4-1] contain the powers of the hash key H^16...H^1.  The
 	// descending numbering reflects the order of the key powers.
-	.set	H_POW4,		V27
-	.set	H_POW3,		V28
-	.set	H_POW2,		V29
-	.set	H_POW1,		V30
+	.set	H_POW4,		%zmm27
+	.set	H_POW3,		%zmm28
+	.set	H_POW2,		%zmm29
+	.set	H_POW1,		%zmm30
 
 	// GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes.
-	.set	GFPOLY,		V31
+	.set	GFPOLY,		%zmm31
 
 	// Load some constants.
 	vbroadcasti32x4	.Lbswap_mask(%rip), BSWAP_MASK
@@ -719,29 +797,23 @@
 	// Finish initializing LE_CTR by adding [0, 1, ...] to its low words.
 	vpaddd		.Lctr_pattern(%rip), LE_CTR, LE_CTR
 
-	// Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes.
-.if VL == 32
-	vbroadcasti32x4	.Linc_2blocks(%rip), LE_CTR_INC
-.elseif VL == 64
+	// Load 4 into all 128-bit lanes of LE_CTR_INC.
 	vbroadcasti32x4	.Linc_4blocks(%rip), LE_CTR_INC
-.else
-	.error "Unsupported vector length"
-.endif
 
-	// If there are at least 4*VL bytes of data, then continue into the loop
-	// that processes 4*VL bytes of data at a time.  Otherwise skip it.
+	// If there are at least 256 bytes of data, then continue into the loop
+	// that processes 256 bytes of data at a time.  Otherwise skip it.
 	//
-	// Pre-subtracting 4*VL from DATALEN saves an instruction from the main
+	// Pre-subtracting 256 from DATALEN saves an instruction from the main
 	// loop and also ensures that at least one write always occurs to
 	// DATALEN, zero-extending it and allowing DATALEN64 to be used later.
-	add		$-4*VL, DATALEN  // shorter than 'sub 4*VL' when VL=32
+	sub		$256, DATALEN
 	jl		.Lcrypt_loop_4x_done\@
 
 	// Load powers of the hash key.
-	vmovdqu8	OFFSETOFEND_H_POWERS-4*VL(KEY), H_POW4
-	vmovdqu8	OFFSETOFEND_H_POWERS-3*VL(KEY), H_POW3
-	vmovdqu8	OFFSETOFEND_H_POWERS-2*VL(KEY), H_POW2
-	vmovdqu8	OFFSETOFEND_H_POWERS-1*VL(KEY), H_POW1
+	vmovdqu8	OFFSETOFEND_H_POWERS-4*64(KEY), H_POW4
+	vmovdqu8	OFFSETOFEND_H_POWERS-3*64(KEY), H_POW3
+	vmovdqu8	OFFSETOFEND_H_POWERS-2*64(KEY), H_POW2
+	vmovdqu8	OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1
 
 	// Main loop: en/decrypt and hash 4 vectors at a time.
 	//
@@ -770,9 +842,9 @@
 	cmp		%rax, RNDKEYLAST_PTR
 	jne		1b
 	_aesenclast_and_xor_4x
-	sub		$-4*VL, SRC  // shorter than 'add 4*VL' when VL=32
-	sub		$-4*VL, DST
-	add		$-4*VL, DATALEN
+	add		$256, SRC
+	add		$256, DST
+	sub		$256, DATALEN
 	jl		.Lghash_last_ciphertext_4x\@
 .endif
 
@@ -786,10 +858,10 @@
 	// If decrypting, load more ciphertext blocks into GHASHDATA[0-3].  If
 	// encrypting, GHASHDATA[0-3] already contain the previous ciphertext.
 .if !\enc
-	vmovdqu8	0*VL(SRC), GHASHDATA0
-	vmovdqu8	1*VL(SRC), GHASHDATA1
-	vmovdqu8	2*VL(SRC), GHASHDATA2
-	vmovdqu8	3*VL(SRC), GHASHDATA3
+	vmovdqu8	0*64(SRC), GHASHDATA0
+	vmovdqu8	1*64(SRC), GHASHDATA1
+	vmovdqu8	2*64(SRC), GHASHDATA2
+	vmovdqu8	3*64(SRC), GHASHDATA3
 .endif
 
 	// Start the AES encryption of the counter blocks.
@@ -809,44 +881,44 @@
 	_vaesenc_4x	RNDKEY
 128:
 
-	// Finish the AES encryption of the counter blocks in V0-V3, interleaved
-	// with the GHASH update of the ciphertext blocks in GHASHDATA[0-3].
+	// Finish the AES encryption of the counter blocks in %zmm[0-3],
+	// interleaved with the GHASH update of the ciphertext blocks in
+	// GHASHDATA[0-3].
 .irp i, 9,8,7,6,5,4,3,2,1
 	_ghash_step_4x  (9 - \i)
 	_vaesenc_4x	RNDKEY_M\i
 .endr
 	_ghash_step_4x	9
 	_aesenclast_and_xor_4x
-	sub		$-4*VL, SRC  // shorter than 'add 4*VL' when VL=32
-	sub		$-4*VL, DST
-	add		$-4*VL, DATALEN
+	add		$256, SRC
+	add		$256, DST
+	sub		$256, DATALEN
 	jge		.Lcrypt_loop_4x\@
 
 .if \enc
 .Lghash_last_ciphertext_4x\@:
 	// Update GHASH with the last set of ciphertext blocks.
-.irp i, 0,1,2,3,4,5,6,7,8,9
-	_ghash_step_4x	\i
-.endr
+	_ghash_4x
 .endif
 
 .Lcrypt_loop_4x_done\@:
 
-	// Undo the extra subtraction by 4*VL and check whether data remains.
-	sub		$-4*VL, DATALEN  // shorter than 'add 4*VL' when VL=32
+	// Undo the extra subtraction by 256 and check whether data remains.
+	add		$256, DATALEN
 	jz		.Ldone\@
 
-	// The data length isn't a multiple of 4*VL.  Process the remaining data
-	// of length 1 <= DATALEN < 4*VL, up to one vector (VL bytes) at a time.
-	// Going one vector at a time may seem inefficient compared to having
-	// separate code paths for each possible number of vectors remaining.
-	// However, using a loop keeps the code size down, and it performs
-	// surprising well; modern CPUs will start executing the next iteration
-	// before the previous one finishes and also predict the number of loop
-	// iterations.  For a similar reason, we roll up the AES rounds.
+	// The data length isn't a multiple of 256 bytes.  Process the remaining
+	// data of length 1 <= DATALEN < 256, up to one 64-byte vector at a
+	// time.  Going one vector at a time may seem inefficient compared to
+	// having separate code paths for each possible number of vectors
+	// remaining.  However, using a loop keeps the code size down, and it
+	// performs surprising well; modern CPUs will start executing the next
+	// iteration before the previous one finishes and also predict the
+	// number of loop iterations.  For a similar reason, we roll up the AES
+	// rounds.
 	//
-	// On the last iteration, the remaining length may be less than VL.
-	// Handle this using masking.
+	// On the last iteration, the remaining length may be less than 64
+	// bytes.  Handle this using masking.
 	//
 	// Since there are enough key powers available for all remaining data,
 	// there is no need to do a GHASH reduction after each iteration.
@@ -875,65 +947,60 @@
 .Lcrypt_loop_1x\@:
 
 	// Select the appropriate mask for this iteration: all 1's if
-	// DATALEN >= VL, otherwise DATALEN 1's.  Do this branchlessly using the
+	// DATALEN >= 64, otherwise DATALEN 1's.  Do this branchlessly using the
 	// bzhi instruction from BMI2.  (This relies on DATALEN <= 255.)
-.if VL < 64
-	mov		$-1, %eax
-	bzhi		DATALEN, %eax, %eax
-	kmovd		%eax, %k1
-.else
 	mov		$-1, %rax
 	bzhi		DATALEN64, %rax, %rax
 	kmovq		%rax, %k1
-.endif
 
 	// Encrypt a vector of counter blocks.  This does not need to be masked.
-	vpshufb		BSWAP_MASK, LE_CTR, V0
+	vpshufb		BSWAP_MASK, LE_CTR, %zmm0
 	vpaddd		LE_CTR_INC, LE_CTR, LE_CTR
-	vpxord		RNDKEY0, V0, V0
+	vpxord		RNDKEY0, %zmm0, %zmm0
 	lea		16(KEY), %rax
 1:
 	vbroadcasti32x4	(%rax), RNDKEY
-	vaesenc		RNDKEY, V0, V0
+	vaesenc		RNDKEY, %zmm0, %zmm0
 	add		$16, %rax
 	cmp		%rax, RNDKEYLAST_PTR
 	jne		1b
-	vaesenclast	RNDKEYLAST, V0, V0
+	vaesenclast	RNDKEYLAST, %zmm0, %zmm0
 
 	// XOR the data with the appropriate number of keystream bytes.
-	vmovdqu8	(SRC), V1{%k1}{z}
-	vpxord		V1, V0, V0
-	vmovdqu8	V0, (DST){%k1}
+	vmovdqu8	(SRC), %zmm1{%k1}{z}
+	vpxord		%zmm1, %zmm0, %zmm0
+	vmovdqu8	%zmm0, (DST){%k1}
 
 	// Update GHASH with the ciphertext block(s), without reducing.
 	//
-	// In the case of DATALEN < VL, the ciphertext is zero-padded to VL.
-	// (If decrypting, it's done by the above masked load.  If encrypting,
-	// it's done by the below masked register-to-register move.)  Note that
-	// if DATALEN <= VL - 16, there will be additional padding beyond the
-	// padding of the last block specified by GHASH itself; i.e., there may
-	// be whole block(s) that get processed by the GHASH multiplication and
-	// reduction instructions but should not actually be included in the
+	// In the case of DATALEN < 64, the ciphertext is zero-padded to 64
+	// bytes.  (If decrypting, it's done by the above masked load.  If
+	// encrypting, it's done by the below masked register-to-register move.)
+	// Note that if DATALEN <= 48, there will be additional padding beyond
+	// the padding of the last block specified by GHASH itself; i.e., there
+	// may be whole block(s) that get processed by the GHASH multiplication
+	// and reduction instructions but should not actually be included in the
 	// GHASH.  However, any such blocks are all-zeroes, and the values that
 	// they're multiplied with are also all-zeroes.  Therefore they just add
 	// 0 * 0 = 0 to the final GHASH result, which makes no difference.
 	vmovdqu8	(POWERS_PTR), H_POW1
 .if \enc
-	vmovdqu8	V0, V1{%k1}{z}
+	vmovdqu8	%zmm0, %zmm1{%k1}{z}
 .endif
-	vpshufb		BSWAP_MASK, V1, V0
-	vpxord		GHASH_ACC, V0, V0
-	_ghash_mul_noreduce	H_POW1, V0, LO, MI, HI, GHASHDATA3, V1, V2, V3
+	vpshufb		BSWAP_MASK, %zmm1, %zmm0
+	vpxord		GHASH_ACC, %zmm0, %zmm0
+	_ghash_mul_noreduce	H_POW1, %zmm0, LO, MI, HI, \
+				GHASHDATA3, %zmm1, %zmm2, %zmm3
 	vpxor		GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
 
-	add		$VL, POWERS_PTR
-	add		$VL, SRC
-	add		$VL, DST
-	sub		$VL, DATALEN
+	add		$64, POWERS_PTR
+	add		$64, SRC
+	add		$64, DST
+	sub		$64, DATALEN
 	jg		.Lcrypt_loop_1x\@
 
 	// Finally, do the GHASH reduction.
-	_ghash_reduce	LO, MI, HI, GFPOLY, V0
+	_ghash_reduce	LO, MI, HI, GFPOLY, %zmm0
 	_horizontal_xor	HI, HI_XMM, GHASH_ACC_XMM, %xmm0, %xmm1, %xmm2
 
 .Ldone\@:
@@ -944,14 +1011,14 @@
 	RET
 .endm
 
-// void aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
-//				     const u32 le_ctr[4], u8 ghash_acc[16],
-//				     u64 total_aadlen, u64 total_datalen);
-// bool aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
-//				     const u32 le_ctr[4],
-//				     const u8 ghash_acc[16],
-//				     u64 total_aadlen, u64 total_datalen,
-//				     const u8 tag[16], int taglen);
+// void aes_gcm_enc_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+//				      const u32 le_ctr[4], u8 ghash_acc[16],
+//				      u64 total_aadlen, u64 total_datalen);
+// bool aes_gcm_dec_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+//				      const u32 le_ctr[4],
+//				      const u8 ghash_acc[16],
+//				      u64 total_aadlen, u64 total_datalen,
+//				      const u8 tag[16], int taglen);
 //
 // This macro generates one of the above two functions (with \enc selecting
 // which one).  Both functions finish computing the GCM authentication tag by
@@ -1081,119 +1148,16 @@
 	RET
 .endm
 
-_set_veclen 32
-SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_256)
-	_aes_gcm_precompute
-SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_256)
-SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_256)
-	_aes_gcm_update	1
-SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_256)
-SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_256)
-	_aes_gcm_update	0
-SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_256)
-
-_set_veclen 64
-SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_512)
-	_aes_gcm_precompute
-SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_512)
-SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_512)
+SYM_FUNC_START(aes_gcm_enc_update_vaes_avx512)
 	_aes_gcm_update	1
-SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_512)
-SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_512)
+SYM_FUNC_END(aes_gcm_enc_update_vaes_avx512)
+SYM_FUNC_START(aes_gcm_dec_update_vaes_avx512)
 	_aes_gcm_update	0
-SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_512)
-
-// void aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key,
-//				      u8 ghash_acc[16],
-//				      const u8 *aad, int aadlen);
-//
-// This function processes the AAD (Additional Authenticated Data) in GCM.
-// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
-// data given by |aad| and |aadlen|.  |key->ghash_key_powers| must have been
-// initialized.  On the first call, |ghash_acc| must be all zeroes.  |aadlen|
-// must be a multiple of 16, except on the last call where it can be any length.
-// The caller must do any buffering needed to ensure this.
-//
-// AES-GCM is almost always used with small amounts of AAD, less than 32 bytes.
-// Therefore, for AAD processing we currently only provide this implementation
-// which uses 256-bit vectors (ymm registers) and only has a 1x-wide loop.  This
-// keeps the code size down, and it enables some micro-optimizations, e.g. using
-// VEX-coded instructions instead of EVEX-coded to save some instruction bytes.
-// To optimize for large amounts of AAD, we could implement a 4x-wide loop and
-// provide a version using 512-bit vectors, but that doesn't seem to be useful.
-SYM_FUNC_START(aes_gcm_aad_update_vaes_avx10)
-
-	// Function arguments
-	.set	KEY,		%rdi
-	.set	GHASH_ACC_PTR,	%rsi
-	.set	AAD,		%rdx
-	.set	AADLEN,		%ecx
-	.set	AADLEN64,	%rcx	// Zero-extend AADLEN before using!
-
-	// Additional local variables.
-	// %rax, %ymm0-%ymm3, and %k1 are used as temporary registers.
-	.set	BSWAP_MASK,	%ymm4
-	.set	GFPOLY,		%ymm5
-	.set	GHASH_ACC,	%ymm6
-	.set	GHASH_ACC_XMM,	%xmm6
-	.set	H_POW1,		%ymm7
-
-	// Load some constants.
-	vbroadcasti128	.Lbswap_mask(%rip), BSWAP_MASK
-	vbroadcasti128	.Lgfpoly(%rip), GFPOLY
-
-	// Load the GHASH accumulator.
-	vmovdqu		(GHASH_ACC_PTR), GHASH_ACC_XMM
-
-	// Update GHASH with 32 bytes of AAD at a time.
-	//
-	// Pre-subtracting 32 from AADLEN saves an instruction from the loop and
-	// also ensures that at least one write always occurs to AADLEN,
-	// zero-extending it and allowing AADLEN64 to be used later.
-	sub		$32, AADLEN
-	jl		.Laad_loop_1x_done
-	vmovdqu8	OFFSETOFEND_H_POWERS-32(KEY), H_POW1	// [H^2, H^1]
-.Laad_loop_1x:
-	vmovdqu		(AAD), %ymm0
-	vpshufb		BSWAP_MASK, %ymm0, %ymm0
-	vpxor		%ymm0, GHASH_ACC, GHASH_ACC
-	_ghash_mul	H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
-			%ymm0, %ymm1, %ymm2
-	vextracti128	$1, GHASH_ACC, %xmm0
-	vpxor		%xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM
-	add		$32, AAD
-	sub		$32, AADLEN
-	jge		.Laad_loop_1x
-.Laad_loop_1x_done:
-	add		$32, AADLEN
-	jz		.Laad_done
-
-	// Update GHASH with the remaining 1 <= AADLEN < 32 bytes of AAD.
-	mov		$-1, %eax
-	bzhi		AADLEN, %eax, %eax
-	kmovd		%eax, %k1
-	vmovdqu8	(AAD), %ymm0{%k1}{z}
-	neg		AADLEN64
-	and		$~15, AADLEN64  // -round_up(AADLEN, 16)
-	vmovdqu8	OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1
-	vpshufb		BSWAP_MASK, %ymm0, %ymm0
-	vpxor		%ymm0, GHASH_ACC, GHASH_ACC
-	_ghash_mul	H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
-			%ymm0, %ymm1, %ymm2
-	vextracti128	$1, GHASH_ACC, %xmm0
-	vpxor		%xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM
-
-.Laad_done:
-	// Store the updated GHASH accumulator back to memory.
-	vmovdqu		GHASH_ACC_XMM, (GHASH_ACC_PTR)
-
-	vzeroupper	// This is needed after using ymm or zmm registers.
-	RET
-SYM_FUNC_END(aes_gcm_aad_update_vaes_avx10)
+SYM_FUNC_END(aes_gcm_dec_update_vaes_avx512)
 
-SYM_FUNC_START(aes_gcm_enc_final_vaes_avx10)
+SYM_FUNC_START(aes_gcm_enc_final_vaes_avx512)
 	_aes_gcm_final	1
-SYM_FUNC_END(aes_gcm_enc_final_vaes_avx10)
-SYM_FUNC_START(aes_gcm_dec_final_vaes_avx10)
+SYM_FUNC_END(aes_gcm_enc_final_vaes_avx512)
+SYM_FUNC_START(aes_gcm_dec_final_vaes_avx512)
 	_aes_gcm_final	0
-SYM_FUNC_END(aes_gcm_dec_final_vaes_avx10)
+SYM_FUNC_END(aes_gcm_dec_final_vaes_avx512)
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index d953ac470aae..48405e02d6e4 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -693,8 +693,7 @@ ctr_crypt(struct skcipher_request *req,
 			 * operation into two at the point where the overflow
 			 * will occur.  After the first part, add the carry bit.
 			 */
-			p1_nbytes = min_t(unsigned int, nbytes,
-					  (nblocks - ctr64) * AES_BLOCK_SIZE);
+			p1_nbytes = min(nbytes, (nblocks - ctr64) * AES_BLOCK_SIZE);
 			(*ctr64_func)(key, walk.src.virt.addr,
 				      walk.dst.virt.addr, p1_nbytes, le_ctr);
 			le_ctr[0] = 0;
@@ -874,8 +873,38 @@ struct aes_gcm_key_aesni {
 #define AES_GCM_KEY_AESNI_SIZE	\
 	(sizeof(struct aes_gcm_key_aesni) + (15 & ~(CRYPTO_MINALIGN - 1)))
 
-/* Key struct used by the VAES + AVX10 implementations of AES-GCM */
-struct aes_gcm_key_avx10 {
+/* Key struct used by the VAES + AVX2 implementation of AES-GCM */
+struct aes_gcm_key_vaes_avx2 {
+	/*
+	 * Common part of the key.  The assembly code prefers 16-byte alignment
+	 * for the round keys; we get this by them being located at the start of
+	 * the struct and the whole struct being 32-byte aligned.
+	 */
+	struct aes_gcm_key base;
+
+	/*
+	 * Powers of the hash key H^8 through H^1.  These are 128-bit values.
+	 * They all have an extra factor of x^-1 and are byte-reversed.
+	 * The assembly code prefers 32-byte alignment for this.
+	 */
+	u64 h_powers[8][2] __aligned(32);
+
+	/*
+	 * Each entry in this array contains the two halves of an entry of
+	 * h_powers XOR'd together, in the following order:
+	 * H^8,H^6,H^7,H^5,H^4,H^2,H^3,H^1 i.e. indices 0,2,1,3,4,6,5,7.
+	 * This is used for Karatsuba multiplication.
+	 */
+	u64 h_powers_xored[8];
+};
+
+#define AES_GCM_KEY_VAES_AVX2(key) \
+	container_of((key), struct aes_gcm_key_vaes_avx2, base)
+#define AES_GCM_KEY_VAES_AVX2_SIZE \
+	(sizeof(struct aes_gcm_key_vaes_avx2) + (31 & ~(CRYPTO_MINALIGN - 1)))
+
+/* Key struct used by the VAES + AVX512 implementation of AES-GCM */
+struct aes_gcm_key_vaes_avx512 {
 	/*
 	 * Common part of the key.  The assembly code prefers 16-byte alignment
 	 * for the round keys; we get this by them being located at the start of
@@ -895,10 +924,10 @@ struct aes_gcm_key_avx10 {
 	/* Three padding blocks required by the assembly code */
 	u64 padding[3][2];
 };
-#define AES_GCM_KEY_AVX10(key)	\
-	container_of((key), struct aes_gcm_key_avx10, base)
-#define AES_GCM_KEY_AVX10_SIZE	\
-	(sizeof(struct aes_gcm_key_avx10) + (63 & ~(CRYPTO_MINALIGN - 1)))
+#define AES_GCM_KEY_VAES_AVX512(key) \
+	container_of((key), struct aes_gcm_key_vaes_avx512, base)
+#define AES_GCM_KEY_VAES_AVX512_SIZE \
+	(sizeof(struct aes_gcm_key_vaes_avx512) + (63 & ~(CRYPTO_MINALIGN - 1)))
 
 /*
  * These flags are passed to the AES-GCM helper functions to specify the
@@ -910,14 +939,16 @@ struct aes_gcm_key_avx10 {
 #define FLAG_RFC4106	BIT(0)
 #define FLAG_ENC	BIT(1)
 #define FLAG_AVX	BIT(2)
-#define FLAG_AVX10_256	BIT(3)
-#define FLAG_AVX10_512	BIT(4)
+#define FLAG_VAES_AVX2	BIT(3)
+#define FLAG_VAES_AVX512 BIT(4)
 
 static inline struct aes_gcm_key *
 aes_gcm_key_get(struct crypto_aead *tfm, int flags)
 {
-	if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512))
+	if (flags & FLAG_VAES_AVX512)
 		return PTR_ALIGN(crypto_aead_ctx(tfm), 64);
+	else if (flags & FLAG_VAES_AVX2)
+		return PTR_ALIGN(crypto_aead_ctx(tfm), 32);
 	else
 		return PTR_ALIGN(crypto_aead_ctx(tfm), 16);
 }
@@ -927,26 +958,16 @@ aes_gcm_precompute_aesni(struct aes_gcm_key_aesni *key);
 asmlinkage void
 aes_gcm_precompute_aesni_avx(struct aes_gcm_key_aesni *key);
 asmlinkage void
-aes_gcm_precompute_vaes_avx10_256(struct aes_gcm_key_avx10 *key);
+aes_gcm_precompute_vaes_avx2(struct aes_gcm_key_vaes_avx2 *key);
 asmlinkage void
-aes_gcm_precompute_vaes_avx10_512(struct aes_gcm_key_avx10 *key);
+aes_gcm_precompute_vaes_avx512(struct aes_gcm_key_vaes_avx512 *key);
 
 static void aes_gcm_precompute(struct aes_gcm_key *key, int flags)
 {
-	/*
-	 * To make things a bit easier on the assembly side, the AVX10
-	 * implementations use the same key format.  Therefore, a single
-	 * function using 256-bit vectors would suffice here.  However, it's
-	 * straightforward to provide a 512-bit one because of how the assembly
-	 * code is structured, and it works nicely because the total size of the
-	 * key powers is a multiple of 512 bits.  So we take advantage of that.
-	 *
-	 * A similar situation applies to the AES-NI implementations.
-	 */
-	if (flags & FLAG_AVX10_512)
-		aes_gcm_precompute_vaes_avx10_512(AES_GCM_KEY_AVX10(key));
-	else if (flags & FLAG_AVX10_256)
-		aes_gcm_precompute_vaes_avx10_256(AES_GCM_KEY_AVX10(key));
+	if (flags & FLAG_VAES_AVX512)
+		aes_gcm_precompute_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key));
+	else if (flags & FLAG_VAES_AVX2)
+		aes_gcm_precompute_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key));
 	else if (flags & FLAG_AVX)
 		aes_gcm_precompute_aesni_avx(AES_GCM_KEY_AESNI(key));
 	else
@@ -960,15 +981,21 @@ asmlinkage void
 aes_gcm_aad_update_aesni_avx(const struct aes_gcm_key_aesni *key,
 			     u8 ghash_acc[16], const u8 *aad, int aadlen);
 asmlinkage void
-aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key,
-			      u8 ghash_acc[16], const u8 *aad, int aadlen);
+aes_gcm_aad_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+			     u8 ghash_acc[16], const u8 *aad, int aadlen);
+asmlinkage void
+aes_gcm_aad_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+			       u8 ghash_acc[16], const u8 *aad, int aadlen);
 
 static void aes_gcm_aad_update(const struct aes_gcm_key *key, u8 ghash_acc[16],
 			       const u8 *aad, int aadlen, int flags)
 {
-	if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512))
-		aes_gcm_aad_update_vaes_avx10(AES_GCM_KEY_AVX10(key), ghash_acc,
-					      aad, aadlen);
+	if (flags & FLAG_VAES_AVX512)
+		aes_gcm_aad_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
+					       ghash_acc, aad, aadlen);
+	else if (flags & FLAG_VAES_AVX2)
+		aes_gcm_aad_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
+					     ghash_acc, aad, aadlen);
 	else if (flags & FLAG_AVX)
 		aes_gcm_aad_update_aesni_avx(AES_GCM_KEY_AESNI(key), ghash_acc,
 					     aad, aadlen);
@@ -986,13 +1013,13 @@ aes_gcm_enc_update_aesni_avx(const struct aes_gcm_key_aesni *key,
 			     const u32 le_ctr[4], u8 ghash_acc[16],
 			     const u8 *src, u8 *dst, int datalen);
 asmlinkage void
-aes_gcm_enc_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key,
-				  const u32 le_ctr[4], u8 ghash_acc[16],
-				  const u8 *src, u8 *dst, int datalen);
+aes_gcm_enc_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+			     const u32 le_ctr[4], u8 ghash_acc[16],
+			     const u8 *src, u8 *dst, int datalen);
 asmlinkage void
-aes_gcm_enc_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key,
-				  const u32 le_ctr[4], u8 ghash_acc[16],
-				  const u8 *src, u8 *dst, int datalen);
+aes_gcm_enc_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+			       const u32 le_ctr[4], u8 ghash_acc[16],
+			       const u8 *src, u8 *dst, int datalen);
 
 asmlinkage void
 aes_gcm_dec_update_aesni(const struct aes_gcm_key_aesni *key,
@@ -1003,13 +1030,13 @@ aes_gcm_dec_update_aesni_avx(const struct aes_gcm_key_aesni *key,
 			     const u32 le_ctr[4], u8 ghash_acc[16],
 			     const u8 *src, u8 *dst, int datalen);
 asmlinkage void
-aes_gcm_dec_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key,
-				  const u32 le_ctr[4], u8 ghash_acc[16],
-				  const u8 *src, u8 *dst, int datalen);
+aes_gcm_dec_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+			     const u32 le_ctr[4], u8 ghash_acc[16],
+			     const u8 *src, u8 *dst, int datalen);
 asmlinkage void
-aes_gcm_dec_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key,
-				  const u32 le_ctr[4], u8 ghash_acc[16],
-				  const u8 *src, u8 *dst, int datalen);
+aes_gcm_dec_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+			       const u32 le_ctr[4], u8 ghash_acc[16],
+			       const u8 *src, u8 *dst, int datalen);
 
 /* __always_inline to optimize out the branches based on @flags */
 static __always_inline void
@@ -1018,14 +1045,14 @@ aes_gcm_update(const struct aes_gcm_key *key,
 	       const u8 *src, u8 *dst, int datalen, int flags)
 {
 	if (flags & FLAG_ENC) {
-		if (flags & FLAG_AVX10_512)
-			aes_gcm_enc_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key),
-							  le_ctr, ghash_acc,
-							  src, dst, datalen);
-		else if (flags & FLAG_AVX10_256)
-			aes_gcm_enc_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key),
-							  le_ctr, ghash_acc,
-							  src, dst, datalen);
+		if (flags & FLAG_VAES_AVX512)
+			aes_gcm_enc_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
+						       le_ctr, ghash_acc,
+						       src, dst, datalen);
+		else if (flags & FLAG_VAES_AVX2)
+			aes_gcm_enc_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
+						     le_ctr, ghash_acc,
+						     src, dst, datalen);
 		else if (flags & FLAG_AVX)
 			aes_gcm_enc_update_aesni_avx(AES_GCM_KEY_AESNI(key),
 						     le_ctr, ghash_acc,
@@ -1034,14 +1061,14 @@ aes_gcm_update(const struct aes_gcm_key *key,
 			aes_gcm_enc_update_aesni(AES_GCM_KEY_AESNI(key), le_ctr,
 						 ghash_acc, src, dst, datalen);
 	} else {
-		if (flags & FLAG_AVX10_512)
-			aes_gcm_dec_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key),
-							  le_ctr, ghash_acc,
-							  src, dst, datalen);
-		else if (flags & FLAG_AVX10_256)
-			aes_gcm_dec_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key),
-							  le_ctr, ghash_acc,
-							  src, dst, datalen);
+		if (flags & FLAG_VAES_AVX512)
+			aes_gcm_dec_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
+						       le_ctr, ghash_acc,
+						       src, dst, datalen);
+		else if (flags & FLAG_VAES_AVX2)
+			aes_gcm_dec_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
+						     le_ctr, ghash_acc,
+						     src, dst, datalen);
 		else if (flags & FLAG_AVX)
 			aes_gcm_dec_update_aesni_avx(AES_GCM_KEY_AESNI(key),
 						     le_ctr, ghash_acc,
@@ -1062,9 +1089,13 @@ aes_gcm_enc_final_aesni_avx(const struct aes_gcm_key_aesni *key,
 			    const u32 le_ctr[4], u8 ghash_acc[16],
 			    u64 total_aadlen, u64 total_datalen);
 asmlinkage void
-aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
-			     const u32 le_ctr[4], u8 ghash_acc[16],
-			     u64 total_aadlen, u64 total_datalen);
+aes_gcm_enc_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+			    const u32 le_ctr[4], u8 ghash_acc[16],
+			    u64 total_aadlen, u64 total_datalen);
+asmlinkage void
+aes_gcm_enc_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+			      const u32 le_ctr[4], u8 ghash_acc[16],
+			      u64 total_aadlen, u64 total_datalen);
 
 /* __always_inline to optimize out the branches based on @flags */
 static __always_inline void
@@ -1072,10 +1103,14 @@ aes_gcm_enc_final(const struct aes_gcm_key *key,
 		  const u32 le_ctr[4], u8 ghash_acc[16],
 		  u64 total_aadlen, u64 total_datalen, int flags)
 {
-	if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512))
-		aes_gcm_enc_final_vaes_avx10(AES_GCM_KEY_AVX10(key),
-					     le_ctr, ghash_acc,
-					     total_aadlen, total_datalen);
+	if (flags & FLAG_VAES_AVX512)
+		aes_gcm_enc_final_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
+					      le_ctr, ghash_acc,
+					      total_aadlen, total_datalen);
+	else if (flags & FLAG_VAES_AVX2)
+		aes_gcm_enc_final_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
+					    le_ctr, ghash_acc,
+					    total_aadlen, total_datalen);
 	else if (flags & FLAG_AVX)
 		aes_gcm_enc_final_aesni_avx(AES_GCM_KEY_AESNI(key),
 					    le_ctr, ghash_acc,
@@ -1097,10 +1132,15 @@ aes_gcm_dec_final_aesni_avx(const struct aes_gcm_key_aesni *key,
 			    u64 total_aadlen, u64 total_datalen,
 			    const u8 tag[16], int taglen);
 asmlinkage bool __must_check
-aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
-			     const u32 le_ctr[4], const u8 ghash_acc[16],
-			     u64 total_aadlen, u64 total_datalen,
-			     const u8 tag[16], int taglen);
+aes_gcm_dec_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+			    const u32 le_ctr[4], const u8 ghash_acc[16],
+			    u64 total_aadlen, u64 total_datalen,
+			    const u8 tag[16], int taglen);
+asmlinkage bool __must_check
+aes_gcm_dec_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+			      const u32 le_ctr[4], const u8 ghash_acc[16],
+			      u64 total_aadlen, u64 total_datalen,
+			      const u8 tag[16], int taglen);
 
 /* __always_inline to optimize out the branches based on @flags */
 static __always_inline bool __must_check
@@ -1108,11 +1148,16 @@ aes_gcm_dec_final(const struct aes_gcm_key *key, const u32 le_ctr[4],
 		  u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen,
 		  u8 tag[16], int taglen, int flags)
 {
-	if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512))
-		return aes_gcm_dec_final_vaes_avx10(AES_GCM_KEY_AVX10(key),
-						    le_ctr, ghash_acc,
-						    total_aadlen, total_datalen,
-						    tag, taglen);
+	if (flags & FLAG_VAES_AVX512)
+		return aes_gcm_dec_final_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
+						     le_ctr, ghash_acc,
+						     total_aadlen, total_datalen,
+						     tag, taglen);
+	else if (flags & FLAG_VAES_AVX2)
+		return aes_gcm_dec_final_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
+						   le_ctr, ghash_acc,
+						   total_aadlen, total_datalen,
+						   tag, taglen);
 	else if (flags & FLAG_AVX)
 		return aes_gcm_dec_final_aesni_avx(AES_GCM_KEY_AESNI(key),
 						   le_ctr, ghash_acc,
@@ -1195,10 +1240,14 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key,
 	BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers) != 496);
 	BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers_xored) != 624);
 	BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_times_x64) != 688);
-	BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_enc) != 0);
-	BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_length) != 480);
-	BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, h_powers) != 512);
-	BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, padding) != 768);
+	BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, base.aes_key.key_enc) != 0);
+	BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, base.aes_key.key_length) != 480);
+	BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, h_powers) != 512);
+	BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, h_powers_xored) != 640);
+	BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, base.aes_key.key_enc) != 0);
+	BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, base.aes_key.key_length) != 480);
+	BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, h_powers) != 512);
+	BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, padding) != 768);
 
 	if (likely(crypto_simd_usable())) {
 		err = aes_check_keylen(keylen);
@@ -1231,8 +1280,9 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key,
 		gf128mul_lle(&h, (const be128 *)x_to_the_minus1);
 
 		/* Compute the needed key powers */
-		if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) {
-			struct aes_gcm_key_avx10 *k = AES_GCM_KEY_AVX10(key);
+		if (flags & FLAG_VAES_AVX512) {
+			struct aes_gcm_key_vaes_avx512 *k =
+				AES_GCM_KEY_VAES_AVX512(key);
 
 			for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) {
 				k->h_powers[i][0] = be64_to_cpu(h.b);
@@ -1240,6 +1290,22 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key,
 				gf128mul_lle(&h, &h1);
 			}
 			memset(k->padding, 0, sizeof(k->padding));
+		} else if (flags & FLAG_VAES_AVX2) {
+			struct aes_gcm_key_vaes_avx2 *k =
+				AES_GCM_KEY_VAES_AVX2(key);
+			static const u8 indices[8] = { 0, 2, 1, 3, 4, 6, 5, 7 };
+
+			for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) {
+				k->h_powers[i][0] = be64_to_cpu(h.b);
+				k->h_powers[i][1] = be64_to_cpu(h.a);
+				gf128mul_lle(&h, &h1);
+			}
+			for (i = 0; i < ARRAY_SIZE(k->h_powers_xored); i++) {
+				int j = indices[i];
+
+				k->h_powers_xored[i] = k->h_powers[j][0] ^
+						       k->h_powers[j][1];
+			}
 		} else {
 			struct aes_gcm_key_aesni *k = AES_GCM_KEY_AESNI(key);
 
@@ -1508,15 +1574,15 @@ DEFINE_GCM_ALGS(aesni_avx, FLAG_AVX,
 		"generic-gcm-aesni-avx", "rfc4106-gcm-aesni-avx",
 		AES_GCM_KEY_AESNI_SIZE, 500);
 
-/* aes_gcm_algs_vaes_avx10_256 */
-DEFINE_GCM_ALGS(vaes_avx10_256, FLAG_AVX10_256,
-		"generic-gcm-vaes-avx10_256", "rfc4106-gcm-vaes-avx10_256",
-		AES_GCM_KEY_AVX10_SIZE, 700);
+/* aes_gcm_algs_vaes_avx2 */
+DEFINE_GCM_ALGS(vaes_avx2, FLAG_VAES_AVX2,
+		"generic-gcm-vaes-avx2", "rfc4106-gcm-vaes-avx2",
+		AES_GCM_KEY_VAES_AVX2_SIZE, 600);
 
-/* aes_gcm_algs_vaes_avx10_512 */
-DEFINE_GCM_ALGS(vaes_avx10_512, FLAG_AVX10_512,
-		"generic-gcm-vaes-avx10_512", "rfc4106-gcm-vaes-avx10_512",
-		AES_GCM_KEY_AVX10_SIZE, 800);
+/* aes_gcm_algs_vaes_avx512 */
+DEFINE_GCM_ALGS(vaes_avx512, FLAG_VAES_AVX512,
+		"generic-gcm-vaes-avx512", "rfc4106-gcm-vaes-avx512",
+		AES_GCM_KEY_VAES_AVX512_SIZE, 800);
 
 static int __init register_avx_algs(void)
 {
@@ -1548,6 +1614,10 @@ static int __init register_avx_algs(void)
 					ARRAY_SIZE(skcipher_algs_vaes_avx2));
 	if (err)
 		return err;
+	err = crypto_register_aeads(aes_gcm_algs_vaes_avx2,
+				    ARRAY_SIZE(aes_gcm_algs_vaes_avx2));
+	if (err)
+		return err;
 
 	if (!boot_cpu_has(X86_FEATURE_AVX512BW) ||
 	    !boot_cpu_has(X86_FEATURE_AVX512VL) ||
@@ -1556,26 +1626,21 @@ static int __init register_avx_algs(void)
 			       XFEATURE_MASK_AVX512, NULL))
 		return 0;
 
-	err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_256,
-				    ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256));
-	if (err)
-		return err;
-
 	if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) {
 		int i;
 
 		for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx512); i++)
 			skcipher_algs_vaes_avx512[i].base.cra_priority = 1;
-		for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++)
-			aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1;
+		for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx512); i++)
+			aes_gcm_algs_vaes_avx512[i].base.cra_priority = 1;
 	}
 
 	err = crypto_register_skciphers(skcipher_algs_vaes_avx512,
 					ARRAY_SIZE(skcipher_algs_vaes_avx512));
 	if (err)
 		return err;
-	err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_512,
-				    ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512));
+	err = crypto_register_aeads(aes_gcm_algs_vaes_avx512,
+				    ARRAY_SIZE(aes_gcm_algs_vaes_avx512));
 	if (err)
 		return err;
 
@@ -1595,8 +1660,8 @@ static void unregister_avx_algs(void)
 	unregister_aeads(aes_gcm_algs_aesni_avx);
 	unregister_skciphers(skcipher_algs_vaes_avx2);
 	unregister_skciphers(skcipher_algs_vaes_avx512);
-	unregister_aeads(aes_gcm_algs_vaes_avx10_256);
-	unregister_aeads(aes_gcm_algs_vaes_avx10_512);
+	unregister_aeads(aes_gcm_algs_vaes_avx2);
+	unregister_aeads(aes_gcm_algs_vaes_avx512);
 }
 #else /* CONFIG_X86_64 */
 static struct aead_alg aes_gcm_algs_aesni[0];
diff --git a/arch/x86/crypto/polyval-clmulni_asm.S b/arch/x86/crypto/polyval-clmulni_asm.S
deleted file mode 100644
index a6ebe4e7dd2b..000000000000
--- a/arch/x86/crypto/polyval-clmulni_asm.S
+++ /dev/null
@@ -1,321 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright 2021 Google LLC
- */
-/*
- * This is an efficient implementation of POLYVAL using intel PCLMULQDQ-NI
- * instructions. It works on 8 blocks at a time, by precomputing the first 8
- * keys powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation
- * allows us to split finite field multiplication into two steps.
- *
- * In the first step, we consider h^i, m_i as normal polynomials of degree less
- * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
- * is simply polynomial multiplication.
- *
- * In the second step, we compute the reduction of p(x) modulo the finite field
- * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
- *
- * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
- * multiplication is finite field multiplication. The advantage is that the
- * two-step process  only requires 1 finite field reduction for every 8
- * polynomial multiplications. Further parallelism is gained by interleaving the
- * multiplications and polynomial reductions.
- */
-
-#include <linux/linkage.h>
-#include <asm/frame.h>
-
-#define STRIDE_BLOCKS 8
-
-#define GSTAR %xmm7
-#define PL %xmm8
-#define PH %xmm9
-#define TMP_XMM %xmm11
-#define LO %xmm12
-#define HI %xmm13
-#define MI %xmm14
-#define SUM %xmm15
-
-#define KEY_POWERS %rdi
-#define MSG %rsi
-#define BLOCKS_LEFT %rdx
-#define ACCUMULATOR %rcx
-#define TMP %rax
-
-.section    .rodata.cst16.gstar, "aM", @progbits, 16
-.align 16
-
-.Lgstar:
-	.quad 0xc200000000000000, 0xc200000000000000
-
-.text
-
-/*
- * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length
- * count pointed to by MSG and KEY_POWERS.
- */
-.macro schoolbook1 count
-	.set i, 0
-	.rept (\count)
-		schoolbook1_iteration i 0
-		.set i, (i +1)
-	.endr
-.endm
-
-/*
- * Computes the product of two 128-bit polynomials at the memory locations
- * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of
- * the 256-bit product into LO, MI, HI.
- *
- * Given:
- *   X = [X_1 : X_0]
- *   Y = [Y_1 : Y_0]
- *
- * We compute:
- *   LO += X_0 * Y_0
- *   MI += X_0 * Y_1 + X_1 * Y_0
- *   HI += X_1 * Y_1
- *
- * Later, the 256-bit result can be extracted as:
- *   [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
- * This step is done when computing the polynomial reduction for efficiency
- * reasons.
- *
- * If xor_sum == 1, then also XOR the value of SUM into m_0.  This avoids an
- * extra multiplication of SUM and h^8.
- */
-.macro schoolbook1_iteration i xor_sum
-	movups (16*\i)(MSG), %xmm0
-	.if (\i == 0 && \xor_sum == 1)
-		pxor SUM, %xmm0
-	.endif
-	vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2
-	vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1
-	vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3
-	vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4
-	vpxor %xmm2, MI, MI
-	vpxor %xmm1, LO, LO
-	vpxor %xmm4, HI, HI
-	vpxor %xmm3, MI, MI
-.endm
-
-/*
- * Performs the same computation as schoolbook1_iteration, except we expect the
- * arguments to already be loaded into xmm0 and xmm1 and we set the result
- * registers LO, MI, and HI directly rather than XOR'ing into them.
- */
-.macro schoolbook1_noload
-	vpclmulqdq $0x01, %xmm0, %xmm1, MI
-	vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2
-	vpclmulqdq $0x00, %xmm0, %xmm1, LO
-	vpclmulqdq $0x11, %xmm0, %xmm1, HI
-	vpxor %xmm2, MI, MI
-.endm
-
-/*
- * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
- * the result in PL, PH.
- *   [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
- */
-.macro schoolbook2
-	vpslldq $8, MI, PL
-	vpsrldq $8, MI, PH
-	pxor LO, PL
-	pxor HI, PH
-.endm
-
-/*
- * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
- *
- * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
- * x^128 + x^127 + x^126 + x^121 + 1.
- *
- * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
- * product of two 128-bit polynomials in Montgomery form.  We need to reduce it
- * mod g(x).  Also, since polynomials in Montgomery form have an "extra" factor
- * of x^128, this product has two extra factors of x^128.  To get it back into
- * Montgomery form, we need to remove one of these factors by dividing by x^128.
- *
- * To accomplish both of these goals, we add multiples of g(x) that cancel out
- * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
- * bits are zero, the polynomial division by x^128 can be done by right shifting.
- *
- * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
- * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x).  The CPU can
- * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
- * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x).  Adding this to
- * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
- * = T_1 : T_0 = g*(x) * P_0.  Thus, bits 0-63 got "folded" into bits 64-191.
- *
- * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
- * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
- * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
- * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
- * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
- *
- * So our final computation is:
- *   T = T_1 : T_0 = g*(x) * P_0
- *   V = V_1 : V_0 = g*(x) * (P_1 + T_0)
- *   p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
- *
- * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
- * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
- * T_1 into dest.  This allows us to reuse P_1 + T_0 when computing V.
- */
-.macro montgomery_reduction dest
-	vpclmulqdq $0x00, PL, GSTAR, TMP_XMM	# TMP_XMM = T_1 : T_0 = P_0 * g*(x)
-	pshufd $0b01001110, TMP_XMM, TMP_XMM	# TMP_XMM = T_0 : T_1
-	pxor PL, TMP_XMM			# TMP_XMM = P_1 + T_0 : P_0 + T_1
-	pxor TMP_XMM, PH			# PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
-	pclmulqdq $0x11, GSTAR, TMP_XMM		# TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)]
-	vpxor TMP_XMM, PH, \dest
-.endm
-
-/*
- * Compute schoolbook multiplication for 8 blocks
- * m_0h^8 + ... + m_7h^1
- *
- * If reduce is set, also computes the montgomery reduction of the
- * previous full_stride call and XORs with the first message block.
- * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
- * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
- */
-.macro full_stride reduce
-	pxor LO, LO
-	pxor HI, HI
-	pxor MI, MI
-
-	schoolbook1_iteration 7 0
-	.if \reduce
-		vpclmulqdq $0x00, PL, GSTAR, TMP_XMM
-	.endif
-
-	schoolbook1_iteration 6 0
-	.if \reduce
-		pshufd $0b01001110, TMP_XMM, TMP_XMM
-	.endif
-
-	schoolbook1_iteration 5 0
-	.if \reduce
-		pxor PL, TMP_XMM
-	.endif
-
-	schoolbook1_iteration 4 0
-	.if \reduce
-		pxor TMP_XMM, PH
-	.endif
-
-	schoolbook1_iteration 3 0
-	.if \reduce
-		pclmulqdq $0x11, GSTAR, TMP_XMM
-	.endif
-
-	schoolbook1_iteration 2 0
-	.if \reduce
-		vpxor TMP_XMM, PH, SUM
-	.endif
-
-	schoolbook1_iteration 1 0
-
-	schoolbook1_iteration 0 1
-
-	addq $(8*16), MSG
-	schoolbook2
-.endm
-
-/*
- * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS
- */
-.macro partial_stride
-	mov BLOCKS_LEFT, TMP
-	shlq $4, TMP
-	addq $(16*STRIDE_BLOCKS), KEY_POWERS
-	subq TMP, KEY_POWERS
-
-	movups (MSG), %xmm0
-	pxor SUM, %xmm0
-	movaps (KEY_POWERS), %xmm1
-	schoolbook1_noload
-	dec BLOCKS_LEFT
-	addq $16, MSG
-	addq $16, KEY_POWERS
-
-	test $4, BLOCKS_LEFT
-	jz .Lpartial4BlocksDone
-	schoolbook1 4
-	addq $(4*16), MSG
-	addq $(4*16), KEY_POWERS
-.Lpartial4BlocksDone:
-	test $2, BLOCKS_LEFT
-	jz .Lpartial2BlocksDone
-	schoolbook1 2
-	addq $(2*16), MSG
-	addq $(2*16), KEY_POWERS
-.Lpartial2BlocksDone:
-	test $1, BLOCKS_LEFT
-	jz .LpartialDone
-	schoolbook1 1
-.LpartialDone:
-	schoolbook2
-	montgomery_reduction SUM
-.endm
-
-/*
- * Perform montgomery multiplication in GF(2^128) and store result in op1.
- *
- * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1
- * If op1, op2 are in montgomery form, this computes the montgomery
- * form of op1*op2.
- *
- * void clmul_polyval_mul(u8 *op1, const u8 *op2);
- */
-SYM_FUNC_START(clmul_polyval_mul)
-	FRAME_BEGIN
-	vmovdqa .Lgstar(%rip), GSTAR
-	movups (%rdi), %xmm0
-	movups (%rsi), %xmm1
-	schoolbook1_noload
-	schoolbook2
-	montgomery_reduction SUM
-	movups SUM, (%rdi)
-	FRAME_END
-	RET
-SYM_FUNC_END(clmul_polyval_mul)
-
-/*
- * Perform polynomial evaluation as specified by POLYVAL.  This computes:
- *	h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
- * where n=nblocks, h is the hash key, and m_i are the message blocks.
- *
- * rdi - pointer to precomputed key powers h^8 ... h^1
- * rsi - pointer to message blocks
- * rdx - number of blocks to hash
- * rcx - pointer to the accumulator
- *
- * void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
- *	const u8 *in, size_t nblocks, u8 *accumulator);
- */
-SYM_FUNC_START(clmul_polyval_update)
-	FRAME_BEGIN
-	vmovdqa .Lgstar(%rip), GSTAR
-	movups (ACCUMULATOR), SUM
-	subq $STRIDE_BLOCKS, BLOCKS_LEFT
-	js .LstrideLoopExit
-	full_stride 0
-	subq $STRIDE_BLOCKS, BLOCKS_LEFT
-	js .LstrideLoopExitReduce
-.LstrideLoop:
-	full_stride 1
-	subq $STRIDE_BLOCKS, BLOCKS_LEFT
-	jns .LstrideLoop
-.LstrideLoopExitReduce:
-	montgomery_reduction SUM
-.LstrideLoopExit:
-	add $STRIDE_BLOCKS, BLOCKS_LEFT
-	jz .LskipPartial
-	partial_stride
-.LskipPartial:
-	movups SUM, (ACCUMULATOR)
-	FRAME_END
-	RET
-SYM_FUNC_END(clmul_polyval_update)
diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c
deleted file mode 100644
index 6b466867f91a..000000000000
--- a/arch/x86/crypto/polyval-clmulni_glue.c
+++ /dev/null
@@ -1,180 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Glue code for POLYVAL using PCMULQDQ-NI
- *
- * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi>
- * Copyright (c) 2009 Intel Corp.
- *   Author: Huang Ying <ying.huang@intel.com>
- * Copyright 2021 Google LLC
- */
-
-/*
- * Glue code based on ghash-clmulni-intel_glue.c.
- *
- * This implementation of POLYVAL uses montgomery multiplication
- * accelerated by PCLMULQDQ-NI to implement the finite field
- * operations.
- */
-
-#include <asm/cpu_device_id.h>
-#include <asm/fpu/api.h>
-#include <crypto/internal/hash.h>
-#include <crypto/polyval.h>
-#include <crypto/utils.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#define POLYVAL_ALIGN	16
-#define POLYVAL_ALIGN_ATTR __aligned(POLYVAL_ALIGN)
-#define POLYVAL_ALIGN_EXTRA ((POLYVAL_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1))
-#define POLYVAL_CTX_SIZE (sizeof(struct polyval_tfm_ctx) + POLYVAL_ALIGN_EXTRA)
-#define NUM_KEY_POWERS	8
-
-struct polyval_tfm_ctx {
-	/*
-	 * These powers must be in the order h^8, ..., h^1.
-	 */
-	u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE] POLYVAL_ALIGN_ATTR;
-};
-
-struct polyval_desc_ctx {
-	u8 buffer[POLYVAL_BLOCK_SIZE];
-};
-
-asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
-	const u8 *in, size_t nblocks, u8 *accumulator);
-asmlinkage void clmul_polyval_mul(u8 *op1, const u8 *op2);
-
-static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm)
-{
-	return PTR_ALIGN(crypto_shash_ctx(tfm), POLYVAL_ALIGN);
-}
-
-static void internal_polyval_update(const struct polyval_tfm_ctx *keys,
-	const u8 *in, size_t nblocks, u8 *accumulator)
-{
-	kernel_fpu_begin();
-	clmul_polyval_update(keys, in, nblocks, accumulator);
-	kernel_fpu_end();
-}
-
-static void internal_polyval_mul(u8 *op1, const u8 *op2)
-{
-	kernel_fpu_begin();
-	clmul_polyval_mul(op1, op2);
-	kernel_fpu_end();
-}
-
-static int polyval_x86_setkey(struct crypto_shash *tfm,
-			const u8 *key, unsigned int keylen)
-{
-	struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(tfm);
-	int i;
-
-	if (keylen != POLYVAL_BLOCK_SIZE)
-		return -EINVAL;
-
-	memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE);
-
-	for (i = NUM_KEY_POWERS-2; i >= 0; i--) {
-		memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE);
-		internal_polyval_mul(tctx->key_powers[i],
-				     tctx->key_powers[i+1]);
-	}
-
-	return 0;
-}
-
-static int polyval_x86_init(struct shash_desc *desc)
-{
-	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
-
-	memset(dctx, 0, sizeof(*dctx));
-
-	return 0;
-}
-
-static int polyval_x86_update(struct shash_desc *desc,
-			 const u8 *src, unsigned int srclen)
-{
-	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
-	const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm);
-	unsigned int nblocks;
-
-	do {
-		/* Allow rescheduling every 4K bytes. */
-		nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE;
-		internal_polyval_update(tctx, src, nblocks, dctx->buffer);
-		srclen -= nblocks * POLYVAL_BLOCK_SIZE;
-		src += nblocks * POLYVAL_BLOCK_SIZE;
-	} while (srclen >= POLYVAL_BLOCK_SIZE);
-
-	return srclen;
-}
-
-static int polyval_x86_finup(struct shash_desc *desc, const u8 *src,
-			     unsigned int len, u8 *dst)
-{
-	struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
-	const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm);
-
-	if (len) {
-		crypto_xor(dctx->buffer, src, len);
-		internal_polyval_mul(dctx->buffer,
-				     tctx->key_powers[NUM_KEY_POWERS-1]);
-	}
-
-	memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE);
-
-	return 0;
-}
-
-static struct shash_alg polyval_alg = {
-	.digestsize	= POLYVAL_DIGEST_SIZE,
-	.init		= polyval_x86_init,
-	.update		= polyval_x86_update,
-	.finup		= polyval_x86_finup,
-	.setkey		= polyval_x86_setkey,
-	.descsize	= sizeof(struct polyval_desc_ctx),
-	.base		= {
-		.cra_name		= "polyval",
-		.cra_driver_name	= "polyval-clmulni",
-		.cra_priority		= 200,
-		.cra_flags		= CRYPTO_AHASH_ALG_BLOCK_ONLY,
-		.cra_blocksize		= POLYVAL_BLOCK_SIZE,
-		.cra_ctxsize		= POLYVAL_CTX_SIZE,
-		.cra_module		= THIS_MODULE,
-	},
-};
-
-__maybe_unused static const struct x86_cpu_id pcmul_cpu_id[] = {
-	X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL),
-	{}
-};
-MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id);
-
-static int __init polyval_clmulni_mod_init(void)
-{
-	if (!x86_match_cpu(pcmul_cpu_id))
-		return -ENODEV;
-
-	if (!boot_cpu_has(X86_FEATURE_AVX))
-		return -ENODEV;
-
-	return crypto_register_shash(&polyval_alg);
-}
-
-static void __exit polyval_clmulni_mod_exit(void)
-{
-	crypto_unregister_shash(&polyval_alg);
-}
-
-module_init(polyval_clmulni_mod_init);
-module_exit(polyval_clmulni_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("POLYVAL hash function accelerated by PCLMULQDQ-NI");
-MODULE_ALIAS_CRYPTO("polyval");
-MODULE_ALIAS_CRYPTO("polyval-clmulni");
diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S
index 772c64ed4523..6ba2b3adcef0 100644
--- a/arch/x86/entry/entry.S
+++ b/arch/x86/entry/entry.S
@@ -4,6 +4,7 @@
  */
 
 #include <linux/export.h>
+#include <linux/kvm_types.h>
 #include <linux/linkage.h>
 #include <linux/objtool.h>
 #include <asm/msr-index.h>
@@ -29,8 +30,7 @@ SYM_FUNC_START(write_ibpb)
 	FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_BUG_IBPB_NO_RET
 	RET
 SYM_FUNC_END(write_ibpb)
-/* For KVM */
-EXPORT_SYMBOL_GPL(write_ibpb);
+EXPORT_SYMBOL_FOR_KVM(write_ibpb);
 
 SYM_FUNC_START(__WARN_trap)
 	ANNOTATE_NOENDBR
@@ -56,8 +56,7 @@ SYM_CODE_START_NOALIGN(x86_verw_sel)
 	.word __KERNEL_DS
 .align L1_CACHE_BYTES, 0xcc
 SYM_CODE_END(x86_verw_sel);
-/* For KVM */
-EXPORT_SYMBOL_GPL(x86_verw_sel);
+EXPORT_SYMBOL_FOR_KVM(x86_verw_sel);
 
 .popsection
 
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index ed04a968cc7d..f9983a1907bf 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -19,6 +19,7 @@
  * - idtentry:		Define exception entry points.
  */
 #include <linux/export.h>
+#include <linux/kvm_types.h>
 #include <linux/linkage.h>
 #include <asm/segment.h>
 #include <asm/cache.h>
@@ -1566,5 +1567,5 @@ SYM_FUNC_START(clear_bhb_loop)
 	pop	%rbp
 	RET
 SYM_FUNC_END(clear_bhb_loop)
-EXPORT_SYMBOL_GPL(clear_bhb_loop)
+EXPORT_SYMBOL_FOR_KVM(clear_bhb_loop)
 STACK_FRAME_NON_STANDARD(clear_bhb_loop)
diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S
index fafbd3e68cb8..894f7f16eb80 100644
--- a/arch/x86/entry/entry_64_fred.S
+++ b/arch/x86/entry/entry_64_fred.S
@@ -4,6 +4,7 @@
  */
 
 #include <linux/export.h>
+#include <linux/kvm_types.h>
 
 #include <asm/asm.h>
 #include <asm/fred.h>
@@ -146,5 +147,5 @@ SYM_FUNC_START(asm_fred_entry_from_kvm)
 	RET
 
 SYM_FUNC_END(asm_fred_entry_from_kvm)
-EXPORT_SYMBOL_GPL(asm_fred_entry_from_kvm);
+EXPORT_SYMBOL_FOR_KVM(asm_fred_entry_from_kvm);
 #endif
diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c
index f004a4dc74c2..94e626cc6a07 100644
--- a/arch/x86/entry/entry_fred.c
+++ b/arch/x86/entry/entry_fred.c
@@ -78,13 +78,13 @@ static noinstr void fred_intx(struct pt_regs *regs)
 static __always_inline void fred_other(struct pt_regs *regs)
 {
 	/* The compiler can fold these conditions into a single test */
-	if (likely(regs->fred_ss.vector == FRED_SYSCALL && regs->fred_ss.lm)) {
+	if (likely(regs->fred_ss.vector == FRED_SYSCALL && regs->fred_ss.l)) {
 		regs->orig_ax = regs->ax;
 		regs->ax = -ENOSYS;
 		do_syscall_64(regs, regs->orig_ax);
 		return;
 	} else if (ia32_enabled() &&
-		   likely(regs->fred_ss.vector == FRED_SYSENTER && !regs->fred_ss.lm)) {
+		   likely(regs->fred_ss.vector == FRED_SYSENTER && !regs->fred_ss.l)) {
 		regs->orig_ax = regs->ax;
 		regs->ax = -ENOSYS;
 		do_fast_syscall_32(regs);
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index 8868f5f5379b..44656d2fb555 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -2,6 +2,7 @@
 #include <linux/perf_event.h>
 #include <linux/jump_label.h>
 #include <linux/export.h>
+#include <linux/kvm_types.h>
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/slab.h>
@@ -1574,7 +1575,7 @@ void amd_pmu_enable_virt(void)
 	/* Reload all events */
 	amd_pmu_reload_virt();
 }
-EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
+EXPORT_SYMBOL_FOR_KVM(amd_pmu_enable_virt);
 
 void amd_pmu_disable_virt(void)
 {
@@ -1591,4 +1592,4 @@ void amd_pmu_disable_virt(void)
 	/* Reload all events */
 	amd_pmu_reload_virt();
 }
-EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);
+EXPORT_SYMBOL_FOR_KVM(amd_pmu_disable_virt);
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index c4091482e6c8..0c38a31d5fc7 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -20,6 +20,7 @@
 #include <linux/export.h>
 #include <linux/init.h>
 #include <linux/kdebug.h>
+#include <linux/kvm_types.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/clock.h>
 #include <linux/uaccess.h>
@@ -723,7 +724,7 @@ struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data)
 {
 	return static_call(x86_pmu_guest_get_msrs)(nr, data);
 }
-EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
+EXPORT_SYMBOL_FOR_KVM(perf_guest_get_msrs);
 
 /*
  * There may be PMI landing after enabled=0. The PMI hitting could be before or
@@ -3078,7 +3079,7 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
 	cap->events_mask_len	= x86_pmu.events_mask_len;
 	cap->pebs_ept		= x86_pmu.pebs_ept;
 }
-EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
+EXPORT_SYMBOL_FOR_KVM(perf_get_x86_pmu_capability);
 
 u64 perf_get_hw_event_config(int hw_event)
 {
@@ -3089,4 +3090,4 @@ u64 perf_get_hw_event_config(int hw_event)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(perf_get_hw_event_config);
+EXPORT_SYMBOL_FOR_KVM(perf_get_hw_event_config);
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 7aa59966e7c3..72f2adcda7c6 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <linux/kvm_types.h>
 #include <linux/perf_event.h>
 #include <linux/types.h>
 
@@ -1705,7 +1706,7 @@ void x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
 	lbr->info = x86_pmu.lbr_info;
 	lbr->has_callstack = x86_pmu_has_lbr_callstack();
 }
-EXPORT_SYMBOL_GPL(x86_perf_get_lbr);
+EXPORT_SYMBOL_FOR_KVM(x86_perf_get_lbr);
 
 struct event_constraint vlbr_constraint =
 	__EVENT_CONSTRAINT(INTEL_FIXED_VLBR_EVENT, (1ULL << INTEL_PMC_IDX_FIXED_VLBR),
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index e8cf29d2b10c..44524a387c58 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -17,6 +17,7 @@
 #include <linux/limits.h>
 #include <linux/slab.h>
 #include <linux/device.h>
+#include <linux/kvm_types.h>
 
 #include <asm/cpuid/api.h>
 #include <asm/perf_event.h>
@@ -82,13 +83,13 @@ u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability)
 
 	return (c & cd->mask) >> shift;
 }
-EXPORT_SYMBOL_GPL(intel_pt_validate_cap);
+EXPORT_SYMBOL_FOR_KVM(intel_pt_validate_cap);
 
 u32 intel_pt_validate_hw_cap(enum pt_capabilities cap)
 {
 	return intel_pt_validate_cap(pt_pmu.caps, cap);
 }
-EXPORT_SYMBOL_GPL(intel_pt_validate_hw_cap);
+EXPORT_SYMBOL_FOR_KVM(intel_pt_validate_hw_cap);
 
 static ssize_t pt_cap_show(struct device *cdev,
 			   struct device_attribute *attr,
@@ -1590,7 +1591,7 @@ void intel_pt_handle_vmx(int on)
 
 	local_irq_restore(flags);
 }
-EXPORT_SYMBOL_GPL(intel_pt_handle_vmx);
+EXPORT_SYMBOL_FOR_KVM(intel_pt_handle_vmx);
 
 /*
  * PMU callbacks
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index fc5f32d4da6e..4b1a6ade1700 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -30,7 +30,7 @@ enum cpuid_leafs
 	CPUID_6_EAX,
 	CPUID_8000_000A_EDX,
 	CPUID_7_ECX,
-	CPUID_8000_0007_EBX,
+	CPUID_LNX_6,
 	CPUID_7_EDX,
 	CPUID_8000_001F_EAX,
 	CPUID_8000_0021_EAX,
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 2abee3e9dc75..d90ce601917c 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -314,13 +314,14 @@
 #define X86_FEATURE_SM4			(12*32+ 2) /* SM4 instructions */
 #define X86_FEATURE_AVX_VNNI		(12*32+ 4) /* "avx_vnni" AVX VNNI instructions */
 #define X86_FEATURE_AVX512_BF16		(12*32+ 5) /* "avx512_bf16" AVX512 BFLOAT16 instructions */
+#define X86_FEATURE_LASS		(12*32+ 6) /* "lass" Linear Address Space Separation */
 #define X86_FEATURE_CMPCCXADD           (12*32+ 7) /* CMPccXADD instructions */
 #define X86_FEATURE_ARCH_PERFMON_EXT	(12*32+ 8) /* Intel Architectural PerfMon Extension */
 #define X86_FEATURE_FZRM		(12*32+10) /* Fast zero-length REP MOVSB */
 #define X86_FEATURE_FSRS		(12*32+11) /* Fast short REP STOSB */
 #define X86_FEATURE_FSRC		(12*32+12) /* Fast short REP {CMPSB,SCASB} */
 #define X86_FEATURE_FRED		(12*32+17) /* "fred" Flexible Return and Event Delivery */
-#define X86_FEATURE_LKGS		(12*32+18) /* Load "kernel" (userspace) GS */
+#define X86_FEATURE_LKGS		(12*32+18) /* Like MOV_GS except MSR_KERNEL_GS_BASE = GS.base */
 #define X86_FEATURE_WRMSRNS		(12*32+19) /* Non-serializing WRMSR */
 #define X86_FEATURE_AMX_FP16		(12*32+21) /* AMX fp16 Support */
 #define X86_FEATURE_AVX_IFMA            (12*32+23) /* Support for VPMADD52[H,L]UQ */
@@ -407,9 +408,12 @@
 #define X86_FEATURE_ENQCMD		(16*32+29) /* "enqcmd" ENQCMD and ENQCMDS instructions */
 #define X86_FEATURE_SGX_LC		(16*32+30) /* "sgx_lc" Software Guard Extensions Launch Control */
 
-/* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */
+/*
+ * Linux-defined word for use with scattered/synthetic bits.
+ */
 #define X86_FEATURE_OVERFLOW_RECOV	(17*32+ 0) /* "overflow_recov" MCA overflow recovery support */
 #define X86_FEATURE_SUCCOR		(17*32+ 1) /* "succor" Uncorrectable error containment and recovery */
+
 #define X86_FEATURE_SMCA		(17*32+ 3) /* "smca" Scalable MCA */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */
@@ -499,6 +503,7 @@
 #define X86_FEATURE_IBPB_EXIT_TO_USER	(21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */
 #define X86_FEATURE_ABMC		(21*32+15) /* Assignable Bandwidth Monitoring Counters */
 #define X86_FEATURE_MSR_IMM		(21*32+16) /* MSR immediate form instructions */
+#define X86_FEATURE_SGX_EUPDATESVN	(21*32+17) /* Support for ENCLS[EUPDATESVN] instruction */
 
 #define X86_FEATURE_SDCIAE		(21*32+18) /* L3 Smart Data Cache Injection Allocation Enforcement */
 
diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h
index 12b34d5b2953..2bb65677c079 100644
--- a/arch/x86/include/asm/fred.h
+++ b/arch/x86/include/asm/fred.h
@@ -79,7 +79,7 @@ static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int
 		.type   = type,
 		.vector = vector,
 		.nmi    = type == EVENT_TYPE_NMI,
-		.lm     = 1,
+		.l      = 1,
 	};
 
 	asm_fred_entry_from_kvm(ss);
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index abd637e54e94..3218770670d3 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -393,7 +393,7 @@ static __always_inline void __##func(struct pt_regs *regs)
 
 /**
  * DEFINE_IDTENTRY_VC_KERNEL - Emit code for VMM communication handler
-			       when raised from kernel mode
+ *			       when raised from kernel mode
  * @func:	Function name of the entry point
  *
  * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
@@ -403,7 +403,7 @@ static __always_inline void __##func(struct pt_regs *regs)
 
 /**
  * DEFINE_IDTENTRY_VC_USER - Emit code for VMM communication handler
-			     when raised from user mode
+ *			     when raised from user mode
  * @func:	Function name of the entry point
  *
  * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
diff --git a/arch/x86/include/asm/kvm_types.h b/arch/x86/include/asm/kvm_types.h
index 23268a188e70..d7c704ed1be9 100644
--- a/arch/x86/include/asm/kvm_types.h
+++ b/arch/x86/include/asm/kvm_types.h
@@ -10,6 +10,11 @@
 #define KVM_SUB_MODULES kvm-intel
 #else
 #undef KVM_SUB_MODULES
+/*
+ * Don't export symbols for KVM without vendor modules, as kvm.ko is built iff
+ * at least one vendor module is enabled.
+ */
+#define EXPORT_SYMBOL_FOR_KVM(symbol)
 #endif
 
 #define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 40
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 53f4089333f2..2f0e47be79a4 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -9,6 +9,7 @@
 #include <asm/alternative.h>
 
 #include <linux/kmsan-checks.h>
+#include <linux/mmdebug.h>
 
 /* duplicated to the one in bootmem.h */
 extern unsigned long max_pfn;
@@ -31,13 +32,20 @@ static __always_inline unsigned long __phys_addr_nodebug(unsigned long x)
 
 #ifdef CONFIG_DEBUG_VIRTUAL
 extern unsigned long __phys_addr(unsigned long);
-extern unsigned long __phys_addr_symbol(unsigned long);
 #else
 #define __phys_addr(x)		__phys_addr_nodebug(x)
-#define __phys_addr_symbol(x) \
-	((unsigned long)(x) - __START_KERNEL_map + phys_base)
 #endif
 
+static inline unsigned long __phys_addr_symbol(unsigned long x)
+{
+	unsigned long y = x - __START_KERNEL_map;
+
+	/* only check upper bounds since lower bounds will trigger carry */
+	VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
+
+	return y + phys_base;
+}
+
 #define __phys_reloc_hide(x)	(x)
 
 void clear_page_orig(void *page);
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 332428caaed2..725d0eff7acd 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -23,6 +23,7 @@
 #else /* !__ASSEMBLY__: */
 
 #include <linux/args.h>
+#include <linux/bits.h>
 #include <linux/build_bug.h>
 #include <linux/stringify.h>
 #include <asm/asm.h>
@@ -572,9 +573,9 @@ do {									\
 #define x86_this_cpu_constant_test_bit(_nr, _var)			\
 ({									\
 	unsigned long __percpu *addr__ =				\
-		(unsigned long __percpu *)&(_var) + ((_nr) / BITS_PER_LONG); \
+		(unsigned long __percpu *)&(_var) + BIT_WORD(_nr);	\
 									\
-	!!((1UL << ((_nr) % BITS_PER_LONG)) & raw_cpu_read(*addr__));	\
+	!!(BIT_MASK(_nr) & raw_cpu_read(*addr__));			\
 })
 
 #define x86_this_cpu_variable_test_bit(_nr, _var)			\
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index b5dec859bc75..35d062a2e304 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -84,8 +84,8 @@ struct fred_ss {
 			:  4,
 		/* Event was incident to enclave execution */
 		enclave	:  1,
-		/* CPU was in long mode */
-		lm	:  1,
+		/* CPU was in 64-bit mode */
+		l	:  1,
 		/*
 		 * Nested exception during FRED delivery, not set
 		 * for #DF.
diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
index 6a0069761508..04958459a7ca 100644
--- a/arch/x86/include/asm/sgx.h
+++ b/arch/x86/include/asm/sgx.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/**
+/*
  * Copyright(c) 2016-20 Intel Corporation.
  *
  * Intel Software Guard Extensions (SGX) support.
@@ -28,21 +28,22 @@
 #define SGX_CPUID_EPC_MASK	GENMASK(3, 0)
 
 enum sgx_encls_function {
-	ECREATE	= 0x00,
-	EADD	= 0x01,
-	EINIT	= 0x02,
-	EREMOVE	= 0x03,
-	EDGBRD	= 0x04,
-	EDGBWR	= 0x05,
-	EEXTEND	= 0x06,
-	ELDU	= 0x08,
-	EBLOCK	= 0x09,
-	EPA	= 0x0A,
-	EWB	= 0x0B,
-	ETRACK	= 0x0C,
-	EAUG	= 0x0D,
-	EMODPR	= 0x0E,
-	EMODT	= 0x0F,
+	ECREATE		= 0x00,
+	EADD		= 0x01,
+	EINIT		= 0x02,
+	EREMOVE		= 0x03,
+	EDGBRD		= 0x04,
+	EDGBWR		= 0x05,
+	EEXTEND		= 0x06,
+	ELDU		= 0x08,
+	EBLOCK		= 0x09,
+	EPA		= 0x0A,
+	EWB		= 0x0B,
+	ETRACK		= 0x0C,
+	EAUG		= 0x0D,
+	EMODPR		= 0x0E,
+	EMODT		= 0x0F,
+	EUPDATESVN	= 0x18,
 };
 
 /**
@@ -65,15 +66,19 @@ enum sgx_encls_function {
 
 /**
  * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV
- * %SGX_EPC_PAGE_CONFLICT:	Page is being written by other ENCLS function.
- * %SGX_NOT_TRACKED:		Previous ETRACK's shootdown sequence has not
+ * @SGX_EPC_PAGE_CONFLICT:	Page is being written by other ENCLS function.
+ * @SGX_NOT_TRACKED:		Previous ETRACK's shootdown sequence has not
  *				been completed yet.
- * %SGX_CHILD_PRESENT		SECS has child pages present in the EPC.
- * %SGX_INVALID_EINITTOKEN:	EINITTOKEN is invalid and enclave signer's
+ * @SGX_CHILD_PRESENT:		SECS has child pages present in the EPC.
+ * @SGX_INVALID_EINITTOKEN:	EINITTOKEN is invalid and enclave signer's
  *				public key does not match IA32_SGXLEPUBKEYHASH.
- * %SGX_PAGE_NOT_MODIFIABLE:	The EPC page cannot be modified because it
+ * @SGX_PAGE_NOT_MODIFIABLE:	The EPC page cannot be modified because it
  *				is in the PENDING or MODIFIED state.
- * %SGX_UNMASKED_EVENT:		An unmasked event, e.g. INTR, was received
+ * @SGX_INSUFFICIENT_ENTROPY:	Insufficient entropy in RNG.
+ * @SGX_NO_UPDATE:		EUPDATESVN could not update the CPUSVN because the
+ *				current SVN was not newer than CPUSVN. This is the most
+ *				common error code returned by EUPDATESVN.
+ * @SGX_UNMASKED_EVENT:		An unmasked event, e.g. INTR, was received
  */
 enum sgx_return_code {
 	SGX_EPC_PAGE_CONFLICT		= 7,
@@ -81,6 +86,8 @@ enum sgx_return_code {
 	SGX_CHILD_PRESENT		= 13,
 	SGX_INVALID_EINITTOKEN		= 16,
 	SGX_PAGE_NOT_MODIFIABLE		= 20,
+	SGX_INSUFFICIENT_ENTROPY	= 29,
+	SGX_NO_UPDATE			= 31,
 	SGX_UNMASKED_EVENT		= 128,
 };
 
@@ -89,7 +96,7 @@ enum sgx_return_code {
 
 /**
  * enum sgx_miscselect - additional information to an SSA frame
- * %SGX_MISC_EXINFO:	Report #PF or #GP to the SSA frame.
+ * @SGX_MISC_EXINFO:	Report #PF or #GP to the SSA frame.
  *
  * Save State Area (SSA) is a stack inside the enclave used to store processor
  * state when an exception or interrupt occurs. This enum defines additional
@@ -105,17 +112,17 @@ enum sgx_miscselect {
 #define SGX_SSA_MISC_EXINFO_SIZE	16
 
 /**
- * enum sgx_attributes - the attributes field in &struct sgx_secs
- * %SGX_ATTR_INIT:		Enclave can be entered (is initialized).
- * %SGX_ATTR_DEBUG:		Allow ENCLS(EDBGRD) and ENCLS(EDBGWR).
- * %SGX_ATTR_MODE64BIT:		Tell that this a 64-bit enclave.
- * %SGX_ATTR_PROVISIONKEY:      Allow to use provisioning keys for remote
+ * enum sgx_attribute - the attributes field in &struct sgx_secs
+ * @SGX_ATTR_INIT:		Enclave can be entered (is initialized).
+ * @SGX_ATTR_DEBUG:		Allow ENCLS(EDBGRD) and ENCLS(EDBGWR).
+ * @SGX_ATTR_MODE64BIT:		Tell that this a 64-bit enclave.
+ * @SGX_ATTR_PROVISIONKEY:      Allow to use provisioning keys for remote
  *				attestation.
- * %SGX_ATTR_KSS:		Allow to use key separation and sharing (KSS).
- * %SGX_ATTR_EINITTOKENKEY:	Allow to use token signing key that is used to
+ * @SGX_ATTR_KSS:		Allow to use key separation and sharing (KSS).
+ * @SGX_ATTR_EINITTOKENKEY:	Allow to use token signing key that is used to
  *				sign cryptographic tokens that can be passed to
  *				EINIT as an authorization to run an enclave.
- * %SGX_ATTR_ASYNC_EXIT_NOTIFY:	Allow enclaves to be notified after an
+ * @SGX_ATTR_ASYNC_EXIT_NOTIFY:	Allow enclaves to be notified after an
  *				asynchronous exit has occurred.
  */
 enum sgx_attribute {
@@ -188,7 +195,7 @@ struct sgx_secs {
 
 /**
  * enum sgx_tcs_flags - execution flags for TCS
- * %SGX_TCS_DBGOPTIN:	If enabled allows single-stepping and breakpoints
+ * @SGX_TCS_DBGOPTIN:	If enabled allows single-stepping and breakpoints
  *			inside an enclave. It is cleared by EADD but can
  *			be set later with EDBGWR.
  */
@@ -253,11 +260,11 @@ struct sgx_pageinfo {
 
 /**
  * enum sgx_page_type - bits in the SECINFO flags defining the page type
- * %SGX_PAGE_TYPE_SECS:	a SECS page
- * %SGX_PAGE_TYPE_TCS:	a TCS page
- * %SGX_PAGE_TYPE_REG:	a regular page
- * %SGX_PAGE_TYPE_VA:	a VA page
- * %SGX_PAGE_TYPE_TRIM:	a page in trimmed state
+ * @SGX_PAGE_TYPE_SECS:	a SECS page
+ * @SGX_PAGE_TYPE_TCS:	a TCS page
+ * @SGX_PAGE_TYPE_REG:	a regular page
+ * @SGX_PAGE_TYPE_VA:	a VA page
+ * @SGX_PAGE_TYPE_TRIM:	a page in trimmed state
  *
  * Make sure when making changes to this enum that its values can still fit
  * in the bitfield within &struct sgx_encl_page
@@ -275,14 +282,14 @@ enum sgx_page_type {
 
 /**
  * enum sgx_secinfo_flags - the flags field in &struct sgx_secinfo
- * %SGX_SECINFO_R:	allow read
- * %SGX_SECINFO_W:	allow write
- * %SGX_SECINFO_X:	allow execution
- * %SGX_SECINFO_SECS:	a SECS page
- * %SGX_SECINFO_TCS:	a TCS page
- * %SGX_SECINFO_REG:	a regular page
- * %SGX_SECINFO_VA:	a VA page
- * %SGX_SECINFO_TRIM:	a page in trimmed state
+ * @SGX_SECINFO_R:	allow read
+ * @SGX_SECINFO_W:	allow write
+ * @SGX_SECINFO_X:	allow execution
+ * @SGX_SECINFO_SECS:	a SECS page
+ * @SGX_SECINFO_TCS:	a TCS page
+ * @SGX_SECINFO_REG:	a regular page
+ * @SGX_SECINFO_VA:	a VA page
+ * @SGX_SECINFO_TRIM:	a page in trimmed state
  */
 enum sgx_secinfo_flags {
 	SGX_SECINFO_R			= BIT(0),
diff --git a/arch/x86/include/asm/shared/msr.h b/arch/x86/include/asm/shared/msr.h
index 1e6ec10b3a15..a20b1c08c99f 100644
--- a/arch/x86/include/asm/shared/msr.h
+++ b/arch/x86/include/asm/shared/msr.h
@@ -12,4 +12,19 @@ struct msr {
 	};
 };
 
+/*
+ * The kernel proper already defines rdmsr()/wrmsr(), but they are not for the
+ * boot kernel since they rely on tracepoint/exception handling infrastructure
+ * that's not available here.
+ */
+static inline void raw_rdmsr(unsigned int reg, struct msr *m)
+{
+	asm volatile("rdmsr" : "=a" (m->l), "=d" (m->h) : "c" (reg));
+}
+
+static inline void raw_wrmsr(unsigned int reg, const struct msr *m)
+{
+	asm volatile("wrmsr" : : "c" (reg), "a"(m->l), "d" (m->h) : "memory");
+}
+
 #endif /* _ASM_X86_SHARED_MSR_H */
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h
index 4f84d421d1cf..20a3baae9568 100644
--- a/arch/x86/include/asm/smap.h
+++ b/arch/x86/include/asm/smap.h
@@ -23,18 +23,55 @@
 
 #else /* __ASSEMBLER__ */
 
+/*
+ * The CLAC/STAC instructions toggle the enforcement of
+ * X86_FEATURE_SMAP along with X86_FEATURE_LASS.
+ *
+ * SMAP enforcement is based on the _PAGE_BIT_USER bit in the page
+ * tables. The kernel is not allowed to touch pages with that bit set
+ * unless the AC bit is set.
+ *
+ * Use stac()/clac() when accessing userspace (_PAGE_USER) mappings,
+ * regardless of location.
+ *
+ * Note: a barrier is implicit in alternative().
+ */
+
 static __always_inline void clac(void)
 {
-	/* Note: a barrier is implicit in alternative() */
 	alternative("", "clac", X86_FEATURE_SMAP);
 }
 
 static __always_inline void stac(void)
 {
-	/* Note: a barrier is implicit in alternative() */
 	alternative("", "stac", X86_FEATURE_SMAP);
 }
 
+/*
+ * LASS enforcement is based on bit 63 of the virtual address. The
+ * kernel is not allowed to touch memory in the lower half of the
+ * virtual address space.
+ *
+ * Use lass_stac()/lass_clac() to toggle the AC bit for kernel data
+ * accesses (!_PAGE_USER) that are blocked by LASS, but not by SMAP.
+ *
+ * Even with the AC bit set, LASS will continue to block instruction
+ * fetches from the user half of the address space. To allow those,
+ * clear CR4.LASS to disable the LASS mechanism entirely.
+ *
+ * Note: a barrier is implicit in alternative().
+ */
+
+static __always_inline void lass_clac(void)
+{
+	alternative("", "clac", X86_FEATURE_LASS);
+}
+
+static __always_inline void lass_stac(void)
+{
+	alternative("", "stac", X86_FEATURE_LASS);
+}
+
 static __always_inline unsigned long smap_save(void)
 {
 	unsigned long flags;
diff --git a/arch/x86/include/asm/string.h b/arch/x86/include/asm/string.h
index c3c2c1914d65..9cb5aae7fba9 100644
--- a/arch/x86/include/asm/string.h
+++ b/arch/x86/include/asm/string.h
@@ -1,6 +1,32 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_STRING_H
+#define _ASM_X86_STRING_H
+
 #ifdef CONFIG_X86_32
 # include <asm/string_32.h>
 #else
 # include <asm/string_64.h>
 #endif
+
+static __always_inline void *__inline_memcpy(void *to, const void *from, size_t len)
+{
+	void *ret = to;
+
+	asm volatile("rep movsb"
+		     : "+D" (to), "+S" (from), "+c" (len)
+		     : : "memory");
+	return ret;
+}
+
+static __always_inline void *__inline_memset(void *s, int v, size_t n)
+{
+	void *ret = s;
+
+	asm volatile("rep stosb"
+		     : "+D" (s), "+c" (n)
+		     : "a" ((uint8_t)v)
+		     : "memory");
+	return ret;
+}
+
+#endif /* _ASM_X86_STRING_H */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 17f6c3fedeee..0581c477d466 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -701,5 +701,6 @@ DEFINE_GHCB_ACCESSORS(sw_exit_info_1)
 DEFINE_GHCB_ACCESSORS(sw_exit_info_2)
 DEFINE_GHCB_ACCESSORS(sw_scratch)
 DEFINE_GHCB_ACCESSORS(xcr0)
+DEFINE_GHCB_ACCESSORS(xss)
 
 #endif
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index f1a4adc78272..81d0c8bf1137 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -136,6 +136,8 @@
 #define X86_CR4_PKE		_BITUL(X86_CR4_PKE_BIT)
 #define X86_CR4_CET_BIT		23 /* enable Control-flow Enforcement Technology */
 #define X86_CR4_CET		_BITUL(X86_CR4_CET_BIT)
+#define X86_CR4_LASS_BIT	27 /* enable Linear Address Space Separation support */
+#define X86_CR4_LASS		_BITUL(X86_CR4_LASS_BIT)
 #define X86_CR4_LAM_SUP_BIT	28 /* LAM for supervisor pointers */
 #define X86_CR4_LAM_SUP		_BITUL(X86_CR4_LAM_SUP_BIT)
 
diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h
index 2dd35bbdc822..3c4d52072189 100644
--- a/arch/x86/include/uapi/asm/sgx.h
+++ b/arch/x86/include/uapi/asm/sgx.h
@@ -10,7 +10,7 @@
 
 /**
  * enum sgx_page_flags - page control flags
- * %SGX_PAGE_MEASURE:	Measure the page contents with a sequence of
+ * @SGX_PAGE_MEASURE:	Measure the page contents with a sequence of
  *			ENCLS[EEXTEND] operations.
  */
 enum sgx_page_flags {
@@ -143,6 +143,12 @@ struct sgx_enclave_run;
 /**
  * typedef sgx_enclave_user_handler_t - Exit handler function accepted by
  *					__vdso_sgx_enter_enclave()
+ * @rdi:	RDI at the time of EEXIT, undefined on AEX
+ * @rsi:	RSI at the time of EEXIT, undefined on AEX
+ * @rdx:	RDX at the time of EEXIT, undefined on AEX
+ * @rsp:	RSP (untrusted) at the time of EEXIT or AEX
+ * @r8:		R8 at the time of EEXIT, undefined on AEX
+ * @r9:		R9 at the time of EEXIT, undefined on AEX
  * @run:	The run instance given by the caller
  *
  * The register parameters contain the snapshot of their values at enclave
@@ -166,7 +172,7 @@ typedef int (*sgx_enclave_user_handler_t)(long rdi, long rsi, long rdx,
  * @exception_addr:		The address that triggered the exception
  * @user_handler:		User provided callback run on exception
  * @user_data:			Data passed to the user handler
- * @reserved			Reserved for future extensions
+ * @reserved:			Reserved for future extensions
  *
  * If @user_handler is provided, the handler will be invoked on all return paths
  * of the normal flow.  The user handler may transfer control, e.g. via a
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index e377b06e70e3..74f4c659f9c9 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2453,16 +2453,30 @@ void __init_or_module text_poke_early(void *addr, const void *opcode,
 __ro_after_init struct mm_struct *text_poke_mm;
 __ro_after_init unsigned long text_poke_mm_addr;
 
+/*
+ * Text poking creates and uses a mapping in the lower half of the
+ * address space. Relax LASS enforcement when accessing the poking
+ * address.
+ *
+ * objtool enforces a strict policy of "no function calls within AC=1
+ * regions". Adhere to the policy by using inline versions of
+ * memcpy()/memset() that will never result in a function call.
+ */
+
 static void text_poke_memcpy(void *dst, const void *src, size_t len)
 {
-	memcpy(dst, src, len);
+	lass_stac();
+	__inline_memcpy(dst, src, len);
+	lass_clac();
 }
 
 static void text_poke_memset(void *dst, const void *src, size_t len)
 {
 	int c = *(const int *)src;
 
-	memset(dst, c, len);
+	lass_stac();
+	__inline_memset(dst, c, len);
+	lass_clac();
 }
 
 typedef void text_poke_f(void *dst, const void *src, size_t len);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index ca1c8b70ac44..9c29e12b84e5 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -36,6 +36,7 @@
 #include <linux/dmi.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
+#include <linux/kvm_types.h>
 
 #include <xen/xen.h>
 
@@ -2319,7 +2320,7 @@ u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid)
 		dest |= msg->arch_addr_hi.destid_8_31 << 8;
 	return dest;
 }
-EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid);
+EXPORT_SYMBOL_FOR_KVM(x86_msi_msg_get_destid);
 
 static void __init apic_bsp_up_setup(void)
 {
diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c
index 9ef3be866832..2ed3b5c88c7f 100644
--- a/arch/x86/kernel/apic/apic_common.c
+++ b/arch/x86/kernel/apic/apic_common.c
@@ -4,6 +4,7 @@
  * SPDX-License-Identifier: GPL-2.0
  */
 #include <linux/irq.h>
+#include <linux/kvm_types.h>
 #include <asm/apic.h>
 
 #include "local.h"
@@ -25,7 +26,7 @@ u32 default_cpu_present_to_apicid(int mps_cpu)
 	else
 		return BAD_APICID;
 }
-EXPORT_SYMBOL_GPL(default_cpu_present_to_apicid);
+EXPORT_SYMBOL_FOR_KVM(default_cpu_present_to_apicid);
 
 /*
  * Set up the logical destination ID when the APIC operates in logical
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 5d46709c58d0..bc94ff1e250a 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -3,7 +3,7 @@
 #include <linux/bitops.h>
 #include <linux/elf.h>
 #include <linux/mm.h>
-
+#include <linux/kvm_types.h>
 #include <linux/io.h>
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
@@ -1318,7 +1318,7 @@ unsigned long amd_get_dr_addr_mask(unsigned int dr)
 
 	return per_cpu(amd_dr_addr_mask[dr], smp_processor_id());
 }
-EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask);
+EXPORT_SYMBOL_FOR_KVM(amd_get_dr_addr_mask);
 
 static void zenbleed_check_cpu(void *unused)
 {
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index d7fa03bf51b4..d8660770dc6a 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -16,6 +16,7 @@
 #include <linux/sched/smt.h>
 #include <linux/pgtable.h>
 #include <linux/bpf.h>
+#include <linux/kvm_types.h>
 
 #include <asm/spec-ctrl.h>
 #include <asm/cmdline.h>
@@ -53,56 +54,8 @@
  * mitigation option.
  */
 
-static void __init spectre_v1_select_mitigation(void);
-static void __init spectre_v1_apply_mitigation(void);
-static void __init spectre_v2_select_mitigation(void);
-static void __init spectre_v2_update_mitigation(void);
-static void __init spectre_v2_apply_mitigation(void);
-static void __init retbleed_select_mitigation(void);
-static void __init retbleed_update_mitigation(void);
-static void __init retbleed_apply_mitigation(void);
-static void __init spectre_v2_user_select_mitigation(void);
-static void __init spectre_v2_user_update_mitigation(void);
-static void __init spectre_v2_user_apply_mitigation(void);
-static void __init ssb_select_mitigation(void);
-static void __init ssb_apply_mitigation(void);
-static void __init l1tf_select_mitigation(void);
-static void __init l1tf_apply_mitigation(void);
-static void __init mds_select_mitigation(void);
-static void __init mds_update_mitigation(void);
-static void __init mds_apply_mitigation(void);
-static void __init taa_select_mitigation(void);
-static void __init taa_update_mitigation(void);
-static void __init taa_apply_mitigation(void);
-static void __init mmio_select_mitigation(void);
-static void __init mmio_update_mitigation(void);
-static void __init mmio_apply_mitigation(void);
-static void __init rfds_select_mitigation(void);
-static void __init rfds_update_mitigation(void);
-static void __init rfds_apply_mitigation(void);
-static void __init srbds_select_mitigation(void);
-static void __init srbds_apply_mitigation(void);
-static void __init l1d_flush_select_mitigation(void);
-static void __init srso_select_mitigation(void);
-static void __init srso_update_mitigation(void);
-static void __init srso_apply_mitigation(void);
-static void __init gds_select_mitigation(void);
-static void __init gds_apply_mitigation(void);
-static void __init bhi_select_mitigation(void);
-static void __init bhi_update_mitigation(void);
-static void __init bhi_apply_mitigation(void);
-static void __init its_select_mitigation(void);
-static void __init its_update_mitigation(void);
-static void __init its_apply_mitigation(void);
-static void __init tsa_select_mitigation(void);
-static void __init tsa_apply_mitigation(void);
-static void __init vmscape_select_mitigation(void);
-static void __init vmscape_update_mitigation(void);
-static void __init vmscape_apply_mitigation(void);
-
 /* The base value of the SPEC_CTRL MSR without task-specific bits set */
 u64 x86_spec_ctrl_base;
-EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
 
 /* The current value of the SPEC_CTRL MSR with task-specific bits set */
 DEFINE_PER_CPU(u64, x86_spec_ctrl_current);
@@ -179,7 +132,7 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
 
 /* Control IBPB on vCPU load */
 DEFINE_STATIC_KEY_FALSE(switch_vcpu_ibpb);
-EXPORT_SYMBOL_GPL(switch_vcpu_ibpb);
+EXPORT_SYMBOL_FOR_KVM(switch_vcpu_ibpb);
 
 /* Control CPU buffer clear before idling (halt, mwait) */
 DEFINE_STATIC_KEY_FALSE(cpu_buf_idle_clear);
@@ -198,7 +151,7 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
  * mitigation is required.
  */
 DEFINE_STATIC_KEY_FALSE(cpu_buf_vm_clear);
-EXPORT_SYMBOL_GPL(cpu_buf_vm_clear);
+EXPORT_SYMBOL_FOR_KVM(cpu_buf_vm_clear);
 
 #undef pr_fmt
 #define pr_fmt(fmt)	"mitigations: " fmt
@@ -233,99 +186,6 @@ static void __init cpu_print_attack_vectors(void)
 	}
 }
 
-void __init cpu_select_mitigations(void)
-{
-	/*
-	 * Read the SPEC_CTRL MSR to account for reserved bits which may
-	 * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD
-	 * init code as it is not enumerated and depends on the family.
-	 */
-	if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) {
-		rdmsrq(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
-
-		/*
-		 * Previously running kernel (kexec), may have some controls
-		 * turned ON. Clear them and let the mitigations setup below
-		 * rediscover them based on configuration.
-		 */
-		x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK;
-	}
-
-	x86_arch_cap_msr = x86_read_arch_cap_msr();
-
-	cpu_print_attack_vectors();
-
-	/* Select the proper CPU mitigations before patching alternatives: */
-	spectre_v1_select_mitigation();
-	spectre_v2_select_mitigation();
-	retbleed_select_mitigation();
-	spectre_v2_user_select_mitigation();
-	ssb_select_mitigation();
-	l1tf_select_mitigation();
-	mds_select_mitigation();
-	taa_select_mitigation();
-	mmio_select_mitigation();
-	rfds_select_mitigation();
-	srbds_select_mitigation();
-	l1d_flush_select_mitigation();
-	srso_select_mitigation();
-	gds_select_mitigation();
-	its_select_mitigation();
-	bhi_select_mitigation();
-	tsa_select_mitigation();
-	vmscape_select_mitigation();
-
-	/*
-	 * After mitigations are selected, some may need to update their
-	 * choices.
-	 */
-	spectre_v2_update_mitigation();
-	/*
-	 * retbleed_update_mitigation() relies on the state set by
-	 * spectre_v2_update_mitigation(); specifically it wants to know about
-	 * spectre_v2=ibrs.
-	 */
-	retbleed_update_mitigation();
-	/*
-	 * its_update_mitigation() depends on spectre_v2_update_mitigation()
-	 * and retbleed_update_mitigation().
-	 */
-	its_update_mitigation();
-
-	/*
-	 * spectre_v2_user_update_mitigation() depends on
-	 * retbleed_update_mitigation(), specifically the STIBP
-	 * selection is forced for UNRET or IBPB.
-	 */
-	spectre_v2_user_update_mitigation();
-	mds_update_mitigation();
-	taa_update_mitigation();
-	mmio_update_mitigation();
-	rfds_update_mitigation();
-	bhi_update_mitigation();
-	/* srso_update_mitigation() depends on retbleed_update_mitigation(). */
-	srso_update_mitigation();
-	vmscape_update_mitigation();
-
-	spectre_v1_apply_mitigation();
-	spectre_v2_apply_mitigation();
-	retbleed_apply_mitigation();
-	spectre_v2_user_apply_mitigation();
-	ssb_apply_mitigation();
-	l1tf_apply_mitigation();
-	mds_apply_mitigation();
-	taa_apply_mitigation();
-	mmio_apply_mitigation();
-	rfds_apply_mitigation();
-	srbds_apply_mitigation();
-	srso_apply_mitigation();
-	gds_apply_mitigation();
-	its_apply_mitigation();
-	bhi_apply_mitigation();
-	tsa_apply_mitigation();
-	vmscape_apply_mitigation();
-}
-
 /*
  * NOTE: This function is *only* called for SVM, since Intel uses
  * MSR_IA32_SPEC_CTRL for SSBD.
@@ -366,7 +226,7 @@ x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool setguest)
 		speculation_ctrl_update(tif);
 	}
 }
-EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl);
+EXPORT_SYMBOL_FOR_KVM(x86_virt_spec_ctrl);
 
 static void x86_amd_ssb_disable(void)
 {
@@ -1032,7 +892,7 @@ bool gds_ucode_mitigated(void)
 	return (gds_mitigation == GDS_MITIGATION_FULL ||
 		gds_mitigation == GDS_MITIGATION_FULL_LOCKED);
 }
-EXPORT_SYMBOL_GPL(gds_ucode_mitigated);
+EXPORT_SYMBOL_FOR_KVM(gds_ucode_mitigated);
 
 void update_gds_msr(void)
 {
@@ -2859,7 +2719,7 @@ void x86_spec_ctrl_setup_ap(void)
 }
 
 bool itlb_multihit_kvm_mitigation;
-EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
+EXPORT_SYMBOL_FOR_KVM(itlb_multihit_kvm_mitigation);
 
 #undef pr_fmt
 #define pr_fmt(fmt)	"L1TF: " fmt
@@ -2867,11 +2727,9 @@ EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
 /* Default mitigation for L1TF-affected CPUs */
 enum l1tf_mitigations l1tf_mitigation __ro_after_init =
 	IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_AUTO : L1TF_MITIGATION_OFF;
-#if IS_ENABLED(CONFIG_KVM_INTEL)
-EXPORT_SYMBOL_GPL(l1tf_mitigation);
-#endif
+EXPORT_SYMBOL_FOR_KVM(l1tf_mitigation);
 enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
-EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation);
+EXPORT_SYMBOL_FOR_KVM(l1tf_vmx_mitigation);
 
 /*
  * These CPUs all support 44bits physical address space internally in the
@@ -3371,6 +3229,99 @@ void cpu_bugs_smt_update(void)
 	mutex_unlock(&spec_ctrl_mutex);
 }
 
+void __init cpu_select_mitigations(void)
+{
+	/*
+	 * Read the SPEC_CTRL MSR to account for reserved bits which may
+	 * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD
+	 * init code as it is not enumerated and depends on the family.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) {
+		rdmsrq(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+
+		/*
+		 * Previously running kernel (kexec), may have some controls
+		 * turned ON. Clear them and let the mitigations setup below
+		 * rediscover them based on configuration.
+		 */
+		x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK;
+	}
+
+	x86_arch_cap_msr = x86_read_arch_cap_msr();
+
+	cpu_print_attack_vectors();
+
+	/* Select the proper CPU mitigations before patching alternatives: */
+	spectre_v1_select_mitigation();
+	spectre_v2_select_mitigation();
+	retbleed_select_mitigation();
+	spectre_v2_user_select_mitigation();
+	ssb_select_mitigation();
+	l1tf_select_mitigation();
+	mds_select_mitigation();
+	taa_select_mitigation();
+	mmio_select_mitigation();
+	rfds_select_mitigation();
+	srbds_select_mitigation();
+	l1d_flush_select_mitigation();
+	srso_select_mitigation();
+	gds_select_mitigation();
+	its_select_mitigation();
+	bhi_select_mitigation();
+	tsa_select_mitigation();
+	vmscape_select_mitigation();
+
+	/*
+	 * After mitigations are selected, some may need to update their
+	 * choices.
+	 */
+	spectre_v2_update_mitigation();
+	/*
+	 * retbleed_update_mitigation() relies on the state set by
+	 * spectre_v2_update_mitigation(); specifically it wants to know about
+	 * spectre_v2=ibrs.
+	 */
+	retbleed_update_mitigation();
+	/*
+	 * its_update_mitigation() depends on spectre_v2_update_mitigation()
+	 * and retbleed_update_mitigation().
+	 */
+	its_update_mitigation();
+
+	/*
+	 * spectre_v2_user_update_mitigation() depends on
+	 * retbleed_update_mitigation(), specifically the STIBP
+	 * selection is forced for UNRET or IBPB.
+	 */
+	spectre_v2_user_update_mitigation();
+	mds_update_mitigation();
+	taa_update_mitigation();
+	mmio_update_mitigation();
+	rfds_update_mitigation();
+	bhi_update_mitigation();
+	/* srso_update_mitigation() depends on retbleed_update_mitigation(). */
+	srso_update_mitigation();
+	vmscape_update_mitigation();
+
+	spectre_v1_apply_mitigation();
+	spectre_v2_apply_mitigation();
+	retbleed_apply_mitigation();
+	spectre_v2_user_apply_mitigation();
+	ssb_apply_mitigation();
+	l1tf_apply_mitigation();
+	mds_apply_mitigation();
+	taa_apply_mitigation();
+	mmio_apply_mitigation();
+	rfds_apply_mitigation();
+	srbds_apply_mitigation();
+	srso_apply_mitigation();
+	gds_apply_mitigation();
+	its_apply_mitigation();
+	bhi_apply_mitigation();
+	tsa_apply_mitigation();
+	vmscape_apply_mitigation();
+}
+
 #ifdef CONFIG_SYSFS
 
 #define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion"
diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c
index 981f8b1f0792..dbc99a47be45 100644
--- a/arch/x86/kernel/cpu/bus_lock.c
+++ b/arch/x86/kernel/cpu/bus_lock.c
@@ -6,6 +6,7 @@
 #include <linux/workqueue.h>
 #include <linux/delay.h>
 #include <linux/cpuhotplug.h>
+#include <linux/kvm_types.h>
 #include <asm/cpu_device_id.h>
 #include <asm/cmdline.h>
 #include <asm/traps.h>
@@ -289,7 +290,7 @@ bool handle_guest_split_lock(unsigned long ip)
 	force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
 	return false;
 }
-EXPORT_SYMBOL_GPL(handle_guest_split_lock);
+EXPORT_SYMBOL_FOR_KVM(handle_guest_split_lock);
 
 void bus_lock_init(void)
 {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 02d97834a1d4..e7ab22fce3b5 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -7,6 +7,7 @@
 #include <linux/bitops.h>
 #include <linux/kernel.h>
 #include <linux/export.h>
+#include <linux/kvm_types.h>
 #include <linux/percpu.h>
 #include <linux/string.h>
 #include <linux/ctype.h>
@@ -405,6 +406,28 @@ out:
 	cr4_clear_bits(X86_CR4_UMIP);
 }
 
+static __always_inline void setup_lass(struct cpuinfo_x86 *c)
+{
+	if (!cpu_feature_enabled(X86_FEATURE_LASS))
+		return;
+
+	/*
+	 * Legacy vsyscall page access causes a #GP when LASS is active.
+	 * Disable LASS because the #GP handler doesn't support vsyscall
+	 * emulation.
+	 *
+	 * Also disable LASS when running under EFI, as some runtime and
+	 * boot services rely on 1:1 mappings in the lower half.
+	 */
+	if (IS_ENABLED(CONFIG_X86_VSYSCALL_EMULATION) ||
+	    IS_ENABLED(CONFIG_EFI)) {
+		setup_clear_cpu_cap(X86_FEATURE_LASS);
+		return;
+	}
+
+	cr4_set_bits(X86_CR4_LASS);
+}
+
 /* These bits should not change their value after CPU init is finished. */
 static const unsigned long cr4_pinned_mask = X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP |
 					     X86_CR4_FSGSBASE | X86_CR4_CET | X86_CR4_FRED;
@@ -464,14 +487,14 @@ void cr4_update_irqsoff(unsigned long set, unsigned long clear)
 		__write_cr4(newval);
 	}
 }
-EXPORT_SYMBOL(cr4_update_irqsoff);
+EXPORT_SYMBOL_FOR_KVM(cr4_update_irqsoff);
 
 /* Read the CR4 shadow. */
 unsigned long cr4_read_shadow(void)
 {
 	return this_cpu_read(cpu_tlbstate.cr4);
 }
-EXPORT_SYMBOL_GPL(cr4_read_shadow);
+EXPORT_SYMBOL_FOR_KVM(cr4_read_shadow);
 
 void cr4_init(void)
 {
@@ -726,7 +749,7 @@ void load_direct_gdt(int cpu)
 	gdt_descr.size = GDT_SIZE - 1;
 	load_gdt(&gdt_descr);
 }
-EXPORT_SYMBOL_GPL(load_direct_gdt);
+EXPORT_SYMBOL_FOR_KVM(load_direct_gdt);
 
 /* Load a fixmap remapping of the per-cpu GDT */
 void load_fixmap_gdt(int cpu)
@@ -1025,12 +1048,8 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
 		c->x86_capability[CPUID_8000_0001_EDX] = edx;
 	}
 
-	if (c->extended_cpuid_level >= 0x80000007) {
-		cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
-
-		c->x86_capability[CPUID_8000_0007_EBX] = ebx;
-		c->x86_power = edx;
-	}
+	if (c->extended_cpuid_level >= 0x80000007)
+		c->x86_power = cpuid_edx(0x80000007);
 
 	if (c->extended_cpuid_level >= 0x80000008) {
 		cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
@@ -2015,10 +2034,10 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 	/* Disable the PN if appropriate */
 	squash_the_stupid_serial_number(c);
 
-	/* Set up SMEP/SMAP/UMIP */
 	setup_smep(c);
 	setup_smap(c);
 	setup_umip(c);
+	setup_lass(c);
 
 	/* Enable FSGSBASE instructions if available. */
 	if (cpu_has(c, X86_FEATURE_FSGSBASE)) {
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index bc38b2d56f26..5c7a3a71191a 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -42,15 +42,6 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
 			    *const __x86_cpu_dev_end[];
 
 #ifdef CONFIG_CPU_SUP_INTEL
-enum tsx_ctrl_states {
-	TSX_CTRL_ENABLE,
-	TSX_CTRL_DISABLE,
-	TSX_CTRL_RTM_ALWAYS_ABORT,
-	TSX_CTRL_NOT_SUPPORTED,
-};
-
-extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state;
-
 extern void __init tsx_init(void);
 void tsx_ap_init(void);
 void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index 87e78586395b..146f6f8b0650 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -80,6 +80,7 @@ static const struct cpuid_dep cpuid_deps[] = {
 	{ X86_FEATURE_SGX_LC,			X86_FEATURE_SGX	      },
 	{ X86_FEATURE_SGX1,			X86_FEATURE_SGX       },
 	{ X86_FEATURE_SGX2,			X86_FEATURE_SGX1      },
+	{ X86_FEATURE_SGX_EUPDATESVN,		X86_FEATURE_SGX1      },
 	{ X86_FEATURE_SGX_EDECCSSA,		X86_FEATURE_SGX1      },
 	{ X86_FEATURE_XFD,			X86_FEATURE_XSAVES    },
 	{ X86_FEATURE_XFD,			X86_FEATURE_XGETBV1   },
@@ -90,6 +91,7 @@ static const struct cpuid_dep cpuid_deps[] = {
 	{ X86_FEATURE_SHSTK,			X86_FEATURE_XSAVES    },
 	{ X86_FEATURE_FRED,			X86_FEATURE_LKGS      },
 	{ X86_FEATURE_SPEC_CTRL_SSBD,		X86_FEATURE_SPEC_CTRL },
+	{ X86_FEATURE_LASS,			X86_FEATURE_SMAP      },
 	{}
 };
 
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 8c18327eb10b..0863733858dc 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -89,7 +89,6 @@ static int mtrr_state_set;
 u64 mtrr_tom2;
 
 struct mtrr_state_type mtrr_state;
-EXPORT_SYMBOL_GPL(mtrr_state);
 
 /* Reserved bits in the high portion of the MTRRphysBaseN MSR. */
 u32 phys_hi_rsvd;
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 5655f253d929..2de3bd2f95d1 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -46,10 +46,6 @@ struct set_mtrr_context {
 	u32		ccr3;
 };
 
-void set_mtrr_done(struct set_mtrr_context *ctxt);
-void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
-void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
-
 void fill_mtrr_var_range(unsigned int index,
 		u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
 bool get_mtrr_state(void);
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index d113863a8eab..cde4b6cd3471 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -43,7 +43,11 @@ static const struct cpuid_bit cpuid_bits[] = {
 	{ X86_FEATURE_PER_THREAD_MBA,		CPUID_ECX,  0, 0x00000010, 3 },
 	{ X86_FEATURE_SGX1,			CPUID_EAX,  0, 0x00000012, 0 },
 	{ X86_FEATURE_SGX2,			CPUID_EAX,  1, 0x00000012, 0 },
+	{ X86_FEATURE_SGX_EUPDATESVN,		CPUID_EAX, 10, 0x00000012, 0 },
 	{ X86_FEATURE_SGX_EDECCSSA,		CPUID_EAX, 11, 0x00000012, 0 },
+	{ X86_FEATURE_OVERFLOW_RECOV,		CPUID_EBX,  0, 0x80000007, 0 },
+	{ X86_FEATURE_SUCCOR,			CPUID_EBX,  1, 0x80000007, 0 },
+	{ X86_FEATURE_SMCA,			CPUID_EBX,  3, 0x80000007, 0 },
 	{ X86_FEATURE_HW_PSTATE,		CPUID_EDX,  7, 0x80000007, 0 },
 	{ X86_FEATURE_CPB,			CPUID_EDX,  9, 0x80000007, 0 },
 	{ X86_FEATURE_PROC_FEEDBACK,		CPUID_EDX, 11, 0x80000007, 0 },
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index 7f8d1e11dbee..79d6020dfe9c 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -14,7 +14,7 @@ u64 sgx_attributes_reserved_mask;
 u64 sgx_xfrm_reserved_mask = ~0x3;
 u32 sgx_misc_reserved_mask;
 
-static int sgx_open(struct inode *inode, struct file *file)
+static int __sgx_open(struct inode *inode, struct file *file)
 {
 	struct sgx_encl *encl;
 	int ret;
@@ -41,6 +41,23 @@ static int sgx_open(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static int sgx_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	ret = sgx_inc_usage_count();
+	if (ret)
+		return ret;
+
+	ret = __sgx_open(inode, file);
+	if (ret) {
+		sgx_dec_usage_count();
+		return ret;
+	}
+
+	return 0;
+}
+
 static int sgx_release(struct inode *inode, struct file *file)
 {
 	struct sgx_encl *encl = file->private_data;
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 308dbbae6c6e..cf149b9f4916 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -765,6 +765,7 @@ void sgx_encl_release(struct kref *ref)
 	WARN_ON_ONCE(encl->secs.epc_page);
 
 	kfree(encl);
+	sgx_dec_usage_count();
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
index 42a088a337c5..74be751199a4 100644
--- a/arch/x86/kernel/cpu/sgx/encls.h
+++ b/arch/x86/kernel/cpu/sgx/encls.h
@@ -233,4 +233,9 @@ static inline int __eaug(struct sgx_pageinfo *pginfo, void *addr)
 	return __encls_2(EAUG, pginfo, addr);
 }
 
+/* Attempt to update CPUSVN at runtime. */
+static inline int __eupdatesvn(void)
+{
+	return __encls_ret_1(EUPDATESVN, "");
+}
 #endif /* _X86_ENCLS_H */
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 2de01b379aa3..dc73194416ac 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -5,6 +5,7 @@
 #include <linux/freezer.h>
 #include <linux/highmem.h>
 #include <linux/kthread.h>
+#include <linux/kvm_types.h>
 #include <linux/miscdevice.h>
 #include <linux/node.h>
 #include <linux/pagemap.h>
@@ -16,6 +17,7 @@
 #include <linux/vmalloc.h>
 #include <asm/msr.h>
 #include <asm/sgx.h>
+#include <asm/archrandom.h>
 #include "driver.h"
 #include "encl.h"
 #include "encls.h"
@@ -915,7 +917,107 @@ int sgx_set_attribute(unsigned long *allowed_attributes,
 	*allowed_attributes |= SGX_ATTR_PROVISIONKEY;
 	return 0;
 }
-EXPORT_SYMBOL_GPL(sgx_set_attribute);
+EXPORT_SYMBOL_FOR_KVM(sgx_set_attribute);
+
+/* Counter to count the active SGX users */
+static int sgx_usage_count;
+
+/**
+ * sgx_update_svn() - Attempt to call ENCLS[EUPDATESVN].
+ *
+ * This instruction attempts to update CPUSVN to the
+ * currently loaded microcode update SVN and generate new
+ * cryptographic assets.
+ *
+ * Return:
+ * * %0:       - Success or not supported
+ * * %-EAGAIN: - Can be safely retried, failure is due to lack of
+ * *             entropy in RNG
+ * * %-EIO:    - Unexpected error, retries are not advisable
+ */
+static int sgx_update_svn(void)
+{
+	int ret;
+
+	/*
+	 * If EUPDATESVN is not available, it is ok to
+	 * silently skip it to comply with legacy behavior.
+	 */
+	if (!cpu_feature_enabled(X86_FEATURE_SGX_EUPDATESVN))
+		return 0;
+
+	/*
+	 * EPC is guaranteed to be empty when there are no users.
+	 * Ensure we are on our first user before proceeding further.
+	 */
+	WARN(sgx_usage_count, "Elevated usage count when calling EUPDATESVN\n");
+
+	for (int i = 0; i < RDRAND_RETRY_LOOPS; i++) {
+		ret = __eupdatesvn();
+
+		/* Stop on success or unexpected errors: */
+		if (ret != SGX_INSUFFICIENT_ENTROPY)
+			break;
+	}
+
+	switch (ret) {
+	case 0:
+		/*
+		 * SVN successfully updated.
+		 * Let users know when the update was successful.
+		 */
+		pr_info("SVN updated successfully\n");
+		return 0;
+	case SGX_NO_UPDATE:
+		/*
+		 * SVN update failed since the current SVN is
+		 * not newer than CPUSVN. This is the most
+		 * common case and indicates no harm.
+		 */
+		return 0;
+	case SGX_INSUFFICIENT_ENTROPY:
+		/*
+		 * SVN update failed due to lack of entropy in DRNG.
+		 * Indicate to userspace that it should retry.
+		 */
+		return -EAGAIN;
+	default:
+		break;
+	}
+
+	/*
+	 * EUPDATESVN was called when EPC is empty, all other error
+	 * codes are unexpected.
+	 */
+	ENCLS_WARN(ret, "EUPDATESVN");
+	return -EIO;
+}
+
+/* Mutex to ensure no concurrent EPC accesses during EUPDATESVN */
+static DEFINE_MUTEX(sgx_svn_lock);
+
+int sgx_inc_usage_count(void)
+{
+	int ret;
+
+	guard(mutex)(&sgx_svn_lock);
+
+	if (!sgx_usage_count) {
+		ret = sgx_update_svn();
+		if (ret)
+			return ret;
+	}
+
+	sgx_usage_count++;
+
+	return 0;
+}
+
+void sgx_dec_usage_count(void)
+{
+	guard(mutex)(&sgx_svn_lock);
+	sgx_usage_count--;
+}
 
 static int __init sgx_init(void)
 {
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index d2dad21259a8..f5940393d9bd 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -102,6 +102,9 @@ static inline int __init sgx_vepc_init(void)
 }
 #endif
 
+int sgx_inc_usage_count(void);
+void sgx_dec_usage_count(void);
+
 void sgx_update_lepubkeyhash(u64 *lepubkeyhash);
 
 #endif /* _X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
index 7aaa3652e31d..8de1f1a755f2 100644
--- a/arch/x86/kernel/cpu/sgx/virt.c
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -5,6 +5,7 @@
  * Copyright(c) 2021 Intel Corporation.
  */
 
+#include <linux/kvm_types.h>
 #include <linux/miscdevice.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
@@ -255,10 +256,11 @@ static int sgx_vepc_release(struct inode *inode, struct file *file)
 	xa_destroy(&vepc->page_array);
 	kfree(vepc);
 
+	sgx_dec_usage_count();
 	return 0;
 }
 
-static int sgx_vepc_open(struct inode *inode, struct file *file)
+static int __sgx_vepc_open(struct inode *inode, struct file *file)
 {
 	struct sgx_vepc *vepc;
 
@@ -273,6 +275,23 @@ static int sgx_vepc_open(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static int sgx_vepc_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	ret = sgx_inc_usage_count();
+	if (ret)
+		return ret;
+
+	ret =  __sgx_vepc_open(inode, file);
+	if (ret) {
+		sgx_dec_usage_count();
+		return ret;
+	}
+
+	return 0;
+}
+
 static long sgx_vepc_ioctl(struct file *file,
 			   unsigned int cmd, unsigned long arg)
 {
@@ -363,7 +382,7 @@ int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
 	WARN_ON_ONCE(ret);
 	return 0;
 }
-EXPORT_SYMBOL_GPL(sgx_virt_ecreate);
+EXPORT_SYMBOL_FOR_KVM(sgx_virt_ecreate);
 
 static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
 			    void __user *secs)
@@ -432,4 +451,4 @@ int sgx_virt_einit(void __user *sigstruct, void __user *token,
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(sgx_virt_einit);
+EXPORT_SYMBOL_FOR_KVM(sgx_virt_einit);
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
index 49782724a943..209b5a22d880 100644
--- a/arch/x86/kernel/cpu/tsx.c
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -19,7 +19,17 @@
 #undef pr_fmt
 #define pr_fmt(fmt) "tsx: " fmt
 
-enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED;
+enum tsx_ctrl_states {
+	TSX_CTRL_AUTO,
+	TSX_CTRL_ENABLE,
+	TSX_CTRL_DISABLE,
+	TSX_CTRL_RTM_ALWAYS_ABORT,
+	TSX_CTRL_NOT_SUPPORTED,
+};
+
+static enum tsx_ctrl_states tsx_ctrl_state __ro_after_init =
+	IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO) ? TSX_CTRL_AUTO   :
+	IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF) ? TSX_CTRL_DISABLE : TSX_CTRL_ENABLE;
 
 static void tsx_disable(void)
 {
@@ -156,11 +166,28 @@ static void tsx_dev_mode_disable(void)
 	}
 }
 
-void __init tsx_init(void)
+static int __init tsx_parse_cmdline(char *str)
 {
-	char arg[5] = {};
-	int ret;
+	if (!str)
+		return -EINVAL;
+
+	if (!strcmp(str, "on")) {
+		tsx_ctrl_state = TSX_CTRL_ENABLE;
+	} else if (!strcmp(str, "off")) {
+		tsx_ctrl_state = TSX_CTRL_DISABLE;
+	} else if (!strcmp(str, "auto")) {
+		tsx_ctrl_state = TSX_CTRL_AUTO;
+	} else {
+		tsx_ctrl_state = TSX_CTRL_DISABLE;
+		pr_err("invalid option, defaulting to off\n");
+	}
+
+	return 0;
+}
+early_param("tsx", tsx_parse_cmdline);
 
+void __init tsx_init(void)
+{
 	tsx_dev_mode_disable();
 
 	/*
@@ -194,27 +221,8 @@ void __init tsx_init(void)
 		return;
 	}
 
-	ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg));
-	if (ret >= 0) {
-		if (!strcmp(arg, "on")) {
-			tsx_ctrl_state = TSX_CTRL_ENABLE;
-		} else if (!strcmp(arg, "off")) {
-			tsx_ctrl_state = TSX_CTRL_DISABLE;
-		} else if (!strcmp(arg, "auto")) {
-			tsx_ctrl_state = x86_get_tsx_auto_mode();
-		} else {
-			tsx_ctrl_state = TSX_CTRL_DISABLE;
-			pr_err("invalid option, defaulting to off\n");
-		}
-	} else {
-		/* tsx= not provided */
-		if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO))
-			tsx_ctrl_state = x86_get_tsx_auto_mode();
-		else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF))
-			tsx_ctrl_state = TSX_CTRL_DISABLE;
-		else
-			tsx_ctrl_state = TSX_CTRL_ENABLE;
-	}
+	if (tsx_ctrl_state == TSX_CTRL_AUTO)
+		tsx_ctrl_state = x86_get_tsx_auto_mode();
 
 	if (tsx_ctrl_state == TSX_CTRL_DISABLE) {
 		tsx_disable();
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index c3acbd26408b..b15b97d3cb52 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -16,6 +16,7 @@
 #include <linux/firmware-map.h>
 #include <linux/sort.h>
 #include <linux/memory_hotplug.h>
+#include <linux/kvm_types.h>
 
 #include <asm/e820/api.h>
 #include <asm/setup.h>
@@ -95,7 +96,7 @@ bool e820__mapped_raw_any(u64 start, u64 end, enum e820_type type)
 {
 	return _e820__mapped_any(e820_table_firmware, start, end, type);
 }
-EXPORT_SYMBOL_GPL(e820__mapped_raw_any);
+EXPORT_SYMBOL_FOR_KVM(e820__mapped_raw_any);
 
 bool e820__mapped_any(u64 start, u64 end, enum e820_type type)
 {
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index e88eacb1b5bb..da233f20ae6f 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -18,6 +18,7 @@
 #include <uapi/asm/kvm.h>
 
 #include <linux/hardirq.h>
+#include <linux/kvm_types.h>
 #include <linux/pkeys.h>
 #include <linux/vmalloc.h>
 
@@ -276,7 +277,7 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
 
 	return true;
 }
-EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate);
+EXPORT_SYMBOL_FOR_KVM(fpu_alloc_guest_fpstate);
 
 void fpu_free_guest_fpstate(struct fpu_guest *gfpu)
 {
@@ -291,7 +292,7 @@ void fpu_free_guest_fpstate(struct fpu_guest *gfpu)
 	gfpu->fpstate = NULL;
 	vfree(fpstate);
 }
-EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate);
+EXPORT_SYMBOL_FOR_KVM(fpu_free_guest_fpstate);
 
 /*
   * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable
@@ -313,7 +314,7 @@ int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures)
 
 	return __xfd_enable_feature(xfeatures, guest_fpu);
 }
-EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features);
+EXPORT_SYMBOL_FOR_KVM(fpu_enable_guest_xfd_features);
 
 #ifdef CONFIG_X86_64
 void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd)
@@ -324,7 +325,7 @@ void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd)
 		xfd_update_state(guest_fpu->fpstate);
 	fpregs_unlock();
 }
-EXPORT_SYMBOL_GPL(fpu_update_guest_xfd);
+EXPORT_SYMBOL_FOR_KVM(fpu_update_guest_xfd);
 
 /**
  * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state
@@ -348,7 +349,7 @@ void fpu_sync_guest_vmexit_xfd_state(void)
 		__this_cpu_write(xfd_state, fpstate->xfd);
 	}
 }
-EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state);
+EXPORT_SYMBOL_FOR_KVM(fpu_sync_guest_vmexit_xfd_state);
 #endif /* CONFIG_X86_64 */
 
 int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest)
@@ -390,7 +391,7 @@ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest)
 	fpregs_unlock();
 	return 0;
 }
-EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate);
+EXPORT_SYMBOL_FOR_KVM(fpu_swap_kvm_fpstate);
 
 void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf,
 				    unsigned int size, u64 xfeatures, u32 pkru)
@@ -409,7 +410,7 @@ void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf,
 		ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE;
 	}
 }
-EXPORT_SYMBOL_GPL(fpu_copy_guest_fpstate_to_uabi);
+EXPORT_SYMBOL_FOR_KVM(fpu_copy_guest_fpstate_to_uabi);
 
 int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
 				   u64 xcr0, u32 *vpkru)
@@ -439,7 +440,7 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
 
 	return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru);
 }
-EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate);
+EXPORT_SYMBOL_FOR_KVM(fpu_copy_uabi_to_guest_fpstate);
 #endif /* CONFIG_KVM */
 
 void kernel_fpu_begin_mask(unsigned int kfpu_mask)
@@ -857,7 +858,7 @@ void switch_fpu_return(void)
 
 	fpregs_restore_userregs();
 }
-EXPORT_SYMBOL_GPL(switch_fpu_return);
+EXPORT_SYMBOL_FOR_KVM(switch_fpu_return);
 
 void fpregs_lock_and_load(void)
 {
@@ -892,7 +893,7 @@ void fpregs_assert_state_consistent(void)
 
 	WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id()));
 }
-EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent);
+EXPORT_SYMBOL_FOR_KVM(fpregs_assert_state_consistent);
 #endif
 
 void fpregs_mark_activate(void)
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 28e4fd65c9da..48113c5193aa 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -8,6 +8,7 @@
 #include <linux/compat.h>
 #include <linux/cpu.h>
 #include <linux/mman.h>
+#include <linux/kvm_types.h>
 #include <linux/nospec.h>
 #include <linux/pkeys.h>
 #include <linux/seq_file.h>
@@ -1058,7 +1059,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
 
 	return __raw_xsave_addr(xsave, xfeature_nr);
 }
-EXPORT_SYMBOL_GPL(get_xsave_addr);
+EXPORT_SYMBOL_FOR_KVM(get_xsave_addr);
 
 /*
  * Given an xstate feature nr, calculate where in the xsave buffer the state is.
@@ -1482,7 +1483,7 @@ void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeatu
 	if (addr)
 		memset(addr, 0, xstate_sizes[xfeature]);
 }
-EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
+EXPORT_SYMBOL_FOR_KVM(fpstate_clear_xstate_component);
 #endif
 
 #ifdef CONFIG_X86_64
@@ -1818,7 +1819,7 @@ u64 xstate_get_guest_group_perm(void)
 {
 	return xstate_get_group_perm(true);
 }
-EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
+EXPORT_SYMBOL_FOR_KVM(xstate_get_guest_group_perm);
 
 /**
  * fpu_xstate_prctl - xstate permission operations
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index b01644c949b2..f846c15f21ca 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -24,6 +24,7 @@
 #include <linux/percpu.h>
 #include <linux/kdebug.h>
 #include <linux/kernel.h>
+#include <linux/kvm_types.h>
 #include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/smp.h>
@@ -489,7 +490,7 @@ void hw_breakpoint_restore(void)
 	set_debugreg(DR6_RESERVED, 6);
 	set_debugreg(__this_cpu_read(cpu_dr7), 7);
 }
-EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
+EXPORT_SYMBOL_FOR_KVM(hw_breakpoint_restore);
 
 /*
  * Handle debug exception notifications.
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 10721a125226..86f4e574de02 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -12,6 +12,7 @@
 #include <linux/delay.h>
 #include <linux/export.h>
 #include <linux/irq.h>
+#include <linux/kvm_types.h>
 
 #include <asm/irq_stack.h>
 #include <asm/apic.h>
@@ -361,7 +362,7 @@ void kvm_set_posted_intr_wakeup_handler(void (*handler)(void))
 		synchronize_rcu();
 	}
 }
-EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler);
+EXPORT_SYMBOL_FOR_KVM(kvm_set_posted_intr_wakeup_handler);
 
 /*
  * Handler for POSTED_INTERRUPT_VECTOR.
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index b67d7c59dca0..204765004c72 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -29,6 +29,7 @@
 #include <linux/syscore_ops.h>
 #include <linux/cc_platform.h>
 #include <linux/efi.h>
+#include <linux/kvm_types.h>
 #include <asm/timer.h>
 #include <asm/cpu.h>
 #include <asm/traps.h>
@@ -162,7 +163,7 @@ void kvm_async_pf_task_wait_schedule(u32 token)
 	}
 	finish_swait(&n.wq, &wait);
 }
-EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule);
+EXPORT_SYMBOL_FOR_KVM(kvm_async_pf_task_wait_schedule);
 
 static void apf_task_wake_one(struct kvm_task_sleep_node *n)
 {
@@ -253,7 +254,7 @@ noinstr u32 kvm_read_and_reset_apf_flags(void)
 
 	return flags;
 }
-EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags);
+EXPORT_SYMBOL_FOR_KVM(kvm_read_and_reset_apf_flags);
 
 noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
 {
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index e17c16c54a37..4469c784eaa0 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -98,7 +98,7 @@ static int filter_write(u32 reg)
 	if (!__ratelimit(&fw_rs))
 		return 0;
 
-	pr_warn("Write to unrecognized MSR 0x%x by %s (pid: %d).\n",
+	pr_warn("Write to unrecognized MSR 0x%x by %s (pid: %d), tainting CPU_OUT_OF_SPEC.\n",
 	        reg, current->comm, current->pid);
 	pr_warn("See https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/about for details.\n");
 
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index be93ec7255bf..3d239ed12744 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -24,6 +24,7 @@
 #include <linux/export.h>
 #include <linux/atomic.h>
 #include <linux/sched/clock.h>
+#include <linux/kvm_types.h>
 
 #include <asm/cpu_entry_area.h>
 #include <asm/traps.h>
@@ -613,9 +614,7 @@ DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx)
 {
 	exc_nmi(regs);
 }
-#if IS_MODULE(CONFIG_KVM_INTEL)
-EXPORT_SYMBOL_GPL(asm_exc_nmi_kvm_vmx);
-#endif
+EXPORT_SYMBOL_FOR_KVM(asm_exc_nmi_kvm_vmx);
 #endif
 
 #ifdef CONFIG_NMI_CHECK_CPU
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 52a5c03c353c..432c0a004c60 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -30,6 +30,7 @@
 #include <linux/interrupt.h>
 #include <linux/delay.h>
 #include <linux/export.h>
+#include <linux/kvm_types.h>
 #include <linux/ptrace.h>
 #include <linux/notifier.h>
 #include <linux/kprobes.h>
@@ -303,9 +304,7 @@ void current_save_fsgs(void)
 	save_fsgs(current);
 	local_irq_restore(flags);
 }
-#if IS_ENABLED(CONFIG_KVM)
-EXPORT_SYMBOL_GPL(current_save_fsgs);
-#endif
+EXPORT_SYMBOL_FOR_KVM(current_save_fsgs);
 
 static __always_inline void loadseg(enum which_selector which,
 				    unsigned short sel)
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 964f6b0a3d68..6032fa9ec753 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -13,6 +13,7 @@
 #include <linux/objtool.h>
 #include <linux/pgtable.h>
 #include <linux/kexec.h>
+#include <linux/kvm_types.h>
 #include <acpi/reboot.h>
 #include <asm/io.h>
 #include <asm/apic.h>
@@ -541,7 +542,7 @@ void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback)
 
 	rcu_assign_pointer(cpu_emergency_virt_callback, callback);
 }
-EXPORT_SYMBOL_GPL(cpu_emergency_register_virt_callback);
+EXPORT_SYMBOL_FOR_KVM(cpu_emergency_register_virt_callback);
 
 void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback)
 {
@@ -551,7 +552,7 @@ void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback)
 	rcu_assign_pointer(cpu_emergency_virt_callback, NULL);
 	synchronize_rcu();
 }
-EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback);
+EXPORT_SYMBOL_FOR_KVM(cpu_emergency_unregister_virt_callback);
 
 /*
  * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 11e20bb13aca..4ffba68dc57b 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -95,9 +95,12 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
 	/* Leave CR4 in %r13 to enable the right paging mode later. */
 	movq	%cr4, %r13
 
-	/* Disable global pages immediately to ensure this mapping is RWX */
+	/*
+	 * Disable global pages immediately to ensure this mapping is RWX.
+	 * Disable LASS before jumping to the identity mapped page.
+	 */
 	movq	%r13, %r12
-	andq	$~(X86_CR4_PGE), %r12
+	andq	$~(X86_CR4_PGE | X86_CR4_LASS), %r12
 	movq	%r12, %cr4
 
 	/* Save %rsp and CRs. */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index cb324cc1fd99..bcf1dedc1d00 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -732,13 +732,23 @@ DEFINE_IDTENTRY(exc_bounds)
 enum kernel_gp_hint {
 	GP_NO_HINT,
 	GP_NON_CANONICAL,
-	GP_CANONICAL
+	GP_CANONICAL,
+	GP_LASS_VIOLATION,
+	GP_NULL_POINTER,
+};
+
+static const char * const kernel_gp_hint_help[] = {
+	[GP_NON_CANONICAL]	= "probably for non-canonical address",
+	[GP_CANONICAL]		= "maybe for address",
+	[GP_LASS_VIOLATION]	= "probably LASS violation for address",
+	[GP_NULL_POINTER]	= "kernel NULL pointer dereference",
 };
 
 /*
  * When an uncaught #GP occurs, try to determine the memory address accessed by
  * the instruction and return that address to the caller. Also, try to figure
- * out whether any part of the access to that address was non-canonical.
+ * out whether any part of the access to that address was non-canonical or
+ * across privilege levels.
  */
 static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
 						 unsigned long *addr)
@@ -760,14 +770,28 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
 		return GP_NO_HINT;
 
 #ifdef CONFIG_X86_64
+	/* Operand is in the kernel half */
+	if (*addr >= ~__VIRTUAL_MASK)
+		return GP_CANONICAL;
+
+	/* The last byte of the operand is not in the user canonical half */
+	if (*addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK)
+		return GP_NON_CANONICAL;
+
 	/*
-	 * Check that:
-	 *  - the operand is not in the kernel half
-	 *  - the last byte of the operand is not in the user canonical half
+	 * A NULL pointer dereference usually causes a #PF. However, it
+	 * can result in a #GP when LASS is active. Provide the same
+	 * hint in the rare case that the condition is hit without LASS.
 	 */
-	if (*addr < ~__VIRTUAL_MASK &&
-	    *addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK)
-		return GP_NON_CANONICAL;
+	if (*addr < PAGE_SIZE)
+		return GP_NULL_POINTER;
+
+	/*
+	 * Assume that LASS caused the exception, because the address is
+	 * canonical and in the user half.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_LASS))
+		return GP_LASS_VIOLATION;
 #endif
 
 	return GP_CANONICAL;
@@ -930,9 +954,7 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
 
 	if (hint != GP_NO_HINT)
 		snprintf(desc, sizeof(desc), GPFSTR ", %s 0x%lx",
-			 (hint == GP_NON_CANONICAL) ? "probably for non-canonical address"
-						    : "maybe for address",
-			 gp_addr);
+			 kernel_gp_hint_help[hint], gp_addr);
 
 	/*
 	 * KASAN is interested only in the non-canonical case, clear it
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 87e749106dda..7d3e13e14eab 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -11,6 +11,7 @@
 #include <linux/cpufreq.h>
 #include <linux/delay.h>
 #include <linux/clocksource.h>
+#include <linux/kvm_types.h>
 #include <linux/percpu.h>
 #include <linux/timex.h>
 #include <linux/static_key.h>
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
index 743ab25ba787..81b4a7acf72e 100644
--- a/arch/x86/kvm/reverse_cpuid.h
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -78,7 +78,6 @@ static const struct cpuid_reg reverse_cpuid[] = {
 	[CPUID_6_EAX]         = {         6, 0, CPUID_EAX},
 	[CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
 	[CPUID_7_ECX]         = {         7, 0, CPUID_ECX},
-	[CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
 	[CPUID_7_EDX]         = {         7, 0, CPUID_EDX},
 	[CPUID_7_1_EAX]       = {         7, 1, CPUID_EAX},
 	[CPUID_12_EAX]        = {0x00000012, 0, CPUID_EAX},
diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c
index c5c60d07308c..824664c0ecbd 100644
--- a/arch/x86/lib/cache-smp.c
+++ b/arch/x86/lib/cache-smp.c
@@ -2,6 +2,7 @@
 #include <asm/paravirt.h>
 #include <linux/smp.h>
 #include <linux/export.h>
+#include <linux/kvm_types.h>
 
 static void __wbinvd(void *dummy)
 {
@@ -12,7 +13,7 @@ void wbinvd_on_cpu(int cpu)
 {
 	smp_call_function_single(cpu, __wbinvd, NULL, 1);
 }
-EXPORT_SYMBOL(wbinvd_on_cpu);
+EXPORT_SYMBOL_FOR_KVM(wbinvd_on_cpu);
 
 void wbinvd_on_all_cpus(void)
 {
@@ -24,7 +25,7 @@ void wbinvd_on_cpus_mask(struct cpumask *cpus)
 {
 	on_each_cpu_mask(cpus, __wbinvd, NULL, 1);
 }
-EXPORT_SYMBOL_GPL(wbinvd_on_cpus_mask);
+EXPORT_SYMBOL_FOR_KVM(wbinvd_on_cpus_mask);
 
 static void __wbnoinvd(void *dummy)
 {
@@ -35,10 +36,10 @@ void wbnoinvd_on_all_cpus(void)
 {
 	on_each_cpu(__wbnoinvd, NULL, 1);
 }
-EXPORT_SYMBOL_GPL(wbnoinvd_on_all_cpus);
+EXPORT_SYMBOL_FOR_KVM(wbnoinvd_on_all_cpus);
 
 void wbnoinvd_on_cpus_mask(struct cpumask *cpus)
 {
 	on_each_cpu_mask(cpus, __wbnoinvd, NULL, 1);
 }
-EXPORT_SYMBOL_GPL(wbnoinvd_on_cpus_mask);
+EXPORT_SYMBOL_FOR_KVM(wbnoinvd_on_cpus_mask);
diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c
index b5893928d55c..8c7cd115b484 100644
--- a/arch/x86/lib/kaslr.c
+++ b/arch/x86/lib/kaslr.c
@@ -22,7 +22,7 @@
 #include <asm/setup.h>
 
 #define debug_putstr(v) early_printk("%s", v)
-#define has_cpuflag(f) boot_cpu_has(f)
+#define has_cpuflag(f) cpu_feature_enabled(f)
 #define get_boot_seed() kaslr_offset()
 #endif
 
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 4ef7c6dcbea6..dfdd1da89f36 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/export.h>
+#include <linux/kvm_types.h>
 #include <linux/percpu.h>
 #include <linux/preempt.h>
 #include <asm/msr.h>
@@ -103,7 +104,7 @@ int msr_set_bit(u32 msr, u8 bit)
 {
 	return __flip_bit(msr, bit, true);
 }
-EXPORT_SYMBOL_GPL(msr_set_bit);
+EXPORT_SYMBOL_FOR_KVM(msr_set_bit);
 
 /**
  * msr_clear_bit - Clear @bit in a MSR @msr.
@@ -119,7 +120,7 @@ int msr_clear_bit(u32 msr, u8 bit)
 {
 	return __flip_bit(msr, bit, false);
 }
-EXPORT_SYMBOL_GPL(msr_clear_bit);
+EXPORT_SYMBOL_FOR_KVM(msr_clear_bit);
 
 #ifdef CONFIG_TRACEPOINTS
 void do_trace_write_msr(u32 msr, u64 val, int failed)
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index a4700ef6eb64..2afa7a23340e 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -486,7 +486,6 @@ void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
 #endif
 	ptdump_walk_pgd_level_core(m, mm, pgd, false, false);
 }
-EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
 
 void ptdump_walk_user_pgd_level_checkwx(void)
 {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 0e4270e20fad..1044aafd5d94 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -504,9 +504,6 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
 			continue;
 		}
 
-		if (0)
-			pr_info("   pte=%p addr=%lx pte=%016lx\n", pte, paddr,
-				pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte);
 		pages++;
 		set_pte_init(pte, pfn_pte(paddr >> PAGE_SHIFT, prot), init);
 		paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE;
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index b68200a0e0c6..8a3d9722f602 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -42,6 +42,7 @@
 #include <linux/highmem.h>
 #include <linux/fs.h>
 #include <linux/rbtree.h>
+#include <linux/kvm_types.h>
 
 #include <asm/cpu_device_id.h>
 #include <asm/cacheflush.h>
@@ -697,7 +698,7 @@ bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn)
 	       cm == _PAGE_CACHE_MODE_UC_MINUS ||
 	       cm == _PAGE_CACHE_MODE_WC;
 }
-EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr);
+EXPORT_SYMBOL_FOR_KVM(pat_pfn_immune_to_uc_mtrr);
 
 /**
  * memtype_reserve_io - Request a memory type mapping for a region of memory
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
index fc3f3d3e2ef2..8d31c6b9e184 100644
--- a/arch/x86/mm/physaddr.c
+++ b/arch/x86/mm/physaddr.c
@@ -31,17 +31,6 @@ unsigned long __phys_addr(unsigned long x)
 	return x;
 }
 EXPORT_SYMBOL(__phys_addr);
-
-unsigned long __phys_addr_symbol(unsigned long x)
-{
-	unsigned long y = x - __START_KERNEL_map;
-
-	/* only check upper bounds since lower bounds will trigger carry */
-	VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
-
-	return y + phys_base;
-}
-EXPORT_SYMBOL(__phys_addr_symbol);
 #endif
 
 bool __virt_addr_valid(unsigned long x)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5d221709353e..f5b93e01e347 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -12,6 +12,7 @@
 #include <linux/task_work.h>
 #include <linux/mmu_notifier.h>
 #include <linux/mmu_context.h>
+#include <linux/kvm_types.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -1582,7 +1583,7 @@ unsigned long __get_current_cr3_fast(void)
 	VM_BUG_ON(cr3 != __read_cr3());
 	return cr3;
 }
-EXPORT_SYMBOL_GPL(__get_current_cr3_fast);
+EXPORT_SYMBOL_FOR_KVM(__get_current_cr3_fast);
 
 /*
  * Flush one page in the kernel mapping
@@ -1723,7 +1724,7 @@ void __flush_tlb_all(void)
 		flush_tlb_local();
 	}
 }
-EXPORT_SYMBOL_GPL(__flush_tlb_all);
+EXPORT_SYMBOL_FOR_KVM(__flush_tlb_all);
 
 void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 {
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index eac403248462..5ce4ebe99774 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -29,6 +29,7 @@
 #include <linux/acpi.h>
 #include <linux/suspend.h>
 #include <linux/idr.h>
+#include <linux/kvm_types.h>
 #include <asm/page.h>
 #include <asm/special_insns.h>
 #include <asm/msr-index.h>
@@ -181,7 +182,7 @@ int tdx_cpu_enable(void)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(tdx_cpu_enable);
+EXPORT_SYMBOL_FOR_KVM(tdx_cpu_enable);
 
 /*
  * Add a memory region as a TDX memory block.  The caller must make sure
@@ -662,7 +663,7 @@ void tdx_quirk_reset_page(struct page *page)
 {
 	tdx_quirk_reset_paddr(page_to_phys(page), PAGE_SIZE);
 }
-EXPORT_SYMBOL_GPL(tdx_quirk_reset_page);
+EXPORT_SYMBOL_FOR_KVM(tdx_quirk_reset_page);
 
 static void tdmr_quirk_reset_pamt(struct tdmr_info *tdmr)
 {
@@ -1216,7 +1217,7 @@ int tdx_enable(void)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tdx_enable);
+EXPORT_SYMBOL_FOR_KVM(tdx_enable);
 
 static bool is_pamt_page(unsigned long phys)
 {
@@ -1477,13 +1478,13 @@ const struct tdx_sys_info *tdx_get_sysinfo(void)
 
 	return p;
 }
-EXPORT_SYMBOL_GPL(tdx_get_sysinfo);
+EXPORT_SYMBOL_FOR_KVM(tdx_get_sysinfo);
 
 u32 tdx_get_nr_guest_keyids(void)
 {
 	return tdx_nr_guest_keyids;
 }
-EXPORT_SYMBOL_GPL(tdx_get_nr_guest_keyids);
+EXPORT_SYMBOL_FOR_KVM(tdx_get_nr_guest_keyids);
 
 int tdx_guest_keyid_alloc(void)
 {
@@ -1491,13 +1492,13 @@ int tdx_guest_keyid_alloc(void)
 			       tdx_guest_keyid_start + tdx_nr_guest_keyids - 1,
 			       GFP_KERNEL);
 }
-EXPORT_SYMBOL_GPL(tdx_guest_keyid_alloc);
+EXPORT_SYMBOL_FOR_KVM(tdx_guest_keyid_alloc);
 
 void tdx_guest_keyid_free(unsigned int keyid)
 {
 	ida_free(&tdx_guest_keyid_pool, keyid);
 }
-EXPORT_SYMBOL_GPL(tdx_guest_keyid_free);
+EXPORT_SYMBOL_FOR_KVM(tdx_guest_keyid_free);
 
 static inline u64 tdx_tdr_pa(struct tdx_td *td)
 {
@@ -1521,7 +1522,7 @@ noinstr u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args)
 
 	return __seamcall_dirty_cache(__seamcall_saved_ret, TDH_VP_ENTER, args);
 }
-EXPORT_SYMBOL_GPL(tdh_vp_enter);
+EXPORT_SYMBOL_FOR_KVM(tdh_vp_enter);
 
 u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page)
 {
@@ -1533,7 +1534,7 @@ u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page)
 	tdx_clflush_page(tdcs_page);
 	return seamcall(TDH_MNG_ADDCX, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_mng_addcx);
+EXPORT_SYMBOL_FOR_KVM(tdh_mng_addcx);
 
 u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2)
 {
@@ -1553,7 +1554,7 @@ u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tdh_mem_page_add);
+EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_add);
 
 u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
 {
@@ -1572,7 +1573,7 @@ u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tdh_mem_sept_add);
+EXPORT_SYMBOL_FOR_KVM(tdh_mem_sept_add);
 
 u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page)
 {
@@ -1584,7 +1585,7 @@ u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page)
 	tdx_clflush_page(tdcx_page);
 	return seamcall(TDH_VP_ADDCX, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_vp_addcx);
+EXPORT_SYMBOL_FOR_KVM(tdh_vp_addcx);
 
 u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
 {
@@ -1603,7 +1604,7 @@ u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tdh_mem_page_aug);
+EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_aug);
 
 u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2)
 {
@@ -1620,7 +1621,7 @@ u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u6
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tdh_mem_range_block);
+EXPORT_SYMBOL_FOR_KVM(tdh_mem_range_block);
 
 u64 tdh_mng_key_config(struct tdx_td *td)
 {
@@ -1630,7 +1631,7 @@ u64 tdh_mng_key_config(struct tdx_td *td)
 
 	return seamcall(TDH_MNG_KEY_CONFIG, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_mng_key_config);
+EXPORT_SYMBOL_FOR_KVM(tdh_mng_key_config);
 
 u64 tdh_mng_create(struct tdx_td *td, u16 hkid)
 {
@@ -1642,7 +1643,7 @@ u64 tdh_mng_create(struct tdx_td *td, u16 hkid)
 	tdx_clflush_page(td->tdr_page);
 	return seamcall(TDH_MNG_CREATE, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_mng_create);
+EXPORT_SYMBOL_FOR_KVM(tdh_mng_create);
 
 u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp)
 {
@@ -1654,7 +1655,7 @@ u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp)
 	tdx_clflush_page(vp->tdvpr_page);
 	return seamcall(TDH_VP_CREATE, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_vp_create);
+EXPORT_SYMBOL_FOR_KVM(tdh_vp_create);
 
 u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data)
 {
@@ -1671,7 +1672,7 @@ u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tdh_mng_rd);
+EXPORT_SYMBOL_FOR_KVM(tdh_mng_rd);
 
 u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2)
 {
@@ -1688,7 +1689,7 @@ u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tdh_mr_extend);
+EXPORT_SYMBOL_FOR_KVM(tdh_mr_extend);
 
 u64 tdh_mr_finalize(struct tdx_td *td)
 {
@@ -1698,7 +1699,7 @@ u64 tdh_mr_finalize(struct tdx_td *td)
 
 	return seamcall(TDH_MR_FINALIZE, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_mr_finalize);
+EXPORT_SYMBOL_FOR_KVM(tdh_mr_finalize);
 
 u64 tdh_vp_flush(struct tdx_vp *vp)
 {
@@ -1708,7 +1709,7 @@ u64 tdh_vp_flush(struct tdx_vp *vp)
 
 	return seamcall(TDH_VP_FLUSH, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_vp_flush);
+EXPORT_SYMBOL_FOR_KVM(tdh_vp_flush);
 
 u64 tdh_mng_vpflushdone(struct tdx_td *td)
 {
@@ -1718,7 +1719,7 @@ u64 tdh_mng_vpflushdone(struct tdx_td *td)
 
 	return seamcall(TDH_MNG_VPFLUSHDONE, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_mng_vpflushdone);
+EXPORT_SYMBOL_FOR_KVM(tdh_mng_vpflushdone);
 
 u64 tdh_mng_key_freeid(struct tdx_td *td)
 {
@@ -1728,7 +1729,7 @@ u64 tdh_mng_key_freeid(struct tdx_td *td)
 
 	return seamcall(TDH_MNG_KEY_FREEID, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_mng_key_freeid);
+EXPORT_SYMBOL_FOR_KVM(tdh_mng_key_freeid);
 
 u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err)
 {
@@ -1744,7 +1745,7 @@ u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tdh_mng_init);
+EXPORT_SYMBOL_FOR_KVM(tdh_mng_init);
 
 u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data)
 {
@@ -1761,7 +1762,7 @@ u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tdh_vp_rd);
+EXPORT_SYMBOL_FOR_KVM(tdh_vp_rd);
 
 u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask)
 {
@@ -1774,7 +1775,7 @@ u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask)
 
 	return seamcall(TDH_VP_WR, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_vp_wr);
+EXPORT_SYMBOL_FOR_KVM(tdh_vp_wr);
 
 u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid)
 {
@@ -1787,7 +1788,7 @@ u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid)
 	/* apicid requires version == 1. */
 	return seamcall(TDH_VP_INIT | (1ULL << TDX_VERSION_SHIFT), &args);
 }
-EXPORT_SYMBOL_GPL(tdh_vp_init);
+EXPORT_SYMBOL_FOR_KVM(tdh_vp_init);
 
 /*
  * TDX ABI defines output operands as PT, OWNER and SIZE. These are TDX defined fomats.
@@ -1809,7 +1810,7 @@ u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tdh_phymem_page_reclaim);
+EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_reclaim);
 
 u64 tdh_mem_track(struct tdx_td *td)
 {
@@ -1819,7 +1820,7 @@ u64 tdh_mem_track(struct tdx_td *td)
 
 	return seamcall(TDH_MEM_TRACK, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_mem_track);
+EXPORT_SYMBOL_FOR_KVM(tdh_mem_track);
 
 u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2)
 {
@@ -1836,7 +1837,7 @@ u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u6
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(tdh_mem_page_remove);
+EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_remove);
 
 u64 tdh_phymem_cache_wb(bool resume)
 {
@@ -1846,7 +1847,7 @@ u64 tdh_phymem_cache_wb(bool resume)
 
 	return seamcall(TDH_PHYMEM_CACHE_WB, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_phymem_cache_wb);
+EXPORT_SYMBOL_FOR_KVM(tdh_phymem_cache_wb);
 
 u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td)
 {
@@ -1856,7 +1857,7 @@ u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td)
 
 	return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_tdr);
+EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_tdr);
 
 u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page)
 {
@@ -1866,7 +1867,7 @@ u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page)
 
 	return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
 }
-EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid);
+EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_hkid);
 
 #ifdef CONFIG_KEXEC_CORE
 void tdx_cpu_flush_cache_for_kexec(void)
@@ -1884,5 +1885,5 @@ void tdx_cpu_flush_cache_for_kexec(void)
 	wbinvd();
 	this_cpu_write(cache_state_incoherent, false);
 }
-EXPORT_SYMBOL_GPL(tdx_cpu_flush_cache_for_kexec);
+EXPORT_SYMBOL_FOR_KVM(tdx_cpu_flush_cache_for_kexec);
 #endif