Diffstat (limited to 'arch/x86')
153 files changed, 5246 insertions, 2374 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fa3b616af03a..34fb46d5341b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -261,6 +261,7 @@ config X86 select HAVE_FUNCTION_ERROR_INJECTION select HAVE_KRETPROBES select HAVE_RETHOOK + select HAVE_KLP_BUILD if X86_64 select HAVE_LIVEPATCH if X86_64 select HAVE_MIXED_BREAKPOINTS_REGS select HAVE_MOD_ARCH_SPECIFIC @@ -297,6 +298,7 @@ config X86 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UACCESS_VALIDATION if HAVE_OBJTOOL select HAVE_UNSTABLE_SCHED_CLOCK + select HAVE_UNWIND_USER_FP if X86_64 select HAVE_USER_RETURN_NOTIFIER select HAVE_GENERIC_VDSO select VDSO_GETRANDOM if X86_64 @@ -379,7 +381,7 @@ config GENERIC_CSUM config GENERIC_BUG def_bool y depends on BUG - select GENERIC_BUG_RELATIVE_POINTERS if X86_64 + select GENERIC_BUG_RELATIVE_POINTERS config GENERIC_BUG_RELATIVE_POINTERS bool diff --git a/arch/x86/Kconfig.cpufeatures b/arch/x86/Kconfig.cpufeatures index 250c10627ab3..733d5aff2456 100644 --- a/arch/x86/Kconfig.cpufeatures +++ b/arch/x86/Kconfig.cpufeatures @@ -124,6 +124,10 @@ config X86_DISABLED_FEATURE_PCID def_bool y depends on !X86_64 +config X86_DISABLED_FEATURE_LASS + def_bool y + depends on X86_32 + config X86_DISABLED_FEATURE_PKU def_bool y depends on !X86_INTEL_MEMORY_PROTECTION_KEYS diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 4db7e4bf69f5..1d403a3612ea 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -48,7 +48,8 @@ endif # How to compile the 16-bit code. Note we always compile for -march=i386; # that way we can complain to the user if the CPU is insufficient. -REALMODE_CFLAGS := -std=gnu11 -m16 -g -Os -DDISABLE_BRANCH_PROFILING -D__DISABLE_EXPORTS \ +REALMODE_CFLAGS := -std=gnu11 -fms-extensions -m16 -g -Os \ + -DDISABLE_BRANCH_PROFILING -D__DISABLE_EXPORTS \ -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \ -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ -mno-mmx -mno-sse $(call cc-option,-fcf-protection=none) @@ -60,6 +61,7 @@ REALMODE_CFLAGS += $(cc_stack_align4) REALMODE_CFLAGS += $(CLANG_FLAGS) ifdef CONFIG_CC_IS_CLANG REALMODE_CFLAGS += -Wno-gnu +REALMODE_CFLAGS += -Wno-microsoft-anon-tag endif export REALMODE_CFLAGS @@ -75,7 +77,7 @@ export BITS # # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 # -KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx +KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-sse4a KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 @@ -98,7 +100,7 @@ ifeq ($(CONFIG_X86_KERNEL_IBT),y) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104816 # KBUILD_CFLAGS += $(call cc-option,-fcf-protection=branch -fno-jump-tables) -KBUILD_RUSTFLAGS += -Zcf-protection=branch -Zno-jump-tables +KBUILD_RUSTFLAGS += -Zcf-protection=branch $(if $(call rustc-min-version,109300),-Cjump-tables=n,-Zno-jump-tables) else KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none) endif diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 74657589264d..68f9d7a1683b 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -25,7 +25,7 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ # avoid errors with '-march=i386', and future flags may depend on the target to # be valid. 
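The -fms-extensions additions here (and the matching -Wno-microsoft-anon-tag suppression for clang) enable anonymous struct/union members declared by tag. As a rough illustration only, and not taken from this patch, the construct looks like the following; the type and function names are hypothetical:

	#include <stdint.h>

	struct hdr {
		uint32_t type;
		uint32_t len;
	};

	struct msg {
		struct hdr;		/* anonymous member by tag: needs -fms-extensions */
		uint8_t payload[32];
	};

	static inline uint32_t msg_len(const struct msg *m)
	{
		return m->len;		/* hdr's members are reachable directly */
	}

gcc accepts this with -fms-extensions; clang additionally warns under -Wmicrosoft-anon-tag, which is why that warning is now suppressed for CONFIG_CC_IS_CLANG builds.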
KBUILD_CFLAGS := -m$(BITS) -O2 $(CLANG_FLAGS) -KBUILD_CFLAGS += -std=gnu11 +KBUILD_CFLAGS += -std=gnu11 -fms-extensions KBUILD_CFLAGS += -fno-strict-aliasing -fPIE KBUILD_CFLAGS += -Wundef KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING @@ -36,7 +36,10 @@ KBUILD_CFLAGS += -mno-mmx -mno-sse KBUILD_CFLAGS += -ffreestanding -fshort-wchar KBUILD_CFLAGS += -fno-stack-protector KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) -KBUILD_CFLAGS += $(call cc-disable-warning, gnu) +ifdef CONFIG_CC_IS_CLANG +KBUILD_CFLAGS += -Wno-gnu +KBUILD_CFLAGS += -Wno-microsoft-anon-tag +endif KBUILD_CFLAGS += -Wno-pointer-sign KBUILD_CFLAGS += -fno-asynchronous-unwind-tables KBUILD_CFLAGS += -D__DISABLE_EXPORTS diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index db1048621ea2..fd855e32c9b9 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -152,17 +152,6 @@ bool insn_has_rep_prefix(struct insn *insn); void sev_insn_decode_init(void); bool early_setup_ghcb(void); #else -static inline void sev_enable(struct boot_params *bp) -{ - /* - * bp->cc_blob_address should only be set by boot/compressed kernel. - * Initialize it to 0 unconditionally (thus here in this stub too) to - * ensure that uninitialized values from buggy bootloaders aren't - * propagated. - */ - if (bp) - bp->cc_blob_address = 0; -} static inline void snp_check_features(void) { } static inline void sev_es_shutdown_ghcb(void) { } static inline bool sev_es_check_ghcb_fault(unsigned long address) diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index bdd26050dff7..0e89e197e112 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -3,6 +3,7 @@ #include <asm/bootparam.h> #include <asm/bootparam_utils.h> #include <asm/e820/types.h> +#include <asm/pgtable.h> #include <asm/processor.h> #include "../string.h" #include "efi.h" @@ -168,9 +169,10 @@ asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable) * For 4- to 5-level paging transition, set up current CR3 as * the first and the only entry in a new top-level page table. */ - *trampoline_32bit = __native_read_cr3() | _PAGE_TABLE_NOENC; + *trampoline_32bit = native_read_cr3_pa() | _PAGE_TABLE_NOENC; } else { - unsigned long src; + u64 *new_cr3; + pgd_t *pgdp; /* * For 5- to 4-level paging transition, copy page table pointed @@ -180,8 +182,9 @@ asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable) * We cannot just point to the page table from trampoline as it * may be above 4G. 
*/ - src = *(unsigned long *)__native_read_cr3() & PAGE_MASK; - memcpy(trampoline_32bit, (void *)src, PAGE_SIZE); + pgdp = (pgd_t *)native_read_cr3_pa(); + new_cr3 = (u64 *)(native_pgd_val(pgdp[0]) & PTE_PFN_MASK); + memcpy(trampoline_32bit, new_cr3, PAGE_SIZE); } toggle_la57(trampoline_32bit); diff --git a/arch/x86/boot/compressed/sev-handle-vc.c b/arch/x86/boot/compressed/sev-handle-vc.c index 7530ad8b768b..030001b46554 100644 --- a/arch/x86/boot/compressed/sev-handle-vc.c +++ b/arch/x86/boot/compressed/sev-handle-vc.c @@ -29,11 +29,10 @@ bool insn_has_rep_prefix(struct insn *insn) { insn_byte_t p; - int i; insn_get_prefixes(insn); - for_each_insn_prefix(insn, i, p) { + for_each_insn_prefix(insn, p) { if (p == 0xf2 || p == 0xf3) return true; } diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 6e5c32a53d03..c8c1464b3a56 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -14,6 +14,7 @@ #include <asm/bootparam.h> #include <asm/pgtable_types.h> +#include <asm/shared/msr.h> #include <asm/sev.h> #include <asm/trapnr.h> #include <asm/trap_pf.h> @@ -397,7 +398,7 @@ void sev_enable(struct boot_params *bp) } /* Set the SME mask if this is an SEV guest. */ - boot_rdmsr(MSR_AMD64_SEV, &m); + raw_rdmsr(MSR_AMD64_SEV, &m); sev_status = m.q; if (!(sev_status & MSR_AMD64_SEV_ENABLED)) return; @@ -446,7 +447,7 @@ u64 sev_get_status(void) if (sev_check_cpu_support() < 0) return 0; - boot_rdmsr(MSR_AMD64_SEV, &m); + raw_rdmsr(MSR_AMD64_SEV, &m); return m.q; } @@ -496,7 +497,7 @@ bool early_is_sevsnp_guest(void) struct msr m; /* Obtain the address of the calling area to use */ - boot_rdmsr(MSR_SVSM_CAA, &m); + raw_rdmsr(MSR_SVSM_CAA, &m); boot_svsm_caa_pa = m.q; /* diff --git a/arch/x86/boot/compressed/sev.h b/arch/x86/boot/compressed/sev.h index 92f79c21939c..22637b416b46 100644 --- a/arch/x86/boot/compressed/sev.h +++ b/arch/x86/boot/compressed/sev.h @@ -10,7 +10,7 @@ #ifdef CONFIG_AMD_MEM_ENCRYPT -#include "../msr.h" +#include <asm/shared/msr.h> void snp_accept_memory(phys_addr_t start, phys_addr_t end); u64 sev_get_status(void); @@ -20,7 +20,7 @@ static inline u64 sev_es_rd_ghcb_msr(void) { struct msr m; - boot_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m); + raw_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m); return m.q; } @@ -30,7 +30,7 @@ static inline void sev_es_wr_ghcb_msr(u64 val) struct msr m; m.q = val; - boot_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m); + raw_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m); } #else diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index f82de8de5dc6..2e1bb936cba2 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -26,9 +26,9 @@ #include <asm/intel-family.h> #include <asm/processor-flags.h> #include <asm/msr-index.h> +#include <asm/shared/msr.h> #include "string.h" -#include "msr.h" static u32 err_flags[NCAPINTS]; @@ -134,9 +134,9 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr) struct msr m; - boot_rdmsr(MSR_K7_HWCR, &m); + raw_rdmsr(MSR_K7_HWCR, &m); m.l &= ~(1 << 15); - boot_wrmsr(MSR_K7_HWCR, &m); + raw_wrmsr(MSR_K7_HWCR, &m); get_cpuflags(); /* Make sure it really did something */ err = check_cpuflags(); @@ -148,9 +148,9 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr) struct msr m; - boot_rdmsr(MSR_VIA_FCR, &m); + raw_rdmsr(MSR_VIA_FCR, &m); m.l |= (1 << 1) | (1 << 7); - boot_wrmsr(MSR_VIA_FCR, &m); + raw_wrmsr(MSR_VIA_FCR, &m); set_bit(X86_FEATURE_CX8, cpu.flags); err = check_cpuflags(); @@ -160,14 +160,14 @@ int check_cpu(int *cpu_level_ptr, int 
*req_level_ptr, u32 **err_flags_ptr) struct msr m, m_tmp; u32 level = 1; - boot_rdmsr(0x80860004, &m); + raw_rdmsr(0x80860004, &m); m_tmp = m; m_tmp.l = ~0; - boot_wrmsr(0x80860004, &m_tmp); + raw_wrmsr(0x80860004, &m_tmp); asm("cpuid" : "+a" (level), "=d" (cpu.flags[0]) : : "ecx", "ebx"); - boot_wrmsr(0x80860004, &m); + raw_wrmsr(0x80860004, &m); err = check_cpuflags(); } else if (err == 0x01 && diff --git a/arch/x86/boot/msr.h b/arch/x86/boot/msr.h deleted file mode 100644 index aed66f7ae199..000000000000 --- a/arch/x86/boot/msr.h +++ /dev/null @@ -1,26 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Helpers/definitions related to MSR access. - */ - -#ifndef BOOT_MSR_H -#define BOOT_MSR_H - -#include <asm/shared/msr.h> - -/* - * The kernel proper already defines rdmsr()/wrmsr(), but they are not for the - * boot kernel since they rely on tracepoint/exception handling infrastructure - * that's not available here. - */ -static inline void boot_rdmsr(unsigned int reg, struct msr *m) -{ - asm volatile("rdmsr" : "=a" (m->l), "=d" (m->h) : "c" (reg)); -} - -static inline void boot_wrmsr(unsigned int reg, const struct msr *m) -{ - asm volatile("wrmsr" : : "c" (reg), "a"(m->l), "d" (m->h) : "memory"); -} - -#endif /* BOOT_MSR_H */ diff --git a/arch/x86/boot/startup/Makefile b/arch/x86/boot/startup/Makefile index e8fdf020b422..5e499cfb29b5 100644 --- a/arch/x86/boot/startup/Makefile +++ b/arch/x86/boot/startup/Makefile @@ -36,7 +36,7 @@ $(patsubst %.o,$(obj)/%.o,$(lib-y)): OBJECT_FILES_NON_STANDARD := y # relocations, even if other objtool actions are being deferred. # $(pi-objs): objtool-enabled = 1 -$(pi-objs): objtool-args = $(if $(delay-objtool),,$(objtool-args-y)) --noabs +$(pi-objs): objtool-args = $(if $(delay-objtool),--dry-run,$(objtool-args-y)) --noabs # # Confine the startup code by prefixing all symbols with __pi_ (for position diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index 4e22ffd73516..a0fa8bb2b945 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -12,7 +12,7 @@ #include <asm/setup_data.h> #ifndef __BOOT_COMPRESSED -#define has_cpuflag(f) boot_cpu_has(f) +#define has_cpuflag(f) cpu_feature_enabled(f) #else #undef WARN #define WARN(condition, format...) (!!(condition)) diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c index 7fc136a35334..f08c7505ed82 100644 --- a/arch/x86/coco/sev/vc-handle.c +++ b/arch/x86/coco/sev/vc-handle.c @@ -352,7 +352,6 @@ fault: #define sev_printk(fmt, ...) 
printk(fmt, ##__VA_ARGS__) #define error(v) -#define has_cpuflag(f) boot_cpu_has(f) #include "vc-shared.c" diff --git a/arch/x86/coco/sev/vc-shared.c b/arch/x86/coco/sev/vc-shared.c index 9b01c9ad81be..58b2f985d546 100644 --- a/arch/x86/coco/sev/vc-shared.c +++ b/arch/x86/coco/sev/vc-shared.c @@ -1,5 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 +#ifndef __BOOT_COMPRESSED +#define has_cpuflag(f) cpu_feature_enabled(f) +#endif + static enum es_result vc_check_opcode_bytes(struct es_em_ctxt *ctxt, unsigned long exit_code) { @@ -546,6 +550,13 @@ static enum es_result vc_handle_cpuid(struct ghcb *ghcb, /* xgetbv will cause #GP - use reset value for xcr0 */ ghcb_set_xcr0(ghcb, 1); + if (has_cpuflag(X86_FEATURE_SHSTK) && regs->ax == 0xd && regs->cx == 1) { + struct msr m; + + raw_rdmsr(MSR_IA32_XSS, &m); + ghcb_set_xss(ghcb, m.q); + } + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); if (ret != ES_OK) return ret; diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index 48d3076b6053..3fd2423d3cf8 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -353,16 +353,6 @@ config CRYPTO_NHPOLY1305_AVX2 Architecture: x86_64 using: - AVX2 (Advanced Vector Extensions 2) -config CRYPTO_POLYVAL_CLMUL_NI - tristate "Hash functions: POLYVAL (CLMUL-NI)" - depends on 64BIT - select CRYPTO_POLYVAL - help - POLYVAL hash function for HCTR2 - - Architecture: x86_64 using: - - CLMUL-NI (carry-less multiplication new instructions) - config CRYPTO_SM3_AVX_X86_64 tristate "Hash functions: SM3 (AVX)" depends on 64BIT diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 2d30d5d36145..5f2fb4f148fe 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -46,15 +46,13 @@ obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o aesni-intel-$(CONFIG_64BIT) += aes-ctr-avx-x86_64.o \ aes-gcm-aesni-x86_64.o \ - aes-xts-avx-x86_64.o \ - aes-gcm-avx10-x86_64.o + aes-gcm-vaes-avx2.o \ + aes-gcm-vaes-avx512.o \ + aes-xts-avx-x86_64.o obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o -obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o -polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o - obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o diff --git a/arch/x86/crypto/aes-gcm-aesni-x86_64.S b/arch/x86/crypto/aes-gcm-aesni-x86_64.S index 45940e2883a0..7c8a8a32bd3c 100644 --- a/arch/x86/crypto/aes-gcm-aesni-x86_64.S +++ b/arch/x86/crypto/aes-gcm-aesni-x86_64.S @@ -61,15 +61,15 @@ // for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems // there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.) // -// The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is +// The design generally follows that of aes-gcm-vaes-avx512.S, and that file is // more thoroughly commented. This file has the following notable changes: // // - The vector length is fixed at 128-bit, i.e. xmm registers. This means // there is only one AES block (and GHASH block) per register. // -// - Without AVX512 / AVX10, only 16 SIMD registers are available instead of -// 32. We work around this by being much more careful about using -// registers, relying heavily on loads to load values as they are needed. 
+// - Without AVX512, only 16 SIMD registers are available instead of 32. We +// work around this by being much more careful about using registers, +// relying heavily on loads to load values as they are needed. // // - Masking is not available either. We work around this by implementing // partial block loads and stores using overlapping scalar loads and stores @@ -90,8 +90,8 @@ // multiplication instead of schoolbook multiplication. This saves one // pclmulqdq instruction per block, at the cost of one 64-bit load, one // pshufd, and 0.25 pxors per block. (This is without the three-argument -// XOR support that would be provided by AVX512 / AVX10, which would be -// more beneficial to schoolbook than Karatsuba.) +// XOR support that would be provided by AVX512, which would be more +// beneficial to schoolbook than Karatsuba.) // // As a rough approximation, we can assume that Karatsuba multiplication is // faster than schoolbook multiplication in this context if one pshufd and diff --git a/arch/x86/crypto/aes-gcm-vaes-avx2.S b/arch/x86/crypto/aes-gcm-vaes-avx2.S new file mode 100644 index 000000000000..93c9504a488f --- /dev/null +++ b/arch/x86/crypto/aes-gcm-vaes-avx2.S @@ -0,0 +1,1146 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// AES-GCM implementation for x86_64 CPUs that support the following CPU +// features: VAES && VPCLMULQDQ && AVX2 +// +// Copyright 2025 Google LLC +// +// Author: Eric Biggers <ebiggers@google.com> +// +//------------------------------------------------------------------------------ +// +// This file is dual-licensed, meaning that you can use it under your choice of +// either of the following two licenses: +// +// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// or +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
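The Karatsuba-versus-schoolbook trade-off discussed in the surrounding comments can be written out in plain C. The sketch below is illustrative only and not part of the patch; clmul64() is a hypothetical bit-at-a-time stand-in for one pclmulqdq. The point is that the middle term of a 128x128-bit carry-less product costs only one extra multiplication once the XOR of each operand's 64-bit halves is available, which is what this file caches in h_powers_xored for the key powers:

	#include <stdint.h>

	/* Portable stand-in for pclmulqdq: carry-less 64x64 -> 128-bit multiply. */
	static unsigned __int128 clmul64(uint64_t a, uint64_t b)
	{
		unsigned __int128 acc = 0;

		for (int i = 0; i < 64; i++)
			if ((b >> i) & 1)
				acc ^= (unsigned __int128)a << i;
		return acc;
	}

	/* Schoolbook: four carry-less multiplies per 128x128-bit product. */
	static void clmul128_schoolbook(const uint64_t a[2], const uint64_t b[2],
					unsigned __int128 *lo, unsigned __int128 *mi,
					unsigned __int128 *hi)
	{
		*lo = clmul64(a[0], b[0]);
		*hi = clmul64(a[1], b[1]);
		*mi = clmul64(a[0], b[1]) ^ clmul64(a[1], b[0]);
	}

	/* Karatsuba: three multiplies, given a_xored = a[0] ^ a[1] precomputed. */
	static void clmul128_karatsuba(const uint64_t a[2], uint64_t a_xored,
				       const uint64_t b[2],
				       unsigned __int128 *lo, unsigned __int128 *mi,
				       unsigned __int128 *hi)
	{
		*lo = clmul64(a[0], b[0]);
		*hi = clmul64(a[1], b[1]);
		*mi = clmul64(a_xored, b[0] ^ b[1]) ^ *lo ^ *hi;
	}

Both variants leave the same unreduced (lo, mi, hi) triple to be folded modulo the GHASH polynomial afterwards.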
+// +// ----------------------------------------------------------------------------- +// +// This is similar to aes-gcm-vaes-avx512.S, but it uses AVX2 instead of AVX512. +// This means it can only use 16 vector registers instead of 32, the maximum +// vector length is 32 bytes, and some instructions such as vpternlogd and +// masked loads/stores are unavailable. However, it is able to run on CPUs that +// have VAES without AVX512, namely AMD Zen 3 (including "Milan" server CPUs), +// various Intel client CPUs such as Alder Lake, and Intel Sierra Forest. +// +// This implementation also uses Karatsuba multiplication instead of schoolbook +// multiplication for GHASH in its main loop. This does not help much on Intel, +// but it improves performance by ~5% on AMD Zen 3. Other factors weighing +// slightly in favor of Karatsuba multiplication in this implementation are the +// lower maximum vector length (which means there are fewer key powers, so we +// can cache the halves of each key power XOR'd together and still use less +// memory than the AVX512 implementation), and the unavailability of the +// vpternlogd instruction (which helped schoolbook a bit more than Karatsuba). + +#include <linux/linkage.h> + +.section .rodata +.p2align 4 + + // The below three 16-byte values must be in the order that they are, as + // they are really two 32-byte tables and a 16-byte value that overlap: + // + // - The first 32-byte table begins at .Lselect_high_bytes_table. + // For 0 <= len <= 16, the 16-byte value at + // '.Lselect_high_bytes_table + len' selects the high 'len' bytes of + // another 16-byte value when AND'ed with it. + // + // - The second 32-byte table begins at .Lrshift_and_bswap_table. + // For 0 <= len <= 16, the 16-byte value at + // '.Lrshift_and_bswap_table + len' is a vpshufb mask that does the + // following operation: right-shift by '16 - len' bytes (shifting in + // zeroes), then reflect all 16 bytes. + // + // - The 16-byte value at .Lbswap_mask is a vpshufb mask that reflects + // all 16 bytes. +.Lselect_high_bytes_table: + .octa 0 +.Lrshift_and_bswap_table: + .octa 0xffffffffffffffffffffffffffffffff +.Lbswap_mask: + .octa 0x000102030405060708090a0b0c0d0e0f + + // Sixteen 0x0f bytes. By XOR'ing an entry of .Lrshift_and_bswap_table + // with this, we get a mask that left-shifts by '16 - len' bytes. +.Lfifteens: + .octa 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f + + // This is the GHASH reducing polynomial without its constant term, i.e. + // x^128 + x^7 + x^2 + x, represented using the backwards mapping + // between bits and polynomial coefficients. + // + // Alternatively, it can be interpreted as the naturally-ordered + // representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the + // "reversed" GHASH reducing polynomial without its x^128 term. +.Lgfpoly: + .octa 0xc2000000000000000000000000000001 + + // Same as above, but with the (1 << 64) bit set. +.Lgfpoly_and_internal_carrybit: + .octa 0xc2000000000000010000000000000001 + + // Values needed to prepare the initial vector of counter blocks. +.Lctr_pattern: + .octa 0 + .octa 1 + + // The number of AES blocks per vector, as a 128-bit value. 
+.Linc_2blocks: + .octa 2 + +// Offsets in struct aes_gcm_key_vaes_avx2 +#define OFFSETOF_AESKEYLEN 480 +#define OFFSETOF_H_POWERS 512 +#define NUM_H_POWERS 8 +#define OFFSETOFEND_H_POWERS (OFFSETOF_H_POWERS + (NUM_H_POWERS * 16)) +#define OFFSETOF_H_POWERS_XORED OFFSETOFEND_H_POWERS + +.text + +// Do one step of GHASH-multiplying the 128-bit lanes of \a by the 128-bit lanes +// of \b and storing the reduced products in \dst. Uses schoolbook +// multiplication. +.macro _ghash_mul_step i, a, b, dst, gfpoly, t0, t1, t2 +.if \i == 0 + vpclmulqdq $0x00, \a, \b, \t0 // LO = a_L * b_L + vpclmulqdq $0x01, \a, \b, \t1 // MI_0 = a_L * b_H +.elseif \i == 1 + vpclmulqdq $0x10, \a, \b, \t2 // MI_1 = a_H * b_L +.elseif \i == 2 + vpxor \t2, \t1, \t1 // MI = MI_0 + MI_1 +.elseif \i == 3 + vpclmulqdq $0x01, \t0, \gfpoly, \t2 // LO_L*(x^63 + x^62 + x^57) +.elseif \i == 4 + vpshufd $0x4e, \t0, \t0 // Swap halves of LO +.elseif \i == 5 + vpxor \t0, \t1, \t1 // Fold LO into MI (part 1) + vpxor \t2, \t1, \t1 // Fold LO into MI (part 2) +.elseif \i == 6 + vpclmulqdq $0x11, \a, \b, \dst // HI = a_H * b_H +.elseif \i == 7 + vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) +.elseif \i == 8 + vpshufd $0x4e, \t1, \t1 // Swap halves of MI +.elseif \i == 9 + vpxor \t1, \dst, \dst // Fold MI into HI (part 1) + vpxor \t0, \dst, \dst // Fold MI into HI (part 2) +.endif +.endm + +// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store +// the reduced products in \dst. See _ghash_mul_step for full explanation. +.macro _ghash_mul a, b, dst, gfpoly, t0, t1, t2 +.irp i, 0,1,2,3,4,5,6,7,8,9 + _ghash_mul_step \i, \a, \b, \dst, \gfpoly, \t0, \t1, \t2 +.endr +.endm + +// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the +// *unreduced* products to \lo, \mi, and \hi. +.macro _ghash_mul_noreduce a, b, lo, mi, hi, t0 + vpclmulqdq $0x00, \a, \b, \t0 // a_L * b_L + vpxor \t0, \lo, \lo + vpclmulqdq $0x01, \a, \b, \t0 // a_L * b_H + vpxor \t0, \mi, \mi + vpclmulqdq $0x10, \a, \b, \t0 // a_H * b_L + vpxor \t0, \mi, \mi + vpclmulqdq $0x11, \a, \b, \t0 // a_H * b_H + vpxor \t0, \hi, \hi +.endm + +// Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit +// reduced products in \hi. See _ghash_mul_step for explanation of reduction. +.macro _ghash_reduce lo, mi, hi, gfpoly, t0 + vpclmulqdq $0x01, \lo, \gfpoly, \t0 + vpshufd $0x4e, \lo, \lo + vpxor \lo, \mi, \mi + vpxor \t0, \mi, \mi + vpclmulqdq $0x01, \mi, \gfpoly, \t0 + vpshufd $0x4e, \mi, \mi + vpxor \mi, \hi, \hi + vpxor \t0, \hi, \hi +.endm + +// This is a specialized version of _ghash_mul that computes \a * \a, i.e. it +// squares \a. It skips computing MI = (a_L * a_H) + (a_H * a_L) = 0. +.macro _ghash_square a, dst, gfpoly, t0, t1 + vpclmulqdq $0x00, \a, \a, \t0 // LO = a_L * a_L + vpclmulqdq $0x11, \a, \a, \dst // HI = a_H * a_H + vpclmulqdq $0x01, \t0, \gfpoly, \t1 // LO_L*(x^63 + x^62 + x^57) + vpshufd $0x4e, \t0, \t0 // Swap halves of LO + vpxor \t0, \t1, \t1 // Fold LO into MI + vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) + vpshufd $0x4e, \t1, \t1 // Swap halves of MI + vpxor \t1, \dst, \dst // Fold MI into HI (part 1) + vpxor \t0, \dst, \dst // Fold MI into HI (part 2) +.endm + +// void aes_gcm_precompute_vaes_avx2(struct aes_gcm_key_vaes_avx2 *key); +// +// Given the expanded AES key |key->base.aes_key|, derive the GHASH subkey and +// initialize |key->h_powers| and |key->h_powers_xored|. 
+// +// We use h_powers[0..7] to store H^8 through H^1, and h_powers_xored[0..7] to +// store the 64-bit halves of the key powers XOR'd together (for Karatsuba +// multiplication) in the order 8,6,7,5,4,2,3,1. +SYM_FUNC_START(aes_gcm_precompute_vaes_avx2) + + // Function arguments + .set KEY, %rdi + + // Additional local variables + .set POWERS_PTR, %rsi + .set RNDKEYLAST_PTR, %rdx + .set TMP0, %ymm0 + .set TMP0_XMM, %xmm0 + .set TMP1, %ymm1 + .set TMP1_XMM, %xmm1 + .set TMP2, %ymm2 + .set TMP2_XMM, %xmm2 + .set H_CUR, %ymm3 + .set H_CUR_XMM, %xmm3 + .set H_CUR2, %ymm4 + .set H_INC, %ymm5 + .set H_INC_XMM, %xmm5 + .set GFPOLY, %ymm6 + .set GFPOLY_XMM, %xmm6 + + // Encrypt an all-zeroes block to get the raw hash subkey. + movl OFFSETOF_AESKEYLEN(KEY), %eax + lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR + vmovdqu (KEY), H_CUR_XMM // Zero-th round key XOR all-zeroes block + lea 16(KEY), %rax +1: + vaesenc (%rax), H_CUR_XMM, H_CUR_XMM + add $16, %rax + cmp %rax, RNDKEYLAST_PTR + jne 1b + vaesenclast (RNDKEYLAST_PTR), H_CUR_XMM, H_CUR_XMM + + // Reflect the bytes of the raw hash subkey. + vpshufb .Lbswap_mask(%rip), H_CUR_XMM, H_CUR_XMM + + // Finish preprocessing the byte-reflected hash subkey by multiplying it + // by x^-1 ("standard" interpretation of polynomial coefficients) or + // equivalently x^1 (natural interpretation). This gets the key into a + // format that avoids having to bit-reflect the data blocks later. + vpshufd $0xd3, H_CUR_XMM, TMP0_XMM + vpsrad $31, TMP0_XMM, TMP0_XMM + vpaddq H_CUR_XMM, H_CUR_XMM, H_CUR_XMM + vpand .Lgfpoly_and_internal_carrybit(%rip), TMP0_XMM, TMP0_XMM + vpxor TMP0_XMM, H_CUR_XMM, H_CUR_XMM + + // Load the gfpoly constant. + vbroadcasti128 .Lgfpoly(%rip), GFPOLY + + // Square H^1 to get H^2. + _ghash_square H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, TMP0_XMM, TMP1_XMM + + // Create H_CUR = [H^2, H^1] and H_INC = [H^2, H^2]. + vinserti128 $1, H_CUR_XMM, H_INC, H_CUR + vinserti128 $1, H_INC_XMM, H_INC, H_INC + + // Compute H_CUR2 = [H^4, H^3]. + _ghash_mul H_INC, H_CUR, H_CUR2, GFPOLY, TMP0, TMP1, TMP2 + + // Store [H^2, H^1] and [H^4, H^3]. + vmovdqu H_CUR, OFFSETOF_H_POWERS+3*32(KEY) + vmovdqu H_CUR2, OFFSETOF_H_POWERS+2*32(KEY) + + // For Karatsuba multiplication: compute and store the two 64-bit halves + // of each key power XOR'd together. Order is 4,2,3,1. + vpunpcklqdq H_CUR, H_CUR2, TMP0 + vpunpckhqdq H_CUR, H_CUR2, TMP1 + vpxor TMP1, TMP0, TMP0 + vmovdqu TMP0, OFFSETOF_H_POWERS_XORED+32(KEY) + + // Compute and store H_CUR = [H^6, H^5] and H_CUR2 = [H^8, H^7]. + _ghash_mul H_INC, H_CUR2, H_CUR, GFPOLY, TMP0, TMP1, TMP2 + _ghash_mul H_INC, H_CUR, H_CUR2, GFPOLY, TMP0, TMP1, TMP2 + vmovdqu H_CUR, OFFSETOF_H_POWERS+1*32(KEY) + vmovdqu H_CUR2, OFFSETOF_H_POWERS+0*32(KEY) + + // Again, compute and store the two 64-bit halves of each key power + // XOR'd together. Order is 8,6,7,5. + vpunpcklqdq H_CUR, H_CUR2, TMP0 + vpunpckhqdq H_CUR, H_CUR2, TMP1 + vpxor TMP1, TMP0, TMP0 + vmovdqu TMP0, OFFSETOF_H_POWERS_XORED(KEY) + + vzeroupper + RET +SYM_FUNC_END(aes_gcm_precompute_vaes_avx2) + +// Do one step of the GHASH update of four vectors of data blocks. +// \i: the step to do, 0 through 9 +// \ghashdata_ptr: pointer to the data blocks (ciphertext or AAD) +// KEY: pointer to struct aes_gcm_key_vaes_avx2 +// BSWAP_MASK: mask for reflecting the bytes of blocks +// H_POW[2-1]_XORED: cached values from KEY->h_powers_xored +// TMP[0-2]: temporary registers. TMP[1-2] must be preserved across steps. 
+// LO, MI: working state for this macro that must be preserved across steps +// GHASH_ACC: the GHASH accumulator (input/output) +.macro _ghash_step_4x i, ghashdata_ptr + .set HI, GHASH_ACC # alias + .set HI_XMM, GHASH_ACC_XMM +.if \i == 0 + // First vector + vmovdqu 0*32(\ghashdata_ptr), TMP1 + vpshufb BSWAP_MASK, TMP1, TMP1 + vmovdqu OFFSETOF_H_POWERS+0*32(KEY), TMP2 + vpxor GHASH_ACC, TMP1, TMP1 + vpclmulqdq $0x00, TMP2, TMP1, LO + vpclmulqdq $0x11, TMP2, TMP1, HI + vpunpckhqdq TMP1, TMP1, TMP0 + vpxor TMP1, TMP0, TMP0 + vpclmulqdq $0x00, H_POW2_XORED, TMP0, MI +.elseif \i == 1 +.elseif \i == 2 + // Second vector + vmovdqu 1*32(\ghashdata_ptr), TMP1 + vpshufb BSWAP_MASK, TMP1, TMP1 + vmovdqu OFFSETOF_H_POWERS+1*32(KEY), TMP2 + vpclmulqdq $0x00, TMP2, TMP1, TMP0 + vpxor TMP0, LO, LO + vpclmulqdq $0x11, TMP2, TMP1, TMP0 + vpxor TMP0, HI, HI + vpunpckhqdq TMP1, TMP1, TMP0 + vpxor TMP1, TMP0, TMP0 + vpclmulqdq $0x10, H_POW2_XORED, TMP0, TMP0 + vpxor TMP0, MI, MI +.elseif \i == 3 + // Third vector + vmovdqu 2*32(\ghashdata_ptr), TMP1 + vpshufb BSWAP_MASK, TMP1, TMP1 + vmovdqu OFFSETOF_H_POWERS+2*32(KEY), TMP2 +.elseif \i == 4 + vpclmulqdq $0x00, TMP2, TMP1, TMP0 + vpxor TMP0, LO, LO + vpclmulqdq $0x11, TMP2, TMP1, TMP0 + vpxor TMP0, HI, HI +.elseif \i == 5 + vpunpckhqdq TMP1, TMP1, TMP0 + vpxor TMP1, TMP0, TMP0 + vpclmulqdq $0x00, H_POW1_XORED, TMP0, TMP0 + vpxor TMP0, MI, MI + + // Fourth vector + vmovdqu 3*32(\ghashdata_ptr), TMP1 + vpshufb BSWAP_MASK, TMP1, TMP1 +.elseif \i == 6 + vmovdqu OFFSETOF_H_POWERS+3*32(KEY), TMP2 + vpclmulqdq $0x00, TMP2, TMP1, TMP0 + vpxor TMP0, LO, LO + vpclmulqdq $0x11, TMP2, TMP1, TMP0 + vpxor TMP0, HI, HI + vpunpckhqdq TMP1, TMP1, TMP0 + vpxor TMP1, TMP0, TMP0 + vpclmulqdq $0x10, H_POW1_XORED, TMP0, TMP0 + vpxor TMP0, MI, MI +.elseif \i == 7 + // Finalize 'mi' following Karatsuba multiplication. + vpxor LO, MI, MI + vpxor HI, MI, MI + + // Fold lo into mi. + vbroadcasti128 .Lgfpoly(%rip), TMP2 + vpclmulqdq $0x01, LO, TMP2, TMP0 + vpshufd $0x4e, LO, LO + vpxor LO, MI, MI + vpxor TMP0, MI, MI +.elseif \i == 8 + // Fold mi into hi. + vpclmulqdq $0x01, MI, TMP2, TMP0 + vpshufd $0x4e, MI, MI + vpxor MI, HI, HI + vpxor TMP0, HI, HI +.elseif \i == 9 + vextracti128 $1, HI, TMP0_XMM + vpxor TMP0_XMM, HI_XMM, GHASH_ACC_XMM +.endif +.endm + +// Update GHASH with four vectors of data blocks. See _ghash_step_4x for full +// explanation. +.macro _ghash_4x ghashdata_ptr +.irp i, 0,1,2,3,4,5,6,7,8,9 + _ghash_step_4x \i, \ghashdata_ptr +.endr +.endm + +// Load 1 <= %ecx <= 16 bytes from the pointer \src into the xmm register \dst +// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}. +.macro _load_partial_block src, dst, tmp64, tmp32 + sub $8, %ecx // LEN - 8 + jle .Lle8\@ + + // Load 9 <= LEN <= 16 bytes. + vmovq (\src), \dst // Load first 8 bytes + mov (\src, %rcx), %rax // Load last 8 bytes + neg %ecx + shl $3, %ecx + shr %cl, %rax // Discard overlapping bytes + vpinsrq $1, %rax, \dst, \dst + jmp .Ldone\@ + +.Lle8\@: + add $4, %ecx // LEN - 4 + jl .Llt4\@ + + // Load 4 <= LEN <= 8 bytes. + mov (\src), %eax // Load first 4 bytes + mov (\src, %rcx), \tmp32 // Load last 4 bytes + jmp .Lcombine\@ + +.Llt4\@: + // Load 1 <= LEN <= 3 bytes. 
+ add $2, %ecx // LEN - 2 + movzbl (\src), %eax // Load first byte + jl .Lmovq\@ + movzwl (\src, %rcx), \tmp32 // Load last 2 bytes +.Lcombine\@: + shl $3, %ecx + shl %cl, \tmp64 + or \tmp64, %rax // Combine the two parts +.Lmovq\@: + vmovq %rax, \dst +.Ldone\@: +.endm + +// Store 1 <= %ecx <= 16 bytes from the xmm register \src to the pointer \dst. +// Clobbers %rax, %rcx, and \tmp{64,32}. +.macro _store_partial_block src, dst, tmp64, tmp32 + sub $8, %ecx // LEN - 8 + jl .Llt8\@ + + // Store 8 <= LEN <= 16 bytes. + vpextrq $1, \src, %rax + mov %ecx, \tmp32 + shl $3, %ecx + ror %cl, %rax + mov %rax, (\dst, \tmp64) // Store last LEN - 8 bytes + vmovq \src, (\dst) // Store first 8 bytes + jmp .Ldone\@ + +.Llt8\@: + add $4, %ecx // LEN - 4 + jl .Llt4\@ + + // Store 4 <= LEN <= 7 bytes. + vpextrd $1, \src, %eax + mov %ecx, \tmp32 + shl $3, %ecx + ror %cl, %eax + mov %eax, (\dst, \tmp64) // Store last LEN - 4 bytes + vmovd \src, (\dst) // Store first 4 bytes + jmp .Ldone\@ + +.Llt4\@: + // Store 1 <= LEN <= 3 bytes. + vpextrb $0, \src, 0(\dst) + cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2? + jl .Ldone\@ + vpextrb $1, \src, 1(\dst) + je .Ldone\@ + vpextrb $2, \src, 2(\dst) +.Ldone\@: +.endm + +// void aes_gcm_aad_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, +// u8 ghash_acc[16], +// const u8 *aad, int aadlen); +// +// This function processes the AAD (Additional Authenticated Data) in GCM. +// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the +// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all +// zeroes. |aadlen| must be a multiple of 16, except on the last call where it +// can be any length. The caller must do any buffering needed to ensure this. +// +// This handles large amounts of AAD efficiently, while also keeping overhead +// low for small amounts which is the common case. TLS and IPsec use less than +// one block of AAD, but (uncommonly) other use cases may use much more. +SYM_FUNC_START(aes_gcm_aad_update_vaes_avx2) + + // Function arguments + .set KEY, %rdi + .set GHASH_ACC_PTR, %rsi + .set AAD, %rdx + .set AADLEN, %ecx // Must be %ecx for _load_partial_block + .set AADLEN64, %rcx // Zero-extend AADLEN before using! + + // Additional local variables. + // %rax and %r8 are used as temporary registers. + .set TMP0, %ymm0 + .set TMP0_XMM, %xmm0 + .set TMP1, %ymm1 + .set TMP1_XMM, %xmm1 + .set TMP2, %ymm2 + .set TMP2_XMM, %xmm2 + .set LO, %ymm3 + .set LO_XMM, %xmm3 + .set MI, %ymm4 + .set MI_XMM, %xmm4 + .set GHASH_ACC, %ymm5 + .set GHASH_ACC_XMM, %xmm5 + .set BSWAP_MASK, %ymm6 + .set BSWAP_MASK_XMM, %xmm6 + .set GFPOLY, %ymm7 + .set GFPOLY_XMM, %xmm7 + .set H_POW2_XORED, %ymm8 + .set H_POW1_XORED, %ymm9 + + // Load the bswap_mask and gfpoly constants. Since AADLEN is usually + // small, usually only 128-bit vectors will be used. So as an + // optimization, don't broadcast these constants to both 128-bit lanes + // quite yet. + vmovdqu .Lbswap_mask(%rip), BSWAP_MASK_XMM + vmovdqu .Lgfpoly(%rip), GFPOLY_XMM + + // Load the GHASH accumulator. + vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM + + // Check for the common case of AADLEN <= 16, as well as AADLEN == 0. + test AADLEN, AADLEN + jz .Laad_done + cmp $16, AADLEN + jle .Laad_lastblock + + // AADLEN > 16, so we'll operate on full vectors. Broadcast bswap_mask + // and gfpoly to both 128-bit lanes. + vinserti128 $1, BSWAP_MASK_XMM, BSWAP_MASK, BSWAP_MASK + vinserti128 $1, GFPOLY_XMM, GFPOLY, GFPOLY + + // If AADLEN >= 128, update GHASH with 128 bytes of AAD at a time. 
+ add $-128, AADLEN // 128 is 4 bytes, -128 is 1 byte + jl .Laad_loop_4x_done + vmovdqu OFFSETOF_H_POWERS_XORED(KEY), H_POW2_XORED + vmovdqu OFFSETOF_H_POWERS_XORED+32(KEY), H_POW1_XORED +.Laad_loop_4x: + _ghash_4x AAD + sub $-128, AAD + add $-128, AADLEN + jge .Laad_loop_4x +.Laad_loop_4x_done: + + // If AADLEN >= 32, update GHASH with 32 bytes of AAD at a time. + add $96, AADLEN + jl .Laad_loop_1x_done +.Laad_loop_1x: + vmovdqu (AAD), TMP0 + vpshufb BSWAP_MASK, TMP0, TMP0 + vpxor TMP0, GHASH_ACC, GHASH_ACC + vmovdqu OFFSETOFEND_H_POWERS-32(KEY), TMP0 + _ghash_mul TMP0, GHASH_ACC, GHASH_ACC, GFPOLY, TMP1, TMP2, LO + vextracti128 $1, GHASH_ACC, TMP0_XMM + vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM + add $32, AAD + sub $32, AADLEN + jge .Laad_loop_1x +.Laad_loop_1x_done: + add $32, AADLEN + // Now 0 <= AADLEN < 32. + + jz .Laad_done + cmp $16, AADLEN + jle .Laad_lastblock + + // Update GHASH with the remaining 17 <= AADLEN <= 31 bytes of AAD. + mov AADLEN, AADLEN // Zero-extend AADLEN to AADLEN64. + vmovdqu (AAD), TMP0_XMM + vmovdqu -16(AAD, AADLEN64), TMP1_XMM + vpshufb BSWAP_MASK_XMM, TMP0_XMM, TMP0_XMM + vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM + lea .Lrshift_and_bswap_table(%rip), %rax + vpshufb -16(%rax, AADLEN64), TMP1_XMM, TMP1_XMM + vinserti128 $1, TMP1_XMM, GHASH_ACC, GHASH_ACC + vmovdqu OFFSETOFEND_H_POWERS-32(KEY), TMP0 + _ghash_mul TMP0, GHASH_ACC, GHASH_ACC, GFPOLY, TMP1, TMP2, LO + vextracti128 $1, GHASH_ACC, TMP0_XMM + vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM + jmp .Laad_done + +.Laad_lastblock: + // Update GHASH with the remaining 1 <= AADLEN <= 16 bytes of AAD. + _load_partial_block AAD, TMP0_XMM, %r8, %r8d + vpshufb BSWAP_MASK_XMM, TMP0_XMM, TMP0_XMM + vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM + vmovdqu OFFSETOFEND_H_POWERS-16(KEY), TMP0_XMM + _ghash_mul TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM, GFPOLY_XMM, \ + TMP1_XMM, TMP2_XMM, LO_XMM + +.Laad_done: + // Store the updated GHASH accumulator back to memory. + vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) + + vzeroupper + RET +SYM_FUNC_END(aes_gcm_aad_update_vaes_avx2) + +// Do one non-last round of AES encryption on the blocks in the given AESDATA +// vectors using the round key that has been broadcast to all 128-bit lanes of +// \round_key. +.macro _vaesenc round_key, vecs:vararg +.irp i, \vecs + vaesenc \round_key, AESDATA\i, AESDATA\i +.endr +.endm + +// Generate counter blocks in the given AESDATA vectors, then do the zero-th AES +// round on them. Clobbers TMP0. +.macro _ctr_begin vecs:vararg + vbroadcasti128 .Linc_2blocks(%rip), TMP0 +.irp i, \vecs + vpshufb BSWAP_MASK, LE_CTR, AESDATA\i + vpaddd TMP0, LE_CTR, LE_CTR +.endr +.irp i, \vecs + vpxor RNDKEY0, AESDATA\i, AESDATA\i +.endr +.endm + +// Generate and encrypt counter blocks in the given AESDATA vectors, excluding +// the last AES round. Clobbers %rax and TMP0. +.macro _aesenc_loop vecs:vararg + _ctr_begin \vecs + lea 16(KEY), %rax +.Laesenc_loop\@: + vbroadcasti128 (%rax), TMP0 + _vaesenc TMP0, \vecs + add $16, %rax + cmp %rax, RNDKEYLAST_PTR + jne .Laesenc_loop\@ +.endm + +// Finalize the keystream blocks in the given AESDATA vectors by doing the last +// AES round, then XOR those keystream blocks with the corresponding data. +// Reduce latency by doing the XOR before the vaesenclast, utilizing the +// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). Clobbers TMP0. 
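The identity used here follows directly from what the final AES round does: ShiftRows and SubBytes (there is no MixColumns in the last round) followed by an XOR with the round key. Writing S(a) for SubBytes(ShiftRows(a)):

	vaesenclast(key, a) ^ b  =  S(a) ^ key ^ b  =  vaesenclast(key ^ b, a)

so XOR-ing the data into the last round key ahead of time gives the same result as XOR-ing it into the keystream afterwards, while taking the final XOR off the end of the dependency chain.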
+.macro _aesenclast_and_xor vecs:vararg +.irp i, \vecs + vpxor \i*32(SRC), RNDKEYLAST, TMP0 + vaesenclast TMP0, AESDATA\i, AESDATA\i +.endr +.irp i, \vecs + vmovdqu AESDATA\i, \i*32(DST) +.endr +.endm + +// void aes_gcm_{enc,dec}_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, +// const u32 le_ctr[4], u8 ghash_acc[16], +// const u8 *src, u8 *dst, int datalen); +// +// This macro generates a GCM encryption or decryption update function with the +// above prototype (with \enc selecting which one). The function computes the +// next portion of the CTR keystream, XOR's it with |datalen| bytes from |src|, +// and writes the resulting encrypted or decrypted data to |dst|. It also +// updates the GHASH accumulator |ghash_acc| using the next |datalen| ciphertext +// bytes. +// +// |datalen| must be a multiple of 16, except on the last call where it can be +// any length. The caller must do any buffering needed to ensure this. Both +// in-place and out-of-place en/decryption are supported. +// +// |le_ctr| must give the current counter in little-endian format. This +// function loads the counter from |le_ctr| and increments the loaded counter as +// needed, but it does *not* store the updated counter back to |le_ctr|. The +// caller must update |le_ctr| if any more data segments follow. Internally, +// only the low 32-bit word of the counter is incremented, following the GCM +// standard. +.macro _aes_gcm_update enc + + // Function arguments + .set KEY, %rdi + .set LE_CTR_PTR, %rsi + .set LE_CTR_PTR32, %esi + .set GHASH_ACC_PTR, %rdx + .set SRC, %rcx // Assumed to be %rcx. + // See .Ltail_xor_and_ghash_1to16bytes + .set DST, %r8 + .set DATALEN, %r9d + .set DATALEN64, %r9 // Zero-extend DATALEN before using! + + // Additional local variables + + // %rax is used as a temporary register. LE_CTR_PTR is also available + // as a temporary register after the counter is loaded. + + // AES key length in bytes + .set AESKEYLEN, %r10d + .set AESKEYLEN64, %r10 + + // Pointer to the last AES round key for the chosen AES variant + .set RNDKEYLAST_PTR, %r11 + + // BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values + // using vpshufb, copied to all 128-bit lanes. + .set BSWAP_MASK, %ymm0 + .set BSWAP_MASK_XMM, %xmm0 + + // GHASH_ACC is the accumulator variable for GHASH. When fully reduced, + // only the lowest 128-bit lane can be nonzero. When not fully reduced, + // more than one lane may be used, and they need to be XOR'd together. + .set GHASH_ACC, %ymm1 + .set GHASH_ACC_XMM, %xmm1 + + // TMP[0-2] are temporary registers. + .set TMP0, %ymm2 + .set TMP0_XMM, %xmm2 + .set TMP1, %ymm3 + .set TMP1_XMM, %xmm3 + .set TMP2, %ymm4 + .set TMP2_XMM, %xmm4 + + // LO and MI are used to accumulate unreduced GHASH products. + .set LO, %ymm5 + .set LO_XMM, %xmm5 + .set MI, %ymm6 + .set MI_XMM, %xmm6 + + // H_POW[2-1]_XORED contain cached values from KEY->h_powers_xored. The + // descending numbering reflects the order of the key powers. + .set H_POW2_XORED, %ymm7 + .set H_POW2_XORED_XMM, %xmm7 + .set H_POW1_XORED, %ymm8 + + // RNDKEY0 caches the zero-th round key, and RNDKEYLAST the last one. + .set RNDKEY0, %ymm9 + .set RNDKEYLAST, %ymm10 + + // LE_CTR contains the next set of little-endian counter blocks. + .set LE_CTR, %ymm11 + + // AESDATA[0-3] hold the counter blocks that are being encrypted by AES. 
+ .set AESDATA0, %ymm12 + .set AESDATA0_XMM, %xmm12 + .set AESDATA1, %ymm13 + .set AESDATA1_XMM, %xmm13 + .set AESDATA2, %ymm14 + .set AESDATA3, %ymm15 + +.if \enc + .set GHASHDATA_PTR, DST +.else + .set GHASHDATA_PTR, SRC +.endif + + vbroadcasti128 .Lbswap_mask(%rip), BSWAP_MASK + + // Load the GHASH accumulator and the starting counter. + vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM + vbroadcasti128 (LE_CTR_PTR), LE_CTR + + // Load the AES key length in bytes. + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN + + // Make RNDKEYLAST_PTR point to the last AES round key. This is the + // round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256 + // respectively. Then load the zero-th and last round keys. + lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR + vbroadcasti128 (KEY), RNDKEY0 + vbroadcasti128 (RNDKEYLAST_PTR), RNDKEYLAST + + // Finish initializing LE_CTR by adding 1 to the second block. + vpaddd .Lctr_pattern(%rip), LE_CTR, LE_CTR + + // If there are at least 128 bytes of data, then continue into the loop + // that processes 128 bytes of data at a time. Otherwise skip it. + add $-128, DATALEN // 128 is 4 bytes, -128 is 1 byte + jl .Lcrypt_loop_4x_done\@ + + vmovdqu OFFSETOF_H_POWERS_XORED(KEY), H_POW2_XORED + vmovdqu OFFSETOF_H_POWERS_XORED+32(KEY), H_POW1_XORED + + // Main loop: en/decrypt and hash 4 vectors (128 bytes) at a time. + +.if \enc + // Encrypt the first 4 vectors of plaintext blocks. + _aesenc_loop 0,1,2,3 + _aesenclast_and_xor 0,1,2,3 + sub $-128, SRC // 128 is 4 bytes, -128 is 1 byte + add $-128, DATALEN + jl .Lghash_last_ciphertext_4x\@ +.endif + +.align 16 +.Lcrypt_loop_4x\@: + + // Start the AES encryption of the counter blocks. + _ctr_begin 0,1,2,3 + cmp $24, AESKEYLEN + jl 128f // AES-128? + je 192f // AES-192? + // AES-256 + vbroadcasti128 -13*16(RNDKEYLAST_PTR), TMP0 + _vaesenc TMP0, 0,1,2,3 + vbroadcasti128 -12*16(RNDKEYLAST_PTR), TMP0 + _vaesenc TMP0, 0,1,2,3 +192: + vbroadcasti128 -11*16(RNDKEYLAST_PTR), TMP0 + _vaesenc TMP0, 0,1,2,3 + vbroadcasti128 -10*16(RNDKEYLAST_PTR), TMP0 + _vaesenc TMP0, 0,1,2,3 +128: + + // Finish the AES encryption of the counter blocks in AESDATA[0-3], + // interleaved with the GHASH update of the ciphertext blocks. +.irp i, 9,8,7,6,5,4,3,2,1 + _ghash_step_4x (9 - \i), GHASHDATA_PTR + vbroadcasti128 -\i*16(RNDKEYLAST_PTR), TMP0 + _vaesenc TMP0, 0,1,2,3 +.endr + _ghash_step_4x 9, GHASHDATA_PTR +.if \enc + sub $-128, DST // 128 is 4 bytes, -128 is 1 byte +.endif + _aesenclast_and_xor 0,1,2,3 + sub $-128, SRC +.if !\enc + sub $-128, DST +.endif + add $-128, DATALEN + jge .Lcrypt_loop_4x\@ + +.if \enc +.Lghash_last_ciphertext_4x\@: + // Update GHASH with the last set of ciphertext blocks. + _ghash_4x DST + sub $-128, DST +.endif + +.Lcrypt_loop_4x_done\@: + + // Undo the extra subtraction by 128 and check whether data remains. + sub $-128, DATALEN // 128 is 4 bytes, -128 is 1 byte + jz .Ldone\@ + + // The data length isn't a multiple of 128 bytes. Process the remaining + // data of length 1 <= DATALEN < 128. + // + // Since there are enough key powers available for all remaining data, + // there is no need to do a GHASH reduction after each iteration. + // Instead, multiply each remaining block by its own key power, and only + // do a GHASH reduction at the very end. + + // Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N + // is the number of blocks that remain. + .set POWERS_PTR, LE_CTR_PTR // LE_CTR_PTR is free to be reused. 
+ .set POWERS_PTR32, LE_CTR_PTR32 + mov DATALEN, %eax + neg %rax + and $~15, %rax // -round_up(DATALEN, 16) + lea OFFSETOFEND_H_POWERS(KEY,%rax), POWERS_PTR + + // Start collecting the unreduced GHASH intermediate value LO, MI, HI. + .set HI, H_POW2_XORED // H_POW2_XORED is free to be reused. + .set HI_XMM, H_POW2_XORED_XMM + vpxor LO_XMM, LO_XMM, LO_XMM + vpxor MI_XMM, MI_XMM, MI_XMM + vpxor HI_XMM, HI_XMM, HI_XMM + + // 1 <= DATALEN < 128. Generate 2 or 4 more vectors of keystream blocks + // excluding the last AES round, depending on the remaining DATALEN. + cmp $64, DATALEN + jg .Ltail_gen_4_keystream_vecs\@ + _aesenc_loop 0,1 + cmp $32, DATALEN + jge .Ltail_xor_and_ghash_full_vec_loop\@ + jmp .Ltail_xor_and_ghash_partial_vec\@ +.Ltail_gen_4_keystream_vecs\@: + _aesenc_loop 0,1,2,3 + + // XOR the remaining data and accumulate the unreduced GHASH products + // for DATALEN >= 32, starting with one full 32-byte vector at a time. +.Ltail_xor_and_ghash_full_vec_loop\@: +.if \enc + _aesenclast_and_xor 0 + vpshufb BSWAP_MASK, AESDATA0, AESDATA0 +.else + vmovdqu (SRC), TMP1 + vpxor TMP1, RNDKEYLAST, TMP0 + vaesenclast TMP0, AESDATA0, AESDATA0 + vmovdqu AESDATA0, (DST) + vpshufb BSWAP_MASK, TMP1, AESDATA0 +.endif + // The ciphertext blocks (i.e. GHASH input data) are now in AESDATA0. + vpxor GHASH_ACC, AESDATA0, AESDATA0 + vmovdqu (POWERS_PTR), TMP2 + _ghash_mul_noreduce TMP2, AESDATA0, LO, MI, HI, TMP0 + vmovdqa AESDATA1, AESDATA0 + vmovdqa AESDATA2, AESDATA1 + vmovdqa AESDATA3, AESDATA2 + vpxor GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM + add $32, SRC + add $32, DST + add $32, POWERS_PTR + sub $32, DATALEN + cmp $32, DATALEN + jge .Ltail_xor_and_ghash_full_vec_loop\@ + test DATALEN, DATALEN + jz .Ltail_ghash_reduce\@ + +.Ltail_xor_and_ghash_partial_vec\@: + // XOR the remaining data and accumulate the unreduced GHASH products, + // for 1 <= DATALEN < 32. + vaesenclast RNDKEYLAST, AESDATA0, AESDATA0 + cmp $16, DATALEN + jle .Ltail_xor_and_ghash_1to16bytes\@ + + // Handle 17 <= DATALEN < 32. + + // Load a vpshufb mask that will right-shift by '32 - DATALEN' bytes + // (shifting in zeroes), then reflect all 16 bytes. + lea .Lrshift_and_bswap_table(%rip), %rax + vmovdqu -16(%rax, DATALEN64), TMP2_XMM + + // Move the second keystream block to its own register and left-align it + vextracti128 $1, AESDATA0, AESDATA1_XMM + vpxor .Lfifteens(%rip), TMP2_XMM, TMP0_XMM + vpshufb TMP0_XMM, AESDATA1_XMM, AESDATA1_XMM + + // Using overlapping loads and stores, XOR the source data with the + // keystream and write the destination data. Then prepare the GHASH + // input data: the full ciphertext block and the zero-padded partial + // ciphertext block, both byte-reflected, in AESDATA0. 
+.if \enc + vpxor -16(SRC, DATALEN64), AESDATA1_XMM, AESDATA1_XMM + vpxor (SRC), AESDATA0_XMM, AESDATA0_XMM + vmovdqu AESDATA1_XMM, -16(DST, DATALEN64) + vmovdqu AESDATA0_XMM, (DST) + vpshufb TMP2_XMM, AESDATA1_XMM, AESDATA1_XMM + vpshufb BSWAP_MASK_XMM, AESDATA0_XMM, AESDATA0_XMM +.else + vmovdqu -16(SRC, DATALEN64), TMP1_XMM + vmovdqu (SRC), TMP0_XMM + vpxor TMP1_XMM, AESDATA1_XMM, AESDATA1_XMM + vpxor TMP0_XMM, AESDATA0_XMM, AESDATA0_XMM + vmovdqu AESDATA1_XMM, -16(DST, DATALEN64) + vmovdqu AESDATA0_XMM, (DST) + vpshufb TMP2_XMM, TMP1_XMM, AESDATA1_XMM + vpshufb BSWAP_MASK_XMM, TMP0_XMM, AESDATA0_XMM +.endif + vpxor GHASH_ACC_XMM, AESDATA0_XMM, AESDATA0_XMM + vinserti128 $1, AESDATA1_XMM, AESDATA0, AESDATA0 + vmovdqu (POWERS_PTR), TMP2 + jmp .Ltail_ghash_last_vec\@ + +.Ltail_xor_and_ghash_1to16bytes\@: + // Handle 1 <= DATALEN <= 16. Carefully load and store the + // possibly-partial block, which we mustn't access out of bounds. + vmovdqu (POWERS_PTR), TMP2_XMM + mov SRC, KEY // Free up %rcx, assuming SRC == %rcx + mov DATALEN, %ecx + _load_partial_block KEY, TMP0_XMM, POWERS_PTR, POWERS_PTR32 + vpxor TMP0_XMM, AESDATA0_XMM, AESDATA0_XMM + mov DATALEN, %ecx + _store_partial_block AESDATA0_XMM, DST, POWERS_PTR, POWERS_PTR32 +.if \enc + lea .Lselect_high_bytes_table(%rip), %rax + vpshufb BSWAP_MASK_XMM, AESDATA0_XMM, AESDATA0_XMM + vpand (%rax, DATALEN64), AESDATA0_XMM, AESDATA0_XMM +.else + vpshufb BSWAP_MASK_XMM, TMP0_XMM, AESDATA0_XMM +.endif + vpxor GHASH_ACC_XMM, AESDATA0_XMM, AESDATA0_XMM + +.Ltail_ghash_last_vec\@: + // Accumulate the unreduced GHASH products for the last 1-2 blocks. The + // GHASH input data is in AESDATA0. If only one block remains, then the + // second block in AESDATA0 is zero and does not affect the result. + _ghash_mul_noreduce TMP2, AESDATA0, LO, MI, HI, TMP0 + +.Ltail_ghash_reduce\@: + // Finally, do the GHASH reduction. + vbroadcasti128 .Lgfpoly(%rip), TMP0 + _ghash_reduce LO, MI, HI, TMP0, TMP1 + vextracti128 $1, HI, GHASH_ACC_XMM + vpxor HI_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM + +.Ldone\@: + // Store the updated GHASH accumulator back to memory. + vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) + + vzeroupper + RET +.endm + +// void aes_gcm_enc_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, +// const u32 le_ctr[4], u8 ghash_acc[16], +// u64 total_aadlen, u64 total_datalen); +// bool aes_gcm_dec_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, +// const u32 le_ctr[4], const u8 ghash_acc[16], +// u64 total_aadlen, u64 total_datalen, +// const u8 tag[16], int taglen); +// +// This macro generates one of the above two functions (with \enc selecting +// which one). Both functions finish computing the GCM authentication tag by +// updating GHASH with the lengths block and encrypting the GHASH accumulator. +// |total_aadlen| and |total_datalen| must be the total length of the additional +// authenticated data and the en/decrypted data in bytes, respectively. +// +// The encryption function then stores the full-length (16-byte) computed +// authentication tag to |ghash_acc|. The decryption function instead loads the +// expected authentication tag (the one that was transmitted) from the 16-byte +// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the +// computed tag in constant time, and returns true if and only if they match. 
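The constant-time comparison described above can be sketched in C for reference. This is a hypothetical illustration rather than the kernel's implementation (the assembly uses a byte-select mask plus vptest), but the idea is the same: XOR the computed and transmitted tags, fold the relevant bytes into a single accumulator, and test it only once at the end so no individual tag byte feeds a branch:

	#include <stdbool.h>
	#include <stdint.h>

	/* Hypothetical helper, shown only to illustrate the constant-time check. */
	static bool gcm_tags_equal(const uint8_t computed[16],
				   const uint8_t transmitted[16], int taglen)
	{
		uint8_t diff = 0;

		for (int i = 0; i < taglen; i++)
			diff |= computed[i] ^ transmitted[i];

		return diff == 0;	/* single data-independent test at the end */
	}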
+.macro _aes_gcm_final enc + + // Function arguments + .set KEY, %rdi + .set LE_CTR_PTR, %rsi + .set GHASH_ACC_PTR, %rdx + .set TOTAL_AADLEN, %rcx + .set TOTAL_DATALEN, %r8 + .set TAG, %r9 + .set TAGLEN, %r10d // Originally at 8(%rsp) + .set TAGLEN64, %r10 + + // Additional local variables. + // %rax and %xmm0-%xmm3 are used as temporary registers. + .set AESKEYLEN, %r11d + .set AESKEYLEN64, %r11 + .set GFPOLY, %xmm4 + .set BSWAP_MASK, %xmm5 + .set LE_CTR, %xmm6 + .set GHASH_ACC, %xmm7 + .set H_POW1, %xmm8 + + // Load some constants. + vmovdqa .Lgfpoly(%rip), GFPOLY + vmovdqa .Lbswap_mask(%rip), BSWAP_MASK + + // Load the AES key length in bytes. + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN + + // Set up a counter block with 1 in the low 32-bit word. This is the + // counter that produces the ciphertext needed to encrypt the auth tag. + // GFPOLY has 1 in the low word, so grab the 1 from there using a blend. + vpblendd $0xe, (LE_CTR_PTR), GFPOLY, LE_CTR + + // Build the lengths block and XOR it with the GHASH accumulator. + // Although the lengths block is defined as the AAD length followed by + // the en/decrypted data length, both in big-endian byte order, a byte + // reflection of the full block is needed because of the way we compute + // GHASH (see _ghash_mul_step). By using little-endian values in the + // opposite order, we avoid having to reflect any bytes here. + vmovq TOTAL_DATALEN, %xmm0 + vpinsrq $1, TOTAL_AADLEN, %xmm0, %xmm0 + vpsllq $3, %xmm0, %xmm0 // Bytes to bits + vpxor (GHASH_ACC_PTR), %xmm0, GHASH_ACC + + // Load the first hash key power (H^1), which is stored last. + vmovdqu OFFSETOFEND_H_POWERS-16(KEY), H_POW1 + + // Load TAGLEN if decrypting. +.if !\enc + movl 8(%rsp), TAGLEN +.endif + + // Make %rax point to the last AES round key for the chosen AES variant. + lea 6*16(KEY,AESKEYLEN64,4), %rax + + // Start the AES encryption of the counter block by swapping the counter + // block to big-endian and XOR-ing it with the zero-th AES round key. + vpshufb BSWAP_MASK, LE_CTR, %xmm0 + vpxor (KEY), %xmm0, %xmm0 + + // Complete the AES encryption and multiply GHASH_ACC by H^1. + // Interleave the AES and GHASH instructions to improve performance. + cmp $24, AESKEYLEN + jl 128f // AES-128? + je 192f // AES-192? + // AES-256 + vaesenc -13*16(%rax), %xmm0, %xmm0 + vaesenc -12*16(%rax), %xmm0, %xmm0 +192: + vaesenc -11*16(%rax), %xmm0, %xmm0 + vaesenc -10*16(%rax), %xmm0, %xmm0 +128: +.irp i, 0,1,2,3,4,5,6,7,8 + _ghash_mul_step \i, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ + %xmm1, %xmm2, %xmm3 + vaesenc (\i-9)*16(%rax), %xmm0, %xmm0 +.endr + _ghash_mul_step 9, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ + %xmm1, %xmm2, %xmm3 + + // Undo the byte reflection of the GHASH accumulator. + vpshufb BSWAP_MASK, GHASH_ACC, GHASH_ACC + + // Do the last AES round and XOR the resulting keystream block with the + // GHASH accumulator to produce the full computed authentication tag. + // + // Reduce latency by taking advantage of the property vaesenclast(key, + // a) ^ b == vaesenclast(key ^ b, a). I.e., XOR GHASH_ACC into the last + // round key, instead of XOR'ing the final AES output with GHASH_ACC. + // + // enc_final then returns the computed auth tag, while dec_final + // compares it with the transmitted one and returns a bool. To compare + // the tags, dec_final XORs them together and uses vptest to check + // whether the result is all-zeroes. This should be constant-time. + // dec_final applies the vaesenclast optimization to this additional + // value XOR'd too. 
+.if \enc + vpxor (%rax), GHASH_ACC, %xmm1 + vaesenclast %xmm1, %xmm0, GHASH_ACC + vmovdqu GHASH_ACC, (GHASH_ACC_PTR) +.else + vpxor (TAG), GHASH_ACC, GHASH_ACC + vpxor (%rax), GHASH_ACC, GHASH_ACC + vaesenclast GHASH_ACC, %xmm0, %xmm0 + lea .Lselect_high_bytes_table(%rip), %rax + vmovdqu (%rax, TAGLEN64), %xmm1 + vpshufb BSWAP_MASK, %xmm1, %xmm1 // select low bytes, not high + xor %eax, %eax + vptest %xmm1, %xmm0 + sete %al +.endif + // No need for vzeroupper here, since only used xmm registers were used. + RET +.endm + +SYM_FUNC_START(aes_gcm_enc_update_vaes_avx2) + _aes_gcm_update 1 +SYM_FUNC_END(aes_gcm_enc_update_vaes_avx2) +SYM_FUNC_START(aes_gcm_dec_update_vaes_avx2) + _aes_gcm_update 0 +SYM_FUNC_END(aes_gcm_dec_update_vaes_avx2) + +SYM_FUNC_START(aes_gcm_enc_final_vaes_avx2) + _aes_gcm_final 1 +SYM_FUNC_END(aes_gcm_enc_final_vaes_avx2) +SYM_FUNC_START(aes_gcm_dec_final_vaes_avx2) + _aes_gcm_final 0 +SYM_FUNC_END(aes_gcm_dec_final_vaes_avx2) diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-vaes-avx512.S index 02ee11083d4f..06b71314d65c 100644 --- a/arch/x86/crypto/aes-gcm-avx10-x86_64.S +++ b/arch/x86/crypto/aes-gcm-vaes-avx512.S @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ // -// VAES and VPCLMULQDQ optimized AES-GCM for x86_64 +// AES-GCM implementation for x86_64 CPUs that support the following CPU +// features: VAES && VPCLMULQDQ && AVX512BW && AVX512VL && BMI2 // // Copyright 2024 Google LLC // @@ -45,41 +46,6 @@ // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. -// -//------------------------------------------------------------------------------ -// -// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that -// support VAES (vector AES), VPCLMULQDQ (vector carryless multiplication), and -// either AVX512 or AVX10. Some of the functions, notably the encryption and -// decryption update functions which are the most performance-critical, are -// provided in two variants generated from a macro: one using 256-bit vectors -// (suffix: vaes_avx10_256) and one using 512-bit vectors (vaes_avx10_512). The -// other, "shared" functions (vaes_avx10) use at most 256-bit vectors. -// -// The functions that use 512-bit vectors are intended for CPUs that support -// 512-bit vectors *and* where using them doesn't cause significant -// downclocking. They require the following CPU features: -// -// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/512) -// -// The other functions require the following CPU features: -// -// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/256) -// -// All functions use the "System V" ABI. The Windows ABI is not supported. -// -// Note that we use "avx10" in the names of the functions as a shorthand to -// really mean "AVX10 or a certain set of AVX512 features". Due to Intel's -// introduction of AVX512 and then its replacement by AVX10, there doesn't seem -// to be a simple way to name things that makes sense on all CPUs. -// -// Note that the macros that support both 256-bit and 512-bit vectors could -// fairly easily be changed to support 128-bit too. 
However, this would *not* -// be sufficient to allow the code to run on CPUs without AVX512 or AVX10, -// because the code heavily uses several features of these extensions other than -// the vector length: the increase in the number of SIMD registers from 16 to -// 32, masking support, and new instructions such as vpternlogd (which can do a -// three-argument XOR). These features are very useful for AES-GCM. #include <linux/linkage.h> @@ -104,16 +70,14 @@ .Lgfpoly_and_internal_carrybit: .octa 0xc2000000000000010000000000000001 - // The below constants are used for incrementing the counter blocks. - // ctr_pattern points to the four 128-bit values [0, 1, 2, 3]. - // inc_2blocks and inc_4blocks point to the single 128-bit values 2 and - // 4. Note that the same '2' is reused in ctr_pattern and inc_2blocks. + // Values needed to prepare the initial vector of counter blocks. .Lctr_pattern: .octa 0 .octa 1 -.Linc_2blocks: .octa 2 .octa 3 + + // The number of AES blocks per vector, as a 128-bit value. .Linc_4blocks: .octa 4 @@ -130,29 +94,13 @@ // Offset to end of hash key powers array in the key struct. // // This is immediately followed by three zeroized padding blocks, which are -// included so that partial vectors can be handled more easily. E.g. if VL=64 -// and two blocks remain, we load the 4 values [H^2, H^1, 0, 0]. The most -// padding blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded. +// included so that partial vectors can be handled more easily. E.g. if two +// blocks remain, we load the 4 values [H^2, H^1, 0, 0]. The most padding +// blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded. #define OFFSETOFEND_H_POWERS (OFFSETOF_H_POWERS + (NUM_H_POWERS * 16)) .text -// Set the vector length in bytes. This sets the VL variable and defines -// register aliases V0-V31 that map to the ymm or zmm registers. -.macro _set_veclen vl - .set VL, \vl -.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ - 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 -.if VL == 32 - .set V\i, %ymm\i -.elseif VL == 64 - .set V\i, %zmm\i -.else - .error "Unsupported vector length" -.endif -.endr -.endm - // The _ghash_mul_step macro does one step of GHASH multiplication of the // 128-bit lanes of \a by the corresponding 128-bit lanes of \b and storing the // reduced products in \dst. \t0, \t1, and \t2 are temporary registers of the @@ -312,39 +260,44 @@ vpternlogd $0x96, \t0, \mi, \hi .endm -// void aes_gcm_precompute_##suffix(struct aes_gcm_key_avx10 *key); -// -// Given the expanded AES key |key->aes_key|, this function derives the GHASH -// subkey and initializes |key->ghash_key_powers| with powers of it. -// -// The number of key powers initialized is NUM_H_POWERS, and they are stored in -// the order H^NUM_H_POWERS to H^1. The zeroized padding blocks after the key -// powers themselves are also initialized. +// This is a specialized version of _ghash_mul that computes \a * \a, i.e. it +// squares \a. It skips computing MI = (a_L * a_H) + (a_H * a_L) = 0. 
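The MI term can be skipped because the two cross products are identical and addition in this field is XOR, so they cancel. The standalone snippet below is only a sanity check of that property using the PCLMULQDQ intrinsic; it is not part of this file (build with something like gcc -mpclmul):

#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>

int main(void)
{
	__m128i a = _mm_set_epi64x(0x0123456789abcdefLL, 0x0fedcba987654321LL);
	uint64_t mi[2];

	/* The two cross products: a_H * a_L and a_L * a_H (imm8 picks the halves). */
	__m128i hi_lo = _mm_clmulepi64_si128(a, a, 0x01);
	__m128i lo_hi = _mm_clmulepi64_si128(a, a, 0x10);

	/* Addition in GF(2) is XOR, so the identical products cancel to zero. */
	_mm_storeu_si128((__m128i *)mi, _mm_xor_si128(hi_lo, lo_hi));
	printf("MI = %016llx%016llx\n",
	       (unsigned long long)mi[1], (unsigned long long)mi[0]);
	return 0;
}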
+.macro _ghash_square a, dst, gfpoly, t0, t1 + vpclmulqdq $0x00, \a, \a, \t0 // LO = a_L * a_L + vpclmulqdq $0x11, \a, \a, \dst // HI = a_H * a_H + vpclmulqdq $0x01, \t0, \gfpoly, \t1 // LO_L*(x^63 + x^62 + x^57) + vpshufd $0x4e, \t0, \t0 // Swap halves of LO + vpxord \t0, \t1, \t1 // Fold LO into MI + vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) + vpshufd $0x4e, \t1, \t1 // Swap halves of MI + vpternlogd $0x96, \t0, \t1, \dst // Fold MI into HI +.endm + +// void aes_gcm_precompute_vaes_avx512(struct aes_gcm_key_vaes_avx512 *key); // -// This macro supports both VL=32 and VL=64. _set_veclen must have been invoked -// with the desired length. In the VL=32 case, the function computes twice as -// many key powers than are actually used by the VL=32 GCM update functions. -// This is done to keep the key format the same regardless of vector length. -.macro _aes_gcm_precompute +// Given the expanded AES key |key->base.aes_key|, derive the GHASH subkey and +// initialize |key->h_powers| and |key->padding|. +SYM_FUNC_START(aes_gcm_precompute_vaes_avx512) // Function arguments .set KEY, %rdi - // Additional local variables. V0-V2 and %rax are used as temporaries. + // Additional local variables. + // %zmm[0-2] and %rax are used as temporaries. .set POWERS_PTR, %rsi .set RNDKEYLAST_PTR, %rdx - .set H_CUR, V3 + .set H_CUR, %zmm3 .set H_CUR_YMM, %ymm3 .set H_CUR_XMM, %xmm3 - .set H_INC, V4 + .set H_INC, %zmm4 .set H_INC_YMM, %ymm4 .set H_INC_XMM, %xmm4 - .set GFPOLY, V5 + .set GFPOLY, %zmm5 .set GFPOLY_YMM, %ymm5 .set GFPOLY_XMM, %xmm5 // Get pointer to lowest set of key powers (located at end of array). - lea OFFSETOFEND_H_POWERS-VL(KEY), POWERS_PTR + lea OFFSETOFEND_H_POWERS-64(KEY), POWERS_PTR // Encrypt an all-zeroes block to get the raw hash subkey. movl OFFSETOF_AESKEYLEN(KEY), %eax @@ -363,8 +316,8 @@ // Zeroize the padding blocks. vpxor %xmm0, %xmm0, %xmm0 - vmovdqu %ymm0, VL(POWERS_PTR) - vmovdqu %xmm0, VL+2*16(POWERS_PTR) + vmovdqu %ymm0, 64(POWERS_PTR) + vmovdqu %xmm0, 64+2*16(POWERS_PTR) // Finish preprocessing the first key power, H^1. Since this GHASH // implementation operates directly on values with the backwards bit @@ -397,54 +350,44 @@ // special needs to be done to make this happen, though: H^1 * H^1 would // end up with two factors of x^-1, but the multiplication consumes one. // So the product H^2 ends up with the desired one factor of x^-1. - _ghash_mul H_CUR_XMM, H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, \ - %xmm0, %xmm1, %xmm2 + _ghash_square H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, %xmm0, %xmm1 // Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2]. vinserti128 $1, H_CUR_XMM, H_INC_YMM, H_CUR_YMM vinserti128 $1, H_INC_XMM, H_INC_YMM, H_INC_YMM -.if VL == 64 // Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4]. _ghash_mul H_INC_YMM, H_CUR_YMM, H_INC_YMM, GFPOLY_YMM, \ %ymm0, %ymm1, %ymm2 vinserti64x4 $1, H_CUR_YMM, H_INC, H_CUR vshufi64x2 $0, H_INC, H_INC, H_INC -.endif // Store the lowest set of key powers. vmovdqu8 H_CUR, (POWERS_PTR) - // Compute and store the remaining key powers. With VL=32, repeatedly - // multiply [H^(i+1), H^i] by [H^2, H^2] to get [H^(i+3), H^(i+2)]. - // With VL=64, repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by + // Compute and store the remaining key powers. + // Repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by // [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)]. 
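In other words, the loop that follows starts from the group [H^4, H^3, H^2, H^1] and multiplies whole groups of four by H^4 until slot 0 holds H^16. Multiplying by H^4 adds 4 to the exponent, so the schedule itself can be illustrated with plain integers; the sketch below models only the exponents and is not kernel code:

#include <stdio.h>

/*
 * Model only the exponents: exp[i] is the power of H stored in slot i.
 * Slot 0 should end up as H^16 and slot 15 as H^1, matching h_powers[].
 */
int main(void)
{
	int exp[16];
	int i;

	/* Seed group, the lowest four powers: [H^4, H^3, H^2, H^1]. */
	exp[12] = 4; exp[13] = 3; exp[14] = 2; exp[15] = 1;

	/* Each earlier group of four is the previous group times H^4. */
	for (i = 11; i >= 0; i--)
		exp[i] = exp[i + 4] + 4;

	for (i = 0; i < 16; i++)
		printf("slot %2d: H^%d\n", i, exp[i]);
	return 0;
}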
- mov $(NUM_H_POWERS*16/VL) - 1, %eax -.Lprecompute_next\@: - sub $VL, POWERS_PTR - _ghash_mul H_INC, H_CUR, H_CUR, GFPOLY, V0, V1, V2 + mov $3, %eax +.Lprecompute_next: + sub $64, POWERS_PTR + _ghash_mul H_INC, H_CUR, H_CUR, GFPOLY, %zmm0, %zmm1, %zmm2 vmovdqu8 H_CUR, (POWERS_PTR) dec %eax - jnz .Lprecompute_next\@ + jnz .Lprecompute_next vzeroupper // This is needed after using ymm or zmm registers. RET -.endm +SYM_FUNC_END(aes_gcm_precompute_vaes_avx512) // XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store // the result in \dst_xmm. This implicitly zeroizes the other lanes of dst. .macro _horizontal_xor src, src_xmm, dst_xmm, t0_xmm, t1_xmm, t2_xmm vextracti32x4 $1, \src, \t0_xmm -.if VL == 32 - vpxord \t0_xmm, \src_xmm, \dst_xmm -.elseif VL == 64 vextracti32x4 $2, \src, \t1_xmm vextracti32x4 $3, \src, \t2_xmm vpxord \t0_xmm, \src_xmm, \dst_xmm vpternlogd $0x96, \t1_xmm, \t2_xmm, \dst_xmm -.else - .error "Unsupported vector length" -.endif .endm // Do one step of the GHASH update of the data blocks given in the vector @@ -458,25 +401,21 @@ // // The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) + // H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the -// operations are vectorized operations on vectors of 16-byte blocks. E.g., -// with VL=32 there are 2 blocks per vector and the vectorized terms correspond -// to the following non-vectorized terms: -// -// H_POW4*(GHASHDATA0 + GHASH_ACC) => H^8*(blk0 + GHASH_ACC_XMM) and H^7*(blk1 + 0) -// H_POW3*GHASHDATA1 => H^6*blk2 and H^5*blk3 -// H_POW2*GHASHDATA2 => H^4*blk4 and H^3*blk5 -// H_POW1*GHASHDATA3 => H^2*blk6 and H^1*blk7 +// operations are vectorized operations on 512-bit vectors of 128-bit blocks. +// The vectorized terms correspond to the following non-vectorized terms: // -// With VL=64, we use 4 blocks/vector, H^16 through H^1, and blk0 through blk15. +// H_POW4*(GHASHDATA0 + GHASH_ACC) => H^16*(blk0 + GHASH_ACC_XMM), +// H^15*(blk1 + 0), H^14*(blk2 + 0), and H^13*(blk3 + 0) +// H_POW3*GHASHDATA1 => H^12*blk4, H^11*blk5, H^10*blk6, and H^9*blk7 +// H_POW2*GHASHDATA2 => H^8*blk8, H^7*blk9, H^6*blk10, and H^5*blk11 +// H_POW1*GHASHDATA3 => H^4*blk12, H^3*blk13, H^2*blk14, and H^1*blk15 // // More concretely, this code does: // - Do vectorized "schoolbook" multiplications to compute the intermediate // 256-bit product of each block and its corresponding hash key power. -// There are 4*VL/16 of these intermediate products. -// - Sum (XOR) the intermediate 256-bit products across vectors. This leaves -// VL/16 256-bit intermediate values. +// - Sum (XOR) the intermediate 256-bit products across vectors. // - Do a vectorized reduction of these 256-bit intermediate values to -// 128-bits each. This leaves VL/16 128-bit intermediate values. +// 128-bits each. // - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM. // // See _ghash_mul_step for the full explanation of the operations performed for @@ -532,85 +471,224 @@ .endif .endm -// Do one non-last round of AES encryption on the counter blocks in V0-V3 using -// the round key that has been broadcast to all 128-bit lanes of \round_key. +// Update GHASH with four vectors of data blocks. See _ghash_step_4x for full +// explanation. 
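The per-block Horner form of GHASH and the folded form computed by _ghash_step_4x agree because ((...((acc + b0)*H + b1)*H + ...)*H + b15)*H equals H^16*(acc + b0) + H^15*b1 + ... + H*b15. That identity is plain ring algebra, so it can be illustrated with ordinary modular arithmetic rather than GF(2^128); the standalone check below is not kernel code:

#include <stdio.h>
#include <stdint.h>

#define MOD 1000003ULL		/* any modulus works; the identity is pure algebra */

int main(void)
{
	uint64_t h = 123457, acc = 98765, blk[16];
	uint64_t horner, folded, hp;
	int i;

	for (i = 0; i < 16; i++)
		blk[i] = (i + 1) * 424243ULL % MOD;

	/* One block at a time: acc = (acc + blk[i]) * H. */
	horner = acc;
	for (i = 0; i < 16; i++)
		horner = (horner + blk[i]) % MOD * h % MOD;

	/* All 16 at once: H^16*(blk0 + acc) + H^15*blk1 + ... + H^1*blk15. */
	folded = 0;
	hp = 1;
	for (i = 0; i < 16; i++) {
		uint64_t term = blk[15 - i];

		if (i == 15)			/* the highest power also absorbs acc */
			term = (term + acc) % MOD;
		hp = hp * h % MOD;		/* hp = H^(i+1) */
		folded = (folded + term * hp) % MOD;
	}

	printf("%s\n", horner == folded ? "identical" : "different");
	return 0;
}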
+.macro _ghash_4x +.irp i, 0,1,2,3,4,5,6,7,8,9 + _ghash_step_4x \i +.endr +.endm + +// void aes_gcm_aad_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, +// u8 ghash_acc[16], +// const u8 *aad, int aadlen); +// +// This function processes the AAD (Additional Authenticated Data) in GCM. +// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the +// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all +// zeroes. |aadlen| must be a multiple of 16, except on the last call where it +// can be any length. The caller must do any buffering needed to ensure this. +// +// This handles large amounts of AAD efficiently, while also keeping overhead +// low for small amounts which is the common case. TLS and IPsec use less than +// one block of AAD, but (uncommonly) other use cases may use much more. +SYM_FUNC_START(aes_gcm_aad_update_vaes_avx512) + + // Function arguments + .set KEY, %rdi + .set GHASH_ACC_PTR, %rsi + .set AAD, %rdx + .set AADLEN, %ecx + .set AADLEN64, %rcx // Zero-extend AADLEN before using! + + // Additional local variables. + // %rax and %k1 are used as temporary registers. + .set GHASHDATA0, %zmm0 + .set GHASHDATA0_XMM, %xmm0 + .set GHASHDATA1, %zmm1 + .set GHASHDATA1_XMM, %xmm1 + .set GHASHDATA2, %zmm2 + .set GHASHDATA2_XMM, %xmm2 + .set GHASHDATA3, %zmm3 + .set BSWAP_MASK, %zmm4 + .set BSWAP_MASK_XMM, %xmm4 + .set GHASH_ACC, %zmm5 + .set GHASH_ACC_XMM, %xmm5 + .set H_POW4, %zmm6 + .set H_POW3, %zmm7 + .set H_POW2, %zmm8 + .set H_POW1, %zmm9 + .set H_POW1_XMM, %xmm9 + .set GFPOLY, %zmm10 + .set GFPOLY_XMM, %xmm10 + .set GHASHTMP0, %zmm11 + .set GHASHTMP1, %zmm12 + .set GHASHTMP2, %zmm13 + + // Load the GHASH accumulator. + vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM + + // Check for the common case of AADLEN <= 16, as well as AADLEN == 0. + cmp $16, AADLEN + jg .Laad_more_than_16bytes + test AADLEN, AADLEN + jz .Laad_done + + // Fast path: update GHASH with 1 <= AADLEN <= 16 bytes of AAD. + vmovdqu .Lbswap_mask(%rip), BSWAP_MASK_XMM + vmovdqu .Lgfpoly(%rip), GFPOLY_XMM + mov $-1, %eax + bzhi AADLEN, %eax, %eax + kmovd %eax, %k1 + vmovdqu8 (AAD), GHASHDATA0_XMM{%k1}{z} + vmovdqu OFFSETOFEND_H_POWERS-16(KEY), H_POW1_XMM + vpshufb BSWAP_MASK_XMM, GHASHDATA0_XMM, GHASHDATA0_XMM + vpxor GHASHDATA0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM + _ghash_mul H_POW1_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM, GFPOLY_XMM, \ + GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM + jmp .Laad_done + +.Laad_more_than_16bytes: + vbroadcasti32x4 .Lbswap_mask(%rip), BSWAP_MASK + vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY + + // If AADLEN >= 256, update GHASH with 256 bytes of AAD at a time. + sub $256, AADLEN + jl .Laad_loop_4x_done + vmovdqu8 OFFSETOFEND_H_POWERS-4*64(KEY), H_POW4 + vmovdqu8 OFFSETOFEND_H_POWERS-3*64(KEY), H_POW3 + vmovdqu8 OFFSETOFEND_H_POWERS-2*64(KEY), H_POW2 + vmovdqu8 OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1 +.Laad_loop_4x: + vmovdqu8 0*64(AAD), GHASHDATA0 + vmovdqu8 1*64(AAD), GHASHDATA1 + vmovdqu8 2*64(AAD), GHASHDATA2 + vmovdqu8 3*64(AAD), GHASHDATA3 + _ghash_4x + add $256, AAD + sub $256, AADLEN + jge .Laad_loop_4x +.Laad_loop_4x_done: + + // If AADLEN >= 64, update GHASH with 64 bytes of AAD at a time. 
+ add $192, AADLEN + jl .Laad_loop_1x_done + vmovdqu8 OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1 +.Laad_loop_1x: + vmovdqu8 (AAD), GHASHDATA0 + vpshufb BSWAP_MASK, GHASHDATA0, GHASHDATA0 + vpxord GHASHDATA0, GHASH_ACC, GHASH_ACC + _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ + GHASHDATA0, GHASHDATA1, GHASHDATA2 + _horizontal_xor GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \ + GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM + add $64, AAD + sub $64, AADLEN + jge .Laad_loop_1x +.Laad_loop_1x_done: + + // Update GHASH with the remaining 0 <= AADLEN < 64 bytes of AAD. + add $64, AADLEN + jz .Laad_done + mov $-1, %rax + bzhi AADLEN64, %rax, %rax + kmovq %rax, %k1 + vmovdqu8 (AAD), GHASHDATA0{%k1}{z} + neg AADLEN64 + and $~15, AADLEN64 // -round_up(AADLEN, 16) + vmovdqu8 OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1 + vpshufb BSWAP_MASK, GHASHDATA0, GHASHDATA0 + vpxord GHASHDATA0, GHASH_ACC, GHASH_ACC + _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ + GHASHDATA0, GHASHDATA1, GHASHDATA2 + _horizontal_xor GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \ + GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM + +.Laad_done: + // Store the updated GHASH accumulator back to memory. + vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) + + vzeroupper // This is needed after using ymm or zmm registers. + RET +SYM_FUNC_END(aes_gcm_aad_update_vaes_avx512) + +// Do one non-last round of AES encryption on the blocks in %zmm[0-3] using the +// round key that has been broadcast to all 128-bit lanes of \round_key. .macro _vaesenc_4x round_key - vaesenc \round_key, V0, V0 - vaesenc \round_key, V1, V1 - vaesenc \round_key, V2, V2 - vaesenc \round_key, V3, V3 + vaesenc \round_key, %zmm0, %zmm0 + vaesenc \round_key, %zmm1, %zmm1 + vaesenc \round_key, %zmm2, %zmm2 + vaesenc \round_key, %zmm3, %zmm3 .endm // Start the AES encryption of four vectors of counter blocks. .macro _ctr_begin_4x // Increment LE_CTR four times to generate four vectors of little-endian - // counter blocks, swap each to big-endian, and store them in V0-V3. - vpshufb BSWAP_MASK, LE_CTR, V0 + // counter blocks, swap each to big-endian, and store them in %zmm[0-3]. + vpshufb BSWAP_MASK, LE_CTR, %zmm0 vpaddd LE_CTR_INC, LE_CTR, LE_CTR - vpshufb BSWAP_MASK, LE_CTR, V1 + vpshufb BSWAP_MASK, LE_CTR, %zmm1 vpaddd LE_CTR_INC, LE_CTR, LE_CTR - vpshufb BSWAP_MASK, LE_CTR, V2 + vpshufb BSWAP_MASK, LE_CTR, %zmm2 vpaddd LE_CTR_INC, LE_CTR, LE_CTR - vpshufb BSWAP_MASK, LE_CTR, V3 + vpshufb BSWAP_MASK, LE_CTR, %zmm3 vpaddd LE_CTR_INC, LE_CTR, LE_CTR // AES "round zero": XOR in the zero-th round key. - vpxord RNDKEY0, V0, V0 - vpxord RNDKEY0, V1, V1 - vpxord RNDKEY0, V2, V2 - vpxord RNDKEY0, V3, V3 + vpxord RNDKEY0, %zmm0, %zmm0 + vpxord RNDKEY0, %zmm1, %zmm1 + vpxord RNDKEY0, %zmm2, %zmm2 + vpxord RNDKEY0, %zmm3, %zmm3 .endm -// Do the last AES round for four vectors of counter blocks V0-V3, XOR source -// data with the resulting keystream, and write the result to DST and +// Do the last AES round for four vectors of counter blocks %zmm[0-3], XOR +// source data with the resulting keystream, and write the result to DST and // GHASHDATA[0-3]. (Implementation differs slightly, but has the same effect.) .macro _aesenclast_and_xor_4x // XOR the source data with the last round key, saving the result in // GHASHDATA[0-3]. This reduces latency by taking advantage of the // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). 
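The identity vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a) holds because the last AES round is SubBytes and ShiftRows followed by a plain XOR with the round key. A standalone check with the AES-NI intrinsic is shown below; it is not kernel code (build with something like gcc -maes):

#include <stdio.h>
#include <string.h>
#include <immintrin.h>

int main(void)
{
	__m128i state = _mm_set_epi32(0x00112233, 0x44556677, 0x0899aabb, 0x3cddeeff);
	__m128i rk    = _mm_set_epi32(0x0f1e2d3c, 0x4b5a6978, 0x0796a5b4, 0x43d2e1f0);
	__m128i src   = _mm_set_epi32(0x13579bdf, 0x02468ace, 0x7db97531, 0x6ca86420);
	unsigned char x[16], y[16];

	/* Keystream block XOR'd with the data afterwards... */
	__m128i a = _mm_xor_si128(_mm_aesenclast_si128(state, rk), src);
	/* ...equals folding the data into the last round key up front. */
	__m128i b = _mm_aesenclast_si128(state, _mm_xor_si128(rk, src));

	_mm_storeu_si128((__m128i *)x, a);
	_mm_storeu_si128((__m128i *)y, b);
	printf("%s\n", memcmp(x, y, 16) == 0 ? "equal" : "different");
	return 0;
}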
- vpxord 0*VL(SRC), RNDKEYLAST, GHASHDATA0 - vpxord 1*VL(SRC), RNDKEYLAST, GHASHDATA1 - vpxord 2*VL(SRC), RNDKEYLAST, GHASHDATA2 - vpxord 3*VL(SRC), RNDKEYLAST, GHASHDATA3 + vpxord 0*64(SRC), RNDKEYLAST, GHASHDATA0 + vpxord 1*64(SRC), RNDKEYLAST, GHASHDATA1 + vpxord 2*64(SRC), RNDKEYLAST, GHASHDATA2 + vpxord 3*64(SRC), RNDKEYLAST, GHASHDATA3 // Do the last AES round. This handles the XOR with the source data // too, as per the optimization described above. - vaesenclast GHASHDATA0, V0, GHASHDATA0 - vaesenclast GHASHDATA1, V1, GHASHDATA1 - vaesenclast GHASHDATA2, V2, GHASHDATA2 - vaesenclast GHASHDATA3, V3, GHASHDATA3 + vaesenclast GHASHDATA0, %zmm0, GHASHDATA0 + vaesenclast GHASHDATA1, %zmm1, GHASHDATA1 + vaesenclast GHASHDATA2, %zmm2, GHASHDATA2 + vaesenclast GHASHDATA3, %zmm3, GHASHDATA3 // Store the en/decrypted data to DST. - vmovdqu8 GHASHDATA0, 0*VL(DST) - vmovdqu8 GHASHDATA1, 1*VL(DST) - vmovdqu8 GHASHDATA2, 2*VL(DST) - vmovdqu8 GHASHDATA3, 3*VL(DST) + vmovdqu8 GHASHDATA0, 0*64(DST) + vmovdqu8 GHASHDATA1, 1*64(DST) + vmovdqu8 GHASHDATA2, 2*64(DST) + vmovdqu8 GHASHDATA3, 3*64(DST) .endm -// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key, -// const u32 le_ctr[4], u8 ghash_acc[16], -// const u8 *src, u8 *dst, int datalen); +// void aes_gcm_{enc,dec}_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, +// const u32 le_ctr[4], u8 ghash_acc[16], +// const u8 *src, u8 *dst, int datalen); // // This macro generates a GCM encryption or decryption update function with the -// above prototype (with \enc selecting which one). This macro supports both -// VL=32 and VL=64. _set_veclen must have been invoked with the desired length. -// -// This function computes the next portion of the CTR keystream, XOR's it with -// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted -// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the -// next |datalen| ciphertext bytes. +// above prototype (with \enc selecting which one). The function computes the +// next portion of the CTR keystream, XOR's it with |datalen| bytes from |src|, +// and writes the resulting encrypted or decrypted data to |dst|. It also +// updates the GHASH accumulator |ghash_acc| using the next |datalen| ciphertext +// bytes. // // |datalen| must be a multiple of 16, except on the last call where it can be // any length. The caller must do any buffering needed to ensure this. Both // in-place and out-of-place en/decryption are supported. // -// |le_ctr| must give the current counter in little-endian format. For a new -// message, the low word of the counter must be 2. This function loads the -// counter from |le_ctr| and increments the loaded counter as needed, but it -// does *not* store the updated counter back to |le_ctr|. The caller must -// update |le_ctr| if any more data segments follow. Internally, only the low -// 32-bit word of the counter is incremented, following the GCM standard. +// |le_ctr| must give the current counter in little-endian format. This +// function loads the counter from |le_ctr| and increments the loaded counter as +// needed, but it does *not* store the updated counter back to |le_ctr|. The +// caller must update |le_ctr| if any more data segments follow. Internally, +// only the low 32-bit word of the counter is incremented, following the GCM +// standard. 
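For callers that split a message into several segments, the contract above means the counter has to be advanced by hand between calls: one step per 16-byte block already processed, with only the low 32-bit word changing. A hedged sketch of that bookkeeping follows; the function name is illustrative and is not the glue code's real API:

#include <stdint.h>

static void example_advance_le_ctr(uint32_t le_ctr[4], uint64_t seg_datalen)
{
	/*
	 * Valid for non-final segments, whose length must be a multiple of 16.
	 * Only the low word moves; uint32_t arithmetic gives the inc32() wrap.
	 */
	le_ctr[0] += (uint32_t)(seg_datalen / 16);
}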
.macro _aes_gcm_update enc // Function arguments @@ -634,69 +712,69 @@ // Pointer to the last AES round key for the chosen AES variant .set RNDKEYLAST_PTR, %r11 - // In the main loop, V0-V3 are used as AES input and output. Elsewhere - // they are used as temporary registers. + // In the main loop, %zmm[0-3] are used as AES input and output. + // Elsewhere they are used as temporary registers. // GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data. - .set GHASHDATA0, V4 + .set GHASHDATA0, %zmm4 .set GHASHDATA0_XMM, %xmm4 - .set GHASHDATA1, V5 + .set GHASHDATA1, %zmm5 .set GHASHDATA1_XMM, %xmm5 - .set GHASHDATA2, V6 + .set GHASHDATA2, %zmm6 .set GHASHDATA2_XMM, %xmm6 - .set GHASHDATA3, V7 + .set GHASHDATA3, %zmm7 // BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values // using vpshufb, copied to all 128-bit lanes. - .set BSWAP_MASK, V8 + .set BSWAP_MASK, %zmm8 // RNDKEY temporarily holds the next AES round key. - .set RNDKEY, V9 + .set RNDKEY, %zmm9 // GHASH_ACC is the accumulator variable for GHASH. When fully reduced, // only the lowest 128-bit lane can be nonzero. When not fully reduced, // more than one lane may be used, and they need to be XOR'd together. - .set GHASH_ACC, V10 + .set GHASH_ACC, %zmm10 .set GHASH_ACC_XMM, %xmm10 // LE_CTR_INC is the vector of 32-bit words that need to be added to a // vector of little-endian counter blocks to advance it forwards. - .set LE_CTR_INC, V11 + .set LE_CTR_INC, %zmm11 // LE_CTR contains the next set of little-endian counter blocks. - .set LE_CTR, V12 + .set LE_CTR, %zmm12 // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys, // copied to all 128-bit lanes. RNDKEY0 is the zero-th round key, // RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last. - .set RNDKEY0, V13 - .set RNDKEYLAST, V14 - .set RNDKEY_M9, V15 - .set RNDKEY_M8, V16 - .set RNDKEY_M7, V17 - .set RNDKEY_M6, V18 - .set RNDKEY_M5, V19 - .set RNDKEY_M4, V20 - .set RNDKEY_M3, V21 - .set RNDKEY_M2, V22 - .set RNDKEY_M1, V23 + .set RNDKEY0, %zmm13 + .set RNDKEYLAST, %zmm14 + .set RNDKEY_M9, %zmm15 + .set RNDKEY_M8, %zmm16 + .set RNDKEY_M7, %zmm17 + .set RNDKEY_M6, %zmm18 + .set RNDKEY_M5, %zmm19 + .set RNDKEY_M4, %zmm20 + .set RNDKEY_M3, %zmm21 + .set RNDKEY_M2, %zmm22 + .set RNDKEY_M1, %zmm23 // GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These // cannot coincide with anything used for AES encryption, since for // performance reasons GHASH and AES encryption are interleaved. - .set GHASHTMP0, V24 - .set GHASHTMP1, V25 - .set GHASHTMP2, V26 + .set GHASHTMP0, %zmm24 + .set GHASHTMP1, %zmm25 + .set GHASHTMP2, %zmm26 - // H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1. The + // H_POW[4-1] contain the powers of the hash key H^16...H^1. The // descending numbering reflects the order of the key powers. - .set H_POW4, V27 - .set H_POW3, V28 - .set H_POW2, V29 - .set H_POW1, V30 + .set H_POW4, %zmm27 + .set H_POW3, %zmm28 + .set H_POW2, %zmm29 + .set H_POW1, %zmm30 // GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes. - .set GFPOLY, V31 + .set GFPOLY, %zmm31 // Load some constants. vbroadcasti32x4 .Lbswap_mask(%rip), BSWAP_MASK @@ -719,29 +797,23 @@ // Finish initializing LE_CTR by adding [0, 1, ...] to its low words. vpaddd .Lctr_pattern(%rip), LE_CTR, LE_CTR - // Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes. -.if VL == 32 - vbroadcasti32x4 .Linc_2blocks(%rip), LE_CTR_INC -.elseif VL == 64 + // Load 4 into all 128-bit lanes of LE_CTR_INC. 
vbroadcasti32x4 .Linc_4blocks(%rip), LE_CTR_INC -.else - .error "Unsupported vector length" -.endif - // If there are at least 4*VL bytes of data, then continue into the loop - // that processes 4*VL bytes of data at a time. Otherwise skip it. + // If there are at least 256 bytes of data, then continue into the loop + // that processes 256 bytes of data at a time. Otherwise skip it. // - // Pre-subtracting 4*VL from DATALEN saves an instruction from the main + // Pre-subtracting 256 from DATALEN saves an instruction from the main // loop and also ensures that at least one write always occurs to // DATALEN, zero-extending it and allowing DATALEN64 to be used later. - add $-4*VL, DATALEN // shorter than 'sub 4*VL' when VL=32 + sub $256, DATALEN jl .Lcrypt_loop_4x_done\@ // Load powers of the hash key. - vmovdqu8 OFFSETOFEND_H_POWERS-4*VL(KEY), H_POW4 - vmovdqu8 OFFSETOFEND_H_POWERS-3*VL(KEY), H_POW3 - vmovdqu8 OFFSETOFEND_H_POWERS-2*VL(KEY), H_POW2 - vmovdqu8 OFFSETOFEND_H_POWERS-1*VL(KEY), H_POW1 + vmovdqu8 OFFSETOFEND_H_POWERS-4*64(KEY), H_POW4 + vmovdqu8 OFFSETOFEND_H_POWERS-3*64(KEY), H_POW3 + vmovdqu8 OFFSETOFEND_H_POWERS-2*64(KEY), H_POW2 + vmovdqu8 OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1 // Main loop: en/decrypt and hash 4 vectors at a time. // @@ -770,9 +842,9 @@ cmp %rax, RNDKEYLAST_PTR jne 1b _aesenclast_and_xor_4x - sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 - sub $-4*VL, DST - add $-4*VL, DATALEN + add $256, SRC + add $256, DST + sub $256, DATALEN jl .Lghash_last_ciphertext_4x\@ .endif @@ -786,10 +858,10 @@ // If decrypting, load more ciphertext blocks into GHASHDATA[0-3]. If // encrypting, GHASHDATA[0-3] already contain the previous ciphertext. .if !\enc - vmovdqu8 0*VL(SRC), GHASHDATA0 - vmovdqu8 1*VL(SRC), GHASHDATA1 - vmovdqu8 2*VL(SRC), GHASHDATA2 - vmovdqu8 3*VL(SRC), GHASHDATA3 + vmovdqu8 0*64(SRC), GHASHDATA0 + vmovdqu8 1*64(SRC), GHASHDATA1 + vmovdqu8 2*64(SRC), GHASHDATA2 + vmovdqu8 3*64(SRC), GHASHDATA3 .endif // Start the AES encryption of the counter blocks. @@ -809,44 +881,44 @@ _vaesenc_4x RNDKEY 128: - // Finish the AES encryption of the counter blocks in V0-V3, interleaved - // with the GHASH update of the ciphertext blocks in GHASHDATA[0-3]. + // Finish the AES encryption of the counter blocks in %zmm[0-3], + // interleaved with the GHASH update of the ciphertext blocks in + // GHASHDATA[0-3]. .irp i, 9,8,7,6,5,4,3,2,1 _ghash_step_4x (9 - \i) _vaesenc_4x RNDKEY_M\i .endr _ghash_step_4x 9 _aesenclast_and_xor_4x - sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 - sub $-4*VL, DST - add $-4*VL, DATALEN + add $256, SRC + add $256, DST + sub $256, DATALEN jge .Lcrypt_loop_4x\@ .if \enc .Lghash_last_ciphertext_4x\@: // Update GHASH with the last set of ciphertext blocks. -.irp i, 0,1,2,3,4,5,6,7,8,9 - _ghash_step_4x \i -.endr + _ghash_4x .endif .Lcrypt_loop_4x_done\@: - // Undo the extra subtraction by 4*VL and check whether data remains. - sub $-4*VL, DATALEN // shorter than 'add 4*VL' when VL=32 + // Undo the extra subtraction by 256 and check whether data remains. + add $256, DATALEN jz .Ldone\@ - // The data length isn't a multiple of 4*VL. Process the remaining data - // of length 1 <= DATALEN < 4*VL, up to one vector (VL bytes) at a time. - // Going one vector at a time may seem inefficient compared to having - // separate code paths for each possible number of vectors remaining. 
- // However, using a loop keeps the code size down, and it performs - // surprising well; modern CPUs will start executing the next iteration - // before the previous one finishes and also predict the number of loop - // iterations. For a similar reason, we roll up the AES rounds. + // The data length isn't a multiple of 256 bytes. Process the remaining + // data of length 1 <= DATALEN < 256, up to one 64-byte vector at a + // time. Going one vector at a time may seem inefficient compared to + // having separate code paths for each possible number of vectors + // remaining. However, using a loop keeps the code size down, and it + // performs surprising well; modern CPUs will start executing the next + // iteration before the previous one finishes and also predict the + // number of loop iterations. For a similar reason, we roll up the AES + // rounds. // - // On the last iteration, the remaining length may be less than VL. - // Handle this using masking. + // On the last iteration, the remaining length may be less than 64 + // bytes. Handle this using masking. // // Since there are enough key powers available for all remaining data, // there is no need to do a GHASH reduction after each iteration. @@ -875,65 +947,60 @@ .Lcrypt_loop_1x\@: // Select the appropriate mask for this iteration: all 1's if - // DATALEN >= VL, otherwise DATALEN 1's. Do this branchlessly using the + // DATALEN >= 64, otherwise DATALEN 1's. Do this branchlessly using the // bzhi instruction from BMI2. (This relies on DATALEN <= 255.) -.if VL < 64 - mov $-1, %eax - bzhi DATALEN, %eax, %eax - kmovd %eax, %k1 -.else mov $-1, %rax bzhi DATALEN64, %rax, %rax kmovq %rax, %k1 -.endif // Encrypt a vector of counter blocks. This does not need to be masked. - vpshufb BSWAP_MASK, LE_CTR, V0 + vpshufb BSWAP_MASK, LE_CTR, %zmm0 vpaddd LE_CTR_INC, LE_CTR, LE_CTR - vpxord RNDKEY0, V0, V0 + vpxord RNDKEY0, %zmm0, %zmm0 lea 16(KEY), %rax 1: vbroadcasti32x4 (%rax), RNDKEY - vaesenc RNDKEY, V0, V0 + vaesenc RNDKEY, %zmm0, %zmm0 add $16, %rax cmp %rax, RNDKEYLAST_PTR jne 1b - vaesenclast RNDKEYLAST, V0, V0 + vaesenclast RNDKEYLAST, %zmm0, %zmm0 // XOR the data with the appropriate number of keystream bytes. - vmovdqu8 (SRC), V1{%k1}{z} - vpxord V1, V0, V0 - vmovdqu8 V0, (DST){%k1} + vmovdqu8 (SRC), %zmm1{%k1}{z} + vpxord %zmm1, %zmm0, %zmm0 + vmovdqu8 %zmm0, (DST){%k1} // Update GHASH with the ciphertext block(s), without reducing. // - // In the case of DATALEN < VL, the ciphertext is zero-padded to VL. - // (If decrypting, it's done by the above masked load. If encrypting, - // it's done by the below masked register-to-register move.) Note that - // if DATALEN <= VL - 16, there will be additional padding beyond the - // padding of the last block specified by GHASH itself; i.e., there may - // be whole block(s) that get processed by the GHASH multiplication and - // reduction instructions but should not actually be included in the + // In the case of DATALEN < 64, the ciphertext is zero-padded to 64 + // bytes. (If decrypting, it's done by the above masked load. If + // encrypting, it's done by the below masked register-to-register move.) + // Note that if DATALEN <= 48, there will be additional padding beyond + // the padding of the last block specified by GHASH itself; i.e., there + // may be whole block(s) that get processed by the GHASH multiplication + // and reduction instructions but should not actually be included in the // GHASH. 
However, any such blocks are all-zeroes, and the values that // they're multiplied with are also all-zeroes. Therefore they just add // 0 * 0 = 0 to the final GHASH result, which makes no difference. vmovdqu8 (POWERS_PTR), H_POW1 .if \enc - vmovdqu8 V0, V1{%k1}{z} + vmovdqu8 %zmm0, %zmm1{%k1}{z} .endif - vpshufb BSWAP_MASK, V1, V0 - vpxord GHASH_ACC, V0, V0 - _ghash_mul_noreduce H_POW1, V0, LO, MI, HI, GHASHDATA3, V1, V2, V3 + vpshufb BSWAP_MASK, %zmm1, %zmm0 + vpxord GHASH_ACC, %zmm0, %zmm0 + _ghash_mul_noreduce H_POW1, %zmm0, LO, MI, HI, \ + GHASHDATA3, %zmm1, %zmm2, %zmm3 vpxor GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM - add $VL, POWERS_PTR - add $VL, SRC - add $VL, DST - sub $VL, DATALEN + add $64, POWERS_PTR + add $64, SRC + add $64, DST + sub $64, DATALEN jg .Lcrypt_loop_1x\@ // Finally, do the GHASH reduction. - _ghash_reduce LO, MI, HI, GFPOLY, V0 + _ghash_reduce LO, MI, HI, GFPOLY, %zmm0 _horizontal_xor HI, HI_XMM, GHASH_ACC_XMM, %xmm0, %xmm1, %xmm2 .Ldone\@: @@ -944,14 +1011,14 @@ RET .endm -// void aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, -// const u32 le_ctr[4], u8 ghash_acc[16], -// u64 total_aadlen, u64 total_datalen); -// bool aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, -// const u32 le_ctr[4], -// const u8 ghash_acc[16], -// u64 total_aadlen, u64 total_datalen, -// const u8 tag[16], int taglen); +// void aes_gcm_enc_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, +// const u32 le_ctr[4], u8 ghash_acc[16], +// u64 total_aadlen, u64 total_datalen); +// bool aes_gcm_dec_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, +// const u32 le_ctr[4], +// const u8 ghash_acc[16], +// u64 total_aadlen, u64 total_datalen, +// const u8 tag[16], int taglen); // // This macro generates one of the above two functions (with \enc selecting // which one). Both functions finish computing the GCM authentication tag by @@ -1081,119 +1148,16 @@ RET .endm -_set_veclen 32 -SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_256) - _aes_gcm_precompute -SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_256) -SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_256) - _aes_gcm_update 1 -SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_256) -SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_256) - _aes_gcm_update 0 -SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_256) - -_set_veclen 64 -SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_512) - _aes_gcm_precompute -SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_512) -SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_512) +SYM_FUNC_START(aes_gcm_enc_update_vaes_avx512) _aes_gcm_update 1 -SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_512) -SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_512) +SYM_FUNC_END(aes_gcm_enc_update_vaes_avx512) +SYM_FUNC_START(aes_gcm_dec_update_vaes_avx512) _aes_gcm_update 0 -SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_512) - -// void aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key, -// u8 ghash_acc[16], -// const u8 *aad, int aadlen); -// -// This function processes the AAD (Additional Authenticated Data) in GCM. -// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the -// data given by |aad| and |aadlen|. |key->ghash_key_powers| must have been -// initialized. On the first call, |ghash_acc| must be all zeroes. |aadlen| -// must be a multiple of 16, except on the last call where it can be any length. -// The caller must do any buffering needed to ensure this. -// -// AES-GCM is almost always used with small amounts of AAD, less than 32 bytes. 
-// Therefore, for AAD processing we currently only provide this implementation -// which uses 256-bit vectors (ymm registers) and only has a 1x-wide loop. This -// keeps the code size down, and it enables some micro-optimizations, e.g. using -// VEX-coded instructions instead of EVEX-coded to save some instruction bytes. -// To optimize for large amounts of AAD, we could implement a 4x-wide loop and -// provide a version using 512-bit vectors, but that doesn't seem to be useful. -SYM_FUNC_START(aes_gcm_aad_update_vaes_avx10) - - // Function arguments - .set KEY, %rdi - .set GHASH_ACC_PTR, %rsi - .set AAD, %rdx - .set AADLEN, %ecx - .set AADLEN64, %rcx // Zero-extend AADLEN before using! - - // Additional local variables. - // %rax, %ymm0-%ymm3, and %k1 are used as temporary registers. - .set BSWAP_MASK, %ymm4 - .set GFPOLY, %ymm5 - .set GHASH_ACC, %ymm6 - .set GHASH_ACC_XMM, %xmm6 - .set H_POW1, %ymm7 - - // Load some constants. - vbroadcasti128 .Lbswap_mask(%rip), BSWAP_MASK - vbroadcasti128 .Lgfpoly(%rip), GFPOLY - - // Load the GHASH accumulator. - vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM - - // Update GHASH with 32 bytes of AAD at a time. - // - // Pre-subtracting 32 from AADLEN saves an instruction from the loop and - // also ensures that at least one write always occurs to AADLEN, - // zero-extending it and allowing AADLEN64 to be used later. - sub $32, AADLEN - jl .Laad_loop_1x_done - vmovdqu8 OFFSETOFEND_H_POWERS-32(KEY), H_POW1 // [H^2, H^1] -.Laad_loop_1x: - vmovdqu (AAD), %ymm0 - vpshufb BSWAP_MASK, %ymm0, %ymm0 - vpxor %ymm0, GHASH_ACC, GHASH_ACC - _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ - %ymm0, %ymm1, %ymm2 - vextracti128 $1, GHASH_ACC, %xmm0 - vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM - add $32, AAD - sub $32, AADLEN - jge .Laad_loop_1x -.Laad_loop_1x_done: - add $32, AADLEN - jz .Laad_done - - // Update GHASH with the remaining 1 <= AADLEN < 32 bytes of AAD. - mov $-1, %eax - bzhi AADLEN, %eax, %eax - kmovd %eax, %k1 - vmovdqu8 (AAD), %ymm0{%k1}{z} - neg AADLEN64 - and $~15, AADLEN64 // -round_up(AADLEN, 16) - vmovdqu8 OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1 - vpshufb BSWAP_MASK, %ymm0, %ymm0 - vpxor %ymm0, GHASH_ACC, GHASH_ACC - _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ - %ymm0, %ymm1, %ymm2 - vextracti128 $1, GHASH_ACC, %xmm0 - vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM - -.Laad_done: - // Store the updated GHASH accumulator back to memory. - vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) - - vzeroupper // This is needed after using ymm or zmm registers. 
- RET -SYM_FUNC_END(aes_gcm_aad_update_vaes_avx10) +SYM_FUNC_END(aes_gcm_dec_update_vaes_avx512) -SYM_FUNC_START(aes_gcm_enc_final_vaes_avx10) +SYM_FUNC_START(aes_gcm_enc_final_vaes_avx512) _aes_gcm_final 1 -SYM_FUNC_END(aes_gcm_enc_final_vaes_avx10) -SYM_FUNC_START(aes_gcm_dec_final_vaes_avx10) +SYM_FUNC_END(aes_gcm_enc_final_vaes_avx512) +SYM_FUNC_START(aes_gcm_dec_final_vaes_avx512) _aes_gcm_final 0 -SYM_FUNC_END(aes_gcm_dec_final_vaes_avx10) +SYM_FUNC_END(aes_gcm_dec_final_vaes_avx512) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index d953ac470aae..bb6e2c47ffc6 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -874,8 +874,38 @@ struct aes_gcm_key_aesni { #define AES_GCM_KEY_AESNI_SIZE \ (sizeof(struct aes_gcm_key_aesni) + (15 & ~(CRYPTO_MINALIGN - 1))) -/* Key struct used by the VAES + AVX10 implementations of AES-GCM */ -struct aes_gcm_key_avx10 { +/* Key struct used by the VAES + AVX2 implementation of AES-GCM */ +struct aes_gcm_key_vaes_avx2 { + /* + * Common part of the key. The assembly code prefers 16-byte alignment + * for the round keys; we get this by them being located at the start of + * the struct and the whole struct being 32-byte aligned. + */ + struct aes_gcm_key base; + + /* + * Powers of the hash key H^8 through H^1. These are 128-bit values. + * They all have an extra factor of x^-1 and are byte-reversed. + * The assembly code prefers 32-byte alignment for this. + */ + u64 h_powers[8][2] __aligned(32); + + /* + * Each entry in this array contains the two halves of an entry of + * h_powers XOR'd together, in the following order: + * H^8,H^6,H^7,H^5,H^4,H^2,H^3,H^1 i.e. indices 0,2,1,3,4,6,5,7. + * This is used for Karatsuba multiplication. + */ + u64 h_powers_xored[8]; +}; + +#define AES_GCM_KEY_VAES_AVX2(key) \ + container_of((key), struct aes_gcm_key_vaes_avx2, base) +#define AES_GCM_KEY_VAES_AVX2_SIZE \ + (sizeof(struct aes_gcm_key_vaes_avx2) + (31 & ~(CRYPTO_MINALIGN - 1))) + +/* Key struct used by the VAES + AVX512 implementation of AES-GCM */ +struct aes_gcm_key_vaes_avx512 { /* * Common part of the key. 
The assembly code prefers 16-byte alignment * for the round keys; we get this by them being located at the start of @@ -895,10 +925,10 @@ struct aes_gcm_key_avx10 { /* Three padding blocks required by the assembly code */ u64 padding[3][2]; }; -#define AES_GCM_KEY_AVX10(key) \ - container_of((key), struct aes_gcm_key_avx10, base) -#define AES_GCM_KEY_AVX10_SIZE \ - (sizeof(struct aes_gcm_key_avx10) + (63 & ~(CRYPTO_MINALIGN - 1))) +#define AES_GCM_KEY_VAES_AVX512(key) \ + container_of((key), struct aes_gcm_key_vaes_avx512, base) +#define AES_GCM_KEY_VAES_AVX512_SIZE \ + (sizeof(struct aes_gcm_key_vaes_avx512) + (63 & ~(CRYPTO_MINALIGN - 1))) /* * These flags are passed to the AES-GCM helper functions to specify the @@ -910,14 +940,16 @@ struct aes_gcm_key_avx10 { #define FLAG_RFC4106 BIT(0) #define FLAG_ENC BIT(1) #define FLAG_AVX BIT(2) -#define FLAG_AVX10_256 BIT(3) -#define FLAG_AVX10_512 BIT(4) +#define FLAG_VAES_AVX2 BIT(3) +#define FLAG_VAES_AVX512 BIT(4) static inline struct aes_gcm_key * aes_gcm_key_get(struct crypto_aead *tfm, int flags) { - if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) + if (flags & FLAG_VAES_AVX512) return PTR_ALIGN(crypto_aead_ctx(tfm), 64); + else if (flags & FLAG_VAES_AVX2) + return PTR_ALIGN(crypto_aead_ctx(tfm), 32); else return PTR_ALIGN(crypto_aead_ctx(tfm), 16); } @@ -927,26 +959,16 @@ aes_gcm_precompute_aesni(struct aes_gcm_key_aesni *key); asmlinkage void aes_gcm_precompute_aesni_avx(struct aes_gcm_key_aesni *key); asmlinkage void -aes_gcm_precompute_vaes_avx10_256(struct aes_gcm_key_avx10 *key); +aes_gcm_precompute_vaes_avx2(struct aes_gcm_key_vaes_avx2 *key); asmlinkage void -aes_gcm_precompute_vaes_avx10_512(struct aes_gcm_key_avx10 *key); +aes_gcm_precompute_vaes_avx512(struct aes_gcm_key_vaes_avx512 *key); static void aes_gcm_precompute(struct aes_gcm_key *key, int flags) { - /* - * To make things a bit easier on the assembly side, the AVX10 - * implementations use the same key format. Therefore, a single - * function using 256-bit vectors would suffice here. However, it's - * straightforward to provide a 512-bit one because of how the assembly - * code is structured, and it works nicely because the total size of the - * key powers is a multiple of 512 bits. So we take advantage of that. - * - * A similar situation applies to the AES-NI implementations. 
- */ - if (flags & FLAG_AVX10_512) - aes_gcm_precompute_vaes_avx10_512(AES_GCM_KEY_AVX10(key)); - else if (flags & FLAG_AVX10_256) - aes_gcm_precompute_vaes_avx10_256(AES_GCM_KEY_AVX10(key)); + if (flags & FLAG_VAES_AVX512) + aes_gcm_precompute_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key)); + else if (flags & FLAG_VAES_AVX2) + aes_gcm_precompute_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key)); else if (flags & FLAG_AVX) aes_gcm_precompute_aesni_avx(AES_GCM_KEY_AESNI(key)); else @@ -960,15 +982,21 @@ asmlinkage void aes_gcm_aad_update_aesni_avx(const struct aes_gcm_key_aesni *key, u8 ghash_acc[16], const u8 *aad, int aadlen); asmlinkage void -aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key, - u8 ghash_acc[16], const u8 *aad, int aadlen); +aes_gcm_aad_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, + u8 ghash_acc[16], const u8 *aad, int aadlen); +asmlinkage void +aes_gcm_aad_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, + u8 ghash_acc[16], const u8 *aad, int aadlen); static void aes_gcm_aad_update(const struct aes_gcm_key *key, u8 ghash_acc[16], const u8 *aad, int aadlen, int flags) { - if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) - aes_gcm_aad_update_vaes_avx10(AES_GCM_KEY_AVX10(key), ghash_acc, - aad, aadlen); + if (flags & FLAG_VAES_AVX512) + aes_gcm_aad_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), + ghash_acc, aad, aadlen); + else if (flags & FLAG_VAES_AVX2) + aes_gcm_aad_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), + ghash_acc, aad, aadlen); else if (flags & FLAG_AVX) aes_gcm_aad_update_aesni_avx(AES_GCM_KEY_AESNI(key), ghash_acc, aad, aadlen); @@ -986,13 +1014,13 @@ aes_gcm_enc_update_aesni_avx(const struct aes_gcm_key_aesni *key, const u32 le_ctr[4], u8 ghash_acc[16], const u8 *src, u8 *dst, int datalen); asmlinkage void -aes_gcm_enc_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key, - const u32 le_ctr[4], u8 ghash_acc[16], - const u8 *src, u8 *dst, int datalen); +aes_gcm_enc_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); asmlinkage void -aes_gcm_enc_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key, - const u32 le_ctr[4], u8 ghash_acc[16], - const u8 *src, u8 *dst, int datalen); +aes_gcm_enc_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); asmlinkage void aes_gcm_dec_update_aesni(const struct aes_gcm_key_aesni *key, @@ -1003,13 +1031,13 @@ aes_gcm_dec_update_aesni_avx(const struct aes_gcm_key_aesni *key, const u32 le_ctr[4], u8 ghash_acc[16], const u8 *src, u8 *dst, int datalen); asmlinkage void -aes_gcm_dec_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key, - const u32 le_ctr[4], u8 ghash_acc[16], - const u8 *src, u8 *dst, int datalen); +aes_gcm_dec_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); asmlinkage void -aes_gcm_dec_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key, - const u32 le_ctr[4], u8 ghash_acc[16], - const u8 *src, u8 *dst, int datalen); +aes_gcm_dec_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); /* __always_inline to optimize out the branches based on @flags */ static __always_inline void @@ -1018,14 +1046,14 @@ aes_gcm_update(const struct aes_gcm_key *key, const u8 *src, u8 *dst, int datalen, int flags) { if (flags & FLAG_ENC) { - 
if (flags & FLAG_AVX10_512) - aes_gcm_enc_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key), - le_ctr, ghash_acc, - src, dst, datalen); - else if (flags & FLAG_AVX10_256) - aes_gcm_enc_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key), - le_ctr, ghash_acc, - src, dst, datalen); + if (flags & FLAG_VAES_AVX512) + aes_gcm_enc_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), + le_ctr, ghash_acc, + src, dst, datalen); + else if (flags & FLAG_VAES_AVX2) + aes_gcm_enc_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), + le_ctr, ghash_acc, + src, dst, datalen); else if (flags & FLAG_AVX) aes_gcm_enc_update_aesni_avx(AES_GCM_KEY_AESNI(key), le_ctr, ghash_acc, @@ -1034,14 +1062,14 @@ aes_gcm_update(const struct aes_gcm_key *key, aes_gcm_enc_update_aesni(AES_GCM_KEY_AESNI(key), le_ctr, ghash_acc, src, dst, datalen); } else { - if (flags & FLAG_AVX10_512) - aes_gcm_dec_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key), - le_ctr, ghash_acc, - src, dst, datalen); - else if (flags & FLAG_AVX10_256) - aes_gcm_dec_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key), - le_ctr, ghash_acc, - src, dst, datalen); + if (flags & FLAG_VAES_AVX512) + aes_gcm_dec_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), + le_ctr, ghash_acc, + src, dst, datalen); + else if (flags & FLAG_VAES_AVX2) + aes_gcm_dec_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), + le_ctr, ghash_acc, + src, dst, datalen); else if (flags & FLAG_AVX) aes_gcm_dec_update_aesni_avx(AES_GCM_KEY_AESNI(key), le_ctr, ghash_acc, @@ -1062,9 +1090,13 @@ aes_gcm_enc_final_aesni_avx(const struct aes_gcm_key_aesni *key, const u32 le_ctr[4], u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen); asmlinkage void -aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, - const u32 le_ctr[4], u8 ghash_acc[16], - u64 total_aadlen, u64 total_datalen); +aes_gcm_enc_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen); +asmlinkage void +aes_gcm_enc_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen); /* __always_inline to optimize out the branches based on @flags */ static __always_inline void @@ -1072,10 +1104,14 @@ aes_gcm_enc_final(const struct aes_gcm_key *key, const u32 le_ctr[4], u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen, int flags) { - if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) - aes_gcm_enc_final_vaes_avx10(AES_GCM_KEY_AVX10(key), - le_ctr, ghash_acc, - total_aadlen, total_datalen); + if (flags & FLAG_VAES_AVX512) + aes_gcm_enc_final_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen); + else if (flags & FLAG_VAES_AVX2) + aes_gcm_enc_final_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen); else if (flags & FLAG_AVX) aes_gcm_enc_final_aesni_avx(AES_GCM_KEY_AESNI(key), le_ctr, ghash_acc, @@ -1097,10 +1133,15 @@ aes_gcm_dec_final_aesni_avx(const struct aes_gcm_key_aesni *key, u64 total_aadlen, u64 total_datalen, const u8 tag[16], int taglen); asmlinkage bool __must_check -aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, - const u32 le_ctr[4], const u8 ghash_acc[16], - u64 total_aadlen, u64 total_datalen, - const u8 tag[16], int taglen); +aes_gcm_dec_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, + const u32 le_ctr[4], const u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen, + const u8 tag[16], int taglen); +asmlinkage bool __must_check +aes_gcm_dec_final_vaes_avx512(const struct 
aes_gcm_key_vaes_avx512 *key, + const u32 le_ctr[4], const u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen, + const u8 tag[16], int taglen); /* __always_inline to optimize out the branches based on @flags */ static __always_inline bool __must_check @@ -1108,11 +1149,16 @@ aes_gcm_dec_final(const struct aes_gcm_key *key, const u32 le_ctr[4], u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen, u8 tag[16], int taglen, int flags) { - if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) - return aes_gcm_dec_final_vaes_avx10(AES_GCM_KEY_AVX10(key), - le_ctr, ghash_acc, - total_aadlen, total_datalen, - tag, taglen); + if (flags & FLAG_VAES_AVX512) + return aes_gcm_dec_final_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen, + tag, taglen); + else if (flags & FLAG_VAES_AVX2) + return aes_gcm_dec_final_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen, + tag, taglen); else if (flags & FLAG_AVX) return aes_gcm_dec_final_aesni_avx(AES_GCM_KEY_AESNI(key), le_ctr, ghash_acc, @@ -1195,10 +1241,14 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key, BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers) != 496); BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers_xored) != 624); BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_times_x64) != 688); - BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_enc) != 0); - BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_length) != 480); - BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, h_powers) != 512); - BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, padding) != 768); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, base.aes_key.key_enc) != 0); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, base.aes_key.key_length) != 480); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, h_powers) != 512); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, h_powers_xored) != 640); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, base.aes_key.key_enc) != 0); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, base.aes_key.key_length) != 480); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, h_powers) != 512); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, padding) != 768); if (likely(crypto_simd_usable())) { err = aes_check_keylen(keylen); @@ -1231,8 +1281,9 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key, gf128mul_lle(&h, (const be128 *)x_to_the_minus1); /* Compute the needed key powers */ - if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) { - struct aes_gcm_key_avx10 *k = AES_GCM_KEY_AVX10(key); + if (flags & FLAG_VAES_AVX512) { + struct aes_gcm_key_vaes_avx512 *k = + AES_GCM_KEY_VAES_AVX512(key); for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { k->h_powers[i][0] = be64_to_cpu(h.b); @@ -1240,6 +1291,22 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key, gf128mul_lle(&h, &h1); } memset(k->padding, 0, sizeof(k->padding)); + } else if (flags & FLAG_VAES_AVX2) { + struct aes_gcm_key_vaes_avx2 *k = + AES_GCM_KEY_VAES_AVX2(key); + static const u8 indices[8] = { 0, 2, 1, 3, 4, 6, 5, 7 }; + + for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { + k->h_powers[i][0] = be64_to_cpu(h.b); + k->h_powers[i][1] = be64_to_cpu(h.a); + gf128mul_lle(&h, &h1); + } + for (i = 0; i < ARRAY_SIZE(k->h_powers_xored); i++) { + int j = indices[i]; + + k->h_powers_xored[i] = k->h_powers[j][0] ^ + k->h_powers[j][1]; + } } else { struct aes_gcm_key_aesni *k = AES_GCM_KEY_AESNI(key); 
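The h_powers_xored[] values filled in above exist so the AVX2 assembly can use Karatsuba multiplication with one operand's half-sum precomputed: for each key power it stores h_L XOR h_H, which is exactly the factor the middle Karatsuba product needs. A standalone sketch of that multiply shape is below; it is illustrative only, produces the raw 256-bit carryless product, and leaves out the GHASH-specific reduction and bit-reflection (build with something like gcc -mpclmul):

#include <stdint.h>
#include <immintrin.h>

struct clmul256 {
	__m128i lo;	/* bits 0..127 of the product */
	__m128i hi;	/* bits 128..255 of the product */
};

/*
 * 128x128 -> 256-bit carryless multiply using Karatsuba: three CLMULs
 * instead of four.  b_xored stands in for an h_powers_xored[] entry, the
 * XOR of b's two 64-bit halves, precomputed once per key power.
 */
static struct clmul256 clmul_karatsuba(__m128i a, __m128i b, uint64_t b_xored)
{
	__m128i lo = _mm_clmulepi64_si128(a, b, 0x00);	/* a_L * b_L */
	__m128i hi = _mm_clmulepi64_si128(a, b, 0x11);	/* a_H * b_H */

	/* (a_L ^ a_H) * (b_L ^ b_H), with b's half-sum already precomputed */
	__m128i a_x = _mm_xor_si128(a, _mm_srli_si128(a, 8));
	__m128i mi  = _mm_clmulepi64_si128(a_x,
					   _mm_set_epi64x(0, (long long)b_xored),
					   0x00);
	struct clmul256 r;

	/* Recover the true middle term: a_L*b_H ^ a_H*b_L */
	mi = _mm_xor_si128(mi, _mm_xor_si128(lo, hi));

	/* Fold the middle term in at bit offset 64. */
	r.lo = _mm_xor_si128(lo, _mm_slli_si128(mi, 8));
	r.hi = _mm_xor_si128(hi, _mm_srli_si128(mi, 8));
	return r;
}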
@@ -1508,15 +1575,15 @@ DEFINE_GCM_ALGS(aesni_avx, FLAG_AVX, "generic-gcm-aesni-avx", "rfc4106-gcm-aesni-avx", AES_GCM_KEY_AESNI_SIZE, 500); -/* aes_gcm_algs_vaes_avx10_256 */ -DEFINE_GCM_ALGS(vaes_avx10_256, FLAG_AVX10_256, - "generic-gcm-vaes-avx10_256", "rfc4106-gcm-vaes-avx10_256", - AES_GCM_KEY_AVX10_SIZE, 700); +/* aes_gcm_algs_vaes_avx2 */ +DEFINE_GCM_ALGS(vaes_avx2, FLAG_VAES_AVX2, + "generic-gcm-vaes-avx2", "rfc4106-gcm-vaes-avx2", + AES_GCM_KEY_VAES_AVX2_SIZE, 600); -/* aes_gcm_algs_vaes_avx10_512 */ -DEFINE_GCM_ALGS(vaes_avx10_512, FLAG_AVX10_512, - "generic-gcm-vaes-avx10_512", "rfc4106-gcm-vaes-avx10_512", - AES_GCM_KEY_AVX10_SIZE, 800); +/* aes_gcm_algs_vaes_avx512 */ +DEFINE_GCM_ALGS(vaes_avx512, FLAG_VAES_AVX512, + "generic-gcm-vaes-avx512", "rfc4106-gcm-vaes-avx512", + AES_GCM_KEY_VAES_AVX512_SIZE, 800); static int __init register_avx_algs(void) { @@ -1548,6 +1615,10 @@ static int __init register_avx_algs(void) ARRAY_SIZE(skcipher_algs_vaes_avx2)); if (err) return err; + err = crypto_register_aeads(aes_gcm_algs_vaes_avx2, + ARRAY_SIZE(aes_gcm_algs_vaes_avx2)); + if (err) + return err; if (!boot_cpu_has(X86_FEATURE_AVX512BW) || !boot_cpu_has(X86_FEATURE_AVX512VL) || @@ -1556,26 +1627,21 @@ static int __init register_avx_algs(void) XFEATURE_MASK_AVX512, NULL)) return 0; - err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_256, - ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256)); - if (err) - return err; - if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) { int i; for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx512); i++) skcipher_algs_vaes_avx512[i].base.cra_priority = 1; - for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++) - aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1; + for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx512); i++) + aes_gcm_algs_vaes_avx512[i].base.cra_priority = 1; } err = crypto_register_skciphers(skcipher_algs_vaes_avx512, ARRAY_SIZE(skcipher_algs_vaes_avx512)); if (err) return err; - err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_512, - ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512)); + err = crypto_register_aeads(aes_gcm_algs_vaes_avx512, + ARRAY_SIZE(aes_gcm_algs_vaes_avx512)); if (err) return err; @@ -1595,8 +1661,8 @@ static void unregister_avx_algs(void) unregister_aeads(aes_gcm_algs_aesni_avx); unregister_skciphers(skcipher_algs_vaes_avx2); unregister_skciphers(skcipher_algs_vaes_avx512); - unregister_aeads(aes_gcm_algs_vaes_avx10_256); - unregister_aeads(aes_gcm_algs_vaes_avx10_512); + unregister_aeads(aes_gcm_algs_vaes_avx2); + unregister_aeads(aes_gcm_algs_vaes_avx512); } #else /* CONFIG_X86_64 */ static struct aead_alg aes_gcm_algs_aesni[0]; diff --git a/arch/x86/crypto/polyval-clmulni_asm.S b/arch/x86/crypto/polyval-clmulni_asm.S deleted file mode 100644 index a6ebe4e7dd2b..000000000000 --- a/arch/x86/crypto/polyval-clmulni_asm.S +++ /dev/null @@ -1,321 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright 2021 Google LLC - */ -/* - * This is an efficient implementation of POLYVAL using intel PCLMULQDQ-NI - * instructions. It works on 8 blocks at a time, by precomputing the first 8 - * keys powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation - * allows us to split finite field multiplication into two steps. - * - * In the first step, we consider h^i, m_i as normal polynomials of degree less - * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication - * is simply polynomial multiplication. 
- * - * In the second step, we compute the reduction of p(x) modulo the finite field - * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1. - * - * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where - * multiplication is finite field multiplication. The advantage is that the - * two-step process only requires 1 finite field reduction for every 8 - * polynomial multiplications. Further parallelism is gained by interleaving the - * multiplications and polynomial reductions. - */ - -#include <linux/linkage.h> -#include <asm/frame.h> - -#define STRIDE_BLOCKS 8 - -#define GSTAR %xmm7 -#define PL %xmm8 -#define PH %xmm9 -#define TMP_XMM %xmm11 -#define LO %xmm12 -#define HI %xmm13 -#define MI %xmm14 -#define SUM %xmm15 - -#define KEY_POWERS %rdi -#define MSG %rsi -#define BLOCKS_LEFT %rdx -#define ACCUMULATOR %rcx -#define TMP %rax - -.section .rodata.cst16.gstar, "aM", @progbits, 16 -.align 16 - -.Lgstar: - .quad 0xc200000000000000, 0xc200000000000000 - -.text - -/* - * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length - * count pointed to by MSG and KEY_POWERS. - */ -.macro schoolbook1 count - .set i, 0 - .rept (\count) - schoolbook1_iteration i 0 - .set i, (i +1) - .endr -.endm - -/* - * Computes the product of two 128-bit polynomials at the memory locations - * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of - * the 256-bit product into LO, MI, HI. - * - * Given: - * X = [X_1 : X_0] - * Y = [Y_1 : Y_0] - * - * We compute: - * LO += X_0 * Y_0 - * MI += X_0 * Y_1 + X_1 * Y_0 - * HI += X_1 * Y_1 - * - * Later, the 256-bit result can be extracted as: - * [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0] - * This step is done when computing the polynomial reduction for efficiency - * reasons. - * - * If xor_sum == 1, then also XOR the value of SUM into m_0. This avoids an - * extra multiplication of SUM and h^8. - */ -.macro schoolbook1_iteration i xor_sum - movups (16*\i)(MSG), %xmm0 - .if (\i == 0 && \xor_sum == 1) - pxor SUM, %xmm0 - .endif - vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2 - vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1 - vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3 - vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4 - vpxor %xmm2, MI, MI - vpxor %xmm1, LO, LO - vpxor %xmm4, HI, HI - vpxor %xmm3, MI, MI -.endm - -/* - * Performs the same computation as schoolbook1_iteration, except we expect the - * arguments to already be loaded into xmm0 and xmm1 and we set the result - * registers LO, MI, and HI directly rather than XOR'ing into them. - */ -.macro schoolbook1_noload - vpclmulqdq $0x01, %xmm0, %xmm1, MI - vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2 - vpclmulqdq $0x00, %xmm0, %xmm1, LO - vpclmulqdq $0x11, %xmm0, %xmm1, HI - vpxor %xmm2, MI, MI -.endm - -/* - * Computes the 256-bit polynomial represented by LO, HI, MI. Stores - * the result in PL, PH. - * [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0] - */ -.macro schoolbook2 - vpslldq $8, MI, PL - vpsrldq $8, MI, PH - pxor LO, PL - pxor HI, PH -.endm - -/* - * Computes the 128-bit reduction of PH : PL. Stores the result in dest. - * - * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) = - * x^128 + x^127 + x^126 + x^121 + 1. - * - * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the - * product of two 128-bit polynomials in Montgomery form. We need to reduce it - * mod g(x). 
Also, since polynomials in Montgomery form have an "extra" factor - * of x^128, this product has two extra factors of x^128. To get it back into - * Montgomery form, we need to remove one of these factors by dividing by x^128. - * - * To accomplish both of these goals, we add multiples of g(x) that cancel out - * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low - * bits are zero, the polynomial division by x^128 can be done by right shifting. - * - * Since the only nonzero term in the low 64 bits of g(x) is the constant term, - * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can - * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 + - * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to - * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T - * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191. - * - * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits - * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1 - * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) * - * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 : - * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0). - * - * So our final computation is: - * T = T_1 : T_0 = g*(x) * P_0 - * V = V_1 : V_0 = g*(x) * (P_1 + T_0) - * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0 - * - * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0 - * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 : - * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V. - */ -.macro montgomery_reduction dest - vpclmulqdq $0x00, PL, GSTAR, TMP_XMM # TMP_XMM = T_1 : T_0 = P_0 * g*(x) - pshufd $0b01001110, TMP_XMM, TMP_XMM # TMP_XMM = T_0 : T_1 - pxor PL, TMP_XMM # TMP_XMM = P_1 + T_0 : P_0 + T_1 - pxor TMP_XMM, PH # PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1 - pclmulqdq $0x11, GSTAR, TMP_XMM # TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)] - vpxor TMP_XMM, PH, \dest -.endm - -/* - * Compute schoolbook multiplication for 8 blocks - * m_0h^8 + ... + m_7h^1 - * - * If reduce is set, also computes the montgomery reduction of the - * previous full_stride call and XORs with the first message block. - * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1. - * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0. 
- */ -.macro full_stride reduce - pxor LO, LO - pxor HI, HI - pxor MI, MI - - schoolbook1_iteration 7 0 - .if \reduce - vpclmulqdq $0x00, PL, GSTAR, TMP_XMM - .endif - - schoolbook1_iteration 6 0 - .if \reduce - pshufd $0b01001110, TMP_XMM, TMP_XMM - .endif - - schoolbook1_iteration 5 0 - .if \reduce - pxor PL, TMP_XMM - .endif - - schoolbook1_iteration 4 0 - .if \reduce - pxor TMP_XMM, PH - .endif - - schoolbook1_iteration 3 0 - .if \reduce - pclmulqdq $0x11, GSTAR, TMP_XMM - .endif - - schoolbook1_iteration 2 0 - .if \reduce - vpxor TMP_XMM, PH, SUM - .endif - - schoolbook1_iteration 1 0 - - schoolbook1_iteration 0 1 - - addq $(8*16), MSG - schoolbook2 -.endm - -/* - * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS - */ -.macro partial_stride - mov BLOCKS_LEFT, TMP - shlq $4, TMP - addq $(16*STRIDE_BLOCKS), KEY_POWERS - subq TMP, KEY_POWERS - - movups (MSG), %xmm0 - pxor SUM, %xmm0 - movaps (KEY_POWERS), %xmm1 - schoolbook1_noload - dec BLOCKS_LEFT - addq $16, MSG - addq $16, KEY_POWERS - - test $4, BLOCKS_LEFT - jz .Lpartial4BlocksDone - schoolbook1 4 - addq $(4*16), MSG - addq $(4*16), KEY_POWERS -.Lpartial4BlocksDone: - test $2, BLOCKS_LEFT - jz .Lpartial2BlocksDone - schoolbook1 2 - addq $(2*16), MSG - addq $(2*16), KEY_POWERS -.Lpartial2BlocksDone: - test $1, BLOCKS_LEFT - jz .LpartialDone - schoolbook1 1 -.LpartialDone: - schoolbook2 - montgomery_reduction SUM -.endm - -/* - * Perform montgomery multiplication in GF(2^128) and store result in op1. - * - * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1 - * If op1, op2 are in montgomery form, this computes the montgomery - * form of op1*op2. - * - * void clmul_polyval_mul(u8 *op1, const u8 *op2); - */ -SYM_FUNC_START(clmul_polyval_mul) - FRAME_BEGIN - vmovdqa .Lgstar(%rip), GSTAR - movups (%rdi), %xmm0 - movups (%rsi), %xmm1 - schoolbook1_noload - schoolbook2 - montgomery_reduction SUM - movups SUM, (%rdi) - FRAME_END - RET -SYM_FUNC_END(clmul_polyval_mul) - -/* - * Perform polynomial evaluation as specified by POLYVAL. This computes: - * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1} - * where n=nblocks, h is the hash key, and m_i are the message blocks. - * - * rdi - pointer to precomputed key powers h^8 ... h^1 - * rsi - pointer to message blocks - * rdx - number of blocks to hash - * rcx - pointer to the accumulator - * - * void clmul_polyval_update(const struct polyval_tfm_ctx *keys, - * const u8 *in, size_t nblocks, u8 *accumulator); - */ -SYM_FUNC_START(clmul_polyval_update) - FRAME_BEGIN - vmovdqa .Lgstar(%rip), GSTAR - movups (ACCUMULATOR), SUM - subq $STRIDE_BLOCKS, BLOCKS_LEFT - js .LstrideLoopExit - full_stride 0 - subq $STRIDE_BLOCKS, BLOCKS_LEFT - js .LstrideLoopExitReduce -.LstrideLoop: - full_stride 1 - subq $STRIDE_BLOCKS, BLOCKS_LEFT - jns .LstrideLoop -.LstrideLoopExitReduce: - montgomery_reduction SUM -.LstrideLoopExit: - add $STRIDE_BLOCKS, BLOCKS_LEFT - jz .LskipPartial - partial_stride -.LskipPartial: - movups SUM, (ACCUMULATOR) - FRAME_END - RET -SYM_FUNC_END(clmul_polyval_update) diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c deleted file mode 100644 index 6b466867f91a..000000000000 --- a/arch/x86/crypto/polyval-clmulni_glue.c +++ /dev/null @@ -1,180 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Glue code for POLYVAL using PCMULQDQ-NI - * - * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi> - * Copyright (c) 2009 Intel Corp. 
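A minimal plain-C reference model of the reduction documented in the deleted polyval-clmulni_asm.S above may help readers follow the math: it assumes the 256-bit schoolbook product is held as four 64-bit words p[3]:p[2]:p[1]:p[0], substitutes a bit-by-bit carry-less multiply for PCLMULQDQ, and applies the T/V folding exactly as described in the montgomery_reduction comment. It is an illustrative sketch only, not part of this patch.

#include <stdint.h>

/* 64x64 -> 128-bit carry-less multiply; PCLMULQDQ does this in hardware. */
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
        uint64_t h = 0, l = 0;
        int i;

        for (i = 0; i < 64; i++) {
                if ((b >> i) & 1) {
                        l ^= a << i;
                        if (i)
                                h ^= a >> (64 - i);
                }
        }
        *hi = h;
        *lo = l;
}

/*
 * Reduce the 256-bit product p[3]:p[2]:p[1]:p[0] modulo
 * g(x) = x^128 + x^127 + x^126 + x^121 + 1 and divide by x^128,
 * with g*(x) = bits 64..127 of g(x) = 0xc200000000000000 (see .Lgstar).
 */
static void polyval_mont_reduce(const uint64_t p[4], uint64_t out[2])
{
        const uint64_t gstar = 0xc200000000000000ULL;
        uint64_t t1, t0, v1, v0;

        clmul64(gstar, p[0], &t1, &t0);         /* T = g*(x) * P_0 */
        clmul64(gstar, p[1] ^ t0, &v1, &v0);    /* V = g*(x) * (P_1 + T_0) */

        /* P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0 */
        out[1] = p[3] ^ p[1] ^ t0 ^ v1;
        out[0] = p[2] ^ p[0] ^ t1 ^ v0;
}

The assembly interleaves these two carry-less multiplies with the next stride's schoolbook multiplications (the .if \reduce paths in full_stride), so only one reduction is paid for every eight blocks.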
- * Author: Huang Ying <ying.huang@intel.com> - * Copyright 2021 Google LLC - */ - -/* - * Glue code based on ghash-clmulni-intel_glue.c. - * - * This implementation of POLYVAL uses montgomery multiplication - * accelerated by PCLMULQDQ-NI to implement the finite field - * operations. - */ - -#include <asm/cpu_device_id.h> -#include <asm/fpu/api.h> -#include <crypto/internal/hash.h> -#include <crypto/polyval.h> -#include <crypto/utils.h> -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/string.h> - -#define POLYVAL_ALIGN 16 -#define POLYVAL_ALIGN_ATTR __aligned(POLYVAL_ALIGN) -#define POLYVAL_ALIGN_EXTRA ((POLYVAL_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1)) -#define POLYVAL_CTX_SIZE (sizeof(struct polyval_tfm_ctx) + POLYVAL_ALIGN_EXTRA) -#define NUM_KEY_POWERS 8 - -struct polyval_tfm_ctx { - /* - * These powers must be in the order h^8, ..., h^1. - */ - u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE] POLYVAL_ALIGN_ATTR; -}; - -struct polyval_desc_ctx { - u8 buffer[POLYVAL_BLOCK_SIZE]; -}; - -asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys, - const u8 *in, size_t nblocks, u8 *accumulator); -asmlinkage void clmul_polyval_mul(u8 *op1, const u8 *op2); - -static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm) -{ - return PTR_ALIGN(crypto_shash_ctx(tfm), POLYVAL_ALIGN); -} - -static void internal_polyval_update(const struct polyval_tfm_ctx *keys, - const u8 *in, size_t nblocks, u8 *accumulator) -{ - kernel_fpu_begin(); - clmul_polyval_update(keys, in, nblocks, accumulator); - kernel_fpu_end(); -} - -static void internal_polyval_mul(u8 *op1, const u8 *op2) -{ - kernel_fpu_begin(); - clmul_polyval_mul(op1, op2); - kernel_fpu_end(); -} - -static int polyval_x86_setkey(struct crypto_shash *tfm, - const u8 *key, unsigned int keylen) -{ - struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(tfm); - int i; - - if (keylen != POLYVAL_BLOCK_SIZE) - return -EINVAL; - - memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE); - - for (i = NUM_KEY_POWERS-2; i >= 0; i--) { - memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE); - internal_polyval_mul(tctx->key_powers[i], - tctx->key_powers[i+1]); - } - - return 0; -} - -static int polyval_x86_init(struct shash_desc *desc) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - - memset(dctx, 0, sizeof(*dctx)); - - return 0; -} - -static int polyval_x86_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm); - unsigned int nblocks; - - do { - /* Allow rescheduling every 4K bytes. 
*/ - nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE; - internal_polyval_update(tctx, src, nblocks, dctx->buffer); - srclen -= nblocks * POLYVAL_BLOCK_SIZE; - src += nblocks * POLYVAL_BLOCK_SIZE; - } while (srclen >= POLYVAL_BLOCK_SIZE); - - return srclen; -} - -static int polyval_x86_finup(struct shash_desc *desc, const u8 *src, - unsigned int len, u8 *dst) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm); - - if (len) { - crypto_xor(dctx->buffer, src, len); - internal_polyval_mul(dctx->buffer, - tctx->key_powers[NUM_KEY_POWERS-1]); - } - - memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE); - - return 0; -} - -static struct shash_alg polyval_alg = { - .digestsize = POLYVAL_DIGEST_SIZE, - .init = polyval_x86_init, - .update = polyval_x86_update, - .finup = polyval_x86_finup, - .setkey = polyval_x86_setkey, - .descsize = sizeof(struct polyval_desc_ctx), - .base = { - .cra_name = "polyval", - .cra_driver_name = "polyval-clmulni", - .cra_priority = 200, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = POLYVAL_BLOCK_SIZE, - .cra_ctxsize = POLYVAL_CTX_SIZE, - .cra_module = THIS_MODULE, - }, -}; - -__maybe_unused static const struct x86_cpu_id pcmul_cpu_id[] = { - X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id); - -static int __init polyval_clmulni_mod_init(void) -{ - if (!x86_match_cpu(pcmul_cpu_id)) - return -ENODEV; - - if (!boot_cpu_has(X86_FEATURE_AVX)) - return -ENODEV; - - return crypto_register_shash(&polyval_alg); -} - -static void __exit polyval_clmulni_mod_exit(void) -{ - crypto_unregister_shash(&polyval_alg); -} - -module_init(polyval_clmulni_mod_init); -module_exit(polyval_clmulni_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("POLYVAL hash function accelerated by PCLMULQDQ-NI"); -MODULE_ALIAS_CRYPTO("polyval"); -MODULE_ALIAS_CRYPTO("polyval-clmulni"); diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index 8e9a0cc20a4a..6ba2b3adcef0 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -4,6 +4,7 @@ */ #include <linux/export.h> +#include <linux/kvm_types.h> #include <linux/linkage.h> #include <linux/objtool.h> #include <asm/msr-index.h> @@ -29,8 +30,15 @@ SYM_FUNC_START(write_ibpb) FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_BUG_IBPB_NO_RET RET SYM_FUNC_END(write_ibpb) -/* For KVM */ -EXPORT_SYMBOL_GPL(write_ibpb); +EXPORT_SYMBOL_FOR_KVM(write_ibpb); + +SYM_FUNC_START(__WARN_trap) + ANNOTATE_NOENDBR + ANNOTATE_REACHABLE + ud1 (%edx), %_ASM_ARG1 + RET +SYM_FUNC_END(__WARN_trap) +EXPORT_SYMBOL(__WARN_trap) .popsection @@ -48,8 +56,7 @@ SYM_CODE_START_NOALIGN(x86_verw_sel) .word __KERNEL_DS .align L1_CACHE_BYTES, 0xcc SYM_CODE_END(x86_verw_sel); -/* For KVM */ -EXPORT_SYMBOL_GPL(x86_verw_sel); +EXPORT_SYMBOL_FOR_KVM(x86_verw_sel); .popsection diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ed04a968cc7d..f9983a1907bf 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -19,6 +19,7 @@ * - idtentry: Define exception entry points. 
*/ #include <linux/export.h> +#include <linux/kvm_types.h> #include <linux/linkage.h> #include <asm/segment.h> #include <asm/cache.h> @@ -1566,5 +1567,5 @@ SYM_FUNC_START(clear_bhb_loop) pop %rbp RET SYM_FUNC_END(clear_bhb_loop) -EXPORT_SYMBOL_GPL(clear_bhb_loop) +EXPORT_SYMBOL_FOR_KVM(clear_bhb_loop) STACK_FRAME_NON_STANDARD(clear_bhb_loop) diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S index fafbd3e68cb8..894f7f16eb80 100644 --- a/arch/x86/entry/entry_64_fred.S +++ b/arch/x86/entry/entry_64_fred.S @@ -4,6 +4,7 @@ */ #include <linux/export.h> +#include <linux/kvm_types.h> #include <asm/asm.h> #include <asm/fred.h> @@ -146,5 +147,5 @@ SYM_FUNC_START(asm_fred_entry_from_kvm) RET SYM_FUNC_END(asm_fred_entry_from_kvm) -EXPORT_SYMBOL_GPL(asm_fred_entry_from_kvm); +EXPORT_SYMBOL_FOR_KVM(asm_fred_entry_from_kvm); #endif diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c index f004a4dc74c2..94e626cc6a07 100644 --- a/arch/x86/entry/entry_fred.c +++ b/arch/x86/entry/entry_fred.c @@ -78,13 +78,13 @@ static noinstr void fred_intx(struct pt_regs *regs) static __always_inline void fred_other(struct pt_regs *regs) { /* The compiler can fold these conditions into a single test */ - if (likely(regs->fred_ss.vector == FRED_SYSCALL && regs->fred_ss.lm)) { + if (likely(regs->fred_ss.vector == FRED_SYSCALL && regs->fred_ss.l)) { regs->orig_ax = regs->ax; regs->ax = -ENOSYS; do_syscall_64(regs, regs->orig_ax); return; } else if (ia32_enabled() && - likely(regs->fred_ss.vector == FRED_SYSENTER && !regs->fred_ss.lm)) { + likely(regs->fred_ss.vector == FRED_SYSENTER && !regs->fred_ss.l)) { regs->orig_ax = regs->ax; regs->ax = -ENOSYS; do_fast_syscall_32(regs); diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 2b15ea17bb7c..a67a644d0cfe 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -274,9 +274,10 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) * fetch EBP before invoking any of the syscall entry work * functions. */ - syscall_enter_from_user_mode_prepare(regs); + enter_from_user_mode(regs); instrumentation_begin(); + local_irq_enable(); /* Fetch EBP from where the vDSO stashed it. 
*/ if (IS_ENABLED(CONFIG_X86_64)) { /* diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 4877e16da69a..e979a3eac7a3 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -475,3 +475,4 @@ 467 i386 open_tree_attr sys_open_tree_attr 468 i386 file_getattr sys_file_getattr 469 i386 file_setattr sys_file_setattr +470 i386 listns sys_listns diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index ced2a1deecd7..8a4ac4841be6 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -394,6 +394,7 @@ 467 common open_tree_attr sys_open_tree_attr 468 common file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr +470 common listns sys_listns # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index b20661b8621d..44656d2fb555 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -2,6 +2,7 @@ #include <linux/perf_event.h> #include <linux/jump_label.h> #include <linux/export.h> +#include <linux/kvm_types.h> #include <linux/types.h> #include <linux/init.h> #include <linux/slab.h> @@ -763,7 +764,12 @@ static void amd_pmu_enable_all(int added) if (!test_bit(idx, cpuc->active_mask)) continue; - amd_pmu_enable_event(cpuc->events[idx]); + /* + * FIXME: cpuc->events[idx] can become NULL in a subtle race + * condition with NMI->throttle->x86_pmu_stop(). + */ + if (cpuc->events[idx]) + amd_pmu_enable_event(cpuc->events[idx]); } } @@ -1569,7 +1575,7 @@ void amd_pmu_enable_virt(void) /* Reload all events */ amd_pmu_reload_virt(); } -EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); +EXPORT_SYMBOL_FOR_KVM(amd_pmu_enable_virt); void amd_pmu_disable_virt(void) { @@ -1586,4 +1592,4 @@ void amd_pmu_disable_virt(void) /* Reload all events */ amd_pmu_reload_virt(); } -EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); +EXPORT_SYMBOL_FOR_KVM(amd_pmu_disable_virt); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 745caa6c15a3..0c38a31d5fc7 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -20,6 +20,7 @@ #include <linux/export.h> #include <linux/init.h> #include <linux/kdebug.h> +#include <linux/kvm_types.h> #include <linux/sched/mm.h> #include <linux/sched/clock.h> #include <linux/uaccess.h> @@ -554,14 +555,22 @@ static inline int precise_br_compat(struct perf_event *event) return m == b; } -int x86_pmu_max_precise(void) +int x86_pmu_max_precise(struct pmu *pmu) { int precise = 0; - /* Support for constant skid */ if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { - precise++; + /* arch PEBS */ + if (x86_pmu.arch_pebs) { + precise = 2; + if (hybrid(pmu, arch_pebs_cap).pdists) + precise++; + + return precise; + } + /* legacy PEBS - support for constant skid */ + precise++; /* Support for IP fixup */ if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) precise++; @@ -569,13 +578,14 @@ int x86_pmu_max_precise(void) if (x86_pmu.pebs_prec_dist) precise++; } + return precise; } int x86_pmu_hw_config(struct perf_event *event) { if (event->attr.precise_ip) { - int precise = x86_pmu_max_precise(); + int precise = x86_pmu_max_precise(event->pmu); if (event->attr.precise_ip > precise) return -EOPNOTSUPP; @@ -714,7 +724,7 @@ struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data) { return static_call(x86_pmu_guest_get_msrs)(nr, data); } -EXPORT_SYMBOL_GPL(perf_guest_get_msrs); 
+EXPORT_SYMBOL_FOR_KVM(perf_guest_get_msrs); /* * There may be PMI landing after enabled=0. The PMI hitting could be before or @@ -1344,6 +1354,7 @@ static void x86_pmu_enable(struct pmu *pmu) hwc->state |= PERF_HES_ARCH; x86_pmu_stop(event, PERF_EF_UPDATE); + cpuc->events[hwc->idx] = NULL; } /* @@ -1365,6 +1376,7 @@ static void x86_pmu_enable(struct pmu *pmu) * if cpuc->enabled = 0, then no wrmsr as * per x86_pmu_enable_event() */ + cpuc->events[hwc->idx] = event; x86_pmu_start(event, PERF_EF_RELOAD); } cpuc->n_added = 0; @@ -1531,7 +1543,6 @@ static void x86_pmu_start(struct perf_event *event, int flags) event->hw.state = 0; - cpuc->events[idx] = event; __set_bit(idx, cpuc->active_mask); static_call(x86_pmu_enable)(event); perf_event_update_userpage(event); @@ -1610,7 +1621,6 @@ void x86_pmu_stop(struct perf_event *event, int flags) if (test_bit(hwc->idx, cpuc->active_mask)) { static_call(x86_pmu_disable)(event); __clear_bit(hwc->idx, cpuc->active_mask); - cpuc->events[hwc->idx] = NULL; WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; } @@ -1648,6 +1658,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) * Not a TXN, therefore cleanup properly. */ x86_pmu_stop(event, PERF_EF_UPDATE); + cpuc->events[event->hw.idx] = NULL; for (i = 0; i < cpuc->n_events; i++) { if (event == cpuc->event_list[i]) @@ -2629,7 +2640,9 @@ static ssize_t max_precise_show(struct device *cdev, struct device_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise()); + struct pmu *pmu = dev_get_drvdata(cdev); + + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise(pmu)); } static DEVICE_ATTR_RO(max_precise); @@ -2789,13 +2802,13 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re return; } - if (perf_callchain_store(entry, regs->ip)) - return; - - if (perf_hw_regs(regs)) + if (perf_hw_regs(regs)) { + if (perf_callchain_store(entry, regs->ip)) + return; unwind_start(&state, current, regs, NULL); - else + } else { unwind_start(&state, current, NULL, (void *)regs->sp); + } for (; !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); @@ -2845,46 +2858,6 @@ static unsigned long get_segment_base(unsigned int segment) return get_desc_base(desc); } -#ifdef CONFIG_UPROBES -/* - * Heuristic-based check if uprobe is installed at the function entry. - * - * Under assumption of user code being compiled with frame pointers, - * `push %rbp/%ebp` is a good indicator that we indeed are. - * - * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern. - * If we get this wrong, captured stack trace might have one extra bogus - * entry, but the rest of stack trace will still be meaningful. 
- */ -static bool is_uprobe_at_func_entry(struct pt_regs *regs) -{ - struct arch_uprobe *auprobe; - - if (!current->utask) - return false; - - auprobe = current->utask->auprobe; - if (!auprobe) - return false; - - /* push %rbp/%ebp */ - if (auprobe->insn[0] == 0x55) - return true; - - /* endbr64 (64-bit only) */ - if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn)) - return true; - - return false; -} - -#else -static bool is_uprobe_at_func_entry(struct pt_regs *regs) -{ - return false; -} -#endif /* CONFIG_UPROBES */ - #ifdef CONFIG_IA32_EMULATION #include <linux/compat.h> @@ -3106,7 +3079,7 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) cap->events_mask_len = x86_pmu.events_mask_len; cap->pebs_ept = x86_pmu.pebs_ept; } -EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); +EXPORT_SYMBOL_FOR_KVM(perf_get_x86_pmu_capability); u64 perf_get_hw_event_config(int hw_event) { @@ -3117,4 +3090,4 @@ u64 perf_get_hw_event_config(int hw_event) return 0; } -EXPORT_SYMBOL_GPL(perf_get_hw_event_config); +EXPORT_SYMBOL_FOR_KVM(perf_get_hw_event_config); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 28f5468a6ea3..853fe073bab3 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2563,6 +2563,44 @@ static void intel_pmu_disable_fixed(struct perf_event *event) cpuc->fixed_ctrl_val &= ~mask; } +static inline void __intel_pmu_update_event_ext(int idx, u64 ext) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u32 msr; + + if (idx < INTEL_PMC_IDX_FIXED) { + msr = MSR_IA32_PMC_V6_GP0_CFG_C + + x86_pmu.addr_offset(idx, false); + } else { + msr = MSR_IA32_PMC_V6_FX0_CFG_C + + x86_pmu.addr_offset(idx - INTEL_PMC_IDX_FIXED, false); + } + + cpuc->cfg_c_val[idx] = ext; + wrmsrq(msr, ext); +} + +static void intel_pmu_disable_event_ext(struct perf_event *event) +{ + /* + * Only clear CFG_C MSR for PEBS counter group events, + * it avoids the HW counter's value to be added into + * other PEBS records incorrectly after PEBS counter + * group events are disabled. + * + * For other events, it's unnecessary to clear CFG_C MSRs + * since CFG_C doesn't take effect if counter is in + * disabled state. That helps to reduce the WRMSR overhead + * in context switches. + */ + if (!is_pebs_counter_event_group(event)) + return; + + __intel_pmu_update_event_ext(event->hw.idx, 0); +} + +DEFINE_STATIC_CALL_NULL(intel_pmu_disable_event_ext, intel_pmu_disable_event_ext); + static void intel_pmu_disable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; @@ -2571,9 +2609,12 @@ static void intel_pmu_disable_event(struct perf_event *event) switch (idx) { case 0 ... INTEL_PMC_IDX_FIXED - 1: intel_clear_masks(event, idx); + static_call_cond(intel_pmu_disable_event_ext)(event); x86_pmu_disable_event(event); break; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: + static_call_cond(intel_pmu_disable_event_ext)(event); + fallthrough; case INTEL_PMC_IDX_METRIC_BASE ... 
INTEL_PMC_IDX_METRIC_END: intel_pmu_disable_fixed(event); break; @@ -2940,6 +2981,79 @@ static void intel_pmu_enable_acr(struct perf_event *event) DEFINE_STATIC_CALL_NULL(intel_pmu_enable_acr_event, intel_pmu_enable_acr); +static void intel_pmu_enable_event_ext(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + union arch_pebs_index old, new; + struct arch_pebs_cap cap; + u64 ext = 0; + + cap = hybrid(cpuc->pmu, arch_pebs_cap); + + if (event->attr.precise_ip) { + u64 pebs_data_cfg = intel_get_arch_pebs_data_config(event); + + ext |= ARCH_PEBS_EN; + if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) + ext |= (-hwc->sample_period) & ARCH_PEBS_RELOAD; + + if (pebs_data_cfg && cap.caps) { + if (pebs_data_cfg & PEBS_DATACFG_MEMINFO) + ext |= ARCH_PEBS_AUX & cap.caps; + + if (pebs_data_cfg & PEBS_DATACFG_GP) + ext |= ARCH_PEBS_GPR & cap.caps; + + if (pebs_data_cfg & PEBS_DATACFG_XMMS) + ext |= ARCH_PEBS_VECR_XMM & cap.caps; + + if (pebs_data_cfg & PEBS_DATACFG_LBRS) + ext |= ARCH_PEBS_LBR & cap.caps; + + if (pebs_data_cfg & + (PEBS_DATACFG_CNTR_MASK << PEBS_DATACFG_CNTR_SHIFT)) + ext |= ARCH_PEBS_CNTR_GP & cap.caps; + + if (pebs_data_cfg & + (PEBS_DATACFG_FIX_MASK << PEBS_DATACFG_FIX_SHIFT)) + ext |= ARCH_PEBS_CNTR_FIXED & cap.caps; + + if (pebs_data_cfg & PEBS_DATACFG_METRICS) + ext |= ARCH_PEBS_CNTR_METRICS & cap.caps; + } + + if (cpuc->n_pebs == cpuc->n_large_pebs) + new.thresh = ARCH_PEBS_THRESH_MULTI; + else + new.thresh = ARCH_PEBS_THRESH_SINGLE; + + rdmsrq(MSR_IA32_PEBS_INDEX, old.whole); + if (new.thresh != old.thresh || !old.en) { + if (old.thresh == ARCH_PEBS_THRESH_MULTI && old.wr > 0) { + /* + * Large PEBS was enabled. + * Drain PEBS buffer before applying the single PEBS. + */ + intel_pmu_drain_pebs_buffer(); + } else { + new.wr = 0; + new.full = 0; + new.en = 1; + wrmsrq(MSR_IA32_PEBS_INDEX, new.whole); + } + } + } + + if (is_pebs_counter_event_group(event)) + ext |= ARCH_PEBS_CNTR_ALLOW; + + if (cpuc->cfg_c_val[hwc->idx] != ext) + __intel_pmu_update_event_ext(hwc->idx, ext); +} + +DEFINE_STATIC_CALL_NULL(intel_pmu_enable_event_ext, intel_pmu_enable_event_ext); + static void intel_pmu_enable_event(struct perf_event *event) { u64 enable_mask = ARCH_PERFMON_EVENTSEL_ENABLE; @@ -2955,10 +3069,12 @@ static void intel_pmu_enable_event(struct perf_event *event) enable_mask |= ARCH_PERFMON_EVENTSEL_BR_CNTR; intel_set_masks(event, idx); static_call_cond(intel_pmu_enable_acr_event)(event); + static_call_cond(intel_pmu_enable_event_ext)(event); __x86_pmu_enable_event(hwc, enable_mask); break; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: static_call_cond(intel_pmu_enable_acr_event)(event); + static_call_cond(intel_pmu_enable_event_ext)(event); fallthrough; case INTEL_PMC_IDX_METRIC_BASE ... 
INTEL_PMC_IDX_METRIC_END: intel_pmu_enable_fixed(event); @@ -3216,6 +3332,19 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) } /* + * Arch PEBS sets bit 54 in the global status register + */ + if (__test_and_clear_bit(GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT, + (unsigned long *)&status)) { + handled++; + static_call(x86_pmu_drain_pebs)(regs, &data); + + if (cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS] && + is_pebs_counter_event_group(cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS])) + status &= ~GLOBAL_STATUS_PERF_METRICS_OVF_BIT; + } + + /* * Intel PT */ if (__test_and_clear_bit(GLOBAL_STATUS_TRACE_TOPAPMI_BIT, (unsigned long *)&status)) { @@ -3269,7 +3398,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) * The PEBS buffer has to be drained before handling the A-PMI */ if (is_pebs_counter_event_group(event)) - x86_pmu.drain_pebs(regs, &data); + static_call(x86_pmu_drain_pebs)(regs, &data); last_period = event->hw.last_period; @@ -4029,7 +4158,9 @@ static unsigned long intel_pmu_large_pebs_flags(struct perf_event *event) if (!event->attr.exclude_kernel) flags &= ~PERF_SAMPLE_REGS_USER; if (event->attr.sample_regs_user & ~PEBS_GP_REGS) - flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR); + flags &= ~PERF_SAMPLE_REGS_USER; + if (event->attr.sample_regs_intr & ~PEBS_GP_REGS) + flags &= ~PERF_SAMPLE_REGS_INTR; return flags; } @@ -4204,6 +4335,20 @@ static bool intel_pmu_is_acr_group(struct perf_event *event) return false; } +static inline bool intel_pmu_has_pebs_counter_group(struct pmu *pmu) +{ + u64 caps; + + if (x86_pmu.intel_cap.pebs_format >= 6 && x86_pmu.intel_cap.pebs_baseline) + return true; + + caps = hybrid(pmu, arch_pebs_cap).caps; + if (x86_pmu.arch_pebs && (caps & ARCH_PEBS_CNTR_MASK)) + return true; + + return false; +} + static inline void intel_pmu_set_acr_cntr_constr(struct perf_event *event, u64 *cause_mask, int *num) { @@ -4237,6 +4382,8 @@ static int intel_pmu_hw_config(struct perf_event *event) } if (event->attr.precise_ip) { + struct arch_pebs_cap pebs_cap = hybrid(event->pmu, arch_pebs_cap); + if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_FIXED_VLBR_EVENT) return -EINVAL; @@ -4250,6 +4397,15 @@ static int intel_pmu_hw_config(struct perf_event *event) } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); + + if (x86_pmu.arch_pebs) { + u64 cntr_mask = hybrid(event->pmu, intel_ctrl) & + ~GLOBAL_CTRL_EN_PERF_METRICS; + u64 pebs_mask = event->attr.precise_ip >= 3 ? 
+ pebs_cap.pdists : pebs_cap.counters; + if (cntr_mask != pebs_mask) + event->hw.dyn_constraint &= pebs_mask; + } } if (needs_branch_stack(event)) { @@ -4341,8 +4497,7 @@ static int intel_pmu_hw_config(struct perf_event *event) } if ((event->attr.sample_type & PERF_SAMPLE_READ) && - (x86_pmu.intel_cap.pebs_format >= 6) && - x86_pmu.intel_cap.pebs_baseline && + intel_pmu_has_pebs_counter_group(event->pmu) && is_sampling_event(event) && event->attr.precise_ip) event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR; @@ -5212,7 +5367,13 @@ err: static int intel_pmu_cpu_prepare(int cpu) { - return intel_cpuc_prepare(&per_cpu(cpu_hw_events, cpu), cpu); + int ret; + + ret = intel_cpuc_prepare(&per_cpu(cpu_hw_events, cpu), cpu); + if (ret) + return ret; + + return alloc_arch_pebs_buf_on_cpu(cpu); } static void flip_smm_bit(void *data) @@ -5257,6 +5418,163 @@ static void intel_pmu_check_event_constraints(struct event_constraint *event_con u64 fixed_cntr_mask, u64 intel_ctrl); +enum dyn_constr_type { + DYN_CONSTR_NONE, + DYN_CONSTR_BR_CNTR, + DYN_CONSTR_ACR_CNTR, + DYN_CONSTR_ACR_CAUSE, + DYN_CONSTR_PEBS, + DYN_CONSTR_PDIST, + + DYN_CONSTR_MAX, +}; + +static const char * const dyn_constr_type_name[] = { + [DYN_CONSTR_NONE] = "a normal event", + [DYN_CONSTR_BR_CNTR] = "a branch counter logging event", + [DYN_CONSTR_ACR_CNTR] = "an auto-counter reload event", + [DYN_CONSTR_ACR_CAUSE] = "an auto-counter reload cause event", + [DYN_CONSTR_PEBS] = "a PEBS event", + [DYN_CONSTR_PDIST] = "a PEBS PDIST event", +}; + +static void __intel_pmu_check_dyn_constr(struct event_constraint *constr, + enum dyn_constr_type type, u64 mask) +{ + struct event_constraint *c1, *c2; + int new_weight, check_weight; + u64 new_mask, check_mask; + + for_each_event_constraint(c1, constr) { + new_mask = c1->idxmsk64 & mask; + new_weight = hweight64(new_mask); + + /* ignore topdown perf metrics event */ + if (c1->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) + continue; + + if (!new_weight && fls64(c1->idxmsk64) < INTEL_PMC_IDX_FIXED) { + pr_info("The event 0x%llx is not supported as %s.\n", + c1->code, dyn_constr_type_name[type]); + } + + if (new_weight <= 1) + continue; + + for_each_event_constraint(c2, c1 + 1) { + bool check_fail = false; + + check_mask = c2->idxmsk64 & mask; + check_weight = hweight64(check_mask); + + if (c2->idxmsk64 & INTEL_PMC_MSK_TOPDOWN || + !check_weight) + continue; + + /* The same constraints or no overlap */ + if (new_mask == check_mask || + (new_mask ^ check_mask) == (new_mask | check_mask)) + continue; + + /* + * A scheduler issue may be triggered in the following cases. + * - Two overlap constraints have the same weight. + * E.g., A constraints: 0x3, B constraints: 0x6 + * event counter failure case + * B PMC[2:1] 1 + * A PMC[1:0] 0 + * A PMC[1:0] FAIL + * - Two overlap constraints have different weight. + * The constraint has a low weight, but has high last bit. 
+ * E.g., A constraints: 0x7, B constraints: 0xC + * event counter failure case + * B PMC[3:2] 2 + * A PMC[2:0] 0 + * A PMC[2:0] 1 + * A PMC[2:0] FAIL + */ + if (new_weight == check_weight) { + check_fail = true; + } else if (new_weight < check_weight) { + if ((new_mask | check_mask) != check_mask && + fls64(new_mask) > fls64(check_mask)) + check_fail = true; + } else { + if ((new_mask | check_mask) != new_mask && + fls64(new_mask) < fls64(check_mask)) + check_fail = true; + } + + if (check_fail) { + pr_info("The two events 0x%llx and 0x%llx may not be " + "fully scheduled under some circumstances as " + "%s.\n", + c1->code, c2->code, dyn_constr_type_name[type]); + } + } + } +} + +static void intel_pmu_check_dyn_constr(struct pmu *pmu, + struct event_constraint *constr, + u64 cntr_mask) +{ + enum dyn_constr_type i; + u64 mask; + + for (i = DYN_CONSTR_NONE; i < DYN_CONSTR_MAX; i++) { + mask = 0; + switch (i) { + case DYN_CONSTR_NONE: + mask = cntr_mask; + break; + case DYN_CONSTR_BR_CNTR: + if (x86_pmu.flags & PMU_FL_BR_CNTR) + mask = x86_pmu.lbr_counters; + break; + case DYN_CONSTR_ACR_CNTR: + mask = hybrid(pmu, acr_cntr_mask64) & GENMASK_ULL(INTEL_PMC_MAX_GENERIC - 1, 0); + break; + case DYN_CONSTR_ACR_CAUSE: + if (hybrid(pmu, acr_cntr_mask64) == hybrid(pmu, acr_cause_mask64)) + continue; + mask = hybrid(pmu, acr_cause_mask64) & GENMASK_ULL(INTEL_PMC_MAX_GENERIC - 1, 0); + break; + case DYN_CONSTR_PEBS: + if (x86_pmu.arch_pebs) + mask = hybrid(pmu, arch_pebs_cap).counters; + break; + case DYN_CONSTR_PDIST: + if (x86_pmu.arch_pebs) + mask = hybrid(pmu, arch_pebs_cap).pdists; + break; + default: + pr_warn("Unsupported dynamic constraint type %d\n", i); + } + + if (mask) + __intel_pmu_check_dyn_constr(constr, i, mask); + } +} + +static void intel_pmu_check_event_constraints_all(struct pmu *pmu) +{ + struct event_constraint *event_constraints = hybrid(pmu, event_constraints); + struct event_constraint *pebs_constraints = hybrid(pmu, pebs_constraints); + u64 cntr_mask = hybrid(pmu, cntr_mask64); + u64 fixed_cntr_mask = hybrid(pmu, fixed_cntr_mask64); + u64 intel_ctrl = hybrid(pmu, intel_ctrl); + + intel_pmu_check_event_constraints(event_constraints, cntr_mask, + fixed_cntr_mask, intel_ctrl); + + if (event_constraints) + intel_pmu_check_dyn_constr(pmu, event_constraints, cntr_mask); + + if (pebs_constraints) + intel_pmu_check_dyn_constr(pmu, pebs_constraints, cntr_mask); +} + static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs); static inline bool intel_pmu_broken_perf_cap(void) @@ -5269,34 +5587,89 @@ static inline bool intel_pmu_broken_perf_cap(void) return false; } +static inline void __intel_update_pmu_caps(struct pmu *pmu) +{ + struct pmu *dest_pmu = pmu ? 
pmu : x86_get_pmu(smp_processor_id()); + + if (hybrid(pmu, arch_pebs_cap).caps & ARCH_PEBS_VECR_XMM) + dest_pmu->capabilities |= PERF_PMU_CAP_EXTENDED_REGS; +} + +static inline void __intel_update_large_pebs_flags(struct pmu *pmu) +{ + u64 caps = hybrid(pmu, arch_pebs_cap).caps; + + x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME; + if (caps & ARCH_PEBS_LBR) + x86_pmu.large_pebs_flags |= PERF_SAMPLE_BRANCH_STACK; + if (caps & ARCH_PEBS_CNTR_MASK) + x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ; + + if (!(caps & ARCH_PEBS_AUX)) + x86_pmu.large_pebs_flags &= ~PERF_SAMPLE_DATA_SRC; + if (!(caps & ARCH_PEBS_GPR)) { + x86_pmu.large_pebs_flags &= + ~(PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER); + } +} + +#define counter_mask(_gp, _fixed) ((_gp) | ((u64)(_fixed) << INTEL_PMC_IDX_FIXED)) + static void update_pmu_cap(struct pmu *pmu) { - unsigned int cntr, fixed_cntr, ecx, edx; - union cpuid35_eax eax; - union cpuid35_ebx ebx; + unsigned int eax, ebx, ecx, edx; + union cpuid35_eax eax_0; + union cpuid35_ebx ebx_0; + u64 cntrs_mask = 0; + u64 pebs_mask = 0; + u64 pdists_mask = 0; - cpuid(ARCH_PERFMON_EXT_LEAF, &eax.full, &ebx.full, &ecx, &edx); + cpuid(ARCH_PERFMON_EXT_LEAF, &eax_0.full, &ebx_0.full, &ecx, &edx); - if (ebx.split.umask2) + if (ebx_0.split.umask2) hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_UMASK2; - if (ebx.split.eq) + if (ebx_0.split.eq) hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_EQ; - if (eax.split.cntr_subleaf) { + if (eax_0.split.cntr_subleaf) { cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF, - &cntr, &fixed_cntr, &ecx, &edx); - hybrid(pmu, cntr_mask64) = cntr; - hybrid(pmu, fixed_cntr_mask64) = fixed_cntr; + &eax, &ebx, &ecx, &edx); + hybrid(pmu, cntr_mask64) = eax; + hybrid(pmu, fixed_cntr_mask64) = ebx; + cntrs_mask = counter_mask(eax, ebx); } - if (eax.split.acr_subleaf) { + if (eax_0.split.acr_subleaf) { cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_ACR_LEAF, - &cntr, &fixed_cntr, &ecx, &edx); + &eax, &ebx, &ecx, &edx); /* The mask of the counters which can be reloaded */ - hybrid(pmu, acr_cntr_mask64) = cntr | ((u64)fixed_cntr << INTEL_PMC_IDX_FIXED); - + hybrid(pmu, acr_cntr_mask64) = counter_mask(eax, ebx); /* The mask of the counters which can cause a reload of reloadable counters */ - hybrid(pmu, acr_cause_mask64) = ecx | ((u64)edx << INTEL_PMC_IDX_FIXED); + hybrid(pmu, acr_cause_mask64) = counter_mask(ecx, edx); + } + + /* Bits[5:4] should be set simultaneously if arch-PEBS is supported */ + if (eax_0.split.pebs_caps_subleaf && eax_0.split.pebs_cnts_subleaf) { + cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_PEBS_CAP_LEAF, + &eax, &ebx, &ecx, &edx); + hybrid(pmu, arch_pebs_cap).caps = (u64)ebx << 32; + + cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_PEBS_COUNTER_LEAF, + &eax, &ebx, &ecx, &edx); + pebs_mask = counter_mask(eax, ecx); + pdists_mask = counter_mask(ebx, edx); + hybrid(pmu, arch_pebs_cap).counters = pebs_mask; + hybrid(pmu, arch_pebs_cap).pdists = pdists_mask; + + if (WARN_ON((pebs_mask | pdists_mask) & ~cntrs_mask)) { + x86_pmu.arch_pebs = 0; + } else { + __intel_update_pmu_caps(pmu); + __intel_update_large_pebs_flags(pmu); + } + } else { + WARN_ON(x86_pmu.arch_pebs == 1); + x86_pmu.arch_pebs = 0; } if (!intel_pmu_broken_perf_cap()) { @@ -5319,10 +5692,7 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) else pmu->intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS; - intel_pmu_check_event_constraints(pmu->event_constraints, - pmu->cntr_mask64, - pmu->fixed_cntr_mask64, - pmu->intel_ctrl); + 
intel_pmu_check_event_constraints_all(&pmu->pmu); intel_pmu_check_extra_regs(pmu->extra_regs); } @@ -5418,6 +5788,7 @@ static void intel_pmu_cpu_starting(int cpu) return; init_debug_store_on_cpu(cpu); + init_arch_pebs_on_cpu(cpu); /* * Deal with CPUs that don't clear their LBRs on power-up, and that may * even boot with LBRs enabled. @@ -5456,6 +5827,8 @@ static void intel_pmu_cpu_starting(int cpu) } } + __intel_update_pmu_caps(cpuc->pmu); + if (!cpuc->shared_regs) return; @@ -5515,6 +5888,7 @@ static void free_excl_cntrs(struct cpu_hw_events *cpuc) static void intel_pmu_cpu_dying(int cpu) { fini_debug_store_on_cpu(cpu); + fini_arch_pebs_on_cpu(cpu); } void intel_cpuc_finish(struct cpu_hw_events *cpuc) @@ -5535,6 +5909,7 @@ static void intel_pmu_cpu_dead(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + release_arch_pebs_buf_on_cpu(cpu); intel_cpuc_finish(cpuc); if (is_hybrid() && cpuc->pmu) @@ -6250,7 +6625,7 @@ tsx_is_visible(struct kobject *kobj, struct attribute *attr, int i) static umode_t pebs_is_visible(struct kobject *kobj, struct attribute *attr, int i) { - return x86_pmu.ds_pebs ? attr->mode : 0; + return intel_pmu_has_pebs() ? attr->mode : 0; } static umode_t @@ -6940,8 +7315,11 @@ __init int intel_pmu_init(void) * Many features on and after V6 require dynamic constraint, * e.g., Arch PEBS, ACR. */ - if (version >= 6) + if (version >= 6) { x86_pmu.flags |= PMU_FL_DYN_CONSTRAINT; + x86_pmu.late_setup = intel_pmu_late_setup; + } + /* * Install the hw-cache-events table: */ @@ -7596,6 +7974,7 @@ __init int intel_pmu_init(void) break; case INTEL_PANTHERLAKE_L: + case INTEL_WILDCATLAKE_L: pr_cont("Pantherlake Hybrid events, "); name = "pantherlake_hybrid"; goto lnl_common; @@ -7726,6 +8105,14 @@ __init int intel_pmu_init(void) if (!is_hybrid() && boot_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT)) update_pmu_cap(NULL); + if (x86_pmu.arch_pebs) { + static_call_update(intel_pmu_disable_event_ext, + intel_pmu_disable_event_ext); + static_call_update(intel_pmu_enable_event_ext, + intel_pmu_enable_event_ext); + pr_cont("Architectural PEBS, "); + } + intel_pmu_check_counters_mask(&x86_pmu.cntr_mask64, &x86_pmu.fixed_cntr_mask64, &x86_pmu.intel_ctrl); @@ -7734,10 +8121,8 @@ __init int intel_pmu_init(void) if (x86_pmu.intel_cap.anythread_deprecated) x86_pmu.format_attrs = intel_arch_formats_attr; - intel_pmu_check_event_constraints(x86_pmu.event_constraints, - x86_pmu.cntr_mask64, - x86_pmu.fixed_cntr_mask64, - x86_pmu.intel_ctrl); + intel_pmu_check_event_constraints_all(NULL); + /* * Access LBR MSR may cause #GP under certain circumstances. * Check all LBR MSR here. 
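For readers tracking the update_pmu_cap() rework above: the new counter_mask() helper simply packs the general-purpose and fixed-counter bitmaps returned by the ARCH_PERFMON_EXT_LEAF sub-leaves into one 64-bit mask, with fixed counters starting at bit INTEL_PMC_IDX_FIXED (32 in the kernel's perf headers). A stand-alone user-space sketch with made-up example values, not kernel code:

#include <stdint.h>
#include <stdio.h>

#define INTEL_PMC_IDX_FIXED     32
#define counter_mask(gp, fixed) \
        ((uint64_t)(gp) | ((uint64_t)(fixed) << INTEL_PMC_IDX_FIXED))

int main(void)
{
        uint32_t gp = 0xff;     /* e.g. 8 general-purpose counters */
        uint32_t fixed = 0x0f;  /* e.g. 4 fixed-function counters  */

        /* Prints 0x0000000f000000ff: GP counter bits in the low word,
         * fixed-counter bits from bit 32 upward. */
        printf("%#018llx\n", (unsigned long long)counter_mask(gp, fixed));
        return 0;
}

The same packed layout is what the arch-PEBS paths compare against (pebs_mask | pdists_mask must be a subset of cntrs_mask), which is why the GP and fixed CPUID outputs are combined before the WARN_ON check in update_pmu_cap().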
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index ec753e39b007..fa67fda6e45b 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -41,7 +41,7 @@ * MSR_CORE_C1_RES: CORE C1 Residency Counter * perf code: 0x00 * Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL - * MTL,SRF,GRR,ARL,LNL + * MTL,SRF,GRR,ARL,LNL,PTL * Scope: Core (each processor core has a MSR) * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter * perf code: 0x01 @@ -53,31 +53,32 @@ * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF, - * GRR,ARL,LNL + * GRR,ARL,LNL,PTL * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 * Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML, - * ICL,TGL,RKL,ADL,RPL,MTL,ARL,LNL + * ICL,TGL,RKL,ADL,RPL,MTL,ARL,LNL, + * PTL * Scope: Core * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. * perf code: 0x00 * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, * KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL, - * RPL,SPR,MTL,ARL,LNL,SRF + * RPL,SPR,MTL,ARL,LNL,SRF,PTL * Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. * perf code: 0x01 * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL, * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL, - * ADL,RPL,MTL,ARL,LNL + * ADL,RPL,MTL,ARL * Scope: Package (physical package) * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF, - * ARL,LNL + * ARL,LNL,PTL * Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. * perf code: 0x03 @@ -96,7 +97,7 @@ * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. * perf code: 0x06 * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL, - * TNT,RKL,ADL,RPL,MTL,ARL,LNL + * TNT,RKL,ADL,RPL,MTL,ARL,LNL,PTL * Scope: Package (physical package) * MSR_MODULE_C6_RES_MS: Module C6 Residency Counter. 
* perf code: 0x00 @@ -522,7 +523,6 @@ static const struct cstate_model lnl_cstates __initconst = { BIT(PERF_CSTATE_CORE_C7_RES), .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | - BIT(PERF_CSTATE_PKG_C3_RES) | BIT(PERF_CSTATE_PKG_C6_RES) | BIT(PERF_CSTATE_PKG_C10_RES), }; @@ -628,6 +628,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &adl_cstates), X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, &srf_cstates), X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, &grr_cstates), + X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, &srf_cstates), X86_MATCH_VFM(INTEL_ICELAKE_L, &icl_cstates), X86_MATCH_VFM(INTEL_ICELAKE, &icl_cstates), @@ -652,6 +653,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_VFM(INTEL_ARROWLAKE_H, &adl_cstates), X86_MATCH_VFM(INTEL_ARROWLAKE_U, &adl_cstates), X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_cstates), + X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &lnl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index c0b7ac1c7594..feb1c3cf63e4 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -317,7 +317,8 @@ static u64 __grt_latency_data(struct perf_event *event, u64 status, { u64 val; - WARN_ON_ONCE(hybrid_pmu(event->pmu)->pmu_type == hybrid_big); + WARN_ON_ONCE(is_hybrid() && + hybrid_pmu(event->pmu)->pmu_type == hybrid_big); dse &= PERF_PEBS_DATA_SOURCE_GRT_MASK; val = hybrid_var(event->pmu, pebs_data_source)[dse]; @@ -625,13 +626,18 @@ static int alloc_pebs_buffer(int cpu) int max, node = cpu_to_node(cpu); void *buffer, *insn_buff, *cea; - if (!x86_pmu.ds_pebs) + if (!intel_pmu_has_pebs()) return 0; buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu); if (unlikely(!buffer)) return -ENOMEM; + if (x86_pmu.arch_pebs) { + hwev->pebs_vaddr = buffer; + return 0; + } + /* * HSW+ already provides us the eventing ip; no need to allocate this * buffer then. 
@@ -644,7 +650,7 @@ static int alloc_pebs_buffer(int cpu) } per_cpu(insn_buffer, cpu) = insn_buff; } - hwev->ds_pebs_vaddr = buffer; + hwev->pebs_vaddr = buffer; /* Update the cpu entry area mapping */ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; ds->pebs_buffer_base = (unsigned long) cea; @@ -660,17 +666,20 @@ static void release_pebs_buffer(int cpu) struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); void *cea; - if (!x86_pmu.ds_pebs) + if (!intel_pmu_has_pebs()) return; - kfree(per_cpu(insn_buffer, cpu)); - per_cpu(insn_buffer, cpu) = NULL; + if (x86_pmu.ds_pebs) { + kfree(per_cpu(insn_buffer, cpu)); + per_cpu(insn_buffer, cpu) = NULL; - /* Clear the fixmap */ - cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; - ds_clear_cea(cea, x86_pmu.pebs_buffer_size); - dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size); - hwev->ds_pebs_vaddr = NULL; + /* Clear the fixmap */ + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; + ds_clear_cea(cea, x86_pmu.pebs_buffer_size); + } + + dsfree_pages(hwev->pebs_vaddr, x86_pmu.pebs_buffer_size); + hwev->pebs_vaddr = NULL; } static int alloc_bts_buffer(int cpu) @@ -823,6 +832,56 @@ void reserve_ds_buffers(void) } } +inline int alloc_arch_pebs_buf_on_cpu(int cpu) +{ + if (!x86_pmu.arch_pebs) + return 0; + + return alloc_pebs_buffer(cpu); +} + +inline void release_arch_pebs_buf_on_cpu(int cpu) +{ + if (!x86_pmu.arch_pebs) + return; + + release_pebs_buffer(cpu); +} + +void init_arch_pebs_on_cpu(int cpu) +{ + struct cpu_hw_events *cpuc = per_cpu_ptr(&cpu_hw_events, cpu); + u64 arch_pebs_base; + + if (!x86_pmu.arch_pebs) + return; + + if (!cpuc->pebs_vaddr) { + WARN(1, "Fail to allocate PEBS buffer on CPU %d\n", cpu); + x86_pmu.pebs_active = 0; + return; + } + + /* + * 4KB-aligned pointer of the output buffer + * (__alloc_pages_node() return page aligned address) + * Buffer Size = 4KB * 2^SIZE + * contiguous physical buffer (__alloc_pages_node() with order) + */ + arch_pebs_base = virt_to_phys(cpuc->pebs_vaddr) | PEBS_BUFFER_SHIFT; + wrmsr_on_cpu(cpu, MSR_IA32_PEBS_BASE, (u32)arch_pebs_base, + (u32)(arch_pebs_base >> 32)); + x86_pmu.pebs_active = 1; +} + +inline void fini_arch_pebs_on_cpu(int cpu) +{ + if (!x86_pmu.arch_pebs) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_PEBS_BASE, 0, 0); +} + /* * BTS */ @@ -1470,6 +1529,25 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, } } +u64 intel_get_arch_pebs_data_config(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 pebs_data_cfg = 0; + u64 cntr_mask; + + if (WARN_ON(event->hw.idx < 0 || event->hw.idx >= X86_PMC_IDX_MAX)) + return 0; + + pebs_data_cfg |= pebs_update_adaptive_cfg(event); + + cntr_mask = (PEBS_DATACFG_CNTR_MASK << PEBS_DATACFG_CNTR_SHIFT) | + (PEBS_DATACFG_FIX_MASK << PEBS_DATACFG_FIX_SHIFT) | + PEBS_DATACFG_CNTR | PEBS_DATACFG_METRICS; + pebs_data_cfg |= cpuc->pebs_data_cfg & cntr_mask; + + return pebs_data_cfg; +} + void intel_pmu_pebs_add(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1531,6 +1609,15 @@ static inline void intel_pmu_drain_large_pebs(struct cpu_hw_events *cpuc) intel_pmu_drain_pebs_buffer(); } +static void __intel_pmu_pebs_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + + hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; + cpuc->pebs_enabled |= 1ULL << hwc->idx; +} + void intel_pmu_pebs_enable(struct perf_event *event) { struct 
cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1539,9 +1626,7 @@ void intel_pmu_pebs_enable(struct perf_event *event) struct debug_store *ds = cpuc->ds; unsigned int idx = hwc->idx; - hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; - - cpuc->pebs_enabled |= 1ULL << hwc->idx; + __intel_pmu_pebs_enable(event); if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) && (x86_pmu.version < 5)) cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32); @@ -1603,14 +1688,22 @@ void intel_pmu_pebs_del(struct perf_event *event) pebs_update_state(needed_cb, cpuc, event, false); } -void intel_pmu_pebs_disable(struct perf_event *event) +static void __intel_pmu_pebs_disable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; intel_pmu_drain_large_pebs(cpuc); - cpuc->pebs_enabled &= ~(1ULL << hwc->idx); + hwc->config |= ARCH_PERFMON_EVENTSEL_INT; +} + +void intel_pmu_pebs_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + + __intel_pmu_pebs_disable(event); if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) && (x86_pmu.version < 5)) @@ -1622,8 +1715,6 @@ void intel_pmu_pebs_disable(struct perf_event *event) if (cpuc->enabled) wrmsrq(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); - - hwc->config |= ARCH_PERFMON_EVENTSEL_INT; } void intel_pmu_pebs_enable_all(void) @@ -2059,6 +2150,90 @@ static inline void __setup_pebs_counter_group(struct cpu_hw_events *cpuc, #define PEBS_LATENCY_MASK 0xffff +static inline void __setup_perf_sample_data(struct perf_event *event, + struct pt_regs *iregs, + struct perf_sample_data *data) +{ + perf_sample_data_init(data, 0, event->hw.last_period); + + /* + * We must however always use iregs for the unwinder to stay sane; the + * record BP,SP,IP can point into thin air when the record is from a + * previous PMI context or an (I)RET happened between the record and + * PMI. + */ + perf_sample_save_callchain(data, event, iregs); +} + +static inline void __setup_pebs_basic_group(struct perf_event *event, + struct pt_regs *regs, + struct perf_sample_data *data, + u64 sample_type, u64 ip, + u64 tsc, u16 retire) +{ + /* The ip in basic is EventingIP */ + set_linear_ip(regs, ip); + regs->flags = PERF_EFLAGS_EXACT; + setup_pebs_time(event, data, tsc); + + if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) + data->weight.var3_w = retire; +} + +static inline void __setup_pebs_gpr_group(struct perf_event *event, + struct pt_regs *regs, + struct pebs_gprs *gprs, + u64 sample_type) +{ + if (event->attr.precise_ip < 2) { + set_linear_ip(regs, gprs->ip); + regs->flags &= ~PERF_EFLAGS_EXACT; + } + + if (sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)) + adaptive_pebs_save_regs(regs, gprs); +} + +static inline void __setup_pebs_meminfo_group(struct perf_event *event, + struct perf_sample_data *data, + u64 sample_type, u64 latency, + u16 instr_latency, u64 address, + u64 aux, u64 tsx_tuning, u64 ax) +{ + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { + u64 tsx_latency = intel_get_tsx_weight(tsx_tuning); + + data->weight.var2_w = instr_latency; + + /* + * Although meminfo::latency is defined as a u64, + * only the lower 32 bits include the valid data + * in practice on Ice Lake and earlier platforms. 
+ */ + if (sample_type & PERF_SAMPLE_WEIGHT) + data->weight.full = latency ?: tsx_latency; + else + data->weight.var1_dw = (u32)latency ?: tsx_latency; + + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } + + if (sample_type & PERF_SAMPLE_DATA_SRC) { + data->data_src.val = get_data_src(event, aux); + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } + + if (sample_type & PERF_SAMPLE_ADDR_TYPE) { + data->addr = address; + data->sample_flags |= PERF_SAMPLE_ADDR; + } + + if (sample_type & PERF_SAMPLE_TRANSACTION) { + data->txn = intel_get_tsx_transaction(tsx_tuning, ax); + data->sample_flags |= PERF_SAMPLE_TRANSACTION; + } +} + /* * With adaptive PEBS the layout depends on what fields are configured. */ @@ -2068,12 +2243,14 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, struct pt_regs *regs) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 sample_type = event->attr.sample_type; struct pebs_basic *basic = __pebs; void *next_record = basic + 1; - u64 sample_type, format_group; struct pebs_meminfo *meminfo = NULL; struct pebs_gprs *gprs = NULL; struct x86_perf_regs *perf_regs; + u64 format_group; + u16 retire; if (basic == NULL) return; @@ -2081,31 +2258,17 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, perf_regs = container_of(regs, struct x86_perf_regs, regs); perf_regs->xmm_regs = NULL; - sample_type = event->attr.sample_type; format_group = basic->format_group; - perf_sample_data_init(data, 0, event->hw.last_period); - setup_pebs_time(event, data, basic->tsc); - - /* - * We must however always use iregs for the unwinder to stay sane; the - * record BP,SP,IP can point into thin air when the record is from a - * previous PMI context or an (I)RET happened between the record and - * PMI. - */ - perf_sample_save_callchain(data, event, iregs); + __setup_perf_sample_data(event, iregs, data); *regs = *iregs; - /* The ip in basic is EventingIP */ - set_linear_ip(regs, basic->ip); - regs->flags = PERF_EFLAGS_EXACT; - if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) { - if (x86_pmu.flags & PMU_FL_RETIRE_LATENCY) - data->weight.var3_w = basic->retire_latency; - else - data->weight.var3_w = 0; - } + /* basic group */ + retire = x86_pmu.flags & PMU_FL_RETIRE_LATENCY ? + basic->retire_latency : 0; + __setup_pebs_basic_group(event, regs, data, sample_type, + basic->ip, basic->tsc, retire); /* * The record for MEMINFO is in front of GP @@ -2121,54 +2284,20 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, gprs = next_record; next_record = gprs + 1; - if (event->attr.precise_ip < 2) { - set_linear_ip(regs, gprs->ip); - regs->flags &= ~PERF_EFLAGS_EXACT; - } - - if (sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)) - adaptive_pebs_save_regs(regs, gprs); + __setup_pebs_gpr_group(event, regs, gprs, sample_type); } if (format_group & PEBS_DATACFG_MEMINFO) { - if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { - u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ? - meminfo->cache_latency : meminfo->mem_latency; - - if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) - data->weight.var2_w = meminfo->instr_latency; - - /* - * Although meminfo::latency is defined as a u64, - * only the lower 32 bits include the valid data - * in practice on Ice Lake and earlier platforms. 
- */ - if (sample_type & PERF_SAMPLE_WEIGHT) { - data->weight.full = latency ?: - intel_get_tsx_weight(meminfo->tsx_tuning); - } else { - data->weight.var1_dw = (u32)latency ?: - intel_get_tsx_weight(meminfo->tsx_tuning); - } - - data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; - } - - if (sample_type & PERF_SAMPLE_DATA_SRC) { - data->data_src.val = get_data_src(event, meminfo->aux); - data->sample_flags |= PERF_SAMPLE_DATA_SRC; - } - - if (sample_type & PERF_SAMPLE_ADDR_TYPE) { - data->addr = meminfo->address; - data->sample_flags |= PERF_SAMPLE_ADDR; - } - - if (sample_type & PERF_SAMPLE_TRANSACTION) { - data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning, - gprs ? gprs->ax : 0); - data->sample_flags |= PERF_SAMPLE_TRANSACTION; - } + u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ? + meminfo->cache_latency : meminfo->mem_latency; + u64 instr_latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ? + meminfo->instr_latency : 0; + u64 ax = gprs ? gprs->ax : 0; + + __setup_pebs_meminfo_group(event, data, sample_type, latency, + instr_latency, meminfo->address, + meminfo->aux, meminfo->tsx_tuning, + ax); } if (format_group & PEBS_DATACFG_XMMS) { @@ -2219,6 +2348,135 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, format_group); } +static inline bool arch_pebs_record_continued(struct arch_pebs_header *header) +{ + /* Continue bit or null PEBS record indicates fragment follows. */ + return header->cont || !(header->format & GENMASK_ULL(63, 16)); +} + +static void setup_arch_pebs_sample_data(struct perf_event *event, + struct pt_regs *iregs, + void *__pebs, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 sample_type = event->attr.sample_type; + struct arch_pebs_header *header = NULL; + struct arch_pebs_aux *meminfo = NULL; + struct arch_pebs_gprs *gprs = NULL; + struct x86_perf_regs *perf_regs; + void *next_record; + void *at = __pebs; + + if (at == NULL) + return; + + perf_regs = container_of(regs, struct x86_perf_regs, regs); + perf_regs->xmm_regs = NULL; + + __setup_perf_sample_data(event, iregs, data); + + *regs = *iregs; + +again: + header = at; + next_record = at + sizeof(struct arch_pebs_header); + if (header->basic) { + struct arch_pebs_basic *basic = next_record; + u16 retire = 0; + + next_record = basic + 1; + + if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) + retire = basic->valid ? basic->retire : 0; + __setup_pebs_basic_group(event, regs, data, sample_type, + basic->ip, basic->tsc, retire); + } + + /* + * The record for MEMINFO is in front of GP + * But PERF_SAMPLE_TRANSACTION needs gprs->ax. + * Save the pointer here but process later. + */ + if (header->aux) { + meminfo = next_record; + next_record = meminfo + 1; + } + + if (header->gpr) { + gprs = next_record; + next_record = gprs + 1; + + __setup_pebs_gpr_group(event, regs, + (struct pebs_gprs *)gprs, + sample_type); + } + + if (header->aux) { + u64 ax = gprs ? 
gprs->ax : 0; + + __setup_pebs_meminfo_group(event, data, sample_type, + meminfo->cache_latency, + meminfo->instr_latency, + meminfo->address, meminfo->aux, + meminfo->tsx_tuning, ax); + } + + if (header->xmm) { + struct pebs_xmm *xmm; + + next_record += sizeof(struct arch_pebs_xer_header); + + xmm = next_record; + perf_regs->xmm_regs = xmm->xmm; + next_record = xmm + 1; + } + + if (header->lbr) { + struct arch_pebs_lbr_header *lbr_header = next_record; + struct lbr_entry *lbr; + int num_lbr; + + next_record = lbr_header + 1; + lbr = next_record; + + num_lbr = header->lbr == ARCH_PEBS_LBR_NUM_VAR ? + lbr_header->depth : + header->lbr * ARCH_PEBS_BASE_LBR_ENTRIES; + next_record += num_lbr * sizeof(struct lbr_entry); + + if (has_branch_stack(event)) { + intel_pmu_store_pebs_lbrs(lbr); + intel_pmu_lbr_save_brstack(data, cpuc, event); + } + } + + if (header->cntr) { + struct arch_pebs_cntr_header *cntr = next_record; + unsigned int nr; + + next_record += sizeof(struct arch_pebs_cntr_header); + + if (is_pebs_counter_event_group(event)) { + __setup_pebs_counter_group(cpuc, event, + (struct pebs_cntr_header *)cntr, next_record); + data->sample_flags |= PERF_SAMPLE_READ; + } + + nr = hweight32(cntr->cntr) + hweight32(cntr->fixed); + if (cntr->metrics == INTEL_CNTR_METRICS) + nr += 2; + next_record += nr * sizeof(u64); + } + + /* Parse followed fragments if there are. */ + if (arch_pebs_record_continued(header)) { + at = at + header->size; + goto again; + } +} + static inline void * get_next_pebs_record_by_bit(void *base, void *top, int bit) { @@ -2601,6 +2859,57 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d } } +static __always_inline void +__intel_pmu_handle_pebs_record(struct pt_regs *iregs, + struct pt_regs *regs, + struct perf_sample_data *data, + void *at, u64 pebs_status, + short *counts, void **last, + setup_fn setup_sample) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event; + int bit; + + for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) { + event = cpuc->events[bit]; + + if (WARN_ON_ONCE(!event) || + WARN_ON_ONCE(!event->attr.precise_ip)) + continue; + + if (counts[bit]++) { + __intel_pmu_pebs_event(event, iregs, regs, data, + last[bit], setup_sample); + } + + last[bit] = at; + } +} + +static __always_inline void +__intel_pmu_handle_last_pebs_record(struct pt_regs *iregs, + struct pt_regs *regs, + struct perf_sample_data *data, + u64 mask, short *counts, void **last, + setup_fn setup_sample) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event; + int bit; + + for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) { + if (!counts[bit]) + continue; + + event = cpuc->events[bit]; + + __intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit], + counts[bit], setup_sample); + } + +} + static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data) { short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; @@ -2610,9 +2919,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d struct x86_perf_regs perf_regs; struct pt_regs *regs = &perf_regs.regs; struct pebs_basic *basic; - struct perf_event *event; void *base, *at, *top; - int bit; u64 mask; if (!x86_pmu.pebs_active) @@ -2625,6 +2932,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d mask = hybrid(cpuc->pmu, pebs_events_mask) | (hybrid(cpuc->pmu, fixed_cntr_mask64) << INTEL_PMC_IDX_FIXED); + mask 
&= cpuc->pebs_enabled; if (unlikely(base >= top)) { intel_pmu_pebs_event_update_no_drain(cpuc, mask); @@ -2642,38 +2950,114 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d if (basic->format_size != cpuc->pebs_record_size) continue; - pebs_status = basic->applicable_counters & cpuc->pebs_enabled & mask; - for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) { - event = cpuc->events[bit]; + pebs_status = mask & basic->applicable_counters; + __intel_pmu_handle_pebs_record(iregs, regs, data, at, + pebs_status, counts, last, + setup_pebs_adaptive_sample_data); + } - if (WARN_ON_ONCE(!event) || - WARN_ON_ONCE(!event->attr.precise_ip)) - continue; + __intel_pmu_handle_last_pebs_record(iregs, regs, data, mask, counts, last, + setup_pebs_adaptive_sample_data); +} - if (counts[bit]++) { - __intel_pmu_pebs_event(event, iregs, regs, data, last[bit], - setup_pebs_adaptive_sample_data); - } - last[bit] = at; - } +static void intel_pmu_drain_arch_pebs(struct pt_regs *iregs, + struct perf_sample_data *data) +{ + short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; + void *last[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS]; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union arch_pebs_index index; + struct x86_perf_regs perf_regs; + struct pt_regs *regs = &perf_regs.regs; + void *base, *at, *top; + u64 mask; + + rdmsrq(MSR_IA32_PEBS_INDEX, index.whole); + + if (unlikely(!index.wr)) { + intel_pmu_pebs_event_update_no_drain(cpuc, X86_PMC_IDX_MAX); + return; } - for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) { - if (!counts[bit]) + base = cpuc->pebs_vaddr; + top = cpuc->pebs_vaddr + (index.wr << ARCH_PEBS_INDEX_WR_SHIFT); + + index.wr = 0; + index.full = 0; + index.en = 1; + if (cpuc->n_pebs == cpuc->n_large_pebs) + index.thresh = ARCH_PEBS_THRESH_MULTI; + else + index.thresh = ARCH_PEBS_THRESH_SINGLE; + wrmsrq(MSR_IA32_PEBS_INDEX, index.whole); + + mask = hybrid(cpuc->pmu, arch_pebs_cap).counters & cpuc->pebs_enabled; + + if (!iregs) + iregs = &dummy_iregs; + + /* Process all but the last event for each counter. */ + for (at = base; at < top;) { + struct arch_pebs_header *header; + struct arch_pebs_basic *basic; + u64 pebs_status; + + header = at; + + if (WARN_ON_ONCE(!header->size)) + break; + + /* 1st fragment or single record must have basic group */ + if (!header->basic) { + at += header->size; continue; + } - event = cpuc->events[bit]; + basic = at + sizeof(struct arch_pebs_header); + pebs_status = mask & basic->applicable_counters; + __intel_pmu_handle_pebs_record(iregs, regs, data, at, + pebs_status, counts, last, + setup_arch_pebs_sample_data); + + /* Skip non-last fragments */ + while (arch_pebs_record_continued(header)) { + if (!header->size) + break; + at += header->size; + header = at; + } - __intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit], - counts[bit], setup_pebs_adaptive_sample_data); + /* Skip last fragment or the single record */ + at += header->size; } + + __intel_pmu_handle_last_pebs_record(iregs, regs, data, mask, + counts, last, + setup_arch_pebs_sample_data); +} + +static void __init intel_arch_pebs_init(void) +{ + /* + * Current hybrid platforms always both support arch-PEBS or not + * on all kinds of cores. So directly set x86_pmu.arch_pebs flag + * if boot cpu supports arch-PEBS. 
+ */ + x86_pmu.arch_pebs = 1; + x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE; + x86_pmu.drain_pebs = intel_pmu_drain_arch_pebs; + x86_pmu.pebs_capable = ~0ULL; + x86_pmu.flags |= PMU_FL_PEBS_ALL; + + x86_pmu.pebs_enable = __intel_pmu_pebs_enable; + x86_pmu.pebs_disable = __intel_pmu_pebs_disable; } /* * PEBS probe and setup */ -void __init intel_pebs_init(void) +static void __init intel_ds_pebs_init(void) { /* * No support for 32bit formats @@ -2735,10 +3119,8 @@ void __init intel_pebs_init(void) break; case 6: - if (x86_pmu.intel_cap.pebs_baseline) { + if (x86_pmu.intel_cap.pebs_baseline) x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ; - x86_pmu.late_setup = intel_pmu_late_setup; - } fallthrough; case 5: x86_pmu.pebs_ept = 1; @@ -2788,6 +3170,14 @@ void __init intel_pebs_init(void) } } +void __init intel_pebs_init(void) +{ + if (x86_pmu.intel_cap.pebs_format == 0xf) + intel_arch_pebs_init(); + else + intel_ds_pebs_init(); +} + void perf_restore_debug_store(void) { struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 7aa59966e7c3..72f2adcda7c6 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/kvm_types.h> #include <linux/perf_event.h> #include <linux/types.h> @@ -1705,7 +1706,7 @@ void x86_perf_get_lbr(struct x86_pmu_lbr *lbr) lbr->info = x86_pmu.lbr_info; lbr->has_callstack = x86_pmu_has_lbr_callstack(); } -EXPORT_SYMBOL_GPL(x86_perf_get_lbr); +EXPORT_SYMBOL_FOR_KVM(x86_perf_get_lbr); struct event_constraint vlbr_constraint = __EVENT_CONSTRAINT(INTEL_FIXED_VLBR_EVENT, (1ULL << INTEL_PMC_IDX_FIXED_VLBR), diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index e8cf29d2b10c..44524a387c58 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -17,6 +17,7 @@ #include <linux/limits.h> #include <linux/slab.h> #include <linux/device.h> +#include <linux/kvm_types.h> #include <asm/cpuid/api.h> #include <asm/perf_event.h> @@ -82,13 +83,13 @@ u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability) return (c & cd->mask) >> shift; } -EXPORT_SYMBOL_GPL(intel_pt_validate_cap); +EXPORT_SYMBOL_FOR_KVM(intel_pt_validate_cap); u32 intel_pt_validate_hw_cap(enum pt_capabilities cap) { return intel_pt_validate_cap(pt_pmu.caps, cap); } -EXPORT_SYMBOL_GPL(intel_pt_validate_hw_cap); +EXPORT_SYMBOL_FOR_KVM(intel_pt_validate_hw_cap); static ssize_t pt_cap_show(struct device *cdev, struct device_attribute *attr, @@ -1590,7 +1591,7 @@ void intel_pt_handle_vmx(int on) local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(intel_pt_handle_vmx); +EXPORT_SYMBOL_FOR_KVM(intel_pt_handle_vmx); /* * PMU callbacks diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index a762f7f5b161..e228e564b15e 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1325,8 +1325,6 @@ static void uncore_pci_sub_driver_init(void) continue; pmu = &type->pmus[UNCORE_PCI_DEV_IDX(ids->driver_data)]; - if (!pmu) - continue; if (uncore_pci_get_dev_die_info(pci_sub_dev, &die)) continue; @@ -1895,6 +1893,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_VFM(INTEL_ARROWLAKE_H, &mtl_uncore_init), X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_uncore_init), X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &ptl_uncore_init), + X86_MATCH_VFM(INTEL_WILDCATLAKE_L, &ptl_uncore_init), X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &spr_uncore_init), X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 
&spr_uncore_init), X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &gnr_uncore_init), diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 2b969386dcdd..3161ec0a3416 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -283,8 +283,9 @@ struct cpu_hw_events { * Intel DebugStore bits */ struct debug_store *ds; - void *ds_pebs_vaddr; void *ds_bts_vaddr; + /* DS based PEBS or arch-PEBS buffer address */ + void *pebs_vaddr; u64 pebs_enabled; int n_pebs; int n_large_pebs; @@ -303,6 +304,8 @@ struct cpu_hw_events { /* Intel ACR configuration */ u64 acr_cfg_b[X86_PMC_IDX_MAX]; u64 acr_cfg_c[X86_PMC_IDX_MAX]; + /* Cached CFG_C values */ + u64 cfg_c_val[X86_PMC_IDX_MAX]; /* * Intel LBR bits @@ -708,6 +711,12 @@ enum hybrid_pmu_type { hybrid_big_small_tiny = hybrid_big | hybrid_small_tiny, }; +struct arch_pebs_cap { + u64 caps; + u64 counters; + u64 pdists; +}; + struct x86_hybrid_pmu { struct pmu pmu; const char *name; @@ -752,6 +761,8 @@ struct x86_hybrid_pmu { mid_ack :1, enabled_ack :1; + struct arch_pebs_cap arch_pebs_cap; + u64 pebs_data_source[PERF_PEBS_DATA_SOURCE_MAX]; }; @@ -906,7 +917,7 @@ struct x86_pmu { union perf_capabilities intel_cap; /* - * Intel DebugStore bits + * Intel DebugStore and PEBS bits */ unsigned int bts :1, bts_active :1, @@ -917,7 +928,8 @@ struct x86_pmu { pebs_no_tlb :1, pebs_no_isolation :1, pebs_block :1, - pebs_ept :1; + pebs_ept :1, + arch_pebs :1; int pebs_record_size; int pebs_buffer_size; u64 pebs_events_mask; @@ -930,6 +942,11 @@ struct x86_pmu { u64 pebs_capable; /* + * Intel Architectural PEBS + */ + struct arch_pebs_cap arch_pebs_cap; + + /* * Intel LBR */ unsigned int lbr_tos, lbr_from, lbr_to, @@ -1124,7 +1141,6 @@ static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\ .pmu_type = _pmu, \ } -int is_x86_event(struct perf_event *event); struct pmu *x86_get_pmu(unsigned int cpu); extern struct x86_pmu x86_pmu __read_mostly; @@ -1217,7 +1233,7 @@ int x86_reserve_hardware(void); void x86_release_hardware(void); -int x86_pmu_max_precise(void); +int x86_pmu_max_precise(struct pmu *pmu); void hw_perf_lbr_event_destroy(struct perf_event *event); @@ -1604,6 +1620,14 @@ extern void intel_cpuc_finish(struct cpu_hw_events *cpuc); int intel_pmu_init(void); +int alloc_arch_pebs_buf_on_cpu(int cpu); + +void release_arch_pebs_buf_on_cpu(int cpu); + +void init_arch_pebs_on_cpu(int cpu); + +void fini_arch_pebs_on_cpu(int cpu); + void init_debug_store_on_cpu(int cpu); void fini_debug_store_on_cpu(int cpu); @@ -1760,6 +1784,8 @@ void intel_pmu_pebs_data_source_cmt(void); void intel_pmu_pebs_data_source_lnl(void); +u64 intel_get_arch_pebs_data_config(struct perf_event *event); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); @@ -1792,6 +1818,11 @@ static inline int intel_pmu_max_num_pebs(struct pmu *pmu) return fls((u32)hybrid(pmu, pebs_events_mask)); } +static inline bool intel_pmu_has_pebs(void) +{ + return x86_pmu.ds_pebs || x86_pmu.arch_pebs; +} + #else /* CONFIG_CPU_SUP_INTEL */ static inline void reserve_ds_buffers(void) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 15bc07a5ebb3..b14c045679e1 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -198,6 +198,7 @@ static inline int alternatives_text_reserved(void *start, void *end) #define ALTINSTR_ENTRY(ft_flags) \ ".pushsection .altinstructions,\"a\"\n" \ + ANNOTATE_DATA_SPECIAL \ " .long 771b - .\n" /* label */ \ " .long 774f - .\n" /* 
new instruction */ \ " .4byte " __stringify(ft_flags) "\n" /* feature + flags */ \ @@ -207,6 +208,7 @@ static inline int alternatives_text_reserved(void *start, void *end) #define ALTINSTR_REPLACEMENT(newinstr) /* replacement */ \ ".pushsection .altinstr_replacement, \"ax\"\n" \ + ANNOTATE_DATA_SPECIAL \ "# ALT: replacement\n" \ "774:\n\t" newinstr "\n775:\n" \ ".popsection\n" @@ -337,6 +339,7 @@ void nop_func(void); * instruction. See apply_alternatives(). */ .macro altinstr_entry orig alt ft_flags orig_len alt_len + ANNOTATE_DATA_SPECIAL .long \orig - . .long \alt - . .4byte \ft_flags @@ -365,6 +368,7 @@ void nop_func(void); .popsection ; \ .pushsection .altinstr_replacement,"ax" ; \ 743: \ + ANNOTATE_DATA_SPECIAL ; \ newinst ; \ 744: \ .popsection ; diff --git a/arch/x86/include/asm/amd/node.h b/arch/x86/include/asm/amd/node.h index 23fe617898a8..a672b8765fa8 100644 --- a/arch/x86/include/asm/amd/node.h +++ b/arch/x86/include/asm/amd/node.h @@ -23,7 +23,6 @@ #define AMD_NODE0_PCI_SLOT 0x18 struct pci_dev *amd_node_get_func(u16 node, u8 func); -struct pci_dev *amd_node_get_root(u16 node); static inline u16 amd_num_nodes(void) { diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index d5c8d3afe196..bd62bd87a841 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_ASM_H #define _ASM_X86_ASM_H +#include <linux/annotate.h> + #ifdef __ASSEMBLER__ # define __ASM_FORM(x, ...) x,## __VA_ARGS__ # define __ASM_FORM_RAW(x, ...) x,## __VA_ARGS__ @@ -132,6 +134,7 @@ static __always_inline __pure void *rip_rel_ptr(void *p) # define _ASM_EXTABLE_TYPE(from, to, type) \ .pushsection "__ex_table","a" ; \ .balign 4 ; \ + ANNOTATE_DATA_SPECIAL ; \ .long (from) - . ; \ .long (to) - . ; \ .long type ; \ @@ -179,6 +182,7 @@ static __always_inline __pure void *rip_rel_ptr(void *p) # define _ASM_EXTABLE_TYPE(from, to, type) \ " .pushsection \"__ex_table\",\"a\"\n" \ " .balign 4\n" \ + ANNOTATE_DATA_SPECIAL \ " .long (" #from ") - .\n" \ " .long (" #to ") - .\n" \ " .long " __stringify(type) " \n" \ @@ -187,6 +191,7 @@ static __always_inline __pure void *rip_rel_ptr(void *p) # define _ASM_EXTABLE_TYPE_REG(from, to, type, reg) \ " .pushsection \"__ex_table\",\"a\"\n" \ " .balign 4\n" \ + ANNOTATE_DATA_SPECIAL \ " .long (" #from ") - .\n" \ " .long (" #to ") - .\n" \ DEFINE_EXTABLE_TYPE_REG \ diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 880ca15073ed..ab5bba6cf7f5 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -7,6 +7,11 @@ #include <linux/objtool.h> #include <asm/asm.h> +#ifndef __ASSEMBLY__ +struct bug_entry; +extern void __WARN_trap(struct bug_entry *bug, ...); +#endif + /* * Despite that some emulators terminate on UD2, we use it for WARN(). */ @@ -31,52 +36,77 @@ #define BUG_UD2 0xfffe #define BUG_UD1 0xfffd #define BUG_UD1_UBSAN 0xfffc +#define BUG_UD1_WARN 0xfffb #define BUG_UDB 0xffd6 #define BUG_LOCK 0xfff0 #ifdef CONFIG_GENERIC_BUG -#ifdef CONFIG_X86_32 -# define __BUG_REL(val) ".long " val +#ifdef CONFIG_DEBUG_BUGVERBOSE +#define __BUG_ENTRY_VERBOSE(file, line) \ + "\t.long " file " - .\t# bug_entry::file\n" \ + "\t.word " line "\t# bug_entry::line\n" #else -# define __BUG_REL(val) ".long " val " - ." 
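The __BUG_ENTRY_* macros being introduced here encode every pointer-sized field as a self-relative 32-bit offset (".long sym - ."), which is why the old 32-bit/64-bit __BUG_REL() split above can go away: the same bug_entry layout works in both modes. A minimal sketch of how such an offset is resolved back into an address (illustrative only, essentially what the generic offset_to_ptr() helper does, not the actual bug-table decoder):

	/* Add the signed 32-bit displacement to the address of the field itself. */
	static inline unsigned long resolve_self_relative(const s32 *field)
	{
		return (unsigned long)field + *field;
	}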
+#define __BUG_ENTRY_VERBOSE(file, line) #endif -#ifdef CONFIG_DEBUG_BUGVERBOSE -#define __BUG_ENTRY(file, line, flags) \ - "2:\t" __BUG_REL("1b") "\t# bug_entry::bug_addr\n" \ - "\t" __BUG_REL(file) "\t# bug_entry::file\n" \ - "\t.word " line "\t# bug_entry::line\n" \ - "\t.word " flags "\t# bug_entry::flags\n" +#if defined(CONFIG_X86_64) || defined(CONFIG_DEBUG_BUGVERBOSE_DETAILED) +#define HAVE_ARCH_BUG_FORMAT +#define __BUG_ENTRY_FORMAT(format) \ + "\t.long " format " - .\t# bug_entry::format\n" #else -#define __BUG_ENTRY(file, line, flags) \ - "2:\t" __BUG_REL("1b") "\t# bug_entry::bug_addr\n" \ - "\t.word " flags "\t# bug_entry::flags\n" +#define __BUG_ENTRY_FORMAT(format) +#endif + +#ifdef CONFIG_X86_64 +#define HAVE_ARCH_BUG_FORMAT_ARGS #endif -#define _BUG_FLAGS_ASM(ins, file, line, flags, size, extra) \ - "1:\t" ins "\n" \ - ".pushsection __bug_table,\"aw\"\n" \ - __BUG_ENTRY(file, line, flags) \ +#define __BUG_ENTRY(format, file, line, flags) \ + "\t.long 1b - ." "\t# bug_entry::bug_addr\n" \ + __BUG_ENTRY_FORMAT(format) \ + __BUG_ENTRY_VERBOSE(file, line) \ + "\t.word " flags "\t# bug_entry::flags\n" + +#define _BUG_FLAGS_ASM(format, file, line, flags, size, extra) \ + ".pushsection __bug_table,\"aw\"\n\t" \ + ANNOTATE_DATA_SPECIAL \ + "2:\n\t" \ + __BUG_ENTRY(format, file, line, flags) \ "\t.org 2b + " size "\n" \ ".popsection\n" \ extra -#define _BUG_FLAGS(ins, flags, extra) \ +#ifdef CONFIG_DEBUG_BUGVERBOSE_DETAILED +#define WARN_CONDITION_STR(cond_str) cond_str +#else +#define WARN_CONDITION_STR(cond_str) "" +#endif + +#define _BUG_FLAGS(cond_str, ins, flags, extra) \ do { \ - asm_inline volatile(_BUG_FLAGS_ASM(ins, "%c0", \ - "%c1", "%c2", "%c3", extra) \ - : : "i" (__FILE__), "i" (__LINE__), \ - "i" (flags), \ - "i" (sizeof(struct bug_entry))); \ + asm_inline volatile("1:\t" ins "\n" \ + _BUG_FLAGS_ASM("%c[fmt]", "%c[file]", \ + "%c[line]", "%c[fl]", \ + "%c[size]", extra) \ + : : [fmt] "i" (WARN_CONDITION_STR(cond_str)), \ + [file] "i" (__FILE__), \ + [line] "i" (__LINE__), \ + [fl] "i" (flags), \ + [size] "i" (sizeof(struct bug_entry))); \ } while (0) #define ARCH_WARN_ASM(file, line, flags, size) \ - _BUG_FLAGS_ASM(ASM_UD2, file, line, flags, size, "") + ".pushsection .rodata.str1.1, \"aMS\", @progbits, 1\n" \ + "99:\n" \ + "\t.string \"\"\n" \ + ".popsection\n" \ + "1:\t " ASM_UD2 "\n" \ + _BUG_FLAGS_ASM("99b", file, line, flags, size, "") #else -#define _BUG_FLAGS(ins, flags, extra) asm volatile(ins) +#define _BUG_FLAGS(cond_str, ins, flags, extra) asm volatile(ins) #endif /* CONFIG_GENERIC_BUG */ @@ -84,7 +114,7 @@ do { \ #define BUG() \ do { \ instrumentation_begin(); \ - _BUG_FLAGS(ASM_UD2, 0, ""); \ + _BUG_FLAGS("", ASM_UD2, 0, ""); \ __builtin_unreachable(); \ } while (0) @@ -97,14 +127,69 @@ do { \ #define ARCH_WARN_REACHABLE ANNOTATE_REACHABLE(1b) -#define __WARN_FLAGS(flags) \ -do { \ - __auto_type __flags = BUGFLAG_WARNING|(flags); \ - instrumentation_begin(); \ - _BUG_FLAGS(ASM_UD2, __flags, ARCH_WARN_REACHABLE); \ - instrumentation_end(); \ +#define __WARN_FLAGS(cond_str, flags) \ +do { \ + __auto_type __flags = BUGFLAG_WARNING|(flags); \ + instrumentation_begin(); \ + _BUG_FLAGS(cond_str, ASM_UD2, __flags, ARCH_WARN_REACHABLE); \ + instrumentation_end(); \ } while (0) +#ifdef HAVE_ARCH_BUG_FORMAT_ARGS + +#ifndef __ASSEMBLY__ +#include <linux/static_call_types.h> +DECLARE_STATIC_CALL(WARN_trap, __WARN_trap); + +struct pt_regs; +struct sysv_va_list { /* from AMD64 System V ABI */ + unsigned int gp_offset; + unsigned int fp_offset; + void *overflow_arg_area; 
+ void *reg_save_area; +}; +struct arch_va_list { + unsigned long regs[6]; + struct sysv_va_list args; +}; +extern void *__warn_args(struct arch_va_list *args, struct pt_regs *regs); +#endif /* __ASSEMBLY__ */ + +#define __WARN_bug_entry(flags, format) ({ \ + struct bug_entry *bug; \ + asm_inline volatile("lea (2f)(%%rip), %[addr]\n1:\n" \ + _BUG_FLAGS_ASM("%c[fmt]", "%c[file]", \ + "%c[line]", "%c[fl]", \ + "%c[size]", "") \ + : [addr] "=r" (bug) \ + : [fmt] "i" (format), \ + [file] "i" (__FILE__), \ + [line] "i" (__LINE__), \ + [fl] "i" (flags), \ + [size] "i" (sizeof(struct bug_entry))); \ + bug; }) + +#define __WARN_print_arg(flags, format, arg...) \ +do { \ + int __flags = (flags) | BUGFLAG_WARNING | BUGFLAG_ARGS ; \ + static_call_mod(WARN_trap)(__WARN_bug_entry(__flags, format), ## arg); \ + asm (""); /* inhibit tail-call optimization */ \ +} while (0) + +#define __WARN_printf(taint, fmt, arg...) \ + __WARN_print_arg(BUGFLAG_TAINT(taint), fmt, ## arg) + +#define WARN_ONCE(cond, format, arg...) ({ \ + int __ret_warn_on = !!(cond); \ + if (unlikely(__ret_warn_on)) { \ + __WARN_print_arg(BUGFLAG_ONCE|BUGFLAG_TAINT(TAINT_WARN),\ + format, ## arg); \ + } \ + __ret_warn_on; \ +}) + +#endif /* HAVE_ARCH_BUG_FORMAT_ARGS */ + #include <asm-generic/bug.h> #endif /* _ASM_X86_BUG_H */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 893cbca37fe9..4b1a6ade1700 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -30,7 +30,7 @@ enum cpuid_leafs CPUID_6_EAX, CPUID_8000_000A_EDX, CPUID_7_ECX, - CPUID_8000_0007_EBX, + CPUID_LNX_6, CPUID_7_EDX, CPUID_8000_001F_EAX, CPUID_8000_0021_EAX, @@ -101,6 +101,7 @@ static __always_inline bool _static_cpu_has(u16 bit) asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]") ".pushsection .altinstr_aux,\"ax\"\n" "6:\n" + ANNOTATE_DATA_SPECIAL " testb %[bitnum], %a[cap_byte]\n" " jnz %l[t_yes]\n" " jmp %l[t_no]\n" diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 245cf6b3ec57..d90ce601917c 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -314,6 +314,7 @@ #define X86_FEATURE_SM4 (12*32+ 2) /* SM4 instructions */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* "avx_vnni" AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* "avx512_bf16" AVX512 BFLOAT16 instructions */ +#define X86_FEATURE_LASS (12*32+ 6) /* "lass" Linear Address Space Separation */ #define X86_FEATURE_CMPCCXADD (12*32+ 7) /* CMPccXADD instructions */ #define X86_FEATURE_ARCH_PERFMON_EXT (12*32+ 8) /* Intel Architectural PerfMon Extension */ #define X86_FEATURE_FZRM (12*32+10) /* Fast zero-length REP MOVSB */ @@ -407,9 +408,12 @@ #define X86_FEATURE_ENQCMD (16*32+29) /* "enqcmd" ENQCMD and ENQCMDS instructions */ #define X86_FEATURE_SGX_LC (16*32+30) /* "sgx_lc" Software Guard Extensions Launch Control */ -/* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ +/* + * Linux-defined word for use with scattered/synthetic bits. 
+ */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* "overflow_recov" MCA overflow recovery support */ #define X86_FEATURE_SUCCOR (17*32+ 1) /* "succor" Uncorrectable error containment and recovery */ + #define X86_FEATURE_SMCA (17*32+ 3) /* "smca" Scalable MCA */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ @@ -499,6 +503,9 @@ #define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */ #define X86_FEATURE_ABMC (21*32+15) /* Assignable Bandwidth Monitoring Counters */ #define X86_FEATURE_MSR_IMM (21*32+16) /* MSR immediate form instructions */ +#define X86_FEATURE_SGX_EUPDATESVN (21*32+17) /* Support for ENCLS[EUPDATESVN] instruction */ + +#define X86_FEATURE_SDCIAE (21*32+18) /* L3 Smart Data Cache Injection Allocation Enforcement */ /* * BUG word(s) diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h index 12b34d5b2953..2bb65677c079 100644 --- a/arch/x86/include/asm/fred.h +++ b/arch/x86/include/asm/fred.h @@ -79,7 +79,7 @@ static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int .type = type, .vector = vector, .nmi = type == EVENT_TYPE_NMI, - .lm = 1, + .l = 1, }; asm_fred_entry_from_kvm(ss); diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 93156ac4ffe0..b08c95872eed 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -56,6 +56,11 @@ arch_ftrace_get_regs(struct ftrace_regs *fregs) return &arch_ftrace_regs(fregs)->regs; } +#define arch_ftrace_partial_regs(regs) do { \ + regs->flags &= ~X86_EFLAGS_FIXED; \ + regs->cs = __KERNEL_CS; \ +} while (0) + #define arch_ftrace_fill_perf_regs(fregs, _regs) do { \ (_regs)->ip = arch_ftrace_regs(fregs)->regs.ip; \ (_regs)->sp = arch_ftrace_regs(fregs)->regs.sp; \ diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index 6e2458088800..fe5d9a10d900 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -46,38 +46,31 @@ do { \ } while(0) static __always_inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, - u32 __user *uaddr) + u32 __user *uaddr) { - if (can_do_masked_user_access()) - uaddr = masked_user_access_begin(uaddr); - else if (!user_access_begin(uaddr, sizeof(u32))) - return -EFAULT; - - switch (op) { - case FUTEX_OP_SET: - unsafe_atomic_op1("xchgl %0, %2", oval, uaddr, oparg, Efault); - break; - case FUTEX_OP_ADD: - unsafe_atomic_op1(LOCK_PREFIX "xaddl %0, %2", oval, - uaddr, oparg, Efault); - break; - case FUTEX_OP_OR: - unsafe_atomic_op2("orl %4, %3", oval, uaddr, oparg, Efault); - break; - case FUTEX_OP_ANDN: - unsafe_atomic_op2("andl %4, %3", oval, uaddr, ~oparg, Efault); - break; - case FUTEX_OP_XOR: - unsafe_atomic_op2("xorl %4, %3", oval, uaddr, oparg, Efault); - break; - default: - user_access_end(); - return -ENOSYS; + scoped_user_rw_access(uaddr, Efault) { + switch (op) { + case FUTEX_OP_SET: + unsafe_atomic_op1("xchgl %0, %2", oval, uaddr, oparg, Efault); + break; + case FUTEX_OP_ADD: + unsafe_atomic_op1(LOCK_PREFIX "xaddl %0, %2", oval, uaddr, oparg, Efault); + break; + case FUTEX_OP_OR: + unsafe_atomic_op2("orl %4, %3", oval, uaddr, oparg, Efault); + break; + case FUTEX_OP_ANDN: + unsafe_atomic_op2("andl %4, %3", oval, uaddr, ~oparg, Efault); + break; + case FUTEX_OP_XOR: + unsafe_atomic_op2("xorl %4, %3", oval, uaddr, oparg, Efault); + break; + default: + return -ENOSYS; + } } - user_access_end(); return 0; Efault: - user_access_end(); return -EFAULT; } @@ -86,21 +79,19 @@ static inline int 
futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, { int ret = 0; - if (can_do_masked_user_access()) - uaddr = masked_user_access_begin(uaddr); - else if (!user_access_begin(uaddr, sizeof(u32))) - return -EFAULT; - asm volatile("\n" - "1:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" - "2:\n" - _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %0) \ - : "+r" (ret), "=a" (oldval), "+m" (*uaddr) - : "r" (newval), "1" (oldval) - : "memory" - ); - user_access_end(); - *uval = oldval; + scoped_user_rw_access(uaddr, Efault) { + asm_inline volatile("\n" + "1:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" + "2:\n" + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %0) + : "+r" (ret), "=a" (oldval), "+m" (*uaddr) + : "r" (newval), "1" (oldval) + : "memory"); + *uval = oldval; + } return ret; +Efault: + return -EFAULT; } #endif diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h index 54368a43abf6..4733e9064ee5 100644 --- a/arch/x86/include/asm/insn-eval.h +++ b/arch/x86/include/asm/insn-eval.h @@ -44,4 +44,6 @@ enum insn_mmio_type { enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes); +bool insn_is_nop(struct insn *insn); + #endif /* _ASM_X86_INSN_EVAL_H */ diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 091f88c8254d..846d21c1a7f8 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -312,7 +312,6 @@ static inline int insn_offset_immediate(struct insn *insn) /** * for_each_insn_prefix() -- Iterate prefixes in the instruction * @insn: Pointer to struct insn. - * @idx: Index storage. * @prefix: Prefix byte. * * Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix @@ -321,8 +320,8 @@ static inline int insn_offset_immediate(struct insn *insn) * Since prefixes.nbytes can be bigger than 4 if some prefixes * are repeated, it cannot be used for looping over the prefixes. */ -#define for_each_insn_prefix(insn, idx, prefix) \ - for (idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) +#define for_each_insn_prefix(insn, prefix) \ + for (int idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) #define POP_SS_OPCODE 0x1f #define MOV_SREG_OPCODE 0x8e diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index f32a0eca2ae5..950bfd006905 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -150,12 +150,12 @@ #define INTEL_LUNARLAKE_M IFM(6, 0xBD) /* Lion Cove / Skymont */ -#define INTEL_PANTHERLAKE_L IFM(6, 0xCC) /* Cougar Cove / Crestmont */ +#define INTEL_PANTHERLAKE_L IFM(6, 0xCC) /* Cougar Cove / Darkmont */ #define INTEL_WILDCATLAKE_L IFM(6, 0xD5) -#define INTEL_NOVALAKE IFM(18, 0x01) -#define INTEL_NOVALAKE_L IFM(18, 0x03) +#define INTEL_NOVALAKE IFM(18, 0x01) /* Coyote Cove / Arctic Wolf */ +#define INTEL_NOVALAKE_L IFM(18, 0x03) /* Coyote Cove / Arctic Wolf */ /* "Small Core" Processors (Atom/E-Core) */ diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h index 5dbeac48a5b9..695f87efbeb8 100644 --- a/arch/x86/include/asm/intel_ds.h +++ b/arch/x86/include/asm/intel_ds.h @@ -4,7 +4,15 @@ #include <linux/percpu-defs.h> #define BTS_BUFFER_SIZE (PAGE_SIZE << 4) -#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) +#define PEBS_BUFFER_SHIFT 4 +#define PEBS_BUFFER_SIZE (PAGE_SIZE << PEBS_BUFFER_SHIFT) + +/* + * The largest PEBS record could consume a page, ensure + * a record at least can be written after triggering PMI. 
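As a quick sanity check of the threshold macros defined just below, here is the arithmetic worked through for 4 KiB pages (assuming the threshold field uses the same 16-byte granularity as the arch-PEBS write pointer, which is how the drain path above converts index.wr to a byte offset):

	PEBS_BUFFER_SIZE       = 4096 << 4            = 65536  (64 KiB)
	ARCH_PEBS_THRESH_MULTI = (65536 - 4096) >> 4  =  3840
	3840 << ARCH_PEBS_INDEX_WR_SHIFT              = 61440  (60 KiB)

so the multi-record threshold raises the PMI while at least one full page is still free, leaving room for one more record even at the maximum record size, while ARCH_PEBS_THRESH_SINGLE (1) requests a PMI after the first record.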
+ */ +#define ARCH_PEBS_THRESH_MULTI ((PEBS_BUFFER_SIZE - PAGE_SIZE) >> PEBS_BUFFER_SHIFT) +#define ARCH_PEBS_THRESH_SINGLE 1 /* The maximal number of PEBS events: */ #define MAX_PEBS_EVENTS_FMT4 8 diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 61dd1dee7812..e0a6930a4029 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -15,6 +15,7 @@ #define JUMP_TABLE_ENTRY(key, label) \ ".pushsection __jump_table, \"aw\" \n\t" \ _ASM_ALIGN "\n\t" \ + ANNOTATE_DATA_SPECIAL \ ".long 1b - . \n\t" \ ".long " label " - . \n\t" \ _ASM_PTR " " key " - . \n\t" \ diff --git a/arch/x86/include/asm/kvm_types.h b/arch/x86/include/asm/kvm_types.h index 23268a188e70..d7c704ed1be9 100644 --- a/arch/x86/include/asm/kvm_types.h +++ b/arch/x86/include/asm/kvm_types.h @@ -10,6 +10,11 @@ #define KVM_SUB_MODULES kvm-intel #else #undef KVM_SUB_MODULES +/* + * Don't export symbols for KVM without vendor modules, as kvm.ko is built iff + * at least one vendor module is enabled. + */ +#define EXPORT_SYMBOL_FOR_KVM(symbol) #endif #define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 40 diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 31e3cb550fb3..2d98886de09a 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -48,6 +48,7 @@ /* AMD-specific bits */ #define MCI_STATUS_TCC BIT_ULL(55) /* Task context corrupt */ +#define MCI_STATUS_PADDRV BIT_ULL(54) /* Valid System Physical Address */ #define MCI_STATUS_SYNDV BIT_ULL(53) /* synd reg. valid */ #define MCI_STATUS_DEFERRED BIT_ULL(44) /* uncorrected error, deferred exception */ #define MCI_STATUS_POISON BIT_ULL(43) /* access poisonous data */ @@ -62,6 +63,7 @@ */ #define MCI_CONFIG_MCAX 0x1 #define MCI_CONFIG_FRUTEXT BIT_ULL(9) +#define MCI_CONFIG_PADDRV BIT_ULL(11) #define MCI_IPID_MCATYPE 0xFFFF0000 #define MCI_IPID_HWID 0xFFF @@ -166,6 +168,12 @@ #define MCE_IN_KERNEL_COPYIN BIT_ULL(7) /* + * Indicates that handler should check and clear Deferred error registers + * rather than common ones. + */ +#define MCE_CHECK_DFR_REGS BIT_ULL(8) + +/* * This structure contains all data related to the MCE log. Also * carries a signature to make it easier to find from external * debugging tools. Each entry is only valid when its finished flag @@ -302,6 +310,12 @@ DECLARE_PER_CPU(struct mce, injectm); /* Disable CMCI/polling for MCA bank claimed by firmware */ extern void mce_disable_bank(int bank); +#ifdef CONFIG_X86_MCE_THRESHOLD +void mce_save_apei_thr_limit(u32 thr_limit); +#else +static inline void mce_save_apei_thr_limit(u32 thr_limit) { } +#endif /* CONFIG_X86_MCE_THRESHOLD */ + /* * Exception handler */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 9e1720d73244..3d0a0950d20a 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -166,6 +166,10 @@ * Processor MMIO stale data * vulnerabilities. */ +#define ARCH_CAP_MCU_ENUM BIT(16) /* + * Indicates the presence of microcode update + * feature enumeration and status information. + */ #define ARCH_CAP_FB_CLEAR BIT(17) /* * VERW clears CPU fill buffer * even on MDS_NO CPUs. 
@@ -327,6 +331,26 @@ PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \ PERF_CAP_PEBS_TIMING_INFO) +/* Arch PEBS */ +#define MSR_IA32_PEBS_BASE 0x000003f4 +#define MSR_IA32_PEBS_INDEX 0x000003f5 +#define ARCH_PEBS_OFFSET_MASK 0x7fffff +#define ARCH_PEBS_INDEX_WR_SHIFT 4 + +#define ARCH_PEBS_RELOAD 0xffffffff +#define ARCH_PEBS_CNTR_ALLOW BIT_ULL(35) +#define ARCH_PEBS_CNTR_GP BIT_ULL(36) +#define ARCH_PEBS_CNTR_FIXED BIT_ULL(37) +#define ARCH_PEBS_CNTR_METRICS BIT_ULL(38) +#define ARCH_PEBS_LBR_SHIFT 40 +#define ARCH_PEBS_LBR (0x3ull << ARCH_PEBS_LBR_SHIFT) +#define ARCH_PEBS_VECR_XMM BIT_ULL(49) +#define ARCH_PEBS_GPR BIT_ULL(61) +#define ARCH_PEBS_AUX BIT_ULL(62) +#define ARCH_PEBS_EN BIT_ULL(63) +#define ARCH_PEBS_CNTR_MASK (ARCH_PEBS_CNTR_GP | ARCH_PEBS_CNTR_FIXED | \ + ARCH_PEBS_CNTR_METRICS) + #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) #define RTIT_CTL_CYCLEACC BIT(1) @@ -929,6 +953,10 @@ #define MSR_IA32_APICBASE_BASE (0xfffff<<12) #define MSR_IA32_UCODE_WRITE 0x00000079 + +#define MSR_IA32_MCU_ENUMERATION 0x0000007b +#define MCU_STAGING BIT(4) + #define MSR_IA32_UCODE_REV 0x0000008b /* Intel SGX Launch Enclave Public Key Hash MSRs */ @@ -1226,6 +1254,8 @@ #define MSR_IA32_VMX_VMFUNC 0x00000491 #define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492 +#define MSR_IA32_MCU_STAGING_MBOX_ADDR 0x000007a5 + /* Resctrl MSRs: */ /* - Intel: */ #define MSR_IA32_L3_QOS_CFG 0xc81 diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 015d23f3e01f..2f0e47be79a4 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -9,6 +9,7 @@ #include <asm/alternative.h> #include <linux/kmsan-checks.h> +#include <linux/mmdebug.h> /* duplicated to the one in bootmem.h */ extern unsigned long max_pfn; @@ -31,18 +32,28 @@ static __always_inline unsigned long __phys_addr_nodebug(unsigned long x) #ifdef CONFIG_DEBUG_VIRTUAL extern unsigned long __phys_addr(unsigned long); -extern unsigned long __phys_addr_symbol(unsigned long); #else #define __phys_addr(x) __phys_addr_nodebug(x) -#define __phys_addr_symbol(x) \ - ((unsigned long)(x) - __START_KERNEL_map + phys_base) #endif +static inline unsigned long __phys_addr_symbol(unsigned long x) +{ + unsigned long y = x - __START_KERNEL_map; + + /* only check upper bounds since lower bounds will trigger carry */ + VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE); + + return y + phys_base; +} + #define __phys_reloc_hide(x) (x) void clear_page_orig(void *page); void clear_page_rep(void *page); void clear_page_erms(void *page); +KCFI_REFERENCE(clear_page_orig); +KCFI_REFERENCE(clear_page_rep); +KCFI_REFERENCE(clear_page_erms); static inline void clear_page(void *page) { diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 49a4d442f3fc..7276ba70c88a 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -141,16 +141,16 @@ #define ARCH_PERFMON_EVENTS_COUNT 7 #define PEBS_DATACFG_MEMINFO BIT_ULL(0) -#define PEBS_DATACFG_GP BIT_ULL(1) +#define PEBS_DATACFG_GP BIT_ULL(1) #define PEBS_DATACFG_XMMS BIT_ULL(2) #define PEBS_DATACFG_LBRS BIT_ULL(3) -#define PEBS_DATACFG_LBR_SHIFT 24 #define PEBS_DATACFG_CNTR BIT_ULL(4) +#define PEBS_DATACFG_METRICS BIT_ULL(5) +#define PEBS_DATACFG_LBR_SHIFT 24 #define PEBS_DATACFG_CNTR_SHIFT 32 #define PEBS_DATACFG_CNTR_MASK GENMASK_ULL(15, 0) #define PEBS_DATACFG_FIX_SHIFT 48 #define PEBS_DATACFG_FIX_MASK GENMASK_ULL(7, 0) -#define PEBS_DATACFG_METRICS BIT_ULL(5) /* Steal the highest bit of pebs_data_cfg for SW usage 
*/ #define PEBS_UPDATE_DS_SW BIT_ULL(63) @@ -200,6 +200,8 @@ union cpuid10_edx { #define ARCH_PERFMON_EXT_LEAF 0x00000023 #define ARCH_PERFMON_NUM_COUNTER_LEAF 0x1 #define ARCH_PERFMON_ACR_LEAF 0x2 +#define ARCH_PERFMON_PEBS_CAP_LEAF 0x4 +#define ARCH_PERFMON_PEBS_COUNTER_LEAF 0x5 union cpuid35_eax { struct { @@ -210,7 +212,10 @@ union cpuid35_eax { unsigned int acr_subleaf:1; /* Events Sub-Leaf */ unsigned int events_subleaf:1; - unsigned int reserved:28; + /* arch-PEBS Sub-Leaves */ + unsigned int pebs_caps_subleaf:1; + unsigned int pebs_cnts_subleaf:1; + unsigned int reserved:26; } split; unsigned int full; }; @@ -432,6 +437,8 @@ static inline bool is_topdown_idx(int idx) #define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(GLOBAL_STATUS_LBRS_FROZEN_BIT) #define GLOBAL_STATUS_TRACE_TOPAPMI_BIT 55 #define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(GLOBAL_STATUS_TRACE_TOPAPMI_BIT) +#define GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT 54 +#define GLOBAL_STATUS_ARCH_PEBS_THRESHOLD BIT_ULL(GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT) #define GLOBAL_STATUS_PERF_METRICS_OVF_BIT 48 #define GLOBAL_CTRL_EN_PERF_METRICS BIT_ULL(48) @@ -503,6 +510,107 @@ struct pebs_cntr_header { #define INTEL_CNTR_METRICS 0x3 /* + * Arch PEBS + */ +union arch_pebs_index { + struct { + u64 rsvd:4, + wr:23, + rsvd2:4, + full:1, + en:1, + rsvd3:3, + thresh:23, + rsvd4:5; + }; + u64 whole; +}; + +struct arch_pebs_header { + union { + u64 format; + struct { + u64 size:16, /* Record size */ + rsvd:14, + mode:1, /* 64BIT_MODE */ + cont:1, + rsvd2:3, + cntr:5, + lbr:2, + rsvd3:7, + xmm:1, + ymmh:1, + rsvd4:2, + opmask:1, + zmmh:1, + h16zmm:1, + rsvd5:5, + gpr:1, + aux:1, + basic:1; + }; + }; + u64 rsvd6; +}; + +struct arch_pebs_basic { + u64 ip; + u64 applicable_counters; + u64 tsc; + u64 retire :16, /* Retire Latency */ + valid :1, + rsvd :47; + u64 rsvd2; + u64 rsvd3; +}; + +struct arch_pebs_aux { + u64 address; + u64 rsvd; + u64 rsvd2; + u64 rsvd3; + u64 rsvd4; + u64 aux; + u64 instr_latency :16, + pad2 :16, + cache_latency :16, + pad3 :16; + u64 tsx_tuning; +}; + +struct arch_pebs_gprs { + u64 flags, ip, ax, cx, dx, bx, sp, bp, si, di; + u64 r8, r9, r10, r11, r12, r13, r14, r15, ssp; + u64 rsvd; +}; + +struct arch_pebs_xer_header { + u64 xstate; + u64 rsvd; +}; + +#define ARCH_PEBS_LBR_NAN 0x0 +#define ARCH_PEBS_LBR_NUM_8 0x1 +#define ARCH_PEBS_LBR_NUM_16 0x2 +#define ARCH_PEBS_LBR_NUM_VAR 0x3 +#define ARCH_PEBS_BASE_LBR_ENTRIES 8 +struct arch_pebs_lbr_header { + u64 rsvd; + u64 ctl; + u64 depth; + u64 ler_from; + u64 ler_to; + u64 ler_info; +}; + +struct arch_pebs_cntr_header { + u32 cntr; + u32 fixed; + u32 metrics; + u32 reserved; +}; + +/* * AMD Extended Performance Monitoring and Debug cpuid feature detection */ #define EXT_PERFMON_DEBUG_FEATURES 0x80000022 diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 50f75467f73d..35d062a2e304 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -84,8 +84,8 @@ struct fred_ss { : 4, /* Event was incident to enclave execution */ enclave : 1, - /* CPU was in long mode */ - lm : 1, + /* CPU was in 64-bit mode */ + l : 1, /* * Nested exception during FRED delivery, not set * for #DF. 
@@ -187,12 +187,12 @@ convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); extern void send_sigtrap(struct pt_regs *regs, int error_code, int si_code); -static inline unsigned long regs_return_value(struct pt_regs *regs) +static __always_inline unsigned long regs_return_value(struct pt_regs *regs) { return regs->ax; } -static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) +static __always_inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) { regs->ax = rc; } @@ -277,34 +277,34 @@ static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs) } #endif -static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) +static __always_inline unsigned long kernel_stack_pointer(struct pt_regs *regs) { return regs->sp; } -static inline unsigned long instruction_pointer(struct pt_regs *regs) +static __always_inline unsigned long instruction_pointer(struct pt_regs *regs) { return regs->ip; } -static inline void instruction_pointer_set(struct pt_regs *regs, - unsigned long val) +static __always_inline +void instruction_pointer_set(struct pt_regs *regs, unsigned long val) { regs->ip = val; } -static inline unsigned long frame_pointer(struct pt_regs *regs) +static __always_inline unsigned long frame_pointer(struct pt_regs *regs) { return regs->bp; } -static inline unsigned long user_stack_pointer(struct pt_regs *regs) +static __always_inline unsigned long user_stack_pointer(struct pt_regs *regs) { return regs->sp; } -static inline void user_stack_pointer_set(struct pt_regs *regs, - unsigned long val) +static __always_inline +void user_stack_pointer_set(struct pt_regs *regs, unsigned long val) { regs->sp = val; } diff --git a/arch/x86/include/asm/runtime-const.h b/arch/x86/include/asm/runtime-const.h index 8d983cfd06ea..e5a13dc8816e 100644 --- a/arch/x86/include/asm/runtime-const.h +++ b/arch/x86/include/asm/runtime-const.h @@ -2,6 +2,10 @@ #ifndef _ASM_RUNTIME_CONST_H #define _ASM_RUNTIME_CONST_H +#ifdef MODULE + #error "Cannot use runtime-const infrastructure from modules" +#endif + #ifdef __ASSEMBLY__ .macro RUNTIME_CONST_PTR sym reg diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index 6a0069761508..04958459a7ca 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/** +/* * Copyright(c) 2016-20 Intel Corporation. * * Intel Software Guard Extensions (SGX) support. @@ -28,21 +28,22 @@ #define SGX_CPUID_EPC_MASK GENMASK(3, 0) enum sgx_encls_function { - ECREATE = 0x00, - EADD = 0x01, - EINIT = 0x02, - EREMOVE = 0x03, - EDGBRD = 0x04, - EDGBWR = 0x05, - EEXTEND = 0x06, - ELDU = 0x08, - EBLOCK = 0x09, - EPA = 0x0A, - EWB = 0x0B, - ETRACK = 0x0C, - EAUG = 0x0D, - EMODPR = 0x0E, - EMODT = 0x0F, + ECREATE = 0x00, + EADD = 0x01, + EINIT = 0x02, + EREMOVE = 0x03, + EDGBRD = 0x04, + EDGBWR = 0x05, + EEXTEND = 0x06, + ELDU = 0x08, + EBLOCK = 0x09, + EPA = 0x0A, + EWB = 0x0B, + ETRACK = 0x0C, + EAUG = 0x0D, + EMODPR = 0x0E, + EMODT = 0x0F, + EUPDATESVN = 0x18, }; /** @@ -65,15 +66,19 @@ enum sgx_encls_function { /** * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV - * %SGX_EPC_PAGE_CONFLICT: Page is being written by other ENCLS function. - * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not + * @SGX_EPC_PAGE_CONFLICT: Page is being written by other ENCLS function. + * @SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not * been completed yet. 
- * %SGX_CHILD_PRESENT SECS has child pages present in the EPC. - * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's + * @SGX_CHILD_PRESENT: SECS has child pages present in the EPC. + * @SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's * public key does not match IA32_SGXLEPUBKEYHASH. - * %SGX_PAGE_NOT_MODIFIABLE: The EPC page cannot be modified because it + * @SGX_PAGE_NOT_MODIFIABLE: The EPC page cannot be modified because it * is in the PENDING or MODIFIED state. - * %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received + * @SGX_INSUFFICIENT_ENTROPY: Insufficient entropy in RNG. + * @SGX_NO_UPDATE: EUPDATESVN could not update the CPUSVN because the + * current SVN was not newer than CPUSVN. This is the most + * common error code returned by EUPDATESVN. + * @SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received */ enum sgx_return_code { SGX_EPC_PAGE_CONFLICT = 7, @@ -81,6 +86,8 @@ enum sgx_return_code { SGX_CHILD_PRESENT = 13, SGX_INVALID_EINITTOKEN = 16, SGX_PAGE_NOT_MODIFIABLE = 20, + SGX_INSUFFICIENT_ENTROPY = 29, + SGX_NO_UPDATE = 31, SGX_UNMASKED_EVENT = 128, }; @@ -89,7 +96,7 @@ enum sgx_return_code { /** * enum sgx_miscselect - additional information to an SSA frame - * %SGX_MISC_EXINFO: Report #PF or #GP to the SSA frame. + * @SGX_MISC_EXINFO: Report #PF or #GP to the SSA frame. * * Save State Area (SSA) is a stack inside the enclave used to store processor * state when an exception or interrupt occurs. This enum defines additional @@ -105,17 +112,17 @@ enum sgx_miscselect { #define SGX_SSA_MISC_EXINFO_SIZE 16 /** - * enum sgx_attributes - the attributes field in &struct sgx_secs - * %SGX_ATTR_INIT: Enclave can be entered (is initialized). - * %SGX_ATTR_DEBUG: Allow ENCLS(EDBGRD) and ENCLS(EDBGWR). - * %SGX_ATTR_MODE64BIT: Tell that this a 64-bit enclave. - * %SGX_ATTR_PROVISIONKEY: Allow to use provisioning keys for remote + * enum sgx_attribute - the attributes field in &struct sgx_secs + * @SGX_ATTR_INIT: Enclave can be entered (is initialized). + * @SGX_ATTR_DEBUG: Allow ENCLS(EDBGRD) and ENCLS(EDBGWR). + * @SGX_ATTR_MODE64BIT: Tell that this a 64-bit enclave. + * @SGX_ATTR_PROVISIONKEY: Allow to use provisioning keys for remote * attestation. - * %SGX_ATTR_KSS: Allow to use key separation and sharing (KSS). - * %SGX_ATTR_EINITTOKENKEY: Allow to use token signing key that is used to + * @SGX_ATTR_KSS: Allow to use key separation and sharing (KSS). + * @SGX_ATTR_EINITTOKENKEY: Allow to use token signing key that is used to * sign cryptographic tokens that can be passed to * EINIT as an authorization to run an enclave. - * %SGX_ATTR_ASYNC_EXIT_NOTIFY: Allow enclaves to be notified after an + * @SGX_ATTR_ASYNC_EXIT_NOTIFY: Allow enclaves to be notified after an * asynchronous exit has occurred. */ enum sgx_attribute { @@ -188,7 +195,7 @@ struct sgx_secs { /** * enum sgx_tcs_flags - execution flags for TCS - * %SGX_TCS_DBGOPTIN: If enabled allows single-stepping and breakpoints + * @SGX_TCS_DBGOPTIN: If enabled allows single-stepping and breakpoints * inside an enclave. It is cleared by EADD but can * be set later with EDBGWR. 
*/ @@ -253,11 +260,11 @@ struct sgx_pageinfo { /** * enum sgx_page_type - bits in the SECINFO flags defining the page type - * %SGX_PAGE_TYPE_SECS: a SECS page - * %SGX_PAGE_TYPE_TCS: a TCS page - * %SGX_PAGE_TYPE_REG: a regular page - * %SGX_PAGE_TYPE_VA: a VA page - * %SGX_PAGE_TYPE_TRIM: a page in trimmed state + * @SGX_PAGE_TYPE_SECS: a SECS page + * @SGX_PAGE_TYPE_TCS: a TCS page + * @SGX_PAGE_TYPE_REG: a regular page + * @SGX_PAGE_TYPE_VA: a VA page + * @SGX_PAGE_TYPE_TRIM: a page in trimmed state * * Make sure when making changes to this enum that its values can still fit * in the bitfield within &struct sgx_encl_page @@ -275,14 +282,14 @@ enum sgx_page_type { /** * enum sgx_secinfo_flags - the flags field in &struct sgx_secinfo - * %SGX_SECINFO_R: allow read - * %SGX_SECINFO_W: allow write - * %SGX_SECINFO_X: allow execution - * %SGX_SECINFO_SECS: a SECS page - * %SGX_SECINFO_TCS: a TCS page - * %SGX_SECINFO_REG: a regular page - * %SGX_SECINFO_VA: a VA page - * %SGX_SECINFO_TRIM: a page in trimmed state + * @SGX_SECINFO_R: allow read + * @SGX_SECINFO_W: allow write + * @SGX_SECINFO_X: allow execution + * @SGX_SECINFO_SECS: a SECS page + * @SGX_SECINFO_TCS: a TCS page + * @SGX_SECINFO_REG: a regular page + * @SGX_SECINFO_VA: a VA page + * @SGX_SECINFO_TRIM: a page in trimmed state */ enum sgx_secinfo_flags { SGX_SECINFO_R = BIT(0), diff --git a/arch/x86/include/asm/shared/msr.h b/arch/x86/include/asm/shared/msr.h index 1e6ec10b3a15..a20b1c08c99f 100644 --- a/arch/x86/include/asm/shared/msr.h +++ b/arch/x86/include/asm/shared/msr.h @@ -12,4 +12,19 @@ struct msr { }; }; +/* + * The kernel proper already defines rdmsr()/wrmsr(), but they are not for the + * boot kernel since they rely on tracepoint/exception handling infrastructure + * that's not available here. + */ +static inline void raw_rdmsr(unsigned int reg, struct msr *m) +{ + asm volatile("rdmsr" : "=a" (m->l), "=d" (m->h) : "c" (reg)); +} + +static inline void raw_wrmsr(unsigned int reg, const struct msr *m) +{ + asm volatile("wrmsr" : : "c" (reg), "a"(m->l), "d" (m->h) : "memory"); +} + #endif /* _ASM_X86_SHARED_MSR_H */ diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 4f84d421d1cf..20a3baae9568 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -23,18 +23,55 @@ #else /* __ASSEMBLER__ */ +/* + * The CLAC/STAC instructions toggle the enforcement of + * X86_FEATURE_SMAP along with X86_FEATURE_LASS. + * + * SMAP enforcement is based on the _PAGE_BIT_USER bit in the page + * tables. The kernel is not allowed to touch pages with that bit set + * unless the AC bit is set. + * + * Use stac()/clac() when accessing userspace (_PAGE_USER) mappings, + * regardless of location. + * + * Note: a barrier is implicit in alternative(). + */ + static __always_inline void clac(void) { - /* Note: a barrier is implicit in alternative() */ alternative("", "clac", X86_FEATURE_SMAP); } static __always_inline void stac(void) { - /* Note: a barrier is implicit in alternative() */ alternative("", "stac", X86_FEATURE_SMAP); } +/* + * LASS enforcement is based on bit 63 of the virtual address. The + * kernel is not allowed to touch memory in the lower half of the + * virtual address space. + * + * Use lass_stac()/lass_clac() to toggle the AC bit for kernel data + * accesses (!_PAGE_USER) that are blocked by LASS, but not by SMAP. + * + * Even with the AC bit set, LASS will continue to block instruction + * fetches from the user half of the address space. 
To allow those, + * clear CR4.LASS to disable the LASS mechanism entirely. + * + * Note: a barrier is implicit in alternative(). + */ + +static __always_inline void lass_clac(void) +{ + alternative("", "clac", X86_FEATURE_LASS); +} + +static __always_inline void lass_stac(void) +{ + alternative("", "stac", X86_FEATURE_LASS); +} + static __always_inline unsigned long smap_save(void) { unsigned long flags; diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 22bfebe6776d..84951572ab81 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -109,7 +109,7 @@ int common_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_kick_ap(unsigned int cpu, struct task_struct *tidle); int native_cpu_disable(void); void __noreturn hlt_play_dead(void); -void native_play_dead(void); +void __noreturn native_play_dead(void); void play_dead_common(void); void wbinvd_on_cpu(int cpu); void wbinvd_on_all_cpus(void); diff --git a/arch/x86/include/asm/string.h b/arch/x86/include/asm/string.h index c3c2c1914d65..9cb5aae7fba9 100644 --- a/arch/x86/include/asm/string.h +++ b/arch/x86/include/asm/string.h @@ -1,6 +1,32 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_STRING_H +#define _ASM_X86_STRING_H + #ifdef CONFIG_X86_32 # include <asm/string_32.h> #else # include <asm/string_64.h> #endif + +static __always_inline void *__inline_memcpy(void *to, const void *from, size_t len) +{ + void *ret = to; + + asm volatile("rep movsb" + : "+D" (to), "+S" (from), "+c" (len) + : : "memory"); + return ret; +} + +static __always_inline void *__inline_memset(void *s, int v, size_t n) +{ + void *ret = s; + + asm volatile("rep stosb" + : "+D" (s), "+c" (n) + : "a" ((uint8_t)v) + : "memory"); + return ret; +} + +#endif /* _ASM_X86_STRING_H */ diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 17f6c3fedeee..0581c477d466 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -701,5 +701,6 @@ DEFINE_GHCB_ACCESSORS(sw_exit_info_1) DEFINE_GHCB_ACCESSORS(sw_exit_info_2) DEFINE_GHCB_ACCESSORS(sw_scratch) DEFINE_GHCB_ACCESSORS(xcr0) +DEFINE_GHCB_ACCESSORS(xss) #endif diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 21041898157a..1fadf0cf520c 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -218,6 +218,12 @@ static inline unsigned int topology_amd_nodes_per_pkg(void) return __amd_nodes_per_pkg; } +#else /* CONFIG_SMP */ +static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } +static inline int topology_max_smt_threads(void) { return 1; } +static inline unsigned int topology_amd_nodes_per_pkg(void) { return 1; } +#endif /* !CONFIG_SMP */ + extern struct cpumask __cpu_primary_thread_mask; #define cpu_primary_thread_mask ((const struct cpumask *)&__cpu_primary_thread_mask) @@ -241,12 +247,6 @@ static inline bool topology_is_core_online(unsigned int cpu) } #define topology_is_core_online topology_is_core_online -#else /* CONFIG_SMP */ -static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } -static inline int topology_max_smt_threads(void) { return 1; } -static inline unsigned int topology_amd_nodes_per_pkg(void) { return 1; } -#endif /* !CONFIG_SMP */ - static inline void arch_fix_phys_package_id(int num, u32 slot) { } @@ -325,4 +325,6 @@ static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled extern void arch_scale_freq_tick(void); #define arch_scale_freq_tick arch_scale_freq_tick +extern int 
arch_sched_node_distance(int from, int to); + #endif /* _ASM_X86_TOPOLOGY_H */ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 91a3fb8ae7ff..367297b188c3 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -528,18 +528,18 @@ static __must_check __always_inline bool user_access_begin(const void __user *pt #define user_access_save() smap_save() #define user_access_restore(x) smap_restore(x) -#define unsafe_put_user(x, ptr, label) \ +#define arch_unsafe_put_user(x, ptr, label) \ __put_user_size((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), label) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT -#define unsafe_get_user(x, ptr, err_label) \ +#define arch_unsafe_get_user(x, ptr, err_label) \ do { \ __inttype(*(ptr)) __gu_val; \ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), err_label); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ } while (0) #else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT -#define unsafe_get_user(x, ptr, err_label) \ +#define arch_unsafe_get_user(x, ptr, err_label) \ do { \ int __gu_err; \ __inttype(*(ptr)) __gu_val; \ @@ -618,11 +618,11 @@ do { \ } while (0) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT -#define __get_kernel_nofault(dst, src, type, err_label) \ +#define arch_get_kernel_nofault(dst, src, type, err_label) \ __get_user_size(*((type *)(dst)), (__force type __user *)(src), \ sizeof(type), err_label) #else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT -#define __get_kernel_nofault(dst, src, type, err_label) \ +#define arch_get_kernel_nofault(dst, src, type, err_label) \ do { \ int __kr_err; \ \ @@ -633,7 +633,7 @@ do { \ } while (0) #endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT -#define __put_kernel_nofault(dst, src, type, err_label) \ +#define arch_put_kernel_nofault(dst, src, type, err_label) \ __put_user_size(*((type *)(src)), (__force type __user *)(dst), \ sizeof(type), err_label) diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index c8a5ae35c871..641f45c22f9d 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -12,12 +12,12 @@ #include <asm/cpufeatures.h> #include <asm/page.h> #include <asm/percpu.h> -#include <asm/runtime-const.h> -/* - * Virtual variable: there's no actual backing store for this, - * it can purely be used as 'runtime_const_ptr(USER_PTR_MAX)' - */ +#ifdef MODULE + #define runtime_const_ptr(sym) (sym) +#else + #include <asm/runtime-const.h> +#endif extern unsigned long USER_PTR_MAX; #ifdef CONFIG_ADDRESS_MASKING diff --git a/arch/x86/include/asm/unwind_user.h b/arch/x86/include/asm/unwind_user.h new file mode 100644 index 000000000000..12064284bc4e --- /dev/null +++ b/arch/x86/include/asm/unwind_user.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_UNWIND_USER_H +#define _ASM_X86_UNWIND_USER_H + +#ifdef CONFIG_HAVE_UNWIND_USER_FP + +#include <asm/ptrace.h> +#include <asm/uprobes.h> + +#define ARCH_INIT_USER_FP_FRAME(ws) \ + .cfa_off = 2*(ws), \ + .ra_off = -1*(ws), \ + .fp_off = -2*(ws), \ + .use_fp = true, + +#define ARCH_INIT_USER_FP_ENTRY_FRAME(ws) \ + .cfa_off = 1*(ws), \ + .ra_off = -1*(ws), \ + .fp_off = 0, \ + .use_fp = false, + +static inline int unwind_user_word_size(struct pt_regs *regs) +{ + /* We can't unwind VM86 stacks */ + if (regs->flags & X86_VM_MASK) + return 0; +#ifdef CONFIG_X86_64 + if (!user_64bit_mode(regs)) + return sizeof(int); +#endif + return sizeof(long); +} + +static inline bool unwind_user_at_function_start(struct pt_regs *regs) +{ + return is_uprobe_at_func_entry(regs); +} + +#endif /* 
CONFIG_HAVE_UNWIND_USER_FP */ + +#endif /* _ASM_X86_UNWIND_USER_H */ diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 1ee2e5115955..362210c79998 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -62,4 +62,13 @@ struct arch_uprobe_task { unsigned int saved_tf; }; +#ifdef CONFIG_UPROBES +extern bool is_uprobe_at_func_entry(struct pt_regs *regs); +#else +static bool is_uprobe_at_func_entry(struct pt_regs *regs) +{ + return false; +} +#endif /* CONFIG_UPROBES */ + #endif /* _ASM_UPROBES_H */ diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index f1a4adc78272..81d0c8bf1137 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -136,6 +136,8 @@ #define X86_CR4_PKE _BITUL(X86_CR4_PKE_BIT) #define X86_CR4_CET_BIT 23 /* enable Control-flow Enforcement Technology */ #define X86_CR4_CET _BITUL(X86_CR4_CET_BIT) +#define X86_CR4_LASS_BIT 27 /* enable Linear Address Space Separation support */ +#define X86_CR4_LASS _BITUL(X86_CR4_LASS_BIT) #define X86_CR4_LAM_SUP_BIT 28 /* LAM for supervisor pointers */ #define X86_CR4_LAM_SUP _BITUL(X86_CR4_LAM_SUP_BIT) diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h index 2dd35bbdc822..3c4d52072189 100644 --- a/arch/x86/include/uapi/asm/sgx.h +++ b/arch/x86/include/uapi/asm/sgx.h @@ -10,7 +10,7 @@ /** * enum sgx_page_flags - page control flags - * %SGX_PAGE_MEASURE: Measure the page contents with a sequence of + * @SGX_PAGE_MEASURE: Measure the page contents with a sequence of * ENCLS[EEXTEND] operations. */ enum sgx_page_flags { @@ -143,6 +143,12 @@ struct sgx_enclave_run; /** * typedef sgx_enclave_user_handler_t - Exit handler function accepted by * __vdso_sgx_enter_enclave() + * @rdi: RDI at the time of EEXIT, undefined on AEX + * @rsi: RSI at the time of EEXIT, undefined on AEX + * @rdx: RDX at the time of EEXIT, undefined on AEX + * @rsp: RSP (untrusted) at the time of EEXIT or AEX + * @r8: R8 at the time of EEXIT, undefined on AEX + * @r9: R9 at the time of EEXIT, undefined on AEX * @run: The run instance given by the caller * * The register parameters contain the snapshot of their values at enclave @@ -166,7 +172,7 @@ typedef int (*sgx_enclave_user_handler_t)(long rdi, long rsi, long rdx, * @exception_addr: The address that triggered the exception * @user_handler: User provided callback run on exception * @user_data: Data passed to the user handler - * @reserved Reserved for future extensions + * @reserved: Reserved for future extensions * * If @user_handler is provided, the handler will be invoked on all return paths * of the normal flow. The user handler may transfer control, e.g. 
via a diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 9792e329343e..1baa86dfe029 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -93,6 +93,7 @@ #define EXIT_REASON_TPAUSE 68 #define EXIT_REASON_BUS_LOCK 74 #define EXIT_REASON_NOTIFY 75 +#define EXIT_REASON_SEAMCALL 76 #define EXIT_REASON_TDCALL 77 #define EXIT_REASON_MSR_READ_IMM 84 #define EXIT_REASON_MSR_WRITE_IMM 85 diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c index 0916f00a992e..e21419e686eb 100644 --- a/arch/x86/kernel/acpi/apei.c +++ b/arch/x86/kernel/acpi/apei.c @@ -19,6 +19,8 @@ int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data) if (!cmc->enabled) return 0; + mce_save_apei_thr_limit(cmc->notify.error_threshold_value); + /* * We expect HEST to provide a list of MC banks that report errors * in firmware first mode. Otherwise, return non-zero value to diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index 7047124490f6..d7c8ef1e354d 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -196,7 +196,7 @@ int amd_detect_prefcore(bool *detected) break; } - for_each_present_cpu(cpu) { + for_each_online_cpu(cpu) { u32 tmp; int ret; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 8ee5ff547357..74f4c659f9c9 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -9,6 +9,7 @@ #include <asm/text-patching.h> #include <asm/insn.h> +#include <asm/insn-eval.h> #include <asm/ibt.h> #include <asm/set_memory.h> #include <asm/nmi.h> @@ -346,25 +347,6 @@ static void add_nop(u8 *buf, unsigned int len) } /* - * Matches NOP and NOPL, not any of the other possible NOPs. - */ -static bool insn_is_nop(struct insn *insn) -{ - /* Anything NOP, but no REP NOP */ - if (insn->opcode.bytes[0] == 0x90 && - (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3)) - return true; - - /* NOPL */ - if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F) - return true; - - /* TODO: more nops */ - - return false; -} - -/* * Find the offset of the first non-NOP instruction starting at @offset * but no further than @len. */ @@ -559,7 +541,7 @@ EXPORT_SYMBOL(BUG_func); * Rewrite the "call BUG_func" replacement to point to the target of the * indirect pv_ops call "call *disp(%ip)". */ -static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) +static unsigned int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) { void *target, *bug = &BUG_func; s32 disp; @@ -643,7 +625,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, * order. */ for (a = start; a < end; a++) { - int insn_buff_sz = 0; + unsigned int insn_buff_sz = 0; /* * In case of nested ALTERNATIVE()s the outer alternative might @@ -683,11 +665,8 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, memcpy(insn_buff, replacement, a->replacementlen); insn_buff_sz = a->replacementlen; - if (a->flags & ALT_FLAG_DIRECT_CALL) { + if (a->flags & ALT_FLAG_DIRECT_CALL) insn_buff_sz = alt_replace_call(instr, insn_buff, a); - if (insn_buff_sz < 0) - continue; - } for (; insn_buff_sz < a->instrlen; insn_buff_sz++) insn_buff[insn_buff_sz] = 0x90; @@ -2244,21 +2223,34 @@ int alternatives_text_reserved(void *start, void *end) * See entry_{32,64}.S for more details. */ -/* - * We define the int3_magic() function in assembly to control the calling - * convention such that we can 'call' it from assembly. 
- */ +extern void int3_selftest_asm(unsigned int *ptr); -extern void int3_magic(unsigned int *ptr); /* defined in asm */ +asm ( +" .pushsection .init.text, \"ax\", @progbits\n" +" .type int3_selftest_asm, @function\n" +"int3_selftest_asm:\n" + ANNOTATE_NOENDBR + /* + * INT3 padded with NOP to CALL_INSN_SIZE. The INT3 triggers an + * exception, then the int3_exception_nb notifier emulates a call to + * int3_selftest_callee(). + */ +" int3; nop; nop; nop; nop\n" + ASM_RET +" .size int3_selftest_asm, . - int3_selftest_asm\n" +" .popsection\n" +); + +extern void int3_selftest_callee(unsigned int *ptr); asm ( " .pushsection .init.text, \"ax\", @progbits\n" -" .type int3_magic, @function\n" -"int3_magic:\n" +" .type int3_selftest_callee, @function\n" +"int3_selftest_callee:\n" ANNOTATE_NOENDBR -" movl $1, (%" _ASM_ARG1 ")\n" +" movl $0x1234, (%" _ASM_ARG1 ")\n" ASM_RET -" .size int3_magic, .-int3_magic\n" +" .size int3_selftest_callee, . - int3_selftest_callee\n" " .popsection\n" ); @@ -2267,7 +2259,7 @@ extern void int3_selftest_ip(void); /* defined in asm below */ static int __init int3_exception_notify(struct notifier_block *self, unsigned long val, void *data) { - unsigned long selftest = (unsigned long)&int3_selftest_ip; + unsigned long selftest = (unsigned long)&int3_selftest_asm; struct die_args *args = data; struct pt_regs *regs = args->regs; @@ -2282,7 +2274,7 @@ int3_exception_notify(struct notifier_block *self, unsigned long val, void *data if (regs->ip - INT3_INSN_SIZE != selftest) return NOTIFY_DONE; - int3_emulate_call(regs, (unsigned long)&int3_magic); + int3_emulate_call(regs, (unsigned long)&int3_selftest_callee); return NOTIFY_STOP; } @@ -2298,19 +2290,11 @@ static noinline void __init int3_selftest(void) BUG_ON(register_die_notifier(&int3_exception_nb)); /* - * Basically: int3_magic(&val); but really complicated :-) - * - * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb - * notifier above will emulate CALL for us. + * Basically: int3_selftest_callee(&val); but really complicated :-) */ - asm volatile ("int3_selftest_ip:\n\t" - ANNOTATE_NOENDBR - " int3; nop; nop; nop; nop\n\t" - : ASM_CALL_CONSTRAINT - : __ASM_SEL_RAW(a, D) (&val) - : "memory"); + int3_selftest_asm(&val); - BUG_ON(val != 1); + BUG_ON(val != 0x1234); unregister_die_notifier(&int3_exception_nb); } @@ -2469,16 +2453,30 @@ void __init_or_module text_poke_early(void *addr, const void *opcode, __ro_after_init struct mm_struct *text_poke_mm; __ro_after_init unsigned long text_poke_mm_addr; +/* + * Text poking creates and uses a mapping in the lower half of the + * address space. Relax LASS enforcement when accessing the poking + * address. + * + * objtool enforces a strict policy of "no function calls within AC=1 + * regions". Adhere to the policy by using inline versions of + * memcpy()/memset() that will never result in a function call. 
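+ *
+ * The inline helpers referred to here are the __inline_memcpy() and
+ * __inline_memset() wrappers added to <asm/string.h> earlier in this
+ * diff: both expand to a bare "rep movsb"/"rep stosb" sequence, so no
+ * out-of-line call can slip in between lass_stac() and lass_clac().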
+ */ + static void text_poke_memcpy(void *dst, const void *src, size_t len) { - memcpy(dst, src, len); + lass_stac(); + __inline_memcpy(dst, src, len); + lass_clac(); } static void text_poke_memset(void *dst, const void *src, size_t len) { int c = *(const int *)src; - memset(dst, c, len); + lass_stac(); + __inline_memset(dst, c, len); + lass_clac(); } typedef void text_poke_f(void *dst, const void *src, size_t len); diff --git a/arch/x86/kernel/amd_node.c b/arch/x86/kernel/amd_node.c index a40176b62eb5..3d0a4768d603 100644 --- a/arch/x86/kernel/amd_node.c +++ b/arch/x86/kernel/amd_node.c @@ -34,62 +34,6 @@ struct pci_dev *amd_node_get_func(u16 node, u8 func) return pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(AMD_NODE0_PCI_SLOT + node, func)); } -#define DF_BLK_INST_CNT 0x040 -#define DF_CFG_ADDR_CNTL_LEGACY 0x084 -#define DF_CFG_ADDR_CNTL_DF4 0xC04 - -#define DF_MAJOR_REVISION GENMASK(27, 24) - -static u16 get_cfg_addr_cntl_offset(struct pci_dev *df_f0) -{ - u32 reg; - - /* - * Revision fields added for DF4 and later. - * - * Major revision of '0' is found pre-DF4. Field is Read-as-Zero. - */ - if (pci_read_config_dword(df_f0, DF_BLK_INST_CNT, ®)) - return 0; - - if (reg & DF_MAJOR_REVISION) - return DF_CFG_ADDR_CNTL_DF4; - - return DF_CFG_ADDR_CNTL_LEGACY; -} - -struct pci_dev *amd_node_get_root(u16 node) -{ - struct pci_dev *root; - u16 cntl_off; - u8 bus; - - if (!cpu_feature_enabled(X86_FEATURE_ZEN)) - return NULL; - - /* - * D18F0xXXX [Config Address Control] (DF::CfgAddressCntl) - * Bits [7:0] (SecBusNum) holds the bus number of the root device for - * this Data Fabric instance. The segment, device, and function will be 0. - */ - struct pci_dev *df_f0 __free(pci_dev_put) = amd_node_get_func(node, 0); - if (!df_f0) - return NULL; - - cntl_off = get_cfg_addr_cntl_offset(df_f0); - if (!cntl_off) - return NULL; - - if (pci_read_config_byte(df_f0, cntl_off, &bus)) - return NULL; - - /* Grab the pointer for the actual root device instance. */ - root = pci_get_domain_bus_and_slot(0, bus, 0); - - pci_dbg(root, "is root for AMD node %u\n", node); - return root; -} - static struct pci_dev **amd_roots; /* Protect the PCI config register pairs used for SMN. */ @@ -274,51 +218,21 @@ DEFINE_SHOW_STORE_ATTRIBUTE(smn_node); DEFINE_SHOW_STORE_ATTRIBUTE(smn_address); DEFINE_SHOW_STORE_ATTRIBUTE(smn_value); -static int amd_cache_roots(void) -{ - u16 node, num_nodes = amd_num_nodes(); - - amd_roots = kcalloc(num_nodes, sizeof(*amd_roots), GFP_KERNEL); - if (!amd_roots) - return -ENOMEM; - - for (node = 0; node < num_nodes; node++) - amd_roots[node] = amd_node_get_root(node); - - return 0; -} - -static int reserve_root_config_spaces(void) +static struct pci_dev *get_next_root(struct pci_dev *root) { - struct pci_dev *root = NULL; - struct pci_bus *bus = NULL; - - while ((bus = pci_find_next_bus(bus))) { - /* Root device is Device 0 Function 0 on each Primary Bus. */ - root = pci_get_slot(bus, 0); - if (!root) + while ((root = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, root))) { + /* Root device is Device 0 Function 0. */ + if (root->devfn) continue; if (root->vendor != PCI_VENDOR_ID_AMD && root->vendor != PCI_VENDOR_ID_HYGON) continue; - pci_dbg(root, "Reserving PCI config space\n"); - - /* - * There are a few SMN index/data pairs and other registers - * that shouldn't be accessed by user space. - * So reserve the entire PCI config space for simplicity rather - * than covering specific registers piecemeal. 
- */ - if (!pci_request_config_region_exclusive(root, 0, PCI_CFG_SPACE_SIZE, NULL)) { - pci_err(root, "Failed to reserve config space\n"); - return -EEXIST; - } + break; } - smn_exclusive = true; - return 0; + return root; } static bool enable_dfs; @@ -332,7 +246,8 @@ __setup("amd_smn_debugfs_enable", amd_smn_enable_dfs); static int __init amd_smn_init(void) { - int err; + u16 count, num_roots, roots_per_node, node, num_nodes; + struct pci_dev *root; if (!cpu_feature_enabled(X86_FEATURE_ZEN)) return 0; @@ -342,13 +257,48 @@ static int __init amd_smn_init(void) if (amd_roots) return 0; - err = amd_cache_roots(); - if (err) - return err; + num_roots = 0; + root = NULL; + while ((root = get_next_root(root))) { + pci_dbg(root, "Reserving PCI config space\n"); - err = reserve_root_config_spaces(); - if (err) - return err; + /* + * There are a few SMN index/data pairs and other registers + * that shouldn't be accessed by user space. So reserve the + * entire PCI config space for simplicity rather than covering + * specific registers piecemeal. + */ + if (!pci_request_config_region_exclusive(root, 0, PCI_CFG_SPACE_SIZE, NULL)) { + pci_err(root, "Failed to reserve config space\n"); + return -EEXIST; + } + + num_roots++; + } + + pr_debug("Found %d AMD root devices\n", num_roots); + + if (!num_roots) + return -ENODEV; + + num_nodes = amd_num_nodes(); + amd_roots = kcalloc(num_nodes, sizeof(*amd_roots), GFP_KERNEL); + if (!amd_roots) + return -ENOMEM; + + roots_per_node = num_roots / num_nodes; + + count = 0; + node = 0; + root = NULL; + while (node < num_nodes && (root = get_next_root(root))) { + /* Use one root for each node and skip the rest. */ + if (count++ % roots_per_node) + continue; + + pci_dbg(root, "is root for AMD node %u\n", node); + amd_roots[node++] = root; + } if (enable_dfs) { debugfs_dir = debugfs_create_dir("amd_smn", arch_debugfs_dir); @@ -358,6 +308,8 @@ static int __init amd_smn_init(void) debugfs_create_file("value", 0600, debugfs_dir, NULL, &smn_value_fops); } + smn_exclusive = true; + return 0; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 680d305589a3..9c29e12b84e5 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -36,6 +36,7 @@ #include <linux/dmi.h> #include <linux/smp.h> #include <linux/mm.h> +#include <linux/kvm_types.h> #include <xen/xen.h> @@ -173,6 +174,7 @@ static struct resource lapic_resource = { .flags = IORESOURCE_MEM | IORESOURCE_BUSY, }; +/* Measured in ticks per HZ. */ unsigned int lapic_timer_period = 0; static void apic_pm_activate(void); @@ -792,6 +794,7 @@ static int __init calibrate_APIC_clock(void) { struct clock_event_device *levt = this_cpu_ptr(&lapic_events); u64 tsc_perj = 0, tsc_start = 0; + long delta_tsc_khz, bus_khz; unsigned long jif_start; unsigned long deltaj; long delta, deltatsc; @@ -894,14 +897,15 @@ static int __init calibrate_APIC_clock(void) apic_pr_verbose("..... calibration result: %u\n", lapic_timer_period); if (boot_cpu_has(X86_FEATURE_TSC)) { - apic_pr_verbose("..... CPU clock speed is %ld.%04ld MHz.\n", - (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ), - (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ)); + delta_tsc_khz = (deltatsc * HZ) / (1000 * LAPIC_CAL_LOOPS); + + apic_pr_verbose("..... CPU clock speed is %ld.%03ld MHz.\n", + delta_tsc_khz / 1000, delta_tsc_khz % 1000); } - apic_pr_verbose("..... 
host bus clock speed is %u.%04u MHz.\n", - lapic_timer_period / (1000000 / HZ), - lapic_timer_period % (1000000 / HZ)); + bus_khz = (long)lapic_timer_period * HZ / 1000; + apic_pr_verbose("..... host bus clock speed is %ld.%03ld MHz.\n", + bus_khz / 1000, bus_khz % 1000); /* * Do a sanity check on the APIC calibration result @@ -2316,7 +2320,7 @@ u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid) dest |= msg->arch_addr_hi.destid_8_31 << 8; return dest; } -EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid); +EXPORT_SYMBOL_FOR_KVM(x86_msi_msg_get_destid); static void __init apic_bsp_up_setup(void) { diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c index 9ef3be866832..2ed3b5c88c7f 100644 --- a/arch/x86/kernel/apic/apic_common.c +++ b/arch/x86/kernel/apic/apic_common.c @@ -4,6 +4,7 @@ * SPDX-License-Identifier: GPL-2.0 */ #include <linux/irq.h> +#include <linux/kvm_types.h> #include <asm/apic.h> #include "local.h" @@ -25,7 +26,7 @@ u32 default_cpu_present_to_apicid(int mps_cpu) else return BAD_APICID; } -EXPORT_SYMBOL_GPL(default_cpu_present_to_apicid); +EXPORT_SYMBOL_FOR_KVM(default_cpu_present_to_apicid); /* * Set up the logical destination ID when the APIC operates in logical diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5ba2feb2c04c..1e0442e867b1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2864,7 +2864,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, ioapic = mp_irqdomain_ioapic_idx(domain); pin = info->ioapic.pin; - if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0) + if (irq_resolve_mapping(domain, (irq_hw_number_t)pin)) return -EEXIST; data = kzalloc(sizeof(*data), GFP_KERNEL); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 5398db4dedb4..bc94ff1e250a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -3,7 +3,7 @@ #include <linux/bitops.h> #include <linux/elf.h> #include <linux/mm.h> - +#include <linux/kvm_types.h> #include <linux/io.h> #include <linux/sched.h> #include <linux/sched/clock.h> @@ -516,7 +516,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) setup_force_cpu_cap(X86_FEATURE_ZEN5); break; case 0x50 ... 0x5f: - case 0x90 ... 0xaf: + case 0x80 ... 0xaf: case 0xc0 ... 0xcf: setup_force_cpu_cap(X86_FEATURE_ZEN6); break; @@ -1035,8 +1035,26 @@ static void init_amd_zen4(struct cpuinfo_x86 *c) } } +static const struct x86_cpu_id zen5_rdseed_microcode[] = { + ZEN_MODEL_STEP_UCODE(0x1a, 0x02, 0x1, 0x0b00215a), + ZEN_MODEL_STEP_UCODE(0x1a, 0x08, 0x1, 0x0b008121), + ZEN_MODEL_STEP_UCODE(0x1a, 0x11, 0x0, 0x0b101054), + ZEN_MODEL_STEP_UCODE(0x1a, 0x24, 0x0, 0x0b204037), + ZEN_MODEL_STEP_UCODE(0x1a, 0x44, 0x0, 0x0b404035), + ZEN_MODEL_STEP_UCODE(0x1a, 0x44, 0x1, 0x0b404108), + ZEN_MODEL_STEP_UCODE(0x1a, 0x60, 0x0, 0x0b600037), + ZEN_MODEL_STEP_UCODE(0x1a, 0x68, 0x0, 0x0b608038), + ZEN_MODEL_STEP_UCODE(0x1a, 0x70, 0x0, 0x0b700037), + {}, +}; + static void init_amd_zen5(struct cpuinfo_x86 *c) { + if (!x86_match_min_microcode_rev(zen5_rdseed_microcode)) { + clear_cpu_cap(c, X86_FEATURE_RDSEED); + msr_clear_bit(MSR_AMD64_CPUID_FN_7, 18); + pr_emerg_once("RDSEED32 is broken. 
Disabling the corresponding CPUID bit.\n"); + } } static void init_amd(struct cpuinfo_x86 *c) @@ -1300,7 +1318,7 @@ unsigned long amd_get_dr_addr_mask(unsigned int dr) return per_cpu(amd_dr_addr_mask[dr], smp_processor_id()); } -EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask); +EXPORT_SYMBOL_FOR_KVM(amd_get_dr_addr_mask); static void zenbleed_check_cpu(void *unused) { @@ -1355,11 +1373,23 @@ static __init int print_s5_reset_status_mmio(void) return 0; value = ioread32(addr); - iounmap(addr); /* Value with "all bits set" is an error response and should be ignored. */ - if (value == U32_MAX) + if (value == U32_MAX) { + iounmap(addr); return 0; + } + + /* + * Clear all reason bits so they won't be retained if the next reset + * does not update the register. Besides, some bits are never cleared by + * hardware so it's software's responsibility to clear them. + * + * Writing the value back effectively clears all reason bits as they are + * write-1-to-clear. + */ + iowrite32(value, addr); + iounmap(addr); for (i = 0; i < ARRAY_SIZE(s5_reset_reason_txt); i++) { if (!(value & BIT(i))) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 6a526ae1fe99..d8660770dc6a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -16,6 +16,7 @@ #include <linux/sched/smt.h> #include <linux/pgtable.h> #include <linux/bpf.h> +#include <linux/kvm_types.h> #include <asm/spec-ctrl.h> #include <asm/cmdline.h> @@ -53,56 +54,8 @@ * mitigation option. */ -static void __init spectre_v1_select_mitigation(void); -static void __init spectre_v1_apply_mitigation(void); -static void __init spectre_v2_select_mitigation(void); -static void __init spectre_v2_update_mitigation(void); -static void __init spectre_v2_apply_mitigation(void); -static void __init retbleed_select_mitigation(void); -static void __init retbleed_update_mitigation(void); -static void __init retbleed_apply_mitigation(void); -static void __init spectre_v2_user_select_mitigation(void); -static void __init spectre_v2_user_update_mitigation(void); -static void __init spectre_v2_user_apply_mitigation(void); -static void __init ssb_select_mitigation(void); -static void __init ssb_apply_mitigation(void); -static void __init l1tf_select_mitigation(void); -static void __init l1tf_apply_mitigation(void); -static void __init mds_select_mitigation(void); -static void __init mds_update_mitigation(void); -static void __init mds_apply_mitigation(void); -static void __init taa_select_mitigation(void); -static void __init taa_update_mitigation(void); -static void __init taa_apply_mitigation(void); -static void __init mmio_select_mitigation(void); -static void __init mmio_update_mitigation(void); -static void __init mmio_apply_mitigation(void); -static void __init rfds_select_mitigation(void); -static void __init rfds_update_mitigation(void); -static void __init rfds_apply_mitigation(void); -static void __init srbds_select_mitigation(void); -static void __init srbds_apply_mitigation(void); -static void __init l1d_flush_select_mitigation(void); -static void __init srso_select_mitigation(void); -static void __init srso_update_mitigation(void); -static void __init srso_apply_mitigation(void); -static void __init gds_select_mitigation(void); -static void __init gds_apply_mitigation(void); -static void __init bhi_select_mitigation(void); -static void __init bhi_update_mitigation(void); -static void __init bhi_apply_mitigation(void); -static void __init its_select_mitigation(void); -static void __init its_update_mitigation(void); -static void 
__init its_apply_mitigation(void); -static void __init tsa_select_mitigation(void); -static void __init tsa_apply_mitigation(void); -static void __init vmscape_select_mitigation(void); -static void __init vmscape_update_mitigation(void); -static void __init vmscape_apply_mitigation(void); - /* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; -EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); /* The current value of the SPEC_CTRL MSR with task-specific bits set */ DEFINE_PER_CPU(u64, x86_spec_ctrl_current); @@ -179,7 +132,7 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); /* Control IBPB on vCPU load */ DEFINE_STATIC_KEY_FALSE(switch_vcpu_ibpb); -EXPORT_SYMBOL_GPL(switch_vcpu_ibpb); +EXPORT_SYMBOL_FOR_KVM(switch_vcpu_ibpb); /* Control CPU buffer clear before idling (halt, mwait) */ DEFINE_STATIC_KEY_FALSE(cpu_buf_idle_clear); @@ -198,7 +151,7 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); * mitigation is required. */ DEFINE_STATIC_KEY_FALSE(cpu_buf_vm_clear); -EXPORT_SYMBOL_GPL(cpu_buf_vm_clear); +EXPORT_SYMBOL_FOR_KVM(cpu_buf_vm_clear); #undef pr_fmt #define pr_fmt(fmt) "mitigations: " fmt @@ -233,99 +186,6 @@ static void __init cpu_print_attack_vectors(void) } } -void __init cpu_select_mitigations(void) -{ - /* - * Read the SPEC_CTRL MSR to account for reserved bits which may - * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD - * init code as it is not enumerated and depends on the family. - */ - if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) { - rdmsrq(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); - - /* - * Previously running kernel (kexec), may have some controls - * turned ON. Clear them and let the mitigations setup below - * rediscover them based on configuration. - */ - x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK; - } - - x86_arch_cap_msr = x86_read_arch_cap_msr(); - - cpu_print_attack_vectors(); - - /* Select the proper CPU mitigations before patching alternatives: */ - spectre_v1_select_mitigation(); - spectre_v2_select_mitigation(); - retbleed_select_mitigation(); - spectre_v2_user_select_mitigation(); - ssb_select_mitigation(); - l1tf_select_mitigation(); - mds_select_mitigation(); - taa_select_mitigation(); - mmio_select_mitigation(); - rfds_select_mitigation(); - srbds_select_mitigation(); - l1d_flush_select_mitigation(); - srso_select_mitigation(); - gds_select_mitigation(); - its_select_mitigation(); - bhi_select_mitigation(); - tsa_select_mitigation(); - vmscape_select_mitigation(); - - /* - * After mitigations are selected, some may need to update their - * choices. - */ - spectre_v2_update_mitigation(); - /* - * retbleed_update_mitigation() relies on the state set by - * spectre_v2_update_mitigation(); specifically it wants to know about - * spectre_v2=ibrs. - */ - retbleed_update_mitigation(); - /* - * its_update_mitigation() depends on spectre_v2_update_mitigation() - * and retbleed_update_mitigation(). - */ - its_update_mitigation(); - - /* - * spectre_v2_user_update_mitigation() depends on - * retbleed_update_mitigation(), specifically the STIBP - * selection is forced for UNRET or IBPB. - */ - spectre_v2_user_update_mitigation(); - mds_update_mitigation(); - taa_update_mitigation(); - mmio_update_mitigation(); - rfds_update_mitigation(); - bhi_update_mitigation(); - /* srso_update_mitigation() depends on retbleed_update_mitigation(). 
*/ - srso_update_mitigation(); - vmscape_update_mitigation(); - - spectre_v1_apply_mitigation(); - spectre_v2_apply_mitigation(); - retbleed_apply_mitigation(); - spectre_v2_user_apply_mitigation(); - ssb_apply_mitigation(); - l1tf_apply_mitigation(); - mds_apply_mitigation(); - taa_apply_mitigation(); - mmio_apply_mitigation(); - rfds_apply_mitigation(); - srbds_apply_mitigation(); - srso_apply_mitigation(); - gds_apply_mitigation(); - its_apply_mitigation(); - bhi_apply_mitigation(); - tsa_apply_mitigation(); - vmscape_apply_mitigation(); -} - /* * NOTE: This function is *only* called for SVM, since Intel uses * MSR_IA32_SPEC_CTRL for SSBD. @@ -366,7 +226,7 @@ x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool setguest) speculation_ctrl_update(tif); } } -EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl); +EXPORT_SYMBOL_FOR_KVM(x86_virt_spec_ctrl); static void x86_amd_ssb_disable(void) { @@ -1032,7 +892,7 @@ bool gds_ucode_mitigated(void) return (gds_mitigation == GDS_MITIGATION_FULL || gds_mitigation == GDS_MITIGATION_FULL_LOCKED); } -EXPORT_SYMBOL_GPL(gds_ucode_mitigated); +EXPORT_SYMBOL_FOR_KVM(gds_ucode_mitigated); void update_gds_msr(void) { @@ -1463,7 +1323,9 @@ static void __init retbleed_update_mitigation(void) break; default: if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) { - pr_err(RETBLEED_INTEL_MSG); + if (retbleed_mitigation != RETBLEED_MITIGATION_NONE) + pr_err(RETBLEED_INTEL_MSG); + retbleed_mitigation = RETBLEED_MITIGATION_NONE; } } @@ -1825,13 +1687,6 @@ void unpriv_ebpf_notify(int new_state) } #endif -static inline bool match_option(const char *arg, int arglen, const char *opt) -{ - int len = strlen(opt); - - return len == arglen && !strncmp(arg, opt, len); -} - /* The kernel command line selection for spectre v2 */ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_NONE, @@ -2864,7 +2719,7 @@ void x86_spec_ctrl_setup_ap(void) } bool itlb_multihit_kvm_mitigation; -EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); +EXPORT_SYMBOL_FOR_KVM(itlb_multihit_kvm_mitigation); #undef pr_fmt #define pr_fmt(fmt) "L1TF: " fmt @@ -2872,11 +2727,9 @@ EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); /* Default mitigation for L1TF-affected CPUs */ enum l1tf_mitigations l1tf_mitigation __ro_after_init = IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_AUTO : L1TF_MITIGATION_OFF; -#if IS_ENABLED(CONFIG_KVM_INTEL) -EXPORT_SYMBOL_GPL(l1tf_mitigation); -#endif +EXPORT_SYMBOL_FOR_KVM(l1tf_mitigation); enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; -EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation); +EXPORT_SYMBOL_FOR_KVM(l1tf_vmx_mitigation); /* * These CPUs all support 44bits physical address space internally in the @@ -3376,6 +3229,99 @@ void cpu_bugs_smt_update(void) mutex_unlock(&spec_ctrl_mutex); } +void __init cpu_select_mitigations(void) +{ + /* + * Read the SPEC_CTRL MSR to account for reserved bits which may + * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD + * init code as it is not enumerated and depends on the family. + */ + if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) { + rdmsrq(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + + /* + * Previously running kernel (kexec), may have some controls + * turned ON. Clear them and let the mitigations setup below + * rediscover them based on configuration. 
+ */ + x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK; + } + + x86_arch_cap_msr = x86_read_arch_cap_msr(); + + cpu_print_attack_vectors(); + + /* Select the proper CPU mitigations before patching alternatives: */ + spectre_v1_select_mitigation(); + spectre_v2_select_mitigation(); + retbleed_select_mitigation(); + spectre_v2_user_select_mitigation(); + ssb_select_mitigation(); + l1tf_select_mitigation(); + mds_select_mitigation(); + taa_select_mitigation(); + mmio_select_mitigation(); + rfds_select_mitigation(); + srbds_select_mitigation(); + l1d_flush_select_mitigation(); + srso_select_mitigation(); + gds_select_mitigation(); + its_select_mitigation(); + bhi_select_mitigation(); + tsa_select_mitigation(); + vmscape_select_mitigation(); + + /* + * After mitigations are selected, some may need to update their + * choices. + */ + spectre_v2_update_mitigation(); + /* + * retbleed_update_mitigation() relies on the state set by + * spectre_v2_update_mitigation(); specifically it wants to know about + * spectre_v2=ibrs. + */ + retbleed_update_mitigation(); + /* + * its_update_mitigation() depends on spectre_v2_update_mitigation() + * and retbleed_update_mitigation(). + */ + its_update_mitigation(); + + /* + * spectre_v2_user_update_mitigation() depends on + * retbleed_update_mitigation(), specifically the STIBP + * selection is forced for UNRET or IBPB. + */ + spectre_v2_user_update_mitigation(); + mds_update_mitigation(); + taa_update_mitigation(); + mmio_update_mitigation(); + rfds_update_mitigation(); + bhi_update_mitigation(); + /* srso_update_mitigation() depends on retbleed_update_mitigation(). */ + srso_update_mitigation(); + vmscape_update_mitigation(); + + spectre_v1_apply_mitigation(); + spectre_v2_apply_mitigation(); + retbleed_apply_mitigation(); + spectre_v2_user_apply_mitigation(); + ssb_apply_mitigation(); + l1tf_apply_mitigation(); + mds_apply_mitigation(); + taa_apply_mitigation(); + mmio_apply_mitigation(); + rfds_apply_mitigation(); + srbds_apply_mitigation(); + srso_apply_mitigation(); + gds_apply_mitigation(); + its_apply_mitigation(); + bhi_apply_mitigation(); + tsa_apply_mitigation(); + vmscape_apply_mitigation(); +} + #ifdef CONFIG_SYSFS #define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion" diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c index 981f8b1f0792..dbc99a47be45 100644 --- a/arch/x86/kernel/cpu/bus_lock.c +++ b/arch/x86/kernel/cpu/bus_lock.c @@ -6,6 +6,7 @@ #include <linux/workqueue.h> #include <linux/delay.h> #include <linux/cpuhotplug.h> +#include <linux/kvm_types.h> #include <asm/cpu_device_id.h> #include <asm/cmdline.h> #include <asm/traps.h> @@ -289,7 +290,7 @@ bool handle_guest_split_lock(unsigned long ip) force_sig_fault(SIGBUS, BUS_ADRALN, NULL); return false; } -EXPORT_SYMBOL_GPL(handle_guest_split_lock); +EXPORT_SYMBOL_FOR_KVM(handle_guest_split_lock); void bus_lock_init(void) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c7d3512914ca..e7ab22fce3b5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -7,6 +7,7 @@ #include <linux/bitops.h> #include <linux/kernel.h> #include <linux/export.h> +#include <linux/kvm_types.h> #include <linux/percpu.h> #include <linux/string.h> #include <linux/ctype.h> @@ -78,6 +79,10 @@ DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); +/* Used for modules: built-in code uses runtime constants */ +unsigned long USER_PTR_MAX; +EXPORT_SYMBOL(USER_PTR_MAX); + u32 elf_hwcap2 __read_mostly; /* Number 
of siblings per CPU package */ @@ -401,6 +406,28 @@ out: cr4_clear_bits(X86_CR4_UMIP); } +static __always_inline void setup_lass(struct cpuinfo_x86 *c) +{ + if (!cpu_feature_enabled(X86_FEATURE_LASS)) + return; + + /* + * Legacy vsyscall page access causes a #GP when LASS is active. + * Disable LASS because the #GP handler doesn't support vsyscall + * emulation. + * + * Also disable LASS when running under EFI, as some runtime and + * boot services rely on 1:1 mappings in the lower half. + */ + if (IS_ENABLED(CONFIG_X86_VSYSCALL_EMULATION) || + IS_ENABLED(CONFIG_EFI)) { + setup_clear_cpu_cap(X86_FEATURE_LASS); + return; + } + + cr4_set_bits(X86_CR4_LASS); +} + /* These bits should not change their value after CPU init is finished. */ static const unsigned long cr4_pinned_mask = X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | X86_CR4_FSGSBASE | X86_CR4_CET | X86_CR4_FRED; @@ -460,14 +487,14 @@ void cr4_update_irqsoff(unsigned long set, unsigned long clear) __write_cr4(newval); } } -EXPORT_SYMBOL(cr4_update_irqsoff); +EXPORT_SYMBOL_FOR_KVM(cr4_update_irqsoff); /* Read the CR4 shadow. */ unsigned long cr4_read_shadow(void) { return this_cpu_read(cpu_tlbstate.cr4); } -EXPORT_SYMBOL_GPL(cr4_read_shadow); +EXPORT_SYMBOL_FOR_KVM(cr4_read_shadow); void cr4_init(void) { @@ -722,7 +749,7 @@ void load_direct_gdt(int cpu) gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); } -EXPORT_SYMBOL_GPL(load_direct_gdt); +EXPORT_SYMBOL_FOR_KVM(load_direct_gdt); /* Load a fixmap remapping of the per-cpu GDT */ void load_fixmap_gdt(int cpu) @@ -1021,12 +1048,8 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_8000_0001_EDX] = edx; } - if (c->extended_cpuid_level >= 0x80000007) { - cpuid(0x80000007, &eax, &ebx, &ecx, &edx); - - c->x86_capability[CPUID_8000_0007_EBX] = ebx; - c->x86_power = edx; - } + if (c->extended_cpuid_level >= 0x80000007) + c->x86_power = cpuid_edx(0x80000007); if (c->extended_cpuid_level >= 0x80000008) { cpuid(0x80000008, &eax, &ebx, &ecx, &edx); @@ -2011,10 +2034,10 @@ static void identify_cpu(struct cpuinfo_x86 *c) /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); - /* Set up SMEP/SMAP/UMIP */ setup_smep(c); setup_smap(c); setup_umip(c); + setup_lass(c); /* Enable FSGSBASE instructions if available. 
*/ if (cpu_has(c, X86_FEATURE_FSGSBASE)) { @@ -2579,7 +2602,7 @@ void __init arch_cpu_finalize_init(void) alternative_instructions(); if (IS_ENABLED(CONFIG_X86_64)) { - unsigned long USER_PTR_MAX = TASK_SIZE_MAX; + USER_PTR_MAX = TASK_SIZE_MAX; /* * Enable this when LAM is gated on LASS support diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index bc38b2d56f26..5c7a3a71191a 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -42,15 +42,6 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[], *const __x86_cpu_dev_end[]; #ifdef CONFIG_CPU_SUP_INTEL -enum tsx_ctrl_states { - TSX_CTRL_ENABLE, - TSX_CTRL_DISABLE, - TSX_CTRL_RTM_ALWAYS_ABORT, - TSX_CTRL_NOT_SUPPORTED, -}; - -extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; - extern void __init tsx_init(void); void tsx_ap_init(void); void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 46efcbd6afa4..146f6f8b0650 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -72,6 +72,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_TOTAL }, { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_LOCAL }, + { X86_FEATURE_SDCIAE, X86_FEATURE_CAT_L3 }, { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW }, { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, @@ -79,6 +80,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_SGX_LC, X86_FEATURE_SGX }, { X86_FEATURE_SGX1, X86_FEATURE_SGX }, { X86_FEATURE_SGX2, X86_FEATURE_SGX1 }, + { X86_FEATURE_SGX_EUPDATESVN, X86_FEATURE_SGX1 }, { X86_FEATURE_SGX_EDECCSSA, X86_FEATURE_SGX1 }, { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, @@ -89,6 +91,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES }, { X86_FEATURE_FRED, X86_FEATURE_LKGS }, { X86_FEATURE_SPEC_CTRL_SSBD, X86_FEATURE_SPEC_CTRL }, + { X86_FEATURE_LASS, X86_FEATURE_SMAP }, {} }; diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index d6906442f49b..3f1dda355307 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -43,9 +43,6 @@ /* Deferred error settings */ #define MSR_CU_DEF_ERR 0xC0000410 #define MASK_DEF_LVTOFF 0x000000F0 -#define MASK_DEF_INT_TYPE 0x00000006 -#define DEF_LVT_OFF 0x2 -#define DEF_INT_TYPE_APIC 0x2 /* Scalable MCA: */ @@ -54,6 +51,17 @@ static bool thresholding_irq_en; +struct mce_amd_cpu_data { + mce_banks_t thr_intr_banks; + mce_banks_t dfr_intr_banks; + + u32 thr_intr_en: 1, + dfr_intr_en: 1, + __resv: 30; +}; + +static DEFINE_PER_CPU_READ_MOSTLY(struct mce_amd_cpu_data, mce_amd_data); + static const char * const th_names[] = { "load_store", "insn_fetch", @@ -79,6 +87,8 @@ struct smca_bank { const struct smca_hwid *hwid; u32 id; /* Value of MCA_IPID[InstanceId]. */ u8 sysfs_id; /* Value used for sysfs name. 
*/ + u64 paddrv :1, /* Physical Address Valid bit in MCA_CONFIG */ + __reserved :63; }; static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks); @@ -264,6 +274,7 @@ void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; static void smca_configure(unsigned int bank, unsigned int cpu) { + struct mce_amd_cpu_data *data = this_cpu_ptr(&mce_amd_data); u8 *bank_counts = this_cpu_ptr(smca_bank_counts); const struct smca_hwid *s_hwid; unsigned int i, hwid_mcatype; @@ -294,11 +305,33 @@ static void smca_configure(unsigned int bank, unsigned int cpu) * APIC based interrupt. First, check that no interrupt has been * set. */ - if ((low & BIT(5)) && !((high >> 5) & 0x3)) + if ((low & BIT(5)) && !((high >> 5) & 0x3) && data->dfr_intr_en) { + __set_bit(bank, data->dfr_intr_banks); high |= BIT(5); + } + + /* + * SMCA Corrected Error Interrupt + * + * MCA_CONFIG[IntPresent] is bit 10, and tells us if the bank can + * send an MCA Thresholding interrupt without the OS initializing + * this feature. This can be used if the threshold limit is managed + * by the platform. + * + * MCA_CONFIG[IntEn] is bit 40 (8 in the high portion of the MSR). + * The OS should set this to inform the platform that the OS is ready + * to handle the MCA Thresholding interrupt. + */ + if ((low & BIT(10)) && data->thr_intr_en) { + __set_bit(bank, data->thr_intr_banks); + high |= BIT(8); + } this_cpu_ptr(mce_banks_array)[bank].lsb_in_status = !!(low & BIT(8)); + if (low & MCI_CONFIG_PADDRV) + this_cpu_ptr(smca_banks)[bank].paddrv = 1; + wrmsr(smca_config, low, high); } @@ -368,6 +401,14 @@ static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) { int msr = (hi & MASK_LVTOFF_HI) >> 20; + /* + * On SMCA CPUs, LVT offset is programmed at a different MSR, and + * the BIOS provides the value. The original field where LVT offset + * was set is reserved. Return early here: + */ + if (mce_flags.smca) + return false; + if (apic < 0) { pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt " "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, @@ -376,14 +417,6 @@ static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) } if (apic != msr) { - /* - * On SMCA CPUs, LVT offset is programmed at a different MSR, and - * the BIOS provides the value. The original field where LVT offset - * was set is reserved. Return early here: - */ - if (mce_flags.smca) - return false; - pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, apic, b->bank, b->block, b->address, hi, lo); @@ -443,6 +476,36 @@ static void threshold_restart_block(void *_tr) wrmsr(tr->b->address, lo, hi); } +static void threshold_restart_bank(unsigned int bank, bool intr_en) +{ + struct threshold_bank **thr_banks = this_cpu_read(threshold_banks); + struct threshold_block *block, *tmp; + struct thresh_restart tr; + + if (!thr_banks || !thr_banks[bank]) + return; + + memset(&tr, 0, sizeof(tr)); + + list_for_each_entry_safe(block, tmp, &thr_banks[bank]->miscj, miscj) { + tr.b = block; + tr.b->interrupt_enable = intr_en; + threshold_restart_block(&tr); + } +} + +/* Try to use the threshold limit reported through APEI. */ +static u16 get_thr_limit(void) +{ + u32 thr_limit = mce_get_apei_thr_limit(); + + /* Fallback to old default if APEI limit is not available. 
*/ + if (!thr_limit) + return THRESHOLD_MAX; + + return min(thr_limit, THRESHOLD_MAX); +} + static void mce_threshold_block_init(struct threshold_block *b, int offset) { struct thresh_restart tr = { @@ -451,7 +514,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset) .lvt_off = offset, }; - b->threshold_limit = THRESHOLD_MAX; + b->threshold_limit = get_thr_limit(); threshold_restart_block(&tr); }; @@ -464,41 +527,6 @@ static int setup_APIC_mce_threshold(int reserved, int new) return reserved; } -static int setup_APIC_deferred_error(int reserved, int new) -{ - if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR, - APIC_EILVT_MSG_FIX, 0)) - return new; - - return reserved; -} - -static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) -{ - u32 low = 0, high = 0; - int def_offset = -1, def_new; - - if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high)) - return; - - def_new = (low & MASK_DEF_LVTOFF) >> 4; - if (!(low & MASK_DEF_LVTOFF)) { - pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n"); - def_new = DEF_LVT_OFF; - low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4); - } - - def_offset = setup_APIC_deferred_error(def_offset, def_new); - if ((def_offset == def_new) && - (deferred_error_int_vector != amd_deferred_error_interrupt)) - deferred_error_int_vector = amd_deferred_error_interrupt; - - if (!mce_flags.smca) - low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC; - - wrmsr(MSR_CU_DEF_ERR, low, high); -} - static u32 get_block_address(u32 current_addr, u32 low, u32 high, unsigned int bank, unsigned int block, unsigned int cpu) @@ -534,12 +562,10 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high, return addr; } -static int -prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, - int offset, u32 misc_high) +static int prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, + int offset, u32 misc_high) { unsigned int cpu = smp_processor_id(); - u32 smca_low, smca_high; struct threshold_block b; int new; @@ -556,20 +582,13 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, if (!b.interrupt_capable) goto done; + __set_bit(bank, this_cpu_ptr(&mce_amd_data)->thr_intr_banks); b.interrupt_enable = 1; - if (!mce_flags.smca) { - new = (misc_high & MASK_LVTOFF_HI) >> 20; - goto set_offset; - } - - /* Gather LVT offset for thresholding: */ - if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) - goto out; - - new = (smca_low & SMCA_THR_LVT_OFF) >> 12; + if (mce_flags.smca) + goto done; -set_offset: + new = (misc_high & MASK_LVTOFF_HI) >> 20; offset = setup_APIC_mce_threshold(offset, new); if (offset == new) thresholding_irq_en = true; @@ -577,7 +596,6 @@ set_offset: done: mce_threshold_block_init(&b, offset); -out: return offset; } @@ -668,6 +686,32 @@ static void amd_apply_cpu_quirks(struct cpuinfo_x86 *c) mce_banks[0].ctl = 0; } +/* + * Enable the APIC LVT interrupt vectors once per-CPU. This should be done before hardware is + * ready to send interrupts. + * + * Individual error sources are enabled later during per-bank init. 
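+ *
+ * Specifically, smca_configure() only sets the per-bank MCA_CONFIG
+ * interrupt-enable bits (deferred error and thresholding) when the
+ * corresponding thr_intr_en/dfr_intr_en flag has been set up here.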
+ */ +static void smca_enable_interrupt_vectors(void) +{ + struct mce_amd_cpu_data *data = this_cpu_ptr(&mce_amd_data); + u64 mca_intr_cfg, offset; + + if (!mce_flags.smca || !mce_flags.succor) + return; + + if (rdmsrq_safe(MSR_CU_DEF_ERR, &mca_intr_cfg)) + return; + + offset = (mca_intr_cfg & SMCA_THR_LVT_OFF) >> 12; + if (!setup_APIC_eilvt(offset, THRESHOLD_APIC_VECTOR, APIC_EILVT_MSG_FIX, 0)) + data->thr_intr_en = 1; + + offset = (mca_intr_cfg & MASK_DEF_LVTOFF) >> 4; + if (!setup_APIC_eilvt(offset, DEFERRED_ERROR_VECTOR, APIC_EILVT_MSG_FIX, 0)) + data->dfr_intr_en = 1; +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { @@ -679,10 +723,16 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) mce_flags.amd_threshold = 1; + smca_enable_interrupt_vectors(); + for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { - if (mce_flags.smca) + if (mce_flags.smca) { smca_configure(bank, cpu); + if (!this_cpu_ptr(&mce_amd_data)->thr_intr_en) + continue; + } + disable_err_thresholding(c, bank); for (block = 0; block < NR_BLOCKS; ++block) { @@ -703,9 +753,6 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) offset = prepare_threshold_block(bank, block, address, offset, high); } } - - if (mce_flags.succor) - deferred_error_interrupt_enable(c); } void smca_bsp_init(void) @@ -748,9 +795,9 @@ bool amd_mce_is_memory_error(struct mce *m) } /* - * AMD systems do not have an explicit indicator that the value in MCA_ADDR is - * a system physical address. Therefore, individual cases need to be detected. - * Future cases and checks will be added as needed. + * Some AMD systems have an explicit indicator that the value in MCA_ADDR is a + * system physical address. Individual cases though, need to be detected for + * other systems. Future cases will be added as needed. * * 1) General case * a) Assume address is not usable. @@ -764,6 +811,8 @@ bool amd_mce_is_memory_error(struct mce *m) * a) Reported in legacy bank 4 with extended error code (XEC) 8. * b) MCA_STATUS[43] is *not* defined as poison in legacy bank 4. Therefore, * this bit should not be checked. + * 4) MCI_STATUS_PADDRVAL is set + * a) Will provide a valid system physical address. * * NOTE: SMCA UMC memory errors fall into case #1. */ @@ -777,6 +826,9 @@ bool amd_mce_usable_address(struct mce *m) return false; } + if (this_cpu_ptr(smca_banks)[m->bank].paddrv) + return m->status & MCI_STATUS_PADDRV; + /* Check poison bit for all other bank types. */ if (m->status & MCI_STATUS_POISON) return true; @@ -785,37 +837,6 @@ bool amd_mce_usable_address(struct mce *m) return false; } -static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) -{ - struct mce_hw_err err; - struct mce *m = &err.m; - - mce_prep_record(&err); - - m->status = status; - m->misc = misc; - m->bank = bank; - m->tsc = rdtsc(); - - if (m->status & MCI_STATUS_ADDRV) { - m->addr = addr; - - smca_extract_err_addr(m); - } - - if (mce_flags.smca) { - rdmsrq(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid); - - if (m->status & MCI_STATUS_SYNDV) { - rdmsrq(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd); - rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1); - rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2); - } - } - - mce_log(&err); -} - DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error) { trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); @@ -825,103 +846,20 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error) apic_eoi(); } -/* - * Returns true if the logged error is deferred. False, otherwise. 
- */ -static inline bool -_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) -{ - u64 status, addr = 0; - - rdmsrq(msr_stat, status); - if (!(status & MCI_STATUS_VAL)) - return false; - - if (status & MCI_STATUS_ADDRV) - rdmsrq(msr_addr, addr); - - __log_error(bank, status, addr, misc); - - wrmsrq(msr_stat, 0); - - return status & MCI_STATUS_DEFERRED; -} - -static bool _log_error_deferred(unsigned int bank, u32 misc) -{ - if (!_log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), - mca_msr_reg(bank, MCA_ADDR), misc)) - return false; - - /* - * Non-SMCA systems don't have MCA_DESTAT/MCA_DEADDR registers. - * Return true here to avoid accessing these registers. - */ - if (!mce_flags.smca) - return true; - - /* Clear MCA_DESTAT if the deferred error was logged from MCA_STATUS. */ - wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); - return true; -} - -/* - * We have three scenarios for checking for Deferred errors: - * - * 1) Non-SMCA systems check MCA_STATUS and log error if found. - * 2) SMCA systems check MCA_STATUS. If error is found then log it and also - * clear MCA_DESTAT. - * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and - * log it. - */ -static void log_error_deferred(unsigned int bank) -{ - if (_log_error_deferred(bank, 0)) - return; - - /* - * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check - * for a valid error. - */ - _log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank), - MSR_AMD64_SMCA_MCx_DEADDR(bank), 0); -} - /* APIC interrupt handler for deferred errors */ static void amd_deferred_error_interrupt(void) { - unsigned int bank; - - for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) - log_error_deferred(bank); + machine_check_poll(MCP_TIMESTAMP, &this_cpu_ptr(&mce_amd_data)->dfr_intr_banks); } -static void log_error_thresholding(unsigned int bank, u64 misc) +void mce_amd_handle_storm(unsigned int bank, bool on) { - _log_error_deferred(bank, misc); + threshold_restart_bank(bank, on); } -static void log_and_reset_block(struct threshold_block *block) +static void amd_reset_thr_limit(unsigned int bank) { - struct thresh_restart tr; - u32 low = 0, high = 0; - - if (!block) - return; - - if (rdmsr_safe(block->address, &low, &high)) - return; - - if (!(high & MASK_OVERFLOW_HI)) - return; - - /* Log the MCE which caused the threshold event. */ - log_error_thresholding(block->bank, ((u64)high << 32) | low); - - /* Reset threshold block after logging error. */ - memset(&tr, 0, sizeof(tr)); - tr.b = block; - threshold_restart_block(&tr); + threshold_restart_bank(bank, true); } /* @@ -930,33 +868,21 @@ static void log_and_reset_block(struct threshold_block *block) */ static void amd_threshold_interrupt(void) { - struct threshold_bank **bp = this_cpu_read(threshold_banks), *thr_bank; - unsigned int bank, cpu = smp_processor_id(); - struct threshold_block *block, *tmp; - - /* - * Validate that the threshold bank has been initialized already. The - * handler is installed at boot time, but on a hotplug event the - * interrupt might fire before the data has been initialized. 
- */ - if (!bp) - return; - - for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { - if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank))) - continue; - - thr_bank = bp[bank]; - if (!thr_bank) - continue; - - list_for_each_entry_safe(block, tmp, &thr_bank->miscj, miscj) - log_and_reset_block(block); - } + machine_check_poll(MCP_TIMESTAMP, &this_cpu_ptr(&mce_amd_data)->thr_intr_banks); } void amd_clear_bank(struct mce *m) { + amd_reset_thr_limit(m->bank); + + /* Clear MCA_DESTAT for all deferred errors even those logged in MCA_STATUS. */ + if (m->status & MCI_STATUS_DEFERRED) + mce_wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank), 0); + + /* Don't clear MCA_STATUS if MCA_DESTAT was used exclusively. */ + if (m->kflags & MCE_CHECK_DFR_REGS) + return; + mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0); } @@ -1172,7 +1098,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb b->address = address; b->interrupt_enable = 0; b->interrupt_capable = lvt_interrupt_supported(bank, high); - b->threshold_limit = THRESHOLD_MAX; + b->threshold_limit = get_thr_limit(); if (b->interrupt_capable) { default_attrs[2] = &interrupt_enable.attr; @@ -1183,6 +1109,8 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb list_add(&b->miscj, &tb->miscj); + mce_threshold_block_init(b, (high & MASK_LVTOFF_HI) >> 20); + err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b)); if (err) goto out_free; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 460e90a1a0b1..4aff14e04287 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -687,7 +687,10 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i) m->misc = mce_rdmsrq(mca_msr_reg(i, MCA_MISC)); if (m->status & MCI_STATUS_ADDRV) { - m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR)); + if (m->kflags & MCE_CHECK_DFR_REGS) + m->addr = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DEADDR(i)); + else + m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR)); /* * Mask the reported address by the reported granularity. @@ -715,6 +718,29 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i) DEFINE_PER_CPU(unsigned, mce_poll_count); /* + * We have three scenarios for checking for Deferred errors: + * + * 1) Non-SMCA systems check MCA_STATUS and log error if found. + * 2) SMCA systems check MCA_STATUS. If error is found then log it and also + * clear MCA_DESTAT. + * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and + * log it. + */ +static bool smca_should_log_poll_error(struct mce *m) +{ + if (m->status & MCI_STATUS_VAL) + return true; + + m->status = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank)); + if ((m->status & MCI_STATUS_VAL) && (m->status & MCI_STATUS_DEFERRED)) { + m->kflags |= MCE_CHECK_DFR_REGS; + return true; + } + + return false; +} + +/* * Newer Intel systems that support software error * recovery need to make additional checks. Other * CPUs should skip over uncorrected errors, but log @@ -740,6 +766,9 @@ static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err) { struct mce *m = &err->m; + if (mce_flags.smca) + return smca_should_log_poll_error(m); + /* If this entry is not valid, ignore it. 
*/ if (!(m->status & MCI_STATUS_VAL)) return false; diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index b0e00ec5cc8c..a31cf984619c 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -67,6 +67,7 @@ void mce_track_storm(struct mce *mce); void mce_inherit_storm(unsigned int bank); bool mce_get_storm_mode(void); void mce_set_storm_mode(bool storm); +u32 mce_get_apei_thr_limit(void); #else static inline void cmci_storm_begin(unsigned int bank) {} static inline void cmci_storm_end(unsigned int bank) {} @@ -74,6 +75,7 @@ static inline void mce_track_storm(struct mce *mce) {} static inline void mce_inherit_storm(unsigned int bank) {} static inline bool mce_get_storm_mode(void) { return false; } static inline void mce_set_storm_mode(bool storm) {} +static inline u32 mce_get_apei_thr_limit(void) { return 0; } #endif /* @@ -267,6 +269,7 @@ void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m); #ifdef CONFIG_X86_MCE_AMD void mce_threshold_create_device(unsigned int cpu); void mce_threshold_remove_device(unsigned int cpu); +void mce_amd_handle_storm(unsigned int bank, bool on); extern bool amd_filter_mce(struct mce *m); bool amd_mce_usable_address(struct mce *m); void amd_clear_bank(struct mce *m); @@ -299,6 +302,7 @@ void smca_bsp_init(void); #else static inline void mce_threshold_create_device(unsigned int cpu) { } static inline void mce_threshold_remove_device(unsigned int cpu) { } +static inline void mce_amd_handle_storm(unsigned int bank, bool on) { } static inline bool amd_filter_mce(struct mce *m) { return false; } static inline bool amd_mce_usable_address(struct mce *m) { return false; } static inline void amd_clear_bank(struct mce *m) { } diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c index f4a007616468..0d13c9ffcba0 100644 --- a/arch/x86/kernel/cpu/mce/threshold.c +++ b/arch/x86/kernel/cpu/mce/threshold.c @@ -13,6 +13,19 @@ #include "internal.h" +static u32 mce_apei_thr_limit; + +void mce_save_apei_thr_limit(u32 thr_limit) +{ + mce_apei_thr_limit = thr_limit; + pr_info("HEST corrected error threshold limit: %u\n", thr_limit); +} + +u32 mce_get_apei_thr_limit(void) +{ + return mce_apei_thr_limit; +} + static void default_threshold_interrupt(void) { pr_err("Unexpected threshold interrupt at vector %x\n", @@ -63,6 +76,9 @@ static void mce_handle_storm(unsigned int bank, bool on) case X86_VENDOR_INTEL: mce_intel_handle_storm(bank, on); break; + case X86_VENDOR_AMD: + mce_amd_handle_storm(bank, on); + break; } } @@ -85,7 +101,8 @@ void cmci_storm_end(unsigned int bank) { struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); - __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); + if (!mce_flags.amd_threshold) + __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); storm->banks[bank].history = 0; storm->banks[bank].in_storm_mode = false; diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index cdce885e2fd5..3821a985f4ff 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -186,60 +186,92 @@ static u32 cpuid_to_ucode_rev(unsigned int val) return p.ucode_rev; } +static u32 get_cutoff_revision(u32 rev) +{ + switch (rev >> 8) { + case 0x80012: return 0x8001277; break; + case 0x80082: return 0x800820f; break; + case 0x83010: return 0x830107c; break; + case 0x86001: return 0x860010e; break; + case 0x86081: return 0x8608108; break; + case 0x87010: return 0x8701034; break; + case 0x8a000: return 0x8a0000a; 
break; + case 0xa0010: return 0xa00107a; break; + case 0xa0011: return 0xa0011da; break; + case 0xa0012: return 0xa001243; break; + case 0xa0082: return 0xa00820e; break; + case 0xa1011: return 0xa101153; break; + case 0xa1012: return 0xa10124e; break; + case 0xa1081: return 0xa108109; break; + case 0xa2010: return 0xa20102f; break; + case 0xa2012: return 0xa201212; break; + case 0xa4041: return 0xa404109; break; + case 0xa5000: return 0xa500013; break; + case 0xa6012: return 0xa60120a; break; + case 0xa7041: return 0xa704109; break; + case 0xa7052: return 0xa705208; break; + case 0xa7080: return 0xa708009; break; + case 0xa70c0: return 0xa70C009; break; + case 0xaa001: return 0xaa00116; break; + case 0xaa002: return 0xaa00218; break; + case 0xb0021: return 0xb002146; break; + case 0xb0081: return 0xb008111; break; + case 0xb1010: return 0xb101046; break; + case 0xb2040: return 0xb204031; break; + case 0xb4040: return 0xb404031; break; + case 0xb4041: return 0xb404101; break; + case 0xb6000: return 0xb600031; break; + case 0xb6080: return 0xb608031; break; + case 0xb7000: return 0xb700031; break; + default: break; + + } + return 0; +} + static bool need_sha_check(u32 cur_rev) { + u32 cutoff; + if (!cur_rev) { cur_rev = cpuid_to_ucode_rev(bsp_cpuid_1_eax); pr_info_once("No current revision, generating the lowest one: 0x%x\n", cur_rev); } - switch (cur_rev >> 8) { - case 0x80012: return cur_rev <= 0x800126f; break; - case 0x80082: return cur_rev <= 0x800820f; break; - case 0x83010: return cur_rev <= 0x830107c; break; - case 0x86001: return cur_rev <= 0x860010e; break; - case 0x86081: return cur_rev <= 0x8608108; break; - case 0x87010: return cur_rev <= 0x8701034; break; - case 0x8a000: return cur_rev <= 0x8a0000a; break; - case 0xa0010: return cur_rev <= 0xa00107a; break; - case 0xa0011: return cur_rev <= 0xa0011da; break; - case 0xa0012: return cur_rev <= 0xa001243; break; - case 0xa0082: return cur_rev <= 0xa00820e; break; - case 0xa1011: return cur_rev <= 0xa101153; break; - case 0xa1012: return cur_rev <= 0xa10124e; break; - case 0xa1081: return cur_rev <= 0xa108109; break; - case 0xa2010: return cur_rev <= 0xa20102f; break; - case 0xa2012: return cur_rev <= 0xa201212; break; - case 0xa4041: return cur_rev <= 0xa404109; break; - case 0xa5000: return cur_rev <= 0xa500013; break; - case 0xa6012: return cur_rev <= 0xa60120a; break; - case 0xa7041: return cur_rev <= 0xa704109; break; - case 0xa7052: return cur_rev <= 0xa705208; break; - case 0xa7080: return cur_rev <= 0xa708009; break; - case 0xa70c0: return cur_rev <= 0xa70C009; break; - case 0xaa001: return cur_rev <= 0xaa00116; break; - case 0xaa002: return cur_rev <= 0xaa00218; break; - case 0xb0021: return cur_rev <= 0xb002146; break; - case 0xb1010: return cur_rev <= 0xb101046; break; - case 0xb2040: return cur_rev <= 0xb204031; break; - case 0xb4040: return cur_rev <= 0xb404031; break; - case 0xb6000: return cur_rev <= 0xb600031; break; - case 0xb7000: return cur_rev <= 0xb700031; break; - default: break; - } + cutoff = get_cutoff_revision(cur_rev); + if (cutoff) + return cur_rev <= cutoff; pr_info("You should not be seeing this. 
Please send the following couple of lines to x86-<at>-kernel.org\n"); pr_info("CPUID(1).EAX: 0x%x, current revision: 0x%x\n", bsp_cpuid_1_eax, cur_rev); return true; } +static bool cpu_has_entrysign(void) +{ + unsigned int fam = x86_family(bsp_cpuid_1_eax); + unsigned int model = x86_model(bsp_cpuid_1_eax); + + if (fam == 0x17 || fam == 0x19) + return true; + + if (fam == 0x1a) { + if (model <= 0x2f || + (0x40 <= model && model <= 0x4f) || + (0x60 <= model && model <= 0x6f)) + return true; + } + + return false; +} + static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsigned int len) { struct patch_digest *pd = NULL; u8 digest[SHA256_DIGEST_SIZE]; int i; - if (x86_family(bsp_cpuid_1_eax) < 0x17) + if (!cpu_has_entrysign()) return true; if (!need_sha_check(cur_rev)) @@ -473,6 +505,7 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size) { u8 family = x86_family(bsp_cpuid_1_eax); struct microcode_header_amd *mc_hdr; + u32 cur_rev, cutoff, patch_rev; u32 sh_psize; u16 proc_id; u8 patch_fam; @@ -512,11 +545,32 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size) proc_id = mc_hdr->processor_rev_id; patch_fam = 0xf + (proc_id >> 12); - ucode_dbg("Patch-ID 0x%08x: family: 0x%x\n", mc_hdr->patch_id, patch_fam); - if (patch_fam != family) return 1; + cur_rev = get_patch_level(); + + /* No cutoff revision means old/unaffected by signing algorithm weakness => matches */ + cutoff = get_cutoff_revision(cur_rev); + if (!cutoff) + goto ok; + + patch_rev = mc_hdr->patch_id; + + ucode_dbg("cur_rev: 0x%x, cutoff: 0x%x, patch_rev: 0x%x\n", + cur_rev, cutoff, patch_rev); + + if (cur_rev <= cutoff && patch_rev <= cutoff) + goto ok; + + if (cur_rev > cutoff && patch_rev > cutoff) + goto ok; + + return 1; + +ok: + ucode_dbg("Patch-ID 0x%08x: family: 0x%x\n", mc_hdr->patch_id, patch_fam); + return 0; } @@ -585,8 +639,6 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE); - ucode_dbg("patch_id: 0x%x\n", mc->hdr.patch_id); - if (mc_patch_matches(mc, eq_id)) { desc->psize = patch_size; desc->mc = mc; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index f75c140906d0..ccc83b0bf63c 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -136,7 +136,7 @@ bool __init microcode_loader_disabled(void) return dis_ucode_ldr; } -static void early_parse_cmdline(void) +static void __init early_parse_cmdline(void) { char cmd_buf[64] = {}; char *s, *p = cmd_buf; @@ -589,6 +589,17 @@ static int load_late_stop_cpus(bool is_safe) pr_err("You should switch to early loading, if possible.\n"); } + /* + * Pre-load the microcode image into a staging device. This + * process is preemptible and does not require stopping CPUs. + * Successful staging simplifies the subsequent late-loading + * process, reducing rendezvous time. + * + * Even if the transfer fails, the update will proceed as usual. 
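The acceptance rule in verify_patch() reduces to a small predicate: with no cutoff the family is old or unaffected by the signing weakness and any patch matches, otherwise the running revision and the patch revision must sit on the same side of the cutoff. A stand-alone sketch, with arbitrary revision numbers rather than entries from the table above:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool patch_matches(uint32_t cur_rev, uint32_t patch_rev, uint32_t cutoff)
{
	if (!cutoff)
		return true;
	if (cur_rev <= cutoff && patch_rev <= cutoff)
		return true;
	if (cur_rev > cutoff && patch_rev > cutoff)
		return true;
	return false;
}

int main(void)
{
	uint32_t cutoff = 0x800;	/* example value only */

	printf("%d\n", patch_matches(0x7f0, 0x7ff, cutoff));	/* 1: both below  */
	printf("%d\n", patch_matches(0x900, 0x910, cutoff));	/* 1: both above  */
	printf("%d\n", patch_matches(0x7f0, 0x910, cutoff));	/* 0: mixed       */
	return 0;
}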
+ */ + if (microcode_ops->use_staging) + microcode_ops->stage_microcode(); + atomic_set(&late_cpus_in, num_online_cpus()); atomic_set(&offline_in_nmi, 0); loops_per_usec = loops_per_jiffy / (TICK_NSEC / 1000); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 371ca6eac00e..8744f3adc2a0 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -13,12 +13,15 @@ #define pr_fmt(fmt) "microcode: " fmt #include <linux/earlycpio.h> #include <linux/firmware.h> +#include <linux/pci_ids.h> #include <linux/uaccess.h> #include <linux/initrd.h> #include <linux/kernel.h> +#include <linux/delay.h> #include <linux/slab.h> #include <linux/cpu.h> #include <linux/uio.h> +#include <linux/io.h> #include <linux/mm.h> #include <asm/cpu_device_id.h> @@ -33,6 +36,38 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin"; #define UCODE_BSP_LOADED ((struct microcode_intel *)0x1UL) +/* Defines for the microcode staging mailbox interface */ +#define MBOX_REG_NUM 4 +#define MBOX_REG_SIZE sizeof(u32) + +#define MBOX_CONTROL_OFFSET 0x0 +#define MBOX_STATUS_OFFSET 0x4 +#define MBOX_WRDATA_OFFSET 0x8 +#define MBOX_RDDATA_OFFSET 0xc + +#define MASK_MBOX_CTRL_ABORT BIT(0) +#define MASK_MBOX_CTRL_GO BIT(31) + +#define MASK_MBOX_STATUS_ERROR BIT(2) +#define MASK_MBOX_STATUS_READY BIT(31) + +#define MASK_MBOX_RESP_SUCCESS BIT(0) +#define MASK_MBOX_RESP_PROGRESS BIT(1) +#define MASK_MBOX_RESP_ERROR BIT(2) + +#define MBOX_CMD_LOAD 0x3 +#define MBOX_OBJ_STAGING 0xb +#define MBOX_HEADER(size) ((PCI_VENDOR_ID_INTEL) | \ + (MBOX_OBJ_STAGING << 16) | \ + ((u64)((size) / sizeof(u32)) << 32)) + +/* The size of each mailbox header */ +#define MBOX_HEADER_SIZE sizeof(u64) +/* The size of staging hardware response */ +#define MBOX_RESPONSE_SIZE sizeof(u64) + +#define MBOX_XACTION_TIMEOUT_MS (10 * MSEC_PER_SEC) + /* Current microcode patch used in early patching on the APs. */ static struct microcode_intel *ucode_patch_va __read_mostly; static struct microcode_intel *ucode_patch_late __read_mostly; @@ -54,6 +89,23 @@ struct extended_sigtable { struct extended_signature sigs[]; }; +/** + * struct staging_state - Track the current staging process state + * + * @mmio_base: MMIO base address for staging + * @ucode_len: Total size of the microcode image + * @chunk_size: Size of each data piece + * @bytes_sent: Total bytes transmitted so far + * @offset: Current offset in the microcode image + */ +struct staging_state { + void __iomem *mmio_base; + unsigned int ucode_len; + unsigned int chunk_size; + unsigned int bytes_sent; + unsigned int offset; +}; + #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) #define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) #define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) @@ -299,6 +351,298 @@ static __init struct microcode_intel *scan_microcode(void *data, size_t size, return size ? 
NULL : patch; } +static inline u32 read_mbox_dword(void __iomem *mmio_base) +{ + u32 dword = readl(mmio_base + MBOX_RDDATA_OFFSET); + + /* Acknowledge read completion to the staging hardware */ + writel(0, mmio_base + MBOX_RDDATA_OFFSET); + return dword; +} + +static inline void write_mbox_dword(void __iomem *mmio_base, u32 dword) +{ + writel(dword, mmio_base + MBOX_WRDATA_OFFSET); +} + +static inline u64 read_mbox_header(void __iomem *mmio_base) +{ + u32 high, low; + + low = read_mbox_dword(mmio_base); + high = read_mbox_dword(mmio_base); + + return ((u64)high << 32) | low; +} + +static inline void write_mbox_header(void __iomem *mmio_base, u64 value) +{ + write_mbox_dword(mmio_base, value); + write_mbox_dword(mmio_base, value >> 32); +} + +static void write_mbox_data(void __iomem *mmio_base, u32 *chunk, unsigned int chunk_bytes) +{ + int i; + + /* + * The MMIO space is mapped as Uncached (UC). Each write arrives + * at the device as an individual transaction in program order. + * The device can then reassemble the sequence accordingly. + */ + for (i = 0; i < chunk_bytes / sizeof(u32); i++) + write_mbox_dword(mmio_base, chunk[i]); +} + +/* + * Prepare for a new microcode transfer: reset hardware and record the + * image size. + */ +static void init_stage(struct staging_state *ss) +{ + ss->ucode_len = get_totalsize(&ucode_patch_late->hdr); + + /* + * Abort any ongoing process, effectively resetting the device. + * Unlike regular mailbox data processing requests, this + * operation does not require a status check. + */ + writel(MASK_MBOX_CTRL_ABORT, ss->mmio_base + MBOX_CONTROL_OFFSET); +} + +/* + * Update the chunk size and decide whether another chunk can be sent. + * This accounts for remaining data and retry limits. + */ +static bool can_send_next_chunk(struct staging_state *ss, int *err) +{ + /* A page size or remaining bytes if this is the final chunk */ + ss->chunk_size = min(PAGE_SIZE, ss->ucode_len - ss->offset); + + /* + * Each microcode image is divided into chunks, each at most + * one page size. A 10-chunk image would typically require 10 + * transactions. + * + * However, the hardware managing the mailbox has limited + * resources and may not cache the entire image, potentially + * requesting the same chunk multiple times. + * + * To tolerate this behavior, allow up to twice the expected + * number of transactions (i.e., a 10-chunk image can take up to + * 20 attempts). + * + * If the number of attempts exceeds this limit, treat it as + * exceeding the maximum allowed transfer size. + */ + if (ss->bytes_sent + ss->chunk_size > ss->ucode_len * 2) { + *err = -EMSGSIZE; + return false; + } + + *err = 0; + return true; +} + +/* + * The hardware indicates completion by returning a sentinel end offset. + */ +static inline bool is_end_offset(u32 offset) +{ + return offset == UINT_MAX; +} + +/* + * Determine whether staging is complete: either the hardware signaled + * the end offset, or no more transactions are permitted (retry limit + * reached). + */ +static inline bool staging_is_complete(struct staging_state *ss, int *err) +{ + return is_end_offset(ss->offset) || !can_send_next_chunk(ss, err); +} + +/* + * Wait for the hardware to complete a transaction. + * Return 0 on success, or an error code on failure. 
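The retry budget described above can be seen with a worst-case model in which the device never advances past the first chunk: with a 10-page image the transfer is abandoned after 20 transmissions. The page size and image length below are illustrative only.

#include <stdio.h>

#define PAGE_SIZE 4096u

int main(void)
{
	unsigned int ucode_len = 10 * PAGE_SIZE;	/* a 10-chunk image */
	unsigned int bytes_sent = 0, offset = 0, attempts = 0;

	for (;;) {
		unsigned int chunk = PAGE_SIZE < ucode_len - offset ?
				     PAGE_SIZE : ucode_len - offset;

		if (bytes_sent + chunk > 2 * ucode_len)
			break;			/* would return -EMSGSIZE */

		bytes_sent += chunk;		/* "send" the chunk */
		attempts++;
		/* offset stays 0: the device never makes progress */
	}

	printf("gave up after %u transmissions (%u bytes)\n",
	       attempts, bytes_sent);		/* 20 transmissions */
	return 0;
}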
+ */ +static int wait_for_transaction(struct staging_state *ss) +{ + u32 timeout, status; + + /* Allow time for hardware to complete the operation: */ + for (timeout = 0; timeout < MBOX_XACTION_TIMEOUT_MS; timeout++) { + msleep(1); + + status = readl(ss->mmio_base + MBOX_STATUS_OFFSET); + /* Break out early if the hardware is ready: */ + if (status & MASK_MBOX_STATUS_READY) + break; + } + + /* Check for explicit error response */ + if (status & MASK_MBOX_STATUS_ERROR) + return -EIO; + + /* + * Hardware has neither responded to the action nor signaled any + * error. Treat this as a timeout. + */ + if (!(status & MASK_MBOX_STATUS_READY)) + return -ETIMEDOUT; + + return 0; +} + +/* + * Transmit a chunk of the microcode image to the hardware. + * Return 0 on success, or an error code on failure. + */ +static int send_data_chunk(struct staging_state *ss, void *ucode_ptr) +{ + u32 *src_chunk = ucode_ptr + ss->offset; + u16 mbox_size; + + /* + * Write a 'request' mailbox object in this order: + * 1. Mailbox header includes total size + * 2. Command header specifies the load operation + * 3. Data section contains a microcode chunk + * + * Thus, the mailbox size is two headers plus the chunk size. + */ + mbox_size = MBOX_HEADER_SIZE * 2 + ss->chunk_size; + write_mbox_header(ss->mmio_base, MBOX_HEADER(mbox_size)); + write_mbox_header(ss->mmio_base, MBOX_CMD_LOAD); + write_mbox_data(ss->mmio_base, src_chunk, ss->chunk_size); + ss->bytes_sent += ss->chunk_size; + + /* Notify the hardware that the mailbox is ready for processing. */ + writel(MASK_MBOX_CTRL_GO, ss->mmio_base + MBOX_CONTROL_OFFSET); + + return wait_for_transaction(ss); +} + +/* + * Retrieve the next offset from the hardware response. + * Return 0 on success, or an error code on failure. + */ +static int fetch_next_offset(struct staging_state *ss) +{ + const u64 expected_header = MBOX_HEADER(MBOX_HEADER_SIZE + MBOX_RESPONSE_SIZE); + u32 offset, status; + u64 header; + + /* + * The 'response' mailbox returns three fields, in order: + * 1. Header + * 2. Next offset in the microcode image + * 3. Status flags + */ + header = read_mbox_header(ss->mmio_base); + offset = read_mbox_dword(ss->mmio_base); + status = read_mbox_dword(ss->mmio_base); + + /* All valid responses must start with the expected header. */ + if (header != expected_header) { + pr_err_once("staging: invalid response header (0x%llx)\n", header); + return -EBADR; + } + + /* + * Verify the offset: If not at the end marker, it must not + * exceed the microcode image length. + */ + if (!is_end_offset(offset) && offset > ss->ucode_len) { + pr_err_once("staging: invalid offset (%u) past the image end (%u)\n", + offset, ss->ucode_len); + return -EINVAL; + } + + /* Hardware may report errors explicitly in the status field */ + if (status & MASK_MBOX_RESP_ERROR) + return -EPROTO; + + ss->offset = offset; + return 0; +} + +/* + * Handle the staging process using the mailbox MMIO interface. The + * microcode image is transferred in chunks until completion. + * Return 0 on success or an error code on failure. 
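The request and response framing used by send_data_chunk() and fetch_next_offset() follows directly from MBOX_HEADER(): the Intel vendor ID 0x8086 in the low word, the staging object type above it, and the payload size in dwords in the upper half. A small host-side calculation of the two header values, assuming a full 4 KiB data chunk:

#include <stdint.h>
#include <stdio.h>

#define VENDOR_ID_INTEL	0x8086u
#define OBJ_STAGING	0xbu

/* 64-bit mailbox header: vendor ID, object type, payload size in dwords. */
static uint64_t mbox_header(uint32_t size_bytes)
{
	return (uint64_t)VENDOR_ID_INTEL |
	       ((uint64_t)OBJ_STAGING << 16) |
	       ((uint64_t)(size_bytes / sizeof(uint32_t)) << 32);
}

int main(void)
{
	/* Request: two 8-byte headers plus one 4 KiB data chunk. */
	uint32_t request_size = 2 * 8 + 4096;
	/* Response: its own header plus an 8-byte payload (offset + status). */
	uint32_t response_size = 8 + 8;

	printf("request header:  0x%016llx\n",
	       (unsigned long long)mbox_header(request_size));
	printf("response header: 0x%016llx\n",
	       (unsigned long long)mbox_header(response_size));
	return 0;
}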
+ */ +static int do_stage(u64 mmio_pa) +{ + struct staging_state ss = {}; + int err; + + ss.mmio_base = ioremap(mmio_pa, MBOX_REG_NUM * MBOX_REG_SIZE); + if (WARN_ON_ONCE(!ss.mmio_base)) + return -EADDRNOTAVAIL; + + init_stage(&ss); + + /* Perform the staging process while within the retry limit */ + while (!staging_is_complete(&ss, &err)) { + /* Send a chunk of microcode each time: */ + err = send_data_chunk(&ss, ucode_patch_late); + if (err) + break; + /* + * Then, ask the hardware which piece of the image it + * needs next. The same piece may be sent more than once. + */ + err = fetch_next_offset(&ss); + if (err) + break; + } + + iounmap(ss.mmio_base); + + return err; +} + +static void stage_microcode(void) +{ + unsigned int pkg_id = UINT_MAX; + int cpu, err; + u64 mmio_pa; + + if (!IS_ALIGNED(get_totalsize(&ucode_patch_late->hdr), sizeof(u32))) { + pr_err("Microcode image 32-bit misaligned (0x%x), staging failed.\n", + get_totalsize(&ucode_patch_late->hdr)); + return; + } + + lockdep_assert_cpus_held(); + + /* + * The MMIO address is unique per package, and all the SMT + * primary threads are online here. Find each MMIO space by + * their package IDs to avoid duplicate staging. + */ + for_each_cpu(cpu, cpu_primary_thread_mask) { + if (topology_logical_package_id(cpu) == pkg_id) + continue; + + pkg_id = topology_logical_package_id(cpu); + + err = rdmsrq_on_cpu(cpu, MSR_IA32_MCU_STAGING_MBOX_ADDR, &mmio_pa); + if (WARN_ON_ONCE(err)) + return; + + err = do_stage(mmio_pa); + if (err) { + pr_err("Error: staging failed (%d) for CPU%d at package %u.\n", + err, cpu, pkg_id); + return; + } + } + + pr_info("Staging of patch revision 0x%x succeeded.\n", ucode_patch_late->hdr.rev); +} + static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci, struct microcode_intel *mc, u32 *cur_rev) @@ -627,6 +971,7 @@ static struct microcode_ops microcode_intel_ops = { .collect_cpu_info = collect_cpu_info, .apply_microcode = apply_microcode_late, .finalize_late_load = finalize_late_load, + .stage_microcode = stage_microcode, .use_nmi = IS_ENABLED(CONFIG_X86_64), }; @@ -638,6 +983,18 @@ static __init void calc_llc_size_per_core(struct cpuinfo_x86 *c) llc_size_per_core = (unsigned int)llc_size; } +static __init bool staging_available(void) +{ + u64 val; + + val = x86_read_arch_cap_msr(); + if (!(val & ARCH_CAP_MCU_ENUM)) + return false; + + rdmsrq(MSR_IA32_MCU_ENUMERATION, val); + return !!(val & MCU_STAGING); +} + struct microcode_ops * __init init_intel_microcode(void) { struct cpuinfo_x86 *c = &boot_cpu_data; @@ -648,6 +1005,11 @@ struct microcode_ops * __init init_intel_microcode(void) return NULL; } + if (staging_available()) { + microcode_intel_ops.use_staging = true; + pr_info("Enabled staging feature.\n"); + } + calc_llc_size_per_core(c); return µcode_intel_ops; diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h index ae8dbc2b908d..a10b547eda1e 100644 --- a/arch/x86/kernel/cpu/microcode/internal.h +++ b/arch/x86/kernel/cpu/microcode/internal.h @@ -31,10 +31,12 @@ struct microcode_ops { * See also the "Synchronization" section in microcode_core.c. 
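stage_microcode() above relies on the primary-thread mask enumerating CPUs package by package, so remembering only the previous package ID is enough to stage once per package. A toy walk over made-up per-CPU package IDs illustrates the deduplication:

#include <limits.h>
#include <stdio.h>

/* Hypothetical package IDs of the primary threads, in iteration order. */
static const unsigned int pkg_of_cpu[] = { 0, 0, 0, 0, 1, 1, 1, 1 };

int main(void)
{
	unsigned int prev_pkg = UINT_MAX;
	unsigned int cpu;

	for (cpu = 0; cpu < sizeof(pkg_of_cpu) / sizeof(pkg_of_cpu[0]); cpu++) {
		if (pkg_of_cpu[cpu] == prev_pkg)
			continue;

		prev_pkg = pkg_of_cpu[cpu];
		printf("stage via CPU%u for package %u\n", cpu, prev_pkg);
	}
	return 0;
}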
*/ enum ucode_state (*apply_microcode)(int cpu); + void (*stage_microcode)(void); int (*collect_cpu_info)(int cpu, struct cpu_signature *csig); void (*finalize_late_load)(int result); unsigned int nmi_safe : 1, - use_nmi : 1; + use_nmi : 1, + use_staging : 1; }; struct early_load_data { diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 8c18327eb10b..0863733858dc 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -89,7 +89,6 @@ static int mtrr_state_set; u64 mtrr_tom2; struct mtrr_state_type mtrr_state; -EXPORT_SYMBOL_GPL(mtrr_state); /* Reserved bits in the high portion of the MTRRphysBaseN MSR. */ u32 phys_hi_rsvd; diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 06ca5a30140c..3792ab4819dc 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -274,6 +274,11 @@ static void rdt_get_cdp_config(int level) rdt_resources_all[level].r_resctrl.cdp_capable = true; } +static void rdt_set_io_alloc_capable(struct rdt_resource *r) +{ + r->cache.io_alloc_capable = true; +} + static void rdt_get_cdp_l3_config(void) { rdt_get_cdp_config(RDT_RESOURCE_L3); @@ -719,6 +724,7 @@ enum { RDT_FLAG_SMBA, RDT_FLAG_BMEC, RDT_FLAG_ABMC, + RDT_FLAG_SDCIAE, }; #define RDT_OPT(idx, n, f) \ @@ -745,6 +751,7 @@ static struct rdt_options rdt_options[] __ro_after_init = { RDT_OPT(RDT_FLAG_SMBA, "smba", X86_FEATURE_SMBA), RDT_OPT(RDT_FLAG_BMEC, "bmec", X86_FEATURE_BMEC), RDT_OPT(RDT_FLAG_ABMC, "abmc", X86_FEATURE_ABMC), + RDT_OPT(RDT_FLAG_SDCIAE, "sdciae", X86_FEATURE_SDCIAE), }; #define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) @@ -853,6 +860,8 @@ static __init bool get_rdt_alloc_resources(void) rdt_get_cache_alloc_cfg(1, r); if (rdt_cpu_has(X86_FEATURE_CDP_L3)) rdt_get_cdp_l3_config(); + if (rdt_cpu_has(X86_FEATURE_SDCIAE)) + rdt_set_io_alloc_capable(r); ret = true; } if (rdt_cpu_has(X86_FEATURE_CAT_L2)) { diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 1189c0df4ad7..b20e705606b8 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -91,3 +91,43 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, return hw_dom->ctrl_val[idx]; } + +bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r) +{ + return resctrl_to_arch_res(r)->sdciae_enabled; +} + +static void resctrl_sdciae_set_one_amd(void *arg) +{ + bool *enable = arg; + + if (*enable) + msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, SDCIAE_ENABLE_BIT); + else + msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, SDCIAE_ENABLE_BIT); +} + +static void _resctrl_sdciae_enable(struct rdt_resource *r, bool enable) +{ + struct rdt_ctrl_domain *d; + + /* Walking r->ctrl_domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + /* Update MSR_IA32_L3_QOS_EXT_CFG MSR on all the CPUs in all domains */ + list_for_each_entry(d, &r->ctrl_domains, hdr.list) + on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_sdciae_set_one_amd, &enable, 1); +} + +int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + + if (hw_res->r_resctrl.cache.io_alloc_capable && + hw_res->sdciae_enabled != enable) { + _resctrl_sdciae_enable(r, enable); + hw_res->sdciae_enabled = enable; + } + + return 0; +} diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 9f4c2f0aaf5c..4a916c84a322 100644 --- 
a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -46,6 +46,9 @@ struct arch_mbm_state { #define ABMC_EXTENDED_EVT_ID BIT(31) #define ABMC_EVT_ID BIT(0) +/* Setting bit 1 in MSR_IA32_L3_QOS_EXT_CFG enables the SDCIAE feature. */ +#define SDCIAE_ENABLE_BIT 1 + /** * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share * a resource for a control function @@ -112,6 +115,7 @@ struct msr_param { * @mbm_width: Monitor width, to detect and correct for overflow. * @cdp_enabled: CDP state of this resource * @mbm_cntr_assign_enabled: ABMC feature is enabled + * @sdciae_enabled: SDCIAE feature (backing "io_alloc") is enabled. * * Members of this structure are either private to the architecture * e.g. mbm_width, or accessed via helpers that provide abstraction. e.g. @@ -126,6 +130,7 @@ struct rdt_hw_resource { unsigned int mbm_width; bool cdp_enabled; bool mbm_cntr_assign_enabled; + bool sdciae_enabled; }; static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index c8945610d455..dffcc8307500 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -242,7 +242,9 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, u32 unused, u32 rmid, enum resctrl_event_id eventid, u64 *val, void *ignored) { + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); int cpu = cpumask_any(&d->hdr.cpu_mask); + struct arch_mbm_state *am; u64 msr_val; u32 prmid; int ret; @@ -251,12 +253,16 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, prmid = logical_rmid_to_physical_rmid(cpu, rmid); ret = __rmid_read_phys(prmid, eventid, &msr_val); - if (ret) - return ret; - *val = get_corrected_val(r, d, rmid, eventid, msr_val); + if (!ret) { + *val = get_corrected_val(r, d, rmid, eventid, msr_val); + } else if (ret == -EINVAL) { + am = get_arch_mbm_state(hw_dom, rmid, eventid); + if (am) + am->prev_msr = 0; + } - return 0; + return ret; } static int __cntr_id_read(u32 cntr_id, u64 *val) @@ -355,6 +361,7 @@ static const struct x86_cpu_id snc_cpu_ids[] __initconst = { X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0), X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0), X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0), + X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, 0), {} }; @@ -452,7 +459,16 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; } - if (rdt_cpu_has(X86_FEATURE_ABMC)) { + /* + * resctrl assumes a system that supports assignable counters can + * switch to "default" mode. Ensure that there is a "default" mode + * to switch to. This enforces a dependency between the independent + * X86_FEATURE_ABMC and X86_FEATURE_CQM_MBM_TOTAL/X86_FEATURE_CQM_MBM_LOCAL + * hardware features. 
+ */ + if (rdt_cpu_has(X86_FEATURE_ABMC) && + (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL) || + rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))) { r->mon.mbm_cntr_assignable = true; cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx); r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1; diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index caa4dc885c21..cde4b6cd3471 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -43,7 +43,11 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 }, { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, + { X86_FEATURE_SGX_EUPDATESVN, CPUID_EAX, 10, 0x00000012, 0 }, { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 }, + { X86_FEATURE_OVERFLOW_RECOV, CPUID_EBX, 0, 0x80000007, 0 }, + { X86_FEATURE_SUCCOR, CPUID_EBX, 1, 0x80000007, 0 }, + { X86_FEATURE_SMCA, CPUID_EBX, 3, 0x80000007, 0 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, @@ -53,6 +57,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, { X86_FEATURE_ABMC, CPUID_EBX, 5, 0x80000020, 0 }, + { X86_FEATURE_SDCIAE, CPUID_EBX, 6, 0x80000020, 0 }, { X86_FEATURE_TSA_SQ_NO, CPUID_ECX, 1, 0x80000021, 0 }, { X86_FEATURE_TSA_L1_NO, CPUID_ECX, 2, 0x80000021, 0 }, { X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 }, diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index 7f8d1e11dbee..79d6020dfe9c 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -14,7 +14,7 @@ u64 sgx_attributes_reserved_mask; u64 sgx_xfrm_reserved_mask = ~0x3; u32 sgx_misc_reserved_mask; -static int sgx_open(struct inode *inode, struct file *file) +static int __sgx_open(struct inode *inode, struct file *file) { struct sgx_encl *encl; int ret; @@ -41,6 +41,23 @@ static int sgx_open(struct inode *inode, struct file *file) return 0; } +static int sgx_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = sgx_inc_usage_count(); + if (ret) + return ret; + + ret = __sgx_open(inode, file); + if (ret) { + sgx_dec_usage_count(); + return ret; + } + + return 0; +} + static int sgx_release(struct inode *inode, struct file *file) { struct sgx_encl *encl = file->private_data; diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 308dbbae6c6e..cf149b9f4916 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -765,6 +765,7 @@ void sgx_encl_release(struct kref *ref) WARN_ON_ONCE(encl->secs.epc_page); kfree(encl); + sgx_dec_usage_count(); } /* diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h index 42a088a337c5..74be751199a4 100644 --- a/arch/x86/kernel/cpu/sgx/encls.h +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -233,4 +233,9 @@ static inline int __eaug(struct sgx_pageinfo *pginfo, void *addr) return __encls_2(EAUG, pginfo, addr); } +/* Attempt to update CPUSVN at runtime. 
*/ +static inline int __eupdatesvn(void) +{ + return __encls_ret_1(EUPDATESVN, ""); +} #endif /* _X86_ENCLS_H */ diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 2de01b379aa3..dc73194416ac 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -5,6 +5,7 @@ #include <linux/freezer.h> #include <linux/highmem.h> #include <linux/kthread.h> +#include <linux/kvm_types.h> #include <linux/miscdevice.h> #include <linux/node.h> #include <linux/pagemap.h> @@ -16,6 +17,7 @@ #include <linux/vmalloc.h> #include <asm/msr.h> #include <asm/sgx.h> +#include <asm/archrandom.h> #include "driver.h" #include "encl.h" #include "encls.h" @@ -915,7 +917,107 @@ int sgx_set_attribute(unsigned long *allowed_attributes, *allowed_attributes |= SGX_ATTR_PROVISIONKEY; return 0; } -EXPORT_SYMBOL_GPL(sgx_set_attribute); +EXPORT_SYMBOL_FOR_KVM(sgx_set_attribute); + +/* Counter to count the active SGX users */ +static int sgx_usage_count; + +/** + * sgx_update_svn() - Attempt to call ENCLS[EUPDATESVN]. + * + * This instruction attempts to update CPUSVN to the + * currently loaded microcode update SVN and generate new + * cryptographic assets. + * + * Return: + * * %0: - Success or not supported + * * %-EAGAIN: - Can be safely retried, failure is due to lack of + * * entropy in RNG + * * %-EIO: - Unexpected error, retries are not advisable + */ +static int sgx_update_svn(void) +{ + int ret; + + /* + * If EUPDATESVN is not available, it is ok to + * silently skip it to comply with legacy behavior. + */ + if (!cpu_feature_enabled(X86_FEATURE_SGX_EUPDATESVN)) + return 0; + + /* + * EPC is guaranteed to be empty when there are no users. + * Ensure we are on our first user before proceeding further. + */ + WARN(sgx_usage_count, "Elevated usage count when calling EUPDATESVN\n"); + + for (int i = 0; i < RDRAND_RETRY_LOOPS; i++) { + ret = __eupdatesvn(); + + /* Stop on success or unexpected errors: */ + if (ret != SGX_INSUFFICIENT_ENTROPY) + break; + } + + switch (ret) { + case 0: + /* + * SVN successfully updated. + * Let users know when the update was successful. + */ + pr_info("SVN updated successfully\n"); + return 0; + case SGX_NO_UPDATE: + /* + * SVN update failed since the current SVN is + * not newer than CPUSVN. This is the most + * common case and indicates no harm. + */ + return 0; + case SGX_INSUFFICIENT_ENTROPY: + /* + * SVN update failed due to lack of entropy in DRNG. + * Indicate to userspace that it should retry. + */ + return -EAGAIN; + default: + break; + } + + /* + * EUPDATESVN was called when EPC is empty, all other error + * codes are unexpected. 
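sgx_inc_usage_count() below only attempts the SVN update for the first user and retries EUPDATESVN while the DRNG lacks entropy. A compact single-threaded model shows the gating and retry behavior; the mutex is omitted, and the error codes, retry count and fake_eupdatesvn() helper are stand-ins, not the real SGX definitions.

#include <stdio.h>

#define RETRY_LOOPS	10	/* stand-in for RDRAND_RETRY_LOOPS */
#define ERR_NO_UPDATE	1	/* stand-in for SGX_NO_UPDATE */
#define ERR_NO_ENTROPY	2	/* stand-in for SGX_INSUFFICIENT_ENTROPY */

static int usage_count;

/* Fake EUPDATESVN: succeeds on the third attempt in this run. */
static int fake_eupdatesvn(void)
{
	static int calls;
	return ++calls < 3 ? ERR_NO_ENTROPY : 0;
}

static int update_svn(void)
{
	int i, ret = 0;

	for (i = 0; i < RETRY_LOOPS; i++) {
		ret = fake_eupdatesvn();
		if (ret != ERR_NO_ENTROPY)
			break;
	}

	if (ret == 0 || ret == ERR_NO_UPDATE)
		return 0;
	return ret == ERR_NO_ENTROPY ? -11 /* -EAGAIN */ : -5 /* -EIO */;
}

/* Only the first user triggers the SVN update attempt. */
static int inc_usage_count(void)
{
	if (!usage_count) {
		int ret = update_svn();
		if (ret)
			return ret;
	}
	usage_count++;
	return 0;
}

int main(void)
{
	int ret = inc_usage_count();
	printf("first open:  %d (count=%d)\n", ret, usage_count);
	ret = inc_usage_count();
	printf("second open: %d (count=%d)\n", ret, usage_count);
	return 0;
}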
+ */ + ENCLS_WARN(ret, "EUPDATESVN"); + return -EIO; +} + +/* Mutex to ensure no concurrent EPC accesses during EUPDATESVN */ +static DEFINE_MUTEX(sgx_svn_lock); + +int sgx_inc_usage_count(void) +{ + int ret; + + guard(mutex)(&sgx_svn_lock); + + if (!sgx_usage_count) { + ret = sgx_update_svn(); + if (ret) + return ret; + } + + sgx_usage_count++; + + return 0; +} + +void sgx_dec_usage_count(void) +{ + guard(mutex)(&sgx_svn_lock); + sgx_usage_count--; +} static int __init sgx_init(void) { diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index d2dad21259a8..f5940393d9bd 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -102,6 +102,9 @@ static inline int __init sgx_vepc_init(void) } #endif +int sgx_inc_usage_count(void); +void sgx_dec_usage_count(void); + void sgx_update_lepubkeyhash(u64 *lepubkeyhash); #endif /* _X86_SGX_H */ diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c index 7aaa3652e31d..8de1f1a755f2 100644 --- a/arch/x86/kernel/cpu/sgx/virt.c +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -5,6 +5,7 @@ * Copyright(c) 2021 Intel Corporation. */ +#include <linux/kvm_types.h> #include <linux/miscdevice.h> #include <linux/mm.h> #include <linux/mman.h> @@ -255,10 +256,11 @@ static int sgx_vepc_release(struct inode *inode, struct file *file) xa_destroy(&vepc->page_array); kfree(vepc); + sgx_dec_usage_count(); return 0; } -static int sgx_vepc_open(struct inode *inode, struct file *file) +static int __sgx_vepc_open(struct inode *inode, struct file *file) { struct sgx_vepc *vepc; @@ -273,6 +275,23 @@ static int sgx_vepc_open(struct inode *inode, struct file *file) return 0; } +static int sgx_vepc_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = sgx_inc_usage_count(); + if (ret) + return ret; + + ret = __sgx_vepc_open(inode, file); + if (ret) { + sgx_dec_usage_count(); + return ret; + } + + return 0; +} + static long sgx_vepc_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -363,7 +382,7 @@ int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, WARN_ON_ONCE(ret); return 0; } -EXPORT_SYMBOL_GPL(sgx_virt_ecreate); +EXPORT_SYMBOL_FOR_KVM(sgx_virt_ecreate); static int __sgx_virt_einit(void __user *sigstruct, void __user *token, void __user *secs) @@ -432,4 +451,4 @@ int sgx_virt_einit(void __user *sigstruct, void __user *token, return ret; } -EXPORT_SYMBOL_GPL(sgx_virt_einit); +EXPORT_SYMBOL_FOR_KVM(sgx_virt_einit); diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 6073a16628f9..f55ea3cdbf88 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -75,15 +75,11 @@ bool arch_match_cpu_phys_id(int cpu, u64 phys_id) return phys_id == (u64)cpuid_to_apicid[cpu]; } -#ifdef CONFIG_SMP static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { if (!(apicid & (__max_threads_per_core - 1))) cpumask_set_cpu(cpu, &__cpu_primary_thread_mask); } -#else -static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { } -#endif /* * Convert the APIC ID to a domain level ID by masking out the low bits diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c index b5a5e1411469..71625795d711 100644 --- a/arch/x86/kernel/cpu/topology_common.c +++ b/arch/x86/kernel/cpu/topology_common.c @@ -16,6 +16,9 @@ EXPORT_SYMBOL_GPL(x86_topo_system); unsigned int __amd_nodes_per_pkg __ro_after_init; EXPORT_SYMBOL_GPL(__amd_nodes_per_pkg); +/* CPUs which are the primary 
SMT threads */ +struct cpumask __cpu_primary_thread_mask __read_mostly; + void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom, unsigned int shift, unsigned int ncpus) { diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c index 49782724a943..209b5a22d880 100644 --- a/arch/x86/kernel/cpu/tsx.c +++ b/arch/x86/kernel/cpu/tsx.c @@ -19,7 +19,17 @@ #undef pr_fmt #define pr_fmt(fmt) "tsx: " fmt -enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED; +enum tsx_ctrl_states { + TSX_CTRL_AUTO, + TSX_CTRL_ENABLE, + TSX_CTRL_DISABLE, + TSX_CTRL_RTM_ALWAYS_ABORT, + TSX_CTRL_NOT_SUPPORTED, +}; + +static enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = + IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO) ? TSX_CTRL_AUTO : + IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF) ? TSX_CTRL_DISABLE : TSX_CTRL_ENABLE; static void tsx_disable(void) { @@ -156,11 +166,28 @@ static void tsx_dev_mode_disable(void) } } -void __init tsx_init(void) +static int __init tsx_parse_cmdline(char *str) { - char arg[5] = {}; - int ret; + if (!str) + return -EINVAL; + + if (!strcmp(str, "on")) { + tsx_ctrl_state = TSX_CTRL_ENABLE; + } else if (!strcmp(str, "off")) { + tsx_ctrl_state = TSX_CTRL_DISABLE; + } else if (!strcmp(str, "auto")) { + tsx_ctrl_state = TSX_CTRL_AUTO; + } else { + tsx_ctrl_state = TSX_CTRL_DISABLE; + pr_err("invalid option, defaulting to off\n"); + } + + return 0; +} +early_param("tsx", tsx_parse_cmdline); +void __init tsx_init(void) +{ tsx_dev_mode_disable(); /* @@ -194,27 +221,8 @@ void __init tsx_init(void) return; } - ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg)); - if (ret >= 0) { - if (!strcmp(arg, "on")) { - tsx_ctrl_state = TSX_CTRL_ENABLE; - } else if (!strcmp(arg, "off")) { - tsx_ctrl_state = TSX_CTRL_DISABLE; - } else if (!strcmp(arg, "auto")) { - tsx_ctrl_state = x86_get_tsx_auto_mode(); - } else { - tsx_ctrl_state = TSX_CTRL_DISABLE; - pr_err("invalid option, defaulting to off\n"); - } - } else { - /* tsx= not provided */ - if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO)) - tsx_ctrl_state = x86_get_tsx_auto_mode(); - else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF)) - tsx_ctrl_state = TSX_CTRL_DISABLE; - else - tsx_ctrl_state = TSX_CTRL_ENABLE; - } + if (tsx_ctrl_state == TSX_CTRL_AUTO) + tsx_ctrl_state = x86_get_tsx_auto_mode(); if (tsx_ctrl_state == TSX_CTRL_DISABLE) { tsx_disable(); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 71ee20102a8a..b10684dedc58 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -181,8 +181,8 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, * in false positive reports. Disable instrumentation to avoid those. */ __no_kmsan_checks -static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, const char *log_lvl) +static void __show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, const char *log_lvl) { struct unwind_state state; struct stack_info stack_info = {0}; @@ -303,6 +303,25 @@ next: } } +static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, const char *log_lvl) +{ + /* + * Disable KASAN to avoid false positives during walking another + * task's stacks, as values on these stacks may change concurrently + * with task execution. 
+ */ + bool disable_kasan = task && task != current; + + if (disable_kasan) + kasan_disable_current(); + + __show_trace_log_lvl(task, regs, stack, log_lvl); + + if (disable_kasan) + kasan_enable_current(); +} + void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index c3acbd26408b..b15b97d3cb52 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -16,6 +16,7 @@ #include <linux/firmware-map.h> #include <linux/sort.h> #include <linux/memory_hotplug.h> +#include <linux/kvm_types.h> #include <asm/e820/api.h> #include <asm/setup.h> @@ -95,7 +96,7 @@ bool e820__mapped_raw_any(u64 start, u64 end, enum e820_type type) { return _e820__mapped_any(e820_table_firmware, start, end, type); } -EXPORT_SYMBOL_GPL(e820__mapped_raw_any); +EXPORT_SYMBOL_FOR_KVM(e820__mapped_raw_any); bool e820__mapped_any(u64 start, u64 end, enum e820_type type) { diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 1f71cc135e9a..da233f20ae6f 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -18,6 +18,7 @@ #include <uapi/asm/kvm.h> #include <linux/hardirq.h> +#include <linux/kvm_types.h> #include <linux/pkeys.h> #include <linux/vmalloc.h> @@ -276,7 +277,7 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) return true; } -EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate); +EXPORT_SYMBOL_FOR_KVM(fpu_alloc_guest_fpstate); void fpu_free_guest_fpstate(struct fpu_guest *gfpu) { @@ -291,7 +292,7 @@ void fpu_free_guest_fpstate(struct fpu_guest *gfpu) gfpu->fpstate = NULL; vfree(fpstate); } -EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate); +EXPORT_SYMBOL_FOR_KVM(fpu_free_guest_fpstate); /* * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable @@ -313,7 +314,7 @@ int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures) return __xfd_enable_feature(xfeatures, guest_fpu); } -EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features); +EXPORT_SYMBOL_FOR_KVM(fpu_enable_guest_xfd_features); #ifdef CONFIG_X86_64 void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) @@ -324,7 +325,7 @@ void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) xfd_update_state(guest_fpu->fpstate); fpregs_unlock(); } -EXPORT_SYMBOL_GPL(fpu_update_guest_xfd); +EXPORT_SYMBOL_FOR_KVM(fpu_update_guest_xfd); /** * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state @@ -348,7 +349,7 @@ void fpu_sync_guest_vmexit_xfd_state(void) __this_cpu_write(xfd_state, fpstate->xfd); } } -EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state); +EXPORT_SYMBOL_FOR_KVM(fpu_sync_guest_vmexit_xfd_state); #endif /* CONFIG_X86_64 */ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) @@ -390,7 +391,7 @@ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) fpregs_unlock(); return 0; } -EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate); +EXPORT_SYMBOL_FOR_KVM(fpu_swap_kvm_fpstate); void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u64 xfeatures, u32 pkru) @@ -409,7 +410,7 @@ void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE; } } -EXPORT_SYMBOL_GPL(fpu_copy_guest_fpstate_to_uabi); +EXPORT_SYMBOL_FOR_KVM(fpu_copy_guest_fpstate_to_uabi); int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru) @@ -439,7 +440,7 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, return 
copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru); } -EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); +EXPORT_SYMBOL_FOR_KVM(fpu_copy_uabi_to_guest_fpstate); #endif /* CONFIG_KVM */ void kernel_fpu_begin_mask(unsigned int kfpu_mask) @@ -825,6 +826,9 @@ void fpu__clear_user_states(struct fpu *fpu) !fpregs_state_valid(fpu, smp_processor_id())) os_xrstor_supervisor(fpu->fpstate); + /* Ensure XFD state is in sync before reloading XSTATE */ + xfd_update_state(fpu->fpstate); + /* Reset user states in registers. */ restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE); @@ -854,7 +858,7 @@ void switch_fpu_return(void) fpregs_restore_userregs(); } -EXPORT_SYMBOL_GPL(switch_fpu_return); +EXPORT_SYMBOL_FOR_KVM(switch_fpu_return); void fpregs_lock_and_load(void) { @@ -889,7 +893,7 @@ void fpregs_assert_state_consistent(void) WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id())); } -EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent); +EXPORT_SYMBOL_FOR_KVM(fpregs_assert_state_consistent); #endif void fpregs_mark_activate(void) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 28e4fd65c9da..48113c5193aa 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -8,6 +8,7 @@ #include <linux/compat.h> #include <linux/cpu.h> #include <linux/mman.h> +#include <linux/kvm_types.h> #include <linux/nospec.h> #include <linux/pkeys.h> #include <linux/seq_file.h> @@ -1058,7 +1059,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) return __raw_xsave_addr(xsave, xfeature_nr); } -EXPORT_SYMBOL_GPL(get_xsave_addr); +EXPORT_SYMBOL_FOR_KVM(get_xsave_addr); /* * Given an xstate feature nr, calculate where in the xsave buffer the state is. @@ -1482,7 +1483,7 @@ void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeatu if (addr) memset(addr, 0, xstate_sizes[xfeature]); } -EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); +EXPORT_SYMBOL_FOR_KVM(fpstate_clear_xstate_component); #endif #ifdef CONFIG_X86_64 @@ -1818,7 +1819,7 @@ u64 xstate_get_guest_group_perm(void) { return xstate_get_group_perm(true); } -EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); +EXPORT_SYMBOL_FOR_KVM(xstate_get_guest_group_perm); /** * fpu_xstate_prctl - xstate permission operations diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 367da3638167..823dbdd0eb41 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -354,12 +354,17 @@ SYM_CODE_START(return_to_handler) UNWIND_HINT_UNDEFINED ANNOTATE_NOENDBR + /* Restore return_to_handler value that got eaten by previous ret instruction. */ + subq $8, %rsp + UNWIND_HINT_FUNC + /* Save ftrace_regs for function exit context */ subq $(FRAME_SIZE), %rsp movq %rax, RAX(%rsp) movq %rdx, RDX(%rsp) movq %rbp, RBP(%rsp) + movq %rsp, RSP(%rsp) movq %rsp, %rdi call ftrace_return_to_handler @@ -368,7 +373,8 @@ SYM_CODE_START(return_to_handler) movq RDX(%rsp), %rdx movq RAX(%rsp), %rax - addq $(FRAME_SIZE), %rsp + addq $(FRAME_SIZE) + 8, %rsp + /* * Jump back to the old return address. 
This cannot be JMP_NOSPEC rdi * since IBT would demand that contain ENDBR, which simply isn't so for diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index b01644c949b2..f846c15f21ca 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -24,6 +24,7 @@ #include <linux/percpu.h> #include <linux/kdebug.h> #include <linux/kernel.h> +#include <linux/kvm_types.h> #include <linux/export.h> #include <linux/sched.h> #include <linux/smp.h> @@ -489,7 +490,7 @@ void hw_breakpoint_restore(void) set_debugreg(DR6_RESERVED, 6); set_debugreg(__this_cpu_read(cpu_dr7), 7); } -EXPORT_SYMBOL_GPL(hw_breakpoint_restore); +EXPORT_SYMBOL_FOR_KVM(hw_breakpoint_restore); /* * Handle debug exception notifications. diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 10721a125226..86f4e574de02 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -12,6 +12,7 @@ #include <linux/delay.h> #include <linux/export.h> #include <linux/irq.h> +#include <linux/kvm_types.h> #include <asm/irq_stack.h> #include <asm/apic.h> @@ -361,7 +362,7 @@ void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)) synchronize_rcu(); } } -EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler); +EXPORT_SYMBOL_FOR_KVM(kvm_set_posted_intr_wakeup_handler); /* * Handler for POSTED_INTERRUPT_VECTOR. diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 3863d7709386..c1fac3a9fecc 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -141,7 +141,6 @@ bool can_boost(struct insn *insn, void *addr) { kprobe_opcode_t opcode; insn_byte_t prefix; - int i; if (search_exception_tables((unsigned long)addr)) return false; /* Page fault may occur on this address. */ @@ -154,7 +153,7 @@ bool can_boost(struct insn *insn, void *addr) if (insn->opcode.nbytes != 1) return false; - for_each_insn_prefix(insn, i, prefix) { + for_each_insn_prefix(insn, prefix) { insn_attr_t attr; attr = inat_get_opcode_attribute(prefix); diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 0aabd4c4e2c4..6f826a00eca2 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -103,7 +103,6 @@ static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val) asm ( ".pushsection .rodata\n" - "optprobe_template_func:\n" ".global optprobe_template_entry\n" "optprobe_template_entry:\n" #ifdef CONFIG_X86_64 @@ -160,9 +159,6 @@ asm ( "optprobe_template_end:\n" ".popsection\n"); -void optprobe_template_func(void); -STACK_FRAME_NON_STANDARD(optprobe_template_func); - #define TMPL_CLAC_IDX \ ((long)optprobe_template_clac - (long)optprobe_template_entry) #define TMPL_MOVE_IDX \ diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index b67d7c59dca0..204765004c72 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -29,6 +29,7 @@ #include <linux/syscore_ops.h> #include <linux/cc_platform.h> #include <linux/efi.h> +#include <linux/kvm_types.h> #include <asm/timer.h> #include <asm/cpu.h> #include <asm/traps.h> @@ -162,7 +163,7 @@ void kvm_async_pf_task_wait_schedule(u32 token) } finish_swait(&n.wq, &wait); } -EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule); +EXPORT_SYMBOL_FOR_KVM(kvm_async_pf_task_wait_schedule); static void apf_task_wake_one(struct kvm_task_sleep_node *n) { @@ -253,7 +254,7 @@ noinstr u32 kvm_read_and_reset_apf_flags(void) return flags; } -EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags); +EXPORT_SYMBOL_FOR_KVM(kvm_read_and_reset_apf_flags); noinstr bool 
__kvm_handle_async_pf(struct pt_regs *regs, u32 token) { diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 0ffbae902e2f..11c45ce42694 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -97,6 +97,7 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs, DEBUGP("%s relocate section %u to %u\n", apply ? "Applying" : "Clearing", relsec, sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { size_t size; @@ -162,15 +163,17 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs, if (apply) { if (memcmp(loc, &zero, size)) { - pr_err("x86/modules: Invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n", - (int)ELF64_R_TYPE(rel[i].r_info), loc, val); + pr_err("x86/modules: Invalid relocation target, existing value is nonzero for sec %u, idx %u, type %d, loc %lx, val %llx\n", + relsec, i, (int)ELF64_R_TYPE(rel[i].r_info), + (unsigned long)loc, val); return -ENOEXEC; } write(loc, &val, size); } else { if (memcmp(loc, &val, size)) { - pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for type %d, loc %p, val %Lx\n", - (int)ELF64_R_TYPE(rel[i].r_info), loc, val); + pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for sec %u, idx %u, type %d, loc %lx, val %llx\n", + relsec, i, (int)ELF64_R_TYPE(rel[i].r_info), + (unsigned long)loc, val); return -ENOEXEC; } write(loc, &zero, size); @@ -179,8 +182,8 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs, return 0; overflow: - pr_err("overflow in relocation type %d val %Lx\n", - (int)ELF64_R_TYPE(rel[i].r_info), val); + pr_err("overflow in relocation type %d val %llx sec %u idx %d\n", + (int)ELF64_R_TYPE(rel[i].r_info), val, relsec, i); pr_err("`%s' likely not compiled with -mcmodel=kernel\n", me->name); return -ENOEXEC; diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index e17c16c54a37..4469c784eaa0 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -98,7 +98,7 @@ static int filter_write(u32 reg) if (!__ratelimit(&fw_rs)) return 0; - pr_warn("Write to unrecognized MSR 0x%x by %s (pid: %d).\n", + pr_warn("Write to unrecognized MSR 0x%x by %s (pid: %d), tainting CPU_OUT_OF_SPEC.\n", reg, current->comm, current->pid); pr_warn("See https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/about for details.\n"); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index be93ec7255bf..3d239ed12744 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -24,6 +24,7 @@ #include <linux/export.h> #include <linux/atomic.h> #include <linux/sched/clock.h> +#include <linux/kvm_types.h> #include <asm/cpu_entry_area.h> #include <asm/traps.h> @@ -613,9 +614,7 @@ DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx) { exc_nmi(regs); } -#if IS_MODULE(CONFIG_KVM_INTEL) -EXPORT_SYMBOL_GPL(asm_exc_nmi_kvm_vmx); -#endif +EXPORT_SYMBOL_FOR_KVM(asm_exc_nmi_kvm_vmx); #endif #ifdef CONFIG_NMI_CHECK_CPU diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 52a5c03c353c..432c0a004c60 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -30,6 +30,7 @@ #include <linux/interrupt.h> #include <linux/delay.h> #include <linux/export.h> +#include <linux/kvm_types.h> #include <linux/ptrace.h> #include <linux/notifier.h> #include <linux/kprobes.h> @@ -303,9 +304,7 @@ void current_save_fsgs(void) save_fsgs(current); local_irq_restore(flags); } -#if IS_ENABLED(CONFIG_KVM) -EXPORT_SYMBOL_GPL(current_save_fsgs); -#endif 
+EXPORT_SYMBOL_FOR_KVM(current_save_fsgs); static __always_inline void loadseg(enum which_selector which, unsigned short sel) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 964f6b0a3d68..6032fa9ec753 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -13,6 +13,7 @@ #include <linux/objtool.h> #include <linux/pgtable.h> #include <linux/kexec.h> +#include <linux/kvm_types.h> #include <acpi/reboot.h> #include <asm/io.h> #include <asm/apic.h> @@ -541,7 +542,7 @@ void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) rcu_assign_pointer(cpu_emergency_virt_callback, callback); } -EXPORT_SYMBOL_GPL(cpu_emergency_register_virt_callback); +EXPORT_SYMBOL_FOR_KVM(cpu_emergency_register_virt_callback); void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) { @@ -551,7 +552,7 @@ void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) rcu_assign_pointer(cpu_emergency_virt_callback, NULL); synchronize_rcu(); } -EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback); +EXPORT_SYMBOL_FOR_KVM(cpu_emergency_unregister_virt_callback); /* * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 11e20bb13aca..4ffba68dc57b 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -95,9 +95,12 @@ SYM_CODE_START_NOALIGN(relocate_kernel) /* Leave CR4 in %r13 to enable the right paging mode later. */ movq %cr4, %r13 - /* Disable global pages immediately to ensure this mapping is RWX */ + /* + * Disable global pages immediately to ensure this mapping is RWX. + * Disable LASS before jumping to the identity mapped page. + */ movq %r13, %r12 - andq $~(X86_CR4_PGE), %r12 + andq $~(X86_CR4_PGE | X86_CR4_LASS), %r12 movq %r12, %cr4 /* Save %rsp and CRs. 
*/ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index eb289abece23..5cd6950ab672 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -103,9 +103,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_core_map); DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); EXPORT_PER_CPU_SYMBOL(cpu_die_map); -/* CPUs which are the primary SMT threads */ -struct cpumask __cpu_primary_thread_mask __read_mostly; - /* Representing CPUs for which sibling maps can be computed */ static cpumask_var_t cpu_sibling_setup_mask; @@ -515,6 +512,76 @@ static void __init build_sched_topology(void) set_sched_topology(topology); } +#ifdef CONFIG_NUMA +static int sched_avg_remote_distance; +static int avg_remote_numa_distance(void) +{ + int i, j; + int distance, nr_remote, total_distance; + + if (sched_avg_remote_distance > 0) + return sched_avg_remote_distance; + + nr_remote = 0; + total_distance = 0; + for_each_node_state(i, N_CPU) { + for_each_node_state(j, N_CPU) { + distance = node_distance(i, j); + + if (distance >= REMOTE_DISTANCE) { + nr_remote++; + total_distance += distance; + } + } + } + if (nr_remote) + sched_avg_remote_distance = total_distance / nr_remote; + else + sched_avg_remote_distance = REMOTE_DISTANCE; + + return sched_avg_remote_distance; +} + +int arch_sched_node_distance(int from, int to) +{ + int d = node_distance(from, to); + + switch (boot_cpu_data.x86_vfm) { + case INTEL_GRANITERAPIDS_X: + case INTEL_ATOM_DARKMONT_X: + + if (!x86_has_numa_in_package || topology_max_packages() == 1 || + d < REMOTE_DISTANCE) + return d; + + /* + * With SNC enabled, there could be too many levels of remote + * NUMA node distances, creating NUMA domain levels + * including local nodes and partial remote nodes. + * + * Trim finer distance tuning for NUMA nodes in remote package + * for the purpose of building sched domains. Group NUMA nodes + * in the remote package in the same sched group. + * Simplify NUMA domains and avoid extra NUMA levels including + * different remote NUMA nodes and local nodes. + * + * GNR and CWF don't expect systems with more than 2 packages + * and more than 2 hops between packages. Single average remote + * distance won't be appropriate if there are more than 2 + * packages as average distance to different remote packages + * could be different. + */ + WARN_ONCE(topology_max_packages() > 2, + "sched: Expect only up to 2 packages for GNR or CWF, " + "but saw %d packages when building sched domains.", + topology_max_packages()); + + d = avg_remote_numa_distance(); + } + return d; +} +#endif /* CONFIG_NUMA */ + void set_cpu_sibling_map(int cpu) { bool has_smt = __max_threads_per_core > 1; @@ -1328,11 +1395,7 @@ void __noreturn hlt_play_dead(void) native_halt(); } -/* - * native_play_dead() is essentially a __noreturn function, but it can't - * be marked as such as the compiler may complain about it. 
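avg_remote_numa_distance() above collapses all remote distances into one average on the affected parts. A stand-alone calculation over a made-up SLIT-style table for a two-package, SNC-enabled system (REMOTE_DISTANCE taken as its usual value of 20) shows 21 and 23 both being reported as 22 when building sched domains:

#include <stdio.h>

#define REMOTE_DISTANCE	20
#define NR_NODES	4

/* Example distances: two nodes per package, values are made up. */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 12, 21, 23 },
	{ 12, 10, 23, 21 },
	{ 21, 23, 10, 12 },
	{ 23, 21, 12, 10 },
};

int main(void)
{
	int i, j, nr_remote = 0, total = 0;

	for (i = 0; i < NR_NODES; i++)
		for (j = 0; j < NR_NODES; j++)
			if (dist[i][j] >= REMOTE_DISTANCE) {
				nr_remote++;
				total += dist[i][j];
			}

	printf("average remote distance: %d\n",
	       nr_remote ? total / nr_remote : REMOTE_DISTANCE);	/* 22 */
	return 0;
}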
- */ -void native_play_dead(void) +void __noreturn native_play_dead(void) { if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) __update_spec_ctrl(0); @@ -1351,7 +1414,7 @@ int native_cpu_disable(void) return -ENOSYS; } -void native_play_dead(void) +void __noreturn native_play_dead(void) { BUG(); } diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index 378c388d1b31..2892cdb14563 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -26,6 +26,11 @@ static const u8 xor5rax[] = { 0x2e, 0x2e, 0x2e, 0x31, 0xc0 }; static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc }; +/* + * ud1 (%edx),%rdi -- see __WARN_trap() / decode_bug() + */ +static const u8 warninsn[] = { 0x67, 0x48, 0x0f, 0xb9, 0x3a }; + static u8 __is_Jcc(u8 *insn) /* Jcc.d32 */ { u8 ret = 0; @@ -69,7 +74,10 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, emulate = code; code = &xor5rax; } - + if (func == &__WARN_trap) { + emulate = code; + code = &warninsn; + } break; case NOP: @@ -128,7 +136,8 @@ static void __static_call_validate(u8 *insn, bool tail, bool tramp) } else { if (opcode == CALL_INSN_OPCODE || !memcmp(insn, x86_nops[5], 5) || - !memcmp(insn, xor5rax, 5)) + !memcmp(insn, xor5rax, 5) || + !memcmp(insn, warninsn, 5)) return; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 6b22611e69cc..bcf1dedc1d00 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -31,6 +31,7 @@ #include <linux/kexec.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> +#include <linux/static_call.h> #include <linux/timer.h> #include <linux/init.h> #include <linux/bug.h> @@ -102,25 +103,37 @@ __always_inline int is_valid_bugaddr(unsigned long addr) * UBSan{0}: 67 0f b9 00 ud1 (%eax),%eax * UBSan{10}: 67 0f b9 40 10 ud1 0x10(%eax),%eax * static_call: 0f b9 cc ud1 %esp,%ecx + * __WARN_trap: 67 48 0f b9 3a ud1 (%edx),%reg * - * Notably UBSAN uses EAX, static_call uses ECX. + * Notable, since __WARN_trap can use all registers, the distinction between + * UD1 users is through R/M. 
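The warninsn bytes above (67 48 0f b9 3a) are an address-size prefix, a REX.W prefix, the UD1 opcode 0f b9, and the ModRM byte 3a. A minimal stand-alone decoder of just the ModRM and REX fields, mirroring the reg/rm computation in decode_bug(), shows how the R/M field separates the UBSAN and __WARN_trap users and how the REG field names the register that handle_bug() later reads the bug entry pointer from. The macro names below are local stand-ins for the kernel's X86_MODRM_*/X86_REX_* helpers.

#include <stdint.h>
#include <stdio.h>

#define MODRM_MOD(b)	(((b) >> 6) & 3)
#define MODRM_REG(b)	(((b) >> 3) & 7)
#define MODRM_RM(b)	((b) & 7)
#define REX_R(r)	(((r) >> 2) & 1)
#define REX_B(r)	((r) & 1)

static const char * const gpr[16] = {
	"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
	"r8",  "r9",  "r10", "r11", "r12", "r13", "r14", "r15",
};

int main(void)
{
	/* warninsn from above: 67 48 0f b9 3a = ud1 (%edx),%rdi */
	uint8_t rex = 0x48, modrm = 0x3a;
	unsigned int reg = MODRM_REG(modrm) + 8 * REX_R(rex);
	unsigned int rm = MODRM_RM(modrm) + 8 * REX_B(rex);

	if (rm == 0)
		printf("UBSAN-style ud1 (base %%eax)\n");
	else if (rm == 2)
		printf("__WARN_trap-style ud1: bug_entry pointer in %%%s\n",
		       gpr[reg]);
	return 0;
}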
*/ __always_inline int decode_bug(unsigned long addr, s32 *imm, int *len) { unsigned long start = addr; + u8 v, reg, rm, rex = 0; + int type = BUG_UD1; bool lock = false; - u8 v; if (addr < TASK_SIZE_MAX) return BUG_NONE; - v = *(u8 *)(addr++); - if (v == INSN_ASOP) + for (;;) { v = *(u8 *)(addr++); + if (v == INSN_ASOP) + continue; - if (v == INSN_LOCK) { - lock = true; - v = *(u8 *)(addr++); + if (v == INSN_LOCK) { + lock = true; + continue; + } + + if ((v & 0xf0) == 0x40) { + rex = v; + continue; + } + + break; } switch (v) { @@ -156,18 +169,33 @@ __always_inline int decode_bug(unsigned long addr, s32 *imm, int *len) if (X86_MODRM_MOD(v) != 3 && X86_MODRM_RM(v) == 4) addr++; /* SIB */ + reg = X86_MODRM_REG(v) + 8*!!X86_REX_R(rex); + rm = X86_MODRM_RM(v) + 8*!!X86_REX_B(rex); + /* Decode immediate, if present */ switch (X86_MODRM_MOD(v)) { case 0: if (X86_MODRM_RM(v) == 5) - addr += 4; /* RIP + disp32 */ + addr += 4; /* RIP + disp32 */ + + if (rm == 0) /* (%eax) */ + type = BUG_UD1_UBSAN; + + if (rm == 2) { /* (%edx) */ + *imm = reg; + type = BUG_UD1_WARN; + } break; case 1: *imm = *(s8 *)addr; addr += 1; + if (rm == 0) /* (%eax) */ + type = BUG_UD1_UBSAN; break; case 2: *imm = *(s32 *)addr; addr += 4; + if (rm == 0) /* (%eax) */ + type = BUG_UD1_UBSAN; break; case 3: break; @@ -176,12 +204,76 @@ __always_inline int decode_bug(unsigned long addr, s32 *imm, int *len) /* record instruction length */ *len = addr - start; - if (X86_MODRM_REG(v) == 0) /* EAX */ - return BUG_UD1_UBSAN; + return type; +} - return BUG_UD1; +static inline unsigned long pt_regs_val(struct pt_regs *regs, int nr) +{ + int offset = pt_regs_offset(regs, nr); + if (WARN_ON_ONCE(offset < -0)) + return 0; + return *((unsigned long *)((void *)regs + offset)); } +#ifdef HAVE_ARCH_BUG_FORMAT_ARGS +DEFINE_STATIC_CALL(WARN_trap, __WARN_trap); +EXPORT_STATIC_CALL_TRAMP(WARN_trap); + +/* + * Create a va_list from an exception context. + */ +void *__warn_args(struct arch_va_list *args, struct pt_regs *regs) +{ + /* + * Register save area; populate with function call argument registers + */ + args->regs[0] = regs->di; + args->regs[1] = regs->si; + args->regs[2] = regs->dx; + args->regs[3] = regs->cx; + args->regs[4] = regs->r8; + args->regs[5] = regs->r9; + + /* + * From the ABI document: + * + * @gp_offset - the element holds the offset in bytes from + * reg_save_area to the place where the next available general purpose + * argument register is saved. In case all argument registers have + * been exhausted, it is set to the value 48 (6*8). + * + * @fp_offset - the element holds the offset in bytes from + * reg_save_area to the place where the next available floating point + * argument is saved. In case all argument registers have been + * exhausted, it is set to the value 176 (6*8 + 8*16) + * + * @overflow_arg_area - this pointer is used to fetch arguments passed + * on the stack. It is initialized with the address of the first + * argument passed on the stack, if any, and then always updated to + * point to the start of the next argument on the stack. + * + * @reg_save_area - the element points to the start of the register + * save area. + * + * Notably the vararg starts with the second argument and there are no + * floating point arguments in the kernel. + */ + args->args.gp_offset = 1*8; + args->args.fp_offset = 6*8 + 8*16; + args->args.reg_save_area = &args->regs; + args->args.overflow_arg_area = (void *)regs->sp; + + /* + * If the exception came from __WARN_trap, there is a return + * address on the stack, skip that. 
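Editor's aside, not part of the patch: __warn_args() above hand-builds the System V x86-64 va_list that va_start() would normally produce. A hypothetical userspace peek at that structure, relying on the psABI layout used by GCC/Clang on x86-64 Linux (illustration only):

#include <stdarg.h>
#include <stdio.h>

struct sysv_va_list {
	unsigned int gp_offset;		/* next free GP slot in reg_save_area */
	unsigned int fp_offset;		/* next free FP slot; 6*8 + 8*16 when exhausted */
	void *overflow_arg_area;	/* stack-passed arguments */
	void *reg_save_area;		/* %rdi,%rsi,%rdx,%rcx,%r8,%r9 then XMM0-7 */
};

static void dump(const char *fmt, ...)
{
	va_list ap;
	struct sysv_va_list *v;

	va_start(ap, fmt);
	/* on x86-64, va_list is a one-element array of this structure */
	v = (struct sysv_va_list *)ap;
	printf("%s: gp_offset=%u fp_offset=%u\n", fmt, v->gp_offset, v->fp_offset);
	va_end(ap);
}

int main(void)
{
	/*
	 * Prints gp_offset=8, the same 1*8 __warn_args() uses because only
	 * the format string occupies %rdi. Userspace reports fp_offset=48
	 * here; __warn_args() pins it at 6*8 + 8*16 since the kernel never
	 * passes floating point varargs.
	 */
	dump("layout", 1, 2, 3);
	return 0;
}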
This is why any __WARN_trap() + * caller must inhibit tail-call optimization. + */ + if ((void *)regs->ip == &__WARN_trap) + args->args.overflow_arg_area += 8; + + return &args->args; +} +#endif /* HAVE_ARCH_BUG_FORMAT */ static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, @@ -334,6 +426,11 @@ static noinstr bool handle_bug(struct pt_regs *regs) raw_local_irq_enable(); switch (ud_type) { + case BUG_UD1_WARN: + if (report_bug_entry((void *)pt_regs_val(regs, ud_imm), regs) == BUG_TRAP_TYPE_WARN) + handled = true; + break; + case BUG_UD2: if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) { handled = true; @@ -635,13 +732,23 @@ DEFINE_IDTENTRY(exc_bounds) enum kernel_gp_hint { GP_NO_HINT, GP_NON_CANONICAL, - GP_CANONICAL + GP_CANONICAL, + GP_LASS_VIOLATION, + GP_NULL_POINTER, +}; + +static const char * const kernel_gp_hint_help[] = { + [GP_NON_CANONICAL] = "probably for non-canonical address", + [GP_CANONICAL] = "maybe for address", + [GP_LASS_VIOLATION] = "probably LASS violation for address", + [GP_NULL_POINTER] = "kernel NULL pointer dereference", }; /* * When an uncaught #GP occurs, try to determine the memory address accessed by * the instruction and return that address to the caller. Also, try to figure - * out whether any part of the access to that address was non-canonical. + * out whether any part of the access to that address was non-canonical or + * across privilege levels. */ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs, unsigned long *addr) @@ -663,14 +770,28 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs, return GP_NO_HINT; #ifdef CONFIG_X86_64 + /* Operand is in the kernel half */ + if (*addr >= ~__VIRTUAL_MASK) + return GP_CANONICAL; + + /* The last byte of the operand is not in the user canonical half */ + if (*addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK) + return GP_NON_CANONICAL; + /* - * Check that: - * - the operand is not in the kernel half - * - the last byte of the operand is not in the user canonical half + * A NULL pointer dereference usually causes a #PF. However, it + * can result in a #GP when LASS is active. Provide the same + * hint in the rare case that the condition is hit without LASS. */ - if (*addr < ~__VIRTUAL_MASK && - *addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK) - return GP_NON_CANONICAL; + if (*addr < PAGE_SIZE) + return GP_NULL_POINTER; + + /* + * Assume that LASS caused the exception, because the address is + * canonical and in the user half. + */ + if (cpu_feature_enabled(X86_FEATURE_LASS)) + return GP_LASS_VIOLATION; #endif return GP_CANONICAL; @@ -833,9 +954,7 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) if (hint != GP_NO_HINT) snprintf(desc, sizeof(desc), GPFSTR ", %s 0x%lx", - (hint == GP_NON_CANONICAL) ? 
"probably for non-canonical address" - : "maybe for address", - gp_addr); + kernel_gp_hint_help[hint], gp_addr); /* * KASAN is interested only in the non-canonical case, clear it diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 87e749106dda..7d3e13e14eab 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -11,6 +11,7 @@ #include <linux/cpufreq.h> #include <linux/delay.h> #include <linux/clocksource.h> +#include <linux/kvm_types.h> #include <linux/percpu.h> #include <linux/timex.h> #include <linux/static_key.h> diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 845aeaf36b8d..7be8e361ca55 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -17,6 +17,7 @@ #include <linux/kdebug.h> #include <asm/processor.h> #include <asm/insn.h> +#include <asm/insn-eval.h> #include <asm/mmu_context.h> #include <asm/nops.h> @@ -258,9 +259,8 @@ static volatile u32 good_2byte_insns[256 / 32] = { static bool is_prefix_bad(struct insn *insn) { insn_byte_t p; - int i; - for_each_insn_prefix(insn, i, p) { + for_each_insn_prefix(insn, p) { insn_attr_t attr; attr = inat_get_opcode_attribute(p); @@ -1158,35 +1158,12 @@ unlock: mmap_write_unlock(mm); } -static bool insn_is_nop(struct insn *insn) -{ - return insn->opcode.nbytes == 1 && insn->opcode.bytes[0] == 0x90; -} - -static bool insn_is_nopl(struct insn *insn) -{ - if (insn->opcode.nbytes != 2) - return false; - - if (insn->opcode.bytes[0] != 0x0f || insn->opcode.bytes[1] != 0x1f) - return false; - - if (!insn->modrm.nbytes) - return false; - - if (X86_MODRM_REG(insn->modrm.bytes[0]) != 0) - return false; - - /* 0f 1f /0 - NOPL */ - return true; -} - static bool can_optimize(struct insn *insn, unsigned long vaddr) { if (!insn->x86_64 || insn->length != 5) return false; - if (!insn_is_nop(insn) && !insn_is_nopl(insn)) + if (!insn_is_nop(insn)) return false; /* We can't do cross page atomic writes yet. */ @@ -1426,19 +1403,14 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) { u8 opc1 = OPCODE1(insn); insn_byte_t p; - int i; - /* x86_nops[insn->length]; same as jmp with .offs = 0 */ - if (insn->length <= ASM_NOP_MAX && - !memcmp(insn->kaddr, x86_nops[insn->length], insn->length)) + if (insn_is_nop(insn)) goto setup; switch (opc1) { case 0xeb: /* jmp 8 */ case 0xe9: /* jmp 32 */ break; - case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ - goto setup; case 0xe8: /* call relative */ branch_clear_offset(auprobe, insn); @@ -1463,7 +1435,7 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) * Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix. * No one uses these insns, reject any branch insns with such prefix. */ - for_each_insn_prefix(insn, i, p) { + for_each_insn_prefix(insn, p) { if (p == 0x66) return -ENOTSUPP; } @@ -1819,3 +1791,35 @@ bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, else return regs->sp <= ret->stack; } + +/* + * Heuristic-based check if uprobe is installed at the function entry. + * + * Under assumption of user code being compiled with frame pointers, + * `push %rbp/%ebp` is a good indicator that we indeed are. + * + * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern. + * If we get this wrong, captured stack trace might have one extra bogus + * entry, but the rest of stack trace will still be meaningful. 
+ */ +bool is_uprobe_at_func_entry(struct pt_regs *regs) +{ + struct arch_uprobe *auprobe; + + if (!current->utask) + return false; + + auprobe = current->utask->auprobe; + if (!auprobe) + return false; + + /* push %rbp/%ebp */ + if (auprobe->insn[0] == 0x55) + return true; + + /* endbr64 (64-bit only) */ + if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn)) + return true; + + return false; +} diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 40ac4cb44ed2..487ad19a236e 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -108,16 +108,18 @@ void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS; - perf_get_x86_pmu_capability(&kvm_host_pmu); - /* * Hybrid PMUs don't play nice with virtualization without careful * configuration by userspace, and KVM's APIs for reporting supported * vPMU features do not account for hybrid PMUs. Disable vPMU support * for hybrid PMUs until KVM gains a way to let userspace opt-in. */ - if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) { enable_pmu = false; + memset(&kvm_host_pmu, 0, sizeof(kvm_host_pmu)); + } else { + perf_get_x86_pmu_capability(&kvm_host_pmu); + } if (enable_pmu) { /* diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 743ab25ba787..81b4a7acf72e 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -78,7 +78,6 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_6_EAX] = { 6, 0, CPUID_EAX}, [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX}, [CPUID_7_ECX] = { 7, 0, CPUID_ECX}, - [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX}, [CPUID_7_EDX] = { 7, 0, CPUID_EDX}, [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX}, [CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX}, diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index f286b5706d7c..fef00546c885 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -216,7 +216,7 @@ static void avic_deactivate_vmcb(struct vcpu_svm *svm) * This function is called from IOMMU driver to notify * SVM to schedule in a particular vCPU of a particular VM. */ -int avic_ga_log_notifier(u32 ga_tag) +static int avic_ga_log_notifier(u32 ga_tag) { unsigned long flags; struct kvm_svm *kvm_svm; @@ -788,7 +788,7 @@ int avic_init_vcpu(struct vcpu_svm *svm) struct kvm_vcpu *vcpu = &svm->vcpu; INIT_LIST_HEAD(&svm->ir_list); - spin_lock_init(&svm->ir_list_lock); + raw_spin_lock_init(&svm->ir_list_lock); if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm)) return 0; @@ -816,9 +816,9 @@ static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd) if (!vcpu) return; - spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags); + raw_spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags); list_del(&irqfd->vcpu_list); - spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags); + raw_spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags); } int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, @@ -855,7 +855,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, * list of IRQs being posted to the vCPU, to ensure the IRTE * isn't programmed with stale pCPU/IsRunning information. 
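Editor's note, not part of the patch: the guard(raw_spinlock_irqsave)(...) form used just below is the kernel's scope-based locking from <linux/cleanup.h>, built on the compiler's cleanup attribute, so the lock is dropped automatically when the scope is left (the switch to the raw_ variant keeps it a true spinning lock, which matters on PREEMPT_RT where spinlock_t can sleep). A hypothetical userspace equivalent of the mechanism, using pthreads:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int counter;

static void unlock_cleanup(pthread_mutex_t **m)
{
	pthread_mutex_unlock(*m);
}

/* take the lock now, and schedule the unlock for whenever this scope ends */
#define scoped_guard_mutex(m) \
	pthread_mutex_t *__guard __attribute__((cleanup(unlock_cleanup))) = \
		(pthread_mutex_lock(m), (m))

static void bump(void)
{
	scoped_guard_mutex(&lock);
	counter++;			/* unlocked automatically on return */
}

int main(void)
{
	bump();
	printf("counter = %d\n", counter);
	return 0;
}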
*/ - guard(spinlock_irqsave)(&svm->ir_list_lock); + guard(raw_spinlock_irqsave)(&svm->ir_list_lock); /* * Update the target pCPU for IOMMU doorbells if the vCPU is @@ -972,7 +972,7 @@ static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu, * up-to-date entry information, or that this task will wait until * svm_ir_list_add() completes to set the new target pCPU. */ - spin_lock_irqsave(&svm->ir_list_lock, flags); + raw_spin_lock_irqsave(&svm->ir_list_lock, flags); entry = svm->avic_physical_id_entry; WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); @@ -997,7 +997,7 @@ static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu, avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action); - spin_unlock_irqrestore(&svm->ir_list_lock, flags); + raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags); } void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -1035,7 +1035,7 @@ static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action) * or that this task will wait until svm_ir_list_add() completes to * mark the vCPU as not running. */ - spin_lock_irqsave(&svm->ir_list_lock, flags); + raw_spin_lock_irqsave(&svm->ir_list_lock, flags); avic_update_iommu_vcpu_affinity(vcpu, -1, action); @@ -1059,7 +1059,7 @@ static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action) svm->avic_physical_id_entry = entry; - spin_unlock_irqrestore(&svm->ir_list_lock, flags); + raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags); } void avic_vcpu_put(struct kvm_vcpu *vcpu) @@ -1243,3 +1243,9 @@ bool __init avic_hardware_setup(void) return true; } + +void avic_hardware_unsetup(void) +{ + if (avic) + amd_iommu_register_ga_log_notifier(NULL); +} diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index a6443feab252..da6e80b3ac35 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -677,11 +677,10 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 */ svm_copy_lbrs(vmcb02, vmcb12); vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS; - svm_update_lbrv(&svm->vcpu); - - } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) { + } else { svm_copy_lbrs(vmcb02, vmcb01); } + svm_update_lbrv(&svm->vcpu); } static inline bool is_evtinj_soft(u32 evtinj) @@ -833,11 +832,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, svm->soft_int_next_rip = vmcb12_rip; } - vmcb02->control.virt_ext = vmcb01->control.virt_ext & - LBR_CTL_ENABLE_MASK; - if (guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV)) - vmcb02->control.virt_ext |= - (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK); + /* LBR_CTL_ENABLE_MASK is controlled by svm_update_lbrv() */ if (!nested_vmcb_needs_vls_intercept(svm)) vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; @@ -1189,13 +1184,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm) kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && - (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { + (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) svm_copy_lbrs(vmcb12, vmcb02); - svm_update_lbrv(vcpu); - } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) { + else svm_copy_lbrs(vmcb01, vmcb02); - svm_update_lbrv(vcpu); - } + + svm_update_lbrv(vcpu); if (vnmi) { if (vmcb02->control.int_ctl & V_NMI_BLOCKING_MASK) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 153c12dbf3eb..9d29b2e7e855 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -705,7 +705,11 @@ void 
*svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask) static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu) { - bool intercept = !(to_svm(vcpu)->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK); + struct vcpu_svm *svm = to_svm(vcpu); + bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK); + + if (intercept == svm->lbr_msrs_intercepted) + return; svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHFROMIP, MSR_TYPE_RW, intercept); svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHTOIP, MSR_TYPE_RW, intercept); @@ -714,6 +718,8 @@ static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu) if (sev_es_guest(vcpu->kvm)) svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept); + + svm->lbr_msrs_intercepted = intercept; } void svm_vcpu_free_msrpm(void *msrpm) @@ -806,60 +812,43 @@ void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb) vmcb_mark_dirty(to_vmcb, VMCB_LBR); } -void svm_enable_lbrv(struct kvm_vcpu *vcpu) +static void __svm_enable_lbrv(struct kvm_vcpu *vcpu) { - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; - svm_recalc_lbr_msr_intercepts(vcpu); - - /* Move the LBR msrs to the vmcb02 so that the guest can see them. */ - if (is_guest_mode(vcpu)) - svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr); + to_svm(vcpu)->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; } -static void svm_disable_lbrv(struct kvm_vcpu *vcpu) +void svm_enable_lbrv(struct kvm_vcpu *vcpu) { - struct vcpu_svm *svm = to_svm(vcpu); - - KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); - svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; + __svm_enable_lbrv(vcpu); svm_recalc_lbr_msr_intercepts(vcpu); - - /* - * Move the LBR msrs back to the vmcb01 to avoid copying them - * on nested guest entries. - */ - if (is_guest_mode(vcpu)) - svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb); } -static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm) +static void __svm_disable_lbrv(struct kvm_vcpu *vcpu) { - /* - * If LBR virtualization is disabled, the LBR MSRs are always kept in - * vmcb01. If LBR virtualization is enabled and L1 is running VMs of - * its own, the MSRs are moved between vmcb01 and vmcb02 as needed. - */ - return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? svm->vmcb : - svm->vmcb01.ptr; + KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); + to_svm(vcpu)->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; } void svm_update_lbrv(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK; - bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) || + bool enable_lbrv = (svm->vmcb->save.dbgctl & DEBUGCTLMSR_LBR) || (is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)); - if (enable_lbrv == current_enable_lbrv) - return; + if (enable_lbrv && !current_enable_lbrv) + __svm_enable_lbrv(vcpu); + else if (!enable_lbrv && current_enable_lbrv) + __svm_disable_lbrv(vcpu); - if (enable_lbrv) - svm_enable_lbrv(vcpu); - else - svm_disable_lbrv(vcpu); + /* + * During nested transitions, it is possible that the current VMCB has + * LBR_CTL set, but the previous LBR_CTL had it cleared (or vice versa). + * In this case, even though LBR_CTL does not need an update, intercepts + * do, so always recalculate the intercepts here. 
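Editor's sketch, not part of the patch: the new svm->lbr_msrs_intercepted flag turns svm_recalc_lbr_msr_intercepts() into a cheap no-op whenever the desired intercept state already matches what the MSR bitmap was last programmed with. The same caching pattern in miniature:

#include <stdbool.h>
#include <stdio.h>

struct vcpu_state {
	bool lbr_virt_enabled;		/* LBR_CTL_ENABLE_MASK set in the VMCB */
	bool lbr_msrs_intercepted;	/* what the MSR bitmap currently encodes */
};

static void program_msr_bitmap(bool intercept)
{
	printf("reprogramming LBR MSR intercepts: %s\n",
	       intercept ? "intercept" : "pass-through");
}

static void recalc_lbr_msr_intercepts(struct vcpu_state *v)
{
	bool intercept = !v->lbr_virt_enabled;

	if (intercept == v->lbr_msrs_intercepted)
		return;				/* nothing changed, skip the update */

	program_msr_bitmap(intercept);
	v->lbr_msrs_intercepted = intercept;
}

int main(void)
{
	struct vcpu_state v = { .lbr_msrs_intercepted = true };

	recalc_lbr_msr_intercepts(&v);		/* no-op: already intercepting */
	v.lbr_virt_enabled = true;
	recalc_lbr_msr_intercepts(&v);		/* flips to pass-through */
	recalc_lbr_msr_intercepts(&v);		/* no-op again */
	return 0;
}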
+ */ + svm_recalc_lbr_msr_intercepts(vcpu); } void disable_nmi_singlestep(struct vcpu_svm *svm) @@ -921,6 +910,8 @@ static void svm_hardware_unsetup(void) { int cpu; + avic_hardware_unsetup(); + sev_hardware_unsetup(); for_each_possible_cpu(cpu) @@ -1236,6 +1227,7 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) } svm->x2avic_msrs_intercepted = true; + svm->lbr_msrs_intercepted = true; svm->vmcb01.ptr = page_address(vmcb01_page); svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); @@ -2722,19 +2714,19 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = svm->tsc_aux; break; case MSR_IA32_DEBUGCTLMSR: - msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl; + msr_info->data = svm->vmcb->save.dbgctl; break; case MSR_IA32_LASTBRANCHFROMIP: - msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from; + msr_info->data = svm->vmcb->save.br_from; break; case MSR_IA32_LASTBRANCHTOIP: - msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to; + msr_info->data = svm->vmcb->save.br_to; break; case MSR_IA32_LASTINTFROMIP: - msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from; + msr_info->data = svm->vmcb->save.last_excp_from; break; case MSR_IA32_LASTINTTOIP: - msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to; + msr_info->data = svm->vmcb->save.last_excp_to; break; case MSR_VM_HSAVE_PA: msr_info->data = svm->nested.hsave_msr; @@ -3002,7 +2994,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) if (data & DEBUGCTL_RESERVED_BITS) return 1; - svm_get_lbr_vmcb(svm)->save.dbgctl = data; + if (svm->vmcb->save.dbgctl == data) + break; + + svm->vmcb->save.dbgctl = data; + vmcb_mark_dirty(svm->vmcb, VMCB_LBR); svm_update_lbrv(vcpu); break; case MSR_VM_HSAVE_PA: @@ -5386,12 +5382,6 @@ static __init int svm_hardware_setup(void) svm_hv_hardware_setup(); - for_each_possible_cpu(cpu) { - r = svm_cpu_init(cpu); - if (r) - goto err; - } - enable_apicv = avic_hardware_setup(); if (!enable_apicv) { enable_ipiv = false; @@ -5435,6 +5425,13 @@ static __init int svm_hardware_setup(void) svm_set_cpu_caps(); kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED; + + for_each_possible_cpu(cpu) { + r = svm_cpu_init(cpu); + if (r) + goto err; + } + return 0; err: diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index e4b04f435b3d..dd78e6402345 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -329,13 +329,14 @@ struct vcpu_svm { * back into remapped mode). */ struct list_head ir_list; - spinlock_t ir_list_lock; + raw_spinlock_t ir_list_lock; struct vcpu_sev_es_state sev_es; bool guest_state_loaded; bool x2avic_msrs_intercepted; + bool lbr_msrs_intercepted; /* Guest GIF value, used when vGIF is not enabled */ bool guest_gif; @@ -805,7 +806,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops; ) bool __init avic_hardware_setup(void); -int avic_ga_log_notifier(u32 ga_tag); +void avic_hardware_unsetup(void); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb); diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h index bc5ece76533a..412d0829d7a2 100644 --- a/arch/x86/kvm/vmx/common.h +++ b/arch/x86/kvm/vmx/common.h @@ -98,7 +98,7 @@ static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa, error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK) ? 
PFERR_PRESENT_MASK : 0; - if (error_code & EPT_VIOLATION_GVA_IS_VALID) + if (exit_qualification & EPT_VIOLATION_GVA_IS_VALID) error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 76271962cb70..bcea087b642f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6728,6 +6728,14 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, case EXIT_REASON_NOTIFY: /* Notify VM exit is not exposed to L1 */ return false; + case EXIT_REASON_SEAMCALL: + case EXIT_REASON_TDCALL: + /* + * SEAMCALL and TDCALL unconditionally VM-Exit, but aren't + * virtualized by KVM for L1 hypervisors, i.e. L1 should + * never want or expect such an exit. + */ + return false; default: return true; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f87c216d976d..91b6f2f3edc2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6032,6 +6032,12 @@ static int handle_vmx_instruction(struct kvm_vcpu *vcpu) return 1; } +static int handle_tdx_instruction(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + #ifndef CONFIG_X86_SGX_KVM static int handle_encls(struct kvm_vcpu *vcpu) { @@ -6157,6 +6163,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_ENCLS] = handle_encls, [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, [EXIT_REASON_NOTIFY] = handle_notify, + [EXIT_REASON_SEAMCALL] = handle_tdx_instruction, + [EXIT_REASON_TDCALL] = handle_tdx_instruction, [EXIT_REASON_MSR_READ_IMM] = handle_rdmsr_imm, [EXIT_REASON_MSR_WRITE_IMM] = handle_wrmsr_imm, }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 42ecd093bb4c..c9c2aa6f4705 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3874,15 +3874,9 @@ static void record_steal_time(struct kvm_vcpu *vcpu) /* * Returns true if the MSR in question is managed via XSTATE, i.e. is context - * switched with the rest of guest FPU state. Note! S_CET is _not_ context - * switched via XSTATE even though it _is_ saved/restored via XSAVES/XRSTORS. - * Because S_CET is loaded on VM-Enter and VM-Exit via dedicated VMCS fields, - * the value saved/restored via XSTATE is always the host's value. That detail - * is _extremely_ important, as the guest's S_CET must _never_ be resident in - * hardware while executing in the host. Loading guest values for U_CET and - * PL[0-3]_SSP while executing in the kernel is safe, as U_CET is specific to - * userspace, and PL[0-3]_SSP are only consumed when transitioning to lower - * privilege levels, i.e. are effectively only consumed by userspace as well. + * switched with the rest of guest FPU state. + * + * Note, S_CET is _not_ saved/restored via XSAVES/XRSTORS. */ static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr) { @@ -3905,6 +3899,11 @@ static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr) * MSR that is managed via XSTATE. Note, the caller is responsible for doing * the initial FPU load, this helper only ensures that guest state is resident * in hardware (the kernel can load its FPU state in IRQ context). + * + * Note, loading guest values for U_CET and PL[0-3]_SSP while executing in the + * kernel is safe, as U_CET is specific to userspace, and PL[0-3]_SSP are only + * consumed when transitioning to lower privilege levels, i.e. are effectively + * only consumed by userspace as well. 
*/ static __always_inline void kvm_access_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info, @@ -11807,6 +11806,9 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { + if (KVM_BUG_ON(vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm)) + return; + /* Exclude PKRU, it's restored separately immediately after VM-Exit. */ fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true); trace_kvm_fpu(1); @@ -11815,6 +11817,9 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) /* When vcpu_run ends, restore user space FPU context. */ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { + if (KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm)) + return; + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false); ++vcpu->stat.fpu_reload; trace_kvm_fpu(0); @@ -12137,9 +12142,6 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, int r; vcpu_load(vcpu); - if (kvm_mpx_supported()) - kvm_load_guest_fpu(vcpu); - kvm_vcpu_srcu_read_lock(vcpu); r = kvm_apic_accept_events(vcpu); @@ -12156,9 +12158,6 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, out: kvm_vcpu_srcu_read_unlock(vcpu); - - if (kvm_mpx_supported()) - kvm_put_guest_fpu(vcpu); vcpu_put(vcpu); return r; } @@ -12788,6 +12787,7 @@ static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event) { struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; u64 xfeatures_mask; + bool fpu_in_use; int i; /* @@ -12811,13 +12811,23 @@ static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event) BUILD_BUG_ON(sizeof(xfeatures_mask) * BITS_PER_BYTE <= XFEATURE_MAX); /* - * All paths that lead to INIT are required to load the guest's FPU - * state (because most paths are buried in KVM_RUN). - */ - kvm_put_guest_fpu(vcpu); + * Unload guest FPU state (if necessary) before zeroing XSTATE fields + * as the kernel can only modify the state when its resident in memory, + * i.e. when it's not loaded into hardware. + * + * WARN if the vCPU's desire to run, i.e. whether or not its in KVM_RUN, + * doesn't match the loaded/in-use state of the FPU, as KVM_RUN is the + * only path that can trigger INIT emulation _and_ loads FPU state, and + * KVM_RUN should _always_ load FPU state. + */ + WARN_ON_ONCE(vcpu->wants_to_run != fpstate->in_use); + fpu_in_use = fpstate->in_use; + if (fpu_in_use) + kvm_put_guest_fpu(vcpu); for_each_set_bit(i, (unsigned long *)&xfeatures_mask, XFEATURE_MAX) fpstate_clear_xstate_component(fpstate, i); - kvm_load_guest_fpu(vcpu); + if (fpu_in_use) + kvm_load_guest_fpu(vcpu); } void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -13941,10 +13951,11 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) #ifdef CONFIG_KVM_GUEST_MEMFD /* - * KVM doesn't yet support mmap() on guest_memfd for VMs with private memory - * (the private vs. shared tracking needs to be moved into guest_memfd). + * KVM doesn't yet support initializing guest_memfd memory as shared for VMs + * with private memory (the private vs. shared tracking needs to be moved into + * guest_memfd). 
*/ -bool kvm_arch_supports_gmem_mmap(struct kvm *kvm) +bool kvm_arch_supports_gmem_init_shared(struct kvm *kvm) { return !kvm_arch_has_private_mem(kvm); } diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c index c5c60d07308c..824664c0ecbd 100644 --- a/arch/x86/lib/cache-smp.c +++ b/arch/x86/lib/cache-smp.c @@ -2,6 +2,7 @@ #include <asm/paravirt.h> #include <linux/smp.h> #include <linux/export.h> +#include <linux/kvm_types.h> static void __wbinvd(void *dummy) { @@ -12,7 +13,7 @@ void wbinvd_on_cpu(int cpu) { smp_call_function_single(cpu, __wbinvd, NULL, 1); } -EXPORT_SYMBOL(wbinvd_on_cpu); +EXPORT_SYMBOL_FOR_KVM(wbinvd_on_cpu); void wbinvd_on_all_cpus(void) { @@ -24,7 +25,7 @@ void wbinvd_on_cpus_mask(struct cpumask *cpus) { on_each_cpu_mask(cpus, __wbinvd, NULL, 1); } -EXPORT_SYMBOL_GPL(wbinvd_on_cpus_mask); +EXPORT_SYMBOL_FOR_KVM(wbinvd_on_cpus_mask); static void __wbnoinvd(void *dummy) { @@ -35,10 +36,10 @@ void wbnoinvd_on_all_cpus(void) { on_each_cpu(__wbnoinvd, NULL, 1); } -EXPORT_SYMBOL_GPL(wbnoinvd_on_all_cpus); +EXPORT_SYMBOL_FOR_KVM(wbnoinvd_on_all_cpus); void wbnoinvd_on_cpus_mask(struct cpumask *cpus) { on_each_cpu_mask(cpus, __wbnoinvd, NULL, 1); } -EXPORT_SYMBOL_GPL(wbnoinvd_on_cpus_mask); +EXPORT_SYMBOL_FOR_KVM(wbnoinvd_on_cpus_mask); diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 4e385cbfd444..e03eeec55cfe 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -63,11 +63,10 @@ static bool is_string_insn(struct insn *insn) bool insn_has_rep_prefix(struct insn *insn) { insn_byte_t p; - int i; insn_get_prefixes(insn); - for_each_insn_prefix(insn, i, p) { + for_each_insn_prefix(insn, p) { if (p == 0xf2 || p == 0xf3) return true; } @@ -92,13 +91,13 @@ bool insn_has_rep_prefix(struct insn *insn) static int get_seg_reg_override_idx(struct insn *insn) { int idx = INAT_SEG_REG_DEFAULT; - int num_overrides = 0, i; + int num_overrides = 0; insn_byte_t p; insn_get_prefixes(insn); /* Look for any segment override prefixes. */ - for_each_insn_prefix(insn, i, p) { + for_each_insn_prefix(insn, p) { insn_attr_t attr; attr = inat_get_opcode_attribute(p); @@ -1676,3 +1675,147 @@ enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes) return type; } + +/* + * Recognise typical NOP patterns for both 32bit and 64bit. + * + * Notably: + * - NOP, but not: REP NOP aka PAUSE + * - NOPL + * - MOV %reg, %reg + * - LEA 0(%reg),%reg + * - JMP +0 + * + * Must not have false-positives; instructions identified as a NOP might be + * emulated as a NOP (uprobe) or Run Length Encoded in a larger NOP + * (alternatives). + * + * False-negatives are fine; need not be exhaustive. 
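Editor's sketch, not part of the patch: a deliberately simplified userspace mirror of a few of the insn_is_nop() rules that follow, limited to prefix-free forms (NOP, NOPL, JMP +0). The real routine below additionally handles REX/REX2 prefixes, MOV/LEA self-moves and operand-size checks.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static bool is_simple_nop(const unsigned char *b, size_t len)
{
	if (len >= 1 && b[0] == 0x90)			/* nop (but not f3 90 = pause) */
		return true;
	if (len >= 3 && b[0] == 0x0f && b[1] == 0x1f &&
	    ((b[2] >> 3) & 7) == 0)			/* 0f 1f /0: nopl */
		return true;
	if (len >= 2 && b[0] == 0xeb && b[1] == 0x00)	/* jmp.d8 +0 */
		return true;
	if (len >= 5 && b[0] == 0xe9 &&
	    !(b[1] | b[2] | b[3] | b[4]))		/* jmp.d32 +0 */
		return true;
	return false;
}

int main(void)
{
	const unsigned char nopl[]  = { 0x0f, 0x1f, 0x44, 0x00, 0x00 };	/* nopl 0x0(%rax,%rax,1) */
	const unsigned char pause[] = { 0xf3, 0x90 };				/* rep nop, i.e. PAUSE */

	printf("nopl: %d, pause: %d\n",
	       is_simple_nop(nopl, sizeof(nopl)),
	       is_simple_nop(pause, sizeof(pause)));
	return 0;
}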
+ */ +bool insn_is_nop(struct insn *insn) +{ + u8 b3 = 0, x3 = 0, r3 = 0; + u8 b4 = 0, x4 = 0, r4 = 0, m = 0; + u8 modrm, modrm_mod, modrm_reg, modrm_rm; + u8 sib = 0, sib_scale, sib_index, sib_base; + u8 nrex, rex; + u8 p, rep = 0; + + if ((nrex = insn->rex_prefix.nbytes)) { + rex = insn->rex_prefix.bytes[nrex-1]; + + r3 = !!X86_REX_R(rex); + x3 = !!X86_REX_X(rex); + b3 = !!X86_REX_B(rex); + if (nrex > 1) { + r4 = !!X86_REX2_R(rex); + x4 = !!X86_REX2_X(rex); + b4 = !!X86_REX2_B(rex); + m = !!X86_REX2_M(rex); + } + + } else if (insn->vex_prefix.nbytes) { + /* + * Ignore VEX encoded NOPs + */ + return false; + } + + if (insn->modrm.nbytes) { + modrm = insn->modrm.bytes[0]; + modrm_mod = X86_MODRM_MOD(modrm); + modrm_reg = X86_MODRM_REG(modrm) + 8*r3 + 16*r4; + modrm_rm = X86_MODRM_RM(modrm) + 8*b3 + 16*b4; + modrm = 1; + } + + if (insn->sib.nbytes) { + sib = insn->sib.bytes[0]; + sib_scale = X86_SIB_SCALE(sib); + sib_index = X86_SIB_INDEX(sib) + 8*x3 + 16*x4; + sib_base = X86_SIB_BASE(sib) + 8*b3 + 16*b4; + sib = 1; + + modrm_rm = sib_base; + } + + for_each_insn_prefix(insn, p) { + if (p == 0xf3) /* REPE */ + rep = 1; + } + + /* + * Opcode map munging: + * + * REX2: 0 - single byte opcode + * 1 - 0f second byte opcode + */ + switch (m) { + case 0: break; + case 1: insn->opcode.value <<= 8; + insn->opcode.value |= 0x0f; + break; + default: + return false; + } + + switch (insn->opcode.bytes[0]) { + case 0x0f: /* 2nd byte */ + break; + + case 0x89: /* MOV */ + if (modrm_mod != 3) /* register-direct */ + return false; + + /* native size */ + if (insn->opnd_bytes != 4 * (1 + insn->x86_64)) + return false; + + return modrm_reg == modrm_rm; /* MOV %reg, %reg */ + + case 0x8d: /* LEA */ + if (modrm_mod == 0 || modrm_mod == 3) /* register-indirect with disp */ + return false; + + /* native size */ + if (insn->opnd_bytes != 4 * (1 + insn->x86_64)) + return false; + + if (insn->displacement.value != 0) + return false; + + if (sib && (sib_scale != 0 || sib_index != 4)) /* (%reg, %eiz, 1) */ + return false; + + for_each_insn_prefix(insn, p) { + if (p != 0x3e) /* DS */ + return false; + } + + return modrm_reg == modrm_rm; /* LEA 0(%reg), %reg */ + + case 0x90: /* NOP */ + if (b3 || b4) /* XCHG %r{8,16,24},%rax */ + return false; + + if (rep) /* REP NOP := PAUSE */ + return false; + + return true; + + case 0xe9: /* JMP.d32 */ + case 0xeb: /* JMP.d8 */ + return insn->immediate.value == 0; /* JMP +0 */ + + default: + return false; + } + + switch (insn->opcode.bytes[1]) { + case 0x1f: + return modrm_reg == 0; /* 0f 1f /0 -- NOPL */ + + default: + return false; + } +} diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c index b5893928d55c..8c7cd115b484 100644 --- a/arch/x86/lib/kaslr.c +++ b/arch/x86/lib/kaslr.c @@ -22,7 +22,7 @@ #include <asm/setup.h> #define debug_putstr(v) early_printk("%s", v) -#define has_cpuflag(f) boot_cpu_has(f) +#define has_cpuflag(f) cpu_feature_enabled(f) #define get_boot_seed() kaslr_offset() #endif diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index 4ef7c6dcbea6..dfdd1da89f36 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> +#include <linux/kvm_types.h> #include <linux/percpu.h> #include <linux/preempt.h> #include <asm/msr.h> @@ -103,7 +104,7 @@ int msr_set_bit(u32 msr, u8 bit) { return __flip_bit(msr, bit, true); } -EXPORT_SYMBOL_GPL(msr_set_bit); +EXPORT_SYMBOL_FOR_KVM(msr_set_bit); /** * msr_clear_bit - Clear @bit in a MSR @msr. 
@@ -119,7 +120,7 @@ int msr_clear_bit(u32 msr, u8 bit) { return __flip_bit(msr, bit, false); } -EXPORT_SYMBOL_GPL(msr_clear_bit); +EXPORT_SYMBOL_FOR_KVM(msr_clear_bit); #ifdef CONFIG_TRACEPOINTS void do_trace_write_msr(u32 msr, u64 val, int failed) diff --git a/arch/x86/math-emu/poly.h b/arch/x86/math-emu/poly.h index fc1c887ca073..654bfe4e29a0 100644 --- a/arch/x86/math-emu/poly.h +++ b/arch/x86/math-emu/poly.h @@ -39,7 +39,7 @@ asmlinkage void mul_Xsig_Xsig(Xsig *dest, const Xsig *mult); asmlinkage void shr_Xsig(Xsig *, const int n); asmlinkage int round_Xsig(Xsig *); asmlinkage int norm_Xsig(Xsig *); -asmlinkage void div_Xsig(Xsig *x1, const Xsig *x2, const Xsig *dest); +asmlinkage void div_Xsig(Xsig *x1, const Xsig *x2, Xsig *dest); /* Macro to extract the most significant 32 bits from a long long */ #define LL_MSW(x) (((unsigned long *)&x)[1]) diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index a4700ef6eb64..2afa7a23340e 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -486,7 +486,6 @@ void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, #endif ptdump_walk_pgd_level_core(m, mm, pgd, false, false); } -EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); void ptdump_walk_user_pgd_level_checkwx(void) { diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index b68200a0e0c6..8a3d9722f602 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -42,6 +42,7 @@ #include <linux/highmem.h> #include <linux/fs.h> #include <linux/rbtree.h> +#include <linux/kvm_types.h> #include <asm/cpu_device_id.h> #include <asm/cacheflush.h> @@ -697,7 +698,7 @@ bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn) cm == _PAGE_CACHE_MODE_UC_MINUS || cm == _PAGE_CACHE_MODE_WC; } -EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr); +EXPORT_SYMBOL_FOR_KVM(pat_pfn_immune_to_uc_mtrr); /** * memtype_reserve_io - Request a memory type mapping for a region of memory diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index d2d54b8c4dbb..970981893c9b 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -446,7 +446,7 @@ static void cpa_flush(struct cpa_data *cpa, int cache) } start = fix_addr(__cpa_addr(cpa, 0)); - end = fix_addr(__cpa_addr(cpa, cpa->numpages)); + end = start + cpa->numpages * PAGE_SIZE; if (cpa->force_flush_all) end = TLB_FLUSH_ALL; diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c index fc3f3d3e2ef2..8d31c6b9e184 100644 --- a/arch/x86/mm/physaddr.c +++ b/arch/x86/mm/physaddr.c @@ -31,17 +31,6 @@ unsigned long __phys_addr(unsigned long x) return x; } EXPORT_SYMBOL(__phys_addr); - -unsigned long __phys_addr_symbol(unsigned long x) -{ - unsigned long y = x - __START_KERNEL_map; - - /* only check upper bounds since lower bounds will trigger carry */ - VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE); - - return y + phys_base; -} -EXPORT_SYMBOL(__phys_addr_symbol); #endif bool __virt_addr_valid(unsigned long x) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 39f80111e6f1..f5b93e01e347 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -12,6 +12,7 @@ #include <linux/task_work.h> #include <linux/mmu_notifier.h> #include <linux/mmu_context.h> +#include <linux/kvm_types.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> @@ -911,11 +912,31 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, * CR3 and cpu_tlbstate.loaded_mm are not all in sync. 
*/ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); - barrier(); - /* Start receiving IPIs and then read tlb_gen (and LAM below) */ + /* + * Make sure this CPU is set in mm_cpumask() such that we'll + * receive invalidation IPIs. + * + * Rely on the smp_mb() implied by cpumask_set_cpu()'s atomic + * operation, or explicitly provide one. Such that: + * + * switch_mm_irqs_off() flush_tlb_mm_range() + * smp_store_release(loaded_mm, SWITCHING); atomic64_inc_return(tlb_gen) + * smp_mb(); // here // smp_mb() implied + * atomic64_read(tlb_gen); this_cpu_read(loaded_mm); + * + * we properly order against flush_tlb_mm_range(), where the + * loaded_mm load can happen in mative_flush_tlb_multi() -> + * should_flush_tlb(). + * + * This way switch_mm() must see the new tlb_gen or + * flush_tlb_mm_range() must see the new loaded_mm, or both. + */ if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next))) cpumask_set_cpu(cpu, mm_cpumask(next)); + else + smp_mb(); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); ns = choose_new_asid(next, next_tlb_gen); @@ -1562,7 +1583,7 @@ unsigned long __get_current_cr3_fast(void) VM_BUG_ON(cr3 != __read_cr3()); return cr3; } -EXPORT_SYMBOL_GPL(__get_current_cr3_fast); +EXPORT_SYMBOL_FOR_KVM(__get_current_cr3_fast); /* * Flush one page in the kernel mapping @@ -1703,7 +1724,7 @@ void __flush_tlb_all(void) flush_tlb_local(); } } -EXPORT_SYMBOL_GPL(__flush_tlb_all); +EXPORT_SYMBOL_FOR_KVM(__flush_tlb_all); void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) { diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index d4c93d9e73e4..de5083cb1d37 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2701,7 +2701,7 @@ emit_jmp: /* Update cleanup_addr */ ctx->cleanup_addr = proglen; if (bpf_prog_was_classic(bpf_prog) && - !capable(CAP_SYS_ADMIN)) { + !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) { u8 *ip = image + addrs[i - 1]; if (emit_spectre_bhb_barrier(&prog, ip, bpf_prog)) diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index eac403248462..5ce4ebe99774 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -29,6 +29,7 @@ #include <linux/acpi.h> #include <linux/suspend.h> #include <linux/idr.h> +#include <linux/kvm_types.h> #include <asm/page.h> #include <asm/special_insns.h> #include <asm/msr-index.h> @@ -181,7 +182,7 @@ int tdx_cpu_enable(void) return 0; } -EXPORT_SYMBOL_GPL(tdx_cpu_enable); +EXPORT_SYMBOL_FOR_KVM(tdx_cpu_enable); /* * Add a memory region as a TDX memory block. 
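Editor's sketch, not part of the patch: the store/barrier/load pairing described in the switch_mm_irqs_off() comment of the tlb.c hunk above, modelled with C11 atomics. Each side publishes its own store before reading the other side's variable; the full barrier (here an explicit fence, in the kernel either smp_mb() or the one implied by cpumask_set_cpu()) guarantees that at least one side observes the other's update, so a concurrent flush cannot be lost. The function and variable names are placeholders.

#include <stdatomic.h>
#include <stdio.h>

static atomic_long loaded_mm;
static atomic_long tlb_gen;

static long cpu_switch_mm(long new_mm)
{
	atomic_store_explicit(&loaded_mm, new_mm, memory_order_release);
	atomic_thread_fence(memory_order_seq_cst);	/* the smp_mb() side */
	return atomic_load_explicit(&tlb_gen, memory_order_relaxed);
}

static long cpu_flush_range(void)
{
	atomic_fetch_add(&tlb_gen, 1);			/* seq_cst RMW, barrier implied */
	return atomic_load_explicit(&loaded_mm, memory_order_relaxed);
}

int main(void)
{
	/* single-threaded demo of the shape; the interesting case is the
	 * two functions racing on different CPUs */
	printf("switch_mm saw tlb_gen %ld\n", cpu_switch_mm(1));
	printf("flush saw loaded_mm %ld\n", cpu_flush_range());
	return 0;
}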
The caller must make sure @@ -662,7 +663,7 @@ void tdx_quirk_reset_page(struct page *page) { tdx_quirk_reset_paddr(page_to_phys(page), PAGE_SIZE); } -EXPORT_SYMBOL_GPL(tdx_quirk_reset_page); +EXPORT_SYMBOL_FOR_KVM(tdx_quirk_reset_page); static void tdmr_quirk_reset_pamt(struct tdmr_info *tdmr) { @@ -1216,7 +1217,7 @@ int tdx_enable(void) return ret; } -EXPORT_SYMBOL_GPL(tdx_enable); +EXPORT_SYMBOL_FOR_KVM(tdx_enable); static bool is_pamt_page(unsigned long phys) { @@ -1477,13 +1478,13 @@ const struct tdx_sys_info *tdx_get_sysinfo(void) return p; } -EXPORT_SYMBOL_GPL(tdx_get_sysinfo); +EXPORT_SYMBOL_FOR_KVM(tdx_get_sysinfo); u32 tdx_get_nr_guest_keyids(void) { return tdx_nr_guest_keyids; } -EXPORT_SYMBOL_GPL(tdx_get_nr_guest_keyids); +EXPORT_SYMBOL_FOR_KVM(tdx_get_nr_guest_keyids); int tdx_guest_keyid_alloc(void) { @@ -1491,13 +1492,13 @@ int tdx_guest_keyid_alloc(void) tdx_guest_keyid_start + tdx_nr_guest_keyids - 1, GFP_KERNEL); } -EXPORT_SYMBOL_GPL(tdx_guest_keyid_alloc); +EXPORT_SYMBOL_FOR_KVM(tdx_guest_keyid_alloc); void tdx_guest_keyid_free(unsigned int keyid) { ida_free(&tdx_guest_keyid_pool, keyid); } -EXPORT_SYMBOL_GPL(tdx_guest_keyid_free); +EXPORT_SYMBOL_FOR_KVM(tdx_guest_keyid_free); static inline u64 tdx_tdr_pa(struct tdx_td *td) { @@ -1521,7 +1522,7 @@ noinstr u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args) return __seamcall_dirty_cache(__seamcall_saved_ret, TDH_VP_ENTER, args); } -EXPORT_SYMBOL_GPL(tdh_vp_enter); +EXPORT_SYMBOL_FOR_KVM(tdh_vp_enter); u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page) { @@ -1533,7 +1534,7 @@ u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page) tdx_clflush_page(tdcs_page); return seamcall(TDH_MNG_ADDCX, &args); } -EXPORT_SYMBOL_GPL(tdh_mng_addcx); +EXPORT_SYMBOL_FOR_KVM(tdh_mng_addcx); u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2) { @@ -1553,7 +1554,7 @@ u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page return ret; } -EXPORT_SYMBOL_GPL(tdh_mem_page_add); +EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_add); u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2) { @@ -1572,7 +1573,7 @@ u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u return ret; } -EXPORT_SYMBOL_GPL(tdh_mem_sept_add); +EXPORT_SYMBOL_FOR_KVM(tdh_mem_sept_add); u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page) { @@ -1584,7 +1585,7 @@ u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page) tdx_clflush_page(tdcx_page); return seamcall(TDH_VP_ADDCX, &args); } -EXPORT_SYMBOL_GPL(tdh_vp_addcx); +EXPORT_SYMBOL_FOR_KVM(tdh_vp_addcx); u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2) { @@ -1603,7 +1604,7 @@ u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u return ret; } -EXPORT_SYMBOL_GPL(tdh_mem_page_aug); +EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_aug); u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2) { @@ -1620,7 +1621,7 @@ u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u6 return ret; } -EXPORT_SYMBOL_GPL(tdh_mem_range_block); +EXPORT_SYMBOL_FOR_KVM(tdh_mem_range_block); u64 tdh_mng_key_config(struct tdx_td *td) { @@ -1630,7 +1631,7 @@ u64 tdh_mng_key_config(struct tdx_td *td) return seamcall(TDH_MNG_KEY_CONFIG, &args); } -EXPORT_SYMBOL_GPL(tdh_mng_key_config); +EXPORT_SYMBOL_FOR_KVM(tdh_mng_key_config); u64 
tdh_mng_create(struct tdx_td *td, u16 hkid) { @@ -1642,7 +1643,7 @@ u64 tdh_mng_create(struct tdx_td *td, u16 hkid) tdx_clflush_page(td->tdr_page); return seamcall(TDH_MNG_CREATE, &args); } -EXPORT_SYMBOL_GPL(tdh_mng_create); +EXPORT_SYMBOL_FOR_KVM(tdh_mng_create); u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp) { @@ -1654,7 +1655,7 @@ u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp) tdx_clflush_page(vp->tdvpr_page); return seamcall(TDH_VP_CREATE, &args); } -EXPORT_SYMBOL_GPL(tdh_vp_create); +EXPORT_SYMBOL_FOR_KVM(tdh_vp_create); u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data) { @@ -1671,7 +1672,7 @@ u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data) return ret; } -EXPORT_SYMBOL_GPL(tdh_mng_rd); +EXPORT_SYMBOL_FOR_KVM(tdh_mng_rd); u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2) { @@ -1688,7 +1689,7 @@ u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2) return ret; } -EXPORT_SYMBOL_GPL(tdh_mr_extend); +EXPORT_SYMBOL_FOR_KVM(tdh_mr_extend); u64 tdh_mr_finalize(struct tdx_td *td) { @@ -1698,7 +1699,7 @@ u64 tdh_mr_finalize(struct tdx_td *td) return seamcall(TDH_MR_FINALIZE, &args); } -EXPORT_SYMBOL_GPL(tdh_mr_finalize); +EXPORT_SYMBOL_FOR_KVM(tdh_mr_finalize); u64 tdh_vp_flush(struct tdx_vp *vp) { @@ -1708,7 +1709,7 @@ u64 tdh_vp_flush(struct tdx_vp *vp) return seamcall(TDH_VP_FLUSH, &args); } -EXPORT_SYMBOL_GPL(tdh_vp_flush); +EXPORT_SYMBOL_FOR_KVM(tdh_vp_flush); u64 tdh_mng_vpflushdone(struct tdx_td *td) { @@ -1718,7 +1719,7 @@ u64 tdh_mng_vpflushdone(struct tdx_td *td) return seamcall(TDH_MNG_VPFLUSHDONE, &args); } -EXPORT_SYMBOL_GPL(tdh_mng_vpflushdone); +EXPORT_SYMBOL_FOR_KVM(tdh_mng_vpflushdone); u64 tdh_mng_key_freeid(struct tdx_td *td) { @@ -1728,7 +1729,7 @@ u64 tdh_mng_key_freeid(struct tdx_td *td) return seamcall(TDH_MNG_KEY_FREEID, &args); } -EXPORT_SYMBOL_GPL(tdh_mng_key_freeid); +EXPORT_SYMBOL_FOR_KVM(tdh_mng_key_freeid); u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err) { @@ -1744,7 +1745,7 @@ u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err) return ret; } -EXPORT_SYMBOL_GPL(tdh_mng_init); +EXPORT_SYMBOL_FOR_KVM(tdh_mng_init); u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data) { @@ -1761,7 +1762,7 @@ u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data) return ret; } -EXPORT_SYMBOL_GPL(tdh_vp_rd); +EXPORT_SYMBOL_FOR_KVM(tdh_vp_rd); u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask) { @@ -1774,7 +1775,7 @@ u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask) return seamcall(TDH_VP_WR, &args); } -EXPORT_SYMBOL_GPL(tdh_vp_wr); +EXPORT_SYMBOL_FOR_KVM(tdh_vp_wr); u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid) { @@ -1787,7 +1788,7 @@ u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid) /* apicid requires version == 1. */ return seamcall(TDH_VP_INIT | (1ULL << TDX_VERSION_SHIFT), &args); } -EXPORT_SYMBOL_GPL(tdh_vp_init); +EXPORT_SYMBOL_FOR_KVM(tdh_vp_init); /* * TDX ABI defines output operands as PT, OWNER and SIZE. These are TDX defined fomats. 
@@ -1809,7 +1810,7 @@ u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 return ret; } -EXPORT_SYMBOL_GPL(tdh_phymem_page_reclaim); +EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_reclaim); u64 tdh_mem_track(struct tdx_td *td) { @@ -1819,7 +1820,7 @@ u64 tdh_mem_track(struct tdx_td *td) return seamcall(TDH_MEM_TRACK, &args); } -EXPORT_SYMBOL_GPL(tdh_mem_track); +EXPORT_SYMBOL_FOR_KVM(tdh_mem_track); u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2) { @@ -1836,7 +1837,7 @@ u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u6 return ret; } -EXPORT_SYMBOL_GPL(tdh_mem_page_remove); +EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_remove); u64 tdh_phymem_cache_wb(bool resume) { @@ -1846,7 +1847,7 @@ u64 tdh_phymem_cache_wb(bool resume) return seamcall(TDH_PHYMEM_CACHE_WB, &args); } -EXPORT_SYMBOL_GPL(tdh_phymem_cache_wb); +EXPORT_SYMBOL_FOR_KVM(tdh_phymem_cache_wb); u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td) { @@ -1856,7 +1857,7 @@ u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td) return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); } -EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_tdr); +EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_tdr); u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page) { @@ -1866,7 +1867,7 @@ u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page) return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); } -EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid); +EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_hkid); #ifdef CONFIG_KEXEC_CORE void tdx_cpu_flush_cache_for_kexec(void) @@ -1884,5 +1885,5 @@ void tdx_cpu_flush_cache_for_kexec(void) wbinvd(); this_cpu_write(cache_state_incoherent, false); } -EXPORT_SYMBOL_GPL(tdx_cpu_flush_cache_for_kexec); +EXPORT_SYMBOL_FOR_KVM(tdx_cpu_flush_cache_for_kexec); #endif |
