summary | refs | log | tree | commit | diff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r-- arch/x86/Kconfig 4
-rw-r--r-- arch/x86/Makefile 8
-rw-r--r-- arch/x86/boot/a20.c 10
-rw-r--r-- arch/x86/boot/boot.h 2
-rw-r--r-- arch/x86/boot/compressed/Makefile 7
-rw-r--r-- arch/x86/boot/compressed/misc.h 11
-rw-r--r-- arch/x86/boot/compressed/pgtable_64.c 11
-rw-r--r-- arch/x86/boot/compressed/sev-handle-vc.c 3
-rw-r--r-- arch/x86/boot/compressed/sev.c 7
-rw-r--r-- arch/x86/boot/compressed/sev.h 6
-rw-r--r-- arch/x86/boot/cpucheck.c 16
-rw-r--r-- arch/x86/boot/msr.h 26
-rw-r--r-- arch/x86/boot/startup/Makefile 2
-rw-r--r-- arch/x86/boot/startup/sev-shared.c 2
-rw-r--r-- arch/x86/coco/sev/vc-handle.c 1
-rw-r--r-- arch/x86/coco/sev/vc-shared.c 11
-rw-r--r-- arch/x86/crypto/Kconfig 10
-rw-r--r-- arch/x86/crypto/Makefile 8
-rw-r--r-- arch/x86/crypto/aes-gcm-aesni-x86_64.S 12
-rw-r--r-- arch/x86/crypto/aes-gcm-vaes-avx2.S 1146
-rw-r--r-- arch/x86/crypto/aes-gcm-vaes-avx512.S (renamed from arch/x86/crypto/aes-gcm-avx10-x86_64.S) 722
-rw-r--r-- arch/x86/crypto/aesni-intel_glue.c 264
-rw-r--r-- arch/x86/crypto/polyval-clmulni_asm.S 321
-rw-r--r-- arch/x86/crypto/polyval-clmulni_glue.c 180
-rw-r--r-- arch/x86/entry/entry.S 15
-rw-r--r-- arch/x86/entry/entry_64.S 3
-rw-r--r-- arch/x86/entry/entry_64_fred.S 3
-rw-r--r-- arch/x86/entry/entry_fred.c 4
-rw-r--r-- arch/x86/entry/syscall_32.c 3
-rw-r--r-- arch/x86/entry/syscalls/syscall_32.tbl 1
-rw-r--r-- arch/x86/entry/syscalls/syscall_64.tbl 1
-rw-r--r-- arch/x86/events/amd/core.c 12
-rw-r--r-- arch/x86/events/core.c 83
-rw-r--r-- arch/x86/events/intel/core.c 445
-rw-r--r-- arch/x86/events/intel/cstate.c 18
-rw-r--r-- arch/x86/events/intel/ds.c 604
-rw-r--r-- arch/x86/events/intel/lbr.c 3
-rw-r--r-- arch/x86/events/intel/pt.c 7
-rw-r--r-- arch/x86/events/intel/uncore.c 3
-rw-r--r-- arch/x86/events/perf_event.h 41
-rw-r--r-- arch/x86/include/asm/alternative.h 4
-rw-r--r-- arch/x86/include/asm/amd/node.h 1
-rw-r--r-- arch/x86/include/asm/asm.h 5
-rw-r--r-- arch/x86/include/asm/bug.h 147
-rw-r--r-- arch/x86/include/asm/cpufeature.h 1
-rw-r--r-- arch/x86/include/asm/cpufeatures.h 5
-rw-r--r-- arch/x86/include/asm/fred.h 2
-rw-r--r-- arch/x86/include/asm/ftrace.h 5
-rw-r--r-- arch/x86/include/asm/futex.h 75
-rw-r--r-- arch/x86/include/asm/idtentry.h 4
-rw-r--r-- arch/x86/include/asm/insn-eval.h 2
-rw-r--r-- arch/x86/include/asm/insn.h 5
-rw-r--r-- arch/x86/include/asm/intel-family.h 6
-rw-r--r-- arch/x86/include/asm/intel_ds.h 10
-rw-r--r-- arch/x86/include/asm/jump_label.h 1
-rw-r--r-- arch/x86/include/asm/kvm_types.h 5
-rw-r--r-- arch/x86/include/asm/mce.h 14
-rw-r--r-- arch/x86/include/asm/msr-index.h 30
-rw-r--r-- arch/x86/include/asm/page_64.h 17
-rw-r--r-- arch/x86/include/asm/percpu.h 5
-rw-r--r-- arch/x86/include/asm/perf_event.h 116
-rw-r--r-- arch/x86/include/asm/ptrace.h 24
-rw-r--r-- arch/x86/include/asm/runtime-const.h 4
-rw-r--r-- arch/x86/include/asm/sgx.h 97
-rw-r--r-- arch/x86/include/asm/shared/msr.h 15
-rw-r--r-- arch/x86/include/asm/smp.h 2
-rw-r--r-- arch/x86/include/asm/svm.h 1
-rw-r--r-- arch/x86/include/asm/topology.h 14
-rw-r--r-- arch/x86/include/asm/uaccess.h 12
-rw-r--r-- arch/x86/include/asm/uaccess_64.h 10
-rw-r--r-- arch/x86/include/asm/unwind_user.h 41
-rw-r--r-- arch/x86/include/asm/uprobes.h 9
-rw-r--r-- arch/x86/include/uapi/asm/sgx.h 10
-rw-r--r-- arch/x86/include/uapi/asm/vmx.h 1
-rw-r--r-- arch/x86/kernel/acpi/apei.c 2
-rw-r--r-- arch/x86/kernel/acpi/cppc.c 2
-rw-r--r-- arch/x86/kernel/alternative.c 80
-rw-r--r-- arch/x86/kernel/amd_node.c 150
-rw-r--r-- arch/x86/kernel/apic/apic.c 18
-rw-r--r-- arch/x86/kernel/apic/apic_common.c 3
-rw-r--r-- arch/x86/kernel/apic/io_apic.c 2
-rw-r--r-- arch/x86/kernel/cpu/amd.c 40
-rw-r--r-- arch/x86/kernel/cpu/bugs.c 262
-rw-r--r-- arch/x86/kernel/cpu/bus_lock.c 3
-rw-r--r-- arch/x86/kernel/cpu/common.c 13
-rw-r--r-- arch/x86/kernel/cpu/cpu.h 9
-rw-r--r-- arch/x86/kernel/cpu/cpuid-deps.c 2
-rw-r--r-- arch/x86/kernel/cpu/mce/amd.c 356
-rw-r--r-- arch/x86/kernel/cpu/mce/core.c 31
-rw-r--r-- arch/x86/kernel/cpu/mce/internal.h 4
-rw-r--r-- arch/x86/kernel/cpu/mce/threshold.c 19
-rw-r--r-- arch/x86/kernel/cpu/microcode/amd.c 130
-rw-r--r-- arch/x86/kernel/cpu/microcode/core.c 13
-rw-r--r-- arch/x86/kernel/cpu/microcode/intel.c 362
-rw-r--r-- arch/x86/kernel/cpu/microcode/internal.h 4
-rw-r--r-- arch/x86/kernel/cpu/mtrr/generic.c 1
-rw-r--r-- arch/x86/kernel/cpu/mtrr/mtrr.h 4
-rw-r--r-- arch/x86/kernel/cpu/resctrl/core.c 9
-rw-r--r-- arch/x86/kernel/cpu/resctrl/ctrlmondata.c 40
-rw-r--r-- arch/x86/kernel/cpu/resctrl/internal.h 5
-rw-r--r-- arch/x86/kernel/cpu/resctrl/monitor.c 26
-rw-r--r-- arch/x86/kernel/cpu/scattered.c 2
-rw-r--r-- arch/x86/kernel/cpu/sgx/driver.c 19
-rw-r--r-- arch/x86/kernel/cpu/sgx/encl.c 1
-rw-r--r-- arch/x86/kernel/cpu/sgx/encls.h 5
-rw-r--r-- arch/x86/kernel/cpu/sgx/main.c 104
-rw-r--r-- arch/x86/kernel/cpu/sgx/sgx.h 3
-rw-r--r-- arch/x86/kernel/cpu/sgx/virt.c 25
-rw-r--r-- arch/x86/kernel/cpu/topology.c 4
-rw-r--r-- arch/x86/kernel/cpu/topology_common.c 3
-rw-r--r-- arch/x86/kernel/cpu/tsx.c 58
-rw-r--r-- arch/x86/kernel/dumpstack.c 23
-rw-r--r-- arch/x86/kernel/e820.c 3
-rw-r--r-- arch/x86/kernel/fpu/core.c 24
-rw-r--r-- arch/x86/kernel/fpu/xstate.c 7
-rw-r--r-- arch/x86/kernel/ftrace_64.S 8
-rw-r--r-- arch/x86/kernel/hw_breakpoint.c 3
-rw-r--r-- arch/x86/kernel/irq.c 3
-rw-r--r-- arch/x86/kernel/kprobes/core.c 3
-rw-r--r-- arch/x86/kernel/kprobes/opt.c 4
-rw-r--r-- arch/x86/kernel/kvm.c 5
-rw-r--r-- arch/x86/kernel/module.c 15
-rw-r--r-- arch/x86/kernel/msr.c 2
-rw-r--r-- arch/x86/kernel/nmi.c 5
-rw-r--r-- arch/x86/kernel/process_64.c 5
-rw-r--r-- arch/x86/kernel/reboot.c 5
-rw-r--r-- arch/x86/kernel/smpboot.c 81
-rw-r--r-- arch/x86/kernel/static_call.c 13
-rw-r--r-- arch/x86/kernel/traps.c 119
-rw-r--r-- arch/x86/kernel/tsc.c 1
-rw-r--r-- arch/x86/kernel/uprobes.c 70
-rw-r--r-- arch/x86/kvm/pmu.c 8
-rw-r--r-- arch/x86/kvm/svm/avic.c 24
-rw-r--r-- arch/x86/kvm/svm/nested.c 20
-rw-r--r-- arch/x86/kvm/svm/svm.c 95
-rw-r--r-- arch/x86/kvm/svm/svm.h 5
-rw-r--r-- arch/x86/kvm/vmx/common.h 2
-rw-r--r-- arch/x86/kvm/vmx/nested.c 8
-rw-r--r-- arch/x86/kvm/vmx/vmx.c 8
-rw-r--r-- arch/x86/kvm/x86.c 57
-rw-r--r-- arch/x86/lib/cache-smp.c 9
-rw-r--r-- arch/x86/lib/insn-eval.c 151
-rw-r--r-- arch/x86/lib/kaslr.c 2
-rw-r--r-- arch/x86/lib/msr.c 5
-rw-r--r-- arch/x86/math-emu/poly.h 2
-rw-r--r-- arch/x86/mm/dump_pagetables.c 1
-rw-r--r-- arch/x86/mm/init_64.c 3
-rw-r--r-- arch/x86/mm/pat/memtype.c 3
-rw-r--r-- arch/x86/mm/pat/set_memory.c 2
-rw-r--r-- arch/x86/mm/physaddr.c 11
-rw-r--r-- arch/x86/mm/tlb.c 29
-rw-r--r-- arch/x86/net/bpf_jit_comp.c 2
-rw-r--r-- arch/x86/virt/vmx/tdx/tdx.c 69
153 files changed, 5098 insertions, 2367 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fa3b616af03a..34fb46d5341b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -261,6 +261,7 @@ config X86
select HAVE_FUNCTION_ERROR_INJECTION
select HAVE_KRETPROBES
select HAVE_RETHOOK
+ select HAVE_KLP_BUILD if X86_64
select HAVE_LIVEPATCH if X86_64
select HAVE_MIXED_BREAKPOINTS_REGS
select HAVE_MOD_ARCH_SPECIFIC
@@ -297,6 +298,7 @@ config X86
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UACCESS_VALIDATION if HAVE_OBJTOOL
select HAVE_UNSTABLE_SCHED_CLOCK
+ select HAVE_UNWIND_USER_FP if X86_64
select HAVE_USER_RETURN_NOTIFIER
select HAVE_GENERIC_VDSO
select VDSO_GETRANDOM if X86_64
@@ -379,7 +381,7 @@ config GENERIC_CSUM
config GENERIC_BUG
def_bool y
depends on BUG
- select GENERIC_BUG_RELATIVE_POINTERS if X86_64
+ select GENERIC_BUG_RELATIVE_POINTERS
config GENERIC_BUG_RELATIVE_POINTERS
bool
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 4db7e4bf69f5..1d403a3612ea 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -48,7 +48,8 @@ endif
# How to compile the 16-bit code. Note we always compile for -march=i386;
# that way we can complain to the user if the CPU is insufficient.
-REALMODE_CFLAGS := -std=gnu11 -m16 -g -Os -DDISABLE_BRANCH_PROFILING -D__DISABLE_EXPORTS \
+REALMODE_CFLAGS := -std=gnu11 -fms-extensions -m16 -g -Os \
+ -DDISABLE_BRANCH_PROFILING -D__DISABLE_EXPORTS \
-Wall -Wstrict-prototypes -march=i386 -mregparm=3 \
-fno-strict-aliasing -fomit-frame-pointer -fno-pic \
-mno-mmx -mno-sse $(call cc-option,-fcf-protection=none)
@@ -60,6 +61,7 @@ REALMODE_CFLAGS += $(cc_stack_align4)
REALMODE_CFLAGS += $(CLANG_FLAGS)
ifdef CONFIG_CC_IS_CLANG
REALMODE_CFLAGS += -Wno-gnu
+REALMODE_CFLAGS += -Wno-microsoft-anon-tag
endif
export REALMODE_CFLAGS
@@ -75,7 +77,7 @@ export BITS
#
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
#
-KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx
+KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-sse4a
KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json
KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2
@@ -98,7 +100,7 @@ ifeq ($(CONFIG_X86_KERNEL_IBT),y)
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104816
#
KBUILD_CFLAGS += $(call cc-option,-fcf-protection=branch -fno-jump-tables)
-KBUILD_RUSTFLAGS += -Zcf-protection=branch -Zno-jump-tables
+KBUILD_RUSTFLAGS += -Zcf-protection=branch $(if $(call rustc-min-version,109300),-Cjump-tables=n,-Zno-jump-tables)
else
KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
endif
diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
index a2b6b428922a..bda042933a05 100644
--- a/arch/x86/boot/a20.c
+++ b/arch/x86/boot/a20.c
@@ -135,29 +135,29 @@ int enable_a20(void)
(legacy free, etc.) */
if (a20_test_short())
return 0;
-
+
/* Next, try the BIOS (INT 0x15, AX=0x2401) */
enable_a20_bios();
if (a20_test_short())
return 0;
-
+
/* Try enabling A20 through the keyboard controller */
kbc_err = empty_8042();
if (a20_test_short())
return 0; /* BIOS worked, but with delayed reaction */
-
+
if (!kbc_err) {
enable_a20_kbc();
if (a20_test_long())
return 0;
}
-
+
/* Finally, try enabling the "fast A20 gate" */
enable_a20_fast();
if (a20_test_long())
return 0;
}
-
+
return -1;
}
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index a3c58ebe3662..8e3eab34dff4 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -193,8 +193,6 @@ static inline bool heap_free(size_t n)
void copy_to_fs(addr_t dst, void *src, size_t len);
void *copy_from_fs(void *dst, addr_t src, size_t len);
-void copy_to_gs(addr_t dst, void *src, size_t len);
-void *copy_from_gs(void *dst, addr_t src, size_t len);
/* a20.c */
int enable_a20(void);
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 74657589264d..68f9d7a1683b 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -25,7 +25,7 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
# avoid errors with '-march=i386', and future flags may depend on the target to
# be valid.
KBUILD_CFLAGS := -m$(BITS) -O2 $(CLANG_FLAGS)
-KBUILD_CFLAGS += -std=gnu11
+KBUILD_CFLAGS += -std=gnu11 -fms-extensions
KBUILD_CFLAGS += -fno-strict-aliasing -fPIE
KBUILD_CFLAGS += -Wundef
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
@@ -36,7 +36,10 @@ KBUILD_CFLAGS += -mno-mmx -mno-sse
KBUILD_CFLAGS += -ffreestanding -fshort-wchar
KBUILD_CFLAGS += -fno-stack-protector
KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
-KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
+ifdef CONFIG_CC_IS_CLANG
+KBUILD_CFLAGS += -Wno-gnu
+KBUILD_CFLAGS += -Wno-microsoft-anon-tag
+endif
KBUILD_CFLAGS += -Wno-pointer-sign
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
KBUILD_CFLAGS += -D__DISABLE_EXPORTS
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index db1048621ea2..fd855e32c9b9 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -152,17 +152,6 @@ bool insn_has_rep_prefix(struct insn *insn);
void sev_insn_decode_init(void);
bool early_setup_ghcb(void);
#else
-static inline void sev_enable(struct boot_params *bp)
-{
- /*
- * bp->cc_blob_address should only be set by boot/compressed kernel.
- * Initialize it to 0 unconditionally (thus here in this stub too) to
- * ensure that uninitialized values from buggy bootloaders aren't
- * propagated.
- */
- if (bp)
- bp->cc_blob_address = 0;
-}
static inline void snp_check_features(void) { }
static inline void sev_es_shutdown_ghcb(void) { }
static inline bool sev_es_check_ghcb_fault(unsigned long address)
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index bdd26050dff7..0e89e197e112 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -3,6 +3,7 @@
#include <asm/bootparam.h>
#include <asm/bootparam_utils.h>
#include <asm/e820/types.h>
+#include <asm/pgtable.h>
#include <asm/processor.h>
#include "../string.h"
#include "efi.h"
@@ -168,9 +169,10 @@ asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable)
* For 4- to 5-level paging transition, set up current CR3 as
* the first and the only entry in a new top-level page table.
*/
- *trampoline_32bit = __native_read_cr3() | _PAGE_TABLE_NOENC;
+ *trampoline_32bit = native_read_cr3_pa() | _PAGE_TABLE_NOENC;
} else {
- unsigned long src;
+ u64 *new_cr3;
+ pgd_t *pgdp;
/*
* For 5- to 4-level paging transition, copy page table pointed
@@ -180,8 +182,9 @@ asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable)
* We cannot just point to the page table from trampoline as it
* may be above 4G.
*/
- src = *(unsigned long *)__native_read_cr3() & PAGE_MASK;
- memcpy(trampoline_32bit, (void *)src, PAGE_SIZE);
+ pgdp = (pgd_t *)native_read_cr3_pa();
+ new_cr3 = (u64 *)(native_pgd_val(pgdp[0]) & PTE_PFN_MASK);
+ memcpy(trampoline_32bit, new_cr3, PAGE_SIZE);
}
toggle_la57(trampoline_32bit);
diff --git a/arch/x86/boot/compressed/sev-handle-vc.c b/arch/x86/boot/compressed/sev-handle-vc.c
index 7530ad8b768b..030001b46554 100644
--- a/arch/x86/boot/compressed/sev-handle-vc.c
+++ b/arch/x86/boot/compressed/sev-handle-vc.c
@@ -29,11 +29,10 @@
bool insn_has_rep_prefix(struct insn *insn)
{
insn_byte_t p;
- int i;
insn_get_prefixes(insn);
- for_each_insn_prefix(insn, i, p) {
+ for_each_insn_prefix(insn, p) {
if (p == 0xf2 || p == 0xf3)
return true;
}
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 6e5c32a53d03..c8c1464b3a56 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -14,6 +14,7 @@
#include <asm/bootparam.h>
#include <asm/pgtable_types.h>
+#include <asm/shared/msr.h>
#include <asm/sev.h>
#include <asm/trapnr.h>
#include <asm/trap_pf.h>
@@ -397,7 +398,7 @@ void sev_enable(struct boot_params *bp)
}
/* Set the SME mask if this is an SEV guest. */
- boot_rdmsr(MSR_AMD64_SEV, &m);
+ raw_rdmsr(MSR_AMD64_SEV, &m);
sev_status = m.q;
if (!(sev_status & MSR_AMD64_SEV_ENABLED))
return;
@@ -446,7 +447,7 @@ u64 sev_get_status(void)
if (sev_check_cpu_support() < 0)
return 0;
- boot_rdmsr(MSR_AMD64_SEV, &m);
+ raw_rdmsr(MSR_AMD64_SEV, &m);
return m.q;
}
@@ -496,7 +497,7 @@ bool early_is_sevsnp_guest(void)
struct msr m;
/* Obtain the address of the calling area to use */
- boot_rdmsr(MSR_SVSM_CAA, &m);
+ raw_rdmsr(MSR_SVSM_CAA, &m);
boot_svsm_caa_pa = m.q;
/*
diff --git a/arch/x86/boot/compressed/sev.h b/arch/x86/boot/compressed/sev.h
index 92f79c21939c..22637b416b46 100644
--- a/arch/x86/boot/compressed/sev.h
+++ b/arch/x86/boot/compressed/sev.h
@@ -10,7 +10,7 @@
#ifdef CONFIG_AMD_MEM_ENCRYPT
-#include "../msr.h"
+#include <asm/shared/msr.h>
void snp_accept_memory(phys_addr_t start, phys_addr_t end);
u64 sev_get_status(void);
@@ -20,7 +20,7 @@ static inline u64 sev_es_rd_ghcb_msr(void)
{
struct msr m;
- boot_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m);
+ raw_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m);
return m.q;
}
@@ -30,7 +30,7 @@ static inline void sev_es_wr_ghcb_msr(u64 val)
struct msr m;
m.q = val;
- boot_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m);
+ raw_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m);
}
#else
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index f82de8de5dc6..2e1bb936cba2 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -26,9 +26,9 @@
#include <asm/intel-family.h>
#include <asm/processor-flags.h>
#include <asm/msr-index.h>
+#include <asm/shared/msr.h>
#include "string.h"
-#include "msr.h"
static u32 err_flags[NCAPINTS];
@@ -134,9 +134,9 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
struct msr m;
- boot_rdmsr(MSR_K7_HWCR, &m);
+ raw_rdmsr(MSR_K7_HWCR, &m);
m.l &= ~(1 << 15);
- boot_wrmsr(MSR_K7_HWCR, &m);
+ raw_wrmsr(MSR_K7_HWCR, &m);
get_cpuflags(); /* Make sure it really did something */
err = check_cpuflags();
@@ -148,9 +148,9 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
struct msr m;
- boot_rdmsr(MSR_VIA_FCR, &m);
+ raw_rdmsr(MSR_VIA_FCR, &m);
m.l |= (1 << 1) | (1 << 7);
- boot_wrmsr(MSR_VIA_FCR, &m);
+ raw_wrmsr(MSR_VIA_FCR, &m);
set_bit(X86_FEATURE_CX8, cpu.flags);
err = check_cpuflags();
@@ -160,14 +160,14 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
struct msr m, m_tmp;
u32 level = 1;
- boot_rdmsr(0x80860004, &m);
+ raw_rdmsr(0x80860004, &m);
m_tmp = m;
m_tmp.l = ~0;
- boot_wrmsr(0x80860004, &m_tmp);
+ raw_wrmsr(0x80860004, &m_tmp);
asm("cpuid"
: "+a" (level), "=d" (cpu.flags[0])
: : "ecx", "ebx");
- boot_wrmsr(0x80860004, &m);
+ raw_wrmsr(0x80860004, &m);
err = check_cpuflags();
} else if (err == 0x01 &&
diff --git a/arch/x86/boot/msr.h b/arch/x86/boot/msr.h
deleted file mode 100644
index aed66f7ae199..000000000000
--- a/arch/x86/boot/msr.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Helpers/definitions related to MSR access.
- */
-
-#ifndef BOOT_MSR_H
-#define BOOT_MSR_H
-
-#include <asm/shared/msr.h>
-
-/*
- * The kernel proper already defines rdmsr()/wrmsr(), but they are not for the
- * boot kernel since they rely on tracepoint/exception handling infrastructure
- * that's not available here.
- */
-static inline void boot_rdmsr(unsigned int reg, struct msr *m)
-{
- asm volatile("rdmsr" : "=a" (m->l), "=d" (m->h) : "c" (reg));
-}
-
-static inline void boot_wrmsr(unsigned int reg, const struct msr *m)
-{
- asm volatile("wrmsr" : : "c" (reg), "a"(m->l), "d" (m->h) : "memory");
-}
-
-#endif /* BOOT_MSR_H */
diff --git a/arch/x86/boot/startup/Makefile b/arch/x86/boot/startup/Makefile
index e8fdf020b422..5e499cfb29b5 100644
--- a/arch/x86/boot/startup/Makefile
+++ b/arch/x86/boot/startup/Makefile
@@ -36,7 +36,7 @@ $(patsubst %.o,$(obj)/%.o,$(lib-y)): OBJECT_FILES_NON_STANDARD := y
# relocations, even if other objtool actions are being deferred.
#
$(pi-objs): objtool-enabled = 1
-$(pi-objs): objtool-args = $(if $(delay-objtool),,$(objtool-args-y)) --noabs
+$(pi-objs): objtool-args = $(if $(delay-objtool),--dry-run,$(objtool-args-y)) --noabs
#
# Confine the startup code by prefixing all symbols with __pi_ (for position
diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c
index 4e22ffd73516..a0fa8bb2b945 100644
--- a/arch/x86/boot/startup/sev-shared.c
+++ b/arch/x86/boot/startup/sev-shared.c
@@ -12,7 +12,7 @@
#include <asm/setup_data.h>
#ifndef __BOOT_COMPRESSED
-#define has_cpuflag(f) boot_cpu_has(f)
+#define has_cpuflag(f) cpu_feature_enabled(f)
#else
#undef WARN
#define WARN(condition, format...) (!!(condition))
diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c
index 7fc136a35334..f08c7505ed82 100644
--- a/arch/x86/coco/sev/vc-handle.c
+++ b/arch/x86/coco/sev/vc-handle.c
@@ -352,7 +352,6 @@ fault:
#define sev_printk(fmt, ...) printk(fmt, ##__VA_ARGS__)
#define error(v)
-#define has_cpuflag(f) boot_cpu_has(f)
#include "vc-shared.c"
diff --git a/arch/x86/coco/sev/vc-shared.c b/arch/x86/coco/sev/vc-shared.c
index 9b01c9ad81be..58b2f985d546 100644
--- a/arch/x86/coco/sev/vc-shared.c
+++ b/arch/x86/coco/sev/vc-shared.c
@@ -1,5 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
+#ifndef __BOOT_COMPRESSED
+#define has_cpuflag(f) cpu_feature_enabled(f)
+#endif
+
static enum es_result vc_check_opcode_bytes(struct es_em_ctxt *ctxt,
unsigned long exit_code)
{
@@ -546,6 +550,13 @@ static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
/* xgetbv will cause #GP - use reset value for xcr0 */
ghcb_set_xcr0(ghcb, 1);
+ if (has_cpuflag(X86_FEATURE_SHSTK) && regs->ax == 0xd && regs->cx == 1) {
+ struct msr m;
+
+ raw_rdmsr(MSR_IA32_XSS, &m);
+ ghcb_set_xss(ghcb, m.q);
+ }
+
ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0);
if (ret != ES_OK)
return ret;
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index 48d3076b6053..3fd2423d3cf8 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -353,16 +353,6 @@ config CRYPTO_NHPOLY1305_AVX2
Architecture: x86_64 using:
- AVX2 (Advanced Vector Extensions 2)
-config CRYPTO_POLYVAL_CLMUL_NI
- tristate "Hash functions: POLYVAL (CLMUL-NI)"
- depends on 64BIT
- select CRYPTO_POLYVAL
- help
- POLYVAL hash function for HCTR2
-
- Architecture: x86_64 using:
- - CLMUL-NI (carry-less multiplication new instructions)
-
config CRYPTO_SM3_AVX_X86_64
tristate "Hash functions: SM3 (AVX)"
depends on 64BIT
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 2d30d5d36145..5f2fb4f148fe 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -46,15 +46,13 @@ obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
aesni-intel-$(CONFIG_64BIT) += aes-ctr-avx-x86_64.o \
aes-gcm-aesni-x86_64.o \
- aes-xts-avx-x86_64.o \
- aes-gcm-avx10-x86_64.o
+ aes-gcm-vaes-avx2.o \
+ aes-gcm-vaes-avx512.o \
+ aes-xts-avx-x86_64.o
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
-obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o
-polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o
-
obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
diff --git a/arch/x86/crypto/aes-gcm-aesni-x86_64.S b/arch/x86/crypto/aes-gcm-aesni-x86_64.S
index 45940e2883a0..7c8a8a32bd3c 100644
--- a/arch/x86/crypto/aes-gcm-aesni-x86_64.S
+++ b/arch/x86/crypto/aes-gcm-aesni-x86_64.S
@@ -61,15 +61,15 @@
// for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems
// there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.)
//
-// The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is
+// The design generally follows that of aes-gcm-vaes-avx512.S, and that file is
// more thoroughly commented. This file has the following notable changes:
//
// - The vector length is fixed at 128-bit, i.e. xmm registers. This means
// there is only one AES block (and GHASH block) per register.
//
-// - Without AVX512 / AVX10, only 16 SIMD registers are available instead of
-// 32. We work around this by being much more careful about using
-// registers, relying heavily on loads to load values as they are needed.
+// - Without AVX512, only 16 SIMD registers are available instead of 32. We
+// work around this by being much more careful about using registers,
+// relying heavily on loads to load values as they are needed.
//
// - Masking is not available either. We work around this by implementing
// partial block loads and stores using overlapping scalar loads and stores
@@ -90,8 +90,8 @@
// multiplication instead of schoolbook multiplication. This saves one
// pclmulqdq instruction per block, at the cost of one 64-bit load, one
// pshufd, and 0.25 pxors per block. (This is without the three-argument
-// XOR support that would be provided by AVX512 / AVX10, which would be
-// more beneficial to schoolbook than Karatsuba.)
+// XOR support that would be provided by AVX512, which would be more
+// beneficial to schoolbook than Karatsuba.)
//
// As a rough approximation, we can assume that Karatsuba multiplication is
// faster than schoolbook multiplication in this context if one pshufd and
diff --git a/arch/x86/crypto/aes-gcm-vaes-avx2.S b/arch/x86/crypto/aes-gcm-vaes-avx2.S
new file mode 100644
index 000000000000..93c9504a488f
--- /dev/null
+++ b/arch/x86/crypto/aes-gcm-vaes-avx2.S
@@ -0,0 +1,1146 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// AES-GCM implementation for x86_64 CPUs that support the following CPU
+// features: VAES && VPCLMULQDQ && AVX2
+//
+// Copyright 2025 Google LLC
+//
+// Author: Eric Biggers <ebiggers@google.com>
+//
+//------------------------------------------------------------------------------
+//
+// This file is dual-licensed, meaning that you can use it under your choice of
+// either of the following two licenses:
+//
+// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
+// of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// or
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// -----------------------------------------------------------------------------
+//
+// This is similar to aes-gcm-vaes-avx512.S, but it uses AVX2 instead of AVX512.
+// This means it can only use 16 vector registers instead of 32, the maximum
+// vector length is 32 bytes, and some instructions such as vpternlogd and
+// masked loads/stores are unavailable. However, it is able to run on CPUs that
+// have VAES without AVX512, namely AMD Zen 3 (including "Milan" server CPUs),
+// various Intel client CPUs such as Alder Lake, and Intel Sierra Forest.
+//
+// This implementation also uses Karatsuba multiplication instead of schoolbook
+// multiplication for GHASH in its main loop. This does not help much on Intel,
+// but it improves performance by ~5% on AMD Zen 3. Other factors weighing
+// slightly in favor of Karatsuba multiplication in this implementation are the
+// lower maximum vector length (which means there are fewer key powers, so we
+// can cache the halves of each key power XOR'd together and still use less
+// memory than the AVX512 implementation), and the unavailability of the
+// vpternlogd instruction (which helped schoolbook a bit more than Karatsuba).
+
+#include <linux/linkage.h>
+
+.section .rodata
+.p2align 4 // 2^4 = 16-byte alignment
+
+ // The below three 16-byte values must be in the order that they are, as
+ // they are really two 32-byte tables and a 16-byte value that overlap:
+ //
+ // - The first 32-byte table begins at .Lselect_high_bytes_table.
+ // For 0 <= len <= 16, the 16-byte value at
+ // '.Lselect_high_bytes_table + len' selects the high 'len' bytes of
+ // another 16-byte value when AND'ed with it.
+ //
+ // - The second 32-byte table begins at .Lrshift_and_bswap_table.
+ // For 0 <= len <= 16, the 16-byte value at
+ // '.Lrshift_and_bswap_table + len' is a vpshufb mask that does the
+ // following operation: right-shift by '16 - len' bytes (shifting in
+ // zeroes), then reflect all 16 bytes.
+ //
+ // - The 16-byte value at .Lbswap_mask is a vpshufb mask that reflects
+ // all 16 bytes.
+.Lselect_high_bytes_table:
+ .octa 0 // First half of the select table
+.Lrshift_and_bswap_table:
+ .octa 0xffffffffffffffffffffffffffffffff // Shared by both tables
+.Lbswap_mask:
+ .octa 0x000102030405060708090a0b0c0d0e0f // Also second half of rshift table
+
+ // Sixteen 0x0f bytes. By XOR'ing an entry of .Lrshift_and_bswap_table
+ // with this, we get a mask that left-shifts by '16 - len' bytes.
+.Lfifteens:
+ .octa 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+
+ // This is the GHASH reducing polynomial without its constant term, i.e.
+ // x^128 + x^7 + x^2 + x, represented using the backwards mapping
+ // between bits and polynomial coefficients.
+ //
+ // Alternatively, it can be interpreted as the naturally-ordered
+ // representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the
+ // "reversed" GHASH reducing polynomial without its x^128 term.
+.Lgfpoly:
+ .octa 0xc2000000000000000000000000000001
+
+ // Same as above, but with the (1 << 64) bit set.
+.Lgfpoly_and_internal_carrybit:
+ .octa 0xc2000000000000010000000000000001 // Used when multiplying H by x
+
+ // Values needed to prepare the initial vector of counter blocks.
+.Lctr_pattern:
+ .octa 0 // Low 128-bit lane: counter + 0
+ .octa 1 // High 128-bit lane: counter + 1
+
+ // The number of AES blocks per vector, as a 128-bit value.
+.Linc_2blocks:
+ .octa 2 // Broadcast and added to LE_CTR by _ctr_begin
+
+// Offsets in struct aes_gcm_key_vaes_avx2
+#define OFFSETOF_AESKEYLEN 480 // AES key length in bytes (32-bit load)
+#define OFFSETOF_H_POWERS 512 // Start of the hash key powers
+#define NUM_H_POWERS 8 // H^8 through H^1
+#define OFFSETOFEND_H_POWERS (OFFSETOF_H_POWERS + (NUM_H_POWERS * 16)) // 16 bytes per power
+#define OFFSETOF_H_POWERS_XORED OFFSETOFEND_H_POWERS // Directly follows the key powers
+
+.text
+
+// Do step \i (0 through 9) of GHASH-multiplying the 128-bit lanes of \a by the
+// 128-bit lanes of \b and storing the reduced products in \dst. Uses schoolbook
+// multiplication. \t0, \t1, and \t2 hold state that must survive across steps.
+.macro _ghash_mul_step i, a, b, dst, gfpoly, t0, t1, t2
+.if \i == 0
+ vpclmulqdq $0x00, \a, \b, \t0 // LO = a_L * b_L
+ vpclmulqdq $0x01, \a, \b, \t1 // MI_0 = a_L * b_H
+.elseif \i == 1
+ vpclmulqdq $0x10, \a, \b, \t2 // MI_1 = a_H * b_L
+.elseif \i == 2
+ vpxor \t2, \t1, \t1 // MI = MI_0 + MI_1
+.elseif \i == 3
+ vpclmulqdq $0x01, \t0, \gfpoly, \t2 // LO_L*(x^63 + x^62 + x^57)
+.elseif \i == 4
+ vpshufd $0x4e, \t0, \t0 // Swap halves of LO
+.elseif \i == 5
+ vpxor \t0, \t1, \t1 // Fold LO into MI (part 1)
+ vpxor \t2, \t1, \t1 // Fold LO into MI (part 2)
+.elseif \i == 6
+ vpclmulqdq $0x11, \a, \b, \dst // HI = a_H * b_H
+.elseif \i == 7
+ vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57)
+.elseif \i == 8
+ vpshufd $0x4e, \t1, \t1 // Swap halves of MI
+.elseif \i == 9
+ vpxor \t1, \dst, \dst // Fold MI into HI (part 1)
+ vpxor \t0, \dst, \dst // Fold MI into HI (part 2)
+.endif
+.endm
+
+// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store
+// the reduced products in \dst. See _ghash_mul_step for full explanation.
+.macro _ghash_mul a, b, dst, gfpoly, t0, t1, t2
+.irp i, 0,1,2,3,4,5,6,7,8,9 // Emit all ten steps back to back, in order
+ _ghash_mul_step \i, \a, \b, \dst, \gfpoly, \t0, \t1, \t2
+.endr
+.endm
+
+// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the
+// *unreduced* products to \lo, \mi, and \hi.
+.macro _ghash_mul_noreduce a, b, lo, mi, hi, t0
+ vpclmulqdq $0x00, \a, \b, \t0 // a_L * b_L
+ vpxor \t0, \lo, \lo // LO += a_L * b_L
+ vpclmulqdq $0x01, \a, \b, \t0 // a_L * b_H
+ vpxor \t0, \mi, \mi // MI += a_L * b_H
+ vpclmulqdq $0x10, \a, \b, \t0 // a_H * b_L
+ vpxor \t0, \mi, \mi // MI += a_H * b_L
+ vpclmulqdq $0x11, \a, \b, \t0 // a_H * b_H
+ vpxor \t0, \hi, \hi // HI += a_H * b_H
+.endm
+
+// Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit
+// reduced products in \hi. See _ghash_mul_step for explanation of reduction.
+.macro _ghash_reduce lo, mi, hi, gfpoly, t0
+ vpclmulqdq $0x01, \lo, \gfpoly, \t0 // LO_L*(x^63 + x^62 + x^57)
+ vpshufd $0x4e, \lo, \lo // Swap halves of LO (clobbers \lo)
+ vpxor \lo, \mi, \mi // Fold LO into MI (part 1)
+ vpxor \t0, \mi, \mi // Fold LO into MI (part 2)
+ vpclmulqdq $0x01, \mi, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57)
+ vpshufd $0x4e, \mi, \mi // Swap halves of MI (clobbers \mi)
+ vpxor \mi, \hi, \hi // Fold MI into HI (part 1)
+ vpxor \t0, \hi, \hi // Fold MI into HI (part 2)
+.endm
+
+// This is a specialized version of _ghash_mul that computes \a * \a, i.e. it
+// squares \a. It skips computing MI = (a_L * a_H) + (a_H * a_L) = 0.
+.macro _ghash_square a, dst, gfpoly, t0, t1
+ vpclmulqdq $0x00, \a, \a, \t0 // LO = a_L * a_L
+ vpclmulqdq $0x11, \a, \a, \dst // HI = a_H * a_H
+ vpclmulqdq $0x01, \t0, \gfpoly, \t1 // LO_L*(x^63 + x^62 + x^57)
+ vpshufd $0x4e, \t0, \t0 // Swap halves of LO
+ vpxor \t0, \t1, \t1 // Fold LO into MI (MI itself is zero here)
+ vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57)
+ vpshufd $0x4e, \t1, \t1 // Swap halves of MI
+ vpxor \t1, \dst, \dst // Fold MI into HI (part 1)
+ vpxor \t0, \dst, \dst // Fold MI into HI (part 2)
+.endm
+
+// void aes_gcm_precompute_vaes_avx2(struct aes_gcm_key_vaes_avx2 *key);
+//
+// Given the expanded AES key |key->base.aes_key|, derive the GHASH subkey and
+// initialize |key->h_powers| and |key->h_powers_xored|.
+//
+// We use h_powers[0..7] to store H^8 through H^1, and h_powers_xored[0..7] to
+// store the 64-bit halves of the key powers XOR'd together (for Karatsuba
+// multiplication) in the order 8,6,7,5,4,2,3,1.
+SYM_FUNC_START(aes_gcm_precompute_vaes_avx2)
+
+ // Function arguments
+ .set KEY, %rdi
+
+ // Additional local variables
+ .set POWERS_PTR, %rsi
+ .set RNDKEYLAST_PTR, %rdx
+ .set TMP0, %ymm0
+ .set TMP0_XMM, %xmm0
+ .set TMP1, %ymm1
+ .set TMP1_XMM, %xmm1
+ .set TMP2, %ymm2
+ .set TMP2_XMM, %xmm2
+ .set H_CUR, %ymm3
+ .set H_CUR_XMM, %xmm3
+ .set H_CUR2, %ymm4
+ .set H_INC, %ymm5
+ .set H_INC_XMM, %xmm5
+ .set GFPOLY, %ymm6
+ .set GFPOLY_XMM, %xmm6
+
+ // Encrypt an all-zeroes block to get the raw hash subkey.
+ movl OFFSETOF_AESKEYLEN(KEY), %eax // AES key length in bytes
+ lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR // KEY + 16 * (6 + keylen / 4)
+ vmovdqu (KEY), H_CUR_XMM // Zero-th round key XOR all-zeroes block
+ lea 16(KEY), %rax // %rax = pointer to round key 1
+1:
+ vaesenc (%rax), H_CUR_XMM, H_CUR_XMM
+ add $16, %rax // Advance to the next round key
+ cmp %rax, RNDKEYLAST_PTR // Reached the last round key?
+ jne 1b
+ vaesenclast (RNDKEYLAST_PTR), H_CUR_XMM, H_CUR_XMM
+
+ // Reflect the bytes of the raw hash subkey.
+ vpshufb .Lbswap_mask(%rip), H_CUR_XMM, H_CUR_XMM
+
+ // Finish preprocessing the byte-reflected hash subkey by multiplying it
+ // by x^-1 ("standard" interpretation of polynomial coefficients) or
+ // equivalently x^1 (natural interpretation). This gets the key into a
+ // format that avoids having to bit-reflect the data blocks later.
+ vpshufd $0xd3, H_CUR_XMM, TMP0_XMM // Dwords 3,1,0,3 of H (high to low)
+ vpsrad $31, TMP0_XMM, TMP0_XMM // Spread each dword's top bit across it
+ vpaddq H_CUR_XMM, H_CUR_XMM, H_CUR_XMM // Shift each 64-bit half left by 1
+ vpand .Lgfpoly_and_internal_carrybit(%rip), TMP0_XMM, TMP0_XMM // Keep carry (bit 63) and gfpoly (bit 127)
+ vpxor TMP0_XMM, H_CUR_XMM, H_CUR_XMM // Apply the carry and the reduction
+
+ // Load the gfpoly constant.
+ vbroadcasti128 .Lgfpoly(%rip), GFPOLY
+
+ // Square H^1 to get H^2.
+ _ghash_square H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, TMP0_XMM, TMP1_XMM
+
+ // Create H_CUR = [H^2, H^1] and H_INC = [H^2, H^2].
+ vinserti128 $1, H_CUR_XMM, H_INC, H_CUR // Low lane H^2, high lane H^1
+ vinserti128 $1, H_INC_XMM, H_INC, H_INC // H^2 in both lanes
+
+ // Compute H_CUR2 = [H^4, H^3].
+ _ghash_mul H_INC, H_CUR, H_CUR2, GFPOLY, TMP0, TMP1, TMP2
+
+ // Store [H^2, H^1] and [H^4, H^3].
+ vmovdqu H_CUR, OFFSETOF_H_POWERS+3*32(KEY)
+ vmovdqu H_CUR2, OFFSETOF_H_POWERS+2*32(KEY)
+
+ // For Karatsuba multiplication: compute and store the two 64-bit halves
+ // of each key power XOR'd together. Order is 4,2,3,1.
+ vpunpcklqdq H_CUR, H_CUR2, TMP0 // Low halves of powers 4,2,3,1
+ vpunpckhqdq H_CUR, H_CUR2, TMP1 // High halves of powers 4,2,3,1
+ vpxor TMP1, TMP0, TMP0 // Low half XOR high half
+ vmovdqu TMP0, OFFSETOF_H_POWERS_XORED+32(KEY)
+
+ // Compute and store H_CUR = [H^6, H^5] and H_CUR2 = [H^8, H^7].
+ _ghash_mul H_INC, H_CUR2, H_CUR, GFPOLY, TMP0, TMP1, TMP2
+ _ghash_mul H_INC, H_CUR, H_CUR2, GFPOLY, TMP0, TMP1, TMP2
+ vmovdqu H_CUR, OFFSETOF_H_POWERS+1*32(KEY)
+ vmovdqu H_CUR2, OFFSETOF_H_POWERS+0*32(KEY)
+
+ // Again, compute and store the two 64-bit halves of each key power
+ // XOR'd together. Order is 8,6,7,5.
+ vpunpcklqdq H_CUR, H_CUR2, TMP0 // Low halves of powers 8,6,7,5
+ vpunpckhqdq H_CUR, H_CUR2, TMP1 // High halves of powers 8,6,7,5
+ vpxor TMP1, TMP0, TMP0 // Low half XOR high half
+ vmovdqu TMP0, OFFSETOF_H_POWERS_XORED(KEY)
+
+ vzeroupper // Zero the upper halves of the ymm registers
+ RET
+SYM_FUNC_END(aes_gcm_precompute_vaes_avx2)
+
+// Do one step of the GHASH update of four vectors of data blocks.
+// \i: the step to do, 0 through 9
+// \ghashdata_ptr: pointer to the data blocks (ciphertext or AAD)
+// KEY: pointer to struct aes_gcm_key_vaes_avx2
+// BSWAP_MASK: mask for reflecting the bytes of blocks
+// H_POW[2-1]_XORED: cached values from KEY->h_powers_xored
+// TMP[0-2]: temporary registers. TMP[1-2] must be preserved across steps.
+// LO, MI: working state for this macro that must be preserved across steps
+// GHASH_ACC: the GHASH accumulator (input/output)
+.macro _ghash_step_4x i, ghashdata_ptr
+ .set HI, GHASH_ACC # alias
+ .set HI_XMM, GHASH_ACC_XMM
+.if \i == 0
+ // First vector
+ vmovdqu 0*32(\ghashdata_ptr), TMP1
+ vpshufb BSWAP_MASK, TMP1, TMP1
+ vmovdqu OFFSETOF_H_POWERS+0*32(KEY), TMP2 // [H^8, H^7]
+ vpxor GHASH_ACC, TMP1, TMP1 // Add the accumulator to the first blocks
+ vpclmulqdq $0x00, TMP2, TMP1, LO // LO = a_L * b_L
+ vpclmulqdq $0x11, TMP2, TMP1, HI // HI = a_H * b_H (overwrites GHASH_ACC)
+ vpunpckhqdq TMP1, TMP1, TMP0
+ vpxor TMP1, TMP0, TMP0 // Low qword = a_L + a_H
+ vpclmulqdq $0x00, H_POW2_XORED, TMP0, MI // MI = (a_L + a_H) * (b_L + b_H)
+.elseif \i == 1 // Intentionally empty: no work in this step
+.elseif \i == 2
+ // Second vector
+ vmovdqu 1*32(\ghashdata_ptr), TMP1
+ vpshufb BSWAP_MASK, TMP1, TMP1
+ vmovdqu OFFSETOF_H_POWERS+1*32(KEY), TMP2 // [H^6, H^5]
+ vpclmulqdq $0x00, TMP2, TMP1, TMP0
+ vpxor TMP0, LO, LO
+ vpclmulqdq $0x11, TMP2, TMP1, TMP0
+ vpxor TMP0, HI, HI
+ vpunpckhqdq TMP1, TMP1, TMP0
+ vpxor TMP1, TMP0, TMP0
+ vpclmulqdq $0x10, H_POW2_XORED, TMP0, TMP0 // Uses high qword of H_POW2_XORED
+ vpxor TMP0, MI, MI
+.elseif \i == 3
+ // Third vector
+ vmovdqu 2*32(\ghashdata_ptr), TMP1
+ vpshufb BSWAP_MASK, TMP1, TMP1
+ vmovdqu OFFSETOF_H_POWERS+2*32(KEY), TMP2 // [H^4, H^3]
+.elseif \i == 4
+ vpclmulqdq $0x00, TMP2, TMP1, TMP0
+ vpxor TMP0, LO, LO
+ vpclmulqdq $0x11, TMP2, TMP1, TMP0
+ vpxor TMP0, HI, HI
+.elseif \i == 5
+ vpunpckhqdq TMP1, TMP1, TMP0
+ vpxor TMP1, TMP0, TMP0
+ vpclmulqdq $0x00, H_POW1_XORED, TMP0, TMP0 // Uses low qword of H_POW1_XORED
+ vpxor TMP0, MI, MI
+
+ // Fourth vector
+ vmovdqu 3*32(\ghashdata_ptr), TMP1
+ vpshufb BSWAP_MASK, TMP1, TMP1
+.elseif \i == 6
+ vmovdqu OFFSETOF_H_POWERS+3*32(KEY), TMP2 // [H^2, H^1]
+ vpclmulqdq $0x00, TMP2, TMP1, TMP0
+ vpxor TMP0, LO, LO
+ vpclmulqdq $0x11, TMP2, TMP1, TMP0
+ vpxor TMP0, HI, HI
+ vpunpckhqdq TMP1, TMP1, TMP0
+ vpxor TMP1, TMP0, TMP0
+ vpclmulqdq $0x10, H_POW1_XORED, TMP0, TMP0 // Uses high qword of H_POW1_XORED
+ vpxor TMP0, MI, MI
+.elseif \i == 7
+ // Finalize 'mi' following Karatsuba multiplication.
+ vpxor LO, MI, MI
+ vpxor HI, MI, MI
+
+ // Fold lo into mi.
+ vbroadcasti128 .Lgfpoly(%rip), TMP2 // TMP2 = gfpoly; reused by step 8
+ vpclmulqdq $0x01, LO, TMP2, TMP0
+ vpshufd $0x4e, LO, LO
+ vpxor LO, MI, MI
+ vpxor TMP0, MI, MI
+.elseif \i == 8
+ // Fold mi into hi.
+ vpclmulqdq $0x01, MI, TMP2, TMP0
+ vpshufd $0x4e, MI, MI
+ vpxor MI, HI, HI
+ vpxor TMP0, HI, HI
+.elseif \i == 9
+ vextracti128 $1, HI, TMP0_XMM // Take the high 128-bit lane of HI
+ vpxor TMP0_XMM, HI_XMM, GHASH_ACC_XMM // XOR the two lanes into the accumulator
+.endif
+.endm
+
+// Update GHASH with four vectors of data blocks. See _ghash_step_4x for full
+// explanation.
+.macro _ghash_4x ghashdata_ptr
+.irp i, 0,1,2,3,4,5,6,7,8,9 // Emit all ten steps back to back, in order
+ _ghash_step_4x \i, \ghashdata_ptr
+.endr
+.endm
+
+// Load 1 <= %ecx <= 16 bytes from the pointer \src into the xmm register \dst
+// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}.
+.macro _load_partial_block src, dst, tmp64, tmp32
+ sub $8, %ecx // LEN - 8
+ jle .Lle8\@
+
+ // Load 9 <= LEN <= 16 bytes.
+ vmovq (\src), \dst // Load first 8 bytes
+ mov (\src, %rcx), %rax // Load last 8 bytes
+ neg %ecx // 8 - LEN
+ shl $3, %ecx // Bytes to bits; shr below uses %cl mod 64
+ shr %cl, %rax // Discard overlapping bytes
+ vpinsrq $1, %rax, \dst, \dst // Place the remainder in the high qword
+ jmp .Ldone\@
+
+.Lle8\@:
+ add $4, %ecx // LEN - 4
+ jl .Llt4\@
+
+ // Load 4 <= LEN <= 8 bytes.
+ mov (\src), %eax // Load first 4 bytes
+ mov (\src, %rcx), \tmp32 // Load last 4 bytes
+ jmp .Lcombine\@
+
+.Llt4\@:
+ // Load 1 <= LEN <= 3 bytes.
+ add $2, %ecx // LEN - 2
+ movzbl (\src), %eax // Load first byte
+ jl .Lmovq\@ // LEN == 1 (flags are from the add above)
+ movzwl (\src, %rcx), \tmp32 // Load last 2 bytes
+.Lcombine\@:
+ shl $3, %ecx // Byte offset of the last part, in bits
+ shl %cl, \tmp64 // Move the last part up to its offset
+ or \tmp64, %rax // Combine the two parts
+.Lmovq\@:
+ vmovq %rax, \dst // Also zeroes the high qword of \dst
+.Ldone\@:
+.endm
+
+// Store 1 <= %ecx <= 16 bytes from the xmm register \src to the pointer \dst.
+// Clobbers %rax, %rcx, and \tmp{64,32}.
+.macro _store_partial_block src, dst, tmp64, tmp32
+ sub $8, %ecx // LEN - 8
+ jl .Llt8\@
+
+ // Store 8 <= LEN <= 16 bytes.
+ vpextrq $1, \src, %rax // High 8 bytes of \src
+ mov %ecx, \tmp32 // Save LEN - 8 as the store offset
+ shl $3, %ecx // Bytes to bits
+ ror %cl, %rax // Move the wanted bytes to the top
+ mov %rax, (\dst, \tmp64) // Store last LEN - 8 bytes
+ vmovq \src, (\dst) // Store first 8 bytes
+ jmp .Ldone\@
+
+.Llt8\@:
+ add $4, %ecx // LEN - 4
+ jl .Llt4\@
+
+ // Store 4 <= LEN <= 7 bytes.
+ vpextrd $1, \src, %eax // Bytes 4-7 of \src
+ mov %ecx, \tmp32 // Save LEN - 4 as the store offset
+ shl $3, %ecx // Bytes to bits
+ ror %cl, %eax // Move the wanted bytes to the top
+ mov %eax, (\dst, \tmp64) // Store last LEN - 4 bytes
+ vmovd \src, (\dst) // Store first 4 bytes
+ jmp .Ldone\@
+
+.Llt4\@:
+ // Store 1 <= LEN <= 3 bytes.
+ vpextrb $0, \src, 0(\dst)
+ cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2?
+ jl .Ldone\@ // LEN == 1
+ vpextrb $1, \src, 1(\dst)
+ je .Ldone\@ // LEN == 2 (flags are still from the cmp)
+ vpextrb $2, \src, 2(\dst)
+.Ldone\@:
+.endm
+
+// void aes_gcm_aad_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+// u8 ghash_acc[16],
+// const u8 *aad, int aadlen);
+//
+// This function processes the AAD (Additional Authenticated Data) in GCM.
+// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
+// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all
+// zeroes. |aadlen| must be a multiple of 16, except on the last call where it
+// can be any length. The caller must do any buffering needed to ensure this.
+//
+// This handles large amounts of AAD efficiently, while also keeping overhead
+// low for small amounts which is the common case. TLS and IPsec use less than
+// one block of AAD, but (uncommonly) other use cases may use much more.
+SYM_FUNC_START(aes_gcm_aad_update_vaes_avx2)
+
+ // Function arguments
+ .set KEY, %rdi
+ .set GHASH_ACC_PTR, %rsi
+ .set AAD, %rdx
+ .set AADLEN, %ecx // Must be %ecx for _load_partial_block
+ .set AADLEN64, %rcx // Zero-extend AADLEN before using!
+
+ // Additional local variables.
+ // %rax and %r8 are used as temporary registers.
+ .set TMP0, %ymm0
+ .set TMP0_XMM, %xmm0
+ .set TMP1, %ymm1
+ .set TMP1_XMM, %xmm1
+ .set TMP2, %ymm2
+ .set TMP2_XMM, %xmm2
+ .set LO, %ymm3
+ .set LO_XMM, %xmm3
+ .set MI, %ymm4
+ .set MI_XMM, %xmm4
+ .set GHASH_ACC, %ymm5
+ .set GHASH_ACC_XMM, %xmm5
+ .set BSWAP_MASK, %ymm6
+ .set BSWAP_MASK_XMM, %xmm6
+ .set GFPOLY, %ymm7
+ .set GFPOLY_XMM, %xmm7
+ .set H_POW2_XORED, %ymm8
+ .set H_POW1_XORED, %ymm9
+
+ // Load the bswap_mask and gfpoly constants. Since AADLEN is usually
+ // small, usually only 128-bit vectors will be used. So as an
+ // optimization, don't broadcast these constants to both 128-bit lanes
+ // quite yet.
+ vmovdqu .Lbswap_mask(%rip), BSWAP_MASK_XMM
+ vmovdqu .Lgfpoly(%rip), GFPOLY_XMM
+
+ // Load the GHASH accumulator.
+ vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM
+
+ // Check for the common case of AADLEN <= 16, as well as AADLEN == 0.
+ test AADLEN, AADLEN
+ jz .Laad_done
+ cmp $16, AADLEN
+ jle .Laad_lastblock
+
+ // AADLEN > 16, so we'll operate on full vectors. Broadcast bswap_mask
+ // and gfpoly to both 128-bit lanes.
+ vinserti128 $1, BSWAP_MASK_XMM, BSWAP_MASK, BSWAP_MASK
+ vinserti128 $1, GFPOLY_XMM, GFPOLY, GFPOLY
+
+ // If AADLEN >= 128, update GHASH with 128 bytes of AAD at a time.
+ add $-128, AADLEN // 128 is 4 bytes, -128 is 1 byte
+ jl .Laad_loop_4x_done
+ vmovdqu OFFSETOF_H_POWERS_XORED(KEY), H_POW2_XORED
+ vmovdqu OFFSETOF_H_POWERS_XORED+32(KEY), H_POW1_XORED
+.Laad_loop_4x:
+ _ghash_4x AAD
+ sub $-128, AAD // AAD += 128
+ add $-128, AADLEN // AADLEN -= 128
+ jge .Laad_loop_4x
+.Laad_loop_4x_done:
+
+ // If AADLEN >= 32, update GHASH with 32 bytes of AAD at a time.
+ add $96, AADLEN // AADLEN = (bytes remaining) - 32
+ jl .Laad_loop_1x_done
+.Laad_loop_1x:
+ vmovdqu (AAD), TMP0
+ vpshufb BSWAP_MASK, TMP0, TMP0
+ vpxor TMP0, GHASH_ACC, GHASH_ACC
+ vmovdqu OFFSETOFEND_H_POWERS-32(KEY), TMP0 // Last two key powers: [H^2, H^1]
+ _ghash_mul TMP0, GHASH_ACC, GHASH_ACC, GFPOLY, TMP1, TMP2, LO
+ vextracti128 $1, GHASH_ACC, TMP0_XMM // Take the high 128-bit lane
+ vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM // XOR the two lanes together
+ add $32, AAD
+ sub $32, AADLEN
+ jge .Laad_loop_1x
+.Laad_loop_1x_done:
+ add $32, AADLEN // Undo the extra subtraction by 32
+ // Now 0 <= AADLEN < 32.
+
+ jz .Laad_done // ZF is from the add above
+ cmp $16, AADLEN
+ jle .Laad_lastblock
+
+ // Update GHASH with the remaining 17 <= AADLEN <= 31 bytes of AAD.
+ mov AADLEN, AADLEN // Zero-extend AADLEN to AADLEN64.
+ vmovdqu (AAD), TMP0_XMM // First 16 bytes
+ vmovdqu -16(AAD, AADLEN64), TMP1_XMM // Last 16 bytes (overlaps the first)
+ vpshufb BSWAP_MASK_XMM, TMP0_XMM, TMP0_XMM
+ vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+ lea .Lrshift_and_bswap_table(%rip), %rax
+ vpshufb -16(%rax, AADLEN64), TMP1_XMM, TMP1_XMM // Drop overlap, then reflect
+ vinserti128 $1, TMP1_XMM, GHASH_ACC, GHASH_ACC // Partial block goes in the high lane
+ vmovdqu OFFSETOFEND_H_POWERS-32(KEY), TMP0 // Last two key powers: [H^2, H^1]
+ _ghash_mul TMP0, GHASH_ACC, GHASH_ACC, GFPOLY, TMP1, TMP2, LO
+ vextracti128 $1, GHASH_ACC, TMP0_XMM // Take the high 128-bit lane
+ vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM // XOR the two lanes together
+ jmp .Laad_done
+
+.Laad_lastblock:
+ // Update GHASH with the remaining 1 <= AADLEN <= 16 bytes of AAD.
+ _load_partial_block AAD, TMP0_XMM, %r8, %r8d
+ vpshufb BSWAP_MASK_XMM, TMP0_XMM, TMP0_XMM
+ vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+ vmovdqu OFFSETOFEND_H_POWERS-16(KEY), TMP0_XMM // Last key power: H^1
+ _ghash_mul TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM, GFPOLY_XMM, \
+ TMP1_XMM, TMP2_XMM, LO_XMM
+
+.Laad_done:
+ // Store the updated GHASH accumulator back to memory.
+ vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR)
+
+ vzeroupper // Zero the upper halves of the ymm registers
+ RET
+SYM_FUNC_END(aes_gcm_aad_update_vaes_avx2)
+
+// Do one non-last round of AES encryption on the blocks in the given AESDATA
+// vectors using the round key that has been broadcast to all 128-bit lanes of
+// \round_key.
+.macro _vaesenc round_key, vecs:vararg
+.irp i, \vecs // \vecs is a list of AESDATA register indices
+ vaesenc \round_key, AESDATA\i, AESDATA\i
+.endr
+.endm
+
+// Generate counter blocks in the given AESDATA vectors, then do the zero-th AES
+// round on them. Clobbers TMP0.
+.macro _ctr_begin vecs:vararg
+ vbroadcasti128 .Linc_2blocks(%rip), TMP0 // Counter increment per vector
+.irp i, \vecs
+ vpshufb BSWAP_MASK, LE_CTR, AESDATA\i // Byte-reflect the little-endian counters
+ vpaddd TMP0, LE_CTR, LE_CTR // Advance LE_CTR for the next vector
+.endr
+.irp i, \vecs
+ vpxor RNDKEY0, AESDATA\i, AESDATA\i // Zero-th AES round
+.endr
+.endm
+
+// Generate and encrypt counter blocks in the given AESDATA vectors, excluding
+// the last AES round. Clobbers %rax and TMP0.
+.macro _aesenc_loop vecs:vararg
+ _ctr_begin \vecs
+ lea 16(KEY), %rax // %rax = pointer to round key 1
+.Laesenc_loop\@:
+ vbroadcasti128 (%rax), TMP0 // Copy the round key to both 128-bit lanes
+ _vaesenc TMP0, \vecs
+ add $16, %rax // Advance to the next round key
+ cmp %rax, RNDKEYLAST_PTR // Stop before the last round key
+ jne .Laesenc_loop\@
+.endm
+
+// Finalize the keystream blocks in the given AESDATA vectors by doing the last
+// AES round, then XOR those keystream blocks with the corresponding data.
+// Reduce latency by doing the XOR before the vaesenclast, utilizing the
+// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). Clobbers TMP0.
+.macro _aesenclast_and_xor vecs:vararg
+.irp i, \vecs
+ vpxor \i*32(SRC), RNDKEYLAST, TMP0 // Last round key XOR source data
+ vaesenclast TMP0, AESDATA\i, AESDATA\i // AESDATA\i is now the output data
+.endr
+.irp i, \vecs
+ vmovdqu AESDATA\i, \i*32(DST) // Stores come after all loads from SRC
+.endr
+.endm
+
+// void aes_gcm_{enc,dec}_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+// const u32 le_ctr[4], u8 ghash_acc[16],
+// const u8 *src, u8 *dst, int datalen);
+//
+// This macro generates a GCM encryption or decryption update function with the
+// above prototype (with \enc selecting which one). The function computes the
+// next portion of the CTR keystream, XOR's it with |datalen| bytes from |src|,
+// and writes the resulting encrypted or decrypted data to |dst|. It also
+// updates the GHASH accumulator |ghash_acc| using the next |datalen| ciphertext
+// bytes.
+//
+// |datalen| must be a multiple of 16, except on the last call where it can be
+// any length. The caller must do any buffering needed to ensure this. Both
+// in-place and out-of-place en/decryption are supported.
+//
+// |le_ctr| must give the current counter in little-endian format. This
+// function loads the counter from |le_ctr| and increments the loaded counter as
+// needed, but it does *not* store the updated counter back to |le_ctr|. The
+// caller must update |le_ctr| if any more data segments follow. Internally,
+// only the low 32-bit word of the counter is incremented, following the GCM
+// standard.
+.macro _aes_gcm_update enc
+
+ // Function arguments
+ .set KEY, %rdi
+ .set LE_CTR_PTR, %rsi
+ .set LE_CTR_PTR32, %esi
+ .set GHASH_ACC_PTR, %rdx
+ .set SRC, %rcx // Assumed to be %rcx.
+ // See .Ltail_xor_and_ghash_1to16bytes
+ .set DST, %r8
+ .set DATALEN, %r9d
+ .set DATALEN64, %r9 // Zero-extend DATALEN before using!
+
+ // Additional local variables
+
+ // %rax is used as a temporary register. LE_CTR_PTR is also available
+ // as a temporary register after the counter is loaded.
+
+ // AES key length in bytes
+ .set AESKEYLEN, %r10d
+ .set AESKEYLEN64, %r10
+
+ // Pointer to the last AES round key for the chosen AES variant
+ .set RNDKEYLAST_PTR, %r11
+
+ // BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
+ // using vpshufb, copied to all 128-bit lanes.
+ .set BSWAP_MASK, %ymm0
+ .set BSWAP_MASK_XMM, %xmm0
+
+ // GHASH_ACC is the accumulator variable for GHASH. When fully reduced,
+ // only the lowest 128-bit lane can be nonzero. When not fully reduced,
+ // more than one lane may be used, and they need to be XOR'd together.
+ .set GHASH_ACC, %ymm1
+ .set GHASH_ACC_XMM, %xmm1
+
+ // TMP[0-2] are temporary registers.
+ .set TMP0, %ymm2
+ .set TMP0_XMM, %xmm2
+ .set TMP1, %ymm3
+ .set TMP1_XMM, %xmm3
+ .set TMP2, %ymm4
+ .set TMP2_XMM, %xmm4
+
+ // LO and MI are used to accumulate unreduced GHASH products.
+ .set LO, %ymm5
+ .set LO_XMM, %xmm5
+ .set MI, %ymm6
+ .set MI_XMM, %xmm6
+
+ // H_POW[2-1]_XORED contain cached values from KEY->h_powers_xored. The
+ // descending numbering reflects the order of the key powers.
+ .set H_POW2_XORED, %ymm7
+ .set H_POW2_XORED_XMM, %xmm7
+ .set H_POW1_XORED, %ymm8
+
+ // RNDKEY0 caches the zero-th round key, and RNDKEYLAST the last one.
+ .set RNDKEY0, %ymm9
+ .set RNDKEYLAST, %ymm10
+
+ // LE_CTR contains the next set of little-endian counter blocks.
+ .set LE_CTR, %ymm11
+
+ // AESDATA[0-3] hold the counter blocks that are being encrypted by AES.
+ .set AESDATA0, %ymm12
+ .set AESDATA0_XMM, %xmm12
+ .set AESDATA1, %ymm13
+ .set AESDATA1_XMM, %xmm13
+ .set AESDATA2, %ymm14
+ .set AESDATA3, %ymm15
+
+.if \enc
+ .set GHASHDATA_PTR, DST
+.else
+ .set GHASHDATA_PTR, SRC
+.endif
+
+ vbroadcasti128 .Lbswap_mask(%rip), BSWAP_MASK
+
+ // Load the GHASH accumulator and the starting counter.
+ vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM
+ vbroadcasti128 (LE_CTR_PTR), LE_CTR
+
+ // Load the AES key length in bytes.
+ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
+
+ // Make RNDKEYLAST_PTR point to the last AES round key. This is the
+ // round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256
+ // respectively. Then load the zero-th and last round keys.
+ lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR
+ vbroadcasti128 (KEY), RNDKEY0
+ vbroadcasti128 (RNDKEYLAST_PTR), RNDKEYLAST
+
+ // Finish initializing LE_CTR by adding 1 to the second block.
+ vpaddd .Lctr_pattern(%rip), LE_CTR, LE_CTR
+
+ // If there are at least 128 bytes of data, then continue into the loop
+ // that processes 128 bytes of data at a time. Otherwise skip it.
+ add $-128, DATALEN // 128 is 4 bytes, -128 is 1 byte
+ jl .Lcrypt_loop_4x_done\@
+
+ vmovdqu OFFSETOF_H_POWERS_XORED(KEY), H_POW2_XORED
+ vmovdqu OFFSETOF_H_POWERS_XORED+32(KEY), H_POW1_XORED
+
+ // Main loop: en/decrypt and hash 4 vectors (128 bytes) at a time.
+
+.if \enc
+ // Encrypt the first 4 vectors of plaintext blocks.
+ _aesenc_loop 0,1,2,3
+ _aesenclast_and_xor 0,1,2,3
+ sub $-128, SRC // 128 is 4 bytes, -128 is 1 byte
+ add $-128, DATALEN
+ jl .Lghash_last_ciphertext_4x\@
+.endif
+
+.align 16
+.Lcrypt_loop_4x\@:
+
+ // Start the AES encryption of the counter blocks.
+ _ctr_begin 0,1,2,3
+ cmp $24, AESKEYLEN
+ jl 128f // AES-128?
+ je 192f // AES-192?
+ // AES-256
+ vbroadcasti128 -13*16(RNDKEYLAST_PTR), TMP0
+ _vaesenc TMP0, 0,1,2,3
+ vbroadcasti128 -12*16(RNDKEYLAST_PTR), TMP0
+ _vaesenc TMP0, 0,1,2,3
+192:
+ vbroadcasti128 -11*16(RNDKEYLAST_PTR), TMP0
+ _vaesenc TMP0, 0,1,2,3
+ vbroadcasti128 -10*16(RNDKEYLAST_PTR), TMP0
+ _vaesenc TMP0, 0,1,2,3
+128:
+
+ // Finish the AES encryption of the counter blocks in AESDATA[0-3],
+ // interleaved with the GHASH update of the ciphertext blocks.
+.irp i, 9,8,7,6,5,4,3,2,1
+ _ghash_step_4x (9 - \i), GHASHDATA_PTR
+ vbroadcasti128 -\i*16(RNDKEYLAST_PTR), TMP0
+ _vaesenc TMP0, 0,1,2,3
+.endr
+ _ghash_step_4x 9, GHASHDATA_PTR
+.if \enc
+ sub $-128, DST // 128 is 4 bytes, -128 is 1 byte
+.endif
+ _aesenclast_and_xor 0,1,2,3
+ sub $-128, SRC
+.if !\enc
+ sub $-128, DST
+.endif
+ add $-128, DATALEN
+ jge .Lcrypt_loop_4x\@
+
+.if \enc
+.Lghash_last_ciphertext_4x\@:
+ // Update GHASH with the last set of ciphertext blocks.
+ _ghash_4x DST
+ sub $-128, DST
+.endif
+
+.Lcrypt_loop_4x_done\@:
+
+ // Undo the extra subtraction by 128 and check whether data remains.
+ sub $-128, DATALEN // 128 is 4 bytes, -128 is 1 byte
+ jz .Ldone\@
+
+ // The data length isn't a multiple of 128 bytes. Process the remaining
+ // data of length 1 <= DATALEN < 128.
+ //
+ // Since there are enough key powers available for all remaining data,
+ // there is no need to do a GHASH reduction after each iteration.
+ // Instead, multiply each remaining block by its own key power, and only
+ // do a GHASH reduction at the very end.
+
+ // Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N
+ // is the number of blocks that remain.
+ .set POWERS_PTR, LE_CTR_PTR // LE_CTR_PTR is free to be reused.
+ .set POWERS_PTR32, LE_CTR_PTR32
+ mov DATALEN, %eax
+ neg %rax
+ and $~15, %rax // -round_up(DATALEN, 16)
+ lea OFFSETOFEND_H_POWERS(KEY,%rax), POWERS_PTR
+
+ // Start collecting the unreduced GHASH intermediate value LO, MI, HI.
+ .set HI, H_POW2_XORED // H_POW2_XORED is free to be reused.
+ .set HI_XMM, H_POW2_XORED_XMM
+ vpxor LO_XMM, LO_XMM, LO_XMM
+ vpxor MI_XMM, MI_XMM, MI_XMM
+ vpxor HI_XMM, HI_XMM, HI_XMM
+
+ // 1 <= DATALEN < 128. Generate 2 or 4 more vectors of keystream blocks
+ // excluding the last AES round, depending on the remaining DATALEN.
+ cmp $64, DATALEN
+ jg .Ltail_gen_4_keystream_vecs\@
+ _aesenc_loop 0,1
+ cmp $32, DATALEN
+ jge .Ltail_xor_and_ghash_full_vec_loop\@
+ jmp .Ltail_xor_and_ghash_partial_vec\@
+.Ltail_gen_4_keystream_vecs\@:
+ _aesenc_loop 0,1,2,3
+
+ // XOR the remaining data and accumulate the unreduced GHASH products
+ // for DATALEN >= 32, starting with one full 32-byte vector at a time.
+.Ltail_xor_and_ghash_full_vec_loop\@:
+.if \enc
+ _aesenclast_and_xor 0
+ vpshufb BSWAP_MASK, AESDATA0, AESDATA0
+.else
+ vmovdqu (SRC), TMP1
+ vpxor TMP1, RNDKEYLAST, TMP0
+ vaesenclast TMP0, AESDATA0, AESDATA0
+ vmovdqu AESDATA0, (DST)
+ vpshufb BSWAP_MASK, TMP1, AESDATA0
+.endif
+ // The ciphertext blocks (i.e. GHASH input data) are now in AESDATA0.
+ vpxor GHASH_ACC, AESDATA0, AESDATA0
+ vmovdqu (POWERS_PTR), TMP2
+ _ghash_mul_noreduce TMP2, AESDATA0, LO, MI, HI, TMP0
+ vmovdqa AESDATA1, AESDATA0
+ vmovdqa AESDATA2, AESDATA1
+ vmovdqa AESDATA3, AESDATA2
+ vpxor GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+ add $32, SRC
+ add $32, DST
+ add $32, POWERS_PTR
+ sub $32, DATALEN
+ cmp $32, DATALEN
+ jge .Ltail_xor_and_ghash_full_vec_loop\@
+ test DATALEN, DATALEN
+ jz .Ltail_ghash_reduce\@
+
+.Ltail_xor_and_ghash_partial_vec\@:
+ // XOR the remaining data and accumulate the unreduced GHASH products,
+ // for 1 <= DATALEN < 32.
+ vaesenclast RNDKEYLAST, AESDATA0, AESDATA0
+ cmp $16, DATALEN
+ jle .Ltail_xor_and_ghash_1to16bytes\@
+
+ // Handle 17 <= DATALEN < 32.
+
+ // Load a vpshufb mask that will right-shift by '32 - DATALEN' bytes
+ // (shifting in zeroes), then reflect all 16 bytes.
+ lea .Lrshift_and_bswap_table(%rip), %rax
+ vmovdqu -16(%rax, DATALEN64), TMP2_XMM
+
+ // Move the second keystream block to its own register and left-align it
+ vextracti128 $1, AESDATA0, AESDATA1_XMM
+ vpxor .Lfifteens(%rip), TMP2_XMM, TMP0_XMM
+ vpshufb TMP0_XMM, AESDATA1_XMM, AESDATA1_XMM
+
+ // Using overlapping loads and stores, XOR the source data with the
+ // keystream and write the destination data. Then prepare the GHASH
+ // input data: the full ciphertext block and the zero-padded partial
+ // ciphertext block, both byte-reflected, in AESDATA0.
+.if \enc
+ vpxor -16(SRC, DATALEN64), AESDATA1_XMM, AESDATA1_XMM
+ vpxor (SRC), AESDATA0_XMM, AESDATA0_XMM
+ vmovdqu AESDATA1_XMM, -16(DST, DATALEN64)
+ vmovdqu AESDATA0_XMM, (DST)
+ vpshufb TMP2_XMM, AESDATA1_XMM, AESDATA1_XMM
+ vpshufb BSWAP_MASK_XMM, AESDATA0_XMM, AESDATA0_XMM
+.else
+ vmovdqu -16(SRC, DATALEN64), TMP1_XMM
+ vmovdqu (SRC), TMP0_XMM
+ vpxor TMP1_XMM, AESDATA1_XMM, AESDATA1_XMM
+ vpxor TMP0_XMM, AESDATA0_XMM, AESDATA0_XMM
+ vmovdqu AESDATA1_XMM, -16(DST, DATALEN64)
+ vmovdqu AESDATA0_XMM, (DST)
+ vpshufb TMP2_XMM, TMP1_XMM, AESDATA1_XMM
+ vpshufb BSWAP_MASK_XMM, TMP0_XMM, AESDATA0_XMM
+.endif
+ vpxor GHASH_ACC_XMM, AESDATA0_XMM, AESDATA0_XMM
+ vinserti128 $1, AESDATA1_XMM, AESDATA0, AESDATA0
+ vmovdqu (POWERS_PTR), TMP2
+ jmp .Ltail_ghash_last_vec\@
+
+.Ltail_xor_and_ghash_1to16bytes\@:
+ // Handle 1 <= DATALEN <= 16. Carefully load and store the
+ // possibly-partial block, which we mustn't access out of bounds.
+ vmovdqu (POWERS_PTR), TMP2_XMM
+ mov SRC, KEY // Free up %rcx, assuming SRC == %rcx
+ mov DATALEN, %ecx
+ _load_partial_block KEY, TMP0_XMM, POWERS_PTR, POWERS_PTR32
+ vpxor TMP0_XMM, AESDATA0_XMM, AESDATA0_XMM
+ mov DATALEN, %ecx
+ _store_partial_block AESDATA0_XMM, DST, POWERS_PTR, POWERS_PTR32
+.if \enc
+ lea .Lselect_high_bytes_table(%rip), %rax
+ vpshufb BSWAP_MASK_XMM, AESDATA0_XMM, AESDATA0_XMM
+ vpand (%rax, DATALEN64), AESDATA0_XMM, AESDATA0_XMM
+.else
+ vpshufb BSWAP_MASK_XMM, TMP0_XMM, AESDATA0_XMM
+.endif
+ vpxor GHASH_ACC_XMM, AESDATA0_XMM, AESDATA0_XMM
+
+.Ltail_ghash_last_vec\@:
+ // Accumulate the unreduced GHASH products for the last 1-2 blocks. The
+ // GHASH input data is in AESDATA0. If only one block remains, then the
+ // second block in AESDATA0 is zero and does not affect the result.
+ _ghash_mul_noreduce TMP2, AESDATA0, LO, MI, HI, TMP0
+
+.Ltail_ghash_reduce\@:
+ // Finally, do the GHASH reduction.
+ vbroadcasti128 .Lgfpoly(%rip), TMP0
+ _ghash_reduce LO, MI, HI, TMP0, TMP1
+ vextracti128 $1, HI, GHASH_ACC_XMM
+ vpxor HI_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+
+.Ldone\@:
+ // Store the updated GHASH accumulator back to memory.
+ vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR)
+
+ vzeroupper
+ RET
+.endm
+
+// void aes_gcm_enc_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+// const u32 le_ctr[4], u8 ghash_acc[16],
+// u64 total_aadlen, u64 total_datalen);
+// bool aes_gcm_dec_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+// const u32 le_ctr[4], const u8 ghash_acc[16],
+// u64 total_aadlen, u64 total_datalen,
+// const u8 tag[16], int taglen);
+//
+// This macro generates one of the above two functions (with \enc selecting
+// which one). Both functions finish computing the GCM authentication tag by
+// updating GHASH with the lengths block and encrypting the GHASH accumulator.
+// |total_aadlen| and |total_datalen| must be the total length of the additional
+// authenticated data and the en/decrypted data in bytes, respectively.
+//
+// The encryption function then stores the full-length (16-byte) computed
+// authentication tag to |ghash_acc|. The decryption function instead loads the
+// expected authentication tag (the one that was transmitted) from the 16-byte
+// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
+// computed tag in constant time, and returns true if and only if they match.
+.macro _aes_gcm_final enc
+
+ // Function arguments
+ .set KEY, %rdi
+ .set LE_CTR_PTR, %rsi
+ .set GHASH_ACC_PTR, %rdx
+ .set TOTAL_AADLEN, %rcx
+ .set TOTAL_DATALEN, %r8
+ .set TAG, %r9
+ .set TAGLEN, %r10d // 7th argument (dec only); originally at 8(%rsp)
+ .set TAGLEN64, %r10
+
+ // Additional local variables.
+ // %rax and %xmm0-%xmm3 are used as temporary registers.
+ .set AESKEYLEN, %r11d
+ .set AESKEYLEN64, %r11
+ .set GFPOLY, %xmm4
+ .set BSWAP_MASK, %xmm5
+ .set LE_CTR, %xmm6
+ .set GHASH_ACC, %xmm7
+ .set H_POW1, %xmm8
+
+ // Load some constants.
+ vmovdqa .Lgfpoly(%rip), GFPOLY
+ vmovdqa .Lbswap_mask(%rip), BSWAP_MASK
+
+ // Load the AES key length in bytes.
+ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
+
+ // Set up a counter block with 1 in the low 32-bit word. This is the
+ // counter that produces the ciphertext needed to encrypt the auth tag.
+ // GFPOLY has 1 in the low word, so grab the 1 from there using a blend.
+ vpblendd $0xe, (LE_CTR_PTR), GFPOLY, LE_CTR // 0xe: words 1-3 from le_ctr, word 0 from GFPOLY
+
+ // Build the lengths block and XOR it with the GHASH accumulator.
+ // Although the lengths block is defined as the AAD length followed by
+ // the en/decrypted data length, both in big-endian byte order, a byte
+ // reflection of the full block is needed because of the way we compute
+ // GHASH (see _ghash_mul_step). By using little-endian values in the
+ // opposite order, we avoid having to reflect any bytes here.
+ vmovq TOTAL_DATALEN, %xmm0
+ vpinsrq $1, TOTAL_AADLEN, %xmm0, %xmm0
+ vpsllq $3, %xmm0, %xmm0 // Bytes to bits
+ vpxor (GHASH_ACC_PTR), %xmm0, GHASH_ACC
+
+ // Load the first hash key power (H^1), which is stored last.
+ vmovdqu OFFSETOFEND_H_POWERS-16(KEY), H_POW1
+
+ // Load TAGLEN if decrypting.
+.if !\enc
+ movl 8(%rsp), TAGLEN // 32-bit load also zero-extends into TAGLEN64
+.endif
+
+ // Make %rax point to the last AES round key for the chosen AES variant.
+ lea 6*16(KEY,AESKEYLEN64,4), %rax // KEY + 16*{10,12,14} for 16/24/32-byte keys
+
+ // Start the AES encryption of the counter block by swapping the counter
+ // block to big-endian and XOR-ing it with the zero-th AES round key.
+ vpshufb BSWAP_MASK, LE_CTR, %xmm0
+ vpxor (KEY), %xmm0, %xmm0
+
+ // Complete the AES encryption and multiply GHASH_ACC by H^1.
+ // Interleave the AES and GHASH instructions to improve performance.
+ cmp $24, AESKEYLEN
+ jl 128f // AES-128?
+ je 192f // AES-192?
+ // AES-256
+ vaesenc -13*16(%rax), %xmm0, %xmm0
+ vaesenc -12*16(%rax), %xmm0, %xmm0
+192:
+ vaesenc -11*16(%rax), %xmm0, %xmm0
+ vaesenc -10*16(%rax), %xmm0, %xmm0
+128:
+.irp i, 0,1,2,3,4,5,6,7,8
+ _ghash_mul_step \i, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+ %xmm1, %xmm2, %xmm3
+ vaesenc (\i-9)*16(%rax), %xmm0, %xmm0
+.endr
+ _ghash_mul_step 9, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+ %xmm1, %xmm2, %xmm3
+
+ // Undo the byte reflection of the GHASH accumulator.
+ vpshufb BSWAP_MASK, GHASH_ACC, GHASH_ACC
+
+ // Do the last AES round and XOR the resulting keystream block with the
+ // GHASH accumulator to produce the full computed authentication tag.
+ //
+ // Reduce latency by taking advantage of the property vaesenclast(key,
+ // a) ^ b == vaesenclast(key ^ b, a). I.e., XOR GHASH_ACC into the last
+ // round key, instead of XOR'ing the final AES output with GHASH_ACC.
+ //
+ // enc_final then returns the computed auth tag, while dec_final
+ // compares it with the transmitted one and returns a bool. To compare
+ // the tags, dec_final XORs them together and uses vptest to check
+ // whether the result is all-zeroes. This should be constant-time.
+ // dec_final applies the vaesenclast optimization to this additional
+ // value XOR'd too.
+.if \enc
+ vpxor (%rax), GHASH_ACC, %xmm1
+ vaesenclast %xmm1, %xmm0, GHASH_ACC
+ vmovdqu GHASH_ACC, (GHASH_ACC_PTR)
+.else
+ vpxor (TAG), GHASH_ACC, GHASH_ACC
+ vpxor (%rax), GHASH_ACC, GHASH_ACC
+ vaesenclast GHASH_ACC, %xmm0, %xmm0
+ lea .Lselect_high_bytes_table(%rip), %rax
+ vmovdqu (%rax, TAGLEN64), %xmm1
+ vpshufb BSWAP_MASK, %xmm1, %xmm1 // select low bytes, not high
+ xor %eax, %eax
+ vptest %xmm1, %xmm0 // ZF = 1 iff (mask & tag difference) == 0
+ sete %al // Return true iff the compared tag bytes matched
+.endif
+ // No need for vzeroupper here, since only xmm registers were used.
+ RET
+.endm
+
+SYM_FUNC_START(aes_gcm_enc_update_vaes_avx2)
+ _aes_gcm_update 1 // \enc = 1: generate the encryption variant
+SYM_FUNC_END(aes_gcm_enc_update_vaes_avx2)
+SYM_FUNC_START(aes_gcm_dec_update_vaes_avx2)
+ _aes_gcm_update 0 // \enc = 0: generate the decryption variant
+SYM_FUNC_END(aes_gcm_dec_update_vaes_avx2)
+
+SYM_FUNC_START(aes_gcm_enc_final_vaes_avx2)
+ _aes_gcm_final 1 // \enc = 1: store the computed tag to |ghash_acc|
+SYM_FUNC_END(aes_gcm_enc_final_vaes_avx2)
+SYM_FUNC_START(aes_gcm_dec_final_vaes_avx2)
+ _aes_gcm_final 0 // \enc = 0: compare against |tag| and return a bool
+SYM_FUNC_END(aes_gcm_dec_final_vaes_avx2)
diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-vaes-avx512.S
index 02ee11083d4f..06b71314d65c 100644
--- a/arch/x86/crypto/aes-gcm-avx10-x86_64.S
+++ b/arch/x86/crypto/aes-gcm-vaes-avx512.S
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
-// VAES and VPCLMULQDQ optimized AES-GCM for x86_64
+// AES-GCM implementation for x86_64 CPUs that support the following CPU
+// features: VAES && VPCLMULQDQ && AVX512BW && AVX512VL && BMI2
//
// Copyright 2024 Google LLC
//
@@ -45,41 +46,6 @@
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
-//
-//------------------------------------------------------------------------------
-//
-// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
-// support VAES (vector AES), VPCLMULQDQ (vector carryless multiplication), and
-// either AVX512 or AVX10. Some of the functions, notably the encryption and
-// decryption update functions which are the most performance-critical, are
-// provided in two variants generated from a macro: one using 256-bit vectors
-// (suffix: vaes_avx10_256) and one using 512-bit vectors (vaes_avx10_512). The
-// other, "shared" functions (vaes_avx10) use at most 256-bit vectors.
-//
-// The functions that use 512-bit vectors are intended for CPUs that support
-// 512-bit vectors *and* where using them doesn't cause significant
-// downclocking. They require the following CPU features:
-//
-// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/512)
-//
-// The other functions require the following CPU features:
-//
-// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/256)
-//
-// All functions use the "System V" ABI. The Windows ABI is not supported.
-//
-// Note that we use "avx10" in the names of the functions as a shorthand to
-// really mean "AVX10 or a certain set of AVX512 features". Due to Intel's
-// introduction of AVX512 and then its replacement by AVX10, there doesn't seem
-// to be a simple way to name things that makes sense on all CPUs.
-//
-// Note that the macros that support both 256-bit and 512-bit vectors could
-// fairly easily be changed to support 128-bit too. However, this would *not*
-// be sufficient to allow the code to run on CPUs without AVX512 or AVX10,
-// because the code heavily uses several features of these extensions other than
-// the vector length: the increase in the number of SIMD registers from 16 to
-// 32, masking support, and new instructions such as vpternlogd (which can do a
-// three-argument XOR). These features are very useful for AES-GCM.
#include <linux/linkage.h>
@@ -104,16 +70,14 @@
.Lgfpoly_and_internal_carrybit:
.octa 0xc2000000000000010000000000000001
- // The below constants are used for incrementing the counter blocks.
- // ctr_pattern points to the four 128-bit values [0, 1, 2, 3].
- // inc_2blocks and inc_4blocks point to the single 128-bit values 2 and
- // 4. Note that the same '2' is reused in ctr_pattern and inc_2blocks.
+ // Values needed to prepare the initial vector of counter blocks.
.Lctr_pattern:
.octa 0
.octa 1
-.Linc_2blocks:
.octa 2
.octa 3
+
+ // The number of AES blocks per vector, as a 128-bit value.
.Linc_4blocks:
.octa 4
@@ -130,29 +94,13 @@
// Offset to end of hash key powers array in the key struct.
//
// This is immediately followed by three zeroized padding blocks, which are
-// included so that partial vectors can be handled more easily. E.g. if VL=64
-// and two blocks remain, we load the 4 values [H^2, H^1, 0, 0]. The most
-// padding blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded.
+// included so that partial vectors can be handled more easily. E.g. if two
+// blocks remain, we load the 4 values [H^2, H^1, 0, 0]. The most padding
+// blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded.
#define OFFSETOFEND_H_POWERS (OFFSETOF_H_POWERS + (NUM_H_POWERS * 16))
.text
-// Set the vector length in bytes. This sets the VL variable and defines
-// register aliases V0-V31 that map to the ymm or zmm registers.
-.macro _set_veclen vl
- .set VL, \vl
-.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
- 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
-.if VL == 32
- .set V\i, %ymm\i
-.elseif VL == 64
- .set V\i, %zmm\i
-.else
- .error "Unsupported vector length"
-.endif
-.endr
-.endm
-
// The _ghash_mul_step macro does one step of GHASH multiplication of the
// 128-bit lanes of \a by the corresponding 128-bit lanes of \b and storing the
// reduced products in \dst. \t0, \t1, and \t2 are temporary registers of the
@@ -312,39 +260,44 @@
vpternlogd $0x96, \t0, \mi, \hi
.endm
-// void aes_gcm_precompute_##suffix(struct aes_gcm_key_avx10 *key);
-//
-// Given the expanded AES key |key->aes_key|, this function derives the GHASH
-// subkey and initializes |key->ghash_key_powers| with powers of it.
-//
-// The number of key powers initialized is NUM_H_POWERS, and they are stored in
-// the order H^NUM_H_POWERS to H^1. The zeroized padding blocks after the key
-// powers themselves are also initialized.
+// This is a specialized version of _ghash_mul that computes \a * \a, i.e. it
+// squares \a. It skips computing MI = (a_L * a_H) + (a_H * a_L) = 0. The reduced result is written to \dst; \t0 and \t1 are clobbered.
+.macro _ghash_square a, dst, gfpoly, t0, t1
+ vpclmulqdq $0x00, \a, \a, \t0 // LO = a_L * a_L
+ vpclmulqdq $0x11, \a, \a, \dst // HI = a_H * a_H
+ vpclmulqdq $0x01, \t0, \gfpoly, \t1 // LO_L*(x^63 + x^62 + x^57)
+ vpshufd $0x4e, \t0, \t0 // Swap halves of LO
+ vpxord \t0, \t1, \t1 // Fold LO into MI
+ vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57)
+ vpshufd $0x4e, \t1, \t1 // Swap halves of MI
+ vpternlogd $0x96, \t0, \t1, \dst // Fold MI into HI
+.endm
+
+// void aes_gcm_precompute_vaes_avx512(struct aes_gcm_key_vaes_avx512 *key);
//
-// This macro supports both VL=32 and VL=64. _set_veclen must have been invoked
-// with the desired length. In the VL=32 case, the function computes twice as
-// many key powers than are actually used by the VL=32 GCM update functions.
-// This is done to keep the key format the same regardless of vector length.
-.macro _aes_gcm_precompute
+// Given the expanded AES key |key->base.aes_key|, derive the GHASH subkey and
+// initialize |key->h_powers| and |key->padding|.
+SYM_FUNC_START(aes_gcm_precompute_vaes_avx512)
// Function arguments
.set KEY, %rdi
- // Additional local variables. V0-V2 and %rax are used as temporaries.
+ // Additional local variables.
+ // %zmm[0-2] and %rax are used as temporaries.
.set POWERS_PTR, %rsi
.set RNDKEYLAST_PTR, %rdx
- .set H_CUR, V3
+ .set H_CUR, %zmm3
.set H_CUR_YMM, %ymm3
.set H_CUR_XMM, %xmm3
- .set H_INC, V4
+ .set H_INC, %zmm4
.set H_INC_YMM, %ymm4
.set H_INC_XMM, %xmm4
- .set GFPOLY, V5
+ .set GFPOLY, %zmm5
.set GFPOLY_YMM, %ymm5
.set GFPOLY_XMM, %xmm5
// Get pointer to lowest set of key powers (located at end of array).
- lea OFFSETOFEND_H_POWERS-VL(KEY), POWERS_PTR
+ lea OFFSETOFEND_H_POWERS-64(KEY), POWERS_PTR
// Encrypt an all-zeroes block to get the raw hash subkey.
movl OFFSETOF_AESKEYLEN(KEY), %eax
@@ -363,8 +316,8 @@
// Zeroize the padding blocks.
vpxor %xmm0, %xmm0, %xmm0
- vmovdqu %ymm0, VL(POWERS_PTR)
- vmovdqu %xmm0, VL+2*16(POWERS_PTR)
+ vmovdqu %ymm0, 64(POWERS_PTR)
+ vmovdqu %xmm0, 64+2*16(POWERS_PTR)
// Finish preprocessing the first key power, H^1. Since this GHASH
// implementation operates directly on values with the backwards bit
@@ -397,54 +350,44 @@
// special needs to be done to make this happen, though: H^1 * H^1 would
// end up with two factors of x^-1, but the multiplication consumes one.
// So the product H^2 ends up with the desired one factor of x^-1.
- _ghash_mul H_CUR_XMM, H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, \
- %xmm0, %xmm1, %xmm2
+ _ghash_square H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, %xmm0, %xmm1
// Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2].
vinserti128 $1, H_CUR_XMM, H_INC_YMM, H_CUR_YMM
vinserti128 $1, H_INC_XMM, H_INC_YMM, H_INC_YMM
-.if VL == 64
// Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4].
_ghash_mul H_INC_YMM, H_CUR_YMM, H_INC_YMM, GFPOLY_YMM, \
%ymm0, %ymm1, %ymm2
vinserti64x4 $1, H_CUR_YMM, H_INC, H_CUR
vshufi64x2 $0, H_INC, H_INC, H_INC
-.endif
// Store the lowest set of key powers.
vmovdqu8 H_CUR, (POWERS_PTR)
- // Compute and store the remaining key powers. With VL=32, repeatedly
- // multiply [H^(i+1), H^i] by [H^2, H^2] to get [H^(i+3), H^(i+2)].
- // With VL=64, repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by
+ // Compute and store the remaining key powers.
+ // Repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by
// [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)].
- mov $(NUM_H_POWERS*16/VL) - 1, %eax
-.Lprecompute_next\@:
- sub $VL, POWERS_PTR
- _ghash_mul H_INC, H_CUR, H_CUR, GFPOLY, V0, V1, V2
+ mov $3, %eax
+.Lprecompute_next:
+ sub $64, POWERS_PTR
+ _ghash_mul H_INC, H_CUR, H_CUR, GFPOLY, %zmm0, %zmm1, %zmm2
vmovdqu8 H_CUR, (POWERS_PTR)
dec %eax
- jnz .Lprecompute_next\@
+ jnz .Lprecompute_next
vzeroupper // This is needed after using ymm or zmm registers.
RET
-.endm
+SYM_FUNC_END(aes_gcm_precompute_vaes_avx512)
// XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store
// the result in \dst_xmm. This implicitly zeroizes the other lanes of dst.
.macro _horizontal_xor src, src_xmm, dst_xmm, t0_xmm, t1_xmm, t2_xmm
vextracti32x4 $1, \src, \t0_xmm
-.if VL == 32
- vpxord \t0_xmm, \src_xmm, \dst_xmm
-.elseif VL == 64
vextracti32x4 $2, \src, \t1_xmm
vextracti32x4 $3, \src, \t2_xmm
vpxord \t0_xmm, \src_xmm, \dst_xmm
vpternlogd $0x96, \t1_xmm, \t2_xmm, \dst_xmm
-.else
- .error "Unsupported vector length"
-.endif
.endm
// Do one step of the GHASH update of the data blocks given in the vector
@@ -458,25 +401,21 @@
//
// The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) +
// H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the
-// operations are vectorized operations on vectors of 16-byte blocks. E.g.,
-// with VL=32 there are 2 blocks per vector and the vectorized terms correspond
-// to the following non-vectorized terms:
-//
-// H_POW4*(GHASHDATA0 + GHASH_ACC) => H^8*(blk0 + GHASH_ACC_XMM) and H^7*(blk1 + 0)
-// H_POW3*GHASHDATA1 => H^6*blk2 and H^5*blk3
-// H_POW2*GHASHDATA2 => H^4*blk4 and H^3*blk5
-// H_POW1*GHASHDATA3 => H^2*blk6 and H^1*blk7
+// operations are vectorized operations on 512-bit vectors of 128-bit blocks.
+// The vectorized terms correspond to the following non-vectorized terms:
//
-// With VL=64, we use 4 blocks/vector, H^16 through H^1, and blk0 through blk15.
+// H_POW4*(GHASHDATA0 + GHASH_ACC) => H^16*(blk0 + GHASH_ACC_XMM),
+// H^15*(blk1 + 0), H^14*(blk2 + 0), and H^13*(blk3 + 0)
+// H_POW3*GHASHDATA1 => H^12*blk4, H^11*blk5, H^10*blk6, and H^9*blk7
+// H_POW2*GHASHDATA2 => H^8*blk8, H^7*blk9, H^6*blk10, and H^5*blk11
+// H_POW1*GHASHDATA3 => H^4*blk12, H^3*blk13, H^2*blk14, and H^1*blk15
//
// More concretely, this code does:
// - Do vectorized "schoolbook" multiplications to compute the intermediate
// 256-bit product of each block and its corresponding hash key power.
-// There are 4*VL/16 of these intermediate products.
-// - Sum (XOR) the intermediate 256-bit products across vectors. This leaves
-// VL/16 256-bit intermediate values.
+// - Sum (XOR) the intermediate 256-bit products across vectors.
// - Do a vectorized reduction of these 256-bit intermediate values to
-// 128-bits each. This leaves VL/16 128-bit intermediate values.
+// 128-bits each.
// - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM.
//
// See _ghash_mul_step for the full explanation of the operations performed for
@@ -532,85 +471,224 @@
.endif
.endm
-// Do one non-last round of AES encryption on the counter blocks in V0-V3 using
-// the round key that has been broadcast to all 128-bit lanes of \round_key.
+// Update GHASH with four vectors of data blocks. See _ghash_step_4x for full
+// explanation. This expands all ten steps (0 through 9) back to back.
+.macro _ghash_4x
+.irp i, 0,1,2,3,4,5,6,7,8,9
+ _ghash_step_4x \i
+.endr
+.endm
+
+// void aes_gcm_aad_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+// u8 ghash_acc[16],
+// const u8 *aad, int aadlen);
+//
+// This function processes the AAD (Additional Authenticated Data) in GCM.
+// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
+// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all
+// zeroes. |aadlen| must be a multiple of 16, except on the last call where it
+// can be any length. The caller must do any buffering needed to ensure this.
+//
+// This handles large amounts of AAD efficiently, while also keeping overhead
+// low for small amounts which is the common case. TLS and IPsec use less than
+// one block of AAD, but (uncommonly) other use cases may use much more.
+SYM_FUNC_START(aes_gcm_aad_update_vaes_avx512)
+
+ // Function arguments
+ .set KEY, %rdi
+ .set GHASH_ACC_PTR, %rsi
+ .set AAD, %rdx
+ .set AADLEN, %ecx
+ .set AADLEN64, %rcx // Zero-extend AADLEN before using!
+
+ // Additional local variables.
+ // %rax and %k1 are used as temporary registers.
+ .set GHASHDATA0, %zmm0
+ .set GHASHDATA0_XMM, %xmm0
+ .set GHASHDATA1, %zmm1
+ .set GHASHDATA1_XMM, %xmm1
+ .set GHASHDATA2, %zmm2
+ .set GHASHDATA2_XMM, %xmm2
+ .set GHASHDATA3, %zmm3
+ .set BSWAP_MASK, %zmm4
+ .set BSWAP_MASK_XMM, %xmm4
+ .set GHASH_ACC, %zmm5
+ .set GHASH_ACC_XMM, %xmm5
+ .set H_POW4, %zmm6
+ .set H_POW3, %zmm7
+ .set H_POW2, %zmm8
+ .set H_POW1, %zmm9
+ .set H_POW1_XMM, %xmm9
+ .set GFPOLY, %zmm10
+ .set GFPOLY_XMM, %xmm10
+ .set GHASHTMP0, %zmm11
+ .set GHASHTMP1, %zmm12
+ .set GHASHTMP2, %zmm13
+
+ // Load the GHASH accumulator.
+ vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM
+
+ // Check for the common case of AADLEN <= 16, as well as AADLEN == 0.
+ cmp $16, AADLEN
+ jg .Laad_more_than_16bytes
+ test AADLEN, AADLEN
+ jz .Laad_done
+
+ // Fast path: update GHASH with 1 <= AADLEN <= 16 bytes of AAD.
+ vmovdqu .Lbswap_mask(%rip), BSWAP_MASK_XMM
+ vmovdqu .Lgfpoly(%rip), GFPOLY_XMM
+ mov $-1, %eax
+ bzhi AADLEN, %eax, %eax // %eax = (1 << AADLEN) - 1, one mask bit per AAD byte
+ kmovd %eax, %k1
+ vmovdqu8 (AAD), GHASHDATA0_XMM{%k1}{z} // Bytes past AADLEN are loaded as zero
+ vmovdqu OFFSETOFEND_H_POWERS-16(KEY), H_POW1_XMM
+ vpshufb BSWAP_MASK_XMM, GHASHDATA0_XMM, GHASHDATA0_XMM
+ vpxor GHASHDATA0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+ _ghash_mul H_POW1_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM, GFPOLY_XMM, \
+ GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM
+ jmp .Laad_done
+
+.Laad_more_than_16bytes:
+ vbroadcasti32x4 .Lbswap_mask(%rip), BSWAP_MASK
+ vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY
+
+ // If AADLEN >= 256, update GHASH with 256 bytes of AAD at a time.
+ sub $256, AADLEN // AADLEN now holds (bytes remaining) - 256
+ jl .Laad_loop_4x_done
+ vmovdqu8 OFFSETOFEND_H_POWERS-4*64(KEY), H_POW4
+ vmovdqu8 OFFSETOFEND_H_POWERS-3*64(KEY), H_POW3
+ vmovdqu8 OFFSETOFEND_H_POWERS-2*64(KEY), H_POW2
+ vmovdqu8 OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1
+.Laad_loop_4x:
+ vmovdqu8 0*64(AAD), GHASHDATA0
+ vmovdqu8 1*64(AAD), GHASHDATA1
+ vmovdqu8 2*64(AAD), GHASHDATA2
+ vmovdqu8 3*64(AAD), GHASHDATA3
+ _ghash_4x
+ add $256, AAD
+ sub $256, AADLEN
+ jge .Laad_loop_4x
+.Laad_loop_4x_done:
+
+ // If AADLEN >= 64, update GHASH with 64 bytes of AAD at a time.
+ add $192, AADLEN // AADLEN now holds (bytes remaining) - 64
+ jl .Laad_loop_1x_done
+ vmovdqu8 OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1
+.Laad_loop_1x:
+ vmovdqu8 (AAD), GHASHDATA0
+ vpshufb BSWAP_MASK, GHASHDATA0, GHASHDATA0
+ vpxord GHASHDATA0, GHASH_ACC, GHASH_ACC
+ _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+ GHASHDATA0, GHASHDATA1, GHASHDATA2
+ _horizontal_xor GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \
+ GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM
+ add $64, AAD
+ sub $64, AADLEN
+ jge .Laad_loop_1x
+.Laad_loop_1x_done:
+
+ // Update GHASH with the remaining 0 <= AADLEN < 64 bytes of AAD.
+ add $64, AADLEN // Undo the bias; the 32-bit write also zero-extends into AADLEN64
+ jz .Laad_done
+ mov $-1, %rax
+ bzhi AADLEN64, %rax, %rax // %rax = (1 << AADLEN) - 1, one mask bit per AAD byte
+ kmovq %rax, %k1
+ vmovdqu8 (AAD), GHASHDATA0{%k1}{z} // Bytes past AADLEN are loaded as zero
+ neg AADLEN64
+ and $~15, AADLEN64 // -round_up(AADLEN, 16)
+ vmovdqu8 OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1 // Last k key powers, then zero padding; k = blocks left
+ vpshufb BSWAP_MASK, GHASHDATA0, GHASHDATA0
+ vpxord GHASHDATA0, GHASH_ACC, GHASH_ACC
+ _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+ GHASHDATA0, GHASHDATA1, GHASHDATA2
+ _horizontal_xor GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \
+ GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM
+
+.Laad_done:
+ // Store the updated GHASH accumulator back to memory.
+ vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR)
+
+ vzeroupper // This is needed after using ymm or zmm registers.
+ RET
+SYM_FUNC_END(aes_gcm_aad_update_vaes_avx512)
+
+// Do one non-last round of AES encryption on the blocks in %zmm[0-3] using the
+// round key that has been broadcast to all 128-bit lanes of \round_key.
.macro _vaesenc_4x round_key
- vaesenc \round_key, V0, V0
- vaesenc \round_key, V1, V1
- vaesenc \round_key, V2, V2
- vaesenc \round_key, V3, V3
+ vaesenc \round_key, %zmm0, %zmm0
+ vaesenc \round_key, %zmm1, %zmm1
+ vaesenc \round_key, %zmm2, %zmm2
+ vaesenc \round_key, %zmm3, %zmm3
.endm
// Start the AES encryption of four vectors of counter blocks.
.macro _ctr_begin_4x
// Increment LE_CTR four times to generate four vectors of little-endian
- // counter blocks, swap each to big-endian, and store them in V0-V3.
- vpshufb BSWAP_MASK, LE_CTR, V0
+ // counter blocks, swap each to big-endian, and store them in %zmm[0-3].
+ vpshufb BSWAP_MASK, LE_CTR, %zmm0
vpaddd LE_CTR_INC, LE_CTR, LE_CTR
- vpshufb BSWAP_MASK, LE_CTR, V1
+ vpshufb BSWAP_MASK, LE_CTR, %zmm1
vpaddd LE_CTR_INC, LE_CTR, LE_CTR
- vpshufb BSWAP_MASK, LE_CTR, V2
+ vpshufb BSWAP_MASK, LE_CTR, %zmm2
vpaddd LE_CTR_INC, LE_CTR, LE_CTR
- vpshufb BSWAP_MASK, LE_CTR, V3
+ vpshufb BSWAP_MASK, LE_CTR, %zmm3
vpaddd LE_CTR_INC, LE_CTR, LE_CTR
// AES "round zero": XOR in the zero-th round key.
- vpxord RNDKEY0, V0, V0
- vpxord RNDKEY0, V1, V1
- vpxord RNDKEY0, V2, V2
- vpxord RNDKEY0, V3, V3
+ vpxord RNDKEY0, %zmm0, %zmm0
+ vpxord RNDKEY0, %zmm1, %zmm1
+ vpxord RNDKEY0, %zmm2, %zmm2
+ vpxord RNDKEY0, %zmm3, %zmm3
.endm
-// Do the last AES round for four vectors of counter blocks V0-V3, XOR source
-// data with the resulting keystream, and write the result to DST and
+// Do the last AES round for four vectors of counter blocks %zmm[0-3], XOR
+// source data with the resulting keystream, and write the result to DST and
// GHASHDATA[0-3]. (Implementation differs slightly, but has the same effect.)
.macro _aesenclast_and_xor_4x
// XOR the source data with the last round key, saving the result in
// GHASHDATA[0-3]. This reduces latency by taking advantage of the
// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
- vpxord 0*VL(SRC), RNDKEYLAST, GHASHDATA0
- vpxord 1*VL(SRC), RNDKEYLAST, GHASHDATA1
- vpxord 2*VL(SRC), RNDKEYLAST, GHASHDATA2
- vpxord 3*VL(SRC), RNDKEYLAST, GHASHDATA3
+ vpxord 0*64(SRC), RNDKEYLAST, GHASHDATA0
+ vpxord 1*64(SRC), RNDKEYLAST, GHASHDATA1
+ vpxord 2*64(SRC), RNDKEYLAST, GHASHDATA2
+ vpxord 3*64(SRC), RNDKEYLAST, GHASHDATA3
// Do the last AES round. This handles the XOR with the source data
// too, as per the optimization described above.
- vaesenclast GHASHDATA0, V0, GHASHDATA0
- vaesenclast GHASHDATA1, V1, GHASHDATA1
- vaesenclast GHASHDATA2, V2, GHASHDATA2
- vaesenclast GHASHDATA3, V3, GHASHDATA3
+ vaesenclast GHASHDATA0, %zmm0, GHASHDATA0
+ vaesenclast GHASHDATA1, %zmm1, GHASHDATA1
+ vaesenclast GHASHDATA2, %zmm2, GHASHDATA2
+ vaesenclast GHASHDATA3, %zmm3, GHASHDATA3
// Store the en/decrypted data to DST.
- vmovdqu8 GHASHDATA0, 0*VL(DST)
- vmovdqu8 GHASHDATA1, 1*VL(DST)
- vmovdqu8 GHASHDATA2, 2*VL(DST)
- vmovdqu8 GHASHDATA3, 3*VL(DST)
+ vmovdqu8 GHASHDATA0, 0*64(DST)
+ vmovdqu8 GHASHDATA1, 1*64(DST)
+ vmovdqu8 GHASHDATA2, 2*64(DST)
+ vmovdqu8 GHASHDATA3, 3*64(DST)
.endm
-// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key,
-// const u32 le_ctr[4], u8 ghash_acc[16],
-// const u8 *src, u8 *dst, int datalen);
+// void aes_gcm_{enc,dec}_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+// const u32 le_ctr[4], u8 ghash_acc[16],
+// const u8 *src, u8 *dst, int datalen);
//
// This macro generates a GCM encryption or decryption update function with the
-// above prototype (with \enc selecting which one). This macro supports both
-// VL=32 and VL=64. _set_veclen must have been invoked with the desired length.
-//
-// This function computes the next portion of the CTR keystream, XOR's it with
-// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted
-// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the
-// next |datalen| ciphertext bytes.
+// above prototype (with \enc selecting which one). The function computes the
+// next portion of the CTR keystream, XOR's it with |datalen| bytes from |src|,
+// and writes the resulting encrypted or decrypted data to |dst|. It also
+// updates the GHASH accumulator |ghash_acc| using the next |datalen| ciphertext
+// bytes.
//
// |datalen| must be a multiple of 16, except on the last call where it can be
// any length. The caller must do any buffering needed to ensure this. Both
// in-place and out-of-place en/decryption are supported.
//
-// |le_ctr| must give the current counter in little-endian format. For a new
-// message, the low word of the counter must be 2. This function loads the
-// counter from |le_ctr| and increments the loaded counter as needed, but it
-// does *not* store the updated counter back to |le_ctr|. The caller must
-// update |le_ctr| if any more data segments follow. Internally, only the low
-// 32-bit word of the counter is incremented, following the GCM standard.
+// |le_ctr| must give the current counter in little-endian format. This
+// function loads the counter from |le_ctr| and increments the loaded counter as
+// needed, but it does *not* store the updated counter back to |le_ctr|. The
+// caller must update |le_ctr| if any more data segments follow. Internally,
+// only the low 32-bit word of the counter is incremented, following the GCM
+// standard.
.macro _aes_gcm_update enc
// Function arguments
@@ -634,69 +712,69 @@
// Pointer to the last AES round key for the chosen AES variant
.set RNDKEYLAST_PTR, %r11
- // In the main loop, V0-V3 are used as AES input and output. Elsewhere
- // they are used as temporary registers.
+ // In the main loop, %zmm[0-3] are used as AES input and output.
+ // Elsewhere they are used as temporary registers.
// GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data.
- .set GHASHDATA0, V4
+ .set GHASHDATA0, %zmm4
.set GHASHDATA0_XMM, %xmm4
- .set GHASHDATA1, V5
+ .set GHASHDATA1, %zmm5
.set GHASHDATA1_XMM, %xmm5
- .set GHASHDATA2, V6
+ .set GHASHDATA2, %zmm6
.set GHASHDATA2_XMM, %xmm6
- .set GHASHDATA3, V7
+ .set GHASHDATA3, %zmm7
// BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
// using vpshufb, copied to all 128-bit lanes.
- .set BSWAP_MASK, V8
+ .set BSWAP_MASK, %zmm8
// RNDKEY temporarily holds the next AES round key.
- .set RNDKEY, V9
+ .set RNDKEY, %zmm9
// GHASH_ACC is the accumulator variable for GHASH. When fully reduced,
// only the lowest 128-bit lane can be nonzero. When not fully reduced,
// more than one lane may be used, and they need to be XOR'd together.
- .set GHASH_ACC, V10
+ .set GHASH_ACC, %zmm10
.set GHASH_ACC_XMM, %xmm10
// LE_CTR_INC is the vector of 32-bit words that need to be added to a
// vector of little-endian counter blocks to advance it forwards.
- .set LE_CTR_INC, V11
+ .set LE_CTR_INC, %zmm11
// LE_CTR contains the next set of little-endian counter blocks.
- .set LE_CTR, V12
+ .set LE_CTR, %zmm12
// RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys,
// copied to all 128-bit lanes. RNDKEY0 is the zero-th round key,
// RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last.
- .set RNDKEY0, V13
- .set RNDKEYLAST, V14
- .set RNDKEY_M9, V15
- .set RNDKEY_M8, V16
- .set RNDKEY_M7, V17
- .set RNDKEY_M6, V18
- .set RNDKEY_M5, V19
- .set RNDKEY_M4, V20
- .set RNDKEY_M3, V21
- .set RNDKEY_M2, V22
- .set RNDKEY_M1, V23
+ .set RNDKEY0, %zmm13
+ .set RNDKEYLAST, %zmm14
+ .set RNDKEY_M9, %zmm15
+ .set RNDKEY_M8, %zmm16
+ .set RNDKEY_M7, %zmm17
+ .set RNDKEY_M6, %zmm18
+ .set RNDKEY_M5, %zmm19
+ .set RNDKEY_M4, %zmm20
+ .set RNDKEY_M3, %zmm21
+ .set RNDKEY_M2, %zmm22
+ .set RNDKEY_M1, %zmm23
// GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These
// cannot coincide with anything used for AES encryption, since for
// performance reasons GHASH and AES encryption are interleaved.
- .set GHASHTMP0, V24
- .set GHASHTMP1, V25
- .set GHASHTMP2, V26
+ .set GHASHTMP0, %zmm24
+ .set GHASHTMP1, %zmm25
+ .set GHASHTMP2, %zmm26
- // H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1. The
+ // H_POW[4-1] contain the powers of the hash key H^16...H^1. The
// descending numbering reflects the order of the key powers.
- .set H_POW4, V27
- .set H_POW3, V28
- .set H_POW2, V29
- .set H_POW1, V30
+ .set H_POW4, %zmm27
+ .set H_POW3, %zmm28
+ .set H_POW2, %zmm29
+ .set H_POW1, %zmm30
// GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes.
- .set GFPOLY, V31
+ .set GFPOLY, %zmm31
// Load some constants.
vbroadcasti32x4 .Lbswap_mask(%rip), BSWAP_MASK
@@ -719,29 +797,23 @@
// Finish initializing LE_CTR by adding [0, 1, ...] to its low words.
vpaddd .Lctr_pattern(%rip), LE_CTR, LE_CTR
- // Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes.
-.if VL == 32
- vbroadcasti32x4 .Linc_2blocks(%rip), LE_CTR_INC
-.elseif VL == 64
+ // Load 4 into all 128-bit lanes of LE_CTR_INC.
vbroadcasti32x4 .Linc_4blocks(%rip), LE_CTR_INC
-.else
- .error "Unsupported vector length"
-.endif
- // If there are at least 4*VL bytes of data, then continue into the loop
- // that processes 4*VL bytes of data at a time. Otherwise skip it.
+ // If there are at least 256 bytes of data, then continue into the loop
+ // that processes 256 bytes of data at a time. Otherwise skip it.
//
- // Pre-subtracting 4*VL from DATALEN saves an instruction from the main
+ // Pre-subtracting 256 from DATALEN saves an instruction from the main
// loop and also ensures that at least one write always occurs to
// DATALEN, zero-extending it and allowing DATALEN64 to be used later.
- add $-4*VL, DATALEN // shorter than 'sub 4*VL' when VL=32
+ sub $256, DATALEN
jl .Lcrypt_loop_4x_done\@
// Load powers of the hash key.
- vmovdqu8 OFFSETOFEND_H_POWERS-4*VL(KEY), H_POW4
- vmovdqu8 OFFSETOFEND_H_POWERS-3*VL(KEY), H_POW3
- vmovdqu8 OFFSETOFEND_H_POWERS-2*VL(KEY), H_POW2
- vmovdqu8 OFFSETOFEND_H_POWERS-1*VL(KEY), H_POW1
+ vmovdqu8 OFFSETOFEND_H_POWERS-4*64(KEY), H_POW4
+ vmovdqu8 OFFSETOFEND_H_POWERS-3*64(KEY), H_POW3
+ vmovdqu8 OFFSETOFEND_H_POWERS-2*64(KEY), H_POW2
+ vmovdqu8 OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1
// Main loop: en/decrypt and hash 4 vectors at a time.
//
@@ -770,9 +842,9 @@
cmp %rax, RNDKEYLAST_PTR
jne 1b
_aesenclast_and_xor_4x
- sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32
- sub $-4*VL, DST
- add $-4*VL, DATALEN
+ add $256, SRC
+ add $256, DST
+ sub $256, DATALEN
jl .Lghash_last_ciphertext_4x\@
.endif
@@ -786,10 +858,10 @@
// If decrypting, load more ciphertext blocks into GHASHDATA[0-3]. If
// encrypting, GHASHDATA[0-3] already contain the previous ciphertext.
.if !\enc
- vmovdqu8 0*VL(SRC), GHASHDATA0
- vmovdqu8 1*VL(SRC), GHASHDATA1
- vmovdqu8 2*VL(SRC), GHASHDATA2
- vmovdqu8 3*VL(SRC), GHASHDATA3
+ vmovdqu8 0*64(SRC), GHASHDATA0
+ vmovdqu8 1*64(SRC), GHASHDATA1
+ vmovdqu8 2*64(SRC), GHASHDATA2
+ vmovdqu8 3*64(SRC), GHASHDATA3
.endif
// Start the AES encryption of the counter blocks.
@@ -809,44 +881,44 @@
_vaesenc_4x RNDKEY
128:
- // Finish the AES encryption of the counter blocks in V0-V3, interleaved
- // with the GHASH update of the ciphertext blocks in GHASHDATA[0-3].
+ // Finish the AES encryption of the counter blocks in %zmm[0-3],
+ // interleaved with the GHASH update of the ciphertext blocks in
+ // GHASHDATA[0-3].
.irp i, 9,8,7,6,5,4,3,2,1
_ghash_step_4x (9 - \i)
_vaesenc_4x RNDKEY_M\i
.endr
_ghash_step_4x 9
_aesenclast_and_xor_4x
- sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32
- sub $-4*VL, DST
- add $-4*VL, DATALEN
+ add $256, SRC
+ add $256, DST
+ sub $256, DATALEN
jge .Lcrypt_loop_4x\@
.if \enc
.Lghash_last_ciphertext_4x\@:
// Update GHASH with the last set of ciphertext blocks.
-.irp i, 0,1,2,3,4,5,6,7,8,9
- _ghash_step_4x \i
-.endr
+ _ghash_4x
.endif
.Lcrypt_loop_4x_done\@:
- // Undo the extra subtraction by 4*VL and check whether data remains.
- sub $-4*VL, DATALEN // shorter than 'add 4*VL' when VL=32
+ // Undo the extra subtraction by 256 and check whether data remains.
+ add $256, DATALEN
jz .Ldone\@
- // The data length isn't a multiple of 4*VL. Process the remaining data
- // of length 1 <= DATALEN < 4*VL, up to one vector (VL bytes) at a time.
- // Going one vector at a time may seem inefficient compared to having
- // separate code paths for each possible number of vectors remaining.
- // However, using a loop keeps the code size down, and it performs
- // surprising well; modern CPUs will start executing the next iteration
- // before the previous one finishes and also predict the number of loop
- // iterations. For a similar reason, we roll up the AES rounds.
+ // The data length isn't a multiple of 256 bytes. Process the remaining
+ // data of length 1 <= DATALEN < 256, up to one 64-byte vector at a
+ // time. Going one vector at a time may seem inefficient compared to
+ // having separate code paths for each possible number of vectors
+ // remaining. However, using a loop keeps the code size down, and it
+ // performs surprisingly well; modern CPUs will start executing the next
+ // iteration before the previous one finishes and also predict the
+ // number of loop iterations. For a similar reason, we roll up the AES
+ // rounds.
//
- // On the last iteration, the remaining length may be less than VL.
- // Handle this using masking.
+ // On the last iteration, the remaining length may be less than 64
+ // bytes. Handle this using masking.
//
// Since there are enough key powers available for all remaining data,
// there is no need to do a GHASH reduction after each iteration.
@@ -875,65 +947,60 @@
.Lcrypt_loop_1x\@:
// Select the appropriate mask for this iteration: all 1's if
- // DATALEN >= VL, otherwise DATALEN 1's. Do this branchlessly using the
+ // DATALEN >= 64, otherwise DATALEN 1's. Do this branchlessly using the
// bzhi instruction from BMI2. (This relies on DATALEN <= 255.)
-.if VL < 64
- mov $-1, %eax
- bzhi DATALEN, %eax, %eax
- kmovd %eax, %k1
-.else
mov $-1, %rax
bzhi DATALEN64, %rax, %rax
kmovq %rax, %k1
-.endif
// Encrypt a vector of counter blocks. This does not need to be masked.
- vpshufb BSWAP_MASK, LE_CTR, V0
+ vpshufb BSWAP_MASK, LE_CTR, %zmm0
vpaddd LE_CTR_INC, LE_CTR, LE_CTR
- vpxord RNDKEY0, V0, V0
+ vpxord RNDKEY0, %zmm0, %zmm0
lea 16(KEY), %rax
1:
vbroadcasti32x4 (%rax), RNDKEY
- vaesenc RNDKEY, V0, V0
+ vaesenc RNDKEY, %zmm0, %zmm0
add $16, %rax
cmp %rax, RNDKEYLAST_PTR
jne 1b
- vaesenclast RNDKEYLAST, V0, V0
+ vaesenclast RNDKEYLAST, %zmm0, %zmm0
// XOR the data with the appropriate number of keystream bytes.
- vmovdqu8 (SRC), V1{%k1}{z}
- vpxord V1, V0, V0
- vmovdqu8 V0, (DST){%k1}
+ vmovdqu8 (SRC), %zmm1{%k1}{z}
+ vpxord %zmm1, %zmm0, %zmm0
+ vmovdqu8 %zmm0, (DST){%k1}
// Update GHASH with the ciphertext block(s), without reducing.
//
- // In the case of DATALEN < VL, the ciphertext is zero-padded to VL.
- // (If decrypting, it's done by the above masked load. If encrypting,
- // it's done by the below masked register-to-register move.) Note that
- // if DATALEN <= VL - 16, there will be additional padding beyond the
- // padding of the last block specified by GHASH itself; i.e., there may
- // be whole block(s) that get processed by the GHASH multiplication and
- // reduction instructions but should not actually be included in the
+ // In the case of DATALEN < 64, the ciphertext is zero-padded to 64
+ // bytes. (If decrypting, it's done by the above masked load. If
+ // encrypting, it's done by the below masked register-to-register move.)
+ // Note that if DATALEN <= 48, there will be additional padding beyond
+ // the padding of the last block specified by GHASH itself; i.e., there
+ // may be whole block(s) that get processed by the GHASH multiplication
+ // and reduction instructions but should not actually be included in the
// GHASH. However, any such blocks are all-zeroes, and the values that
// they're multiplied with are also all-zeroes. Therefore they just add
// 0 * 0 = 0 to the final GHASH result, which makes no difference.
vmovdqu8 (POWERS_PTR), H_POW1
.if \enc
- vmovdqu8 V0, V1{%k1}{z}
+ vmovdqu8 %zmm0, %zmm1{%k1}{z}
.endif
- vpshufb BSWAP_MASK, V1, V0
- vpxord GHASH_ACC, V0, V0
- _ghash_mul_noreduce H_POW1, V0, LO, MI, HI, GHASHDATA3, V1, V2, V3
+ vpshufb BSWAP_MASK, %zmm1, %zmm0
+ vpxord GHASH_ACC, %zmm0, %zmm0
+ _ghash_mul_noreduce H_POW1, %zmm0, LO, MI, HI, \
+ GHASHDATA3, %zmm1, %zmm2, %zmm3
vpxor GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
- add $VL, POWERS_PTR
- add $VL, SRC
- add $VL, DST
- sub $VL, DATALEN
+ add $64, POWERS_PTR
+ add $64, SRC
+ add $64, DST
+ sub $64, DATALEN
jg .Lcrypt_loop_1x\@
// Finally, do the GHASH reduction.
- _ghash_reduce LO, MI, HI, GFPOLY, V0
+ _ghash_reduce LO, MI, HI, GFPOLY, %zmm0
_horizontal_xor HI, HI_XMM, GHASH_ACC_XMM, %xmm0, %xmm1, %xmm2
.Ldone\@:
@@ -944,14 +1011,14 @@
RET
.endm
-// void aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
-// const u32 le_ctr[4], u8 ghash_acc[16],
-// u64 total_aadlen, u64 total_datalen);
-// bool aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
-// const u32 le_ctr[4],
-// const u8 ghash_acc[16],
-// u64 total_aadlen, u64 total_datalen,
-// const u8 tag[16], int taglen);
+// void aes_gcm_enc_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+// const u32 le_ctr[4], u8 ghash_acc[16],
+// u64 total_aadlen, u64 total_datalen);
+// bool aes_gcm_dec_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+// const u32 le_ctr[4],
+// const u8 ghash_acc[16],
+// u64 total_aadlen, u64 total_datalen,
+// const u8 tag[16], int taglen);
//
// This macro generates one of the above two functions (with \enc selecting
// which one). Both functions finish computing the GCM authentication tag by
@@ -1081,119 +1148,16 @@
RET
.endm
-_set_veclen 32
-SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_256)
- _aes_gcm_precompute
-SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_256)
-SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_256)
- _aes_gcm_update 1
-SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_256)
-SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_256)
- _aes_gcm_update 0
-SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_256)
-
-_set_veclen 64
-SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_512)
- _aes_gcm_precompute
-SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_512)
-SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_512)
+SYM_FUNC_START(aes_gcm_enc_update_vaes_avx512)
_aes_gcm_update 1
-SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_512)
-SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_512)
+SYM_FUNC_END(aes_gcm_enc_update_vaes_avx512)
+SYM_FUNC_START(aes_gcm_dec_update_vaes_avx512)
_aes_gcm_update 0
-SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_512)
-
-// void aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key,
-// u8 ghash_acc[16],
-// const u8 *aad, int aadlen);
-//
-// This function processes the AAD (Additional Authenticated Data) in GCM.
-// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
-// data given by |aad| and |aadlen|. |key->ghash_key_powers| must have been
-// initialized. On the first call, |ghash_acc| must be all zeroes. |aadlen|
-// must be a multiple of 16, except on the last call where it can be any length.
-// The caller must do any buffering needed to ensure this.
-//
-// AES-GCM is almost always used with small amounts of AAD, less than 32 bytes.
-// Therefore, for AAD processing we currently only provide this implementation
-// which uses 256-bit vectors (ymm registers) and only has a 1x-wide loop. This
-// keeps the code size down, and it enables some micro-optimizations, e.g. using
-// VEX-coded instructions instead of EVEX-coded to save some instruction bytes.
-// To optimize for large amounts of AAD, we could implement a 4x-wide loop and
-// provide a version using 512-bit vectors, but that doesn't seem to be useful.
-SYM_FUNC_START(aes_gcm_aad_update_vaes_avx10)
-
- // Function arguments
- .set KEY, %rdi
- .set GHASH_ACC_PTR, %rsi
- .set AAD, %rdx
- .set AADLEN, %ecx
- .set AADLEN64, %rcx // Zero-extend AADLEN before using!
-
- // Additional local variables.
- // %rax, %ymm0-%ymm3, and %k1 are used as temporary registers.
- .set BSWAP_MASK, %ymm4
- .set GFPOLY, %ymm5
- .set GHASH_ACC, %ymm6
- .set GHASH_ACC_XMM, %xmm6
- .set H_POW1, %ymm7
-
- // Load some constants.
- vbroadcasti128 .Lbswap_mask(%rip), BSWAP_MASK
- vbroadcasti128 .Lgfpoly(%rip), GFPOLY
-
- // Load the GHASH accumulator.
- vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM
-
- // Update GHASH with 32 bytes of AAD at a time.
- //
- // Pre-subtracting 32 from AADLEN saves an instruction from the loop and
- // also ensures that at least one write always occurs to AADLEN,
- // zero-extending it and allowing AADLEN64 to be used later.
- sub $32, AADLEN
- jl .Laad_loop_1x_done
- vmovdqu8 OFFSETOFEND_H_POWERS-32(KEY), H_POW1 // [H^2, H^1]
-.Laad_loop_1x:
- vmovdqu (AAD), %ymm0
- vpshufb BSWAP_MASK, %ymm0, %ymm0
- vpxor %ymm0, GHASH_ACC, GHASH_ACC
- _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
- %ymm0, %ymm1, %ymm2
- vextracti128 $1, GHASH_ACC, %xmm0
- vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM
- add $32, AAD
- sub $32, AADLEN
- jge .Laad_loop_1x
-.Laad_loop_1x_done:
- add $32, AADLEN
- jz .Laad_done
-
- // Update GHASH with the remaining 1 <= AADLEN < 32 bytes of AAD.
- mov $-1, %eax
- bzhi AADLEN, %eax, %eax
- kmovd %eax, %k1
- vmovdqu8 (AAD), %ymm0{%k1}{z}
- neg AADLEN64
- and $~15, AADLEN64 // -round_up(AADLEN, 16)
- vmovdqu8 OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1
- vpshufb BSWAP_MASK, %ymm0, %ymm0
- vpxor %ymm0, GHASH_ACC, GHASH_ACC
- _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
- %ymm0, %ymm1, %ymm2
- vextracti128 $1, GHASH_ACC, %xmm0
- vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM
-
-.Laad_done:
- // Store the updated GHASH accumulator back to memory.
- vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR)
-
- vzeroupper // This is needed after using ymm or zmm registers.
- RET
-SYM_FUNC_END(aes_gcm_aad_update_vaes_avx10)
+SYM_FUNC_END(aes_gcm_dec_update_vaes_avx512)
-SYM_FUNC_START(aes_gcm_enc_final_vaes_avx10)
+SYM_FUNC_START(aes_gcm_enc_final_vaes_avx512)
_aes_gcm_final 1
-SYM_FUNC_END(aes_gcm_enc_final_vaes_avx10)
-SYM_FUNC_START(aes_gcm_dec_final_vaes_avx10)
+SYM_FUNC_END(aes_gcm_enc_final_vaes_avx512)
+SYM_FUNC_START(aes_gcm_dec_final_vaes_avx512)
_aes_gcm_final 0
-SYM_FUNC_END(aes_gcm_dec_final_vaes_avx10)
+SYM_FUNC_END(aes_gcm_dec_final_vaes_avx512)
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index d953ac470aae..bb6e2c47ffc6 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -874,8 +874,38 @@ struct aes_gcm_key_aesni {
#define AES_GCM_KEY_AESNI_SIZE \
(sizeof(struct aes_gcm_key_aesni) + (15 & ~(CRYPTO_MINALIGN - 1)))
-/* Key struct used by the VAES + AVX10 implementations of AES-GCM */
-struct aes_gcm_key_avx10 {
+/* Key struct used by the VAES + AVX2 implementation of AES-GCM */
+struct aes_gcm_key_vaes_avx2 {
+ /*
+ * Common part of the key. The assembly code prefers 16-byte alignment
+ * for the round keys; we get this by them being located at the start of
+ * the struct and the whole struct being 32-byte aligned.
+ */
+ struct aes_gcm_key base;
+
+ /*
+ * Powers of the hash key H^8 through H^1. These are 128-bit values.
+ * They all have an extra factor of x^-1 and are byte-reversed.
+ * The assembly code prefers 32-byte alignment for this.
+ */
+ u64 h_powers[8][2] __aligned(32);
+
+ /*
+ * Each entry in this array contains the two halves of an entry of
+ * h_powers XOR'd together, in the following order:
+ * H^8,H^6,H^7,H^5,H^4,H^2,H^3,H^1 i.e. indices 0,2,1,3,4,6,5,7.
+ * This is used for Karatsuba multiplication.
+ */
+ u64 h_powers_xored[8];
+};
+
+#define AES_GCM_KEY_VAES_AVX2(key) \
+ container_of((key), struct aes_gcm_key_vaes_avx2, base)
+#define AES_GCM_KEY_VAES_AVX2_SIZE \
+ (sizeof(struct aes_gcm_key_vaes_avx2) + (31 & ~(CRYPTO_MINALIGN - 1)))
+
+/* Key struct used by the VAES + AVX512 implementation of AES-GCM */
+struct aes_gcm_key_vaes_avx512 {
/*
* Common part of the key. The assembly code prefers 16-byte alignment
* for the round keys; we get this by them being located at the start of
@@ -895,10 +925,10 @@ struct aes_gcm_key_avx10 {
/* Three padding blocks required by the assembly code */
u64 padding[3][2];
};
-#define AES_GCM_KEY_AVX10(key) \
- container_of((key), struct aes_gcm_key_avx10, base)
-#define AES_GCM_KEY_AVX10_SIZE \
- (sizeof(struct aes_gcm_key_avx10) + (63 & ~(CRYPTO_MINALIGN - 1)))
+#define AES_GCM_KEY_VAES_AVX512(key) \
+ container_of((key), struct aes_gcm_key_vaes_avx512, base)
+#define AES_GCM_KEY_VAES_AVX512_SIZE \
+ (sizeof(struct aes_gcm_key_vaes_avx512) + (63 & ~(CRYPTO_MINALIGN - 1)))
/*
* These flags are passed to the AES-GCM helper functions to specify the
@@ -910,14 +940,16 @@ struct aes_gcm_key_avx10 {
#define FLAG_RFC4106 BIT(0)
#define FLAG_ENC BIT(1)
#define FLAG_AVX BIT(2)
-#define FLAG_AVX10_256 BIT(3)
-#define FLAG_AVX10_512 BIT(4)
+#define FLAG_VAES_AVX2 BIT(3)
+#define FLAG_VAES_AVX512 BIT(4)
static inline struct aes_gcm_key *
aes_gcm_key_get(struct crypto_aead *tfm, int flags)
{
- if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512))
+ if (flags & FLAG_VAES_AVX512)
return PTR_ALIGN(crypto_aead_ctx(tfm), 64);
+ else if (flags & FLAG_VAES_AVX2)
+ return PTR_ALIGN(crypto_aead_ctx(tfm), 32);
else
return PTR_ALIGN(crypto_aead_ctx(tfm), 16);
}
@@ -927,26 +959,16 @@ aes_gcm_precompute_aesni(struct aes_gcm_key_aesni *key);
asmlinkage void
aes_gcm_precompute_aesni_avx(struct aes_gcm_key_aesni *key);
asmlinkage void
-aes_gcm_precompute_vaes_avx10_256(struct aes_gcm_key_avx10 *key);
+aes_gcm_precompute_vaes_avx2(struct aes_gcm_key_vaes_avx2 *key);
asmlinkage void
-aes_gcm_precompute_vaes_avx10_512(struct aes_gcm_key_avx10 *key);
+aes_gcm_precompute_vaes_avx512(struct aes_gcm_key_vaes_avx512 *key);
static void aes_gcm_precompute(struct aes_gcm_key *key, int flags)
{
- /*
- * To make things a bit easier on the assembly side, the AVX10
- * implementations use the same key format. Therefore, a single
- * function using 256-bit vectors would suffice here. However, it's
- * straightforward to provide a 512-bit one because of how the assembly
- * code is structured, and it works nicely because the total size of the
- * key powers is a multiple of 512 bits. So we take advantage of that.
- *
- * A similar situation applies to the AES-NI implementations.
- */
- if (flags & FLAG_AVX10_512)
- aes_gcm_precompute_vaes_avx10_512(AES_GCM_KEY_AVX10(key));
- else if (flags & FLAG_AVX10_256)
- aes_gcm_precompute_vaes_avx10_256(AES_GCM_KEY_AVX10(key));
+ if (flags & FLAG_VAES_AVX512)
+ aes_gcm_precompute_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key));
+ else if (flags & FLAG_VAES_AVX2)
+ aes_gcm_precompute_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key));
else if (flags & FLAG_AVX)
aes_gcm_precompute_aesni_avx(AES_GCM_KEY_AESNI(key));
else
@@ -960,15 +982,21 @@ asmlinkage void
aes_gcm_aad_update_aesni_avx(const struct aes_gcm_key_aesni *key,
u8 ghash_acc[16], const u8 *aad, int aadlen);
asmlinkage void
-aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key,
- u8 ghash_acc[16], const u8 *aad, int aadlen);
+aes_gcm_aad_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+ u8 ghash_acc[16], const u8 *aad, int aadlen);
+asmlinkage void
+aes_gcm_aad_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+ u8 ghash_acc[16], const u8 *aad, int aadlen);
static void aes_gcm_aad_update(const struct aes_gcm_key *key, u8 ghash_acc[16],
const u8 *aad, int aadlen, int flags)
{
- if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512))
- aes_gcm_aad_update_vaes_avx10(AES_GCM_KEY_AVX10(key), ghash_acc,
- aad, aadlen);
+ if (flags & FLAG_VAES_AVX512)
+ aes_gcm_aad_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
+ ghash_acc, aad, aadlen);
+ else if (flags & FLAG_VAES_AVX2)
+ aes_gcm_aad_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
+ ghash_acc, aad, aadlen);
else if (flags & FLAG_AVX)
aes_gcm_aad_update_aesni_avx(AES_GCM_KEY_AESNI(key), ghash_acc,
aad, aadlen);
@@ -986,13 +1014,13 @@ aes_gcm_enc_update_aesni_avx(const struct aes_gcm_key_aesni *key,
const u32 le_ctr[4], u8 ghash_acc[16],
const u8 *src, u8 *dst, int datalen);
asmlinkage void
-aes_gcm_enc_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key,
- const u32 le_ctr[4], u8 ghash_acc[16],
- const u8 *src, u8 *dst, int datalen);
+aes_gcm_enc_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
asmlinkage void
-aes_gcm_enc_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key,
- const u32 le_ctr[4], u8 ghash_acc[16],
- const u8 *src, u8 *dst, int datalen);
+aes_gcm_enc_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
asmlinkage void
aes_gcm_dec_update_aesni(const struct aes_gcm_key_aesni *key,
@@ -1003,13 +1031,13 @@ aes_gcm_dec_update_aesni_avx(const struct aes_gcm_key_aesni *key,
const u32 le_ctr[4], u8 ghash_acc[16],
const u8 *src, u8 *dst, int datalen);
asmlinkage void
-aes_gcm_dec_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key,
- const u32 le_ctr[4], u8 ghash_acc[16],
- const u8 *src, u8 *dst, int datalen);
+aes_gcm_dec_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
asmlinkage void
-aes_gcm_dec_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key,
- const u32 le_ctr[4], u8 ghash_acc[16],
- const u8 *src, u8 *dst, int datalen);
+aes_gcm_dec_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
/* __always_inline to optimize out the branches based on @flags */
static __always_inline void
@@ -1018,14 +1046,14 @@ aes_gcm_update(const struct aes_gcm_key *key,
const u8 *src, u8 *dst, int datalen, int flags)
{
if (flags & FLAG_ENC) {
- if (flags & FLAG_AVX10_512)
- aes_gcm_enc_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key),
- le_ctr, ghash_acc,
- src, dst, datalen);
- else if (flags & FLAG_AVX10_256)
- aes_gcm_enc_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key),
- le_ctr, ghash_acc,
- src, dst, datalen);
+ if (flags & FLAG_VAES_AVX512)
+ aes_gcm_enc_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
+ else if (flags & FLAG_VAES_AVX2)
+ aes_gcm_enc_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
else if (flags & FLAG_AVX)
aes_gcm_enc_update_aesni_avx(AES_GCM_KEY_AESNI(key),
le_ctr, ghash_acc,
@@ -1034,14 +1062,14 @@ aes_gcm_update(const struct aes_gcm_key *key,
aes_gcm_enc_update_aesni(AES_GCM_KEY_AESNI(key), le_ctr,
ghash_acc, src, dst, datalen);
} else {
- if (flags & FLAG_AVX10_512)
- aes_gcm_dec_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key),
- le_ctr, ghash_acc,
- src, dst, datalen);
- else if (flags & FLAG_AVX10_256)
- aes_gcm_dec_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key),
- le_ctr, ghash_acc,
- src, dst, datalen);
+ if (flags & FLAG_VAES_AVX512)
+ aes_gcm_dec_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
+ else if (flags & FLAG_VAES_AVX2)
+ aes_gcm_dec_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
else if (flags & FLAG_AVX)
aes_gcm_dec_update_aesni_avx(AES_GCM_KEY_AESNI(key),
le_ctr, ghash_acc,
@@ -1062,9 +1090,13 @@ aes_gcm_enc_final_aesni_avx(const struct aes_gcm_key_aesni *key,
const u32 le_ctr[4], u8 ghash_acc[16],
u64 total_aadlen, u64 total_datalen);
asmlinkage void
-aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
- const u32 le_ctr[4], u8 ghash_acc[16],
- u64 total_aadlen, u64 total_datalen);
+aes_gcm_enc_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen);
+asmlinkage void
+aes_gcm_enc_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen);
/* __always_inline to optimize out the branches based on @flags */
static __always_inline void
@@ -1072,10 +1104,14 @@ aes_gcm_enc_final(const struct aes_gcm_key *key,
const u32 le_ctr[4], u8 ghash_acc[16],
u64 total_aadlen, u64 total_datalen, int flags)
{
- if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512))
- aes_gcm_enc_final_vaes_avx10(AES_GCM_KEY_AVX10(key),
- le_ctr, ghash_acc,
- total_aadlen, total_datalen);
+ if (flags & FLAG_VAES_AVX512)
+ aes_gcm_enc_final_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen);
+ else if (flags & FLAG_VAES_AVX2)
+ aes_gcm_enc_final_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen);
else if (flags & FLAG_AVX)
aes_gcm_enc_final_aesni_avx(AES_GCM_KEY_AESNI(key),
le_ctr, ghash_acc,
@@ -1097,10 +1133,15 @@ aes_gcm_dec_final_aesni_avx(const struct aes_gcm_key_aesni *key,
u64 total_aadlen, u64 total_datalen,
const u8 tag[16], int taglen);
asmlinkage bool __must_check
-aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
- const u32 le_ctr[4], const u8 ghash_acc[16],
- u64 total_aadlen, u64 total_datalen,
- const u8 tag[16], int taglen);
+aes_gcm_dec_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key,
+ const u32 le_ctr[4], const u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen,
+ const u8 tag[16], int taglen);
+asmlinkage bool __must_check
+aes_gcm_dec_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
+ const u32 le_ctr[4], const u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen,
+ const u8 tag[16], int taglen);
/* __always_inline to optimize out the branches based on @flags */
static __always_inline bool __must_check
@@ -1108,11 +1149,16 @@ aes_gcm_dec_final(const struct aes_gcm_key *key, const u32 le_ctr[4],
u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen,
u8 tag[16], int taglen, int flags)
{
- if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512))
- return aes_gcm_dec_final_vaes_avx10(AES_GCM_KEY_AVX10(key),
- le_ctr, ghash_acc,
- total_aadlen, total_datalen,
- tag, taglen);
+ if (flags & FLAG_VAES_AVX512)
+ return aes_gcm_dec_final_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen,
+ tag, taglen);
+ else if (flags & FLAG_VAES_AVX2)
+ return aes_gcm_dec_final_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen,
+ tag, taglen);
else if (flags & FLAG_AVX)
return aes_gcm_dec_final_aesni_avx(AES_GCM_KEY_AESNI(key),
le_ctr, ghash_acc,
@@ -1195,10 +1241,14 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key,
BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers) != 496);
BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers_xored) != 624);
BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_times_x64) != 688);
- BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_enc) != 0);
- BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_length) != 480);
- BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, h_powers) != 512);
- BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, padding) != 768);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, base.aes_key.key_enc) != 0);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, base.aes_key.key_length) != 480);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, h_powers) != 512);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, h_powers_xored) != 640);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, base.aes_key.key_enc) != 0);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, base.aes_key.key_length) != 480);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, h_powers) != 512);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, padding) != 768);
if (likely(crypto_simd_usable())) {
err = aes_check_keylen(keylen);
@@ -1231,8 +1281,9 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key,
gf128mul_lle(&h, (const be128 *)x_to_the_minus1);
/* Compute the needed key powers */
- if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) {
- struct aes_gcm_key_avx10 *k = AES_GCM_KEY_AVX10(key);
+ if (flags & FLAG_VAES_AVX512) {
+ struct aes_gcm_key_vaes_avx512 *k =
+ AES_GCM_KEY_VAES_AVX512(key);
for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) {
k->h_powers[i][0] = be64_to_cpu(h.b);
@@ -1240,6 +1291,22 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key,
gf128mul_lle(&h, &h1);
}
memset(k->padding, 0, sizeof(k->padding));
+ } else if (flags & FLAG_VAES_AVX2) {
+ struct aes_gcm_key_vaes_avx2 *k =
+ AES_GCM_KEY_VAES_AVX2(key);
+ static const u8 indices[8] = { 0, 2, 1, 3, 4, 6, 5, 7 };
+
+ for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) {
+ k->h_powers[i][0] = be64_to_cpu(h.b);
+ k->h_powers[i][1] = be64_to_cpu(h.a);
+ gf128mul_lle(&h, &h1);
+ }
+ for (i = 0; i < ARRAY_SIZE(k->h_powers_xored); i++) {
+ int j = indices[i];
+
+ k->h_powers_xored[i] = k->h_powers[j][0] ^
+ k->h_powers[j][1];
+ }
} else {
struct aes_gcm_key_aesni *k = AES_GCM_KEY_AESNI(key);
@@ -1508,15 +1575,15 @@ DEFINE_GCM_ALGS(aesni_avx, FLAG_AVX,
"generic-gcm-aesni-avx", "rfc4106-gcm-aesni-avx",
AES_GCM_KEY_AESNI_SIZE, 500);
-/* aes_gcm_algs_vaes_avx10_256 */
-DEFINE_GCM_ALGS(vaes_avx10_256, FLAG_AVX10_256,
- "generic-gcm-vaes-avx10_256", "rfc4106-gcm-vaes-avx10_256",
- AES_GCM_KEY_AVX10_SIZE, 700);
+/* aes_gcm_algs_vaes_avx2 */
+DEFINE_GCM_ALGS(vaes_avx2, FLAG_VAES_AVX2,
+ "generic-gcm-vaes-avx2", "rfc4106-gcm-vaes-avx2",
+ AES_GCM_KEY_VAES_AVX2_SIZE, 600);
-/* aes_gcm_algs_vaes_avx10_512 */
-DEFINE_GCM_ALGS(vaes_avx10_512, FLAG_AVX10_512,
- "generic-gcm-vaes-avx10_512", "rfc4106-gcm-vaes-avx10_512",
- AES_GCM_KEY_AVX10_SIZE, 800);
+/* aes_gcm_algs_vaes_avx512 */
+DEFINE_GCM_ALGS(vaes_avx512, FLAG_VAES_AVX512,
+ "generic-gcm-vaes-avx512", "rfc4106-gcm-vaes-avx512",
+ AES_GCM_KEY_VAES_AVX512_SIZE, 800);
static int __init register_avx_algs(void)
{
@@ -1548,6 +1615,10 @@ static int __init register_avx_algs(void)
ARRAY_SIZE(skcipher_algs_vaes_avx2));
if (err)
return err;
+ err = crypto_register_aeads(aes_gcm_algs_vaes_avx2,
+ ARRAY_SIZE(aes_gcm_algs_vaes_avx2));
+ if (err)
+ return err;
if (!boot_cpu_has(X86_FEATURE_AVX512BW) ||
!boot_cpu_has(X86_FEATURE_AVX512VL) ||
@@ -1556,26 +1627,21 @@ static int __init register_avx_algs(void)
XFEATURE_MASK_AVX512, NULL))
return 0;
- err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_256,
- ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256));
- if (err)
- return err;
-
if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) {
int i;
for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx512); i++)
skcipher_algs_vaes_avx512[i].base.cra_priority = 1;
- for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++)
- aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1;
+ for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx512); i++)
+ aes_gcm_algs_vaes_avx512[i].base.cra_priority = 1;
}
err = crypto_register_skciphers(skcipher_algs_vaes_avx512,
ARRAY_SIZE(skcipher_algs_vaes_avx512));
if (err)
return err;
- err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_512,
- ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512));
+ err = crypto_register_aeads(aes_gcm_algs_vaes_avx512,
+ ARRAY_SIZE(aes_gcm_algs_vaes_avx512));
if (err)
return err;
@@ -1595,8 +1661,8 @@ static void unregister_avx_algs(void)
unregister_aeads(aes_gcm_algs_aesni_avx);
unregister_skciphers(skcipher_algs_vaes_avx2);
unregister_skciphers(skcipher_algs_vaes_avx512);
- unregister_aeads(aes_gcm_algs_vaes_avx10_256);
- unregister_aeads(aes_gcm_algs_vaes_avx10_512);
+ unregister_aeads(aes_gcm_algs_vaes_avx2);
+ unregister_aeads(aes_gcm_algs_vaes_avx512);
}
#else /* CONFIG_X86_64 */
static struct aead_alg aes_gcm_algs_aesni[0];
diff --git a/arch/x86/crypto/polyval-clmulni_asm.S b/arch/x86/crypto/polyval-clmulni_asm.S
deleted file mode 100644
index a6ebe4e7dd2b..000000000000
--- a/arch/x86/crypto/polyval-clmulni_asm.S
+++ /dev/null
@@ -1,321 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright 2021 Google LLC
- */
-/*
- * This is an efficient implementation of POLYVAL using intel PCLMULQDQ-NI
- * instructions. It works on 8 blocks at a time, by precomputing the first 8
- * keys powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation
- * allows us to split finite field multiplication into two steps.
- *
- * In the first step, we consider h^i, m_i as normal polynomials of degree less
- * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
- * is simply polynomial multiplication.
- *
- * In the second step, we compute the reduction of p(x) modulo the finite field
- * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
- *
- * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
- * multiplication is finite field multiplication. The advantage is that the
- * two-step process only requires 1 finite field reduction for every 8
- * polynomial multiplications. Further parallelism is gained by interleaving the
- * multiplications and polynomial reductions.
- */
-
-#include <linux/linkage.h>
-#include <asm/frame.h>
-
-#define STRIDE_BLOCKS 8
-
-#define GSTAR %xmm7
-#define PL %xmm8
-#define PH %xmm9
-#define TMP_XMM %xmm11
-#define LO %xmm12
-#define HI %xmm13
-#define MI %xmm14
-#define SUM %xmm15
-
-#define KEY_POWERS %rdi
-#define MSG %rsi
-#define BLOCKS_LEFT %rdx
-#define ACCUMULATOR %rcx
-#define TMP %rax
-
-.section .rodata.cst16.gstar, "aM", @progbits, 16
-.align 16
-
-.Lgstar:
- .quad 0xc200000000000000, 0xc200000000000000
-
-.text
-
-/*
- * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length
- * count pointed to by MSG and KEY_POWERS.
- */
-.macro schoolbook1 count
- .set i, 0
- .rept (\count)
- schoolbook1_iteration i 0
- .set i, (i +1)
- .endr
-.endm
-
-/*
- * Computes the product of two 128-bit polynomials at the memory locations
- * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of
- * the 256-bit product into LO, MI, HI.
- *
- * Given:
- * X = [X_1 : X_0]
- * Y = [Y_1 : Y_0]
- *
- * We compute:
- * LO += X_0 * Y_0
- * MI += X_0 * Y_1 + X_1 * Y_0
- * HI += X_1 * Y_1
- *
- * Later, the 256-bit result can be extracted as:
- * [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
- * This step is done when computing the polynomial reduction for efficiency
- * reasons.
- *
- * If xor_sum == 1, then also XOR the value of SUM into m_0. This avoids an
- * extra multiplication of SUM and h^8.
- */
-.macro schoolbook1_iteration i xor_sum
- movups (16*\i)(MSG), %xmm0
- .if (\i == 0 && \xor_sum == 1)
- pxor SUM, %xmm0
- .endif
- vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2
- vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1
- vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3
- vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4
- vpxor %xmm2, MI, MI
- vpxor %xmm1, LO, LO
- vpxor %xmm4, HI, HI
- vpxor %xmm3, MI, MI
-.endm
-
-/*
- * Performs the same computation as schoolbook1_iteration, except we expect the
- * arguments to already be loaded into xmm0 and xmm1 and we set the result
- * registers LO, MI, and HI directly rather than XOR'ing into them.
- */
-.macro schoolbook1_noload
- vpclmulqdq $0x01, %xmm0, %xmm1, MI
- vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2
- vpclmulqdq $0x00, %xmm0, %xmm1, LO
- vpclmulqdq $0x11, %xmm0, %xmm1, HI
- vpxor %xmm2, MI, MI
-.endm
-
-/*
- * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
- * the result in PL, PH.
- * [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
- */
-.macro schoolbook2
- vpslldq $8, MI, PL
- vpsrldq $8, MI, PH
- pxor LO, PL
- pxor HI, PH
-.endm
-
-/*
- * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
- *
- * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
- * x^128 + x^127 + x^126 + x^121 + 1.
- *
- * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
- * product of two 128-bit polynomials in Montgomery form. We need to reduce it
- * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
- * of x^128, this product has two extra factors of x^128. To get it back into
- * Montgomery form, we need to remove one of these factors by dividing by x^128.
- *
- * To accomplish both of these goals, we add multiples of g(x) that cancel out
- * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
- * bits are zero, the polynomial division by x^128 can be done by right shifting.
- *
- * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
- * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can
- * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
- * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to
- * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
- * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.
- *
- * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
- * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
- * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
- * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
- * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
- *
- * So our final computation is:
- * T = T_1 : T_0 = g*(x) * P_0
- * V = V_1 : V_0 = g*(x) * (P_1 + T_0)
- * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
- *
- * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
- * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
- * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
- */
-.macro montgomery_reduction dest
- vpclmulqdq $0x00, PL, GSTAR, TMP_XMM # TMP_XMM = T_1 : T_0 = P_0 * g*(x)
- pshufd $0b01001110, TMP_XMM, TMP_XMM # TMP_XMM = T_0 : T_1
- pxor PL, TMP_XMM # TMP_XMM = P_1 + T_0 : P_0 + T_1
- pxor TMP_XMM, PH # PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
- pclmulqdq $0x11, GSTAR, TMP_XMM # TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)]
- vpxor TMP_XMM, PH, \dest
-.endm
-
-/*
- * Compute schoolbook multiplication for 8 blocks
- * m_0h^8 + ... + m_7h^1
- *
- * If reduce is set, also computes the montgomery reduction of the
- * previous full_stride call and XORs with the first message block.
- * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
- * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
- */
-.macro full_stride reduce
- pxor LO, LO
- pxor HI, HI
- pxor MI, MI
-
- schoolbook1_iteration 7 0
- .if \reduce
- vpclmulqdq $0x00, PL, GSTAR, TMP_XMM
- .endif
-
- schoolbook1_iteration 6 0
- .if \reduce
- pshufd $0b01001110, TMP_XMM, TMP_XMM
- .endif
-
- schoolbook1_iteration 5 0
- .if \reduce
- pxor PL, TMP_XMM
- .endif
-
- schoolbook1_iteration 4 0
- .if \reduce
- pxor TMP_XMM, PH
- .endif
-
- schoolbook1_iteration 3 0
- .if \reduce
- pclmulqdq $0x11, GSTAR, TMP_XMM
- .endif
-
- schoolbook1_iteration 2 0
- .if \reduce
- vpxor TMP_XMM, PH, SUM
- .endif
-
- schoolbook1_iteration 1 0
-
- schoolbook1_iteration 0 1
-
- addq $(8*16), MSG
- schoolbook2
-.endm
-
-/*
- * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS
- */
-.macro partial_stride
- mov BLOCKS_LEFT, TMP
- shlq $4, TMP
- addq $(16*STRIDE_BLOCKS), KEY_POWERS
- subq TMP, KEY_POWERS
-
- movups (MSG), %xmm0
- pxor SUM, %xmm0
- movaps (KEY_POWERS), %xmm1
- schoolbook1_noload
- dec BLOCKS_LEFT
- addq $16, MSG
- addq $16, KEY_POWERS
-
- test $4, BLOCKS_LEFT
- jz .Lpartial4BlocksDone
- schoolbook1 4
- addq $(4*16), MSG
- addq $(4*16), KEY_POWERS
-.Lpartial4BlocksDone:
- test $2, BLOCKS_LEFT
- jz .Lpartial2BlocksDone
- schoolbook1 2
- addq $(2*16), MSG
- addq $(2*16), KEY_POWERS
-.Lpartial2BlocksDone:
- test $1, BLOCKS_LEFT
- jz .LpartialDone
- schoolbook1 1
-.LpartialDone:
- schoolbook2
- montgomery_reduction SUM
-.endm
-
-/*
- * Perform montgomery multiplication in GF(2^128) and store result in op1.
- *
- * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1
- * If op1, op2 are in montgomery form, this computes the montgomery
- * form of op1*op2.
- *
- * void clmul_polyval_mul(u8 *op1, const u8 *op2);
- */
-SYM_FUNC_START(clmul_polyval_mul)
- FRAME_BEGIN
- vmovdqa .Lgstar(%rip), GSTAR
- movups (%rdi), %xmm0
- movups (%rsi), %xmm1
- schoolbook1_noload
- schoolbook2
- montgomery_reduction SUM
- movups SUM, (%rdi)
- FRAME_END
- RET
-SYM_FUNC_END(clmul_polyval_mul)
-
-/*
- * Perform polynomial evaluation as specified by POLYVAL. This computes:
- * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
- * where n=nblocks, h is the hash key, and m_i are the message blocks.
- *
- * rdi - pointer to precomputed key powers h^8 ... h^1
- * rsi - pointer to message blocks
- * rdx - number of blocks to hash
- * rcx - pointer to the accumulator
- *
- * void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
- * const u8 *in, size_t nblocks, u8 *accumulator);
- */
-SYM_FUNC_START(clmul_polyval_update)
- FRAME_BEGIN
- vmovdqa .Lgstar(%rip), GSTAR
- movups (ACCUMULATOR), SUM
- subq $STRIDE_BLOCKS, BLOCKS_LEFT
- js .LstrideLoopExit
- full_stride 0
- subq $STRIDE_BLOCKS, BLOCKS_LEFT
- js .LstrideLoopExitReduce
-.LstrideLoop:
- full_stride 1
- subq $STRIDE_BLOCKS, BLOCKS_LEFT
- jns .LstrideLoop
-.LstrideLoopExitReduce:
- montgomery_reduction SUM
-.LstrideLoopExit:
- add $STRIDE_BLOCKS, BLOCKS_LEFT
- jz .LskipPartial
- partial_stride
-.LskipPartial:
- movups SUM, (ACCUMULATOR)
- FRAME_END
- RET
-SYM_FUNC_END(clmul_polyval_update)
diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c
deleted file mode 100644
index 6b466867f91a..000000000000
--- a/arch/x86/crypto/polyval-clmulni_glue.c
+++ /dev/null
@@ -1,180 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Glue code for POLYVAL using PCMULQDQ-NI
- *
- * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi>
- * Copyright (c) 2009 Intel Corp.
- * Author: Huang Ying <ying.huang@intel.com>
- * Copyright 2021 Google LLC
- */
-
-/*
- * Glue code based on ghash-clmulni-intel_glue.c.
- *
- * This implementation of POLYVAL uses montgomery multiplication
- * accelerated by PCLMULQDQ-NI to implement the finite field
- * operations.
- */
-
-#include <asm/cpu_device_id.h>
-#include <asm/fpu/api.h>
-#include <crypto/internal/hash.h>
-#include <crypto/polyval.h>
-#include <crypto/utils.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#define POLYVAL_ALIGN 16
-#define POLYVAL_ALIGN_ATTR __aligned(POLYVAL_ALIGN)
-#define POLYVAL_ALIGN_EXTRA ((POLYVAL_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1))
-#define POLYVAL_CTX_SIZE (sizeof(struct polyval_tfm_ctx) + POLYVAL_ALIGN_EXTRA)
-#define NUM_KEY_POWERS 8
-
-struct polyval_tfm_ctx {
- /*
- * These powers must be in the order h^8, ..., h^1.
- */
- u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE] POLYVAL_ALIGN_ATTR;
-};
-
-struct polyval_desc_ctx {
- u8 buffer[POLYVAL_BLOCK_SIZE];
-};
-
-asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
- const u8 *in, size_t nblocks, u8 *accumulator);
-asmlinkage void clmul_polyval_mul(u8 *op1, const u8 *op2);
-
-static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm)
-{
- return PTR_ALIGN(crypto_shash_ctx(tfm), POLYVAL_ALIGN);
-}
-
-static void internal_polyval_update(const struct polyval_tfm_ctx *keys,
- const u8 *in, size_t nblocks, u8 *accumulator)
-{
- kernel_fpu_begin();
- clmul_polyval_update(keys, in, nblocks, accumulator);
- kernel_fpu_end();
-}
-
-static void internal_polyval_mul(u8 *op1, const u8 *op2)
-{
- kernel_fpu_begin();
- clmul_polyval_mul(op1, op2);
- kernel_fpu_end();
-}
-
-static int polyval_x86_setkey(struct crypto_shash *tfm,
- const u8 *key, unsigned int keylen)
-{
- struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(tfm);
- int i;
-
- if (keylen != POLYVAL_BLOCK_SIZE)
- return -EINVAL;
-
- memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE);
-
- for (i = NUM_KEY_POWERS-2; i >= 0; i--) {
- memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE);
- internal_polyval_mul(tctx->key_powers[i],
- tctx->key_powers[i+1]);
- }
-
- return 0;
-}
-
-static int polyval_x86_init(struct shash_desc *desc)
-{
- struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
-
- memset(dctx, 0, sizeof(*dctx));
-
- return 0;
-}
-
-static int polyval_x86_update(struct shash_desc *desc,
- const u8 *src, unsigned int srclen)
-{
- struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
- const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm);
- unsigned int nblocks;
-
- do {
- /* Allow rescheduling every 4K bytes. */
- nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE;
- internal_polyval_update(tctx, src, nblocks, dctx->buffer);
- srclen -= nblocks * POLYVAL_BLOCK_SIZE;
- src += nblocks * POLYVAL_BLOCK_SIZE;
- } while (srclen >= POLYVAL_BLOCK_SIZE);
-
- return srclen;
-}
-
-static int polyval_x86_finup(struct shash_desc *desc, const u8 *src,
- unsigned int len, u8 *dst)
-{
- struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
- const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm);
-
- if (len) {
- crypto_xor(dctx->buffer, src, len);
- internal_polyval_mul(dctx->buffer,
- tctx->key_powers[NUM_KEY_POWERS-1]);
- }
-
- memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE);
-
- return 0;
-}
-
-static struct shash_alg polyval_alg = {
- .digestsize = POLYVAL_DIGEST_SIZE,
- .init = polyval_x86_init,
- .update = polyval_x86_update,
- .finup = polyval_x86_finup,
- .setkey = polyval_x86_setkey,
- .descsize = sizeof(struct polyval_desc_ctx),
- .base = {
- .cra_name = "polyval",
- .cra_driver_name = "polyval-clmulni",
- .cra_priority = 200,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
- .cra_blocksize = POLYVAL_BLOCK_SIZE,
- .cra_ctxsize = POLYVAL_CTX_SIZE,
- .cra_module = THIS_MODULE,
- },
-};
-
-__maybe_unused static const struct x86_cpu_id pcmul_cpu_id[] = {
- X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL),
- {}
-};
-MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id);
-
-static int __init polyval_clmulni_mod_init(void)
-{
- if (!x86_match_cpu(pcmul_cpu_id))
- return -ENODEV;
-
- if (!boot_cpu_has(X86_FEATURE_AVX))
- return -ENODEV;
-
- return crypto_register_shash(&polyval_alg);
-}
-
-static void __exit polyval_clmulni_mod_exit(void)
-{
- crypto_unregister_shash(&polyval_alg);
-}
-
-module_init(polyval_clmulni_mod_init);
-module_exit(polyval_clmulni_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("POLYVAL hash function accelerated by PCLMULQDQ-NI");
-MODULE_ALIAS_CRYPTO("polyval");
-MODULE_ALIAS_CRYPTO("polyval-clmulni");
diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S
index 8e9a0cc20a4a..6ba2b3adcef0 100644
--- a/arch/x86/entry/entry.S
+++ b/arch/x86/entry/entry.S
@@ -4,6 +4,7 @@
*/
#include <linux/export.h>
+#include <linux/kvm_types.h>
#include <linux/linkage.h>
#include <linux/objtool.h>
#include <asm/msr-index.h>
@@ -29,8 +30,15 @@ SYM_FUNC_START(write_ibpb)
FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_BUG_IBPB_NO_RET
RET
SYM_FUNC_END(write_ibpb)
-/* For KVM */
-EXPORT_SYMBOL_GPL(write_ibpb);
+EXPORT_SYMBOL_FOR_KVM(write_ibpb);
+
+SYM_FUNC_START(__WARN_trap)
+ ANNOTATE_NOENDBR
+ ANNOTATE_REACHABLE
+ ud1 (%edx), %_ASM_ARG1
+ RET
+SYM_FUNC_END(__WARN_trap)
+EXPORT_SYMBOL(__WARN_trap)
.popsection
@@ -48,8 +56,7 @@ SYM_CODE_START_NOALIGN(x86_verw_sel)
.word __KERNEL_DS
.align L1_CACHE_BYTES, 0xcc
SYM_CODE_END(x86_verw_sel);
-/* For KVM */
-EXPORT_SYMBOL_GPL(x86_verw_sel);
+EXPORT_SYMBOL_FOR_KVM(x86_verw_sel);
.popsection
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index ed04a968cc7d..f9983a1907bf 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -19,6 +19,7 @@
* - idtentry: Define exception entry points.
*/
#include <linux/export.h>
+#include <linux/kvm_types.h>
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
@@ -1566,5 +1567,5 @@ SYM_FUNC_START(clear_bhb_loop)
pop %rbp
RET
SYM_FUNC_END(clear_bhb_loop)
-EXPORT_SYMBOL_GPL(clear_bhb_loop)
+EXPORT_SYMBOL_FOR_KVM(clear_bhb_loop)
STACK_FRAME_NON_STANDARD(clear_bhb_loop)
diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S
index fafbd3e68cb8..894f7f16eb80 100644
--- a/arch/x86/entry/entry_64_fred.S
+++ b/arch/x86/entry/entry_64_fred.S
@@ -4,6 +4,7 @@
*/
#include <linux/export.h>
+#include <linux/kvm_types.h>
#include <asm/asm.h>
#include <asm/fred.h>
@@ -146,5 +147,5 @@ SYM_FUNC_START(asm_fred_entry_from_kvm)
RET
SYM_FUNC_END(asm_fred_entry_from_kvm)
-EXPORT_SYMBOL_GPL(asm_fred_entry_from_kvm);
+EXPORT_SYMBOL_FOR_KVM(asm_fred_entry_from_kvm);
#endif
diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c
index f004a4dc74c2..94e626cc6a07 100644
--- a/arch/x86/entry/entry_fred.c
+++ b/arch/x86/entry/entry_fred.c
@@ -78,13 +78,13 @@ static noinstr void fred_intx(struct pt_regs *regs)
static __always_inline void fred_other(struct pt_regs *regs)
{
/* The compiler can fold these conditions into a single test */
- if (likely(regs->fred_ss.vector == FRED_SYSCALL && regs->fred_ss.lm)) {
+ if (likely(regs->fred_ss.vector == FRED_SYSCALL && regs->fred_ss.l)) {
regs->orig_ax = regs->ax;
regs->ax = -ENOSYS;
do_syscall_64(regs, regs->orig_ax);
return;
} else if (ia32_enabled() &&
- likely(regs->fred_ss.vector == FRED_SYSENTER && !regs->fred_ss.lm)) {
+ likely(regs->fred_ss.vector == FRED_SYSENTER && !regs->fred_ss.l)) {
regs->orig_ax = regs->ax;
regs->ax = -ENOSYS;
do_fast_syscall_32(regs);
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
index 2b15ea17bb7c..a67a644d0cfe 100644
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -274,9 +274,10 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
* fetch EBP before invoking any of the syscall entry work
* functions.
*/
- syscall_enter_from_user_mode_prepare(regs);
+ enter_from_user_mode(regs);
instrumentation_begin();
+ local_irq_enable();
/* Fetch EBP from where the vDSO stashed it. */
if (IS_ENABLED(CONFIG_X86_64)) {
/*
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 4877e16da69a..e979a3eac7a3 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -475,3 +475,4 @@
467 i386 open_tree_attr sys_open_tree_attr
468 i386 file_getattr sys_file_getattr
469 i386 file_setattr sys_file_setattr
+470 i386 listns sys_listns
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index ced2a1deecd7..8a4ac4841be6 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -394,6 +394,7 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
+470 common listns sys_listns
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index b20661b8621d..44656d2fb555 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -2,6 +2,7 @@
#include <linux/perf_event.h>
#include <linux/jump_label.h>
#include <linux/export.h>
+#include <linux/kvm_types.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/slab.h>
@@ -763,7 +764,12 @@ static void amd_pmu_enable_all(int added)
if (!test_bit(idx, cpuc->active_mask))
continue;
- amd_pmu_enable_event(cpuc->events[idx]);
+ /*
+ * FIXME: cpuc->events[idx] can become NULL in a subtle race
+ * condition with NMI->throttle->x86_pmu_stop().
+ */
+ if (cpuc->events[idx])
+ amd_pmu_enable_event(cpuc->events[idx]);
}
}
@@ -1569,7 +1575,7 @@ void amd_pmu_enable_virt(void)
/* Reload all events */
amd_pmu_reload_virt();
}
-EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
+EXPORT_SYMBOL_FOR_KVM(amd_pmu_enable_virt);
void amd_pmu_disable_virt(void)
{
@@ -1586,4 +1592,4 @@ void amd_pmu_disable_virt(void)
/* Reload all events */
amd_pmu_reload_virt();
}
-EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);
+EXPORT_SYMBOL_FOR_KVM(amd_pmu_disable_virt);
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 745caa6c15a3..0c38a31d5fc7 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -20,6 +20,7 @@
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kdebug.h>
+#include <linux/kvm_types.h>
#include <linux/sched/mm.h>
#include <linux/sched/clock.h>
#include <linux/uaccess.h>
@@ -554,14 +555,22 @@ static inline int precise_br_compat(struct perf_event *event)
return m == b;
}
-int x86_pmu_max_precise(void)
+int x86_pmu_max_precise(struct pmu *pmu)
{
int precise = 0;
- /* Support for constant skid */
if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
- precise++;
+ /* arch PEBS */
+ if (x86_pmu.arch_pebs) {
+ precise = 2;
+ if (hybrid(pmu, arch_pebs_cap).pdists)
+ precise++;
+
+ return precise;
+ }
+ /* legacy PEBS - support for constant skid */
+ precise++;
/* Support for IP fixup */
if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
precise++;
@@ -569,13 +578,14 @@ int x86_pmu_max_precise(void)
if (x86_pmu.pebs_prec_dist)
precise++;
}
+
return precise;
}
int x86_pmu_hw_config(struct perf_event *event)
{
if (event->attr.precise_ip) {
- int precise = x86_pmu_max_precise();
+ int precise = x86_pmu_max_precise(event->pmu);
if (event->attr.precise_ip > precise)
return -EOPNOTSUPP;
@@ -714,7 +724,7 @@ struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data)
{
return static_call(x86_pmu_guest_get_msrs)(nr, data);
}
-EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
+EXPORT_SYMBOL_FOR_KVM(perf_guest_get_msrs);
/*
* There may be PMI landing after enabled=0. The PMI hitting could be before or
@@ -1344,6 +1354,7 @@ static void x86_pmu_enable(struct pmu *pmu)
hwc->state |= PERF_HES_ARCH;
x86_pmu_stop(event, PERF_EF_UPDATE);
+ cpuc->events[hwc->idx] = NULL;
}
/*
@@ -1365,6 +1376,7 @@ static void x86_pmu_enable(struct pmu *pmu)
* if cpuc->enabled = 0, then no wrmsr as
* per x86_pmu_enable_event()
*/
+ cpuc->events[hwc->idx] = event;
x86_pmu_start(event, PERF_EF_RELOAD);
}
cpuc->n_added = 0;
@@ -1531,7 +1543,6 @@ static void x86_pmu_start(struct perf_event *event, int flags)
event->hw.state = 0;
- cpuc->events[idx] = event;
__set_bit(idx, cpuc->active_mask);
static_call(x86_pmu_enable)(event);
perf_event_update_userpage(event);
@@ -1610,7 +1621,6 @@ void x86_pmu_stop(struct perf_event *event, int flags)
if (test_bit(hwc->idx, cpuc->active_mask)) {
static_call(x86_pmu_disable)(event);
__clear_bit(hwc->idx, cpuc->active_mask);
- cpuc->events[hwc->idx] = NULL;
WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
hwc->state |= PERF_HES_STOPPED;
}
@@ -1648,6 +1658,7 @@ static void x86_pmu_del(struct perf_event *event, int flags)
* Not a TXN, therefore cleanup properly.
*/
x86_pmu_stop(event, PERF_EF_UPDATE);
+ cpuc->events[event->hw.idx] = NULL;
for (i = 0; i < cpuc->n_events; i++) {
if (event == cpuc->event_list[i])
@@ -2629,7 +2640,9 @@ static ssize_t max_precise_show(struct device *cdev,
struct device_attribute *attr,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise());
+ struct pmu *pmu = dev_get_drvdata(cdev);
+
+ return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise(pmu));
}
static DEVICE_ATTR_RO(max_precise);
@@ -2789,13 +2802,13 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
return;
}
- if (perf_callchain_store(entry, regs->ip))
- return;
-
- if (perf_hw_regs(regs))
+ if (perf_hw_regs(regs)) {
+ if (perf_callchain_store(entry, regs->ip))
+ return;
unwind_start(&state, current, regs, NULL);
- else
+ } else {
unwind_start(&state, current, NULL, (void *)regs->sp);
+ }
for (; !unwind_done(&state); unwind_next_frame(&state)) {
addr = unwind_get_return_address(&state);
@@ -2845,46 +2858,6 @@ static unsigned long get_segment_base(unsigned int segment)
return get_desc_base(desc);
}
-#ifdef CONFIG_UPROBES
-/*
- * Heuristic-based check if uprobe is installed at the function entry.
- *
- * Under assumption of user code being compiled with frame pointers,
- * `push %rbp/%ebp` is a good indicator that we indeed are.
- *
- * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern.
- * If we get this wrong, captured stack trace might have one extra bogus
- * entry, but the rest of stack trace will still be meaningful.
- */
-static bool is_uprobe_at_func_entry(struct pt_regs *regs)
-{
- struct arch_uprobe *auprobe;
-
- if (!current->utask)
- return false;
-
- auprobe = current->utask->auprobe;
- if (!auprobe)
- return false;
-
- /* push %rbp/%ebp */
- if (auprobe->insn[0] == 0x55)
- return true;
-
- /* endbr64 (64-bit only) */
- if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn))
- return true;
-
- return false;
-}
-
-#else
-static bool is_uprobe_at_func_entry(struct pt_regs *regs)
-{
- return false;
-}
-#endif /* CONFIG_UPROBES */
-
#ifdef CONFIG_IA32_EMULATION
#include <linux/compat.h>
@@ -3106,7 +3079,7 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
cap->events_mask_len = x86_pmu.events_mask_len;
cap->pebs_ept = x86_pmu.pebs_ept;
}
-EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
+EXPORT_SYMBOL_FOR_KVM(perf_get_x86_pmu_capability);
u64 perf_get_hw_event_config(int hw_event)
{
@@ -3117,4 +3090,4 @@ u64 perf_get_hw_event_config(int hw_event)
return 0;
}
-EXPORT_SYMBOL_GPL(perf_get_hw_event_config);
+EXPORT_SYMBOL_FOR_KVM(perf_get_hw_event_config);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 28f5468a6ea3..853fe073bab3 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2563,6 +2563,44 @@ static void intel_pmu_disable_fixed(struct perf_event *event)
cpuc->fixed_ctrl_val &= ~mask;
}
+static inline void __intel_pmu_update_event_ext(int idx, u64 ext)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u32 msr;
+
+ if (idx < INTEL_PMC_IDX_FIXED) {
+ msr = MSR_IA32_PMC_V6_GP0_CFG_C +
+ x86_pmu.addr_offset(idx, false);
+ } else {
+ msr = MSR_IA32_PMC_V6_FX0_CFG_C +
+ x86_pmu.addr_offset(idx - INTEL_PMC_IDX_FIXED, false);
+ }
+
+ cpuc->cfg_c_val[idx] = ext;
+ wrmsrq(msr, ext);
+}
+
+static void intel_pmu_disable_event_ext(struct perf_event *event)
+{
+ /*
+ * Only clear CFG_C MSR for PEBS counter group events,
+ * it avoids the HW counter's value to be added into
+ * other PEBS records incorrectly after PEBS counter
+ * group events are disabled.
+ *
+ * For other events, it's unnecessary to clear CFG_C MSRs
+ * since CFG_C doesn't take effect if counter is in
+ * disabled state. That helps to reduce the WRMSR overhead
+ * in context switches.
+ */
+ if (!is_pebs_counter_event_group(event))
+ return;
+
+ __intel_pmu_update_event_ext(event->hw.idx, 0);
+}
+
+DEFINE_STATIC_CALL_NULL(intel_pmu_disable_event_ext, intel_pmu_disable_event_ext);
+
static void intel_pmu_disable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
@@ -2571,9 +2609,12 @@ static void intel_pmu_disable_event(struct perf_event *event)
switch (idx) {
case 0 ... INTEL_PMC_IDX_FIXED - 1:
intel_clear_masks(event, idx);
+ static_call_cond(intel_pmu_disable_event_ext)(event);
x86_pmu_disable_event(event);
break;
case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1:
+ static_call_cond(intel_pmu_disable_event_ext)(event);
+ fallthrough;
case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END:
intel_pmu_disable_fixed(event);
break;
@@ -2940,6 +2981,79 @@ static void intel_pmu_enable_acr(struct perf_event *event)
DEFINE_STATIC_CALL_NULL(intel_pmu_enable_acr_event, intel_pmu_enable_acr);
+static void intel_pmu_enable_event_ext(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ union arch_pebs_index old, new;
+ struct arch_pebs_cap cap;
+ u64 ext = 0;
+
+ cap = hybrid(cpuc->pmu, arch_pebs_cap);
+
+ if (event->attr.precise_ip) {
+ u64 pebs_data_cfg = intel_get_arch_pebs_data_config(event);
+
+ ext |= ARCH_PEBS_EN;
+ if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD)
+ ext |= (-hwc->sample_period) & ARCH_PEBS_RELOAD;
+
+ if (pebs_data_cfg && cap.caps) {
+ if (pebs_data_cfg & PEBS_DATACFG_MEMINFO)
+ ext |= ARCH_PEBS_AUX & cap.caps;
+
+ if (pebs_data_cfg & PEBS_DATACFG_GP)
+ ext |= ARCH_PEBS_GPR & cap.caps;
+
+ if (pebs_data_cfg & PEBS_DATACFG_XMMS)
+ ext |= ARCH_PEBS_VECR_XMM & cap.caps;
+
+ if (pebs_data_cfg & PEBS_DATACFG_LBRS)
+ ext |= ARCH_PEBS_LBR & cap.caps;
+
+ if (pebs_data_cfg &
+ (PEBS_DATACFG_CNTR_MASK << PEBS_DATACFG_CNTR_SHIFT))
+ ext |= ARCH_PEBS_CNTR_GP & cap.caps;
+
+ if (pebs_data_cfg &
+ (PEBS_DATACFG_FIX_MASK << PEBS_DATACFG_FIX_SHIFT))
+ ext |= ARCH_PEBS_CNTR_FIXED & cap.caps;
+
+ if (pebs_data_cfg & PEBS_DATACFG_METRICS)
+ ext |= ARCH_PEBS_CNTR_METRICS & cap.caps;
+ }
+
+ if (cpuc->n_pebs == cpuc->n_large_pebs)
+ new.thresh = ARCH_PEBS_THRESH_MULTI;
+ else
+ new.thresh = ARCH_PEBS_THRESH_SINGLE;
+
+ rdmsrq(MSR_IA32_PEBS_INDEX, old.whole);
+ if (new.thresh != old.thresh || !old.en) {
+ if (old.thresh == ARCH_PEBS_THRESH_MULTI && old.wr > 0) {
+ /*
+ * Large PEBS was enabled.
+ * Drain PEBS buffer before applying the single PEBS.
+ */
+ intel_pmu_drain_pebs_buffer();
+ } else {
+ new.wr = 0;
+ new.full = 0;
+ new.en = 1;
+ wrmsrq(MSR_IA32_PEBS_INDEX, new.whole);
+ }
+ }
+ }
+
+ if (is_pebs_counter_event_group(event))
+ ext |= ARCH_PEBS_CNTR_ALLOW;
+
+ if (cpuc->cfg_c_val[hwc->idx] != ext)
+ __intel_pmu_update_event_ext(hwc->idx, ext);
+}
+
+DEFINE_STATIC_CALL_NULL(intel_pmu_enable_event_ext, intel_pmu_enable_event_ext);
+
static void intel_pmu_enable_event(struct perf_event *event)
{
u64 enable_mask = ARCH_PERFMON_EVENTSEL_ENABLE;
@@ -2955,10 +3069,12 @@ static void intel_pmu_enable_event(struct perf_event *event)
enable_mask |= ARCH_PERFMON_EVENTSEL_BR_CNTR;
intel_set_masks(event, idx);
static_call_cond(intel_pmu_enable_acr_event)(event);
+ static_call_cond(intel_pmu_enable_event_ext)(event);
__x86_pmu_enable_event(hwc, enable_mask);
break;
case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1:
static_call_cond(intel_pmu_enable_acr_event)(event);
+ static_call_cond(intel_pmu_enable_event_ext)(event);
fallthrough;
case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END:
intel_pmu_enable_fixed(event);
@@ -3216,6 +3332,19 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
}
/*
+ * Arch PEBS sets bit 54 in the global status register
+ */
+ if (__test_and_clear_bit(GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT,
+ (unsigned long *)&status)) {
+ handled++;
+ static_call(x86_pmu_drain_pebs)(regs, &data);
+
+ if (cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS] &&
+ is_pebs_counter_event_group(cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS]))
+ status &= ~GLOBAL_STATUS_PERF_METRICS_OVF_BIT;
+ }
+
+ /*
* Intel PT
*/
if (__test_and_clear_bit(GLOBAL_STATUS_TRACE_TOPAPMI_BIT, (unsigned long *)&status)) {
@@ -3269,7 +3398,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
* The PEBS buffer has to be drained before handling the A-PMI
*/
if (is_pebs_counter_event_group(event))
- x86_pmu.drain_pebs(regs, &data);
+ static_call(x86_pmu_drain_pebs)(regs, &data);
last_period = event->hw.last_period;
@@ -4029,7 +4158,9 @@ static unsigned long intel_pmu_large_pebs_flags(struct perf_event *event)
if (!event->attr.exclude_kernel)
flags &= ~PERF_SAMPLE_REGS_USER;
if (event->attr.sample_regs_user & ~PEBS_GP_REGS)
- flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR);
+ flags &= ~PERF_SAMPLE_REGS_USER;
+ if (event->attr.sample_regs_intr & ~PEBS_GP_REGS)
+ flags &= ~PERF_SAMPLE_REGS_INTR;
return flags;
}
@@ -4204,6 +4335,20 @@ static bool intel_pmu_is_acr_group(struct perf_event *event)
return false;
}
+/*
+ * Tell whether @pmu supports the PEBS counter group: either the legacy
+ * capability (pebs_format >= 6 together with pebs_baseline) or arch-PEBS
+ * with any ARCH_PEBS_CNTR_MASK bit set in its capabilities.
+ */
+static inline bool intel_pmu_has_pebs_counter_group(struct pmu *pmu)
+{
+	bool legacy_cntr_group = x86_pmu.intel_cap.pebs_format >= 6 &&
+				 x86_pmu.intel_cap.pebs_baseline;
+
+	if (legacy_cntr_group)
+		return true;
+
+	if (!x86_pmu.arch_pebs)
+		return false;
+
+	return !!(hybrid(pmu, arch_pebs_cap).caps & ARCH_PEBS_CNTR_MASK);
+}
+
+
static inline void intel_pmu_set_acr_cntr_constr(struct perf_event *event,
u64 *cause_mask, int *num)
{
@@ -4237,6 +4382,8 @@ static int intel_pmu_hw_config(struct perf_event *event)
}
if (event->attr.precise_ip) {
+ struct arch_pebs_cap pebs_cap = hybrid(event->pmu, arch_pebs_cap);
+
if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_FIXED_VLBR_EVENT)
return -EINVAL;
@@ -4250,6 +4397,15 @@ static int intel_pmu_hw_config(struct perf_event *event)
}
if (x86_pmu.pebs_aliases)
x86_pmu.pebs_aliases(event);
+
+ if (x86_pmu.arch_pebs) {
+ u64 cntr_mask = hybrid(event->pmu, intel_ctrl) &
+ ~GLOBAL_CTRL_EN_PERF_METRICS;
+ u64 pebs_mask = event->attr.precise_ip >= 3 ?
+ pebs_cap.pdists : pebs_cap.counters;
+ if (cntr_mask != pebs_mask)
+ event->hw.dyn_constraint &= pebs_mask;
+ }
}
if (needs_branch_stack(event)) {
@@ -4341,8 +4497,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
}
if ((event->attr.sample_type & PERF_SAMPLE_READ) &&
- (x86_pmu.intel_cap.pebs_format >= 6) &&
- x86_pmu.intel_cap.pebs_baseline &&
+ intel_pmu_has_pebs_counter_group(event->pmu) &&
is_sampling_event(event) &&
event->attr.precise_ip)
event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR;
@@ -5212,7 +5367,13 @@ err:
static int intel_pmu_cpu_prepare(int cpu)
{
- return intel_cpuc_prepare(&per_cpu(cpu_hw_events, cpu), cpu);
+ int ret;
+
+ ret = intel_cpuc_prepare(&per_cpu(cpu_hw_events, cpu), cpu);
+ if (ret)
+ return ret;
+
+ return alloc_arch_pebs_buf_on_cpu(cpu);
}
static void flip_smm_bit(void *data)
@@ -5257,6 +5418,163 @@ static void intel_pmu_check_event_constraints(struct event_constraint *event_con
u64 fixed_cntr_mask,
u64 intel_ctrl);
+/*
+ * Kinds of dynamic counter constraint walked by intel_pmu_check_dyn_constr().
+ * The values double as indices into dyn_constr_type_name[].
+ */
+enum dyn_constr_type {
+ DYN_CONSTR_NONE, /* a normal event: checked against the plain counter mask */
+ DYN_CONSTR_BR_CNTR, /* a branch counter logging event */
+ DYN_CONSTR_ACR_CNTR, /* an auto-counter reload event */
+ DYN_CONSTR_ACR_CAUSE, /* an auto-counter reload cause event */
+ DYN_CONSTR_PEBS, /* a PEBS event */
+ DYN_CONSTR_PDIST, /* a PEBS PDIST event */
+
+ DYN_CONSTR_MAX, /* number of types; loop bound, not a real type */
+};
+
+/*
+ * Description of each dyn_constr_type, spliced into the pr_info()
+ * diagnostics of __intel_pmu_check_dyn_constr() ("... as %s.").
+ */
+static const char * const dyn_constr_type_name[] = {
+ [DYN_CONSTR_NONE] = "a normal event",
+ [DYN_CONSTR_BR_CNTR] = "a branch counter logging event",
+ [DYN_CONSTR_ACR_CNTR] = "an auto-counter reload event",
+ [DYN_CONSTR_ACR_CAUSE] = "an auto-counter reload cause event",
+ [DYN_CONSTR_PEBS] = "a PEBS event",
+ [DYN_CONSTR_PDIST] = "a PEBS PDIST event",
+};
+
+/*
+ * Diagnose one constraint table against the counter mask of one dynamic
+ * constraint type. Nothing is modified; problems are only reported with
+ * pr_info().
+ *
+ * @constr: constraint table, walked with for_each_event_constraint()
+ * @type: constraint type; only used to pick the wording of the messages
+ * @mask: counters usable for @type; ANDed with each constraint's idxmsk64
+ *
+ * Two things are reported: a constraint that has no counter left once @mask
+ * is applied, and a pair of constraints whose masked counter sets overlap in
+ * a way that may defeat the scheduler (see the comment in the inner loop).
+ */
+static void __intel_pmu_check_dyn_constr(struct event_constraint *constr,
+ enum dyn_constr_type type, u64 mask)
+{
+ struct event_constraint *c1, *c2;
+ int new_weight, check_weight;
+ u64 new_mask, check_mask;
+
+ for_each_event_constraint(c1, constr) {
+ new_mask = c1->idxmsk64 & mask;
+ new_weight = hweight64(new_mask);
+
+ /* ignore topdown perf metrics event */
+ if (c1->idxmsk64 & INTEL_PMC_MSK_TOPDOWN)
+ continue;
+
+ /*
+ * No counter survives the mask, and the constraint's highest
+ * counter index lies below INTEL_PMC_IDX_FIXED.
+ */
+ if (!new_weight && fls64(c1->idxmsk64) < INTEL_PMC_IDX_FIXED) {
+ pr_info("The event 0x%llx is not supported as %s.\n",
+ c1->code, dyn_constr_type_name[type]);
+ }
+
+ /* With at most one usable counter there is no pair conflict to look for. */
+ if (new_weight <= 1)
+ continue;
+
+ /* Compare against every later entry so each pair is visited once. */
+ for_each_event_constraint(c2, c1 + 1) {
+ bool check_fail = false;
+
+ check_mask = c2->idxmsk64 & mask;
+ check_weight = hweight64(check_mask);
+
+ if (c2->idxmsk64 & INTEL_PMC_MSK_TOPDOWN ||
+ !check_weight)
+ continue;
+
+ /* The same constraints or no overlap */
+ if (new_mask == check_mask ||
+ (new_mask ^ check_mask) == (new_mask | check_mask))
+ continue;
+
+ /*
+ * A scheduler issue may be triggered in the following cases.
+ * - Two overlap constraints have the same weight.
+ * E.g., A constraints: 0x3, B constraints: 0x6
+ * event counter failure case
+ * B PMC[2:1] 1
+ * A PMC[1:0] 0
+ * A PMC[1:0] FAIL
+ * - Two overlap constraints have different weight.
+ * The constraint has a low weight, but has high last bit.
+ * E.g., A constraints: 0x7, B constraints: 0xC
+ * event counter failure case
+ * B PMC[3:2] 2
+ * A PMC[2:0] 0
+ * A PMC[2:0] 1
+ * A PMC[2:0] FAIL
+ */
+ if (new_weight == check_weight) {
+ check_fail = true;
+ } else if (new_weight < check_weight) {
+ /* c1 is the lighter one: flag it if it is not a subset and ends higher */
+ if ((new_mask | check_mask) != check_mask &&
+ fls64(new_mask) > fls64(check_mask))
+ check_fail = true;
+ } else {
+ /* c2 is the lighter one: same test with the roles swapped */
+ if ((new_mask | check_mask) != new_mask &&
+ fls64(new_mask) < fls64(check_mask))
+ check_fail = true;
+ }
+
+ if (check_fail) {
+ pr_info("The two events 0x%llx and 0x%llx may not be "
+ "fully scheduled under some circumstances as "
+ "%s.\n",
+ c1->code, c2->code, dyn_constr_type_name[type]);
+ }
+ }
+ }
+}
+
+/*
+ * Run __intel_pmu_check_dyn_constr() on @constr once per dyn_constr_type,
+ * each time with the counter mask that belongs to that type. A type whose
+ * mask ends up zero is not checked.
+ */
+static void intel_pmu_check_dyn_constr(struct pmu *pmu,
+				       struct event_constraint *constr,
+				       u64 cntr_mask)
+{
+	const u64 gp_only = GENMASK_ULL(INTEL_PMC_MAX_GENERIC - 1, 0);
+	enum dyn_constr_type type;
+
+	for (type = DYN_CONSTR_NONE; type < DYN_CONSTR_MAX; type++) {
+		u64 usable = 0;
+
+		switch (type) {
+		case DYN_CONSTR_NONE:
+			usable = cntr_mask;
+			break;
+		case DYN_CONSTR_BR_CNTR:
+			if (x86_pmu.flags & PMU_FL_BR_CNTR)
+				usable = x86_pmu.lbr_counters;
+			break;
+		case DYN_CONSTR_ACR_CNTR:
+			usable = hybrid(pmu, acr_cntr_mask64) & gp_only;
+			break;
+		case DYN_CONSTR_ACR_CAUSE:
+			/* Skipped when the cause mask equals the reload mask. */
+			if (hybrid(pmu, acr_cntr_mask64) != hybrid(pmu, acr_cause_mask64))
+				usable = hybrid(pmu, acr_cause_mask64) & gp_only;
+			break;
+		case DYN_CONSTR_PEBS:
+			if (x86_pmu.arch_pebs)
+				usable = hybrid(pmu, arch_pebs_cap).counters;
+			break;
+		case DYN_CONSTR_PDIST:
+			if (x86_pmu.arch_pebs)
+				usable = hybrid(pmu, arch_pebs_cap).pdists;
+			break;
+		default:
+			pr_warn("Unsupported dynamic constraint type %d\n", type);
+			break;
+		}
+
+		if (!usable)
+			continue;
+
+		__intel_pmu_check_dyn_constr(constr, type, usable);
+	}
+}
+
+/*
+ * Validate all constraint tables of @pmu: the static check on the event
+ * constraints first, then the dynamic-constraint check on both the event
+ * and the PEBS constraint tables (each only if the table exists).
+ */
+static void intel_pmu_check_event_constraints_all(struct pmu *pmu)
+{
+	struct event_constraint *ev_table = hybrid(pmu, event_constraints);
+	struct event_constraint *pebs_table = hybrid(pmu, pebs_constraints);
+	u64 gp_mask = hybrid(pmu, cntr_mask64);
+	u64 fixed_mask = hybrid(pmu, fixed_cntr_mask64);
+	u64 ctrl = hybrid(pmu, intel_ctrl);
+
+	intel_pmu_check_event_constraints(ev_table, gp_mask, fixed_mask, ctrl);
+
+	if (ev_table)
+		intel_pmu_check_dyn_constr(pmu, ev_table, gp_mask);
+	if (pebs_table)
+		intel_pmu_check_dyn_constr(pmu, pebs_table, gp_mask);
+}
+
+
static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs);
static inline bool intel_pmu_broken_perf_cap(void)
@@ -5269,34 +5587,89 @@ static inline bool intel_pmu_broken_perf_cap(void)
return false;
}
+/*
+ * Set PERF_PMU_CAP_EXTENDED_REGS on the PMU when its arch-PEBS capabilities
+ * include ARCH_PEBS_VECR_XMM. With a NULL @pmu the PMU of the current CPU is
+ * the one updated.
+ */
+static inline void __intel_update_pmu_caps(struct pmu *pmu)
+{
+	struct pmu *target = pmu;
+
+	if (!target)
+		target = x86_get_pmu(smp_processor_id());
+
+	if (!(hybrid(pmu, arch_pebs_cap).caps & ARCH_PEBS_VECR_XMM))
+		return;
+
+	target->capabilities |= PERF_PMU_CAP_EXTENDED_REGS;
+}
+
+
+/*
+ * Adjust x86_pmu.large_pebs_flags to the arch-PEBS capabilities of @pmu:
+ * PERF_SAMPLE_TIME is always allowed, branch stack and counter reads are
+ * allowed when the matching capability is present, and data source / GPR
+ * based sample types are withdrawn when their capability is missing.
+ */
+static inline void __intel_update_large_pebs_flags(struct pmu *pmu)
+{
+	u64 caps = hybrid(pmu, arch_pebs_cap).caps;
+	u64 grant = PERF_SAMPLE_TIME;
+	u64 revoke = 0;
+
+	if (caps & ARCH_PEBS_LBR)
+		grant |= PERF_SAMPLE_BRANCH_STACK;
+	if (caps & ARCH_PEBS_CNTR_MASK)
+		grant |= PERF_SAMPLE_READ;
+
+	if (!(caps & ARCH_PEBS_AUX))
+		revoke |= PERF_SAMPLE_DATA_SRC;
+	if (!(caps & ARCH_PEBS_GPR))
+		revoke |= PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER;
+
+	x86_pmu.large_pebs_flags |= grant;
+	x86_pmu.large_pebs_flags &= ~revoke;
+}
+
+
+/* Combine a GP counter bitmap with a fixed counter bitmap; the fixed bits start at INTEL_PMC_IDX_FIXED. */
+#define counter_mask(_gp, _fixed) ((_gp) | ((u64)(_fixed) << INTEL_PMC_IDX_FIXED))
+
static void update_pmu_cap(struct pmu *pmu)
{
- unsigned int cntr, fixed_cntr, ecx, edx;
- union cpuid35_eax eax;
- union cpuid35_ebx ebx;
+ unsigned int eax, ebx, ecx, edx;
+ union cpuid35_eax eax_0;
+ union cpuid35_ebx ebx_0;
+ u64 cntrs_mask = 0;
+ u64 pebs_mask = 0;
+ u64 pdists_mask = 0;
- cpuid(ARCH_PERFMON_EXT_LEAF, &eax.full, &ebx.full, &ecx, &edx);
+ cpuid(ARCH_PERFMON_EXT_LEAF, &eax_0.full, &ebx_0.full, &ecx, &edx);
- if (ebx.split.umask2)
+ if (ebx_0.split.umask2)
hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_UMASK2;
- if (ebx.split.eq)
+ if (ebx_0.split.eq)
hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_EQ;
- if (eax.split.cntr_subleaf) {
+ if (eax_0.split.cntr_subleaf) {
cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF,
- &cntr, &fixed_cntr, &ecx, &edx);
- hybrid(pmu, cntr_mask64) = cntr;
- hybrid(pmu, fixed_cntr_mask64) = fixed_cntr;
+ &eax, &ebx, &ecx, &edx);
+ hybrid(pmu, cntr_mask64) = eax;
+ hybrid(pmu, fixed_cntr_mask64) = ebx;
+ cntrs_mask = counter_mask(eax, ebx);
}
- if (eax.split.acr_subleaf) {
+ if (eax_0.split.acr_subleaf) {
cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_ACR_LEAF,
- &cntr, &fixed_cntr, &ecx, &edx);
+ &eax, &ebx, &ecx, &edx);
/* The mask of the counters which can be reloaded */
- hybrid(pmu, acr_cntr_mask64) = cntr | ((u64)fixed_cntr << INTEL_PMC_IDX_FIXED);
-
+ hybrid(pmu, acr_cntr_mask64) = counter_mask(eax, ebx);
/* The mask of the counters which can cause a reload of reloadable counters */
- hybrid(pmu, acr_cause_mask64) = ecx | ((u64)edx << INTEL_PMC_IDX_FIXED);
+ hybrid(pmu, acr_cause_mask64) = counter_mask(ecx, edx);
+ }
+
+ /* Bits[5:4] should be set simultaneously if arch-PEBS is supported */
+ if (eax_0.split.pebs_caps_subleaf && eax_0.split.pebs_cnts_subleaf) {
+ cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_PEBS_CAP_LEAF,
+ &eax, &ebx, &ecx, &edx);
+ hybrid(pmu, arch_pebs_cap).caps = (u64)ebx << 32;
+
+ cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_PEBS_COUNTER_LEAF,
+ &eax, &ebx, &ecx, &edx);
+ pebs_mask = counter_mask(eax, ecx);
+ pdists_mask = counter_mask(ebx, edx);
+ hybrid(pmu, arch_pebs_cap).counters = pebs_mask;
+ hybrid(pmu, arch_pebs_cap).pdists = pdists_mask;
+
+ if (WARN_ON((pebs_mask | pdists_mask) & ~cntrs_mask)) {
+ x86_pmu.arch_pebs = 0;
+ } else {
+ __intel_update_pmu_caps(pmu);
+ __intel_update_large_pebs_flags(pmu);
+ }
+ } else {
+ WARN_ON(x86_pmu.arch_pebs == 1);
+ x86_pmu.arch_pebs = 0;
}
if (!intel_pmu_broken_perf_cap()) {
@@ -5319,10 +5692,7 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu)
else
pmu->intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS;
- intel_pmu_check_event_constraints(pmu->event_constraints,
- pmu->cntr_mask64,
- pmu->fixed_cntr_mask64,
- pmu->intel_ctrl);
+ intel_pmu_check_event_constraints_all(&pmu->pmu);
intel_pmu_check_extra_regs(pmu->extra_regs);
}
@@ -5418,6 +5788,7 @@ static void intel_pmu_cpu_starting(int cpu)
return;
init_debug_store_on_cpu(cpu);
+ init_arch_pebs_on_cpu(cpu);
/*
* Deal with CPUs that don't clear their LBRs on power-up, and that may
* even boot with LBRs enabled.
@@ -5456,6 +5827,8 @@ static void intel_pmu_cpu_starting(int cpu)
}
}
+ __intel_update_pmu_caps(cpuc->pmu);
+
if (!cpuc->shared_regs)
return;
@@ -5515,6 +5888,7 @@ static void free_excl_cntrs(struct cpu_hw_events *cpuc)
static void intel_pmu_cpu_dying(int cpu)
{
fini_debug_store_on_cpu(cpu);
+ fini_arch_pebs_on_cpu(cpu);
}
void intel_cpuc_finish(struct cpu_hw_events *cpuc)
@@ -5535,6 +5909,7 @@ static void intel_pmu_cpu_dead(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ release_arch_pebs_buf_on_cpu(cpu);
intel_cpuc_finish(cpuc);
if (is_hybrid() && cpuc->pmu)
@@ -6250,7 +6625,7 @@ tsx_is_visible(struct kobject *kobj, struct attribute *attr, int i)
static umode_t
pebs_is_visible(struct kobject *kobj, struct attribute *attr, int i)
{
- return x86_pmu.ds_pebs ? attr->mode : 0;
+ return intel_pmu_has_pebs() ? attr->mode : 0;
}
static umode_t
@@ -6940,8 +7315,11 @@ __init int intel_pmu_init(void)
* Many features on and after V6 require dynamic constraint,
* e.g., Arch PEBS, ACR.
*/
- if (version >= 6)
+ if (version >= 6) {
x86_pmu.flags |= PMU_FL_DYN_CONSTRAINT;
+ x86_pmu.late_setup = intel_pmu_late_setup;
+ }
+
/*
* Install the hw-cache-events table:
*/
@@ -7596,6 +7974,7 @@ __init int intel_pmu_init(void)
break;
case INTEL_PANTHERLAKE_L:
+ case INTEL_WILDCATLAKE_L:
pr_cont("Pantherlake Hybrid events, ");
name = "pantherlake_hybrid";
goto lnl_common;
@@ -7726,6 +8105,14 @@ __init int intel_pmu_init(void)
if (!is_hybrid() && boot_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT))
update_pmu_cap(NULL);
+ if (x86_pmu.arch_pebs) {
+ static_call_update(intel_pmu_disable_event_ext,
+ intel_pmu_disable_event_ext);
+ static_call_update(intel_pmu_enable_event_ext,
+ intel_pmu_enable_event_ext);
+ pr_cont("Architectural PEBS, ");
+ }
+
intel_pmu_check_counters_mask(&x86_pmu.cntr_mask64,
&x86_pmu.fixed_cntr_mask64,
&x86_pmu.intel_ctrl);
@@ -7734,10 +8121,8 @@ __init int intel_pmu_init(void)
if (x86_pmu.intel_cap.anythread_deprecated)
x86_pmu.format_attrs = intel_arch_formats_attr;
- intel_pmu_check_event_constraints(x86_pmu.event_constraints,
- x86_pmu.cntr_mask64,
- x86_pmu.fixed_cntr_mask64,
- x86_pmu.intel_ctrl);
+ intel_pmu_check_event_constraints_all(NULL);
+
/*
* Access LBR MSR may cause #GP under certain circumstances.
* Check all LBR MSR here.
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index ec753e39b007..fa67fda6e45b 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -41,7 +41,7 @@
* MSR_CORE_C1_RES: CORE C1 Residency Counter
* perf code: 0x00
* Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL
- * MTL,SRF,GRR,ARL,LNL
+ * MTL,SRF,GRR,ARL,LNL,PTL
* Scope: Core (each processor core has a MSR)
* MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
* perf code: 0x01
@@ -53,31 +53,32 @@
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
* SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX,
* TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF,
- * GRR,ARL,LNL
+ * GRR,ARL,LNL,PTL
* Scope: Core
* MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
* perf code: 0x03
* Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML,
- * ICL,TGL,RKL,ADL,RPL,MTL,ARL,LNL
+ * ICL,TGL,RKL,ADL,RPL,MTL,ARL,LNL,
+ * PTL
* Scope: Core
* MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter.
* perf code: 0x00
* Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL,
* KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL,
- * RPL,SPR,MTL,ARL,LNL,SRF
+ * RPL,SPR,MTL,ARL,LNL,SRF,PTL
* Scope: Package (physical package)
* MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter.
* perf code: 0x01
* Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL,
* GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL,
- * ADL,RPL,MTL,ARL,LNL
+ * ADL,RPL,MTL,ARL
* Scope: Package (physical package)
* MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter.
* perf code: 0x02
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
* SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX,
* TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF,
- * ARL,LNL
+ * ARL,LNL,PTL
* Scope: Package (physical package)
* MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter.
* perf code: 0x03
@@ -96,7 +97,7 @@
* MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
* perf code: 0x06
* Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL,
- * TNT,RKL,ADL,RPL,MTL,ARL,LNL
+ * TNT,RKL,ADL,RPL,MTL,ARL,LNL,PTL
* Scope: Package (physical package)
* MSR_MODULE_C6_RES_MS: Module C6 Residency Counter.
* perf code: 0x00
@@ -522,7 +523,6 @@ static const struct cstate_model lnl_cstates __initconst = {
BIT(PERF_CSTATE_CORE_C7_RES),
.pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
- BIT(PERF_CSTATE_PKG_C3_RES) |
BIT(PERF_CSTATE_PKG_C6_RES) |
BIT(PERF_CSTATE_PKG_C10_RES),
};
@@ -628,6 +628,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &adl_cstates),
X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, &srf_cstates),
X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, &grr_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, &srf_cstates),
X86_MATCH_VFM(INTEL_ICELAKE_L, &icl_cstates),
X86_MATCH_VFM(INTEL_ICELAKE, &icl_cstates),
@@ -652,6 +653,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_MATCH_VFM(INTEL_ARROWLAKE_H, &adl_cstates),
X86_MATCH_VFM(INTEL_ARROWLAKE_U, &adl_cstates),
X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_cstates),
+ X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &lnl_cstates),
{ },
};
MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index c0b7ac1c7594..feb1c3cf63e4 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -317,7 +317,8 @@ static u64 __grt_latency_data(struct perf_event *event, u64 status,
{
u64 val;
- WARN_ON_ONCE(hybrid_pmu(event->pmu)->pmu_type == hybrid_big);
+ WARN_ON_ONCE(is_hybrid() &&
+ hybrid_pmu(event->pmu)->pmu_type == hybrid_big);
dse &= PERF_PEBS_DATA_SOURCE_GRT_MASK;
val = hybrid_var(event->pmu, pebs_data_source)[dse];
@@ -625,13 +626,18 @@ static int alloc_pebs_buffer(int cpu)
int max, node = cpu_to_node(cpu);
void *buffer, *insn_buff, *cea;
- if (!x86_pmu.ds_pebs)
+ if (!intel_pmu_has_pebs())
return 0;
buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
if (unlikely(!buffer))
return -ENOMEM;
+ if (x86_pmu.arch_pebs) {
+ hwev->pebs_vaddr = buffer;
+ return 0;
+ }
+
/*
* HSW+ already provides us the eventing ip; no need to allocate this
* buffer then.
@@ -644,7 +650,7 @@ static int alloc_pebs_buffer(int cpu)
}
per_cpu(insn_buffer, cpu) = insn_buff;
}
- hwev->ds_pebs_vaddr = buffer;
+ hwev->pebs_vaddr = buffer;
/* Update the cpu entry area mapping */
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
ds->pebs_buffer_base = (unsigned long) cea;
@@ -660,17 +666,20 @@ static void release_pebs_buffer(int cpu)
struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
void *cea;
- if (!x86_pmu.ds_pebs)
+ if (!intel_pmu_has_pebs())
return;
- kfree(per_cpu(insn_buffer, cpu));
- per_cpu(insn_buffer, cpu) = NULL;
+ if (x86_pmu.ds_pebs) {
+ kfree(per_cpu(insn_buffer, cpu));
+ per_cpu(insn_buffer, cpu) = NULL;
- /* Clear the fixmap */
- cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
- ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
- dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
- hwev->ds_pebs_vaddr = NULL;
+ /* Clear the fixmap */
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+ ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
+ }
+
+ dsfree_pages(hwev->pebs_vaddr, x86_pmu.pebs_buffer_size);
+ hwev->pebs_vaddr = NULL;
}
static int alloc_bts_buffer(int cpu)
@@ -823,6 +832,56 @@ void reserve_ds_buffers(void)
}
}
+/*
+ * Allocate the PEBS buffer of @cpu when arch-PEBS is in use.
+ * Returns 0 if arch-PEBS is off, otherwise the result of alloc_pebs_buffer().
+ */
+inline int alloc_arch_pebs_buf_on_cpu(int cpu)
+{
+	return x86_pmu.arch_pebs ? alloc_pebs_buffer(cpu) : 0;
+}
+
+
+/* Free the PEBS buffer of @cpu; a no-op unless arch-PEBS is in use. */
+inline void release_arch_pebs_buf_on_cpu(int cpu)
+{
+	if (x86_pmu.arch_pebs)
+		release_pebs_buffer(cpu);
+}
+
+
+/*
+ * Program MSR_IA32_PEBS_BASE on @cpu with that CPU's PEBS buffer and mark
+ * PEBS as active. Without a buffer, warn and mark PEBS inactive instead.
+ * Does nothing unless arch-PEBS is in use.
+ */
+void init_arch_pebs_on_cpu(int cpu)
+{
+	struct cpu_hw_events *cpuc;
+	u64 base;
+
+	if (!x86_pmu.arch_pebs)
+		return;
+
+	cpuc = per_cpu_ptr(&cpu_hw_events, cpu);
+	if (cpuc->pebs_vaddr) {
+		/*
+		 * MSR layout: the 4KB-aligned physical address of the output
+		 * buffer (__alloc_pages_node() returns a page aligned address)
+		 * ORed with the SIZE field, where
+		 *   Buffer Size = 4KB * 2^SIZE
+		 * The buffer is physically contiguous
+		 * (__alloc_pages_node() with order).
+		 */
+		base = virt_to_phys(cpuc->pebs_vaddr) | PEBS_BUFFER_SHIFT;
+		wrmsr_on_cpu(cpu, MSR_IA32_PEBS_BASE, (u32)base,
+			     (u32)(base >> 32));
+		x86_pmu.pebs_active = 1;
+		return;
+	}
+
+	WARN(1, "Fail to allocate PEBS buffer on CPU %d\n", cpu);
+	x86_pmu.pebs_active = 0;
+}
+
+
+/* Clear MSR_IA32_PEBS_BASE on @cpu; a no-op unless arch-PEBS is in use. */
+inline void fini_arch_pebs_on_cpu(int cpu)
+{
+	if (x86_pmu.arch_pebs)
+		wrmsr_on_cpu(cpu, MSR_IA32_PEBS_BASE, 0, 0);
+}
+
+
/*
* BTS
*/
@@ -1470,6 +1529,25 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
}
}
+/*
+ * Build the PEBS data configuration for @event: the event's own adaptive
+ * configuration plus the counter-group related bits currently held in this
+ * CPU's pebs_data_cfg. Returns 0 (after a warning) if the event's counter
+ * index is out of range.
+ */
+u64 intel_get_arch_pebs_data_config(struct perf_event *event)
+{
+	const u64 cntr_group_bits =
+		(PEBS_DATACFG_CNTR_MASK << PEBS_DATACFG_CNTR_SHIFT) |
+		(PEBS_DATACFG_FIX_MASK << PEBS_DATACFG_FIX_SHIFT) |
+		PEBS_DATACFG_CNTR | PEBS_DATACFG_METRICS;
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	u64 cfg;
+
+	if (WARN_ON(event->hw.idx < 0 || event->hw.idx >= X86_PMC_IDX_MAX))
+		return 0;
+
+	cfg = pebs_update_adaptive_cfg(event);
+	cfg |= cpuc->pebs_data_cfg & cntr_group_bits;
+
+	return cfg;
+}
+
+
void intel_pmu_pebs_add(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1531,6 +1609,15 @@ static inline void intel_pmu_drain_large_pebs(struct cpu_hw_events *cpuc)
intel_pmu_drain_pebs_buffer();
}
+/*
+ * Common part of enabling PEBS for @event: flag the event's counter in this
+ * CPU's pebs_enabled mask and drop ARCH_PERFMON_EVENTSEL_INT from the event
+ * configuration.
+ */
+static void __intel_pmu_pebs_enable(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+	cpuc->pebs_enabled |= 1ULL << event->hw.idx;
+	event->hw.config &= ~ARCH_PERFMON_EVENTSEL_INT;
+}
+
+
void intel_pmu_pebs_enable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1539,9 +1626,7 @@ void intel_pmu_pebs_enable(struct perf_event *event)
struct debug_store *ds = cpuc->ds;
unsigned int idx = hwc->idx;
- hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
-
- cpuc->pebs_enabled |= 1ULL << hwc->idx;
+ __intel_pmu_pebs_enable(event);
if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) && (x86_pmu.version < 5))
cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
@@ -1603,14 +1688,22 @@ void intel_pmu_pebs_del(struct perf_event *event)
pebs_update_state(needed_cb, cpuc, event, false);
}
-void intel_pmu_pebs_disable(struct perf_event *event)
+static void __intel_pmu_pebs_disable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
intel_pmu_drain_large_pebs(cpuc);
-
cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
+ hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
+}
+
+void intel_pmu_pebs_disable(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+
+ __intel_pmu_pebs_disable(event);
if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) &&
(x86_pmu.version < 5))
@@ -1622,8 +1715,6 @@ void intel_pmu_pebs_disable(struct perf_event *event)
if (cpuc->enabled)
wrmsrq(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
-
- hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
}
void intel_pmu_pebs_enable_all(void)
@@ -2059,6 +2150,90 @@ static inline void __setup_pebs_counter_group(struct cpu_hw_events *cpuc,
#define PEBS_LATENCY_MASK 0xffff
+/*
+ * Initialise @data for a new sample of @event (period taken from
+ * event->hw.last_period) and save the callchain based on @iregs.
+ */
+static inline void __setup_perf_sample_data(struct perf_event *event,
+					    struct pt_regs *iregs,
+					    struct perf_sample_data *data)
+{
+	u64 period = event->hw.last_period;
+
+	perf_sample_data_init(data, 0, period);
+
+	/*
+	 * The unwinder must always be fed iregs to stay sane: BP, SP and IP
+	 * of the record can point into thin air when the record stems from a
+	 * previous PMI context or an (I)RET happened between the record and
+	 * the PMI.
+	 */
+	perf_sample_save_callchain(data, event, iregs);
+}
+
+
+/*
+ * Apply the fields of a PEBS basic group: the eventing IP goes into @regs
+ * (flagged as exact), @tsc is handed to setup_pebs_time(), and @retire is
+ * stored as weight.var3_w when PERF_SAMPLE_WEIGHT_STRUCT is requested.
+ */
+static inline void __setup_pebs_basic_group(struct perf_event *event,
+					    struct pt_regs *regs,
+					    struct perf_sample_data *data,
+					    u64 sample_type, u64 ip,
+					    u64 tsc, u16 retire)
+{
+	/* The basic group carries the EventingIP */
+	set_linear_ip(regs, ip);
+	regs->flags = PERF_EFLAGS_EXACT;
+	setup_pebs_time(event, data, tsc);
+
+	if (!(sample_type & PERF_SAMPLE_WEIGHT_STRUCT))
+		return;
+
+	data->weight.var3_w = retire;
+}
+
+
+/*
+ * Apply the fields of a PEBS GPR group. For precise_ip < 2 the IP from the
+ * GPR group replaces the one in @regs and the exact flag is dropped. The
+ * registers are copied into @regs when register sampling is requested.
+ */
+static inline void __setup_pebs_gpr_group(struct perf_event *event,
+					  struct pt_regs *regs,
+					  struct pebs_gprs *gprs,
+					  u64 sample_type)
+{
+	const u64 regs_wanted = PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER;
+
+	if (event->attr.precise_ip < 2) {
+		set_linear_ip(regs, gprs->ip);
+		regs->flags &= ~PERF_EFLAGS_EXACT;
+	}
+
+	if (!(sample_type & regs_wanted))
+		return;
+
+	adaptive_pebs_save_regs(regs, gprs);
+}
+
+
+/*
+ * Apply the fields of a PEBS memory-info group to @data, according to the
+ * requested @sample_type: weight (latency, falling back to the TSX weight
+ * when the latency is zero), data source, address and transaction flags.
+ * Every field that gets filled is also flagged in data->sample_flags.
+ */
+static inline void __setup_pebs_meminfo_group(struct perf_event *event,
+					      struct perf_sample_data *data,
+					      u64 sample_type, u64 latency,
+					      u16 instr_latency, u64 address,
+					      u64 aux, u64 tsx_tuning, u64 ax)
+{
+	if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
+		u64 tsx_weight = intel_get_tsx_weight(tsx_tuning);
+
+		data->weight.var2_w = instr_latency;
+
+		/*
+		 * meminfo::latency is declared as a u64, but in practice only
+		 * its lower 32 bits hold valid data on Ice Lake and earlier
+		 * platforms.
+		 */
+		if (sample_type & PERF_SAMPLE_WEIGHT) {
+			data->weight.full = latency ? latency : tsx_weight;
+		} else {
+			u32 latency32 = (u32)latency;
+
+			data->weight.var1_dw = latency32 ? latency32 : tsx_weight;
+		}
+
+		data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
+	}
+
+	if (sample_type & PERF_SAMPLE_DATA_SRC) {
+		data->data_src.val = get_data_src(event, aux);
+		data->sample_flags |= PERF_SAMPLE_DATA_SRC;
+	}
+
+	if (sample_type & PERF_SAMPLE_ADDR_TYPE) {
+		data->addr = address;
+		data->sample_flags |= PERF_SAMPLE_ADDR;
+	}
+
+	if (sample_type & PERF_SAMPLE_TRANSACTION) {
+		data->txn = intel_get_tsx_transaction(tsx_tuning, ax);
+		data->sample_flags |= PERF_SAMPLE_TRANSACTION;
+	}
+}
+
+
/*
* With adaptive PEBS the layout depends on what fields are configured.
*/
@@ -2068,12 +2243,14 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
struct pt_regs *regs)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u64 sample_type = event->attr.sample_type;
struct pebs_basic *basic = __pebs;
void *next_record = basic + 1;
- u64 sample_type, format_group;
struct pebs_meminfo *meminfo = NULL;
struct pebs_gprs *gprs = NULL;
struct x86_perf_regs *perf_regs;
+ u64 format_group;
+ u16 retire;
if (basic == NULL)
return;
@@ -2081,31 +2258,17 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
perf_regs = container_of(regs, struct x86_perf_regs, regs);
perf_regs->xmm_regs = NULL;
- sample_type = event->attr.sample_type;
format_group = basic->format_group;
- perf_sample_data_init(data, 0, event->hw.last_period);
- setup_pebs_time(event, data, basic->tsc);
-
- /*
- * We must however always use iregs for the unwinder to stay sane; the
- * record BP,SP,IP can point into thin air when the record is from a
- * previous PMI context or an (I)RET happened between the record and
- * PMI.
- */
- perf_sample_save_callchain(data, event, iregs);
+ __setup_perf_sample_data(event, iregs, data);
*regs = *iregs;
- /* The ip in basic is EventingIP */
- set_linear_ip(regs, basic->ip);
- regs->flags = PERF_EFLAGS_EXACT;
- if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
- if (x86_pmu.flags & PMU_FL_RETIRE_LATENCY)
- data->weight.var3_w = basic->retire_latency;
- else
- data->weight.var3_w = 0;
- }
+ /* basic group */
+ retire = x86_pmu.flags & PMU_FL_RETIRE_LATENCY ?
+ basic->retire_latency : 0;
+ __setup_pebs_basic_group(event, regs, data, sample_type,
+ basic->ip, basic->tsc, retire);
/*
* The record for MEMINFO is in front of GP
@@ -2121,54 +2284,20 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
gprs = next_record;
next_record = gprs + 1;
- if (event->attr.precise_ip < 2) {
- set_linear_ip(regs, gprs->ip);
- regs->flags &= ~PERF_EFLAGS_EXACT;
- }
-
- if (sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER))
- adaptive_pebs_save_regs(regs, gprs);
+ __setup_pebs_gpr_group(event, regs, gprs, sample_type);
}
if (format_group & PEBS_DATACFG_MEMINFO) {
- if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
- u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ?
- meminfo->cache_latency : meminfo->mem_latency;
-
- if (x86_pmu.flags & PMU_FL_INSTR_LATENCY)
- data->weight.var2_w = meminfo->instr_latency;
-
- /*
- * Although meminfo::latency is defined as a u64,
- * only the lower 32 bits include the valid data
- * in practice on Ice Lake and earlier platforms.
- */
- if (sample_type & PERF_SAMPLE_WEIGHT) {
- data->weight.full = latency ?:
- intel_get_tsx_weight(meminfo->tsx_tuning);
- } else {
- data->weight.var1_dw = (u32)latency ?:
- intel_get_tsx_weight(meminfo->tsx_tuning);
- }
-
- data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
- }
-
- if (sample_type & PERF_SAMPLE_DATA_SRC) {
- data->data_src.val = get_data_src(event, meminfo->aux);
- data->sample_flags |= PERF_SAMPLE_DATA_SRC;
- }
-
- if (sample_type & PERF_SAMPLE_ADDR_TYPE) {
- data->addr = meminfo->address;
- data->sample_flags |= PERF_SAMPLE_ADDR;
- }
-
- if (sample_type & PERF_SAMPLE_TRANSACTION) {
- data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning,
- gprs ? gprs->ax : 0);
- data->sample_flags |= PERF_SAMPLE_TRANSACTION;
- }
+ u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ?
+ meminfo->cache_latency : meminfo->mem_latency;
+ u64 instr_latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ?
+ meminfo->instr_latency : 0;
+ u64 ax = gprs ? gprs->ax : 0;
+
+ __setup_pebs_meminfo_group(event, data, sample_type, latency,
+ instr_latency, meminfo->address,
+ meminfo->aux, meminfo->tsx_tuning,
+ ax);
}
if (format_group & PEBS_DATACFG_XMMS) {
@@ -2219,6 +2348,135 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
format_group);
}
+/*
+ * A fragment follows this one when the continue bit is set, or when the
+ * record is a null record (no bit set in format[63:16]).
+ */
+static inline bool arch_pebs_record_continued(struct arch_pebs_header *header)
+{
+	if (header->cont)
+		return true;
+
+	return (header->format & GENMASK_ULL(63, 16)) == 0;
+}
+
+/*
+ * Decode one arch-PEBS record into a perf sample.
+ *
+ * @event: event the record belongs to
+ * @iregs: interrupt registers; used for the callchain and as the initial
+ * content of @regs
+ * @__pebs: start of the record (its first fragment); NULL is tolerated
+ * @data: sample data to fill in
+ * @regs: register set to fill in; must be embedded in a struct x86_perf_regs
+ *
+ * Each fragment starts with a struct arch_pebs_header whose bits say which
+ * groups follow. The groups are walked in this order: basic, aux (memory
+ * info), gpr, xmm, lbr, cntr. When the header signals a continuation the
+ * next fragment, header->size bytes further, is decoded the same way.
+ */
+static void setup_arch_pebs_sample_data(struct perf_event *event,
+ struct pt_regs *iregs,
+ void *__pebs,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u64 sample_type = event->attr.sample_type;
+ struct arch_pebs_header *header = NULL;
+ struct arch_pebs_aux *meminfo = NULL;
+ struct arch_pebs_gprs *gprs = NULL;
+ struct x86_perf_regs *perf_regs;
+ void *next_record;
+ void *at = __pebs;
+
+ if (at == NULL)
+ return;
+
+ perf_regs = container_of(regs, struct x86_perf_regs, regs);
+ perf_regs->xmm_regs = NULL;
+
+ __setup_perf_sample_data(event, iregs, data);
+
+ *regs = *iregs;
+
+again:
+ /* @at points at the header of the fragment being decoded */
+ header = at;
+ next_record = at + sizeof(struct arch_pebs_header);
+ if (header->basic) {
+ struct arch_pebs_basic *basic = next_record;
+ u16 retire = 0;
+
+ next_record = basic + 1;
+
+ if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT)
+ retire = basic->valid ? basic->retire : 0;
+ __setup_pebs_basic_group(event, regs, data, sample_type,
+ basic->ip, basic->tsc, retire);
+ }
+
+ /*
+ * The record for MEMINFO is in front of GP
+ * But PERF_SAMPLE_TRANSACTION needs gprs->ax.
+ * Save the pointer here but process later.
+ */
+ if (header->aux) {
+ meminfo = next_record;
+ next_record = meminfo + 1;
+ }
+
+ if (header->gpr) {
+ gprs = next_record;
+ next_record = gprs + 1;
+
+ __setup_pebs_gpr_group(event, regs,
+ (struct pebs_gprs *)gprs,
+ sample_type);
+ }
+
+ /* Memory info is handled only now, so that gprs->ax is available if present */
+ if (header->aux) {
+ u64 ax = gprs ? gprs->ax : 0;
+
+ __setup_pebs_meminfo_group(event, data, sample_type,
+ meminfo->cache_latency,
+ meminfo->instr_latency,
+ meminfo->address, meminfo->aux,
+ meminfo->tsx_tuning, ax);
+ }
+
+ if (header->xmm) {
+ struct pebs_xmm *xmm;
+
+ /* The XMM data is preceded by a struct arch_pebs_xer_header */
+ next_record += sizeof(struct arch_pebs_xer_header);
+
+ xmm = next_record;
+ perf_regs->xmm_regs = xmm->xmm;
+ next_record = xmm + 1;
+ }
+
+ if (header->lbr) {
+ struct arch_pebs_lbr_header *lbr_header = next_record;
+ struct lbr_entry *lbr;
+ int num_lbr;
+
+ next_record = lbr_header + 1;
+ lbr = next_record;
+
+ /*
+ * Entry count: taken from the LBR header for ARCH_PEBS_LBR_NUM_VAR,
+ * otherwise header->lbr multiples of ARCH_PEBS_BASE_LBR_ENTRIES.
+ */
+ num_lbr = header->lbr == ARCH_PEBS_LBR_NUM_VAR ?
+ lbr_header->depth :
+ header->lbr * ARCH_PEBS_BASE_LBR_ENTRIES;
+ next_record += num_lbr * sizeof(struct lbr_entry);
+
+ if (has_branch_stack(event)) {
+ intel_pmu_store_pebs_lbrs(lbr);
+ intel_pmu_lbr_save_brstack(data, cpuc, event);
+ }
+ }
+
+ if (header->cntr) {
+ struct arch_pebs_cntr_header *cntr = next_record;
+ unsigned int nr;
+
+ next_record += sizeof(struct arch_pebs_cntr_header);
+
+ if (is_pebs_counter_event_group(event)) {
+ __setup_pebs_counter_group(cpuc, event,
+ (struct pebs_cntr_header *)cntr, next_record);
+ data->sample_flags |= PERF_SAMPLE_READ;
+ }
+
+ /* Skip the counter values: one u64 per set bit, plus two for metrics */
+ nr = hweight32(cntr->cntr) + hweight32(cntr->fixed);
+ if (cntr->metrics == INTEL_CNTR_METRICS)
+ nr += 2;
+ next_record += nr * sizeof(u64);
+ }
+
+ /* Parse the following fragments, if any. */
+ if (arch_pebs_record_continued(header)) {
+ at = at + header->size;
+ goto again;
+ }
+}
+
static inline void *
get_next_pebs_record_by_bit(void *base, void *top, int bit)
{
@@ -2601,6 +2859,57 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
}
}
+/*
+ * Account one PEBS record (@at) against every counter set in @pebs_status.
+ *
+ * For each such counter the record previously remembered in @last[] is
+ * handed to __intel_pmu_pebs_event() once a newer record supersedes it,
+ * and @at becomes the remembered record. @counts[] is incremented for
+ * every record seen, so the most recent record of each counter is left
+ * in @last[] for the caller to handle.
+ */
+static __always_inline void
+__intel_pmu_handle_pebs_record(struct pt_regs *iregs,
+			       struct pt_regs *regs,
+			       struct perf_sample_data *data,
+			       void *at, u64 pebs_status,
+			       short *counts, void **last,
+			       setup_fn setup_sample)
+{
+	struct cpu_hw_events *hw = this_cpu_ptr(&cpu_hw_events);
+	int idx;
+
+	for_each_set_bit(idx, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) {
+		struct perf_event *ev = hw->events[idx];
+		short seen;
+
+		/* Same order as before: NULL check first, then precise_ip. */
+		if (WARN_ON_ONCE(!ev))
+			continue;
+		if (WARN_ON_ONCE(!ev->attr.precise_ip))
+			continue;
+
+		/* Only emit the remembered record once a newer one exists. */
+		seen = counts[idx]++;
+		if (seen)
+			__intel_pmu_pebs_event(ev, iregs, regs, data,
+					       last[idx], setup_sample);
+
+		last[idx] = at;
+	}
+}
+
+/*
+ * For every counter in @mask that saw at least one record, pass the record
+ * remembered in @last[] to __intel_pmu_pebs_last_event() together with the
+ * number of records counted for that counter.
+ */
+static __always_inline void
+__intel_pmu_handle_last_pebs_record(struct pt_regs *iregs,
+				    struct pt_regs *regs,
+				    struct perf_sample_data *data,
+				    u64 mask, short *counts, void **last,
+				    setup_fn setup_sample)
+{
+	struct cpu_hw_events *hw = this_cpu_ptr(&cpu_hw_events);
+	int idx;
+
+	for_each_set_bit(idx, (unsigned long *)&mask, X86_PMC_IDX_MAX) {
+		short nr = counts[idx];
+
+		/* Counters without any record are skipped. */
+		if (nr)
+			__intel_pmu_pebs_last_event(hw->events[idx], iregs,
+						    regs, data, last[idx],
+						    nr, setup_sample);
+	}
+}
+
static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data)
{
short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
@@ -2610,9 +2919,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
struct x86_perf_regs perf_regs;
struct pt_regs *regs = &perf_regs.regs;
struct pebs_basic *basic;
- struct perf_event *event;
void *base, *at, *top;
- int bit;
u64 mask;
if (!x86_pmu.pebs_active)
@@ -2625,6 +2932,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
mask = hybrid(cpuc->pmu, pebs_events_mask) |
(hybrid(cpuc->pmu, fixed_cntr_mask64) << INTEL_PMC_IDX_FIXED);
+ mask &= cpuc->pebs_enabled;
if (unlikely(base >= top)) {
intel_pmu_pebs_event_update_no_drain(cpuc, mask);
@@ -2642,38 +2950,114 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
if (basic->format_size != cpuc->pebs_record_size)
continue;
- pebs_status = basic->applicable_counters & cpuc->pebs_enabled & mask;
- for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) {
- event = cpuc->events[bit];
+ pebs_status = mask & basic->applicable_counters;
+ __intel_pmu_handle_pebs_record(iregs, regs, data, at,
+ pebs_status, counts, last,
+ setup_pebs_adaptive_sample_data);
+ }
- if (WARN_ON_ONCE(!event) ||
- WARN_ON_ONCE(!event->attr.precise_ip))
- continue;
+ __intel_pmu_handle_last_pebs_record(iregs, regs, data, mask, counts, last,
+ setup_pebs_adaptive_sample_data);
+}
- if (counts[bit]++) {
- __intel_pmu_pebs_event(event, iregs, regs, data, last[bit],
- setup_pebs_adaptive_sample_data);
- }
- last[bit] = at;
- }
+/*
+ * Drain the arch-PEBS buffer of the current CPU.
+ *
+ * Reads MSR_IA32_PEBS_INDEX to find how far the buffer has been written,
+ * rewrites the MSR with the write offset and full flag cleared, PEBS kept
+ * enabled and the threshold chosen from n_pebs/n_large_pebs, then walks
+ * all records between the buffer base and the old write offset.
+ *
+ * A record may consist of several fragments. Per-counter accounting is
+ * done on a fragment that carries the basic group; fragments without it
+ * and continuation fragments are stepped over by their header size.
+ *
+ * @iregs: interrupt registers; dummy_iregs is used when NULL.
+ * @data:  sample data handed through to the per-record handlers.
+ */
+static void intel_pmu_drain_arch_pebs(struct pt_regs *iregs,
+				      struct perf_sample_data *data)
+{
+	short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
+	void *last[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS];
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	union arch_pebs_index index;
+	struct x86_perf_regs perf_regs;
+	struct pt_regs *regs = &perf_regs.regs;
+	void *base, *at, *top;
+	u64 mask;
+
+	/*
+	 * Counters that are both arch-PEBS capable and PEBS enabled. Computed
+	 * up front so that the empty-buffer path below passes a counter
+	 * bitmask, as intel_pmu_drain_pebs_icl() does, instead of the bit
+	 * count X86_PMC_IDX_MAX.
+	 */
+	mask = hybrid(cpuc->pmu, arch_pebs_cap).counters & cpuc->pebs_enabled;
+
+	rdmsrq(MSR_IA32_PEBS_INDEX, index.whole);
+
+	/* Write offset is zero: the buffer holds no records. */
+	if (unlikely(!index.wr)) {
+		intel_pmu_pebs_event_update_no_drain(cpuc, mask);
+		return;
 	}
-	for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) {
-		if (!counts[bit])
+	base = cpuc->pebs_vaddr;
+	top = cpuc->pebs_vaddr + (index.wr << ARCH_PEBS_INDEX_WR_SHIFT);
+
+	/* Reset the index before parsing the records captured above. */
+	index.wr = 0;
+	index.full = 0;
+	index.en = 1;
+	if (cpuc->n_pebs == cpuc->n_large_pebs)
+		index.thresh = ARCH_PEBS_THRESH_MULTI;
+	else
+		index.thresh = ARCH_PEBS_THRESH_SINGLE;
+	wrmsrq(MSR_IA32_PEBS_INDEX, index.whole);
+
+	if (!iregs)
+		iregs = &dummy_iregs;
+
+	/* Process all but the last event for each counter. */
+	for (at = base; at < top;) {
+		struct arch_pebs_header *header;
+		struct arch_pebs_basic *basic;
+		u64 pebs_status;
+
+		header = at;
+
+		/* A zero-sized header would never advance 'at'. */
+		if (WARN_ON_ONCE(!header->size))
+			break;
+
+		/* 1st fragment or single record must have basic group */
+		if (!header->basic) {
+			at += header->size;
 			continue;
+		}
-		event = cpuc->events[bit];
+		basic = at + sizeof(struct arch_pebs_header);
+		pebs_status = mask & basic->applicable_counters;
+		__intel_pmu_handle_pebs_record(iregs, regs, data, at,
+					       pebs_status, counts, last,
+					       setup_arch_pebs_sample_data);
+
+		/* Skip non-last fragments */
+		while (arch_pebs_record_continued(header)) {
+			if (!header->size)
+				break;
+			at += header->size;
+			header = at;
+		}
-		__intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit],
-					    counts[bit], setup_pebs_adaptive_sample_data);
+		/* Skip last fragment or the single record */
+		at += header->size;
 	}
+
+	__intel_pmu_handle_last_pebs_record(iregs, regs, data, mask,
+					    counts, last,
+					    setup_arch_pebs_sample_data);
+}
+
+/* Configure x86_pmu for architectural PEBS. */
+static void __init intel_arch_pebs_init(void)
+{
+	/*
+	 * Current hybrid platforms either support arch-PEBS on all
+	 * core types or on none of them, so set the x86_pmu.arch_pebs
+	 * flag directly if the boot CPU supports arch-PEBS.
+	 */
+	x86_pmu.arch_pebs = 1;
+
+	/* Buffer size and capability flags. */
+	x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE;
+	x86_pmu.pebs_capable = ~0ULL;
+	x86_pmu.flags |= PMU_FL_PEBS_ALL;
+
+	/* Callbacks. */
+	x86_pmu.drain_pebs = intel_pmu_drain_arch_pebs;
+	x86_pmu.pebs_enable = __intel_pmu_pebs_enable;
+	x86_pmu.pebs_disable = __intel_pmu_pebs_disable;
 }
/*
* PEBS probe and setup
*/
-void __init intel_pebs_init(void)
+static void __init intel_ds_pebs_init(void)
{
/*
* No support for 32bit formats
@@ -2735,10 +3119,8 @@ void __init intel_pebs_init(void)
break;
case 6:
- if (x86_pmu.intel_cap.pebs_baseline) {
+ if (x86_pmu.intel_cap.pebs_baseline)
x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ;
- x86_pmu.late_setup = intel_pmu_late_setup;
- }
fallthrough;
case 5:
x86_pmu.pebs_ept = 1;
@@ -2788,6 +3170,14 @@ void __init intel_pebs_init(void)
}
}
+/*
+ * PEBS probe entry point: a pebs_format of 0xf selects the arch-PEBS
+ * setup, any other value the DS based PEBS setup.
+ */
+void __init intel_pebs_init(void)
+{
+	if (x86_pmu.intel_cap.pebs_format != 0xf) {
+		intel_ds_pebs_init();
+		return;
+	}
+
+	intel_arch_pebs_init();
+}
+
void perf_restore_debug_store(void)
{
struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 7aa59966e7c3..72f2adcda7c6 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/kvm_types.h>
#include <linux/perf_event.h>
#include <linux/types.h>
@@ -1705,7 +1706,7 @@ void x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
lbr->info = x86_pmu.lbr_info;
lbr->has_callstack = x86_pmu_has_lbr_callstack();
}
-EXPORT_SYMBOL_GPL(x86_perf_get_lbr);
+EXPORT_SYMBOL_FOR_KVM(x86_perf_get_lbr);
struct event_constraint vlbr_constraint =
__EVENT_CONSTRAINT(INTEL_FIXED_VLBR_EVENT, (1ULL << INTEL_PMC_IDX_FIXED_VLBR),
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index e8cf29d2b10c..44524a387c58 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -17,6 +17,7 @@
#include <linux/limits.h>
#include <linux/slab.h>
#include <linux/device.h>
+#include <linux/kvm_types.h>
#include <asm/cpuid/api.h>
#include <asm/perf_event.h>
@@ -82,13 +83,13 @@ u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability)
return (c & cd->mask) >> shift;
}
-EXPORT_SYMBOL_GPL(intel_pt_validate_cap);
+EXPORT_SYMBOL_FOR_KVM(intel_pt_validate_cap);
u32 intel_pt_validate_hw_cap(enum pt_capabilities cap)
{
return intel_pt_validate_cap(pt_pmu.caps, cap);
}
-EXPORT_SYMBOL_GPL(intel_pt_validate_hw_cap);
+EXPORT_SYMBOL_FOR_KVM(intel_pt_validate_hw_cap);
static ssize_t pt_cap_show(struct device *cdev,
struct device_attribute *attr,
@@ -1590,7 +1591,7 @@ void intel_pt_handle_vmx(int on)
local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(intel_pt_handle_vmx);
+EXPORT_SYMBOL_FOR_KVM(intel_pt_handle_vmx);
/*
* PMU callbacks
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index a762f7f5b161..e228e564b15e 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1325,8 +1325,6 @@ static void uncore_pci_sub_driver_init(void)
continue;
pmu = &type->pmus[UNCORE_PCI_DEV_IDX(ids->driver_data)];
- if (!pmu)
- continue;
if (uncore_pci_get_dev_die_info(pci_sub_dev, &die))
continue;
@@ -1895,6 +1893,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_VFM(INTEL_ARROWLAKE_H, &mtl_uncore_init),
X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_uncore_init),
X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &ptl_uncore_init),
+ X86_MATCH_VFM(INTEL_WILDCATLAKE_L, &ptl_uncore_init),
X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &spr_uncore_init),
X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &spr_uncore_init),
X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &gnr_uncore_init),
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 2b969386dcdd..3161ec0a3416 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -283,8 +283,9 @@ struct cpu_hw_events {
* Intel DebugStore bits
*/
struct debug_store *ds;
- void *ds_pebs_vaddr;
void *ds_bts_vaddr;
+ /* DS based PEBS or arch-PEBS buffer address */
+ void *pebs_vaddr;
u64 pebs_enabled;
int n_pebs;
int n_large_pebs;
@@ -303,6 +304,8 @@ struct cpu_hw_events {
/* Intel ACR configuration */
u64 acr_cfg_b[X86_PMC_IDX_MAX];
u64 acr_cfg_c[X86_PMC_IDX_MAX];
+ /* Cached CFG_C values */
+ u64 cfg_c_val[X86_PMC_IDX_MAX];
/*
* Intel LBR bits
@@ -708,6 +711,12 @@ enum hybrid_pmu_type {
hybrid_big_small_tiny = hybrid_big | hybrid_small_tiny,
};
+struct arch_pebs_cap {
+ u64 caps;
+ u64 counters;
+ u64 pdists;
+};
+
struct x86_hybrid_pmu {
struct pmu pmu;
const char *name;
@@ -752,6 +761,8 @@ struct x86_hybrid_pmu {
mid_ack :1,
enabled_ack :1;
+ struct arch_pebs_cap arch_pebs_cap;
+
u64 pebs_data_source[PERF_PEBS_DATA_SOURCE_MAX];
};
@@ -906,7 +917,7 @@ struct x86_pmu {
union perf_capabilities intel_cap;
/*
- * Intel DebugStore bits
+ * Intel DebugStore and PEBS bits
*/
unsigned int bts :1,
bts_active :1,
@@ -917,7 +928,8 @@ struct x86_pmu {
pebs_no_tlb :1,
pebs_no_isolation :1,
pebs_block :1,
- pebs_ept :1;
+ pebs_ept :1,
+ arch_pebs :1;
int pebs_record_size;
int pebs_buffer_size;
u64 pebs_events_mask;
@@ -930,6 +942,11 @@ struct x86_pmu {
u64 pebs_capable;
/*
+ * Intel Architectural PEBS
+ */
+ struct arch_pebs_cap arch_pebs_cap;
+
+ /*
* Intel LBR
*/
unsigned int lbr_tos, lbr_from, lbr_to,
@@ -1124,7 +1141,6 @@ static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\
.pmu_type = _pmu, \
}
-int is_x86_event(struct perf_event *event);
struct pmu *x86_get_pmu(unsigned int cpu);
extern struct x86_pmu x86_pmu __read_mostly;
@@ -1217,7 +1233,7 @@ int x86_reserve_hardware(void);
void x86_release_hardware(void);
-int x86_pmu_max_precise(void);
+int x86_pmu_max_precise(struct pmu *pmu);
void hw_perf_lbr_event_destroy(struct perf_event *event);
@@ -1604,6 +1620,14 @@ extern void intel_cpuc_finish(struct cpu_hw_events *cpuc);
int intel_pmu_init(void);
+int alloc_arch_pebs_buf_on_cpu(int cpu);
+
+void release_arch_pebs_buf_on_cpu(int cpu);
+
+void init_arch_pebs_on_cpu(int cpu);
+
+void fini_arch_pebs_on_cpu(int cpu);
+
void init_debug_store_on_cpu(int cpu);
void fini_debug_store_on_cpu(int cpu);
@@ -1760,6 +1784,8 @@ void intel_pmu_pebs_data_source_cmt(void);
void intel_pmu_pebs_data_source_lnl(void);
+u64 intel_get_arch_pebs_data_config(struct perf_event *event);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
void intel_pt_interrupt(void);
@@ -1792,6 +1818,11 @@ static inline int intel_pmu_max_num_pebs(struct pmu *pmu)
return fls((u32)hybrid(pmu, pebs_events_mask));
}
+/* True when x86_pmu flags either DS based PEBS or arch-PEBS. */
+static inline bool intel_pmu_has_pebs(void)
+{
+	if (x86_pmu.ds_pebs)
+		return true;
+
+	return x86_pmu.arch_pebs;
+}
+
#else /* CONFIG_CPU_SUP_INTEL */
static inline void reserve_ds_buffers(void)
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 15bc07a5ebb3..b14c045679e1 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -198,6 +198,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
#define ALTINSTR_ENTRY(ft_flags) \
".pushsection .altinstructions,\"a\"\n" \
+ ANNOTATE_DATA_SPECIAL \
" .long 771b - .\n" /* label */ \
" .long 774f - .\n" /* new instruction */ \
" .4byte " __stringify(ft_flags) "\n" /* feature + flags */ \
@@ -207,6 +208,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
#define ALTINSTR_REPLACEMENT(newinstr) /* replacement */ \
".pushsection .altinstr_replacement, \"ax\"\n" \
+ ANNOTATE_DATA_SPECIAL \
"# ALT: replacement\n" \
"774:\n\t" newinstr "\n775:\n" \
".popsection\n"
@@ -337,6 +339,7 @@ void nop_func(void);
* instruction. See apply_alternatives().
*/
.macro altinstr_entry orig alt ft_flags orig_len alt_len
+ ANNOTATE_DATA_SPECIAL
.long \orig - .
.long \alt - .
.4byte \ft_flags
@@ -365,6 +368,7 @@ void nop_func(void);
.popsection ; \
.pushsection .altinstr_replacement,"ax" ; \
743: \
+ ANNOTATE_DATA_SPECIAL ; \
newinst ; \
744: \
.popsection ;
diff --git a/arch/x86/include/asm/amd/node.h b/arch/x86/include/asm/amd/node.h
index 23fe617898a8..a672b8765fa8 100644
--- a/arch/x86/include/asm/amd/node.h
+++ b/arch/x86/include/asm/amd/node.h
@@ -23,7 +23,6 @@
#define AMD_NODE0_PCI_SLOT 0x18
struct pci_dev *amd_node_get_func(u16 node, u8 func);
-struct pci_dev *amd_node_get_root(u16 node);
static inline u16 amd_num_nodes(void)
{
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index d5c8d3afe196..bd62bd87a841 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -2,6 +2,8 @@
#ifndef _ASM_X86_ASM_H
#define _ASM_X86_ASM_H
+#include <linux/annotate.h>
+
#ifdef __ASSEMBLER__
# define __ASM_FORM(x, ...) x,## __VA_ARGS__
# define __ASM_FORM_RAW(x, ...) x,## __VA_ARGS__
@@ -132,6 +134,7 @@ static __always_inline __pure void *rip_rel_ptr(void *p)
# define _ASM_EXTABLE_TYPE(from, to, type) \
.pushsection "__ex_table","a" ; \
.balign 4 ; \
+ ANNOTATE_DATA_SPECIAL ; \
.long (from) - . ; \
.long (to) - . ; \
.long type ; \
@@ -179,6 +182,7 @@ static __always_inline __pure void *rip_rel_ptr(void *p)
# define _ASM_EXTABLE_TYPE(from, to, type) \
" .pushsection \"__ex_table\",\"a\"\n" \
" .balign 4\n" \
+ ANNOTATE_DATA_SPECIAL \
" .long (" #from ") - .\n" \
" .long (" #to ") - .\n" \
" .long " __stringify(type) " \n" \
@@ -187,6 +191,7 @@ static __always_inline __pure void *rip_rel_ptr(void *p)
# define _ASM_EXTABLE_TYPE_REG(from, to, type, reg) \
" .pushsection \"__ex_table\",\"a\"\n" \
" .balign 4\n" \
+ ANNOTATE_DATA_SPECIAL \
" .long (" #from ") - .\n" \
" .long (" #to ") - .\n" \
DEFINE_EXTABLE_TYPE_REG \
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index 880ca15073ed..ab5bba6cf7f5 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -7,6 +7,11 @@
#include <linux/objtool.h>
#include <asm/asm.h>
+#ifndef __ASSEMBLY__
+struct bug_entry;
+extern void __WARN_trap(struct bug_entry *bug, ...);
+#endif
+
/*
* Despite that some emulators terminate on UD2, we use it for WARN().
*/
@@ -31,52 +36,77 @@
#define BUG_UD2 0xfffe
#define BUG_UD1 0xfffd
#define BUG_UD1_UBSAN 0xfffc
+#define BUG_UD1_WARN 0xfffb
#define BUG_UDB 0xffd6
#define BUG_LOCK 0xfff0
#ifdef CONFIG_GENERIC_BUG
-#ifdef CONFIG_X86_32
-# define __BUG_REL(val) ".long " val
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+#define __BUG_ENTRY_VERBOSE(file, line) \
+ "\t.long " file " - .\t# bug_entry::file\n" \
+ "\t.word " line "\t# bug_entry::line\n"
#else
-# define __BUG_REL(val) ".long " val " - ."
+#define __BUG_ENTRY_VERBOSE(file, line)
#endif
-#ifdef CONFIG_DEBUG_BUGVERBOSE
-#define __BUG_ENTRY(file, line, flags) \
- "2:\t" __BUG_REL("1b") "\t# bug_entry::bug_addr\n" \
- "\t" __BUG_REL(file) "\t# bug_entry::file\n" \
- "\t.word " line "\t# bug_entry::line\n" \
- "\t.word " flags "\t# bug_entry::flags\n"
+#if defined(CONFIG_X86_64) || defined(CONFIG_DEBUG_BUGVERBOSE_DETAILED)
+#define HAVE_ARCH_BUG_FORMAT
+#define __BUG_ENTRY_FORMAT(format) \
+ "\t.long " format " - .\t# bug_entry::format\n"
#else
-#define __BUG_ENTRY(file, line, flags) \
- "2:\t" __BUG_REL("1b") "\t# bug_entry::bug_addr\n" \
- "\t.word " flags "\t# bug_entry::flags\n"
+#define __BUG_ENTRY_FORMAT(format)
+#endif
+
+#ifdef CONFIG_X86_64
+#define HAVE_ARCH_BUG_FORMAT_ARGS
#endif
-#define _BUG_FLAGS_ASM(ins, file, line, flags, size, extra) \
- "1:\t" ins "\n" \
- ".pushsection __bug_table,\"aw\"\n" \
- __BUG_ENTRY(file, line, flags) \
+#define __BUG_ENTRY(format, file, line, flags) \
+ "\t.long 1b - ." "\t# bug_entry::bug_addr\n" \
+ __BUG_ENTRY_FORMAT(format) \
+ __BUG_ENTRY_VERBOSE(file, line) \
+ "\t.word " flags "\t# bug_entry::flags\n"
+
+#define _BUG_FLAGS_ASM(format, file, line, flags, size, extra) \
+ ".pushsection __bug_table,\"aw\"\n\t" \
+ ANNOTATE_DATA_SPECIAL \
+ "2:\n\t" \
+ __BUG_ENTRY(format, file, line, flags) \
"\t.org 2b + " size "\n" \
".popsection\n" \
extra
-#define _BUG_FLAGS(ins, flags, extra) \
+#ifdef CONFIG_DEBUG_BUGVERBOSE_DETAILED
+#define WARN_CONDITION_STR(cond_str) cond_str
+#else
+#define WARN_CONDITION_STR(cond_str) ""
+#endif
+
+#define _BUG_FLAGS(cond_str, ins, flags, extra) \
do { \
- asm_inline volatile(_BUG_FLAGS_ASM(ins, "%c0", \
- "%c1", "%c2", "%c3", extra) \
- : : "i" (__FILE__), "i" (__LINE__), \
- "i" (flags), \
- "i" (sizeof(struct bug_entry))); \
+ asm_inline volatile("1:\t" ins "\n" \
+ _BUG_FLAGS_ASM("%c[fmt]", "%c[file]", \
+ "%c[line]", "%c[fl]", \
+ "%c[size]", extra) \
+ : : [fmt] "i" (WARN_CONDITION_STR(cond_str)), \
+ [file] "i" (__FILE__), \
+ [line] "i" (__LINE__), \
+ [fl] "i" (flags), \
+ [size] "i" (sizeof(struct bug_entry))); \
} while (0)
#define ARCH_WARN_ASM(file, line, flags, size) \
- _BUG_FLAGS_ASM(ASM_UD2, file, line, flags, size, "")
+ ".pushsection .rodata.str1.1, \"aMS\", @progbits, 1\n" \
+ "99:\n" \
+ "\t.string \"\"\n" \
+ ".popsection\n" \
+ "1:\t " ASM_UD2 "\n" \
+ _BUG_FLAGS_ASM("99b", file, line, flags, size, "")
#else
-#define _BUG_FLAGS(ins, flags, extra) asm volatile(ins)
+#define _BUG_FLAGS(cond_str, ins, flags, extra) asm volatile(ins)
#endif /* CONFIG_GENERIC_BUG */
@@ -84,7 +114,7 @@ do { \
#define BUG() \
do { \
instrumentation_begin(); \
- _BUG_FLAGS(ASM_UD2, 0, ""); \
+ _BUG_FLAGS("", ASM_UD2, 0, ""); \
__builtin_unreachable(); \
} while (0)
@@ -97,14 +127,69 @@ do { \
#define ARCH_WARN_REACHABLE ANNOTATE_REACHABLE(1b)
-#define __WARN_FLAGS(flags) \
-do { \
- __auto_type __flags = BUGFLAG_WARNING|(flags); \
- instrumentation_begin(); \
- _BUG_FLAGS(ASM_UD2, __flags, ARCH_WARN_REACHABLE); \
- instrumentation_end(); \
+#define __WARN_FLAGS(cond_str, flags) \
+do { \
+ __auto_type __flags = BUGFLAG_WARNING|(flags); \
+ instrumentation_begin(); \
+ _BUG_FLAGS(cond_str, ASM_UD2, __flags, ARCH_WARN_REACHABLE); \
+ instrumentation_end(); \
} while (0)
+#ifdef HAVE_ARCH_BUG_FORMAT_ARGS
+
+#ifndef __ASSEMBLY__
+#include <linux/static_call_types.h>
+DECLARE_STATIC_CALL(WARN_trap, __WARN_trap);
+
+struct pt_regs;
+struct sysv_va_list { /* from AMD64 System V ABI */
+ unsigned int gp_offset;
+ unsigned int fp_offset;
+ void *overflow_arg_area;
+ void *reg_save_area;
+};
+struct arch_va_list {
+ unsigned long regs[6];
+ struct sysv_va_list args;
+};
+extern void *__warn_args(struct arch_va_list *args, struct pt_regs *regs);
+#endif /* __ASSEMBLY__ */
+
+#define __WARN_bug_entry(flags, format) ({ \
+ struct bug_entry *bug; \
+ asm_inline volatile("lea (2f)(%%rip), %[addr]\n1:\n" \
+ _BUG_FLAGS_ASM("%c[fmt]", "%c[file]", \
+ "%c[line]", "%c[fl]", \
+ "%c[size]", "") \
+ : [addr] "=r" (bug) \
+ : [fmt] "i" (format), \
+ [file] "i" (__FILE__), \
+ [line] "i" (__LINE__), \
+ [fl] "i" (flags), \
+ [size] "i" (sizeof(struct bug_entry))); \
+ bug; })
+
+#define __WARN_print_arg(flags, format, arg...) \
+do { \
+ int __flags = (flags) | BUGFLAG_WARNING | BUGFLAG_ARGS ; \
+ static_call_mod(WARN_trap)(__WARN_bug_entry(__flags, format), ## arg); \
+ asm (""); /* inhibit tail-call optimization */ \
+} while (0)
+
+#define __WARN_printf(taint, fmt, arg...) \
+ __WARN_print_arg(BUGFLAG_TAINT(taint), fmt, ## arg)
+
+#define WARN_ONCE(cond, format, arg...) ({ \
+ int __ret_warn_on = !!(cond); \
+ if (unlikely(__ret_warn_on)) { \
+ __WARN_print_arg(BUGFLAG_ONCE|BUGFLAG_TAINT(TAINT_WARN),\
+ format, ## arg); \
+ } \
+ __ret_warn_on; \
+})
+
+#endif /* HAVE_ARCH_BUG_FORMAT_ARGS */
+
#include <asm-generic/bug.h>
#endif /* _ASM_X86_BUG_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 96acb669bed4..4b1a6ade1700 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -101,6 +101,7 @@ static __always_inline bool _static_cpu_has(u16 bit)
asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]")
".pushsection .altinstr_aux,\"ax\"\n"
"6:\n"
+ ANNOTATE_DATA_SPECIAL
" testb %[bitnum], %a[cap_byte]\n"
" jnz %l[t_yes]\n"
" jmp %l[t_no]\n"
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 6f82302991d0..d90ce601917c 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -321,7 +321,7 @@
#define X86_FEATURE_FSRS (12*32+11) /* Fast short REP STOSB */
#define X86_FEATURE_FSRC (12*32+12) /* Fast short REP {CMPSB,SCASB} */
#define X86_FEATURE_FRED (12*32+17) /* "fred" Flexible Return and Event Delivery */
-#define X86_FEATURE_LKGS (12*32+18) /* Load "kernel" (userspace) GS */
+#define X86_FEATURE_LKGS (12*32+18) /* Like MOV_GS except MSR_KERNEL_GS_BASE = GS.base */
#define X86_FEATURE_WRMSRNS (12*32+19) /* Non-serializing WRMSR */
#define X86_FEATURE_AMX_FP16 (12*32+21) /* AMX fp16 Support */
#define X86_FEATURE_AVX_IFMA (12*32+23) /* Support for VPMADD52[H,L]UQ */
@@ -503,6 +503,9 @@
#define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */
#define X86_FEATURE_ABMC (21*32+15) /* Assignable Bandwidth Monitoring Counters */
#define X86_FEATURE_MSR_IMM (21*32+16) /* MSR immediate form instructions */
+#define X86_FEATURE_SGX_EUPDATESVN (21*32+17) /* Support for ENCLS[EUPDATESVN] instruction */
+
+#define X86_FEATURE_SDCIAE (21*32+18) /* L3 Smart Data Cache Injection Allocation Enforcement */
/*
* BUG word(s)
diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h
index 12b34d5b2953..2bb65677c079 100644
--- a/arch/x86/include/asm/fred.h
+++ b/arch/x86/include/asm/fred.h
@@ -79,7 +79,7 @@ static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int
.type = type,
.vector = vector,
.nmi = type == EVENT_TYPE_NMI,
- .lm = 1,
+ .l = 1,
};
asm_fred_entry_from_kvm(ss);
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 93156ac4ffe0..b08c95872eed 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -56,6 +56,11 @@ arch_ftrace_get_regs(struct ftrace_regs *fregs)
return &arch_ftrace_regs(fregs)->regs;
}
+/*
+ * Clear X86_EFLAGS_FIXED in ->flags and set ->cs to __KERNEL_CS on the
+ * pt_regs that @regs points to. @regs is parenthesized so that any pointer
+ * expression (e.g. &r or p + 1) expands correctly; note that it is still
+ * evaluated twice.
+ */
+#define arch_ftrace_partial_regs(regs) do {	\
+	(regs)->flags &= ~X86_EFLAGS_FIXED;	\
+	(regs)->cs = __KERNEL_CS;		\
+} while (0)
+
#define arch_ftrace_fill_perf_regs(fregs, _regs) do { \
(_regs)->ip = arch_ftrace_regs(fregs)->regs.ip; \
(_regs)->sp = arch_ftrace_regs(fregs)->regs.sp; \
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index 6e2458088800..fe5d9a10d900 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -46,38 +46,31 @@ do { \
} while(0)
static __always_inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
- u32 __user *uaddr)
+ u32 __user *uaddr)
{
- if (can_do_masked_user_access())
- uaddr = masked_user_access_begin(uaddr);
- else if (!user_access_begin(uaddr, sizeof(u32)))
- return -EFAULT;
-
- switch (op) {
- case FUTEX_OP_SET:
- unsafe_atomic_op1("xchgl %0, %2", oval, uaddr, oparg, Efault);
- break;
- case FUTEX_OP_ADD:
- unsafe_atomic_op1(LOCK_PREFIX "xaddl %0, %2", oval,
- uaddr, oparg, Efault);
- break;
- case FUTEX_OP_OR:
- unsafe_atomic_op2("orl %4, %3", oval, uaddr, oparg, Efault);
- break;
- case FUTEX_OP_ANDN:
- unsafe_atomic_op2("andl %4, %3", oval, uaddr, ~oparg, Efault);
- break;
- case FUTEX_OP_XOR:
- unsafe_atomic_op2("xorl %4, %3", oval, uaddr, oparg, Efault);
- break;
- default:
- user_access_end();
- return -ENOSYS;
+ scoped_user_rw_access(uaddr, Efault) {
+ switch (op) {
+ case FUTEX_OP_SET:
+ unsafe_atomic_op1("xchgl %0, %2", oval, uaddr, oparg, Efault);
+ break;
+ case FUTEX_OP_ADD:
+ unsafe_atomic_op1(LOCK_PREFIX "xaddl %0, %2", oval, uaddr, oparg, Efault);
+ break;
+ case FUTEX_OP_OR:
+ unsafe_atomic_op2("orl %4, %3", oval, uaddr, oparg, Efault);
+ break;
+ case FUTEX_OP_ANDN:
+ unsafe_atomic_op2("andl %4, %3", oval, uaddr, ~oparg, Efault);
+ break;
+ case FUTEX_OP_XOR:
+ unsafe_atomic_op2("xorl %4, %3", oval, uaddr, oparg, Efault);
+ break;
+ default:
+ return -ENOSYS;
+ }
}
- user_access_end();
return 0;
Efault:
- user_access_end();
return -EFAULT;
}
@@ -86,21 +79,19 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
{
int ret = 0;
- if (can_do_masked_user_access())
- uaddr = masked_user_access_begin(uaddr);
- else if (!user_access_begin(uaddr, sizeof(u32)))
- return -EFAULT;
- asm volatile("\n"
- "1:\t" LOCK_PREFIX "cmpxchgl %3, %2\n"
- "2:\n"
- _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %0) \
- : "+r" (ret), "=a" (oldval), "+m" (*uaddr)
- : "r" (newval), "1" (oldval)
- : "memory"
- );
- user_access_end();
- *uval = oldval;
+ scoped_user_rw_access(uaddr, Efault) {
+ asm_inline volatile("\n"
+ "1:\t" LOCK_PREFIX "cmpxchgl %3, %2\n"
+ "2:\n"
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %0)
+ : "+r" (ret), "=a" (oldval), "+m" (*uaddr)
+ : "r" (newval), "1" (oldval)
+ : "memory");
+ *uval = oldval;
+ }
return ret;
+Efault:
+ return -EFAULT;
}
#endif
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index abd637e54e94..3218770670d3 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -393,7 +393,7 @@ static __always_inline void __##func(struct pt_regs *regs)
/**
* DEFINE_IDTENTRY_VC_KERNEL - Emit code for VMM communication handler
- when raised from kernel mode
+ * when raised from kernel mode
* @func: Function name of the entry point
*
* Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
@@ -403,7 +403,7 @@ static __always_inline void __##func(struct pt_regs *regs)
/**
* DEFINE_IDTENTRY_VC_USER - Emit code for VMM communication handler
- when raised from user mode
+ * when raised from user mode
* @func: Function name of the entry point
*
* Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h
index 54368a43abf6..4733e9064ee5 100644
--- a/arch/x86/include/asm/insn-eval.h
+++ b/arch/x86/include/asm/insn-eval.h
@@ -44,4 +44,6 @@ enum insn_mmio_type {
enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes);
+bool insn_is_nop(struct insn *insn);
+
#endif /* _ASM_X86_INSN_EVAL_H */
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 091f88c8254d..846d21c1a7f8 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -312,7 +312,6 @@ static inline int insn_offset_immediate(struct insn *insn)
/**
* for_each_insn_prefix() -- Iterate prefixes in the instruction
* @insn: Pointer to struct insn.
- * @idx: Index storage.
* @prefix: Prefix byte.
*
* Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix
@@ -321,8 +320,8 @@ static inline int insn_offset_immediate(struct insn *insn)
* Since prefixes.nbytes can be bigger than 4 if some prefixes
* are repeated, it cannot be used for looping over the prefixes.
*/
-#define for_each_insn_prefix(insn, idx, prefix) \
- for (idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++)
+#define for_each_insn_prefix(insn, prefix) \
+ for (int idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++)
#define POP_SS_OPCODE 0x1f
#define MOV_SREG_OPCODE 0x8e
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index f32a0eca2ae5..950bfd006905 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -150,12 +150,12 @@
#define INTEL_LUNARLAKE_M IFM(6, 0xBD) /* Lion Cove / Skymont */
-#define INTEL_PANTHERLAKE_L IFM(6, 0xCC) /* Cougar Cove / Crestmont */
+#define INTEL_PANTHERLAKE_L IFM(6, 0xCC) /* Cougar Cove / Darkmont */
#define INTEL_WILDCATLAKE_L IFM(6, 0xD5)
-#define INTEL_NOVALAKE IFM(18, 0x01)
-#define INTEL_NOVALAKE_L IFM(18, 0x03)
+#define INTEL_NOVALAKE IFM(18, 0x01) /* Coyote Cove / Arctic Wolf */
+#define INTEL_NOVALAKE_L IFM(18, 0x03) /* Coyote Cove / Arctic Wolf */
/* "Small Core" Processors (Atom/E-Core) */
diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h
index 5dbeac48a5b9..695f87efbeb8 100644
--- a/arch/x86/include/asm/intel_ds.h
+++ b/arch/x86/include/asm/intel_ds.h
@@ -4,7 +4,15 @@
#include <linux/percpu-defs.h>
#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
-#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
+#define PEBS_BUFFER_SHIFT 4
+#define PEBS_BUFFER_SIZE (PAGE_SIZE << PEBS_BUFFER_SHIFT)
+
+/*
+ * The largest PEBS record could consume a page; ensure that at least
+ * one more record can be written after the PMI is triggered.
+ */
+#define ARCH_PEBS_THRESH_MULTI ((PEBS_BUFFER_SIZE - PAGE_SIZE) >> PEBS_BUFFER_SHIFT)
+#define ARCH_PEBS_THRESH_SINGLE 1
/* The maximal number of PEBS events: */
#define MAX_PEBS_EVENTS_FMT4 8
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 61dd1dee7812..e0a6930a4029 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -15,6 +15,7 @@
#define JUMP_TABLE_ENTRY(key, label) \
".pushsection __jump_table, \"aw\" \n\t" \
_ASM_ALIGN "\n\t" \
+ ANNOTATE_DATA_SPECIAL \
".long 1b - . \n\t" \
".long " label " - . \n\t" \
_ASM_PTR " " key " - . \n\t" \
diff --git a/arch/x86/include/asm/kvm_types.h b/arch/x86/include/asm/kvm_types.h
index 23268a188e70..d7c704ed1be9 100644
--- a/arch/x86/include/asm/kvm_types.h
+++ b/arch/x86/include/asm/kvm_types.h
@@ -10,6 +10,11 @@
#define KVM_SUB_MODULES kvm-intel
#else
#undef KVM_SUB_MODULES
+/*
+ * Don't export symbols for KVM without vendor modules, as kvm.ko is built iff
+ * at least one vendor module is enabled.
+ */
+#define EXPORT_SYMBOL_FOR_KVM(symbol)
#endif
#define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 40
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 31e3cb550fb3..2d98886de09a 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -48,6 +48,7 @@
/* AMD-specific bits */
#define MCI_STATUS_TCC BIT_ULL(55) /* Task context corrupt */
+#define MCI_STATUS_PADDRV BIT_ULL(54) /* Valid System Physical Address */
#define MCI_STATUS_SYNDV BIT_ULL(53) /* synd reg. valid */
#define MCI_STATUS_DEFERRED BIT_ULL(44) /* uncorrected error, deferred exception */
#define MCI_STATUS_POISON BIT_ULL(43) /* access poisonous data */
@@ -62,6 +63,7 @@
*/
#define MCI_CONFIG_MCAX 0x1
#define MCI_CONFIG_FRUTEXT BIT_ULL(9)
+#define MCI_CONFIG_PADDRV BIT_ULL(11)
#define MCI_IPID_MCATYPE 0xFFFF0000
#define MCI_IPID_HWID 0xFFF
@@ -166,6 +168,12 @@
#define MCE_IN_KERNEL_COPYIN BIT_ULL(7)
/*
+ * Indicates that handler should check and clear Deferred error registers
+ * rather than common ones.
+ */
+#define MCE_CHECK_DFR_REGS BIT_ULL(8)
+
+/*
* This structure contains all data related to the MCE log. Also
* carries a signature to make it easier to find from external
* debugging tools. Each entry is only valid when its finished flag
@@ -302,6 +310,12 @@ DECLARE_PER_CPU(struct mce, injectm);
/* Disable CMCI/polling for MCA bank claimed by firmware */
extern void mce_disable_bank(int bank);
+#ifdef CONFIG_X86_MCE_THRESHOLD
+void mce_save_apei_thr_limit(u32 thr_limit);
+#else
+static inline void mce_save_apei_thr_limit(u32 thr_limit) { }
+#endif /* CONFIG_X86_MCE_THRESHOLD */
+
/*
* Exception handler
*/
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 9e1720d73244..3d0a0950d20a 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -166,6 +166,10 @@
* Processor MMIO stale data
* vulnerabilities.
*/
+#define ARCH_CAP_MCU_ENUM BIT(16) /*
+ * Indicates the presence of microcode update
+ * feature enumeration and status information.
+ */
#define ARCH_CAP_FB_CLEAR BIT(17) /*
* VERW clears CPU fill buffer
* even on MDS_NO CPUs.
@@ -327,6 +331,26 @@
PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \
PERF_CAP_PEBS_TIMING_INFO)
+/* Arch PEBS */
+#define MSR_IA32_PEBS_BASE 0x000003f4
+#define MSR_IA32_PEBS_INDEX 0x000003f5
+#define ARCH_PEBS_OFFSET_MASK 0x7fffff
+#define ARCH_PEBS_INDEX_WR_SHIFT 4
+
+#define ARCH_PEBS_RELOAD 0xffffffff
+#define ARCH_PEBS_CNTR_ALLOW BIT_ULL(35)
+#define ARCH_PEBS_CNTR_GP BIT_ULL(36)
+#define ARCH_PEBS_CNTR_FIXED BIT_ULL(37)
+#define ARCH_PEBS_CNTR_METRICS BIT_ULL(38)
+#define ARCH_PEBS_LBR_SHIFT 40
+#define ARCH_PEBS_LBR (0x3ull << ARCH_PEBS_LBR_SHIFT)
+#define ARCH_PEBS_VECR_XMM BIT_ULL(49)
+#define ARCH_PEBS_GPR BIT_ULL(61)
+#define ARCH_PEBS_AUX BIT_ULL(62)
+#define ARCH_PEBS_EN BIT_ULL(63)
+#define ARCH_PEBS_CNTR_MASK (ARCH_PEBS_CNTR_GP | ARCH_PEBS_CNTR_FIXED | \
+ ARCH_PEBS_CNTR_METRICS)
+
#define MSR_IA32_RTIT_CTL 0x00000570
#define RTIT_CTL_TRACEEN BIT(0)
#define RTIT_CTL_CYCLEACC BIT(1)
@@ -929,6 +953,10 @@
#define MSR_IA32_APICBASE_BASE (0xfffff<<12)
#define MSR_IA32_UCODE_WRITE 0x00000079
+
+#define MSR_IA32_MCU_ENUMERATION 0x0000007b
+#define MCU_STAGING BIT(4)
+
#define MSR_IA32_UCODE_REV 0x0000008b
/* Intel SGX Launch Enclave Public Key Hash MSRs */
@@ -1226,6 +1254,8 @@
#define MSR_IA32_VMX_VMFUNC 0x00000491
#define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492
+#define MSR_IA32_MCU_STAGING_MBOX_ADDR 0x000007a5
+
/* Resctrl MSRs: */
/* - Intel: */
#define MSR_IA32_L3_QOS_CFG 0xc81
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 015d23f3e01f..2f0e47be79a4 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -9,6 +9,7 @@
#include <asm/alternative.h>
#include <linux/kmsan-checks.h>
+#include <linux/mmdebug.h>
/* duplicated to the one in bootmem.h */
extern unsigned long max_pfn;
@@ -31,18 +32,28 @@ static __always_inline unsigned long __phys_addr_nodebug(unsigned long x)
#ifdef CONFIG_DEBUG_VIRTUAL
extern unsigned long __phys_addr(unsigned long);
-extern unsigned long __phys_addr_symbol(unsigned long);
#else
#define __phys_addr(x) __phys_addr_nodebug(x)
-#define __phys_addr_symbol(x) \
- ((unsigned long)(x) - __START_KERNEL_map + phys_base)
#endif
+static inline unsigned long __phys_addr_symbol(unsigned long x)
+{
+ unsigned long y = x - __START_KERNEL_map; /* offset of x from __START_KERNEL_map (unsigned, so it wraps) */
+
+ /* upper-bound check only: an x below __START_KERNEL_map wraps y to a huge value that fails it too */
+ VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
+
+ return y + phys_base; /* rebase the offset onto phys_base */
+}
+
#define __phys_reloc_hide(x) (x)
void clear_page_orig(void *page);
void clear_page_rep(void *page);
void clear_page_erms(void *page);
+KCFI_REFERENCE(clear_page_orig);
+KCFI_REFERENCE(clear_page_rep);
+KCFI_REFERENCE(clear_page_erms);
static inline void clear_page(void *page)
{
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 332428caaed2..725d0eff7acd 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -23,6 +23,7 @@
#else /* !__ASSEMBLY__: */
#include <linux/args.h>
+#include <linux/bits.h>
#include <linux/build_bug.h>
#include <linux/stringify.h>
#include <asm/asm.h>
@@ -572,9 +573,9 @@ do { \
#define x86_this_cpu_constant_test_bit(_nr, _var) \
({ \
unsigned long __percpu *addr__ = \
- (unsigned long __percpu *)&(_var) + ((_nr) / BITS_PER_LONG); \
+ (unsigned long __percpu *)&(_var) + BIT_WORD(_nr); \
\
- !!((1UL << ((_nr) % BITS_PER_LONG)) & raw_cpu_read(*addr__)); \
+ !!(BIT_MASK(_nr) & raw_cpu_read(*addr__)); \
})
#define x86_this_cpu_variable_test_bit(_nr, _var) \
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 49a4d442f3fc..7276ba70c88a 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -141,16 +141,16 @@
#define ARCH_PERFMON_EVENTS_COUNT 7
#define PEBS_DATACFG_MEMINFO BIT_ULL(0)
-#define PEBS_DATACFG_GP BIT_ULL(1)
+#define PEBS_DATACFG_GP BIT_ULL(1)
#define PEBS_DATACFG_XMMS BIT_ULL(2)
#define PEBS_DATACFG_LBRS BIT_ULL(3)
-#define PEBS_DATACFG_LBR_SHIFT 24
#define PEBS_DATACFG_CNTR BIT_ULL(4)
+#define PEBS_DATACFG_METRICS BIT_ULL(5)
+#define PEBS_DATACFG_LBR_SHIFT 24
#define PEBS_DATACFG_CNTR_SHIFT 32
#define PEBS_DATACFG_CNTR_MASK GENMASK_ULL(15, 0)
#define PEBS_DATACFG_FIX_SHIFT 48
#define PEBS_DATACFG_FIX_MASK GENMASK_ULL(7, 0)
-#define PEBS_DATACFG_METRICS BIT_ULL(5)
/* Steal the highest bit of pebs_data_cfg for SW usage */
#define PEBS_UPDATE_DS_SW BIT_ULL(63)
@@ -200,6 +200,8 @@ union cpuid10_edx {
#define ARCH_PERFMON_EXT_LEAF 0x00000023
#define ARCH_PERFMON_NUM_COUNTER_LEAF 0x1
#define ARCH_PERFMON_ACR_LEAF 0x2
+#define ARCH_PERFMON_PEBS_CAP_LEAF 0x4
+#define ARCH_PERFMON_PEBS_COUNTER_LEAF 0x5
union cpuid35_eax {
struct {
@@ -210,7 +212,10 @@ union cpuid35_eax {
unsigned int acr_subleaf:1;
/* Events Sub-Leaf */
unsigned int events_subleaf:1;
- unsigned int reserved:28;
+ /* arch-PEBS Sub-Leaves */
+ unsigned int pebs_caps_subleaf:1;
+ unsigned int pebs_cnts_subleaf:1;
+ unsigned int reserved:26;
} split;
unsigned int full;
};
@@ -432,6 +437,8 @@ static inline bool is_topdown_idx(int idx)
#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(GLOBAL_STATUS_LBRS_FROZEN_BIT)
#define GLOBAL_STATUS_TRACE_TOPAPMI_BIT 55
#define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(GLOBAL_STATUS_TRACE_TOPAPMI_BIT)
+#define GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT 54
+#define GLOBAL_STATUS_ARCH_PEBS_THRESHOLD BIT_ULL(GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT)
#define GLOBAL_STATUS_PERF_METRICS_OVF_BIT 48
#define GLOBAL_CTRL_EN_PERF_METRICS BIT_ULL(48)
@@ -503,6 +510,107 @@ struct pebs_cntr_header {
#define INTEL_CNTR_METRICS 0x3
/*
+ * Arch PEBS
+ */
+union arch_pebs_index {
+ struct { /* bit ranges assume LSB-first bit-field allocation, as GCC/Clang do on x86 */
+ u64 rsvd:4, /* bits 3:0 */
+ wr:23, /* bits 26:4 */
+ rsvd2:4, /* bits 30:27 */
+ full:1, /* bit 31 */
+ en:1, /* bit 32 */
+ rsvd3:3, /* bits 35:33 */
+ thresh:23, /* bits 58:36 */
+ rsvd4:5; /* bits 63:59 */
+ };
+ u64 whole; /* the same 64 bits accessed as one value */
+};
+
+struct arch_pebs_header {
+ union {
+ u64 format; /* the bit-fields below accessed as one 64-bit value */
+ struct { /* bit ranges assume LSB-first bit-field allocation, as GCC/Clang do on x86 */
+ u64 size:16, /* bits 15:0 - Record size */
+ rsvd:14, /* bits 29:16 */
+ mode:1, /* bit 30 - 64BIT_MODE */
+ cont:1, /* bit 31 */
+ rsvd2:3, /* bits 34:32 */
+ cntr:5, /* bits 39:35 */
+ lbr:2, /* bits 41:40 */
+ rsvd3:7, /* bits 48:42 */
+ xmm:1, /* bit 49 */
+ ymmh:1, /* bit 50 */
+ rsvd4:2, /* bits 52:51 */
+ opmask:1, /* bit 53 */
+ zmmh:1, /* bit 54 */
+ h16zmm:1, /* bit 55 */
+ rsvd5:5, /* bits 60:56 */
+ gpr:1, /* bit 61 */
+ aux:1, /* bit 62 */
+ basic:1; /* bit 63 */
+ };
+ };
+ u64 rsvd6; /* second 64-bit word of the header */
+};
+
+struct arch_pebs_basic { /* six 64-bit words; bit ranges assume LSB-first bit-field allocation (x86) */
+ u64 ip;
+ u64 applicable_counters;
+ u64 tsc;
+ u64 retire :16, /* bits 15:0 - Retire Latency */
+ valid :1, /* bit 16 */
+ rsvd :47; /* bits 63:17 */
+ u64 rsvd2;
+ u64 rsvd3;
+};
+
+struct arch_pebs_aux { /* eight 64-bit words; bit ranges assume LSB-first bit-field allocation (x86) */
+ u64 address;
+ u64 rsvd;
+ u64 rsvd2;
+ u64 rsvd3;
+ u64 rsvd4;
+ u64 aux;
+ u64 instr_latency :16, /* bits 15:0 */
+ pad2 :16, /* bits 31:16 */
+ cache_latency :16, /* bits 47:32 */
+ pad3 :16; /* bits 63:48 */
+ u64 tsx_tuning;
+};
+
+struct arch_pebs_gprs {
+ u64 flags, ip, ax, cx, dx, bx, sp, bp, si, di;
+ u64 r8, r9, r10, r11, r12, r13, r14, r15, ssp;
+ u64 rsvd;
+};
+
+struct arch_pebs_xer_header {
+ u64 xstate;
+ u64 rsvd;
+};
+
+#define ARCH_PEBS_LBR_NAN 0x0
+#define ARCH_PEBS_LBR_NUM_8 0x1
+#define ARCH_PEBS_LBR_NUM_16 0x2
+#define ARCH_PEBS_LBR_NUM_VAR 0x3
+#define ARCH_PEBS_BASE_LBR_ENTRIES 8
+struct arch_pebs_lbr_header {
+ u64 rsvd;
+ u64 ctl;
+ u64 depth;
+ u64 ler_from;
+ u64 ler_to;
+ u64 ler_info;
+};
+
+struct arch_pebs_cntr_header {
+ u32 cntr;
+ u32 fixed;
+ u32 metrics;
+ u32 reserved;
+};
+
+/*
* AMD Extended Performance Monitoring and Debug cpuid feature detection
*/
#define EXT_PERFMON_DEBUG_FEATURES 0x80000022
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 50f75467f73d..35d062a2e304 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -84,8 +84,8 @@ struct fred_ss {
: 4,
/* Event was incident to enclave execution */
enclave : 1,
- /* CPU was in long mode */
- lm : 1,
+ /* CPU was in 64-bit mode */
+ l : 1,
/*
* Nested exception during FRED delivery, not set
* for #DF.
@@ -187,12 +187,12 @@ convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
extern void send_sigtrap(struct pt_regs *regs, int error_code, int si_code);
-static inline unsigned long regs_return_value(struct pt_regs *regs)
+static __always_inline unsigned long regs_return_value(struct pt_regs *regs)
{
return regs->ax;
}
-static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
+static __always_inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
{
regs->ax = rc;
}
@@ -277,34 +277,34 @@ static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs)
}
#endif
-static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
+static __always_inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
{
return regs->sp;
}
-static inline unsigned long instruction_pointer(struct pt_regs *regs)
+static __always_inline unsigned long instruction_pointer(struct pt_regs *regs)
{
return regs->ip;
}
-static inline void instruction_pointer_set(struct pt_regs *regs,
- unsigned long val)
+static __always_inline
+void instruction_pointer_set(struct pt_regs *regs, unsigned long val)
{
regs->ip = val;
}
-static inline unsigned long frame_pointer(struct pt_regs *regs)
+static __always_inline unsigned long frame_pointer(struct pt_regs *regs)
{
return regs->bp;
}
-static inline unsigned long user_stack_pointer(struct pt_regs *regs)
+static __always_inline unsigned long user_stack_pointer(struct pt_regs *regs)
{
return regs->sp;
}
-static inline void user_stack_pointer_set(struct pt_regs *regs,
- unsigned long val)
+static __always_inline
+void user_stack_pointer_set(struct pt_regs *regs, unsigned long val)
{
regs->sp = val;
}
diff --git a/arch/x86/include/asm/runtime-const.h b/arch/x86/include/asm/runtime-const.h
index 8d983cfd06ea..e5a13dc8816e 100644
--- a/arch/x86/include/asm/runtime-const.h
+++ b/arch/x86/include/asm/runtime-const.h
@@ -2,6 +2,10 @@
#ifndef _ASM_RUNTIME_CONST_H
#define _ASM_RUNTIME_CONST_H
+#ifdef MODULE
+ #error "Cannot use runtime-const infrastructure from modules"
+#endif
+
#ifdef __ASSEMBLY__
.macro RUNTIME_CONST_PTR sym reg
diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
index 6a0069761508..04958459a7ca 100644
--- a/arch/x86/include/asm/sgx.h
+++ b/arch/x86/include/asm/sgx.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/**
+/*
* Copyright(c) 2016-20 Intel Corporation.
*
* Intel Software Guard Extensions (SGX) support.
@@ -28,21 +28,22 @@
#define SGX_CPUID_EPC_MASK GENMASK(3, 0)
enum sgx_encls_function {
- ECREATE = 0x00,
- EADD = 0x01,
- EINIT = 0x02,
- EREMOVE = 0x03,
- EDGBRD = 0x04,
- EDGBWR = 0x05,
- EEXTEND = 0x06,
- ELDU = 0x08,
- EBLOCK = 0x09,
- EPA = 0x0A,
- EWB = 0x0B,
- ETRACK = 0x0C,
- EAUG = 0x0D,
- EMODPR = 0x0E,
- EMODT = 0x0F,
+ ECREATE = 0x00,
+ EADD = 0x01,
+ EINIT = 0x02,
+ EREMOVE = 0x03,
+ EDGBRD = 0x04,
+ EDGBWR = 0x05,
+ EEXTEND = 0x06,
+ ELDU = 0x08,
+ EBLOCK = 0x09,
+ EPA = 0x0A,
+ EWB = 0x0B,
+ ETRACK = 0x0C,
+ EAUG = 0x0D,
+ EMODPR = 0x0E,
+ EMODT = 0x0F,
+ EUPDATESVN = 0x18,
};
/**
@@ -65,15 +66,19 @@ enum sgx_encls_function {
/**
* enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV
- * %SGX_EPC_PAGE_CONFLICT: Page is being written by other ENCLS function.
- * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not
+ * @SGX_EPC_PAGE_CONFLICT: Page is being written by other ENCLS function.
+ * @SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not
* been completed yet.
- * %SGX_CHILD_PRESENT SECS has child pages present in the EPC.
- * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's
+ * @SGX_CHILD_PRESENT: SECS has child pages present in the EPC.
+ * @SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's
* public key does not match IA32_SGXLEPUBKEYHASH.
- * %SGX_PAGE_NOT_MODIFIABLE: The EPC page cannot be modified because it
+ * @SGX_PAGE_NOT_MODIFIABLE: The EPC page cannot be modified because it
* is in the PENDING or MODIFIED state.
- * %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received
+ * @SGX_INSUFFICIENT_ENTROPY: Insufficient entropy in RNG.
+ * @SGX_NO_UPDATE: EUPDATESVN could not update the CPUSVN because the
+ * current SVN was not newer than CPUSVN. This is the most
+ * common error code returned by EUPDATESVN.
+ * @SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received
*/
enum sgx_return_code {
SGX_EPC_PAGE_CONFLICT = 7,
@@ -81,6 +86,8 @@ enum sgx_return_code {
SGX_CHILD_PRESENT = 13,
SGX_INVALID_EINITTOKEN = 16,
SGX_PAGE_NOT_MODIFIABLE = 20,
+ SGX_INSUFFICIENT_ENTROPY = 29,
+ SGX_NO_UPDATE = 31,
SGX_UNMASKED_EVENT = 128,
};
@@ -89,7 +96,7 @@ enum sgx_return_code {
/**
* enum sgx_miscselect - additional information to an SSA frame
- * %SGX_MISC_EXINFO: Report #PF or #GP to the SSA frame.
+ * @SGX_MISC_EXINFO: Report #PF or #GP to the SSA frame.
*
* Save State Area (SSA) is a stack inside the enclave used to store processor
* state when an exception or interrupt occurs. This enum defines additional
@@ -105,17 +112,17 @@ enum sgx_miscselect {
#define SGX_SSA_MISC_EXINFO_SIZE 16
/**
- * enum sgx_attributes - the attributes field in &struct sgx_secs
- * %SGX_ATTR_INIT: Enclave can be entered (is initialized).
- * %SGX_ATTR_DEBUG: Allow ENCLS(EDBGRD) and ENCLS(EDBGWR).
- * %SGX_ATTR_MODE64BIT: Tell that this a 64-bit enclave.
- * %SGX_ATTR_PROVISIONKEY: Allow to use provisioning keys for remote
+ * enum sgx_attribute - the attributes field in &struct sgx_secs
+ * @SGX_ATTR_INIT: Enclave can be entered (is initialized).
+ * @SGX_ATTR_DEBUG: Allow ENCLS(EDBGRD) and ENCLS(EDBGWR).
+ * @SGX_ATTR_MODE64BIT: Tell that this is a 64-bit enclave.
+ * @SGX_ATTR_PROVISIONKEY: Allow to use provisioning keys for remote
* attestation.
- * %SGX_ATTR_KSS: Allow to use key separation and sharing (KSS).
- * %SGX_ATTR_EINITTOKENKEY: Allow to use token signing key that is used to
+ * @SGX_ATTR_KSS: Allow to use key separation and sharing (KSS).
+ * @SGX_ATTR_EINITTOKENKEY: Allow to use token signing key that is used to
* sign cryptographic tokens that can be passed to
* EINIT as an authorization to run an enclave.
- * %SGX_ATTR_ASYNC_EXIT_NOTIFY: Allow enclaves to be notified after an
+ * @SGX_ATTR_ASYNC_EXIT_NOTIFY: Allow enclaves to be notified after an
* asynchronous exit has occurred.
*/
enum sgx_attribute {
@@ -188,7 +195,7 @@ struct sgx_secs {
/**
* enum sgx_tcs_flags - execution flags for TCS
- * %SGX_TCS_DBGOPTIN: If enabled allows single-stepping and breakpoints
+ * @SGX_TCS_DBGOPTIN: If enabled allows single-stepping and breakpoints
* inside an enclave. It is cleared by EADD but can
* be set later with EDBGWR.
*/
@@ -253,11 +260,11 @@ struct sgx_pageinfo {
/**
* enum sgx_page_type - bits in the SECINFO flags defining the page type
- * %SGX_PAGE_TYPE_SECS: a SECS page
- * %SGX_PAGE_TYPE_TCS: a TCS page
- * %SGX_PAGE_TYPE_REG: a regular page
- * %SGX_PAGE_TYPE_VA: a VA page
- * %SGX_PAGE_TYPE_TRIM: a page in trimmed state
+ * @SGX_PAGE_TYPE_SECS: a SECS page
+ * @SGX_PAGE_TYPE_TCS: a TCS page
+ * @SGX_PAGE_TYPE_REG: a regular page
+ * @SGX_PAGE_TYPE_VA: a VA page
+ * @SGX_PAGE_TYPE_TRIM: a page in trimmed state
*
* Make sure when making changes to this enum that its values can still fit
* in the bitfield within &struct sgx_encl_page
@@ -275,14 +282,14 @@ enum sgx_page_type {
/**
* enum sgx_secinfo_flags - the flags field in &struct sgx_secinfo
- * %SGX_SECINFO_R: allow read
- * %SGX_SECINFO_W: allow write
- * %SGX_SECINFO_X: allow execution
- * %SGX_SECINFO_SECS: a SECS page
- * %SGX_SECINFO_TCS: a TCS page
- * %SGX_SECINFO_REG: a regular page
- * %SGX_SECINFO_VA: a VA page
- * %SGX_SECINFO_TRIM: a page in trimmed state
+ * @SGX_SECINFO_R: allow read
+ * @SGX_SECINFO_W: allow write
+ * @SGX_SECINFO_X: allow execution
+ * @SGX_SECINFO_SECS: a SECS page
+ * @SGX_SECINFO_TCS: a TCS page
+ * @SGX_SECINFO_REG: a regular page
+ * @SGX_SECINFO_VA: a VA page
+ * @SGX_SECINFO_TRIM: a page in trimmed state
*/
enum sgx_secinfo_flags {
SGX_SECINFO_R = BIT(0),
diff --git a/arch/x86/include/asm/shared/msr.h b/arch/x86/include/asm/shared/msr.h
index 1e6ec10b3a15..a20b1c08c99f 100644
--- a/arch/x86/include/asm/shared/msr.h
+++ b/arch/x86/include/asm/shared/msr.h
@@ -12,4 +12,19 @@ struct msr {
};
};
+/*
+ * The kernel proper already defines rdmsr()/wrmsr(), but they are not for the
+ * boot kernel since they rely on tracepoint/exception handling infrastructure
+ * that's not available here.
+ */
+static inline void raw_rdmsr(unsigned int reg, struct msr *m)
+{ /* "c" passes reg in; the "a" and "d" outputs are written to m->l and m->h */
+ asm volatile("rdmsr" : "=a" (m->l), "=d" (m->h) : "c" (reg));
+}
+
+static inline void raw_wrmsr(unsigned int reg, const struct msr *m)
+{ /* "c" passes reg, "a" and "d" pass m->l and m->h; the "memory" clobber acts as a compiler barrier */
+ asm volatile("wrmsr" : : "c" (reg), "a"(m->l), "d" (m->h) : "memory");
+}
+
#endif /* _ASM_X86_SHARED_MSR_H */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 22bfebe6776d..84951572ab81 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -109,7 +109,7 @@ int common_cpu_up(unsigned int cpunum, struct task_struct *tidle);
int native_kick_ap(unsigned int cpu, struct task_struct *tidle);
int native_cpu_disable(void);
void __noreturn hlt_play_dead(void);
-void native_play_dead(void);
+void __noreturn native_play_dead(void);
void play_dead_common(void);
void wbinvd_on_cpu(int cpu);
void wbinvd_on_all_cpus(void);
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 17f6c3fedeee..0581c477d466 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -701,5 +701,6 @@ DEFINE_GHCB_ACCESSORS(sw_exit_info_1)
DEFINE_GHCB_ACCESSORS(sw_exit_info_2)
DEFINE_GHCB_ACCESSORS(sw_scratch)
DEFINE_GHCB_ACCESSORS(xcr0)
+DEFINE_GHCB_ACCESSORS(xss)
#endif
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 21041898157a..1fadf0cf520c 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -218,6 +218,12 @@ static inline unsigned int topology_amd_nodes_per_pkg(void)
return __amd_nodes_per_pkg;
}
+#else /* CONFIG_SMP */
+static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; }
+static inline int topology_max_smt_threads(void) { return 1; }
+static inline unsigned int topology_amd_nodes_per_pkg(void) { return 1; }
+#endif /* !CONFIG_SMP */
+
extern struct cpumask __cpu_primary_thread_mask;
#define cpu_primary_thread_mask ((const struct cpumask *)&__cpu_primary_thread_mask)
@@ -241,12 +247,6 @@ static inline bool topology_is_core_online(unsigned int cpu)
}
#define topology_is_core_online topology_is_core_online
-#else /* CONFIG_SMP */
-static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; }
-static inline int topology_max_smt_threads(void) { return 1; }
-static inline unsigned int topology_amd_nodes_per_pkg(void) { return 1; }
-#endif /* !CONFIG_SMP */
-
static inline void arch_fix_phys_package_id(int num, u32 slot)
{
}
@@ -325,4 +325,6 @@ static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled
extern void arch_scale_freq_tick(void);
#define arch_scale_freq_tick arch_scale_freq_tick
+extern int arch_sched_node_distance(int from, int to);
+
#endif /* _ASM_X86_TOPOLOGY_H */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 91a3fb8ae7ff..367297b188c3 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -528,18 +528,18 @@ static __must_check __always_inline bool user_access_begin(const void __user *pt
#define user_access_save() smap_save()
#define user_access_restore(x) smap_restore(x)
-#define unsafe_put_user(x, ptr, label) \
+#define arch_unsafe_put_user(x, ptr, label) \
__put_user_size((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), label)
#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
-#define unsafe_get_user(x, ptr, err_label) \
+#define arch_unsafe_get_user(x, ptr, err_label) \
do { \
__inttype(*(ptr)) __gu_val; \
__get_user_size(__gu_val, (ptr), sizeof(*(ptr)), err_label); \
(x) = (__force __typeof__(*(ptr)))__gu_val; \
} while (0)
#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT
-#define unsafe_get_user(x, ptr, err_label) \
+#define arch_unsafe_get_user(x, ptr, err_label) \
do { \
int __gu_err; \
__inttype(*(ptr)) __gu_val; \
@@ -618,11 +618,11 @@ do { \
} while (0)
#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
-#define __get_kernel_nofault(dst, src, type, err_label) \
+#define arch_get_kernel_nofault(dst, src, type, err_label) \
__get_user_size(*((type *)(dst)), (__force type __user *)(src), \
sizeof(type), err_label)
#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT
-#define __get_kernel_nofault(dst, src, type, err_label) \
+#define arch_get_kernel_nofault(dst, src, type, err_label) \
do { \
int __kr_err; \
\
@@ -633,7 +633,7 @@ do { \
} while (0)
#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT
-#define __put_kernel_nofault(dst, src, type, err_label) \
+#define arch_put_kernel_nofault(dst, src, type, err_label) \
__put_user_size(*((type *)(src)), (__force type __user *)(dst), \
sizeof(type), err_label)
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index c8a5ae35c871..641f45c22f9d 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -12,12 +12,12 @@
#include <asm/cpufeatures.h>
#include <asm/page.h>
#include <asm/percpu.h>
-#include <asm/runtime-const.h>
-/*
- * Virtual variable: there's no actual backing store for this,
- * it can purely be used as 'runtime_const_ptr(USER_PTR_MAX)'
- */
+#ifdef MODULE
+ #define runtime_const_ptr(sym) (sym)
+#else
+ #include <asm/runtime-const.h>
+#endif
extern unsigned long USER_PTR_MAX;
#ifdef CONFIG_ADDRESS_MASKING
diff --git a/arch/x86/include/asm/unwind_user.h b/arch/x86/include/asm/unwind_user.h
new file mode 100644
index 000000000000..12064284bc4e
--- /dev/null
+++ b/arch/x86/include/asm/unwind_user.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_UNWIND_USER_H
+#define _ASM_X86_UNWIND_USER_H
+
+#ifdef CONFIG_HAVE_UNWIND_USER_FP
+
+#include <asm/ptrace.h>
+#include <asm/uprobes.h>
+
+#define ARCH_INIT_USER_FP_FRAME(ws) \
+ .cfa_off = 2*(ws), \
+ .ra_off = -1*(ws), \
+ .fp_off = -2*(ws), \
+ .use_fp = true,
+
+#define ARCH_INIT_USER_FP_ENTRY_FRAME(ws) \
+ .cfa_off = 1*(ws), \
+ .ra_off = -1*(ws), \
+ .fp_off = 0, \
+ .use_fp = false,
+
+static inline int unwind_user_word_size(struct pt_regs *regs)
+{
+ /* We can't unwind VM86 stacks */
+ if (regs->flags & X86_VM_MASK)
+ return 0; /* zero word size for the VM86 case */
+#ifdef CONFIG_X86_64
+ if (!user_64bit_mode(regs))
+ return sizeof(int); /* 64-bit kernel, but regs is not in 64-bit user mode */
+#endif
+ return sizeof(long); /* every other case: the kernel's own long */
+}
+
+static inline bool unwind_user_at_function_start(struct pt_regs *regs)
+{
+ return is_uprobe_at_func_entry(regs); /* the answer comes entirely from the uprobes helper */
+}
+
+#endif /* CONFIG_HAVE_UNWIND_USER_FP */
+
+#endif /* _ASM_X86_UNWIND_USER_H */
diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 1ee2e5115955..362210c79998 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -62,4 +62,13 @@ struct arch_uprobe_task {
unsigned int saved_tf;
};
+#ifdef CONFIG_UPROBES
+extern bool is_uprobe_at_func_entry(struct pt_regs *regs);
+#else
+static inline bool is_uprobe_at_func_entry(struct pt_regs *regs)
+{ /* inline: a plain static defined in a header warns as unused in every file that never calls it */
+ return false; /* stub for builds without CONFIG_UPROBES */
+}
+#endif /* CONFIG_UPROBES */
+
#endif /* _ASM_UPROBES_H */
diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h
index 2dd35bbdc822..3c4d52072189 100644
--- a/arch/x86/include/uapi/asm/sgx.h
+++ b/arch/x86/include/uapi/asm/sgx.h
@@ -10,7 +10,7 @@
/**
* enum sgx_page_flags - page control flags
- * %SGX_PAGE_MEASURE: Measure the page contents with a sequence of
+ * @SGX_PAGE_MEASURE: Measure the page contents with a sequence of
* ENCLS[EEXTEND] operations.
*/
enum sgx_page_flags {
@@ -143,6 +143,12 @@ struct sgx_enclave_run;
/**
* typedef sgx_enclave_user_handler_t - Exit handler function accepted by
* __vdso_sgx_enter_enclave()
+ * @rdi: RDI at the time of EEXIT, undefined on AEX
+ * @rsi: RSI at the time of EEXIT, undefined on AEX
+ * @rdx: RDX at the time of EEXIT, undefined on AEX
+ * @rsp: RSP (untrusted) at the time of EEXIT or AEX
+ * @r8: R8 at the time of EEXIT, undefined on AEX
+ * @r9: R9 at the time of EEXIT, undefined on AEX
* @run: The run instance given by the caller
*
* The register parameters contain the snapshot of their values at enclave
@@ -166,7 +172,7 @@ typedef int (*sgx_enclave_user_handler_t)(long rdi, long rsi, long rdx,
* @exception_addr: The address that triggered the exception
* @user_handler: User provided callback run on exception
* @user_data: Data passed to the user handler
- * @reserved Reserved for future extensions
+ * @reserved: Reserved for future extensions
*
* If @user_handler is provided, the handler will be invoked on all return paths
* of the normal flow. The user handler may transfer control, e.g. via a
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 9792e329343e..1baa86dfe029 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -93,6 +93,7 @@
#define EXIT_REASON_TPAUSE 68
#define EXIT_REASON_BUS_LOCK 74
#define EXIT_REASON_NOTIFY 75
+#define EXIT_REASON_SEAMCALL 76
#define EXIT_REASON_TDCALL 77
#define EXIT_REASON_MSR_READ_IMM 84
#define EXIT_REASON_MSR_WRITE_IMM 85
diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c
index 0916f00a992e..e21419e686eb 100644
--- a/arch/x86/kernel/acpi/apei.c
+++ b/arch/x86/kernel/acpi/apei.c
@@ -19,6 +19,8 @@ int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data)
if (!cmc->enabled)
return 0;
+ mce_save_apei_thr_limit(cmc->notify.error_threshold_value);
+
/*
* We expect HEST to provide a list of MC banks that report errors
* in firmware first mode. Otherwise, return non-zero value to
diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
index 7047124490f6..d7c8ef1e354d 100644
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -196,7 +196,7 @@ int amd_detect_prefcore(bool *detected)
break;
}
- for_each_present_cpu(cpu) {
+ for_each_online_cpu(cpu) {
u32 tmp;
int ret;
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 5b09f89070f0..74f4c659f9c9 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -9,6 +9,7 @@
#include <asm/text-patching.h>
#include <asm/insn.h>
+#include <asm/insn-eval.h>
#include <asm/ibt.h>
#include <asm/set_memory.h>
#include <asm/nmi.h>
@@ -346,25 +347,6 @@ static void add_nop(u8 *buf, unsigned int len)
}
/*
- * Matches NOP and NOPL, not any of the other possible NOPs.
- */
-static bool insn_is_nop(struct insn *insn)
-{
- /* Anything NOP, but no REP NOP */
- if (insn->opcode.bytes[0] == 0x90 &&
- (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3))
- return true;
-
- /* NOPL */
- if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F)
- return true;
-
- /* TODO: more nops */
-
- return false;
-}
-
-/*
* Find the offset of the first non-NOP instruction starting at @offset
* but no further than @len.
*/
@@ -559,7 +541,7 @@ EXPORT_SYMBOL(BUG_func);
* Rewrite the "call BUG_func" replacement to point to the target of the
* indirect pv_ops call "call *disp(%ip)".
*/
-static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
+static unsigned int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
{
void *target, *bug = &BUG_func;
s32 disp;
@@ -643,7 +625,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
* order.
*/
for (a = start; a < end; a++) {
- int insn_buff_sz = 0;
+ unsigned int insn_buff_sz = 0;
/*
* In case of nested ALTERNATIVE()s the outer alternative might
@@ -683,11 +665,8 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
memcpy(insn_buff, replacement, a->replacementlen);
insn_buff_sz = a->replacementlen;
- if (a->flags & ALT_FLAG_DIRECT_CALL) {
+ if (a->flags & ALT_FLAG_DIRECT_CALL)
insn_buff_sz = alt_replace_call(instr, insn_buff, a);
- if (insn_buff_sz < 0)
- continue;
- }
for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
insn_buff[insn_buff_sz] = 0x90;
@@ -2244,21 +2223,34 @@ int alternatives_text_reserved(void *start, void *end)
* See entry_{32,64}.S for more details.
*/
-/*
- * We define the int3_magic() function in assembly to control the calling
- * convention such that we can 'call' it from assembly.
- */
+extern void int3_selftest_asm(unsigned int *ptr);
-extern void int3_magic(unsigned int *ptr); /* defined in asm */
+asm (
+" .pushsection .init.text, \"ax\", @progbits\n"
+" .type int3_selftest_asm, @function\n"
+"int3_selftest_asm:\n"
+ ANNOTATE_NOENDBR
+ /*
+ * INT3 padded with NOP to CALL_INSN_SIZE. The INT3 triggers an
+ * exception, then the int3_exception_nb notifier emulates a call to
+ * int3_selftest_callee().
+ */
+" int3; nop; nop; nop; nop\n"
+ ASM_RET
+" .size int3_selftest_asm, . - int3_selftest_asm\n"
+" .popsection\n"
+);
+
+extern void int3_selftest_callee(unsigned int *ptr);
asm (
" .pushsection .init.text, \"ax\", @progbits\n"
-" .type int3_magic, @function\n"
-"int3_magic:\n"
+" .type int3_selftest_callee, @function\n"
+"int3_selftest_callee:\n"
ANNOTATE_NOENDBR
-" movl $1, (%" _ASM_ARG1 ")\n"
+" movl $0x1234, (%" _ASM_ARG1 ")\n"
ASM_RET
-" .size int3_magic, .-int3_magic\n"
+" .size int3_selftest_callee, . - int3_selftest_callee\n"
" .popsection\n"
);
@@ -2267,7 +2259,7 @@ extern void int3_selftest_ip(void); /* defined in asm below */
static int __init
int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
{
- unsigned long selftest = (unsigned long)&int3_selftest_ip;
+ unsigned long selftest = (unsigned long)&int3_selftest_asm;
struct die_args *args = data;
struct pt_regs *regs = args->regs;
@@ -2282,7 +2274,7 @@ int3_exception_notify(struct notifier_block *self, unsigned long val, void *data
if (regs->ip - INT3_INSN_SIZE != selftest)
return NOTIFY_DONE;
- int3_emulate_call(regs, (unsigned long)&int3_magic);
+ int3_emulate_call(regs, (unsigned long)&int3_selftest_callee);
return NOTIFY_STOP;
}
@@ -2298,19 +2290,11 @@ static noinline void __init int3_selftest(void)
BUG_ON(register_die_notifier(&int3_exception_nb));
/*
- * Basically: int3_magic(&val); but really complicated :-)
- *
- * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb
- * notifier above will emulate CALL for us.
+ * Basically: int3_selftest_callee(&val); but really complicated :-)
*/
- asm volatile ("int3_selftest_ip:\n\t"
- ANNOTATE_NOENDBR
- " int3; nop; nop; nop; nop\n\t"
- : ASM_CALL_CONSTRAINT
- : __ASM_SEL_RAW(a, D) (&val)
- : "memory");
-
- BUG_ON(val != 1);
+ int3_selftest_asm(&val);
+
+ BUG_ON(val != 0x1234);
unregister_die_notifier(&int3_exception_nb);
}
diff --git a/arch/x86/kernel/amd_node.c b/arch/x86/kernel/amd_node.c
index a40176b62eb5..3d0a4768d603 100644
--- a/arch/x86/kernel/amd_node.c
+++ b/arch/x86/kernel/amd_node.c
@@ -34,62 +34,6 @@ struct pci_dev *amd_node_get_func(u16 node, u8 func)
return pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(AMD_NODE0_PCI_SLOT + node, func));
}
-#define DF_BLK_INST_CNT 0x040
-#define DF_CFG_ADDR_CNTL_LEGACY 0x084
-#define DF_CFG_ADDR_CNTL_DF4 0xC04
-
-#define DF_MAJOR_REVISION GENMASK(27, 24)
-
-static u16 get_cfg_addr_cntl_offset(struct pci_dev *df_f0)
-{
- u32 reg;
-
- /*
- * Revision fields added for DF4 and later.
- *
- * Major revision of '0' is found pre-DF4. Field is Read-as-Zero.
- */
- if (pci_read_config_dword(df_f0, DF_BLK_INST_CNT, &reg))
- return 0;
-
- if (reg & DF_MAJOR_REVISION)
- return DF_CFG_ADDR_CNTL_DF4;
-
- return DF_CFG_ADDR_CNTL_LEGACY;
-}
-
-struct pci_dev *amd_node_get_root(u16 node)
-{
- struct pci_dev *root;
- u16 cntl_off;
- u8 bus;
-
- if (!cpu_feature_enabled(X86_FEATURE_ZEN))
- return NULL;
-
- /*
- * D18F0xXXX [Config Address Control] (DF::CfgAddressCntl)
- * Bits [7:0] (SecBusNum) holds the bus number of the root device for
- * this Data Fabric instance. The segment, device, and function will be 0.
- */
- struct pci_dev *df_f0 __free(pci_dev_put) = amd_node_get_func(node, 0);
- if (!df_f0)
- return NULL;
-
- cntl_off = get_cfg_addr_cntl_offset(df_f0);
- if (!cntl_off)
- return NULL;
-
- if (pci_read_config_byte(df_f0, cntl_off, &bus))
- return NULL;
-
- /* Grab the pointer for the actual root device instance. */
- root = pci_get_domain_bus_and_slot(0, bus, 0);
-
- pci_dbg(root, "is root for AMD node %u\n", node);
- return root;
-}
-
static struct pci_dev **amd_roots;
/* Protect the PCI config register pairs used for SMN. */
@@ -274,51 +218,21 @@ DEFINE_SHOW_STORE_ATTRIBUTE(smn_node);
DEFINE_SHOW_STORE_ATTRIBUTE(smn_address);
DEFINE_SHOW_STORE_ATTRIBUTE(smn_value);
-static int amd_cache_roots(void)
-{
- u16 node, num_nodes = amd_num_nodes();
-
- amd_roots = kcalloc(num_nodes, sizeof(*amd_roots), GFP_KERNEL);
- if (!amd_roots)
- return -ENOMEM;
-
- for (node = 0; node < num_nodes; node++)
- amd_roots[node] = amd_node_get_root(node);
-
- return 0;
-}
-
-static int reserve_root_config_spaces(void)
+static struct pci_dev *get_next_root(struct pci_dev *root)
{
- struct pci_dev *root = NULL;
- struct pci_bus *bus = NULL;
-
- while ((bus = pci_find_next_bus(bus))) {
- /* Root device is Device 0 Function 0 on each Primary Bus. */
- root = pci_get_slot(bus, 0);
- if (!root)
+ while ((root = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, root))) {
+ /* Root device is Device 0 Function 0. */
+ if (root->devfn)
continue;
if (root->vendor != PCI_VENDOR_ID_AMD &&
root->vendor != PCI_VENDOR_ID_HYGON)
continue;
- pci_dbg(root, "Reserving PCI config space\n");
-
- /*
- * There are a few SMN index/data pairs and other registers
- * that shouldn't be accessed by user space.
- * So reserve the entire PCI config space for simplicity rather
- * than covering specific registers piecemeal.
- */
- if (!pci_request_config_region_exclusive(root, 0, PCI_CFG_SPACE_SIZE, NULL)) {
- pci_err(root, "Failed to reserve config space\n");
- return -EEXIST;
- }
+ break;
}
- smn_exclusive = true;
- return 0;
+ return root;
}
static bool enable_dfs;
@@ -332,7 +246,8 @@ __setup("amd_smn_debugfs_enable", amd_smn_enable_dfs);
static int __init amd_smn_init(void)
{
- int err;
+ u16 count, num_roots, roots_per_node, node, num_nodes;
+ struct pci_dev *root;
if (!cpu_feature_enabled(X86_FEATURE_ZEN))
return 0;
@@ -342,13 +257,48 @@ static int __init amd_smn_init(void)
if (amd_roots)
return 0;
- err = amd_cache_roots();
- if (err)
- return err;
+ num_roots = 0;
+ root = NULL;
+ while ((root = get_next_root(root))) {
+ pci_dbg(root, "Reserving PCI config space\n");
- err = reserve_root_config_spaces();
- if (err)
- return err;
+ /*
+ * There are a few SMN index/data pairs and other registers
+ * that shouldn't be accessed by user space. So reserve the
+ * entire PCI config space for simplicity rather than covering
+ * specific registers piecemeal.
+ */
+ if (!pci_request_config_region_exclusive(root, 0, PCI_CFG_SPACE_SIZE, NULL)) {
+ pci_err(root, "Failed to reserve config space\n");
+ return -EEXIST;
+ }
+
+ num_roots++;
+ }
+
+ pr_debug("Found %d AMD root devices\n", num_roots);
+
+ if (!num_roots)
+ return -ENODEV;
+
+ num_nodes = amd_num_nodes();
+ amd_roots = kcalloc(num_nodes, sizeof(*amd_roots), GFP_KERNEL);
+ if (!amd_roots)
+ return -ENOMEM;
+
+ roots_per_node = num_roots / num_nodes;
+
+ count = 0;
+ node = 0;
+ root = NULL;
+ while (node < num_nodes && (root = get_next_root(root))) {
+ /* Use one root for each node and skip the rest. */
+ if (count++ % roots_per_node)
+ continue;
+
+ pci_dbg(root, "is root for AMD node %u\n", node);
+ amd_roots[node++] = root;
+ }
if (enable_dfs) {
debugfs_dir = debugfs_create_dir("amd_smn", arch_debugfs_dir);
@@ -358,6 +308,8 @@ static int __init amd_smn_init(void)
debugfs_create_file("value", 0600, debugfs_dir, NULL, &smn_value_fops);
}
+ smn_exclusive = true;
+
return 0;
}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 680d305589a3..9c29e12b84e5 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -36,6 +36,7 @@
#include <linux/dmi.h>
#include <linux/smp.h>
#include <linux/mm.h>
+#include <linux/kvm_types.h>
#include <xen/xen.h>
@@ -173,6 +174,7 @@ static struct resource lapic_resource = {
.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
};
+/* Measured in timer ticks per jiffy (1/HZ seconds). */
unsigned int lapic_timer_period = 0;
static void apic_pm_activate(void);
@@ -792,6 +794,7 @@ static int __init calibrate_APIC_clock(void)
{
struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
u64 tsc_perj = 0, tsc_start = 0;
+ long delta_tsc_khz, bus_khz;
unsigned long jif_start;
unsigned long deltaj;
long delta, deltatsc;
@@ -894,14 +897,15 @@ static int __init calibrate_APIC_clock(void)
apic_pr_verbose("..... calibration result: %u\n", lapic_timer_period);
if (boot_cpu_has(X86_FEATURE_TSC)) {
- apic_pr_verbose("..... CPU clock speed is %ld.%04ld MHz.\n",
- (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ),
- (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ));
+ delta_tsc_khz = (deltatsc * HZ) / (1000 * LAPIC_CAL_LOOPS);
+
+ apic_pr_verbose("..... CPU clock speed is %ld.%03ld MHz.\n",
+ delta_tsc_khz / 1000, delta_tsc_khz % 1000);
}
- apic_pr_verbose("..... host bus clock speed is %u.%04u MHz.\n",
- lapic_timer_period / (1000000 / HZ),
- lapic_timer_period % (1000000 / HZ));
+ bus_khz = (long)lapic_timer_period * HZ / 1000;
+ apic_pr_verbose("..... host bus clock speed is %ld.%03ld MHz.\n",
+ bus_khz / 1000, bus_khz % 1000);
/*
* Do a sanity check on the APIC calibration result
@@ -2316,7 +2320,7 @@ u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid)
dest |= msg->arch_addr_hi.destid_8_31 << 8;
return dest;
}
-EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid);
+EXPORT_SYMBOL_FOR_KVM(x86_msi_msg_get_destid);
static void __init apic_bsp_up_setup(void)
{
diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c
index 9ef3be866832..2ed3b5c88c7f 100644
--- a/arch/x86/kernel/apic/apic_common.c
+++ b/arch/x86/kernel/apic/apic_common.c
@@ -4,6 +4,7 @@
* SPDX-License-Identifier: GPL-2.0
*/
#include <linux/irq.h>
+#include <linux/kvm_types.h>
#include <asm/apic.h>
#include "local.h"
@@ -25,7 +26,7 @@ u32 default_cpu_present_to_apicid(int mps_cpu)
else
return BAD_APICID;
}
-EXPORT_SYMBOL_GPL(default_cpu_present_to_apicid);
+EXPORT_SYMBOL_FOR_KVM(default_cpu_present_to_apicid);
/*
* Set up the logical destination ID when the APIC operates in logical
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 5ba2feb2c04c..1e0442e867b1 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2864,7 +2864,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
ioapic = mp_irqdomain_ioapic_idx(domain);
pin = info->ioapic.pin;
- if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0)
+ if (irq_resolve_mapping(domain, (irq_hw_number_t)pin))
return -EEXIST;
data = kzalloc(sizeof(*data), GFP_KERNEL);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 5398db4dedb4..bc94ff1e250a 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -3,7 +3,7 @@
#include <linux/bitops.h>
#include <linux/elf.h>
#include <linux/mm.h>
-
+#include <linux/kvm_types.h>
#include <linux/io.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
@@ -516,7 +516,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
setup_force_cpu_cap(X86_FEATURE_ZEN5);
break;
case 0x50 ... 0x5f:
- case 0x90 ... 0xaf:
+ case 0x80 ... 0xaf:
case 0xc0 ... 0xcf:
setup_force_cpu_cap(X86_FEATURE_ZEN6);
break;
@@ -1035,8 +1035,26 @@ static void init_amd_zen4(struct cpuinfo_x86 *c)
}
}
+static const struct x86_cpu_id zen5_rdseed_microcode[] = {
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x02, 0x1, 0x0b00215a),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x08, 0x1, 0x0b008121),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x11, 0x0, 0x0b101054),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x24, 0x0, 0x0b204037),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x44, 0x0, 0x0b404035),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x44, 0x1, 0x0b404108),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x60, 0x0, 0x0b600037),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x68, 0x0, 0x0b608038),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x70, 0x0, 0x0b700037),
+ {},
+};
+
static void init_amd_zen5(struct cpuinfo_x86 *c)
{
+ if (!x86_match_min_microcode_rev(zen5_rdseed_microcode)) {
+ clear_cpu_cap(c, X86_FEATURE_RDSEED);
+ msr_clear_bit(MSR_AMD64_CPUID_FN_7, 18);
+ pr_emerg_once("RDSEED32 is broken. Disabling the corresponding CPUID bit.\n");
+ }
}
static void init_amd(struct cpuinfo_x86 *c)
@@ -1300,7 +1318,7 @@ unsigned long amd_get_dr_addr_mask(unsigned int dr)
return per_cpu(amd_dr_addr_mask[dr], smp_processor_id());
}
-EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask);
+EXPORT_SYMBOL_FOR_KVM(amd_get_dr_addr_mask);
static void zenbleed_check_cpu(void *unused)
{
@@ -1355,11 +1373,23 @@ static __init int print_s5_reset_status_mmio(void)
return 0;
value = ioread32(addr);
- iounmap(addr);
/* Value with "all bits set" is an error response and should be ignored. */
- if (value == U32_MAX)
+ if (value == U32_MAX) {
+ iounmap(addr);
return 0;
+ }
+
+ /*
+ * Clear all reason bits so they won't be retained if the next reset
+ * does not update the register. Besides, some bits are never cleared by
+ * hardware so it's software's responsibility to clear them.
+ *
+ * Writing the value back effectively clears all reason bits as they are
+ * write-1-to-clear.
+ */
+ iowrite32(value, addr);
+ iounmap(addr);
for (i = 0; i < ARRAY_SIZE(s5_reset_reason_txt); i++) {
if (!(value & BIT(i)))
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 6a526ae1fe99..d8660770dc6a 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -16,6 +16,7 @@
#include <linux/sched/smt.h>
#include <linux/pgtable.h>
#include <linux/bpf.h>
+#include <linux/kvm_types.h>
#include <asm/spec-ctrl.h>
#include <asm/cmdline.h>
@@ -53,56 +54,8 @@
* mitigation option.
*/
-static void __init spectre_v1_select_mitigation(void);
-static void __init spectre_v1_apply_mitigation(void);
-static void __init spectre_v2_select_mitigation(void);
-static void __init spectre_v2_update_mitigation(void);
-static void __init spectre_v2_apply_mitigation(void);
-static void __init retbleed_select_mitigation(void);
-static void __init retbleed_update_mitigation(void);
-static void __init retbleed_apply_mitigation(void);
-static void __init spectre_v2_user_select_mitigation(void);
-static void __init spectre_v2_user_update_mitigation(void);
-static void __init spectre_v2_user_apply_mitigation(void);
-static void __init ssb_select_mitigation(void);
-static void __init ssb_apply_mitigation(void);
-static void __init l1tf_select_mitigation(void);
-static void __init l1tf_apply_mitigation(void);
-static void __init mds_select_mitigation(void);
-static void __init mds_update_mitigation(void);
-static void __init mds_apply_mitigation(void);
-static void __init taa_select_mitigation(void);
-static void __init taa_update_mitigation(void);
-static void __init taa_apply_mitigation(void);
-static void __init mmio_select_mitigation(void);
-static void __init mmio_update_mitigation(void);
-static void __init mmio_apply_mitigation(void);
-static void __init rfds_select_mitigation(void);
-static void __init rfds_update_mitigation(void);
-static void __init rfds_apply_mitigation(void);
-static void __init srbds_select_mitigation(void);
-static void __init srbds_apply_mitigation(void);
-static void __init l1d_flush_select_mitigation(void);
-static void __init srso_select_mitigation(void);
-static void __init srso_update_mitigation(void);
-static void __init srso_apply_mitigation(void);
-static void __init gds_select_mitigation(void);
-static void __init gds_apply_mitigation(void);
-static void __init bhi_select_mitigation(void);
-static void __init bhi_update_mitigation(void);
-static void __init bhi_apply_mitigation(void);
-static void __init its_select_mitigation(void);
-static void __init its_update_mitigation(void);
-static void __init its_apply_mitigation(void);
-static void __init tsa_select_mitigation(void);
-static void __init tsa_apply_mitigation(void);
-static void __init vmscape_select_mitigation(void);
-static void __init vmscape_update_mitigation(void);
-static void __init vmscape_apply_mitigation(void);
-
/* The base value of the SPEC_CTRL MSR without task-specific bits set */
u64 x86_spec_ctrl_base;
-EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
/* The current value of the SPEC_CTRL MSR with task-specific bits set */
DEFINE_PER_CPU(u64, x86_spec_ctrl_current);
@@ -179,7 +132,7 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
/* Control IBPB on vCPU load */
DEFINE_STATIC_KEY_FALSE(switch_vcpu_ibpb);
-EXPORT_SYMBOL_GPL(switch_vcpu_ibpb);
+EXPORT_SYMBOL_FOR_KVM(switch_vcpu_ibpb);
/* Control CPU buffer clear before idling (halt, mwait) */
DEFINE_STATIC_KEY_FALSE(cpu_buf_idle_clear);
@@ -198,7 +151,7 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
* mitigation is required.
*/
DEFINE_STATIC_KEY_FALSE(cpu_buf_vm_clear);
-EXPORT_SYMBOL_GPL(cpu_buf_vm_clear);
+EXPORT_SYMBOL_FOR_KVM(cpu_buf_vm_clear);
#undef pr_fmt
#define pr_fmt(fmt) "mitigations: " fmt
@@ -233,99 +186,6 @@ static void __init cpu_print_attack_vectors(void)
}
}
-void __init cpu_select_mitigations(void)
-{
- /*
- * Read the SPEC_CTRL MSR to account for reserved bits which may
- * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD
- * init code as it is not enumerated and depends on the family.
- */
- if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) {
- rdmsrq(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
-
- /*
- * Previously running kernel (kexec), may have some controls
- * turned ON. Clear them and let the mitigations setup below
- * rediscover them based on configuration.
- */
- x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK;
- }
-
- x86_arch_cap_msr = x86_read_arch_cap_msr();
-
- cpu_print_attack_vectors();
-
- /* Select the proper CPU mitigations before patching alternatives: */
- spectre_v1_select_mitigation();
- spectre_v2_select_mitigation();
- retbleed_select_mitigation();
- spectre_v2_user_select_mitigation();
- ssb_select_mitigation();
- l1tf_select_mitigation();
- mds_select_mitigation();
- taa_select_mitigation();
- mmio_select_mitigation();
- rfds_select_mitigation();
- srbds_select_mitigation();
- l1d_flush_select_mitigation();
- srso_select_mitigation();
- gds_select_mitigation();
- its_select_mitigation();
- bhi_select_mitigation();
- tsa_select_mitigation();
- vmscape_select_mitigation();
-
- /*
- * After mitigations are selected, some may need to update their
- * choices.
- */
- spectre_v2_update_mitigation();
- /*
- * retbleed_update_mitigation() relies on the state set by
- * spectre_v2_update_mitigation(); specifically it wants to know about
- * spectre_v2=ibrs.
- */
- retbleed_update_mitigation();
- /*
- * its_update_mitigation() depends on spectre_v2_update_mitigation()
- * and retbleed_update_mitigation().
- */
- its_update_mitigation();
-
- /*
- * spectre_v2_user_update_mitigation() depends on
- * retbleed_update_mitigation(), specifically the STIBP
- * selection is forced for UNRET or IBPB.
- */
- spectre_v2_user_update_mitigation();
- mds_update_mitigation();
- taa_update_mitigation();
- mmio_update_mitigation();
- rfds_update_mitigation();
- bhi_update_mitigation();
- /* srso_update_mitigation() depends on retbleed_update_mitigation(). */
- srso_update_mitigation();
- vmscape_update_mitigation();
-
- spectre_v1_apply_mitigation();
- spectre_v2_apply_mitigation();
- retbleed_apply_mitigation();
- spectre_v2_user_apply_mitigation();
- ssb_apply_mitigation();
- l1tf_apply_mitigation();
- mds_apply_mitigation();
- taa_apply_mitigation();
- mmio_apply_mitigation();
- rfds_apply_mitigation();
- srbds_apply_mitigation();
- srso_apply_mitigation();
- gds_apply_mitigation();
- its_apply_mitigation();
- bhi_apply_mitigation();
- tsa_apply_mitigation();
- vmscape_apply_mitigation();
-}
-
/*
* NOTE: This function is *only* called for SVM, since Intel uses
* MSR_IA32_SPEC_CTRL for SSBD.
@@ -366,7 +226,7 @@ x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool setguest)
speculation_ctrl_update(tif);
}
}
-EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl);
+EXPORT_SYMBOL_FOR_KVM(x86_virt_spec_ctrl);
static void x86_amd_ssb_disable(void)
{
@@ -1032,7 +892,7 @@ bool gds_ucode_mitigated(void)
return (gds_mitigation == GDS_MITIGATION_FULL ||
gds_mitigation == GDS_MITIGATION_FULL_LOCKED);
}
-EXPORT_SYMBOL_GPL(gds_ucode_mitigated);
+EXPORT_SYMBOL_FOR_KVM(gds_ucode_mitigated);
void update_gds_msr(void)
{
@@ -1463,7 +1323,9 @@ static void __init retbleed_update_mitigation(void)
break;
default:
if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) {
- pr_err(RETBLEED_INTEL_MSG);
+ if (retbleed_mitigation != RETBLEED_MITIGATION_NONE)
+ pr_err(RETBLEED_INTEL_MSG);
+
retbleed_mitigation = RETBLEED_MITIGATION_NONE;
}
}
@@ -1825,13 +1687,6 @@ void unpriv_ebpf_notify(int new_state)
}
#endif
-static inline bool match_option(const char *arg, int arglen, const char *opt)
-{
- int len = strlen(opt);
-
- return len == arglen && !strncmp(arg, opt, len);
-}
-
/* The kernel command line selection for spectre v2 */
enum spectre_v2_mitigation_cmd {
SPECTRE_V2_CMD_NONE,
@@ -2864,7 +2719,7 @@ void x86_spec_ctrl_setup_ap(void)
}
bool itlb_multihit_kvm_mitigation;
-EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
+EXPORT_SYMBOL_FOR_KVM(itlb_multihit_kvm_mitigation);
#undef pr_fmt
#define pr_fmt(fmt) "L1TF: " fmt
@@ -2872,11 +2727,9 @@ EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
/* Default mitigation for L1TF-affected CPUs */
enum l1tf_mitigations l1tf_mitigation __ro_after_init =
IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_AUTO : L1TF_MITIGATION_OFF;
-#if IS_ENABLED(CONFIG_KVM_INTEL)
-EXPORT_SYMBOL_GPL(l1tf_mitigation);
-#endif
+EXPORT_SYMBOL_FOR_KVM(l1tf_mitigation);
enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
-EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation);
+EXPORT_SYMBOL_FOR_KVM(l1tf_vmx_mitigation);
/*
* These CPUs all support 44bits physical address space internally in the
@@ -3376,6 +3229,99 @@ void cpu_bugs_smt_update(void)
mutex_unlock(&spec_ctrl_mutex);
}
+void __init cpu_select_mitigations(void)
+{
+ /*
+ * Read the SPEC_CTRL MSR to account for reserved bits which may
+ * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD
+ * init code as it is not enumerated and depends on the family.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) {
+ rdmsrq(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+
+ /*
+ * Previously running kernel (kexec), may have some controls
+ * turned ON. Clear them and let the mitigations setup below
+ * rediscover them based on configuration.
+ */
+ x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK;
+ }
+
+ x86_arch_cap_msr = x86_read_arch_cap_msr();
+
+ cpu_print_attack_vectors();
+
+ /* Select the proper CPU mitigations before patching alternatives: */
+ spectre_v1_select_mitigation();
+ spectre_v2_select_mitigation();
+ retbleed_select_mitigation();
+ spectre_v2_user_select_mitigation();
+ ssb_select_mitigation();
+ l1tf_select_mitigation();
+ mds_select_mitigation();
+ taa_select_mitigation();
+ mmio_select_mitigation();
+ rfds_select_mitigation();
+ srbds_select_mitigation();
+ l1d_flush_select_mitigation();
+ srso_select_mitigation();
+ gds_select_mitigation();
+ its_select_mitigation();
+ bhi_select_mitigation();
+ tsa_select_mitigation();
+ vmscape_select_mitigation();
+
+ /*
+ * After mitigations are selected, some may need to update their
+ * choices.
+ */
+ spectre_v2_update_mitigation();
+ /*
+ * retbleed_update_mitigation() relies on the state set by
+ * spectre_v2_update_mitigation(); specifically it wants to know about
+ * spectre_v2=ibrs.
+ */
+ retbleed_update_mitigation();
+ /*
+ * its_update_mitigation() depends on spectre_v2_update_mitigation()
+ * and retbleed_update_mitigation().
+ */
+ its_update_mitigation();
+
+ /*
+ * spectre_v2_user_update_mitigation() depends on
+ * retbleed_update_mitigation(), specifically the STIBP
+ * selection is forced for UNRET or IBPB.
+ */
+ spectre_v2_user_update_mitigation();
+ mds_update_mitigation();
+ taa_update_mitigation();
+ mmio_update_mitigation();
+ rfds_update_mitigation();
+ bhi_update_mitigation();
+ /* srso_update_mitigation() depends on retbleed_update_mitigation(). */
+ srso_update_mitigation();
+ vmscape_update_mitigation();
+
+ spectre_v1_apply_mitigation();
+ spectre_v2_apply_mitigation();
+ retbleed_apply_mitigation();
+ spectre_v2_user_apply_mitigation();
+ ssb_apply_mitigation();
+ l1tf_apply_mitigation();
+ mds_apply_mitigation();
+ taa_apply_mitigation();
+ mmio_apply_mitigation();
+ rfds_apply_mitigation();
+ srbds_apply_mitigation();
+ srso_apply_mitigation();
+ gds_apply_mitigation();
+ its_apply_mitigation();
+ bhi_apply_mitigation();
+ tsa_apply_mitigation();
+ vmscape_apply_mitigation();
+}
+
#ifdef CONFIG_SYSFS
#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion"
diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c
index 981f8b1f0792..dbc99a47be45 100644
--- a/arch/x86/kernel/cpu/bus_lock.c
+++ b/arch/x86/kernel/cpu/bus_lock.c
@@ -6,6 +6,7 @@
#include <linux/workqueue.h>
#include <linux/delay.h>
#include <linux/cpuhotplug.h>
+#include <linux/kvm_types.h>
#include <asm/cpu_device_id.h>
#include <asm/cmdline.h>
#include <asm/traps.h>
@@ -289,7 +290,7 @@ bool handle_guest_split_lock(unsigned long ip)
force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
return false;
}
-EXPORT_SYMBOL_GPL(handle_guest_split_lock);
+EXPORT_SYMBOL_FOR_KVM(handle_guest_split_lock);
void bus_lock_init(void)
{
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d01dd88fae7d..e7ab22fce3b5 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -7,6 +7,7 @@
#include <linux/bitops.h>
#include <linux/kernel.h>
#include <linux/export.h>
+#include <linux/kvm_types.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/ctype.h>
@@ -78,6 +79,10 @@
DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
EXPORT_PER_CPU_SYMBOL(cpu_info);
+/* Used for modules: built-in code uses runtime constants */
+unsigned long USER_PTR_MAX;
+EXPORT_SYMBOL(USER_PTR_MAX);
+
u32 elf_hwcap2 __read_mostly;
/* Number of siblings per CPU package */
@@ -482,14 +487,14 @@ void cr4_update_irqsoff(unsigned long set, unsigned long clear)
__write_cr4(newval);
}
}
-EXPORT_SYMBOL(cr4_update_irqsoff);
+EXPORT_SYMBOL_FOR_KVM(cr4_update_irqsoff);
/* Read the CR4 shadow. */
unsigned long cr4_read_shadow(void)
{
return this_cpu_read(cpu_tlbstate.cr4);
}
-EXPORT_SYMBOL_GPL(cr4_read_shadow);
+EXPORT_SYMBOL_FOR_KVM(cr4_read_shadow);
void cr4_init(void)
{
@@ -744,7 +749,7 @@ void load_direct_gdt(int cpu)
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);
}
-EXPORT_SYMBOL_GPL(load_direct_gdt);
+EXPORT_SYMBOL_FOR_KVM(load_direct_gdt);
/* Load a fixmap remapping of the per-cpu GDT */
void load_fixmap_gdt(int cpu)
@@ -2597,7 +2602,7 @@ void __init arch_cpu_finalize_init(void)
alternative_instructions();
if (IS_ENABLED(CONFIG_X86_64)) {
- unsigned long USER_PTR_MAX = TASK_SIZE_MAX;
+ USER_PTR_MAX = TASK_SIZE_MAX;
/*
* Enable this when LAM is gated on LASS support
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index bc38b2d56f26..5c7a3a71191a 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -42,15 +42,6 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
*const __x86_cpu_dev_end[];
#ifdef CONFIG_CPU_SUP_INTEL
-enum tsx_ctrl_states {
- TSX_CTRL_ENABLE,
- TSX_CTRL_DISABLE,
- TSX_CTRL_RTM_ALWAYS_ABORT,
- TSX_CTRL_NOT_SUPPORTED,
-};
-
-extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state;
-
extern void __init tsx_init(void);
void tsx_ap_init(void);
void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index 98d0cdd82574..146f6f8b0650 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -72,6 +72,7 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC },
{ X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_TOTAL },
{ X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_LOCAL },
+ { X86_FEATURE_SDCIAE, X86_FEATURE_CAT_L3 },
{ X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL },
{ X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW },
{ X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES },
@@ -79,6 +80,7 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_SGX_LC, X86_FEATURE_SGX },
{ X86_FEATURE_SGX1, X86_FEATURE_SGX },
{ X86_FEATURE_SGX2, X86_FEATURE_SGX1 },
+ { X86_FEATURE_SGX_EUPDATESVN, X86_FEATURE_SGX1 },
{ X86_FEATURE_SGX_EDECCSSA, X86_FEATURE_SGX1 },
{ X86_FEATURE_XFD, X86_FEATURE_XSAVES },
{ X86_FEATURE_XFD, X86_FEATURE_XGETBV1 },
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index d6906442f49b..3f1dda355307 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -43,9 +43,6 @@
/* Deferred error settings */
#define MSR_CU_DEF_ERR 0xC0000410
#define MASK_DEF_LVTOFF 0x000000F0
-#define MASK_DEF_INT_TYPE 0x00000006
-#define DEF_LVT_OFF 0x2
-#define DEF_INT_TYPE_APIC 0x2
/* Scalable MCA: */
@@ -54,6 +51,17 @@
static bool thresholding_irq_en;
+struct mce_amd_cpu_data {
+ mce_banks_t thr_intr_banks;
+ mce_banks_t dfr_intr_banks;
+
+ u32 thr_intr_en: 1,
+ dfr_intr_en: 1,
+ __resv: 30;
+};
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct mce_amd_cpu_data, mce_amd_data);
+
static const char * const th_names[] = {
"load_store",
"insn_fetch",
@@ -79,6 +87,8 @@ struct smca_bank {
const struct smca_hwid *hwid;
u32 id; /* Value of MCA_IPID[InstanceId]. */
u8 sysfs_id; /* Value used for sysfs name. */
+ u64 paddrv :1, /* Physical Address Valid bit in MCA_CONFIG */
+ __reserved :63;
};
static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks);
@@ -264,6 +274,7 @@ void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
static void smca_configure(unsigned int bank, unsigned int cpu)
{
+ struct mce_amd_cpu_data *data = this_cpu_ptr(&mce_amd_data);
u8 *bank_counts = this_cpu_ptr(smca_bank_counts);
const struct smca_hwid *s_hwid;
unsigned int i, hwid_mcatype;
@@ -294,11 +305,33 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
* APIC based interrupt. First, check that no interrupt has been
* set.
*/
- if ((low & BIT(5)) && !((high >> 5) & 0x3))
+ if ((low & BIT(5)) && !((high >> 5) & 0x3) && data->dfr_intr_en) {
+ __set_bit(bank, data->dfr_intr_banks);
high |= BIT(5);
+ }
+
+ /*
+ * SMCA Corrected Error Interrupt
+ *
+ * MCA_CONFIG[IntPresent] is bit 10, and tells us if the bank can
+ * send an MCA Thresholding interrupt without the OS initializing
+ * this feature. This can be used if the threshold limit is managed
+ * by the platform.
+ *
+ * MCA_CONFIG[IntEn] is bit 40 (8 in the high portion of the MSR).
+ * The OS should set this to inform the platform that the OS is ready
+ * to handle the MCA Thresholding interrupt.
+ */
+ if ((low & BIT(10)) && data->thr_intr_en) {
+ __set_bit(bank, data->thr_intr_banks);
+ high |= BIT(8);
+ }
this_cpu_ptr(mce_banks_array)[bank].lsb_in_status = !!(low & BIT(8));
+ if (low & MCI_CONFIG_PADDRV)
+ this_cpu_ptr(smca_banks)[bank].paddrv = 1;
+
wrmsr(smca_config, low, high);
}
@@ -368,6 +401,14 @@ static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
{
int msr = (hi & MASK_LVTOFF_HI) >> 20;
+ /*
+ * On SMCA CPUs, LVT offset is programmed at a different MSR, and
+ * the BIOS provides the value. The original field where LVT offset
+ * was set is reserved. Return early here:
+ */
+ if (mce_flags.smca)
+ return false;
+
if (apic < 0) {
pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
"for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
@@ -376,14 +417,6 @@ static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
}
if (apic != msr) {
- /*
- * On SMCA CPUs, LVT offset is programmed at a different MSR, and
- * the BIOS provides the value. The original field where LVT offset
- * was set is reserved. Return early here:
- */
- if (mce_flags.smca)
- return false;
-
pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
"for bank %d, block %d (MSR%08X=0x%x%08x)\n",
b->cpu, apic, b->bank, b->block, b->address, hi, lo);
@@ -443,6 +476,36 @@ static void threshold_restart_block(void *_tr)
wrmsr(tr->b->address, lo, hi);
}
+static void threshold_restart_bank(unsigned int bank, bool intr_en)
+{
+ struct threshold_bank **thr_banks = this_cpu_read(threshold_banks);
+ struct threshold_block *block, *tmp;
+ struct thresh_restart tr;
+
+ if (!thr_banks || !thr_banks[bank])
+ return;
+
+ memset(&tr, 0, sizeof(tr));
+
+ list_for_each_entry_safe(block, tmp, &thr_banks[bank]->miscj, miscj) {
+ tr.b = block;
+ tr.b->interrupt_enable = intr_en;
+ threshold_restart_block(&tr);
+ }
+}
+
+/* Try to use the threshold limit reported through APEI. */
+static u16 get_thr_limit(void)
+{
+ u32 thr_limit = mce_get_apei_thr_limit();
+
+ /* Fallback to old default if APEI limit is not available. */
+ if (!thr_limit)
+ return THRESHOLD_MAX;
+
+ return min(thr_limit, THRESHOLD_MAX);
+}
+
static void mce_threshold_block_init(struct threshold_block *b, int offset)
{
struct thresh_restart tr = {
@@ -451,7 +514,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset)
.lvt_off = offset,
};
- b->threshold_limit = THRESHOLD_MAX;
+ b->threshold_limit = get_thr_limit();
threshold_restart_block(&tr);
};
@@ -464,41 +527,6 @@ static int setup_APIC_mce_threshold(int reserved, int new)
return reserved;
}
-static int setup_APIC_deferred_error(int reserved, int new)
-{
- if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR,
- APIC_EILVT_MSG_FIX, 0))
- return new;
-
- return reserved;
-}
-
-static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
-{
- u32 low = 0, high = 0;
- int def_offset = -1, def_new;
-
- if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high))
- return;
-
- def_new = (low & MASK_DEF_LVTOFF) >> 4;
- if (!(low & MASK_DEF_LVTOFF)) {
- pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n");
- def_new = DEF_LVT_OFF;
- low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4);
- }
-
- def_offset = setup_APIC_deferred_error(def_offset, def_new);
- if ((def_offset == def_new) &&
- (deferred_error_int_vector != amd_deferred_error_interrupt))
- deferred_error_int_vector = amd_deferred_error_interrupt;
-
- if (!mce_flags.smca)
- low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;
-
- wrmsr(MSR_CU_DEF_ERR, low, high);
-}
-
static u32 get_block_address(u32 current_addr, u32 low, u32 high,
unsigned int bank, unsigned int block,
unsigned int cpu)
@@ -534,12 +562,10 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high,
return addr;
}
-static int
-prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
- int offset, u32 misc_high)
+static int prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
+ int offset, u32 misc_high)
{
unsigned int cpu = smp_processor_id();
- u32 smca_low, smca_high;
struct threshold_block b;
int new;
@@ -556,20 +582,13 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
if (!b.interrupt_capable)
goto done;
+ __set_bit(bank, this_cpu_ptr(&mce_amd_data)->thr_intr_banks);
b.interrupt_enable = 1;
- if (!mce_flags.smca) {
- new = (misc_high & MASK_LVTOFF_HI) >> 20;
- goto set_offset;
- }
-
- /* Gather LVT offset for thresholding: */
- if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high))
- goto out;
-
- new = (smca_low & SMCA_THR_LVT_OFF) >> 12;
+ if (mce_flags.smca)
+ goto done;
-set_offset:
+ new = (misc_high & MASK_LVTOFF_HI) >> 20;
offset = setup_APIC_mce_threshold(offset, new);
if (offset == new)
thresholding_irq_en = true;
@@ -577,7 +596,6 @@ set_offset:
done:
mce_threshold_block_init(&b, offset);
-out:
return offset;
}
@@ -668,6 +686,32 @@ static void amd_apply_cpu_quirks(struct cpuinfo_x86 *c)
mce_banks[0].ctl = 0;
}
+/*
+ * Enable the APIC LVT interrupt vectors once per-CPU. This should be done before hardware is
+ * ready to send interrupts.
+ *
+ * Individual error sources are enabled later during per-bank init.
+ */
+static void smca_enable_interrupt_vectors(void)
+{
+ struct mce_amd_cpu_data *data = this_cpu_ptr(&mce_amd_data);
+ u64 mca_intr_cfg, offset;
+
+ if (!mce_flags.smca || !mce_flags.succor)
+ return;
+
+ if (rdmsrq_safe(MSR_CU_DEF_ERR, &mca_intr_cfg))
+ return;
+
+ offset = (mca_intr_cfg & SMCA_THR_LVT_OFF) >> 12;
+ if (!setup_APIC_eilvt(offset, THRESHOLD_APIC_VECTOR, APIC_EILVT_MSG_FIX, 0))
+ data->thr_intr_en = 1;
+
+ offset = (mca_intr_cfg & MASK_DEF_LVTOFF) >> 4;
+ if (!setup_APIC_eilvt(offset, DEFERRED_ERROR_VECTOR, APIC_EILVT_MSG_FIX, 0))
+ data->dfr_intr_en = 1;
+}
+
/* cpu init entry point, called from mce.c with preempt off */
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
@@ -679,10 +723,16 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
mce_flags.amd_threshold = 1;
+ smca_enable_interrupt_vectors();
+
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
- if (mce_flags.smca)
+ if (mce_flags.smca) {
smca_configure(bank, cpu);
+ if (!this_cpu_ptr(&mce_amd_data)->thr_intr_en)
+ continue;
+ }
+
disable_err_thresholding(c, bank);
for (block = 0; block < NR_BLOCKS; ++block) {
@@ -703,9 +753,6 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
offset = prepare_threshold_block(bank, block, address, offset, high);
}
}
-
- if (mce_flags.succor)
- deferred_error_interrupt_enable(c);
}
void smca_bsp_init(void)
@@ -748,9 +795,9 @@ bool amd_mce_is_memory_error(struct mce *m)
}
/*
- * AMD systems do not have an explicit indicator that the value in MCA_ADDR is
- * a system physical address. Therefore, individual cases need to be detected.
- * Future cases and checks will be added as needed.
+ * Some AMD systems have an explicit indicator that the value in MCA_ADDR is a
+ * system physical address. On other systems, individual cases need to be
+ * detected. Future cases will be added as needed.
*
* 1) General case
* a) Assume address is not usable.
@@ -764,6 +811,8 @@ bool amd_mce_is_memory_error(struct mce *m)
* a) Reported in legacy bank 4 with extended error code (XEC) 8.
* b) MCA_STATUS[43] is *not* defined as poison in legacy bank 4. Therefore,
* this bit should not be checked.
+ * 4) MCI_STATUS_PADDRV is set
+ * a) Will provide a valid system physical address.
*
* NOTE: SMCA UMC memory errors fall into case #1.
*/
@@ -777,6 +826,9 @@ bool amd_mce_usable_address(struct mce *m)
return false;
}
+ if (this_cpu_ptr(smca_banks)[m->bank].paddrv)
+ return m->status & MCI_STATUS_PADDRV;
+
/* Check poison bit for all other bank types. */
if (m->status & MCI_STATUS_POISON)
return true;
@@ -785,37 +837,6 @@ bool amd_mce_usable_address(struct mce *m)
return false;
}
-static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
-{
- struct mce_hw_err err;
- struct mce *m = &err.m;
-
- mce_prep_record(&err);
-
- m->status = status;
- m->misc = misc;
- m->bank = bank;
- m->tsc = rdtsc();
-
- if (m->status & MCI_STATUS_ADDRV) {
- m->addr = addr;
-
- smca_extract_err_addr(m);
- }
-
- if (mce_flags.smca) {
- rdmsrq(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid);
-
- if (m->status & MCI_STATUS_SYNDV) {
- rdmsrq(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd);
- rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1);
- rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2);
- }
- }
-
- mce_log(&err);
-}
-
DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
{
trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
@@ -825,103 +846,20 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
apic_eoi();
}
-/*
- * Returns true if the logged error is deferred. False, otherwise.
- */
-static inline bool
-_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
-{
- u64 status, addr = 0;
-
- rdmsrq(msr_stat, status);
- if (!(status & MCI_STATUS_VAL))
- return false;
-
- if (status & MCI_STATUS_ADDRV)
- rdmsrq(msr_addr, addr);
-
- __log_error(bank, status, addr, misc);
-
- wrmsrq(msr_stat, 0);
-
- return status & MCI_STATUS_DEFERRED;
-}
-
-static bool _log_error_deferred(unsigned int bank, u32 misc)
-{
- if (!_log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
- mca_msr_reg(bank, MCA_ADDR), misc))
- return false;
-
- /*
- * Non-SMCA systems don't have MCA_DESTAT/MCA_DEADDR registers.
- * Return true here to avoid accessing these registers.
- */
- if (!mce_flags.smca)
- return true;
-
- /* Clear MCA_DESTAT if the deferred error was logged from MCA_STATUS. */
- wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
- return true;
-}
-
-/*
- * We have three scenarios for checking for Deferred errors:
- *
- * 1) Non-SMCA systems check MCA_STATUS and log error if found.
- * 2) SMCA systems check MCA_STATUS. If error is found then log it and also
- * clear MCA_DESTAT.
- * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and
- * log it.
- */
-static void log_error_deferred(unsigned int bank)
-{
- if (_log_error_deferred(bank, 0))
- return;
-
- /*
- * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check
- * for a valid error.
- */
- _log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank),
- MSR_AMD64_SMCA_MCx_DEADDR(bank), 0);
-}
-
/* APIC interrupt handler for deferred errors */
static void amd_deferred_error_interrupt(void)
{
- unsigned int bank;
-
- for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank)
- log_error_deferred(bank);
+ machine_check_poll(MCP_TIMESTAMP, &this_cpu_ptr(&mce_amd_data)->dfr_intr_banks);
}
-static void log_error_thresholding(unsigned int bank, u64 misc)
+void mce_amd_handle_storm(unsigned int bank, bool on)
{
- _log_error_deferred(bank, misc);
+ threshold_restart_bank(bank, on);
}
-static void log_and_reset_block(struct threshold_block *block)
+static void amd_reset_thr_limit(unsigned int bank)
{
- struct thresh_restart tr;
- u32 low = 0, high = 0;
-
- if (!block)
- return;
-
- if (rdmsr_safe(block->address, &low, &high))
- return;
-
- if (!(high & MASK_OVERFLOW_HI))
- return;
-
- /* Log the MCE which caused the threshold event. */
- log_error_thresholding(block->bank, ((u64)high << 32) | low);
-
- /* Reset threshold block after logging error. */
- memset(&tr, 0, sizeof(tr));
- tr.b = block;
- threshold_restart_block(&tr);
+ threshold_restart_bank(bank, true);
}
/*
@@ -930,33 +868,21 @@ static void log_and_reset_block(struct threshold_block *block)
*/
static void amd_threshold_interrupt(void)
{
- struct threshold_bank **bp = this_cpu_read(threshold_banks), *thr_bank;
- unsigned int bank, cpu = smp_processor_id();
- struct threshold_block *block, *tmp;
-
- /*
- * Validate that the threshold bank has been initialized already. The
- * handler is installed at boot time, but on a hotplug event the
- * interrupt might fire before the data has been initialized.
- */
- if (!bp)
- return;
-
- for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
- if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank)))
- continue;
-
- thr_bank = bp[bank];
- if (!thr_bank)
- continue;
-
- list_for_each_entry_safe(block, tmp, &thr_bank->miscj, miscj)
- log_and_reset_block(block);
- }
+ machine_check_poll(MCP_TIMESTAMP, &this_cpu_ptr(&mce_amd_data)->thr_intr_banks);
}
void amd_clear_bank(struct mce *m)
{
+ amd_reset_thr_limit(m->bank);
+
+ /* Clear MCA_DESTAT for all deferred errors, even those logged in MCA_STATUS. */
+ if (m->status & MCI_STATUS_DEFERRED)
+ mce_wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank), 0);
+
+ /* Don't clear MCA_STATUS if MCA_DESTAT was used exclusively. */
+ if (m->kflags & MCE_CHECK_DFR_REGS)
+ return;
+
mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0);
}
@@ -1172,7 +1098,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb
b->address = address;
b->interrupt_enable = 0;
b->interrupt_capable = lvt_interrupt_supported(bank, high);
- b->threshold_limit = THRESHOLD_MAX;
+ b->threshold_limit = get_thr_limit();
if (b->interrupt_capable) {
default_attrs[2] = &interrupt_enable.attr;
@@ -1183,6 +1109,8 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb
list_add(&b->miscj, &tb->miscj);
+ mce_threshold_block_init(b, (high & MASK_LVTOFF_HI) >> 20);
+
err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b));
if (err)
goto out_free;
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 460e90a1a0b1..4aff14e04287 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -687,7 +687,10 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
m->misc = mce_rdmsrq(mca_msr_reg(i, MCA_MISC));
if (m->status & MCI_STATUS_ADDRV) {
- m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR));
+ if (m->kflags & MCE_CHECK_DFR_REGS)
+ m->addr = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DEADDR(i));
+ else
+ m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR));
/*
* Mask the reported address by the reported granularity.
@@ -715,6 +718,29 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
DEFINE_PER_CPU(unsigned, mce_poll_count);
/*
+ * We have three scenarios for checking for Deferred errors:
+ *
+ * 1) Non-SMCA systems check MCA_STATUS and log error if found.
+ * 2) SMCA systems check MCA_STATUS. If error is found then log it and also
+ * clear MCA_DESTAT.
+ * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and
+ * log it.
+ */
+static bool smca_should_log_poll_error(struct mce *m)
+{
+ if (m->status & MCI_STATUS_VAL)
+ return true;
+
+ m->status = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank));
+ if ((m->status & MCI_STATUS_VAL) && (m->status & MCI_STATUS_DEFERRED)) {
+ m->kflags |= MCE_CHECK_DFR_REGS;
+ return true;
+ }
+
+ return false;
+}
+
+/*
* Newer Intel systems that support software error
* recovery need to make additional checks. Other
* CPUs should skip over uncorrected errors, but log
@@ -740,6 +766,9 @@ static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err)
{
struct mce *m = &err->m;
+ if (mce_flags.smca)
+ return smca_should_log_poll_error(m);
+
/* If this entry is not valid, ignore it. */
if (!(m->status & MCI_STATUS_VAL))
return false;
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index b0e00ec5cc8c..a31cf984619c 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -67,6 +67,7 @@ void mce_track_storm(struct mce *mce);
void mce_inherit_storm(unsigned int bank);
bool mce_get_storm_mode(void);
void mce_set_storm_mode(bool storm);
+u32 mce_get_apei_thr_limit(void);
#else
static inline void cmci_storm_begin(unsigned int bank) {}
static inline void cmci_storm_end(unsigned int bank) {}
@@ -74,6 +75,7 @@ static inline void mce_track_storm(struct mce *mce) {}
static inline void mce_inherit_storm(unsigned int bank) {}
static inline bool mce_get_storm_mode(void) { return false; }
static inline void mce_set_storm_mode(bool storm) {}
+static inline u32 mce_get_apei_thr_limit(void) { return 0; }
#endif
/*
@@ -267,6 +269,7 @@ void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m);
#ifdef CONFIG_X86_MCE_AMD
void mce_threshold_create_device(unsigned int cpu);
void mce_threshold_remove_device(unsigned int cpu);
+void mce_amd_handle_storm(unsigned int bank, bool on);
extern bool amd_filter_mce(struct mce *m);
bool amd_mce_usable_address(struct mce *m);
void amd_clear_bank(struct mce *m);
@@ -299,6 +302,7 @@ void smca_bsp_init(void);
#else
static inline void mce_threshold_create_device(unsigned int cpu) { }
static inline void mce_threshold_remove_device(unsigned int cpu) { }
+static inline void mce_amd_handle_storm(unsigned int bank, bool on) { }
static inline bool amd_filter_mce(struct mce *m) { return false; }
static inline bool amd_mce_usable_address(struct mce *m) { return false; }
static inline void amd_clear_bank(struct mce *m) { }
diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c
index f4a007616468..0d13c9ffcba0 100644
--- a/arch/x86/kernel/cpu/mce/threshold.c
+++ b/arch/x86/kernel/cpu/mce/threshold.c
@@ -13,6 +13,19 @@
#include "internal.h"
+static u32 mce_apei_thr_limit;
+
+void mce_save_apei_thr_limit(u32 thr_limit)
+{
+ mce_apei_thr_limit = thr_limit;
+ pr_info("HEST corrected error threshold limit: %u\n", thr_limit);
+}
+
+u32 mce_get_apei_thr_limit(void)
+{
+ return mce_apei_thr_limit;
+}
+
static void default_threshold_interrupt(void)
{
pr_err("Unexpected threshold interrupt at vector %x\n",
@@ -63,6 +76,9 @@ static void mce_handle_storm(unsigned int bank, bool on)
case X86_VENDOR_INTEL:
mce_intel_handle_storm(bank, on);
break;
+ case X86_VENDOR_AMD:
+ mce_amd_handle_storm(bank, on);
+ break;
}
}
@@ -85,7 +101,8 @@ void cmci_storm_end(unsigned int bank)
{
struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
- __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ if (!mce_flags.amd_threshold)
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
storm->banks[bank].history = 0;
storm->banks[bank].in_storm_mode = false;
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index cdce885e2fd5..3821a985f4ff 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -186,60 +186,92 @@ static u32 cpuid_to_ucode_rev(unsigned int val)
return p.ucode_rev;
}
+static u32 get_cutoff_revision(u32 rev)
+{
+ switch (rev >> 8) {
+ case 0x80012: return 0x8001277; break;
+ case 0x80082: return 0x800820f; break;
+ case 0x83010: return 0x830107c; break;
+ case 0x86001: return 0x860010e; break;
+ case 0x86081: return 0x8608108; break;
+ case 0x87010: return 0x8701034; break;
+ case 0x8a000: return 0x8a0000a; break;
+ case 0xa0010: return 0xa00107a; break;
+ case 0xa0011: return 0xa0011da; break;
+ case 0xa0012: return 0xa001243; break;
+ case 0xa0082: return 0xa00820e; break;
+ case 0xa1011: return 0xa101153; break;
+ case 0xa1012: return 0xa10124e; break;
+ case 0xa1081: return 0xa108109; break;
+ case 0xa2010: return 0xa20102f; break;
+ case 0xa2012: return 0xa201212; break;
+ case 0xa4041: return 0xa404109; break;
+ case 0xa5000: return 0xa500013; break;
+ case 0xa6012: return 0xa60120a; break;
+ case 0xa7041: return 0xa704109; break;
+ case 0xa7052: return 0xa705208; break;
+ case 0xa7080: return 0xa708009; break;
+ case 0xa70c0: return 0xa70C009; break;
+ case 0xaa001: return 0xaa00116; break;
+ case 0xaa002: return 0xaa00218; break;
+ case 0xb0021: return 0xb002146; break;
+ case 0xb0081: return 0xb008111; break;
+ case 0xb1010: return 0xb101046; break;
+ case 0xb2040: return 0xb204031; break;
+ case 0xb4040: return 0xb404031; break;
+ case 0xb4041: return 0xb404101; break;
+ case 0xb6000: return 0xb600031; break;
+ case 0xb6080: return 0xb608031; break;
+ case 0xb7000: return 0xb700031; break;
+ default: break;
+
+ }
+ return 0;
+}
+
static bool need_sha_check(u32 cur_rev)
{
+ u32 cutoff;
+
if (!cur_rev) {
cur_rev = cpuid_to_ucode_rev(bsp_cpuid_1_eax);
pr_info_once("No current revision, generating the lowest one: 0x%x\n", cur_rev);
}
- switch (cur_rev >> 8) {
- case 0x80012: return cur_rev <= 0x800126f; break;
- case 0x80082: return cur_rev <= 0x800820f; break;
- case 0x83010: return cur_rev <= 0x830107c; break;
- case 0x86001: return cur_rev <= 0x860010e; break;
- case 0x86081: return cur_rev <= 0x8608108; break;
- case 0x87010: return cur_rev <= 0x8701034; break;
- case 0x8a000: return cur_rev <= 0x8a0000a; break;
- case 0xa0010: return cur_rev <= 0xa00107a; break;
- case 0xa0011: return cur_rev <= 0xa0011da; break;
- case 0xa0012: return cur_rev <= 0xa001243; break;
- case 0xa0082: return cur_rev <= 0xa00820e; break;
- case 0xa1011: return cur_rev <= 0xa101153; break;
- case 0xa1012: return cur_rev <= 0xa10124e; break;
- case 0xa1081: return cur_rev <= 0xa108109; break;
- case 0xa2010: return cur_rev <= 0xa20102f; break;
- case 0xa2012: return cur_rev <= 0xa201212; break;
- case 0xa4041: return cur_rev <= 0xa404109; break;
- case 0xa5000: return cur_rev <= 0xa500013; break;
- case 0xa6012: return cur_rev <= 0xa60120a; break;
- case 0xa7041: return cur_rev <= 0xa704109; break;
- case 0xa7052: return cur_rev <= 0xa705208; break;
- case 0xa7080: return cur_rev <= 0xa708009; break;
- case 0xa70c0: return cur_rev <= 0xa70C009; break;
- case 0xaa001: return cur_rev <= 0xaa00116; break;
- case 0xaa002: return cur_rev <= 0xaa00218; break;
- case 0xb0021: return cur_rev <= 0xb002146; break;
- case 0xb1010: return cur_rev <= 0xb101046; break;
- case 0xb2040: return cur_rev <= 0xb204031; break;
- case 0xb4040: return cur_rev <= 0xb404031; break;
- case 0xb6000: return cur_rev <= 0xb600031; break;
- case 0xb7000: return cur_rev <= 0xb700031; break;
- default: break;
- }
+ cutoff = get_cutoff_revision(cur_rev);
+ if (cutoff)
+ return cur_rev <= cutoff;
pr_info("You should not be seeing this. Please send the following couple of lines to x86-<at>-kernel.org\n");
pr_info("CPUID(1).EAX: 0x%x, current revision: 0x%x\n", bsp_cpuid_1_eax, cur_rev);
return true;
}
+static bool cpu_has_entrysign(void)
+{
+ unsigned int fam = x86_family(bsp_cpuid_1_eax);
+ unsigned int model = x86_model(bsp_cpuid_1_eax);
+
+ if (fam == 0x17 || fam == 0x19)
+ return true;
+
+ if (fam == 0x1a) {
+ if (model <= 0x2f ||
+ (0x40 <= model && model <= 0x4f) ||
+ (0x60 <= model && model <= 0x6f))
+ return true;
+ }
+
+ return false;
+}
+
static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsigned int len)
{
struct patch_digest *pd = NULL;
u8 digest[SHA256_DIGEST_SIZE];
int i;
- if (x86_family(bsp_cpuid_1_eax) < 0x17)
+ if (!cpu_has_entrysign())
return true;
if (!need_sha_check(cur_rev))
@@ -473,6 +505,7 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size)
{
u8 family = x86_family(bsp_cpuid_1_eax);
struct microcode_header_amd *mc_hdr;
+ u32 cur_rev, cutoff, patch_rev;
u32 sh_psize;
u16 proc_id;
u8 patch_fam;
@@ -512,11 +545,32 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size)
proc_id = mc_hdr->processor_rev_id;
patch_fam = 0xf + (proc_id >> 12);
- ucode_dbg("Patch-ID 0x%08x: family: 0x%x\n", mc_hdr->patch_id, patch_fam);
-
if (patch_fam != family)
return 1;
+ cur_rev = get_patch_level();
+
+ /* No cutoff revision means old/unaffected by signing algorithm weakness => matches */
+ cutoff = get_cutoff_revision(cur_rev);
+ if (!cutoff)
+ goto ok;
+
+ patch_rev = mc_hdr->patch_id;
+
+ ucode_dbg("cur_rev: 0x%x, cutoff: 0x%x, patch_rev: 0x%x\n",
+ cur_rev, cutoff, patch_rev);
+
+ if (cur_rev <= cutoff && patch_rev <= cutoff)
+ goto ok;
+
+ if (cur_rev > cutoff && patch_rev > cutoff)
+ goto ok;
+
+ return 1;
+
+ok:
+ ucode_dbg("Patch-ID 0x%08x: family: 0x%x\n", mc_hdr->patch_id, patch_fam);
+
return 0;
}
@@ -585,8 +639,6 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE);
- ucode_dbg("patch_id: 0x%x\n", mc->hdr.patch_id);
-
if (mc_patch_matches(mc, eq_id)) {
desc->psize = patch_size;
desc->mc = mc;
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index f75c140906d0..ccc83b0bf63c 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -136,7 +136,7 @@ bool __init microcode_loader_disabled(void)
return dis_ucode_ldr;
}
-static void early_parse_cmdline(void)
+static void __init early_parse_cmdline(void)
{
char cmd_buf[64] = {};
char *s, *p = cmd_buf;
@@ -589,6 +589,17 @@ static int load_late_stop_cpus(bool is_safe)
pr_err("You should switch to early loading, if possible.\n");
}
+ /*
+ * Pre-load the microcode image into a staging device. This
+ * process is preemptible and does not require stopping CPUs.
+ * Successful staging simplifies the subsequent late-loading
+ * process, reducing rendezvous time.
+ *
+ * Even if the transfer fails, the update will proceed as usual.
+ */
+ if (microcode_ops->use_staging)
+ microcode_ops->stage_microcode();
+
atomic_set(&late_cpus_in, num_online_cpus());
atomic_set(&offline_in_nmi, 0);
loops_per_usec = loops_per_jiffy / (TICK_NSEC / 1000);
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 371ca6eac00e..8744f3adc2a0 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -13,12 +13,15 @@
#define pr_fmt(fmt) "microcode: " fmt
#include <linux/earlycpio.h>
#include <linux/firmware.h>
+#include <linux/pci_ids.h>
#include <linux/uaccess.h>
#include <linux/initrd.h>
#include <linux/kernel.h>
+#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/uio.h>
+#include <linux/io.h>
#include <linux/mm.h>
#include <asm/cpu_device_id.h>
@@ -33,6 +36,38 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
#define UCODE_BSP_LOADED ((struct microcode_intel *)0x1UL)
+/* Defines for the microcode staging mailbox interface */
+#define MBOX_REG_NUM 4
+#define MBOX_REG_SIZE sizeof(u32)
+
+#define MBOX_CONTROL_OFFSET 0x0
+#define MBOX_STATUS_OFFSET 0x4
+#define MBOX_WRDATA_OFFSET 0x8
+#define MBOX_RDDATA_OFFSET 0xc
+
+#define MASK_MBOX_CTRL_ABORT BIT(0)
+#define MASK_MBOX_CTRL_GO BIT(31)
+
+#define MASK_MBOX_STATUS_ERROR BIT(2)
+#define MASK_MBOX_STATUS_READY BIT(31)
+
+#define MASK_MBOX_RESP_SUCCESS BIT(0)
+#define MASK_MBOX_RESP_PROGRESS BIT(1)
+#define MASK_MBOX_RESP_ERROR BIT(2)
+
+#define MBOX_CMD_LOAD 0x3
+#define MBOX_OBJ_STAGING 0xb
+#define MBOX_HEADER(size) ((PCI_VENDOR_ID_INTEL) | \
+ (MBOX_OBJ_STAGING << 16) | \
+ ((u64)((size) / sizeof(u32)) << 32))
+
+/* The size of each mailbox header */
+#define MBOX_HEADER_SIZE sizeof(u64)
+/* The size of staging hardware response */
+#define MBOX_RESPONSE_SIZE sizeof(u64)
+
+#define MBOX_XACTION_TIMEOUT_MS (10 * MSEC_PER_SEC)
+
/* Current microcode patch used in early patching on the APs. */
static struct microcode_intel *ucode_patch_va __read_mostly;
static struct microcode_intel *ucode_patch_late __read_mostly;
@@ -54,6 +89,23 @@ struct extended_sigtable {
struct extended_signature sigs[];
};
+/**
+ * struct staging_state - Track the current staging process state
+ *
+ * @mmio_base: MMIO base address for staging
+ * @ucode_len: Total size of the microcode image
+ * @chunk_size: Size of each data piece
+ * @bytes_sent: Total bytes transmitted so far
+ * @offset: Current offset in the microcode image
+ */
+struct staging_state {
+ void __iomem *mmio_base;
+ unsigned int ucode_len;
+ unsigned int chunk_size;
+ unsigned int bytes_sent;
+ unsigned int offset;
+};
+
#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
@@ -299,6 +351,298 @@ static __init struct microcode_intel *scan_microcode(void *data, size_t size,
return size ? NULL : patch;
}
+static inline u32 read_mbox_dword(void __iomem *mmio_base)
+{
+ u32 dword = readl(mmio_base + MBOX_RDDATA_OFFSET);
+
+ /* Acknowledge read completion to the staging hardware */
+ writel(0, mmio_base + MBOX_RDDATA_OFFSET);
+ return dword;
+}
+
+static inline void write_mbox_dword(void __iomem *mmio_base, u32 dword)
+{
+ writel(dword, mmio_base + MBOX_WRDATA_OFFSET);
+}
+
+static inline u64 read_mbox_header(void __iomem *mmio_base)
+{
+ u32 high, low;
+
+ low = read_mbox_dword(mmio_base);
+ high = read_mbox_dword(mmio_base);
+
+ return ((u64)high << 32) | low;
+}
+
+static inline void write_mbox_header(void __iomem *mmio_base, u64 value)
+{
+ write_mbox_dword(mmio_base, value);
+ write_mbox_dword(mmio_base, value >> 32);
+}
+
+static void write_mbox_data(void __iomem *mmio_base, u32 *chunk, unsigned int chunk_bytes)
+{
+ int i;
+
+ /*
+ * The MMIO space is mapped as Uncached (UC). Each write arrives
+ * at the device as an individual transaction in program order.
+ * The device can then reassemble the sequence accordingly.
+ */
+ for (i = 0; i < chunk_bytes / sizeof(u32); i++)
+ write_mbox_dword(mmio_base, chunk[i]);
+}
+
+/*
+ * Prepare for a new microcode transfer: reset hardware and record the
+ * image size.
+ */
+static void init_stage(struct staging_state *ss)
+{
+ ss->ucode_len = get_totalsize(&ucode_patch_late->hdr);
+
+ /*
+ * Abort any ongoing process, effectively resetting the device.
+ * Unlike regular mailbox data processing requests, this
+ * operation does not require a status check.
+ */
+ writel(MASK_MBOX_CTRL_ABORT, ss->mmio_base + MBOX_CONTROL_OFFSET);
+}
+
+/*
+ * Update the chunk size and decide whether another chunk can be sent.
+ * This accounts for remaining data and retry limits.
+ */
+static bool can_send_next_chunk(struct staging_state *ss, int *err)
+{
+ /* A page size or remaining bytes if this is the final chunk */
+ ss->chunk_size = min(PAGE_SIZE, ss->ucode_len - ss->offset);
+
+ /*
+ * Each microcode image is divided into chunks, each at most
+ * one page size. A 10-chunk image would typically require 10
+ * transactions.
+ *
+ * However, the hardware managing the mailbox has limited
+ * resources and may not cache the entire image, potentially
+ * requesting the same chunk multiple times.
+ *
+ * To tolerate this behavior, allow up to twice the expected
+ * number of transactions (i.e., a 10-chunk image can take up to
+ * 20 attempts).
+ *
+ * If the number of attempts exceeds this limit, treat it as
+ * exceeding the maximum allowed transfer size.
+ */
+ if (ss->bytes_sent + ss->chunk_size > ss->ucode_len * 2) {
+ *err = -EMSGSIZE;
+ return false;
+ }
+
+ *err = 0;
+ return true;
+}
+
+/*
+ * The hardware indicates completion by returning a sentinel end offset.
+ */
+static inline bool is_end_offset(u32 offset)
+{
+ return offset == UINT_MAX;
+}
+
+/*
+ * Determine whether staging is complete: either the hardware signaled
+ * the end offset, or no more transactions are permitted (retry limit
+ * reached).
+ */
+static inline bool staging_is_complete(struct staging_state *ss, int *err)
+{
+ return is_end_offset(ss->offset) || !can_send_next_chunk(ss, err);
+}
+
+/*
+ * Wait for the hardware to complete a transaction.
+ * Return 0 on success, or an error code on failure.
+ */
+static int wait_for_transaction(struct staging_state *ss)
+{
+ u32 timeout, status;
+
+ /* Allow time for hardware to complete the operation: */
+ for (timeout = 0; timeout < MBOX_XACTION_TIMEOUT_MS; timeout++) {
+ msleep(1);
+
+ status = readl(ss->mmio_base + MBOX_STATUS_OFFSET);
+ /* Break out early if the hardware is ready: */
+ if (status & MASK_MBOX_STATUS_READY)
+ break;
+ }
+
+ /* Check for explicit error response */
+ if (status & MASK_MBOX_STATUS_ERROR)
+ return -EIO;
+
+ /*
+ * Hardware has neither responded to the action nor signaled any
+ * error. Treat this as a timeout.
+ */
+ if (!(status & MASK_MBOX_STATUS_READY))
+ return -ETIMEDOUT;
+
+ return 0;
+}
+
+/*
+ * Transmit a chunk of the microcode image to the hardware.
+ * Return 0 on success, or an error code on failure.
+ */
+static int send_data_chunk(struct staging_state *ss, void *ucode_ptr)
+{
+ u32 *src_chunk = ucode_ptr + ss->offset;
+ u16 mbox_size;
+
+ /*
+ * Write a 'request' mailbox object in this order:
+ * 1. Mailbox header includes total size
+ * 2. Command header specifies the load operation
+ * 3. Data section contains a microcode chunk
+ *
+ * Thus, the mailbox size is two headers plus the chunk size.
+ */
+ mbox_size = MBOX_HEADER_SIZE * 2 + ss->chunk_size;
+ write_mbox_header(ss->mmio_base, MBOX_HEADER(mbox_size));
+ write_mbox_header(ss->mmio_base, MBOX_CMD_LOAD);
+ write_mbox_data(ss->mmio_base, src_chunk, ss->chunk_size);
+ ss->bytes_sent += ss->chunk_size;
+
+ /* Notify the hardware that the mailbox is ready for processing. */
+ writel(MASK_MBOX_CTRL_GO, ss->mmio_base + MBOX_CONTROL_OFFSET);
+
+ return wait_for_transaction(ss);
+}
+
+/*
+ * Retrieve the next offset from the hardware response.
+ * Return 0 on success, or an error code on failure.
+ */
+static int fetch_next_offset(struct staging_state *ss)
+{
+ const u64 expected_header = MBOX_HEADER(MBOX_HEADER_SIZE + MBOX_RESPONSE_SIZE);
+ u32 offset, status;
+ u64 header;
+
+ /*
+ * The 'response' mailbox returns three fields, in order:
+ * 1. Header
+ * 2. Next offset in the microcode image
+ * 3. Status flags
+ */
+ header = read_mbox_header(ss->mmio_base);
+ offset = read_mbox_dword(ss->mmio_base);
+ status = read_mbox_dword(ss->mmio_base);
+
+ /* All valid responses must start with the expected header. */
+ if (header != expected_header) {
+ pr_err_once("staging: invalid response header (0x%llx)\n", header);
+ return -EBADR;
+ }
+
+ /*
+ * Verify the offset: If not at the end marker, it must not
+ * exceed the microcode image length.
+ */
+ if (!is_end_offset(offset) && offset > ss->ucode_len) {
+ pr_err_once("staging: invalid offset (%u) past the image end (%u)\n",
+ offset, ss->ucode_len);
+ return -EINVAL;
+ }
+
+ /* Hardware may report errors explicitly in the status field */
+ if (status & MASK_MBOX_RESP_ERROR)
+ return -EPROTO;
+
+ ss->offset = offset;
+ return 0;
+}
+
+/*
+ * Handle the staging process using the mailbox MMIO interface. The
+ * microcode image is transferred in chunks until completion.
+ * Return 0 on success or an error code on failure.
+ */
+static int do_stage(u64 mmio_pa)
+{
+ struct staging_state ss = {};
+ int err;
+
+ ss.mmio_base = ioremap(mmio_pa, MBOX_REG_NUM * MBOX_REG_SIZE);
+ if (WARN_ON_ONCE(!ss.mmio_base))
+ return -EADDRNOTAVAIL;
+
+ init_stage(&ss);
+
+ /* Perform the staging process while within the retry limit */
+ while (!staging_is_complete(&ss, &err)) {
+ /* Send a chunk of microcode each time: */
+ err = send_data_chunk(&ss, ucode_patch_late);
+ if (err)
+ break;
+ /*
+ * Then, ask the hardware which piece of the image it
+ * needs next. The same piece may be sent more than once.
+ */
+ err = fetch_next_offset(&ss);
+ if (err)
+ break;
+ }
+
+ iounmap(ss.mmio_base);
+
+ return err;
+}
+
+static void stage_microcode(void)
+{
+ unsigned int pkg_id = UINT_MAX;
+ int cpu, err;
+ u64 mmio_pa;
+
+ if (!IS_ALIGNED(get_totalsize(&ucode_patch_late->hdr), sizeof(u32))) {
+ pr_err("Microcode image 32-bit misaligned (0x%x), staging failed.\n",
+ get_totalsize(&ucode_patch_late->hdr));
+ return;
+ }
+
+ lockdep_assert_cpus_held();
+
+ /*
+ * The MMIO address is unique per package, and all the SMT
+ * primary threads are online here. Find each MMIO space by
+ * its package ID to avoid duplicate staging.
+ */
+ for_each_cpu(cpu, cpu_primary_thread_mask) {
+ if (topology_logical_package_id(cpu) == pkg_id)
+ continue;
+
+ pkg_id = topology_logical_package_id(cpu);
+
+ err = rdmsrq_on_cpu(cpu, MSR_IA32_MCU_STAGING_MBOX_ADDR, &mmio_pa);
+ if (WARN_ON_ONCE(err))
+ return;
+
+ err = do_stage(mmio_pa);
+ if (err) {
+ pr_err("Error: staging failed (%d) for CPU%d at package %u.\n",
+ err, cpu, pkg_id);
+ return;
+ }
+ }
+
+ pr_info("Staging of patch revision 0x%x succeeded.\n", ucode_patch_late->hdr.rev);
+}
+
static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci,
struct microcode_intel *mc,
u32 *cur_rev)
@@ -627,6 +971,7 @@ static struct microcode_ops microcode_intel_ops = {
.collect_cpu_info = collect_cpu_info,
.apply_microcode = apply_microcode_late,
.finalize_late_load = finalize_late_load,
+ .stage_microcode = stage_microcode,
.use_nmi = IS_ENABLED(CONFIG_X86_64),
};
@@ -638,6 +983,18 @@ static __init void calc_llc_size_per_core(struct cpuinfo_x86 *c)
llc_size_per_core = (unsigned int)llc_size;
}
+static __init bool staging_available(void)
+{
+ u64 val;
+
+ val = x86_read_arch_cap_msr();
+ if (!(val & ARCH_CAP_MCU_ENUM))
+ return false;
+
+ rdmsrq(MSR_IA32_MCU_ENUMERATION, val);
+ return !!(val & MCU_STAGING);
+}
+
struct microcode_ops * __init init_intel_microcode(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -648,6 +1005,11 @@ struct microcode_ops * __init init_intel_microcode(void)
return NULL;
}
+ if (staging_available()) {
+ microcode_intel_ops.use_staging = true;
+ pr_info("Enabled staging feature.\n");
+ }
+
calc_llc_size_per_core(c);
return &microcode_intel_ops;
diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h
index ae8dbc2b908d..a10b547eda1e 100644
--- a/arch/x86/kernel/cpu/microcode/internal.h
+++ b/arch/x86/kernel/cpu/microcode/internal.h
@@ -31,10 +31,12 @@ struct microcode_ops {
* See also the "Synchronization" section in microcode_core.c.
*/
enum ucode_state (*apply_microcode)(int cpu);
+ void (*stage_microcode)(void);
int (*collect_cpu_info)(int cpu, struct cpu_signature *csig);
void (*finalize_late_load)(int result);
unsigned int nmi_safe : 1,
- use_nmi : 1;
+ use_nmi : 1,
+ use_staging : 1;
};
struct early_load_data {
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 8c18327eb10b..0863733858dc 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -89,7 +89,6 @@ static int mtrr_state_set;
u64 mtrr_tom2;
struct mtrr_state_type mtrr_state;
-EXPORT_SYMBOL_GPL(mtrr_state);
/* Reserved bits in the high portion of the MTRRphysBaseN MSR. */
u32 phys_hi_rsvd;
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 5655f253d929..2de3bd2f95d1 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -46,10 +46,6 @@ struct set_mtrr_context {
u32 ccr3;
};
-void set_mtrr_done(struct set_mtrr_context *ctxt);
-void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
-void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
-
void fill_mtrr_var_range(unsigned int index,
u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
bool get_mtrr_state(void);
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 06ca5a30140c..3792ab4819dc 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -274,6 +274,11 @@ static void rdt_get_cdp_config(int level)
rdt_resources_all[level].r_resctrl.cdp_capable = true;
}
+static void rdt_set_io_alloc_capable(struct rdt_resource *r)
+{
+ r->cache.io_alloc_capable = true;
+}
+
static void rdt_get_cdp_l3_config(void)
{
rdt_get_cdp_config(RDT_RESOURCE_L3);
@@ -719,6 +724,7 @@ enum {
RDT_FLAG_SMBA,
RDT_FLAG_BMEC,
RDT_FLAG_ABMC,
+ RDT_FLAG_SDCIAE,
};
#define RDT_OPT(idx, n, f) \
@@ -745,6 +751,7 @@ static struct rdt_options rdt_options[] __ro_after_init = {
RDT_OPT(RDT_FLAG_SMBA, "smba", X86_FEATURE_SMBA),
RDT_OPT(RDT_FLAG_BMEC, "bmec", X86_FEATURE_BMEC),
RDT_OPT(RDT_FLAG_ABMC, "abmc", X86_FEATURE_ABMC),
+ RDT_OPT(RDT_FLAG_SDCIAE, "sdciae", X86_FEATURE_SDCIAE),
};
#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options)
@@ -853,6 +860,8 @@ static __init bool get_rdt_alloc_resources(void)
rdt_get_cache_alloc_cfg(1, r);
if (rdt_cpu_has(X86_FEATURE_CDP_L3))
rdt_get_cdp_l3_config();
+ if (rdt_cpu_has(X86_FEATURE_SDCIAE))
+ rdt_set_io_alloc_capable(r);
ret = true;
}
if (rdt_cpu_has(X86_FEATURE_CAT_L2)) {
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index 1189c0df4ad7..b20e705606b8 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -91,3 +91,43 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
return hw_dom->ctrl_val[idx];
}
+
+bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r)
+{
+ return resctrl_to_arch_res(r)->sdciae_enabled;
+}
+
+static void resctrl_sdciae_set_one_amd(void *arg)
+{
+ bool *enable = arg;
+
+ if (*enable)
+ msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, SDCIAE_ENABLE_BIT);
+ else
+ msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, SDCIAE_ENABLE_BIT);
+}
+
+static void _resctrl_sdciae_enable(struct rdt_resource *r, bool enable)
+{
+ struct rdt_ctrl_domain *d;
+
+ /* Walking r->ctrl_domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ /* Update MSR_IA32_L3_QOS_EXT_CFG MSR on all the CPUs in all domains */
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list)
+ on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_sdciae_set_one_amd, &enable, 1);
+}
+
+int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+ if (hw_res->r_resctrl.cache.io_alloc_capable &&
+ hw_res->sdciae_enabled != enable) {
+ _resctrl_sdciae_enable(r, enable);
+ hw_res->sdciae_enabled = enable;
+ }
+
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 9f4c2f0aaf5c..4a916c84a322 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -46,6 +46,9 @@ struct arch_mbm_state {
#define ABMC_EXTENDED_EVT_ID BIT(31)
#define ABMC_EVT_ID BIT(0)
+/* Setting bit 1 in MSR_IA32_L3_QOS_EXT_CFG enables the SDCIAE feature. */
+#define SDCIAE_ENABLE_BIT 1
+
/**
* struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share
* a resource for a control function
@@ -112,6 +115,7 @@ struct msr_param {
* @mbm_width: Monitor width, to detect and correct for overflow.
* @cdp_enabled: CDP state of this resource
* @mbm_cntr_assign_enabled: ABMC feature is enabled
+ * @sdciae_enabled: SDCIAE feature (backing "io_alloc") is enabled.
*
* Members of this structure are either private to the architecture
* e.g. mbm_width, or accessed via helpers that provide abstraction. e.g.
@@ -126,6 +130,7 @@ struct rdt_hw_resource {
unsigned int mbm_width;
bool cdp_enabled;
bool mbm_cntr_assign_enabled;
+ bool sdciae_enabled;
};
static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r)
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index c8945610d455..dffcc8307500 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -242,7 +242,9 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
u32 unused, u32 rmid, enum resctrl_event_id eventid,
u64 *val, void *ignored)
{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
int cpu = cpumask_any(&d->hdr.cpu_mask);
+ struct arch_mbm_state *am;
u64 msr_val;
u32 prmid;
int ret;
@@ -251,12 +253,16 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
prmid = logical_rmid_to_physical_rmid(cpu, rmid);
ret = __rmid_read_phys(prmid, eventid, &msr_val);
- if (ret)
- return ret;
- *val = get_corrected_val(r, d, rmid, eventid, msr_val);
+ if (!ret) {
+ *val = get_corrected_val(r, d, rmid, eventid, msr_val);
+ } else if (ret == -EINVAL) {
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am)
+ am->prev_msr = 0;
+ }
- return 0;
+ return ret;
}
static int __cntr_id_read(u32 cntr_id, u64 *val)
@@ -355,6 +361,7 @@ static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0),
X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0),
X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0),
+ X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, 0),
{}
};
@@ -452,7 +459,16 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
}
- if (rdt_cpu_has(X86_FEATURE_ABMC)) {
+ /*
+ * resctrl assumes a system that supports assignable counters can
+ * switch to "default" mode. Ensure that there is a "default" mode
+ * to switch to. This enforces a dependency between the independent
+ * X86_FEATURE_ABMC and X86_FEATURE_CQM_MBM_TOTAL/X86_FEATURE_CQM_MBM_LOCAL
+ * hardware features.
+ */
+ if (rdt_cpu_has(X86_FEATURE_ABMC) &&
+ (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL) ||
+ rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))) {
r->mon.mbm_cntr_assignable = true;
cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx);
r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1;
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 271f548ad156..cde4b6cd3471 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -43,6 +43,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 },
{ X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 },
{ X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 },
+ { X86_FEATURE_SGX_EUPDATESVN, CPUID_EAX, 10, 0x00000012, 0 },
{ X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 },
{ X86_FEATURE_OVERFLOW_RECOV, CPUID_EBX, 0, 0x80000007, 0 },
{ X86_FEATURE_SUCCOR, CPUID_EBX, 1, 0x80000007, 0 },
@@ -56,6 +57,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 },
{ X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 },
{ X86_FEATURE_ABMC, CPUID_EBX, 5, 0x80000020, 0 },
+ { X86_FEATURE_SDCIAE, CPUID_EBX, 6, 0x80000020, 0 },
{ X86_FEATURE_TSA_SQ_NO, CPUID_ECX, 1, 0x80000021, 0 },
{ X86_FEATURE_TSA_L1_NO, CPUID_ECX, 2, 0x80000021, 0 },
{ X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 },
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index 7f8d1e11dbee..79d6020dfe9c 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -14,7 +14,7 @@ u64 sgx_attributes_reserved_mask;
u64 sgx_xfrm_reserved_mask = ~0x3;
u32 sgx_misc_reserved_mask;
-static int sgx_open(struct inode *inode, struct file *file)
+static int __sgx_open(struct inode *inode, struct file *file)
{
struct sgx_encl *encl;
int ret;
@@ -41,6 +41,23 @@ static int sgx_open(struct inode *inode, struct file *file)
return 0;
}
+static int sgx_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = sgx_inc_usage_count();
+ if (ret)
+ return ret;
+
+ ret = __sgx_open(inode, file);
+ if (ret) {
+ sgx_dec_usage_count();
+ return ret;
+ }
+
+ return 0;
+}
+
static int sgx_release(struct inode *inode, struct file *file)
{
struct sgx_encl *encl = file->private_data;
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 308dbbae6c6e..cf149b9f4916 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -765,6 +765,7 @@ void sgx_encl_release(struct kref *ref)
WARN_ON_ONCE(encl->secs.epc_page);
kfree(encl);
+ sgx_dec_usage_count();
}
/*
diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
index 42a088a337c5..74be751199a4 100644
--- a/arch/x86/kernel/cpu/sgx/encls.h
+++ b/arch/x86/kernel/cpu/sgx/encls.h
@@ -233,4 +233,9 @@ static inline int __eaug(struct sgx_pageinfo *pginfo, void *addr)
return __encls_2(EAUG, pginfo, addr);
}
+/* Attempt to update CPUSVN at runtime. */
+static inline int __eupdatesvn(void)
+{
+ return __encls_ret_1(EUPDATESVN, "");
+}
#endif /* _X86_ENCLS_H */
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 2de01b379aa3..dc73194416ac 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -5,6 +5,7 @@
#include <linux/freezer.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
+#include <linux/kvm_types.h>
#include <linux/miscdevice.h>
#include <linux/node.h>
#include <linux/pagemap.h>
@@ -16,6 +17,7 @@
#include <linux/vmalloc.h>
#include <asm/msr.h>
#include <asm/sgx.h>
+#include <asm/archrandom.h>
#include "driver.h"
#include "encl.h"
#include "encls.h"
@@ -915,7 +917,107 @@ int sgx_set_attribute(unsigned long *allowed_attributes,
*allowed_attributes |= SGX_ATTR_PROVISIONKEY;
return 0;
}
-EXPORT_SYMBOL_GPL(sgx_set_attribute);
+EXPORT_SYMBOL_FOR_KVM(sgx_set_attribute);
+
+/* Counter to count the active SGX users */
+static int sgx_usage_count;
+
+/**
+ * sgx_update_svn() - Attempt to call ENCLS[EUPDATESVN].
+ *
+ * This instruction attempts to update CPUSVN to the
+ * currently loaded microcode update SVN and generate new
+ * cryptographic assets.
+ *
+ * Return:
+ * * %0: - Success or not supported
+ * * %-EAGAIN: - Can be safely retried, failure is due to lack of
+ * * entropy in RNG
+ * * %-EIO: - Unexpected error, retries are not advisable
+ */
+static int sgx_update_svn(void)
+{
+ int ret;
+
+ /*
+ * If EUPDATESVN is not available, it is ok to
+ * silently skip it to comply with legacy behavior.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_SGX_EUPDATESVN))
+ return 0;
+
+ /*
+ * EPC is guaranteed to be empty when there are no users.
+ * Ensure we are on our first user before proceeding further.
+ */
+ WARN(sgx_usage_count, "Elevated usage count when calling EUPDATESVN\n");
+
+ for (int i = 0; i < RDRAND_RETRY_LOOPS; i++) {
+ ret = __eupdatesvn();
+
+ /* Stop on success or unexpected errors: */
+ if (ret != SGX_INSUFFICIENT_ENTROPY)
+ break;
+ }
+
+ switch (ret) {
+ case 0:
+ /*
+ * SVN successfully updated.
+ * Let users know when the update was successful.
+ */
+ pr_info("SVN updated successfully\n");
+ return 0;
+ case SGX_NO_UPDATE:
+ /*
+ * SVN update failed since the current SVN is
+ * not newer than CPUSVN. This is the most
+ * common case and indicates no harm.
+ */
+ return 0;
+ case SGX_INSUFFICIENT_ENTROPY:
+ /*
+ * SVN update failed due to lack of entropy in DRNG.
+ * Indicate to userspace that it should retry.
+ */
+ return -EAGAIN;
+ default:
+ break;
+ }
+
+ /*
+ * EUPDATESVN was called when EPC is empty, all other error
+ * codes are unexpected.
+ */
+ ENCLS_WARN(ret, "EUPDATESVN");
+ return -EIO;
+}
+
+/* Mutex to ensure no concurrent EPC accesses during EUPDATESVN */
+static DEFINE_MUTEX(sgx_svn_lock);
+
+int sgx_inc_usage_count(void)
+{
+ int ret;
+
+ guard(mutex)(&sgx_svn_lock);
+
+ if (!sgx_usage_count) {
+ ret = sgx_update_svn();
+ if (ret)
+ return ret;
+ }
+
+ sgx_usage_count++;
+
+ return 0;
+}
+
+void sgx_dec_usage_count(void)
+{
+ guard(mutex)(&sgx_svn_lock);
+ sgx_usage_count--;
+}
static int __init sgx_init(void)
{
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index d2dad21259a8..f5940393d9bd 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -102,6 +102,9 @@ static inline int __init sgx_vepc_init(void)
}
#endif
+int sgx_inc_usage_count(void);
+void sgx_dec_usage_count(void);
+
void sgx_update_lepubkeyhash(u64 *lepubkeyhash);
#endif /* _X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
index 7aaa3652e31d..8de1f1a755f2 100644
--- a/arch/x86/kernel/cpu/sgx/virt.c
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -5,6 +5,7 @@
* Copyright(c) 2021 Intel Corporation.
*/
+#include <linux/kvm_types.h>
#include <linux/miscdevice.h>
#include <linux/mm.h>
#include <linux/mman.h>
@@ -255,10 +256,11 @@ static int sgx_vepc_release(struct inode *inode, struct file *file)
xa_destroy(&vepc->page_array);
kfree(vepc);
+ sgx_dec_usage_count();
return 0;
}
-static int sgx_vepc_open(struct inode *inode, struct file *file)
+static int __sgx_vepc_open(struct inode *inode, struct file *file)
{
struct sgx_vepc *vepc;
@@ -273,6 +275,23 @@ static int sgx_vepc_open(struct inode *inode, struct file *file)
return 0;
}
+static int sgx_vepc_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = sgx_inc_usage_count();
+ if (ret)
+ return ret;
+
+ ret = __sgx_vepc_open(inode, file);
+ if (ret) {
+ sgx_dec_usage_count();
+ return ret;
+ }
+
+ return 0;
+}
+
static long sgx_vepc_ioctl(struct file *file,
unsigned int cmd, unsigned long arg)
{
@@ -363,7 +382,7 @@ int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
WARN_ON_ONCE(ret);
return 0;
}
-EXPORT_SYMBOL_GPL(sgx_virt_ecreate);
+EXPORT_SYMBOL_FOR_KVM(sgx_virt_ecreate);
static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
void __user *secs)
@@ -432,4 +451,4 @@ int sgx_virt_einit(void __user *sigstruct, void __user *token,
return ret;
}
-EXPORT_SYMBOL_GPL(sgx_virt_einit);
+EXPORT_SYMBOL_FOR_KVM(sgx_virt_einit);
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 6073a16628f9..f55ea3cdbf88 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -75,15 +75,11 @@ bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
return phys_id == (u64)cpuid_to_apicid[cpu];
}
-#ifdef CONFIG_SMP
static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid)
{
if (!(apicid & (__max_threads_per_core - 1)))
cpumask_set_cpu(cpu, &__cpu_primary_thread_mask);
}
-#else
-static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { }
-#endif
/*
* Convert the APIC ID to a domain level ID by masking out the low bits
diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c
index b5a5e1411469..71625795d711 100644
--- a/arch/x86/kernel/cpu/topology_common.c
+++ b/arch/x86/kernel/cpu/topology_common.c
@@ -16,6 +16,9 @@ EXPORT_SYMBOL_GPL(x86_topo_system);
unsigned int __amd_nodes_per_pkg __ro_after_init;
EXPORT_SYMBOL_GPL(__amd_nodes_per_pkg);
+/* CPUs which are the primary SMT threads */
+struct cpumask __cpu_primary_thread_mask __read_mostly;
+
void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom,
unsigned int shift, unsigned int ncpus)
{
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
index 49782724a943..209b5a22d880 100644
--- a/arch/x86/kernel/cpu/tsx.c
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -19,7 +19,17 @@
#undef pr_fmt
#define pr_fmt(fmt) "tsx: " fmt
-enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED;
+enum tsx_ctrl_states {
+ TSX_CTRL_AUTO,
+ TSX_CTRL_ENABLE,
+ TSX_CTRL_DISABLE,
+ TSX_CTRL_RTM_ALWAYS_ABORT,
+ TSX_CTRL_NOT_SUPPORTED,
+};
+
+static enum tsx_ctrl_states tsx_ctrl_state __ro_after_init =
+ IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO) ? TSX_CTRL_AUTO :
+ IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF) ? TSX_CTRL_DISABLE : TSX_CTRL_ENABLE;
static void tsx_disable(void)
{
@@ -156,11 +166,28 @@ static void tsx_dev_mode_disable(void)
}
}
-void __init tsx_init(void)
+static int __init tsx_parse_cmdline(char *str)
{
- char arg[5] = {};
- int ret;
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "on")) {
+ tsx_ctrl_state = TSX_CTRL_ENABLE;
+ } else if (!strcmp(str, "off")) {
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ } else if (!strcmp(str, "auto")) {
+ tsx_ctrl_state = TSX_CTRL_AUTO;
+ } else {
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ pr_err("invalid option, defaulting to off\n");
+ }
+
+ return 0;
+}
+early_param("tsx", tsx_parse_cmdline);
+void __init tsx_init(void)
+{
tsx_dev_mode_disable();
/*
@@ -194,27 +221,8 @@ void __init tsx_init(void)
return;
}
- ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg));
- if (ret >= 0) {
- if (!strcmp(arg, "on")) {
- tsx_ctrl_state = TSX_CTRL_ENABLE;
- } else if (!strcmp(arg, "off")) {
- tsx_ctrl_state = TSX_CTRL_DISABLE;
- } else if (!strcmp(arg, "auto")) {
- tsx_ctrl_state = x86_get_tsx_auto_mode();
- } else {
- tsx_ctrl_state = TSX_CTRL_DISABLE;
- pr_err("invalid option, defaulting to off\n");
- }
- } else {
- /* tsx= not provided */
- if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO))
- tsx_ctrl_state = x86_get_tsx_auto_mode();
- else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF))
- tsx_ctrl_state = TSX_CTRL_DISABLE;
- else
- tsx_ctrl_state = TSX_CTRL_ENABLE;
- }
+ if (tsx_ctrl_state == TSX_CTRL_AUTO)
+ tsx_ctrl_state = x86_get_tsx_auto_mode();
if (tsx_ctrl_state == TSX_CTRL_DISABLE) {
tsx_disable();
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 71ee20102a8a..b10684dedc58 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -181,8 +181,8 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
* in false positive reports. Disable instrumentation to avoid those.
*/
__no_kmsan_checks
-static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, const char *log_lvl)
+static void __show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, const char *log_lvl)
{
struct unwind_state state;
struct stack_info stack_info = {0};
@@ -303,6 +303,25 @@ next:
}
}
+static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, const char *log_lvl)
+{
+ /*
+ * Disable KASAN to avoid false positives during walking another
+ * task's stacks, as values on these stacks may change concurrently
+ * with task execution.
+ */
+ bool disable_kasan = task && task != current;
+
+ if (disable_kasan)
+ kasan_disable_current();
+
+ __show_trace_log_lvl(task, regs, stack, log_lvl);
+
+ if (disable_kasan)
+ kasan_enable_current();
+}
+
void show_stack(struct task_struct *task, unsigned long *sp,
const char *loglvl)
{
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index c3acbd26408b..b15b97d3cb52 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -16,6 +16,7 @@
#include <linux/firmware-map.h>
#include <linux/sort.h>
#include <linux/memory_hotplug.h>
+#include <linux/kvm_types.h>
#include <asm/e820/api.h>
#include <asm/setup.h>
@@ -95,7 +96,7 @@ bool e820__mapped_raw_any(u64 start, u64 end, enum e820_type type)
{
return _e820__mapped_any(e820_table_firmware, start, end, type);
}
-EXPORT_SYMBOL_GPL(e820__mapped_raw_any);
+EXPORT_SYMBOL_FOR_KVM(e820__mapped_raw_any);
bool e820__mapped_any(u64 start, u64 end, enum e820_type type)
{
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 1f71cc135e9a..da233f20ae6f 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -18,6 +18,7 @@
#include <uapi/asm/kvm.h>
#include <linux/hardirq.h>
+#include <linux/kvm_types.h>
#include <linux/pkeys.h>
#include <linux/vmalloc.h>
@@ -276,7 +277,7 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
return true;
}
-EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate);
+EXPORT_SYMBOL_FOR_KVM(fpu_alloc_guest_fpstate);
void fpu_free_guest_fpstate(struct fpu_guest *gfpu)
{
@@ -291,7 +292,7 @@ void fpu_free_guest_fpstate(struct fpu_guest *gfpu)
gfpu->fpstate = NULL;
vfree(fpstate);
}
-EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate);
+EXPORT_SYMBOL_FOR_KVM(fpu_free_guest_fpstate);
/*
* fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable
@@ -313,7 +314,7 @@ int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures)
return __xfd_enable_feature(xfeatures, guest_fpu);
}
-EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features);
+EXPORT_SYMBOL_FOR_KVM(fpu_enable_guest_xfd_features);
#ifdef CONFIG_X86_64
void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd)
@@ -324,7 +325,7 @@ void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd)
xfd_update_state(guest_fpu->fpstate);
fpregs_unlock();
}
-EXPORT_SYMBOL_GPL(fpu_update_guest_xfd);
+EXPORT_SYMBOL_FOR_KVM(fpu_update_guest_xfd);
/**
* fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state
@@ -348,7 +349,7 @@ void fpu_sync_guest_vmexit_xfd_state(void)
__this_cpu_write(xfd_state, fpstate->xfd);
}
}
-EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state);
+EXPORT_SYMBOL_FOR_KVM(fpu_sync_guest_vmexit_xfd_state);
#endif /* CONFIG_X86_64 */
int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest)
@@ -390,7 +391,7 @@ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest)
fpregs_unlock();
return 0;
}
-EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate);
+EXPORT_SYMBOL_FOR_KVM(fpu_swap_kvm_fpstate);
void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf,
unsigned int size, u64 xfeatures, u32 pkru)
@@ -409,7 +410,7 @@ void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf,
ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE;
}
}
-EXPORT_SYMBOL_GPL(fpu_copy_guest_fpstate_to_uabi);
+EXPORT_SYMBOL_FOR_KVM(fpu_copy_guest_fpstate_to_uabi);
int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
u64 xcr0, u32 *vpkru)
@@ -439,7 +440,7 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru);
}
-EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate);
+EXPORT_SYMBOL_FOR_KVM(fpu_copy_uabi_to_guest_fpstate);
#endif /* CONFIG_KVM */
void kernel_fpu_begin_mask(unsigned int kfpu_mask)
@@ -825,6 +826,9 @@ void fpu__clear_user_states(struct fpu *fpu)
!fpregs_state_valid(fpu, smp_processor_id()))
os_xrstor_supervisor(fpu->fpstate);
+ /* Ensure XFD state is in sync before reloading XSTATE */
+ xfd_update_state(fpu->fpstate);
+
/* Reset user states in registers. */
restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE);
@@ -854,7 +858,7 @@ void switch_fpu_return(void)
fpregs_restore_userregs();
}
-EXPORT_SYMBOL_GPL(switch_fpu_return);
+EXPORT_SYMBOL_FOR_KVM(switch_fpu_return);
void fpregs_lock_and_load(void)
{
@@ -889,7 +893,7 @@ void fpregs_assert_state_consistent(void)
WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id()));
}
-EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent);
+EXPORT_SYMBOL_FOR_KVM(fpregs_assert_state_consistent);
#endif
void fpregs_mark_activate(void)
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 28e4fd65c9da..48113c5193aa 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -8,6 +8,7 @@
#include <linux/compat.h>
#include <linux/cpu.h>
#include <linux/mman.h>
+#include <linux/kvm_types.h>
#include <linux/nospec.h>
#include <linux/pkeys.h>
#include <linux/seq_file.h>
@@ -1058,7 +1059,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
return __raw_xsave_addr(xsave, xfeature_nr);
}
-EXPORT_SYMBOL_GPL(get_xsave_addr);
+EXPORT_SYMBOL_FOR_KVM(get_xsave_addr);
/*
* Given an xstate feature nr, calculate where in the xsave buffer the state is.
@@ -1482,7 +1483,7 @@ void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeatu
if (addr)
memset(addr, 0, xstate_sizes[xfeature]);
}
-EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
+EXPORT_SYMBOL_FOR_KVM(fpstate_clear_xstate_component);
#endif
#ifdef CONFIG_X86_64
@@ -1818,7 +1819,7 @@ u64 xstate_get_guest_group_perm(void)
{
return xstate_get_group_perm(true);
}
-EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
+EXPORT_SYMBOL_FOR_KVM(xstate_get_guest_group_perm);
/**
* fpu_xstate_prctl - xstate permission operations
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 367da3638167..823dbdd0eb41 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -354,12 +354,17 @@ SYM_CODE_START(return_to_handler)
UNWIND_HINT_UNDEFINED
ANNOTATE_NOENDBR
+ /* Restore return_to_handler value that got eaten by previous ret instruction. */
+ subq $8, %rsp
+ UNWIND_HINT_FUNC
+
/* Save ftrace_regs for function exit context */
subq $(FRAME_SIZE), %rsp
movq %rax, RAX(%rsp)
movq %rdx, RDX(%rsp)
movq %rbp, RBP(%rsp)
+ movq %rsp, RSP(%rsp)
movq %rsp, %rdi
call ftrace_return_to_handler
@@ -368,7 +373,8 @@ SYM_CODE_START(return_to_handler)
movq RDX(%rsp), %rdx
movq RAX(%rsp), %rax
- addq $(FRAME_SIZE), %rsp
+ addq $(FRAME_SIZE) + 8, %rsp
+
/*
* Jump back to the old return address. This cannot be JMP_NOSPEC rdi
* since IBT would demand that contain ENDBR, which simply isn't so for
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index b01644c949b2..f846c15f21ca 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -24,6 +24,7 @@
#include <linux/percpu.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
+#include <linux/kvm_types.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/smp.h>
@@ -489,7 +490,7 @@ void hw_breakpoint_restore(void)
set_debugreg(DR6_RESERVED, 6);
set_debugreg(__this_cpu_read(cpu_dr7), 7);
}
-EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
+EXPORT_SYMBOL_FOR_KVM(hw_breakpoint_restore);
/*
* Handle debug exception notifications.
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 10721a125226..86f4e574de02 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -12,6 +12,7 @@
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/irq.h>
+#include <linux/kvm_types.h>
#include <asm/irq_stack.h>
#include <asm/apic.h>
@@ -361,7 +362,7 @@ void kvm_set_posted_intr_wakeup_handler(void (*handler)(void))
synchronize_rcu();
}
}
-EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler);
+EXPORT_SYMBOL_FOR_KVM(kvm_set_posted_intr_wakeup_handler);
/*
* Handler for POSTED_INTERRUPT_VECTOR.
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 3863d7709386..c1fac3a9fecc 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -141,7 +141,6 @@ bool can_boost(struct insn *insn, void *addr)
{
kprobe_opcode_t opcode;
insn_byte_t prefix;
- int i;
if (search_exception_tables((unsigned long)addr))
return false; /* Page fault may occur on this address. */
@@ -154,7 +153,7 @@ bool can_boost(struct insn *insn, void *addr)
if (insn->opcode.nbytes != 1)
return false;
- for_each_insn_prefix(insn, i, prefix) {
+ for_each_insn_prefix(insn, prefix) {
insn_attr_t attr;
attr = inat_get_opcode_attribute(prefix);
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 0aabd4c4e2c4..6f826a00eca2 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -103,7 +103,6 @@ static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
asm (
".pushsection .rodata\n"
- "optprobe_template_func:\n"
".global optprobe_template_entry\n"
"optprobe_template_entry:\n"
#ifdef CONFIG_X86_64
@@ -160,9 +159,6 @@ asm (
"optprobe_template_end:\n"
".popsection\n");
-void optprobe_template_func(void);
-STACK_FRAME_NON_STANDARD(optprobe_template_func);
-
#define TMPL_CLAC_IDX \
((long)optprobe_template_clac - (long)optprobe_template_entry)
#define TMPL_MOVE_IDX \
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index b67d7c59dca0..204765004c72 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -29,6 +29,7 @@
#include <linux/syscore_ops.h>
#include <linux/cc_platform.h>
#include <linux/efi.h>
+#include <linux/kvm_types.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
@@ -162,7 +163,7 @@ void kvm_async_pf_task_wait_schedule(u32 token)
}
finish_swait(&n.wq, &wait);
}
-EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule);
+EXPORT_SYMBOL_FOR_KVM(kvm_async_pf_task_wait_schedule);
static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
@@ -253,7 +254,7 @@ noinstr u32 kvm_read_and_reset_apf_flags(void)
return flags;
}
-EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags);
+EXPORT_SYMBOL_FOR_KVM(kvm_read_and_reset_apf_flags);
noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
{
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 0ffbae902e2f..11c45ce42694 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -97,6 +97,7 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
DEBUGP("%s relocate section %u to %u\n",
apply ? "Applying" : "Clearing",
relsec, sechdrs[relsec].sh_info);
+
for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
size_t size;
@@ -162,15 +163,17 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
if (apply) {
if (memcmp(loc, &zero, size)) {
- pr_err("x86/modules: Invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n",
- (int)ELF64_R_TYPE(rel[i].r_info), loc, val);
+ pr_err("x86/modules: Invalid relocation target, existing value is nonzero for sec %u, idx %u, type %d, loc %lx, val %llx\n",
+ relsec, i, (int)ELF64_R_TYPE(rel[i].r_info),
+ (unsigned long)loc, val);
return -ENOEXEC;
}
write(loc, &val, size);
} else {
if (memcmp(loc, &val, size)) {
- pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for type %d, loc %p, val %Lx\n",
- (int)ELF64_R_TYPE(rel[i].r_info), loc, val);
+ pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for sec %u, idx %u, type %d, loc %lx, val %llx\n",
+ relsec, i, (int)ELF64_R_TYPE(rel[i].r_info),
+ (unsigned long)loc, val);
return -ENOEXEC;
}
write(loc, &zero, size);
@@ -179,8 +182,8 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
return 0;
overflow:
- pr_err("overflow in relocation type %d val %Lx\n",
- (int)ELF64_R_TYPE(rel[i].r_info), val);
+ pr_err("overflow in relocation type %d val %llx sec %u idx %d\n",
+ (int)ELF64_R_TYPE(rel[i].r_info), val, relsec, i);
pr_err("`%s' likely not compiled with -mcmodel=kernel\n",
me->name);
return -ENOEXEC;
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index e17c16c54a37..4469c784eaa0 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -98,7 +98,7 @@ static int filter_write(u32 reg)
if (!__ratelimit(&fw_rs))
return 0;
- pr_warn("Write to unrecognized MSR 0x%x by %s (pid: %d).\n",
+ pr_warn("Write to unrecognized MSR 0x%x by %s (pid: %d), tainting CPU_OUT_OF_SPEC.\n",
reg, current->comm, current->pid);
pr_warn("See https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/about for details.\n");
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index be93ec7255bf..3d239ed12744 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -24,6 +24,7 @@
#include <linux/export.h>
#include <linux/atomic.h>
#include <linux/sched/clock.h>
+#include <linux/kvm_types.h>
#include <asm/cpu_entry_area.h>
#include <asm/traps.h>
@@ -613,9 +614,7 @@ DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx)
{
exc_nmi(regs);
}
-#if IS_MODULE(CONFIG_KVM_INTEL)
-EXPORT_SYMBOL_GPL(asm_exc_nmi_kvm_vmx);
-#endif
+EXPORT_SYMBOL_FOR_KVM(asm_exc_nmi_kvm_vmx);
#endif
#ifdef CONFIG_NMI_CHECK_CPU
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 52a5c03c353c..432c0a004c60 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -30,6 +30,7 @@
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/export.h>
+#include <linux/kvm_types.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
@@ -303,9 +304,7 @@ void current_save_fsgs(void)
save_fsgs(current);
local_irq_restore(flags);
}
-#if IS_ENABLED(CONFIG_KVM)
-EXPORT_SYMBOL_GPL(current_save_fsgs);
-#endif
+EXPORT_SYMBOL_FOR_KVM(current_save_fsgs);
static __always_inline void loadseg(enum which_selector which,
unsigned short sel)
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 964f6b0a3d68..6032fa9ec753 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -13,6 +13,7 @@
#include <linux/objtool.h>
#include <linux/pgtable.h>
#include <linux/kexec.h>
+#include <linux/kvm_types.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
@@ -541,7 +542,7 @@ void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback)
rcu_assign_pointer(cpu_emergency_virt_callback, callback);
}
-EXPORT_SYMBOL_GPL(cpu_emergency_register_virt_callback);
+EXPORT_SYMBOL_FOR_KVM(cpu_emergency_register_virt_callback);
void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback)
{
@@ -551,7 +552,7 @@ void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback)
rcu_assign_pointer(cpu_emergency_virt_callback, NULL);
synchronize_rcu();
}
-EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback);
+EXPORT_SYMBOL_FOR_KVM(cpu_emergency_unregister_virt_callback);
/*
* Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index eb289abece23..5cd6950ab672 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -103,9 +103,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_core_map);
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
EXPORT_PER_CPU_SYMBOL(cpu_die_map);
-/* CPUs which are the primary SMT threads */
-struct cpumask __cpu_primary_thread_mask __read_mostly;
-
/* Representing CPUs for which sibling maps can be computed */
static cpumask_var_t cpu_sibling_setup_mask;
@@ -515,6 +512,76 @@ static void __init build_sched_topology(void)
set_sched_topology(topology);
}
+#ifdef CONFIG_NUMA
+static int sched_avg_remote_distance;
+static int avg_remote_numa_distance(void)
+{
+ int i, j;
+ int distance, nr_remote, total_distance;
+
+ if (sched_avg_remote_distance > 0)
+ return sched_avg_remote_distance;
+
+ nr_remote = 0;
+ total_distance = 0;
+ for_each_node_state(i, N_CPU) {
+ for_each_node_state(j, N_CPU) {
+ distance = node_distance(i, j);
+
+ if (distance >= REMOTE_DISTANCE) {
+ nr_remote++;
+ total_distance += distance;
+ }
+ }
+ }
+ if (nr_remote)
+ sched_avg_remote_distance = total_distance / nr_remote;
+ else
+ sched_avg_remote_distance = REMOTE_DISTANCE;
+
+ return sched_avg_remote_distance;
+}
+
+int arch_sched_node_distance(int from, int to)
+{
+ int d = node_distance(from, to);
+
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_GRANITERAPIDS_X:
+ case INTEL_ATOM_DARKMONT_X:
+
+ if (!x86_has_numa_in_package || topology_max_packages() == 1 ||
+ d < REMOTE_DISTANCE)
+ return d;
+
+ /*
+ * With SNC enabled, there could be too many levels of remote
+ * NUMA node distances, creating NUMA domain levels
+ * including local nodes and partial remote nodes.
+ *
+ * Trim finer distance tuning for NUMA nodes in remote package
+ * for the purpose of building sched domains. Group NUMA nodes
+ * in the remote package in the same sched group.
+ * Simplify NUMA domains and avoid extra NUMA levels including
+ * different remote NUMA nodes and local nodes.
+ *
+ * GNR and CWF don't expect systems with more than 2 packages
+ * and more than 2 hops between packages. Single average remote
+ * distance won't be appropriate if there are more than 2
+ * packages as average distance to different remote packages
+ * could be different.
+ */
+ WARN_ONCE(topology_max_packages() > 2,
+ "sched: Expect only up to 2 packages for GNR or CWF, "
+ "but saw %d packages when building sched domains.",
+ topology_max_packages());
+
+ d = avg_remote_numa_distance();
+ }
+ return d;
+}
+#endif /* CONFIG_NUMA */
+
void set_cpu_sibling_map(int cpu)
{
bool has_smt = __max_threads_per_core > 1;
@@ -1328,11 +1395,7 @@ void __noreturn hlt_play_dead(void)
native_halt();
}
-/*
- * native_play_dead() is essentially a __noreturn function, but it can't
- * be marked as such as the compiler may complain about it.
- */
-void native_play_dead(void)
+void __noreturn native_play_dead(void)
{
if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
__update_spec_ctrl(0);
@@ -1351,7 +1414,7 @@ int native_cpu_disable(void)
return -ENOSYS;
}
-void native_play_dead(void)
+void __noreturn native_play_dead(void)
{
BUG();
}
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index 378c388d1b31..2892cdb14563 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -26,6 +26,11 @@ static const u8 xor5rax[] = { 0x2e, 0x2e, 0x2e, 0x31, 0xc0 };
static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc };
+/*
+ * ud1 (%edx),%rdi -- see __WARN_trap() / decode_bug()
+ */
+static const u8 warninsn[] = { 0x67, 0x48, 0x0f, 0xb9, 0x3a };
+
static u8 __is_Jcc(u8 *insn) /* Jcc.d32 */
{
u8 ret = 0;
@@ -69,7 +74,10 @@ static void __ref __static_call_transform(void *insn, enum insn_type type,
emulate = code;
code = &xor5rax;
}
-
+ if (func == &__WARN_trap) {
+ emulate = code;
+ code = &warninsn;
+ }
break;
case NOP:
@@ -128,7 +136,8 @@ static void __static_call_validate(u8 *insn, bool tail, bool tramp)
} else {
if (opcode == CALL_INSN_OPCODE ||
!memcmp(insn, x86_nops[5], 5) ||
- !memcmp(insn, xor5rax, 5))
+ !memcmp(insn, xor5rax, 5) ||
+ !memcmp(insn, warninsn, 5))
return;
}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1b9177b93433..bcf1dedc1d00 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -31,6 +31,7 @@
#include <linux/kexec.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
+#include <linux/static_call.h>
#include <linux/timer.h>
#include <linux/init.h>
#include <linux/bug.h>
@@ -102,25 +103,37 @@ __always_inline int is_valid_bugaddr(unsigned long addr)
* UBSan{0}: 67 0f b9 00 ud1 (%eax),%eax
* UBSan{10}: 67 0f b9 40 10 ud1 0x10(%eax),%eax
* static_call: 0f b9 cc ud1 %esp,%ecx
+ * __WARN_trap: 67 48 0f b9 3a ud1 (%edx),%reg
*
- * Notably UBSAN uses EAX, static_call uses ECX.
+ * Notably, since __WARN_trap can use all registers, the distinction between
+ * UD1 users is through R/M.
*/
__always_inline int decode_bug(unsigned long addr, s32 *imm, int *len)
{
unsigned long start = addr;
+ u8 v, reg, rm, rex = 0;
+ int type = BUG_UD1;
bool lock = false;
- u8 v;
if (addr < TASK_SIZE_MAX)
return BUG_NONE;
- v = *(u8 *)(addr++);
- if (v == INSN_ASOP)
+ for (;;) {
v = *(u8 *)(addr++);
+ if (v == INSN_ASOP)
+ continue;
- if (v == INSN_LOCK) {
- lock = true;
- v = *(u8 *)(addr++);
+ if (v == INSN_LOCK) {
+ lock = true;
+ continue;
+ }
+
+ if ((v & 0xf0) == 0x40) {
+ rex = v;
+ continue;
+ }
+
+ break;
}
switch (v) {
@@ -156,18 +169,33 @@ __always_inline int decode_bug(unsigned long addr, s32 *imm, int *len)
if (X86_MODRM_MOD(v) != 3 && X86_MODRM_RM(v) == 4)
addr++; /* SIB */
+ reg = X86_MODRM_REG(v) + 8*!!X86_REX_R(rex);
+ rm = X86_MODRM_RM(v) + 8*!!X86_REX_B(rex);
+
/* Decode immediate, if present */
switch (X86_MODRM_MOD(v)) {
case 0: if (X86_MODRM_RM(v) == 5)
- addr += 4; /* RIP + disp32 */
+ addr += 4; /* RIP + disp32 */
+
+ if (rm == 0) /* (%eax) */
+ type = BUG_UD1_UBSAN;
+
+ if (rm == 2) { /* (%edx) */
+ *imm = reg;
+ type = BUG_UD1_WARN;
+ }
break;
case 1: *imm = *(s8 *)addr;
addr += 1;
+ if (rm == 0) /* (%eax) */
+ type = BUG_UD1_UBSAN;
break;
case 2: *imm = *(s32 *)addr;
addr += 4;
+ if (rm == 0) /* (%eax) */
+ type = BUG_UD1_UBSAN;
break;
case 3: break;
@@ -176,12 +204,76 @@ __always_inline int decode_bug(unsigned long addr, s32 *imm, int *len)
/* record instruction length */
*len = addr - start;
- if (X86_MODRM_REG(v) == 0) /* EAX */
- return BUG_UD1_UBSAN;
+ return type;
+}
- return BUG_UD1;
+static inline unsigned long pt_regs_val(struct pt_regs *regs, int nr)
+{
+ int offset = pt_regs_offset(regs, nr);
+ if (WARN_ON_ONCE(offset < -0))
+ return 0;
+ return *((unsigned long *)((void *)regs + offset));
}
+#ifdef HAVE_ARCH_BUG_FORMAT_ARGS
+DEFINE_STATIC_CALL(WARN_trap, __WARN_trap);
+EXPORT_STATIC_CALL_TRAMP(WARN_trap);
+
+/*
+ * Create a va_list from an exception context.
+ */
+void *__warn_args(struct arch_va_list *args, struct pt_regs *regs)
+{
+ /*
+ * Register save area; populate with function call argument registers
+ */
+ args->regs[0] = regs->di;
+ args->regs[1] = regs->si;
+ args->regs[2] = regs->dx;
+ args->regs[3] = regs->cx;
+ args->regs[4] = regs->r8;
+ args->regs[5] = regs->r9;
+
+ /*
+ * From the ABI document:
+ *
+ * @gp_offset - the element holds the offset in bytes from
+ * reg_save_area to the place where the next available general purpose
+ * argument register is saved. In case all argument registers have
+ * been exhausted, it is set to the value 48 (6*8).
+ *
+ * @fp_offset - the element holds the offset in bytes from
+ * reg_save_area to the place where the next available floating point
+ * argument is saved. In case all argument registers have been
+ * exhausted, it is set to the value 176 (6*8 + 8*16)
+ *
+ * @overflow_arg_area - this pointer is used to fetch arguments passed
+ * on the stack. It is initialized with the address of the first
+ * argument passed on the stack, if any, and then always updated to
+ * point to the start of the next argument on the stack.
+ *
+ * @reg_save_area - the element points to the start of the register
+ * save area.
+ *
+ * Notably the vararg starts with the second argument and there are no
+ * floating point arguments in the kernel.
+ */
+ args->args.gp_offset = 1*8;
+ args->args.fp_offset = 6*8 + 8*16;
+ args->args.reg_save_area = &args->regs;
+ args->args.overflow_arg_area = (void *)regs->sp;
+
+ /*
+ * If the exception came from __WARN_trap, there is a return
+ * address on the stack, skip that. This is why any __WARN_trap()
+ * caller must inhibit tail-call optimization.
+ */
+ if ((void *)regs->ip == &__WARN_trap)
+ args->args.overflow_arg_area += 8;
+
+ return &args->args;
+}
+#endif /* HAVE_ARCH_BUG_FORMAT_ARGS */
static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
@@ -334,6 +426,11 @@ static noinstr bool handle_bug(struct pt_regs *regs)
raw_local_irq_enable();
switch (ud_type) {
+ case BUG_UD1_WARN:
+ if (report_bug_entry((void *)pt_regs_val(regs, ud_imm), regs) == BUG_TRAP_TYPE_WARN)
+ handled = true;
+ break;
+
case BUG_UD2:
if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
handled = true;
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 87e749106dda..7d3e13e14eab 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -11,6 +11,7 @@
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/clocksource.h>
+#include <linux/kvm_types.h>
#include <linux/percpu.h>
#include <linux/timex.h>
#include <linux/static_key.h>
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 845aeaf36b8d..7be8e361ca55 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -17,6 +17,7 @@
#include <linux/kdebug.h>
#include <asm/processor.h>
#include <asm/insn.h>
+#include <asm/insn-eval.h>
#include <asm/mmu_context.h>
#include <asm/nops.h>
@@ -258,9 +259,8 @@ static volatile u32 good_2byte_insns[256 / 32] = {
static bool is_prefix_bad(struct insn *insn)
{
insn_byte_t p;
- int i;
- for_each_insn_prefix(insn, i, p) {
+ for_each_insn_prefix(insn, p) {
insn_attr_t attr;
attr = inat_get_opcode_attribute(p);
@@ -1158,35 +1158,12 @@ unlock:
mmap_write_unlock(mm);
}
-static bool insn_is_nop(struct insn *insn)
-{
- return insn->opcode.nbytes == 1 && insn->opcode.bytes[0] == 0x90;
-}
-
-static bool insn_is_nopl(struct insn *insn)
-{
- if (insn->opcode.nbytes != 2)
- return false;
-
- if (insn->opcode.bytes[0] != 0x0f || insn->opcode.bytes[1] != 0x1f)
- return false;
-
- if (!insn->modrm.nbytes)
- return false;
-
- if (X86_MODRM_REG(insn->modrm.bytes[0]) != 0)
- return false;
-
- /* 0f 1f /0 - NOPL */
- return true;
-}
-
static bool can_optimize(struct insn *insn, unsigned long vaddr)
{
if (!insn->x86_64 || insn->length != 5)
return false;
- if (!insn_is_nop(insn) && !insn_is_nopl(insn))
+ if (!insn_is_nop(insn))
return false;
/* We can't do cross page atomic writes yet. */
@@ -1426,19 +1403,14 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
{
u8 opc1 = OPCODE1(insn);
insn_byte_t p;
- int i;
- /* x86_nops[insn->length]; same as jmp with .offs = 0 */
- if (insn->length <= ASM_NOP_MAX &&
- !memcmp(insn->kaddr, x86_nops[insn->length], insn->length))
+ if (insn_is_nop(insn))
goto setup;
switch (opc1) {
case 0xeb: /* jmp 8 */
case 0xe9: /* jmp 32 */
break;
- case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */
- goto setup;
case 0xe8: /* call relative */
branch_clear_offset(auprobe, insn);
@@ -1463,7 +1435,7 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
* Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix.
* No one uses these insns, reject any branch insns with such prefix.
*/
- for_each_insn_prefix(insn, i, p) {
+ for_each_insn_prefix(insn, p) {
if (p == 0x66)
return -ENOTSUPP;
}
@@ -1819,3 +1791,35 @@ bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
else
return regs->sp <= ret->stack;
}
+
+/*
+ * Heuristic-based check if uprobe is installed at the function entry.
+ *
+ * Under assumption of user code being compiled with frame pointers,
+ * `push %rbp/%ebp` is a good indicator that we indeed are.
+ *
+ * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern.
+ * If we get this wrong, captured stack trace might have one extra bogus
+ * entry, but the rest of stack trace will still be meaningful.
+ */
+bool is_uprobe_at_func_entry(struct pt_regs *regs)
+{
+ struct arch_uprobe *auprobe;
+
+ if (!current->utask)
+ return false;
+
+ auprobe = current->utask->auprobe;
+ if (!auprobe)
+ return false;
+
+ /* push %rbp/%ebp */
+ if (auprobe->insn[0] == 0x55)
+ return true;
+
+ /* endbr64 (64-bit only) */
+ if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn))
+ return true;
+
+ return false;
+}
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 40ac4cb44ed2..487ad19a236e 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -108,16 +108,18 @@ void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops)
bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL;
int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS;
- perf_get_x86_pmu_capability(&kvm_host_pmu);
-
/*
* Hybrid PMUs don't play nice with virtualization without careful
* configuration by userspace, and KVM's APIs for reporting supported
* vPMU features do not account for hybrid PMUs. Disable vPMU support
* for hybrid PMUs until KVM gains a way to let userspace opt-in.
*/
- if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) {
enable_pmu = false;
+ memset(&kvm_host_pmu, 0, sizeof(kvm_host_pmu));
+ } else {
+ perf_get_x86_pmu_capability(&kvm_host_pmu);
+ }
if (enable_pmu) {
/*
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index f286b5706d7c..fef00546c885 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -216,7 +216,7 @@ static void avic_deactivate_vmcb(struct vcpu_svm *svm)
* This function is called from IOMMU driver to notify
* SVM to schedule in a particular vCPU of a particular VM.
*/
-int avic_ga_log_notifier(u32 ga_tag)
+static int avic_ga_log_notifier(u32 ga_tag)
{
unsigned long flags;
struct kvm_svm *kvm_svm;
@@ -788,7 +788,7 @@ int avic_init_vcpu(struct vcpu_svm *svm)
struct kvm_vcpu *vcpu = &svm->vcpu;
INIT_LIST_HEAD(&svm->ir_list);
- spin_lock_init(&svm->ir_list_lock);
+ raw_spin_lock_init(&svm->ir_list_lock);
if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
return 0;
@@ -816,9 +816,9 @@ static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd)
if (!vcpu)
return;
- spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
+ raw_spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
list_del(&irqfd->vcpu_list);
- spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
+ raw_spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
}
int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
@@ -855,7 +855,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
* list of IRQs being posted to the vCPU, to ensure the IRTE
* isn't programmed with stale pCPU/IsRunning information.
*/
- guard(spinlock_irqsave)(&svm->ir_list_lock);
+ guard(raw_spinlock_irqsave)(&svm->ir_list_lock);
/*
* Update the target pCPU for IOMMU doorbells if the vCPU is
@@ -972,7 +972,7 @@ static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
* up-to-date entry information, or that this task will wait until
* svm_ir_list_add() completes to set the new target pCPU.
*/
- spin_lock_irqsave(&svm->ir_list_lock, flags);
+ raw_spin_lock_irqsave(&svm->ir_list_lock, flags);
entry = svm->avic_physical_id_entry;
WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
@@ -997,7 +997,7 @@ static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action);
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+ raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}
void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -1035,7 +1035,7 @@ static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
* or that this task will wait until svm_ir_list_add() completes to
* mark the vCPU as not running.
*/
- spin_lock_irqsave(&svm->ir_list_lock, flags);
+ raw_spin_lock_irqsave(&svm->ir_list_lock, flags);
avic_update_iommu_vcpu_affinity(vcpu, -1, action);
@@ -1059,7 +1059,7 @@ static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
svm->avic_physical_id_entry = entry;
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+ raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}
void avic_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1243,3 +1243,9 @@ bool __init avic_hardware_setup(void)
return true;
}
+
+void avic_hardware_unsetup(void)
+{
+ if (avic)
+ amd_iommu_register_ga_log_notifier(NULL);
+}
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index a6443feab252..da6e80b3ac35 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -677,11 +677,10 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
*/
svm_copy_lbrs(vmcb02, vmcb12);
vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS;
- svm_update_lbrv(&svm->vcpu);
-
- } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
+ } else {
svm_copy_lbrs(vmcb02, vmcb01);
}
+ svm_update_lbrv(&svm->vcpu);
}
static inline bool is_evtinj_soft(u32 evtinj)
@@ -833,11 +832,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
svm->soft_int_next_rip = vmcb12_rip;
}
- vmcb02->control.virt_ext = vmcb01->control.virt_ext &
- LBR_CTL_ENABLE_MASK;
- if (guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV))
- vmcb02->control.virt_ext |=
- (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK);
+ /* LBR_CTL_ENABLE_MASK is controlled by svm_update_lbrv() */
if (!nested_vmcb_needs_vls_intercept(svm))
vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
@@ -1189,13 +1184,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
- (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)))
svm_copy_lbrs(vmcb12, vmcb02);
- svm_update_lbrv(vcpu);
- } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
+ else
svm_copy_lbrs(vmcb01, vmcb02);
- svm_update_lbrv(vcpu);
- }
+
+ svm_update_lbrv(vcpu);
if (vnmi) {
if (vmcb02->control.int_ctl & V_NMI_BLOCKING_MASK)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 153c12dbf3eb..9d29b2e7e855 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -705,7 +705,11 @@ void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask)
static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu)
{
- bool intercept = !(to_svm(vcpu)->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK);
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK);
+
+ if (intercept == svm->lbr_msrs_intercepted)
+ return;
svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHFROMIP, MSR_TYPE_RW, intercept);
svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHTOIP, MSR_TYPE_RW, intercept);
@@ -714,6 +718,8 @@ static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu)
if (sev_es_guest(vcpu->kvm))
svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept);
+
+ svm->lbr_msrs_intercepted = intercept;
}
void svm_vcpu_free_msrpm(void *msrpm)
@@ -806,60 +812,43 @@ void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
vmcb_mark_dirty(to_vmcb, VMCB_LBR);
}
-void svm_enable_lbrv(struct kvm_vcpu *vcpu)
+static void __svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
- svm_recalc_lbr_msr_intercepts(vcpu);
-
- /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
- if (is_guest_mode(vcpu))
- svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
+ to_svm(vcpu)->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
}
-static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
+void svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
- svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
+ __svm_enable_lbrv(vcpu);
svm_recalc_lbr_msr_intercepts(vcpu);
-
- /*
- * Move the LBR msrs back to the vmcb01 to avoid copying them
- * on nested guest entries.
- */
- if (is_guest_mode(vcpu))
- svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
}
-static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm)
+static void __svm_disable_lbrv(struct kvm_vcpu *vcpu)
{
- /*
- * If LBR virtualization is disabled, the LBR MSRs are always kept in
- * vmcb01. If LBR virtualization is enabled and L1 is running VMs of
- * its own, the MSRs are moved between vmcb01 and vmcb02 as needed.
- */
- return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? svm->vmcb :
- svm->vmcb01.ptr;
+ KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
+ to_svm(vcpu)->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
}
void svm_update_lbrv(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
- bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) ||
+ bool enable_lbrv = (svm->vmcb->save.dbgctl & DEBUGCTLMSR_LBR) ||
(is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
- if (enable_lbrv == current_enable_lbrv)
- return;
+ if (enable_lbrv && !current_enable_lbrv)
+ __svm_enable_lbrv(vcpu);
+ else if (!enable_lbrv && current_enable_lbrv)
+ __svm_disable_lbrv(vcpu);
- if (enable_lbrv)
- svm_enable_lbrv(vcpu);
- else
- svm_disable_lbrv(vcpu);
+ /*
+ * During nested transitions, it is possible that the current VMCB has
+ * LBR_CTL set, but the previous LBR_CTL had it cleared (or vice versa).
+ * In this case, even though LBR_CTL does not need an update, intercepts
+ * do, so always recalculate the intercepts here.
+ */
+ svm_recalc_lbr_msr_intercepts(vcpu);
}
void disable_nmi_singlestep(struct vcpu_svm *svm)
@@ -921,6 +910,8 @@ static void svm_hardware_unsetup(void)
{
int cpu;
+ avic_hardware_unsetup();
+
sev_hardware_unsetup();
for_each_possible_cpu(cpu)
@@ -1236,6 +1227,7 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)
}
svm->x2avic_msrs_intercepted = true;
+ svm->lbr_msrs_intercepted = true;
svm->vmcb01.ptr = page_address(vmcb01_page);
svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
@@ -2722,19 +2714,19 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = svm->tsc_aux;
break;
case MSR_IA32_DEBUGCTLMSR:
- msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl;
+ msr_info->data = svm->vmcb->save.dbgctl;
break;
case MSR_IA32_LASTBRANCHFROMIP:
- msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from;
+ msr_info->data = svm->vmcb->save.br_from;
break;
case MSR_IA32_LASTBRANCHTOIP:
- msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to;
+ msr_info->data = svm->vmcb->save.br_to;
break;
case MSR_IA32_LASTINTFROMIP:
- msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from;
+ msr_info->data = svm->vmcb->save.last_excp_from;
break;
case MSR_IA32_LASTINTTOIP:
- msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to;
+ msr_info->data = svm->vmcb->save.last_excp_to;
break;
case MSR_VM_HSAVE_PA:
msr_info->data = svm->nested.hsave_msr;
@@ -3002,7 +2994,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
if (data & DEBUGCTL_RESERVED_BITS)
return 1;
- svm_get_lbr_vmcb(svm)->save.dbgctl = data;
+ if (svm->vmcb->save.dbgctl == data)
+ break;
+
+ svm->vmcb->save.dbgctl = data;
+ vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
svm_update_lbrv(vcpu);
break;
case MSR_VM_HSAVE_PA:
@@ -5386,12 +5382,6 @@ static __init int svm_hardware_setup(void)
svm_hv_hardware_setup();
- for_each_possible_cpu(cpu) {
- r = svm_cpu_init(cpu);
- if (r)
- goto err;
- }
-
enable_apicv = avic_hardware_setup();
if (!enable_apicv) {
enable_ipiv = false;
@@ -5435,6 +5425,13 @@ static __init int svm_hardware_setup(void)
svm_set_cpu_caps();
kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED;
+
+ for_each_possible_cpu(cpu) {
+ r = svm_cpu_init(cpu);
+ if (r)
+ goto err;
+ }
+
return 0;
err:
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index e4b04f435b3d..dd78e6402345 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -329,13 +329,14 @@ struct vcpu_svm {
* back into remapped mode).
*/
struct list_head ir_list;
- spinlock_t ir_list_lock;
+ raw_spinlock_t ir_list_lock;
struct vcpu_sev_es_state sev_es;
bool guest_state_loaded;
bool x2avic_msrs_intercepted;
+ bool lbr_msrs_intercepted;
/* Guest GIF value, used when vGIF is not enabled */
bool guest_gif;
@@ -805,7 +806,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
)
bool __init avic_hardware_setup(void);
-int avic_ga_log_notifier(u32 ga_tag);
+void avic_hardware_unsetup(void);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb);
diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h
index bc5ece76533a..412d0829d7a2 100644
--- a/arch/x86/kvm/vmx/common.h
+++ b/arch/x86/kvm/vmx/common.h
@@ -98,7 +98,7 @@ static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa,
error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK)
? PFERR_PRESENT_MASK : 0;
- if (error_code & EPT_VIOLATION_GVA_IS_VALID)
+ if (exit_qualification & EPT_VIOLATION_GVA_IS_VALID)
error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 76271962cb70..bcea087b642f 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -6728,6 +6728,14 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
case EXIT_REASON_NOTIFY:
/* Notify VM exit is not exposed to L1 */
return false;
+ case EXIT_REASON_SEAMCALL:
+ case EXIT_REASON_TDCALL:
+ /*
+ * SEAMCALL and TDCALL unconditionally VM-Exit, but aren't
+ * virtualized by KVM for L1 hypervisors, i.e. L1 should
+ * never want or expect such an exit.
+ */
+ return false;
default:
return true;
}
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f87c216d976d..91b6f2f3edc2 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6032,6 +6032,12 @@ static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_tdx_instruction(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
#ifndef CONFIG_X86_SGX_KVM
static int handle_encls(struct kvm_vcpu *vcpu)
{
@@ -6157,6 +6163,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_ENCLS] = handle_encls,
[EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit,
[EXIT_REASON_NOTIFY] = handle_notify,
+ [EXIT_REASON_SEAMCALL] = handle_tdx_instruction,
+ [EXIT_REASON_TDCALL] = handle_tdx_instruction,
[EXIT_REASON_MSR_READ_IMM] = handle_rdmsr_imm,
[EXIT_REASON_MSR_WRITE_IMM] = handle_wrmsr_imm,
};
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 42ecd093bb4c..c9c2aa6f4705 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3874,15 +3874,9 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
/*
* Returns true if the MSR in question is managed via XSTATE, i.e. is context
- * switched with the rest of guest FPU state. Note! S_CET is _not_ context
- * switched via XSTATE even though it _is_ saved/restored via XSAVES/XRSTORS.
- * Because S_CET is loaded on VM-Enter and VM-Exit via dedicated VMCS fields,
- * the value saved/restored via XSTATE is always the host's value. That detail
- * is _extremely_ important, as the guest's S_CET must _never_ be resident in
- * hardware while executing in the host. Loading guest values for U_CET and
- * PL[0-3]_SSP while executing in the kernel is safe, as U_CET is specific to
- * userspace, and PL[0-3]_SSP are only consumed when transitioning to lower
- * privilege levels, i.e. are effectively only consumed by userspace as well.
+ * switched with the rest of guest FPU state.
+ *
+ * Note, S_CET is _not_ saved/restored via XSAVES/XRSTORS.
*/
static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr)
{
@@ -3905,6 +3899,11 @@ static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr)
* MSR that is managed via XSTATE. Note, the caller is responsible for doing
* the initial FPU load, this helper only ensures that guest state is resident
* in hardware (the kernel can load its FPU state in IRQ context).
+ *
+ * Note, loading guest values for U_CET and PL[0-3]_SSP while executing in the
+ * kernel is safe, as U_CET is specific to userspace, and PL[0-3]_SSP are only
+ * consumed when transitioning to lower privilege levels, i.e. are effectively
+ * only consumed by userspace as well.
*/
static __always_inline void kvm_access_xstate_msr(struct kvm_vcpu *vcpu,
struct msr_data *msr_info,
@@ -11807,6 +11806,9 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
/* Swap (qemu) user FPU context for the guest FPU context. */
static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
+ if (KVM_BUG_ON(vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm))
+ return;
+
/* Exclude PKRU, it's restored separately immediately after VM-Exit. */
fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
trace_kvm_fpu(1);
@@ -11815,6 +11817,9 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
/* When vcpu_run ends, restore user space FPU context. */
static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
+ if (KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm))
+ return;
+
fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);
++vcpu->stat.fpu_reload;
trace_kvm_fpu(0);
@@ -12137,9 +12142,6 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
int r;
vcpu_load(vcpu);
- if (kvm_mpx_supported())
- kvm_load_guest_fpu(vcpu);
-
kvm_vcpu_srcu_read_lock(vcpu);
r = kvm_apic_accept_events(vcpu);
@@ -12156,9 +12158,6 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
out:
kvm_vcpu_srcu_read_unlock(vcpu);
-
- if (kvm_mpx_supported())
- kvm_put_guest_fpu(vcpu);
vcpu_put(vcpu);
return r;
}
@@ -12788,6 +12787,7 @@ static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
u64 xfeatures_mask;
+ bool fpu_in_use;
int i;
/*
@@ -12811,13 +12811,23 @@ static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event)
BUILD_BUG_ON(sizeof(xfeatures_mask) * BITS_PER_BYTE <= XFEATURE_MAX);
/*
- * All paths that lead to INIT are required to load the guest's FPU
- * state (because most paths are buried in KVM_RUN).
- */
- kvm_put_guest_fpu(vcpu);
+ * Unload guest FPU state (if necessary) before zeroing XSTATE fields
+ * as the kernel can only modify the state when it's resident in memory,
+ * i.e. when it's not loaded into hardware.
+ *
+ * WARN if the vCPU's desire to run, i.e. whether or not it's in KVM_RUN,
+ * doesn't match the loaded/in-use state of the FPU, as KVM_RUN is the
+ * only path that can trigger INIT emulation _and_ loads FPU state, and
+ * KVM_RUN should _always_ load FPU state.
+ */
+ WARN_ON_ONCE(vcpu->wants_to_run != fpstate->in_use);
+ fpu_in_use = fpstate->in_use;
+ if (fpu_in_use)
+ kvm_put_guest_fpu(vcpu);
for_each_set_bit(i, (unsigned long *)&xfeatures_mask, XFEATURE_MAX)
fpstate_clear_xstate_component(fpstate, i);
- kvm_load_guest_fpu(vcpu);
+ if (fpu_in_use)
+ kvm_load_guest_fpu(vcpu);
}
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -13941,10 +13951,11 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
#ifdef CONFIG_KVM_GUEST_MEMFD
/*
- * KVM doesn't yet support mmap() on guest_memfd for VMs with private memory
- * (the private vs. shared tracking needs to be moved into guest_memfd).
+ * KVM doesn't yet support initializing guest_memfd memory as shared for VMs
+ * with private memory (the private vs. shared tracking needs to be moved into
+ * guest_memfd).
*/
-bool kvm_arch_supports_gmem_mmap(struct kvm *kvm)
+bool kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
{
return !kvm_arch_has_private_mem(kvm);
}
diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c
index c5c60d07308c..824664c0ecbd 100644
--- a/arch/x86/lib/cache-smp.c
+++ b/arch/x86/lib/cache-smp.c
@@ -2,6 +2,7 @@
#include <asm/paravirt.h>
#include <linux/smp.h>
#include <linux/export.h>
+#include <linux/kvm_types.h>
static void __wbinvd(void *dummy)
{
@@ -12,7 +13,7 @@ void wbinvd_on_cpu(int cpu)
{
smp_call_function_single(cpu, __wbinvd, NULL, 1);
}
-EXPORT_SYMBOL(wbinvd_on_cpu);
+EXPORT_SYMBOL_FOR_KVM(wbinvd_on_cpu);
void wbinvd_on_all_cpus(void)
{
@@ -24,7 +25,7 @@ void wbinvd_on_cpus_mask(struct cpumask *cpus)
{
on_each_cpu_mask(cpus, __wbinvd, NULL, 1);
}
-EXPORT_SYMBOL_GPL(wbinvd_on_cpus_mask);
+EXPORT_SYMBOL_FOR_KVM(wbinvd_on_cpus_mask);
static void __wbnoinvd(void *dummy)
{
@@ -35,10 +36,10 @@ void wbnoinvd_on_all_cpus(void)
{
on_each_cpu(__wbnoinvd, NULL, 1);
}
-EXPORT_SYMBOL_GPL(wbnoinvd_on_all_cpus);
+EXPORT_SYMBOL_FOR_KVM(wbnoinvd_on_all_cpus);
void wbnoinvd_on_cpus_mask(struct cpumask *cpus)
{
on_each_cpu_mask(cpus, __wbnoinvd, NULL, 1);
}
-EXPORT_SYMBOL_GPL(wbnoinvd_on_cpus_mask);
+EXPORT_SYMBOL_FOR_KVM(wbnoinvd_on_cpus_mask);
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index 4e385cbfd444..e03eeec55cfe 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -63,11 +63,10 @@ static bool is_string_insn(struct insn *insn)
bool insn_has_rep_prefix(struct insn *insn)
{
insn_byte_t p;
- int i;
insn_get_prefixes(insn);
- for_each_insn_prefix(insn, i, p) {
+ for_each_insn_prefix(insn, p) {
if (p == 0xf2 || p == 0xf3)
return true;
}
@@ -92,13 +91,13 @@ bool insn_has_rep_prefix(struct insn *insn)
static int get_seg_reg_override_idx(struct insn *insn)
{
int idx = INAT_SEG_REG_DEFAULT;
- int num_overrides = 0, i;
+ int num_overrides = 0;
insn_byte_t p;
insn_get_prefixes(insn);
/* Look for any segment override prefixes. */
- for_each_insn_prefix(insn, i, p) {
+ for_each_insn_prefix(insn, p) {
insn_attr_t attr;
attr = inat_get_opcode_attribute(p);
@@ -1676,3 +1675,147 @@ enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes)
return type;
}
+
+/*
+ * Recognise typical NOP patterns for both 32bit and 64bit.
+ *
+ * Notably:
+ * - NOP, but not: REP NOP aka PAUSE
+ * - NOPL
+ * - MOV %reg, %reg
+ * - LEA 0(%reg),%reg
+ * - JMP +0
+ *
+ * Must not have false-positives; instructions identified as a NOP might be
+ * emulated as a NOP (uprobe) or Run Length Encoded in a larger NOP
+ * (alternatives).
+ *
+ * False-negatives are fine; need not be exhaustive.
+ */
+bool insn_is_nop(struct insn *insn)
+{
+ u8 b3 = 0, x3 = 0, r3 = 0;
+ u8 b4 = 0, x4 = 0, r4 = 0, m = 0;
+ u8 modrm, modrm_mod, modrm_reg, modrm_rm;
+ u8 sib = 0, sib_scale, sib_index, sib_base;
+ u8 nrex, rex;
+ u8 p, rep = 0;
+
+ if ((nrex = insn->rex_prefix.nbytes)) {
+ rex = insn->rex_prefix.bytes[nrex-1];
+
+ r3 = !!X86_REX_R(rex);
+ x3 = !!X86_REX_X(rex);
+ b3 = !!X86_REX_B(rex);
+ if (nrex > 1) {
+ r4 = !!X86_REX2_R(rex);
+ x4 = !!X86_REX2_X(rex);
+ b4 = !!X86_REX2_B(rex);
+ m = !!X86_REX2_M(rex);
+ }
+
+ } else if (insn->vex_prefix.nbytes) {
+ /*
+ * Ignore VEX encoded NOPs
+ */
+ return false;
+ }
+
+ if (insn->modrm.nbytes) {
+ modrm = insn->modrm.bytes[0];
+ modrm_mod = X86_MODRM_MOD(modrm);
+ modrm_reg = X86_MODRM_REG(modrm) + 8*r3 + 16*r4;
+ modrm_rm = X86_MODRM_RM(modrm) + 8*b3 + 16*b4;
+ modrm = 1;
+ }
+
+ if (insn->sib.nbytes) {
+ sib = insn->sib.bytes[0];
+ sib_scale = X86_SIB_SCALE(sib);
+ sib_index = X86_SIB_INDEX(sib) + 8*x3 + 16*x4;
+ sib_base = X86_SIB_BASE(sib) + 8*b3 + 16*b4;
+ sib = 1;
+
+ modrm_rm = sib_base;
+ }
+
+ for_each_insn_prefix(insn, p) {
+ if (p == 0xf3) /* REPE */
+ rep = 1;
+ }
+
+ /*
+ * Opcode map munging:
+ *
+ * REX2: 0 - single byte opcode
+ * 1 - 0f second byte opcode
+ */
+ switch (m) {
+ case 0: break;
+ case 1: insn->opcode.value <<= 8;
+ insn->opcode.value |= 0x0f;
+ break;
+ default:
+ return false;
+ }
+
+ switch (insn->opcode.bytes[0]) {
+ case 0x0f: /* 2nd byte */
+ break;
+
+ case 0x89: /* MOV */
+ if (modrm_mod != 3) /* register-direct */
+ return false;
+
+ /* native size */
+ if (insn->opnd_bytes != 4 * (1 + insn->x86_64))
+ return false;
+
+ return modrm_reg == modrm_rm; /* MOV %reg, %reg */
+
+ case 0x8d: /* LEA */
+ if (modrm_mod == 0 || modrm_mod == 3) /* register-indirect with disp */
+ return false;
+
+ /* native size */
+ if (insn->opnd_bytes != 4 * (1 + insn->x86_64))
+ return false;
+
+ if (insn->displacement.value != 0)
+ return false;
+
+ if (sib && (sib_scale != 0 || sib_index != 4)) /* (%reg, %eiz, 1) */
+ return false;
+
+ for_each_insn_prefix(insn, p) {
+ if (p != 0x3e) /* DS */
+ return false;
+ }
+
+ return modrm_reg == modrm_rm; /* LEA 0(%reg), %reg */
+
+ case 0x90: /* NOP */
+ if (b3 || b4) /* XCHG %r{8,16,24},%rax */
+ return false;
+
+ if (rep) /* REP NOP := PAUSE */
+ return false;
+
+ return true;
+
+ case 0xe9: /* JMP.d32 */
+ case 0xeb: /* JMP.d8 */
+ return insn->immediate.value == 0; /* JMP +0 */
+
+ default:
+ return false;
+ }
+
+ switch (insn->opcode.bytes[1]) {
+ case 0x1f:
+ return modrm_reg == 0; /* 0f 1f /0 -- NOPL */
+
+ default:
+ return false;
+ }
+}
diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c
index b5893928d55c..8c7cd115b484 100644
--- a/arch/x86/lib/kaslr.c
+++ b/arch/x86/lib/kaslr.c
@@ -22,7 +22,7 @@
#include <asm/setup.h>
#define debug_putstr(v) early_printk("%s", v)
-#define has_cpuflag(f) boot_cpu_has(f)
+#define has_cpuflag(f) cpu_feature_enabled(f)
#define get_boot_seed() kaslr_offset()
#endif
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 4ef7c6dcbea6..dfdd1da89f36 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/export.h>
+#include <linux/kvm_types.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <asm/msr.h>
@@ -103,7 +104,7 @@ int msr_set_bit(u32 msr, u8 bit)
{
return __flip_bit(msr, bit, true);
}
-EXPORT_SYMBOL_GPL(msr_set_bit);
+EXPORT_SYMBOL_FOR_KVM(msr_set_bit);
/**
* msr_clear_bit - Clear @bit in a MSR @msr.
@@ -119,7 +120,7 @@ int msr_clear_bit(u32 msr, u8 bit)
{
return __flip_bit(msr, bit, false);
}
-EXPORT_SYMBOL_GPL(msr_clear_bit);
+EXPORT_SYMBOL_FOR_KVM(msr_clear_bit);
#ifdef CONFIG_TRACEPOINTS
void do_trace_write_msr(u32 msr, u64 val, int failed)
diff --git a/arch/x86/math-emu/poly.h b/arch/x86/math-emu/poly.h
index fc1c887ca073..654bfe4e29a0 100644
--- a/arch/x86/math-emu/poly.h
+++ b/arch/x86/math-emu/poly.h
@@ -39,7 +39,7 @@ asmlinkage void mul_Xsig_Xsig(Xsig *dest, const Xsig *mult);
asmlinkage void shr_Xsig(Xsig *, const int n);
asmlinkage int round_Xsig(Xsig *);
asmlinkage int norm_Xsig(Xsig *);
-asmlinkage void div_Xsig(Xsig *x1, const Xsig *x2, const Xsig *dest);
+asmlinkage void div_Xsig(Xsig *x1, const Xsig *x2, Xsig *dest);
/* Macro to extract the most significant 32 bits from a long long */
#define LL_MSW(x) (((unsigned long *)&x)[1])
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index a4700ef6eb64..2afa7a23340e 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -486,7 +486,6 @@ void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
#endif
ptdump_walk_pgd_level_core(m, mm, pgd, false, false);
}
-EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
void ptdump_walk_user_pgd_level_checkwx(void)
{
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 0e4270e20fad..1044aafd5d94 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -504,9 +504,6 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
continue;
}
- if (0)
- pr_info(" pte=%p addr=%lx pte=%016lx\n", pte, paddr,
- pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte);
pages++;
set_pte_init(pte, pfn_pte(paddr >> PAGE_SHIFT, prot), init);
paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE;
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index b68200a0e0c6..8a3d9722f602 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -42,6 +42,7 @@
#include <linux/highmem.h>
#include <linux/fs.h>
#include <linux/rbtree.h>
+#include <linux/kvm_types.h>
#include <asm/cpu_device_id.h>
#include <asm/cacheflush.h>
@@ -697,7 +698,7 @@ bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn)
cm == _PAGE_CACHE_MODE_UC_MINUS ||
cm == _PAGE_CACHE_MODE_WC;
}
-EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr);
+EXPORT_SYMBOL_FOR_KVM(pat_pfn_immune_to_uc_mtrr);
/**
* memtype_reserve_io - Request a memory type mapping for a region of memory
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index d2d54b8c4dbb..970981893c9b 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -446,7 +446,7 @@ static void cpa_flush(struct cpa_data *cpa, int cache)
}
start = fix_addr(__cpa_addr(cpa, 0));
- end = fix_addr(__cpa_addr(cpa, cpa->numpages));
+ end = start + cpa->numpages * PAGE_SIZE;
if (cpa->force_flush_all)
end = TLB_FLUSH_ALL;
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
index fc3f3d3e2ef2..8d31c6b9e184 100644
--- a/arch/x86/mm/physaddr.c
+++ b/arch/x86/mm/physaddr.c
@@ -31,17 +31,6 @@ unsigned long __phys_addr(unsigned long x)
return x;
}
EXPORT_SYMBOL(__phys_addr);
-
-unsigned long __phys_addr_symbol(unsigned long x)
-{
- unsigned long y = x - __START_KERNEL_map;
-
- /* only check upper bounds since lower bounds will trigger carry */
- VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
-
- return y + phys_base;
-}
-EXPORT_SYMBOL(__phys_addr_symbol);
#endif
bool __virt_addr_valid(unsigned long x)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 39f80111e6f1..f5b93e01e347 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -12,6 +12,7 @@
#include <linux/task_work.h>
#include <linux/mmu_notifier.h>
#include <linux/mmu_context.h>
+#include <linux/kvm_types.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -911,11 +912,31 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
* CR3 and cpu_tlbstate.loaded_mm are not all in sync.
*/
this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
- barrier();
- /* Start receiving IPIs and then read tlb_gen (and LAM below) */
+ /*
+ * Make sure this CPU is set in mm_cpumask() such that we'll
+ * receive invalidation IPIs.
+ *
+ * Rely on the smp_mb() implied by cpumask_set_cpu()'s atomic
+ * operation, or explicitly provide one. Such that:
+ *
+ * switch_mm_irqs_off() flush_tlb_mm_range()
+ * smp_store_release(loaded_mm, SWITCHING); atomic64_inc_return(tlb_gen)
+ * smp_mb(); // here // smp_mb() implied
+ * atomic64_read(tlb_gen); this_cpu_read(loaded_mm);
+ *
+ * we properly order against flush_tlb_mm_range(), where the
+ * loaded_mm load can happen in native_flush_tlb_multi() ->
+ * should_flush_tlb().
+ *
+ * This way switch_mm() must see the new tlb_gen or
+ * flush_tlb_mm_range() must see the new loaded_mm, or both.
+ */
if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))
cpumask_set_cpu(cpu, mm_cpumask(next));
+ else
+ smp_mb();
+
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
ns = choose_new_asid(next, next_tlb_gen);
@@ -1562,7 +1583,7 @@ unsigned long __get_current_cr3_fast(void)
VM_BUG_ON(cr3 != __read_cr3());
return cr3;
}
-EXPORT_SYMBOL_GPL(__get_current_cr3_fast);
+EXPORT_SYMBOL_FOR_KVM(__get_current_cr3_fast);
/*
* Flush one page in the kernel mapping
@@ -1703,7 +1724,7 @@ void __flush_tlb_all(void)
flush_tlb_local();
}
}
-EXPORT_SYMBOL_GPL(__flush_tlb_all);
+EXPORT_SYMBOL_FOR_KVM(__flush_tlb_all);
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index d4c93d9e73e4..de5083cb1d37 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -2701,7 +2701,7 @@ emit_jmp:
/* Update cleanup_addr */
ctx->cleanup_addr = proglen;
if (bpf_prog_was_classic(bpf_prog) &&
- !capable(CAP_SYS_ADMIN)) {
+ !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) {
u8 *ip = image + addrs[i - 1];
if (emit_spectre_bhb_barrier(&prog, ip, bpf_prog))
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index eac403248462..5ce4ebe99774 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -29,6 +29,7 @@
#include <linux/acpi.h>
#include <linux/suspend.h>
#include <linux/idr.h>
+#include <linux/kvm_types.h>
#include <asm/page.h>
#include <asm/special_insns.h>
#include <asm/msr-index.h>
@@ -181,7 +182,7 @@ int tdx_cpu_enable(void)
return 0;
}
-EXPORT_SYMBOL_GPL(tdx_cpu_enable);
+EXPORT_SYMBOL_FOR_KVM(tdx_cpu_enable);
/*
* Add a memory region as a TDX memory block. The caller must make sure
@@ -662,7 +663,7 @@ void tdx_quirk_reset_page(struct page *page)
{
tdx_quirk_reset_paddr(page_to_phys(page), PAGE_SIZE);
}
-EXPORT_SYMBOL_GPL(tdx_quirk_reset_page);
+EXPORT_SYMBOL_FOR_KVM(tdx_quirk_reset_page);
static void tdmr_quirk_reset_pamt(struct tdmr_info *tdmr)
{
@@ -1216,7 +1217,7 @@ int tdx_enable(void)
return ret;
}
-EXPORT_SYMBOL_GPL(tdx_enable);
+EXPORT_SYMBOL_FOR_KVM(tdx_enable);
static bool is_pamt_page(unsigned long phys)
{
@@ -1477,13 +1478,13 @@ const struct tdx_sys_info *tdx_get_sysinfo(void)
return p;
}
-EXPORT_SYMBOL_GPL(tdx_get_sysinfo);
+EXPORT_SYMBOL_FOR_KVM(tdx_get_sysinfo);
u32 tdx_get_nr_guest_keyids(void)
{
return tdx_nr_guest_keyids;
}
-EXPORT_SYMBOL_GPL(tdx_get_nr_guest_keyids);
+EXPORT_SYMBOL_FOR_KVM(tdx_get_nr_guest_keyids);
int tdx_guest_keyid_alloc(void)
{
@@ -1491,13 +1492,13 @@ int tdx_guest_keyid_alloc(void)
tdx_guest_keyid_start + tdx_nr_guest_keyids - 1,
GFP_KERNEL);
}
-EXPORT_SYMBOL_GPL(tdx_guest_keyid_alloc);
+EXPORT_SYMBOL_FOR_KVM(tdx_guest_keyid_alloc);
void tdx_guest_keyid_free(unsigned int keyid)
{
ida_free(&tdx_guest_keyid_pool, keyid);
}
-EXPORT_SYMBOL_GPL(tdx_guest_keyid_free);
+EXPORT_SYMBOL_FOR_KVM(tdx_guest_keyid_free);
static inline u64 tdx_tdr_pa(struct tdx_td *td)
{
@@ -1521,7 +1522,7 @@ noinstr u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args)
return __seamcall_dirty_cache(__seamcall_saved_ret, TDH_VP_ENTER, args);
}
-EXPORT_SYMBOL_GPL(tdh_vp_enter);
+EXPORT_SYMBOL_FOR_KVM(tdh_vp_enter);
u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page)
{
@@ -1533,7 +1534,7 @@ u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page)
tdx_clflush_page(tdcs_page);
return seamcall(TDH_MNG_ADDCX, &args);
}
-EXPORT_SYMBOL_GPL(tdh_mng_addcx);
+EXPORT_SYMBOL_FOR_KVM(tdh_mng_addcx);
u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2)
{
@@ -1553,7 +1554,7 @@ u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page
return ret;
}
-EXPORT_SYMBOL_GPL(tdh_mem_page_add);
+EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_add);
u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
{
@@ -1572,7 +1573,7 @@ u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u
return ret;
}
-EXPORT_SYMBOL_GPL(tdh_mem_sept_add);
+EXPORT_SYMBOL_FOR_KVM(tdh_mem_sept_add);
u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page)
{
@@ -1584,7 +1585,7 @@ u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page)
tdx_clflush_page(tdcx_page);
return seamcall(TDH_VP_ADDCX, &args);
}
-EXPORT_SYMBOL_GPL(tdh_vp_addcx);
+EXPORT_SYMBOL_FOR_KVM(tdh_vp_addcx);
u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2)
{
@@ -1603,7 +1604,7 @@ u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u
return ret;
}
-EXPORT_SYMBOL_GPL(tdh_mem_page_aug);
+EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_aug);
u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2)
{
@@ -1620,7 +1621,7 @@ u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u6
return ret;
}
-EXPORT_SYMBOL_GPL(tdh_mem_range_block);
+EXPORT_SYMBOL_FOR_KVM(tdh_mem_range_block);
u64 tdh_mng_key_config(struct tdx_td *td)
{
@@ -1630,7 +1631,7 @@ u64 tdh_mng_key_config(struct tdx_td *td)
return seamcall(TDH_MNG_KEY_CONFIG, &args);
}
-EXPORT_SYMBOL_GPL(tdh_mng_key_config);
+EXPORT_SYMBOL_FOR_KVM(tdh_mng_key_config);
u64 tdh_mng_create(struct tdx_td *td, u16 hkid)
{
@@ -1642,7 +1643,7 @@ u64 tdh_mng_create(struct tdx_td *td, u16 hkid)
tdx_clflush_page(td->tdr_page);
return seamcall(TDH_MNG_CREATE, &args);
}
-EXPORT_SYMBOL_GPL(tdh_mng_create);
+EXPORT_SYMBOL_FOR_KVM(tdh_mng_create);
u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp)
{
@@ -1654,7 +1655,7 @@ u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp)
tdx_clflush_page(vp->tdvpr_page);
return seamcall(TDH_VP_CREATE, &args);
}
-EXPORT_SYMBOL_GPL(tdh_vp_create);
+EXPORT_SYMBOL_FOR_KVM(tdh_vp_create);
u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data)
{
@@ -1671,7 +1672,7 @@ u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data)
return ret;
}
-EXPORT_SYMBOL_GPL(tdh_mng_rd);
+EXPORT_SYMBOL_FOR_KVM(tdh_mng_rd);
u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2)
{
@@ -1688,7 +1689,7 @@ u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2)
return ret;
}
-EXPORT_SYMBOL_GPL(tdh_mr_extend);
+EXPORT_SYMBOL_FOR_KVM(tdh_mr_extend);
u64 tdh_mr_finalize(struct tdx_td *td)
{
@@ -1698,7 +1699,7 @@ u64 tdh_mr_finalize(struct tdx_td *td)
return seamcall(TDH_MR_FINALIZE, &args);
}
-EXPORT_SYMBOL_GPL(tdh_mr_finalize);
+EXPORT_SYMBOL_FOR_KVM(tdh_mr_finalize);
u64 tdh_vp_flush(struct tdx_vp *vp)
{
@@ -1708,7 +1709,7 @@ u64 tdh_vp_flush(struct tdx_vp *vp)
return seamcall(TDH_VP_FLUSH, &args);
}
-EXPORT_SYMBOL_GPL(tdh_vp_flush);
+EXPORT_SYMBOL_FOR_KVM(tdh_vp_flush);
u64 tdh_mng_vpflushdone(struct tdx_td *td)
{
@@ -1718,7 +1719,7 @@ u64 tdh_mng_vpflushdone(struct tdx_td *td)
return seamcall(TDH_MNG_VPFLUSHDONE, &args);
}
-EXPORT_SYMBOL_GPL(tdh_mng_vpflushdone);
+EXPORT_SYMBOL_FOR_KVM(tdh_mng_vpflushdone);
u64 tdh_mng_key_freeid(struct tdx_td *td)
{
@@ -1728,7 +1729,7 @@ u64 tdh_mng_key_freeid(struct tdx_td *td)
return seamcall(TDH_MNG_KEY_FREEID, &args);
}
-EXPORT_SYMBOL_GPL(tdh_mng_key_freeid);
+EXPORT_SYMBOL_FOR_KVM(tdh_mng_key_freeid);
u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err)
{
@@ -1744,7 +1745,7 @@ u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err)
return ret;
}
-EXPORT_SYMBOL_GPL(tdh_mng_init);
+EXPORT_SYMBOL_FOR_KVM(tdh_mng_init);
u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data)
{
@@ -1761,7 +1762,7 @@ u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data)
return ret;
}
-EXPORT_SYMBOL_GPL(tdh_vp_rd);
+EXPORT_SYMBOL_FOR_KVM(tdh_vp_rd);
u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask)
{
@@ -1774,7 +1775,7 @@ u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask)
return seamcall(TDH_VP_WR, &args);
}
-EXPORT_SYMBOL_GPL(tdh_vp_wr);
+EXPORT_SYMBOL_FOR_KVM(tdh_vp_wr);
u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid)
{
@@ -1787,7 +1788,7 @@ u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid)
/* apicid requires version == 1. */
return seamcall(TDH_VP_INIT | (1ULL << TDX_VERSION_SHIFT), &args);
}
-EXPORT_SYMBOL_GPL(tdh_vp_init);
+EXPORT_SYMBOL_FOR_KVM(tdh_vp_init);
/*
* TDX ABI defines output operands as PT, OWNER and SIZE. These are TDX defined fomats.
@@ -1809,7 +1810,7 @@ u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64
return ret;
}
-EXPORT_SYMBOL_GPL(tdh_phymem_page_reclaim);
+EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_reclaim);
u64 tdh_mem_track(struct tdx_td *td)
{
@@ -1819,7 +1820,7 @@ u64 tdh_mem_track(struct tdx_td *td)
return seamcall(TDH_MEM_TRACK, &args);
}
-EXPORT_SYMBOL_GPL(tdh_mem_track);
+EXPORT_SYMBOL_FOR_KVM(tdh_mem_track);
u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2)
{
@@ -1836,7 +1837,7 @@ u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u6
return ret;
}
-EXPORT_SYMBOL_GPL(tdh_mem_page_remove);
+EXPORT_SYMBOL_FOR_KVM(tdh_mem_page_remove);
u64 tdh_phymem_cache_wb(bool resume)
{
@@ -1846,7 +1847,7 @@ u64 tdh_phymem_cache_wb(bool resume)
return seamcall(TDH_PHYMEM_CACHE_WB, &args);
}
-EXPORT_SYMBOL_GPL(tdh_phymem_cache_wb);
+EXPORT_SYMBOL_FOR_KVM(tdh_phymem_cache_wb);
u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td)
{
@@ -1856,7 +1857,7 @@ u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td)
return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
}
-EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_tdr);
+EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_tdr);
u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page)
{
@@ -1866,7 +1867,7 @@ u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page)
return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
}
-EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid);
+EXPORT_SYMBOL_FOR_KVM(tdh_phymem_page_wbinvd_hkid);
#ifdef CONFIG_KEXEC_CORE
void tdx_cpu_flush_cache_for_kexec(void)
@@ -1884,5 +1885,5 @@ void tdx_cpu_flush_cache_for_kexec(void)
wbinvd();
this_cpu_write(cache_state_incoherent, false);
}
-EXPORT_SYMBOL_GPL(tdx_cpu_flush_cache_for_kexec);
+EXPORT_SYMBOL_FOR_KVM(tdx_cpu_flush_cache_for_kexec);
#endif