Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/acpi/apei.c | 2
-rw-r--r--  arch/x86/kernel/acpi/cppc.c | 2
-rw-r--r--  arch/x86/kernel/alternative.c | 96
-rw-r--r--  arch/x86/kernel/amd_node.c | 150
-rw-r--r--  arch/x86/kernel/apic/apic.c | 18
-rw-r--r--  arch/x86/kernel/apic/apic_common.c | 3
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 2
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 24
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 251
-rw-r--r--  arch/x86/kernel/cpu/bus_lock.c | 3
-rw-r--r--  arch/x86/kernel/cpu/common.c | 45
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 9
-rw-r--r--  arch/x86/kernel/cpu/cpuid-deps.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mce/amd.c | 356
-rw-r--r--  arch/x86/kernel/cpu/mce/core.c | 31
-rw-r--r--  arch/x86/kernel/cpu/mce/internal.h | 4
-rw-r--r--  arch/x86/kernel/cpu/mce/threshold.c | 19
-rw-r--r--  arch/x86/kernel/cpu/microcode/amd.c | 130
-rw-r--r--  arch/x86/kernel/cpu/microcode/core.c | 13
-rw-r--r--  arch/x86/kernel/cpu/microcode/intel.c | 362
-rw-r--r--  arch/x86/kernel/cpu/microcode/internal.h | 4
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 1
-rw-r--r--  arch/x86/kernel/cpu/mtrr/mtrr.h | 4
-rw-r--r--  arch/x86/kernel/cpu/resctrl/core.c | 9
-rw-r--r--  arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 40
-rw-r--r--  arch/x86/kernel/cpu/resctrl/internal.h | 5
-rw-r--r--  arch/x86/kernel/cpu/resctrl/monitor.c | 1
-rw-r--r--  arch/x86/kernel/cpu/scattered.c | 5
-rw-r--r--  arch/x86/kernel/cpu/sgx/driver.c | 19
-rw-r--r--  arch/x86/kernel/cpu/sgx/encl.c | 1
-rw-r--r--  arch/x86/kernel/cpu/sgx/encls.h | 5
-rw-r--r--  arch/x86/kernel/cpu/sgx/main.c | 104
-rw-r--r--  arch/x86/kernel/cpu/sgx/sgx.h | 3
-rw-r--r--  arch/x86/kernel/cpu/sgx/virt.c | 25
-rw-r--r--  arch/x86/kernel/cpu/topology.c | 4
-rw-r--r--  arch/x86/kernel/cpu/topology_common.c | 3
-rw-r--r--  arch/x86/kernel/cpu/tsx.c | 58
-rw-r--r--  arch/x86/kernel/dumpstack.c | 23
-rw-r--r--  arch/x86/kernel/e820.c | 3
-rw-r--r--  arch/x86/kernel/fpu/core.c | 24
-rw-r--r--  arch/x86/kernel/fpu/xstate.c | 7
-rw-r--r--  arch/x86/kernel/ftrace_64.S | 8
-rw-r--r--  arch/x86/kernel/hw_breakpoint.c | 3
-rw-r--r--  arch/x86/kernel/irq.c | 3
-rw-r--r--  arch/x86/kernel/kprobes/core.c | 3
-rw-r--r--  arch/x86/kernel/kprobes/opt.c | 4
-rw-r--r--  arch/x86/kernel/kvm.c | 5
-rw-r--r--  arch/x86/kernel/module.c | 15
-rw-r--r--  arch/x86/kernel/msr.c | 2
-rw-r--r--  arch/x86/kernel/nmi.c | 5
-rw-r--r--  arch/x86/kernel/process_64.c | 5
-rw-r--r--  arch/x86/kernel/reboot.c | 5
-rw-r--r--  arch/x86/kernel/relocate_kernel_64.S | 7
-rw-r--r--  arch/x86/kernel/smpboot.c | 81
-rw-r--r--  arch/x86/kernel/static_call.c | 13
-rw-r--r--  arch/x86/kernel/traps.c | 163
-rw-r--r--  arch/x86/kernel/tsc.c | 1
-rw-r--r--  arch/x86/kernel/uprobes.c | 70
58 files changed, 1526 insertions(+), 738 deletions(-)
diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c
index 0916f00a992e..e21419e686eb 100644
--- a/arch/x86/kernel/acpi/apei.c
+++ b/arch/x86/kernel/acpi/apei.c
@@ -19,6 +19,8 @@ int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data)
if (!cmc->enabled)
return 0;
+ mce_save_apei_thr_limit(cmc->notify.error_threshold_value);
+
/*
* We expect HEST to provide a list of MC banks that report errors
* in firmware first mode. Otherwise, return non-zero value to
diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
index 7047124490f6..d7c8ef1e354d 100644
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -196,7 +196,7 @@ int amd_detect_prefcore(bool *detected)
break;
}
- for_each_present_cpu(cpu) {
+ for_each_online_cpu(cpu) {
u32 tmp;
int ret;
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 8ee5ff547357..74f4c659f9c9 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -9,6 +9,7 @@
#include <asm/text-patching.h>
#include <asm/insn.h>
+#include <asm/insn-eval.h>
#include <asm/ibt.h>
#include <asm/set_memory.h>
#include <asm/nmi.h>
@@ -346,25 +347,6 @@ static void add_nop(u8 *buf, unsigned int len)
}
/*
- * Matches NOP and NOPL, not any of the other possible NOPs.
- */
-static bool insn_is_nop(struct insn *insn)
-{
- /* Anything NOP, but no REP NOP */
- if (insn->opcode.bytes[0] == 0x90 &&
- (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3))
- return true;
-
- /* NOPL */
- if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F)
- return true;
-
- /* TODO: more nops */
-
- return false;
-}
-
-/*
* Find the offset of the first non-NOP instruction starting at @offset
* but no further than @len.
*/
@@ -559,7 +541,7 @@ EXPORT_SYMBOL(BUG_func);
* Rewrite the "call BUG_func" replacement to point to the target of the
* indirect pv_ops call "call *disp(%ip)".
*/
-static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
+static unsigned int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
{
void *target, *bug = &BUG_func;
s32 disp;
@@ -643,7 +625,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
* order.
*/
for (a = start; a < end; a++) {
- int insn_buff_sz = 0;
+ unsigned int insn_buff_sz = 0;
/*
* In case of nested ALTERNATIVE()s the outer alternative might
@@ -683,11 +665,8 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
memcpy(insn_buff, replacement, a->replacementlen);
insn_buff_sz = a->replacementlen;
- if (a->flags & ALT_FLAG_DIRECT_CALL) {
+ if (a->flags & ALT_FLAG_DIRECT_CALL)
insn_buff_sz = alt_replace_call(instr, insn_buff, a);
- if (insn_buff_sz < 0)
- continue;
- }
for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
insn_buff[insn_buff_sz] = 0x90;
@@ -2244,21 +2223,34 @@ int alternatives_text_reserved(void *start, void *end)
* See entry_{32,64}.S for more details.
*/
-/*
- * We define the int3_magic() function in assembly to control the calling
- * convention such that we can 'call' it from assembly.
- */
+extern void int3_selftest_asm(unsigned int *ptr);
-extern void int3_magic(unsigned int *ptr); /* defined in asm */
+asm (
+" .pushsection .init.text, \"ax\", @progbits\n"
+" .type int3_selftest_asm, @function\n"
+"int3_selftest_asm:\n"
+ ANNOTATE_NOENDBR
+ /*
+ * INT3 padded with NOP to CALL_INSN_SIZE. The INT3 triggers an
+ * exception, then the int3_exception_nb notifier emulates a call to
+ * int3_selftest_callee().
+ */
+" int3; nop; nop; nop; nop\n"
+ ASM_RET
+" .size int3_selftest_asm, . - int3_selftest_asm\n"
+" .popsection\n"
+);
+
+extern void int3_selftest_callee(unsigned int *ptr);
asm (
" .pushsection .init.text, \"ax\", @progbits\n"
-" .type int3_magic, @function\n"
-"int3_magic:\n"
+" .type int3_selftest_callee, @function\n"
+"int3_selftest_callee:\n"
ANNOTATE_NOENDBR
-" movl $1, (%" _ASM_ARG1 ")\n"
+" movl $0x1234, (%" _ASM_ARG1 ")\n"
ASM_RET
-" .size int3_magic, .-int3_magic\n"
+" .size int3_selftest_callee, . - int3_selftest_callee\n"
" .popsection\n"
);
@@ -2267,7 +2259,7 @@ extern void int3_selftest_ip(void); /* defined in asm below */
static int __init
int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
{
- unsigned long selftest = (unsigned long)&int3_selftest_ip;
+ unsigned long selftest = (unsigned long)&int3_selftest_asm;
struct die_args *args = data;
struct pt_regs *regs = args->regs;
@@ -2282,7 +2274,7 @@ int3_exception_notify(struct notifier_block *self, unsigned long val, void *data
if (regs->ip - INT3_INSN_SIZE != selftest)
return NOTIFY_DONE;
- int3_emulate_call(regs, (unsigned long)&int3_magic);
+ int3_emulate_call(regs, (unsigned long)&int3_selftest_callee);
return NOTIFY_STOP;
}
@@ -2298,19 +2290,11 @@ static noinline void __init int3_selftest(void)
BUG_ON(register_die_notifier(&int3_exception_nb));
/*
- * Basically: int3_magic(&val); but really complicated :-)
- *
- * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb
- * notifier above will emulate CALL for us.
+ * Basically: int3_selftest_callee(&val); but really complicated :-)
*/
- asm volatile ("int3_selftest_ip:\n\t"
- ANNOTATE_NOENDBR
- " int3; nop; nop; nop; nop\n\t"
- : ASM_CALL_CONSTRAINT
- : __ASM_SEL_RAW(a, D) (&val)
- : "memory");
+ int3_selftest_asm(&val);
- BUG_ON(val != 1);
+ BUG_ON(val != 0x1234);
unregister_die_notifier(&int3_exception_nb);
}
@@ -2469,16 +2453,30 @@ void __init_or_module text_poke_early(void *addr, const void *opcode,
__ro_after_init struct mm_struct *text_poke_mm;
__ro_after_init unsigned long text_poke_mm_addr;
+/*
+ * Text poking creates and uses a mapping in the lower half of the
+ * address space. Relax LASS enforcement when accessing the poking
+ * address.
+ *
+ * objtool enforces a strict policy of "no function calls within AC=1
+ * regions". Adhere to the policy by using inline versions of
+ * memcpy()/memset() that will never result in a function call.
+ */
+
static void text_poke_memcpy(void *dst, const void *src, size_t len)
{
- memcpy(dst, src, len);
+ lass_stac();
+ __inline_memcpy(dst, src, len);
+ lass_clac();
}
static void text_poke_memset(void *dst, const void *src, size_t len)
{
int c = *(const int *)src;
- memset(dst, c, len);
+ lass_stac();
+ __inline_memset(dst, c, len);
+ lass_clac();
}
typedef void text_poke_f(void *dst, const void *src, size_t len);
diff --git a/arch/x86/kernel/amd_node.c b/arch/x86/kernel/amd_node.c
index a40176b62eb5..3d0a4768d603 100644
--- a/arch/x86/kernel/amd_node.c
+++ b/arch/x86/kernel/amd_node.c
@@ -34,62 +34,6 @@ struct pci_dev *amd_node_get_func(u16 node, u8 func)
return pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(AMD_NODE0_PCI_SLOT + node, func));
}
-#define DF_BLK_INST_CNT 0x040
-#define DF_CFG_ADDR_CNTL_LEGACY 0x084
-#define DF_CFG_ADDR_CNTL_DF4 0xC04
-
-#define DF_MAJOR_REVISION GENMASK(27, 24)
-
-static u16 get_cfg_addr_cntl_offset(struct pci_dev *df_f0)
-{
- u32 reg;
-
- /*
- * Revision fields added for DF4 and later.
- *
- * Major revision of '0' is found pre-DF4. Field is Read-as-Zero.
- */
- if (pci_read_config_dword(df_f0, DF_BLK_INST_CNT, &reg))
- return 0;
-
- if (reg & DF_MAJOR_REVISION)
- return DF_CFG_ADDR_CNTL_DF4;
-
- return DF_CFG_ADDR_CNTL_LEGACY;
-}
-
-struct pci_dev *amd_node_get_root(u16 node)
-{
- struct pci_dev *root;
- u16 cntl_off;
- u8 bus;
-
- if (!cpu_feature_enabled(X86_FEATURE_ZEN))
- return NULL;
-
- /*
- * D18F0xXXX [Config Address Control] (DF::CfgAddressCntl)
- * Bits [7:0] (SecBusNum) holds the bus number of the root device for
- * this Data Fabric instance. The segment, device, and function will be 0.
- */
- struct pci_dev *df_f0 __free(pci_dev_put) = amd_node_get_func(node, 0);
- if (!df_f0)
- return NULL;
-
- cntl_off = get_cfg_addr_cntl_offset(df_f0);
- if (!cntl_off)
- return NULL;
-
- if (pci_read_config_byte(df_f0, cntl_off, &bus))
- return NULL;
-
- /* Grab the pointer for the actual root device instance. */
- root = pci_get_domain_bus_and_slot(0, bus, 0);
-
- pci_dbg(root, "is root for AMD node %u\n", node);
- return root;
-}
-
static struct pci_dev **amd_roots;
/* Protect the PCI config register pairs used for SMN. */
@@ -274,51 +218,21 @@ DEFINE_SHOW_STORE_ATTRIBUTE(smn_node);
DEFINE_SHOW_STORE_ATTRIBUTE(smn_address);
DEFINE_SHOW_STORE_ATTRIBUTE(smn_value);
-static int amd_cache_roots(void)
-{
- u16 node, num_nodes = amd_num_nodes();
-
- amd_roots = kcalloc(num_nodes, sizeof(*amd_roots), GFP_KERNEL);
- if (!amd_roots)
- return -ENOMEM;
-
- for (node = 0; node < num_nodes; node++)
- amd_roots[node] = amd_node_get_root(node);
-
- return 0;
-}
-
-static int reserve_root_config_spaces(void)
+static struct pci_dev *get_next_root(struct pci_dev *root)
{
- struct pci_dev *root = NULL;
- struct pci_bus *bus = NULL;
-
- while ((bus = pci_find_next_bus(bus))) {
- /* Root device is Device 0 Function 0 on each Primary Bus. */
- root = pci_get_slot(bus, 0);
- if (!root)
+ while ((root = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, root))) {
+ /* Root device is Device 0 Function 0. */
+ if (root->devfn)
continue;
if (root->vendor != PCI_VENDOR_ID_AMD &&
root->vendor != PCI_VENDOR_ID_HYGON)
continue;
- pci_dbg(root, "Reserving PCI config space\n");
-
- /*
- * There are a few SMN index/data pairs and other registers
- * that shouldn't be accessed by user space.
- * So reserve the entire PCI config space for simplicity rather
- * than covering specific registers piecemeal.
- */
- if (!pci_request_config_region_exclusive(root, 0, PCI_CFG_SPACE_SIZE, NULL)) {
- pci_err(root, "Failed to reserve config space\n");
- return -EEXIST;
- }
+ break;
}
- smn_exclusive = true;
- return 0;
+ return root;
}
static bool enable_dfs;
@@ -332,7 +246,8 @@ __setup("amd_smn_debugfs_enable", amd_smn_enable_dfs);
static int __init amd_smn_init(void)
{
- int err;
+ u16 count, num_roots, roots_per_node, node, num_nodes;
+ struct pci_dev *root;
if (!cpu_feature_enabled(X86_FEATURE_ZEN))
return 0;
@@ -342,13 +257,48 @@ static int __init amd_smn_init(void)
if (amd_roots)
return 0;
- err = amd_cache_roots();
- if (err)
- return err;
+ num_roots = 0;
+ root = NULL;
+ while ((root = get_next_root(root))) {
+ pci_dbg(root, "Reserving PCI config space\n");
- err = reserve_root_config_spaces();
- if (err)
- return err;
+ /*
+ * There are a few SMN index/data pairs and other registers
+ * that shouldn't be accessed by user space. So reserve the
+ * entire PCI config space for simplicity rather than covering
+ * specific registers piecemeal.
+ */
+ if (!pci_request_config_region_exclusive(root, 0, PCI_CFG_SPACE_SIZE, NULL)) {
+ pci_err(root, "Failed to reserve config space\n");
+ return -EEXIST;
+ }
+
+ num_roots++;
+ }
+
+ pr_debug("Found %d AMD root devices\n", num_roots);
+
+ if (!num_roots)
+ return -ENODEV;
+
+ num_nodes = amd_num_nodes();
+ amd_roots = kcalloc(num_nodes, sizeof(*amd_roots), GFP_KERNEL);
+ if (!amd_roots)
+ return -ENOMEM;
+
+ roots_per_node = num_roots / num_nodes;
+
+ count = 0;
+ node = 0;
+ root = NULL;
+ while (node < num_nodes && (root = get_next_root(root))) {
+ /* Use one root for each node and skip the rest. */
+ if (count++ % roots_per_node)
+ continue;
+
+ pci_dbg(root, "is root for AMD node %u\n", node);
+ amd_roots[node++] = root;
+ }
if (enable_dfs) {
debugfs_dir = debugfs_create_dir("amd_smn", arch_debugfs_dir);
@@ -358,6 +308,8 @@ static int __init amd_smn_init(void)
debugfs_create_file("value", 0600, debugfs_dir, NULL, &smn_value_fops);
}
+ smn_exclusive = true;
+
return 0;
}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 680d305589a3..9c29e12b84e5 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -36,6 +36,7 @@
#include <linux/dmi.h>
#include <linux/smp.h>
#include <linux/mm.h>
+#include <linux/kvm_types.h>
#include <xen/xen.h>
@@ -173,6 +174,7 @@ static struct resource lapic_resource = {
.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
};
+/* Measured in ticks per HZ. */
unsigned int lapic_timer_period = 0;
static void apic_pm_activate(void);
@@ -792,6 +794,7 @@ static int __init calibrate_APIC_clock(void)
{
struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
u64 tsc_perj = 0, tsc_start = 0;
+ long delta_tsc_khz, bus_khz;
unsigned long jif_start;
unsigned long deltaj;
long delta, deltatsc;
@@ -894,14 +897,15 @@ static int __init calibrate_APIC_clock(void)
apic_pr_verbose("..... calibration result: %u\n", lapic_timer_period);
if (boot_cpu_has(X86_FEATURE_TSC)) {
- apic_pr_verbose("..... CPU clock speed is %ld.%04ld MHz.\n",
- (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ),
- (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ));
+ delta_tsc_khz = (deltatsc * HZ) / (1000 * LAPIC_CAL_LOOPS);
+
+ apic_pr_verbose("..... CPU clock speed is %ld.%03ld MHz.\n",
+ delta_tsc_khz / 1000, delta_tsc_khz % 1000);
}
- apic_pr_verbose("..... host bus clock speed is %u.%04u MHz.\n",
- lapic_timer_period / (1000000 / HZ),
- lapic_timer_period % (1000000 / HZ));
+ bus_khz = (long)lapic_timer_period * HZ / 1000;
+ apic_pr_verbose("..... host bus clock speed is %ld.%03ld MHz.\n",
+ bus_khz / 1000, bus_khz % 1000);
/*
* Do a sanity check on the APIC calibration result
@@ -2316,7 +2320,7 @@ u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid)
dest |= msg->arch_addr_hi.destid_8_31 << 8;
return dest;
}
-EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid);
+EXPORT_SYMBOL_FOR_KVM(x86_msi_msg_get_destid);
static void __init apic_bsp_up_setup(void)
{
diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c
index 9ef3be866832..2ed3b5c88c7f 100644
--- a/arch/x86/kernel/apic/apic_common.c
+++ b/arch/x86/kernel/apic/apic_common.c
@@ -4,6 +4,7 @@
* SPDX-License-Identifier: GPL-2.0
*/
#include <linux/irq.h>
+#include <linux/kvm_types.h>
#include <asm/apic.h>
#include "local.h"
@@ -25,7 +26,7 @@ u32 default_cpu_present_to_apicid(int mps_cpu)
else
return BAD_APICID;
}
-EXPORT_SYMBOL_GPL(default_cpu_present_to_apicid);
+EXPORT_SYMBOL_FOR_KVM(default_cpu_present_to_apicid);
/*
* Set up the logical destination ID when the APIC operates in logical
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 5ba2feb2c04c..1e0442e867b1 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2864,7 +2864,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
ioapic = mp_irqdomain_ioapic_idx(domain);
pin = info->ioapic.pin;
- if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0)
+ if (irq_resolve_mapping(domain, (irq_hw_number_t)pin))
return -EEXIST;
data = kzalloc(sizeof(*data), GFP_KERNEL);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index ccaa51ce63f6..bc94ff1e250a 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -3,7 +3,7 @@
#include <linux/bitops.h>
#include <linux/elf.h>
#include <linux/mm.h>
-
+#include <linux/kvm_types.h>
#include <linux/io.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
@@ -516,7 +516,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
setup_force_cpu_cap(X86_FEATURE_ZEN5);
break;
case 0x50 ... 0x5f:
- case 0x90 ... 0xaf:
+ case 0x80 ... 0xaf:
case 0xc0 ... 0xcf:
setup_force_cpu_cap(X86_FEATURE_ZEN6);
break;
@@ -1035,8 +1035,26 @@ static void init_amd_zen4(struct cpuinfo_x86 *c)
}
}
+static const struct x86_cpu_id zen5_rdseed_microcode[] = {
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x02, 0x1, 0x0b00215a),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x08, 0x1, 0x0b008121),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x11, 0x0, 0x0b101054),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x24, 0x0, 0x0b204037),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x44, 0x0, 0x0b404035),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x44, 0x1, 0x0b404108),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x60, 0x0, 0x0b600037),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x68, 0x0, 0x0b608038),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x70, 0x0, 0x0b700037),
+ {},
+};
+
static void init_amd_zen5(struct cpuinfo_x86 *c)
{
+ if (!x86_match_min_microcode_rev(zen5_rdseed_microcode)) {
+ clear_cpu_cap(c, X86_FEATURE_RDSEED);
+ msr_clear_bit(MSR_AMD64_CPUID_FN_7, 18);
+ pr_emerg_once("RDSEED32 is broken. Disabling the corresponding CPUID bit.\n");
+ }
}
static void init_amd(struct cpuinfo_x86 *c)
@@ -1300,7 +1318,7 @@ unsigned long amd_get_dr_addr_mask(unsigned int dr)
return per_cpu(amd_dr_addr_mask[dr], smp_processor_id());
}
-EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask);
+EXPORT_SYMBOL_FOR_KVM(amd_get_dr_addr_mask);
static void zenbleed_check_cpu(void *unused)
{
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index d7fa03bf51b4..d8660770dc6a 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -16,6 +16,7 @@
#include <linux/sched/smt.h>
#include <linux/pgtable.h>
#include <linux/bpf.h>
+#include <linux/kvm_types.h>
#include <asm/spec-ctrl.h>
#include <asm/cmdline.h>
@@ -53,56 +54,8 @@
* mitigation option.
*/
-static void __init spectre_v1_select_mitigation(void);
-static void __init spectre_v1_apply_mitigation(void);
-static void __init spectre_v2_select_mitigation(void);
-static void __init spectre_v2_update_mitigation(void);
-static void __init spectre_v2_apply_mitigation(void);
-static void __init retbleed_select_mitigation(void);
-static void __init retbleed_update_mitigation(void);
-static void __init retbleed_apply_mitigation(void);
-static void __init spectre_v2_user_select_mitigation(void);
-static void __init spectre_v2_user_update_mitigation(void);
-static void __init spectre_v2_user_apply_mitigation(void);
-static void __init ssb_select_mitigation(void);
-static void __init ssb_apply_mitigation(void);
-static void __init l1tf_select_mitigation(void);
-static void __init l1tf_apply_mitigation(void);
-static void __init mds_select_mitigation(void);
-static void __init mds_update_mitigation(void);
-static void __init mds_apply_mitigation(void);
-static void __init taa_select_mitigation(void);
-static void __init taa_update_mitigation(void);
-static void __init taa_apply_mitigation(void);
-static void __init mmio_select_mitigation(void);
-static void __init mmio_update_mitigation(void);
-static void __init mmio_apply_mitigation(void);
-static void __init rfds_select_mitigation(void);
-static void __init rfds_update_mitigation(void);
-static void __init rfds_apply_mitigation(void);
-static void __init srbds_select_mitigation(void);
-static void __init srbds_apply_mitigation(void);
-static void __init l1d_flush_select_mitigation(void);
-static void __init srso_select_mitigation(void);
-static void __init srso_update_mitigation(void);
-static void __init srso_apply_mitigation(void);
-static void __init gds_select_mitigation(void);
-static void __init gds_apply_mitigation(void);
-static void __init bhi_select_mitigation(void);
-static void __init bhi_update_mitigation(void);
-static void __init bhi_apply_mitigation(void);
-static void __init its_select_mitigation(void);
-static void __init its_update_mitigation(void);
-static void __init its_apply_mitigation(void);
-static void __init tsa_select_mitigation(void);
-static void __init tsa_apply_mitigation(void);
-static void __init vmscape_select_mitigation(void);
-static void __init vmscape_update_mitigation(void);
-static void __init vmscape_apply_mitigation(void);
-
/* The base value of the SPEC_CTRL MSR without task-specific bits set */
u64 x86_spec_ctrl_base;
-EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
/* The current value of the SPEC_CTRL MSR with task-specific bits set */
DEFINE_PER_CPU(u64, x86_spec_ctrl_current);
@@ -179,7 +132,7 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
/* Control IBPB on vCPU load */
DEFINE_STATIC_KEY_FALSE(switch_vcpu_ibpb);
-EXPORT_SYMBOL_GPL(switch_vcpu_ibpb);
+EXPORT_SYMBOL_FOR_KVM(switch_vcpu_ibpb);
/* Control CPU buffer clear before idling (halt, mwait) */
DEFINE_STATIC_KEY_FALSE(cpu_buf_idle_clear);
@@ -198,7 +151,7 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
* mitigation is required.
*/
DEFINE_STATIC_KEY_FALSE(cpu_buf_vm_clear);
-EXPORT_SYMBOL_GPL(cpu_buf_vm_clear);
+EXPORT_SYMBOL_FOR_KVM(cpu_buf_vm_clear);
#undef pr_fmt
#define pr_fmt(fmt) "mitigations: " fmt
@@ -233,99 +186,6 @@ static void __init cpu_print_attack_vectors(void)
}
}
-void __init cpu_select_mitigations(void)
-{
- /*
- * Read the SPEC_CTRL MSR to account for reserved bits which may
- * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD
- * init code as it is not enumerated and depends on the family.
- */
- if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) {
- rdmsrq(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
-
- /*
- * Previously running kernel (kexec), may have some controls
- * turned ON. Clear them and let the mitigations setup below
- * rediscover them based on configuration.
- */
- x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK;
- }
-
- x86_arch_cap_msr = x86_read_arch_cap_msr();
-
- cpu_print_attack_vectors();
-
- /* Select the proper CPU mitigations before patching alternatives: */
- spectre_v1_select_mitigation();
- spectre_v2_select_mitigation();
- retbleed_select_mitigation();
- spectre_v2_user_select_mitigation();
- ssb_select_mitigation();
- l1tf_select_mitigation();
- mds_select_mitigation();
- taa_select_mitigation();
- mmio_select_mitigation();
- rfds_select_mitigation();
- srbds_select_mitigation();
- l1d_flush_select_mitigation();
- srso_select_mitigation();
- gds_select_mitigation();
- its_select_mitigation();
- bhi_select_mitigation();
- tsa_select_mitigation();
- vmscape_select_mitigation();
-
- /*
- * After mitigations are selected, some may need to update their
- * choices.
- */
- spectre_v2_update_mitigation();
- /*
- * retbleed_update_mitigation() relies on the state set by
- * spectre_v2_update_mitigation(); specifically it wants to know about
- * spectre_v2=ibrs.
- */
- retbleed_update_mitigation();
- /*
- * its_update_mitigation() depends on spectre_v2_update_mitigation()
- * and retbleed_update_mitigation().
- */
- its_update_mitigation();
-
- /*
- * spectre_v2_user_update_mitigation() depends on
- * retbleed_update_mitigation(), specifically the STIBP
- * selection is forced for UNRET or IBPB.
- */
- spectre_v2_user_update_mitigation();
- mds_update_mitigation();
- taa_update_mitigation();
- mmio_update_mitigation();
- rfds_update_mitigation();
- bhi_update_mitigation();
- /* srso_update_mitigation() depends on retbleed_update_mitigation(). */
- srso_update_mitigation();
- vmscape_update_mitigation();
-
- spectre_v1_apply_mitigation();
- spectre_v2_apply_mitigation();
- retbleed_apply_mitigation();
- spectre_v2_user_apply_mitigation();
- ssb_apply_mitigation();
- l1tf_apply_mitigation();
- mds_apply_mitigation();
- taa_apply_mitigation();
- mmio_apply_mitigation();
- rfds_apply_mitigation();
- srbds_apply_mitigation();
- srso_apply_mitigation();
- gds_apply_mitigation();
- its_apply_mitigation();
- bhi_apply_mitigation();
- tsa_apply_mitigation();
- vmscape_apply_mitigation();
-}
-
/*
* NOTE: This function is *only* called for SVM, since Intel uses
* MSR_IA32_SPEC_CTRL for SSBD.
@@ -366,7 +226,7 @@ x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool setguest)
speculation_ctrl_update(tif);
}
}
-EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl);
+EXPORT_SYMBOL_FOR_KVM(x86_virt_spec_ctrl);
static void x86_amd_ssb_disable(void)
{
@@ -1032,7 +892,7 @@ bool gds_ucode_mitigated(void)
return (gds_mitigation == GDS_MITIGATION_FULL ||
gds_mitigation == GDS_MITIGATION_FULL_LOCKED);
}
-EXPORT_SYMBOL_GPL(gds_ucode_mitigated);
+EXPORT_SYMBOL_FOR_KVM(gds_ucode_mitigated);
void update_gds_msr(void)
{
@@ -2859,7 +2719,7 @@ void x86_spec_ctrl_setup_ap(void)
}
bool itlb_multihit_kvm_mitigation;
-EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
+EXPORT_SYMBOL_FOR_KVM(itlb_multihit_kvm_mitigation);
#undef pr_fmt
#define pr_fmt(fmt) "L1TF: " fmt
@@ -2867,11 +2727,9 @@ EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
/* Default mitigation for L1TF-affected CPUs */
enum l1tf_mitigations l1tf_mitigation __ro_after_init =
IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_AUTO : L1TF_MITIGATION_OFF;
-#if IS_ENABLED(CONFIG_KVM_INTEL)
-EXPORT_SYMBOL_GPL(l1tf_mitigation);
-#endif
+EXPORT_SYMBOL_FOR_KVM(l1tf_mitigation);
enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
-EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation);
+EXPORT_SYMBOL_FOR_KVM(l1tf_vmx_mitigation);
/*
* These CPUs all support 44bits physical address space internally in the
@@ -3371,6 +3229,99 @@ void cpu_bugs_smt_update(void)
mutex_unlock(&spec_ctrl_mutex);
}
+void __init cpu_select_mitigations(void)
+{
+ /*
+ * Read the SPEC_CTRL MSR to account for reserved bits which may
+ * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD
+ * init code as it is not enumerated and depends on the family.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) {
+ rdmsrq(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+
+ /*
+ * Previously running kernel (kexec), may have some controls
+ * turned ON. Clear them and let the mitigations setup below
+ * rediscover them based on configuration.
+ */
+ x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK;
+ }
+
+ x86_arch_cap_msr = x86_read_arch_cap_msr();
+
+ cpu_print_attack_vectors();
+
+ /* Select the proper CPU mitigations before patching alternatives: */
+ spectre_v1_select_mitigation();
+ spectre_v2_select_mitigation();
+ retbleed_select_mitigation();
+ spectre_v2_user_select_mitigation();
+ ssb_select_mitigation();
+ l1tf_select_mitigation();
+ mds_select_mitigation();
+ taa_select_mitigation();
+ mmio_select_mitigation();
+ rfds_select_mitigation();
+ srbds_select_mitigation();
+ l1d_flush_select_mitigation();
+ srso_select_mitigation();
+ gds_select_mitigation();
+ its_select_mitigation();
+ bhi_select_mitigation();
+ tsa_select_mitigation();
+ vmscape_select_mitigation();
+
+ /*
+ * After mitigations are selected, some may need to update their
+ * choices.
+ */
+ spectre_v2_update_mitigation();
+ /*
+ * retbleed_update_mitigation() relies on the state set by
+ * spectre_v2_update_mitigation(); specifically it wants to know about
+ * spectre_v2=ibrs.
+ */
+ retbleed_update_mitigation();
+ /*
+ * its_update_mitigation() depends on spectre_v2_update_mitigation()
+ * and retbleed_update_mitigation().
+ */
+ its_update_mitigation();
+
+ /*
+ * spectre_v2_user_update_mitigation() depends on
+ * retbleed_update_mitigation(), specifically the STIBP
+ * selection is forced for UNRET or IBPB.
+ */
+ spectre_v2_user_update_mitigation();
+ mds_update_mitigation();
+ taa_update_mitigation();
+ mmio_update_mitigation();
+ rfds_update_mitigation();
+ bhi_update_mitigation();
+ /* srso_update_mitigation() depends on retbleed_update_mitigation(). */
+ srso_update_mitigation();
+ vmscape_update_mitigation();
+
+ spectre_v1_apply_mitigation();
+ spectre_v2_apply_mitigation();
+ retbleed_apply_mitigation();
+ spectre_v2_user_apply_mitigation();
+ ssb_apply_mitigation();
+ l1tf_apply_mitigation();
+ mds_apply_mitigation();
+ taa_apply_mitigation();
+ mmio_apply_mitigation();
+ rfds_apply_mitigation();
+ srbds_apply_mitigation();
+ srso_apply_mitigation();
+ gds_apply_mitigation();
+ its_apply_mitigation();
+ bhi_apply_mitigation();
+ tsa_apply_mitigation();
+ vmscape_apply_mitigation();
+}
+
#ifdef CONFIG_SYSFS
#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion"
diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c
index 981f8b1f0792..dbc99a47be45 100644
--- a/arch/x86/kernel/cpu/bus_lock.c
+++ b/arch/x86/kernel/cpu/bus_lock.c
@@ -6,6 +6,7 @@
#include <linux/workqueue.h>
#include <linux/delay.h>
#include <linux/cpuhotplug.h>
+#include <linux/kvm_types.h>
#include <asm/cpu_device_id.h>
#include <asm/cmdline.h>
#include <asm/traps.h>
@@ -289,7 +290,7 @@ bool handle_guest_split_lock(unsigned long ip)
force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
return false;
}
-EXPORT_SYMBOL_GPL(handle_guest_split_lock);
+EXPORT_SYMBOL_FOR_KVM(handle_guest_split_lock);
void bus_lock_init(void)
{
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c7d3512914ca..e7ab22fce3b5 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -7,6 +7,7 @@
#include <linux/bitops.h>
#include <linux/kernel.h>
#include <linux/export.h>
+#include <linux/kvm_types.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/ctype.h>
@@ -78,6 +79,10 @@
DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
EXPORT_PER_CPU_SYMBOL(cpu_info);
+/* Used for modules: built-in code uses runtime constants */
+unsigned long USER_PTR_MAX;
+EXPORT_SYMBOL(USER_PTR_MAX);
+
u32 elf_hwcap2 __read_mostly;
/* Number of siblings per CPU package */
@@ -401,6 +406,28 @@ out:
cr4_clear_bits(X86_CR4_UMIP);
}
+static __always_inline void setup_lass(struct cpuinfo_x86 *c)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_LASS))
+ return;
+
+ /*
+ * Legacy vsyscall page access causes a #GP when LASS is active.
+ * Disable LASS because the #GP handler doesn't support vsyscall
+ * emulation.
+ *
+ * Also disable LASS when running under EFI, as some runtime and
+ * boot services rely on 1:1 mappings in the lower half.
+ */
+ if (IS_ENABLED(CONFIG_X86_VSYSCALL_EMULATION) ||
+ IS_ENABLED(CONFIG_EFI)) {
+ setup_clear_cpu_cap(X86_FEATURE_LASS);
+ return;
+ }
+
+ cr4_set_bits(X86_CR4_LASS);
+}
+
/* These bits should not change their value after CPU init is finished. */
static const unsigned long cr4_pinned_mask = X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP |
X86_CR4_FSGSBASE | X86_CR4_CET | X86_CR4_FRED;
@@ -460,14 +487,14 @@ void cr4_update_irqsoff(unsigned long set, unsigned long clear)
__write_cr4(newval);
}
}
-EXPORT_SYMBOL(cr4_update_irqsoff);
+EXPORT_SYMBOL_FOR_KVM(cr4_update_irqsoff);
/* Read the CR4 shadow. */
unsigned long cr4_read_shadow(void)
{
return this_cpu_read(cpu_tlbstate.cr4);
}
-EXPORT_SYMBOL_GPL(cr4_read_shadow);
+EXPORT_SYMBOL_FOR_KVM(cr4_read_shadow);
void cr4_init(void)
{
@@ -722,7 +749,7 @@ void load_direct_gdt(int cpu)
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);
}
-EXPORT_SYMBOL_GPL(load_direct_gdt);
+EXPORT_SYMBOL_FOR_KVM(load_direct_gdt);
/* Load a fixmap remapping of the per-cpu GDT */
void load_fixmap_gdt(int cpu)
@@ -1021,12 +1048,8 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[CPUID_8000_0001_EDX] = edx;
}
- if (c->extended_cpuid_level >= 0x80000007) {
- cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
-
- c->x86_capability[CPUID_8000_0007_EBX] = ebx;
- c->x86_power = edx;
- }
+ if (c->extended_cpuid_level >= 0x80000007)
+ c->x86_power = cpuid_edx(0x80000007);
if (c->extended_cpuid_level >= 0x80000008) {
cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
@@ -2011,10 +2034,10 @@ static void identify_cpu(struct cpuinfo_x86 *c)
/* Disable the PN if appropriate */
squash_the_stupid_serial_number(c);
- /* Set up SMEP/SMAP/UMIP */
setup_smep(c);
setup_smap(c);
setup_umip(c);
+ setup_lass(c);
/* Enable FSGSBASE instructions if available. */
if (cpu_has(c, X86_FEATURE_FSGSBASE)) {
@@ -2579,7 +2602,7 @@ void __init arch_cpu_finalize_init(void)
alternative_instructions();
if (IS_ENABLED(CONFIG_X86_64)) {
- unsigned long USER_PTR_MAX = TASK_SIZE_MAX;
+ USER_PTR_MAX = TASK_SIZE_MAX;
/*
* Enable this when LAM is gated on LASS support
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index bc38b2d56f26..5c7a3a71191a 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -42,15 +42,6 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
*const __x86_cpu_dev_end[];
#ifdef CONFIG_CPU_SUP_INTEL
-enum tsx_ctrl_states {
- TSX_CTRL_ENABLE,
- TSX_CTRL_DISABLE,
- TSX_CTRL_RTM_ALWAYS_ABORT,
- TSX_CTRL_NOT_SUPPORTED,
-};
-
-extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state;
-
extern void __init tsx_init(void);
void tsx_ap_init(void);
void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index 46efcbd6afa4..146f6f8b0650 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -72,6 +72,7 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC },
{ X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_TOTAL },
{ X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_LOCAL },
+ { X86_FEATURE_SDCIAE, X86_FEATURE_CAT_L3 },
{ X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL },
{ X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW },
{ X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES },
@@ -79,6 +80,7 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_SGX_LC, X86_FEATURE_SGX },
{ X86_FEATURE_SGX1, X86_FEATURE_SGX },
{ X86_FEATURE_SGX2, X86_FEATURE_SGX1 },
+ { X86_FEATURE_SGX_EUPDATESVN, X86_FEATURE_SGX1 },
{ X86_FEATURE_SGX_EDECCSSA, X86_FEATURE_SGX1 },
{ X86_FEATURE_XFD, X86_FEATURE_XSAVES },
{ X86_FEATURE_XFD, X86_FEATURE_XGETBV1 },
@@ -89,6 +91,7 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_SHSTK, X86_FEATURE_XSAVES },
{ X86_FEATURE_FRED, X86_FEATURE_LKGS },
{ X86_FEATURE_SPEC_CTRL_SSBD, X86_FEATURE_SPEC_CTRL },
+ { X86_FEATURE_LASS, X86_FEATURE_SMAP },
{}
};
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index d6906442f49b..3f1dda355307 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -43,9 +43,6 @@
/* Deferred error settings */
#define MSR_CU_DEF_ERR 0xC0000410
#define MASK_DEF_LVTOFF 0x000000F0
-#define MASK_DEF_INT_TYPE 0x00000006
-#define DEF_LVT_OFF 0x2
-#define DEF_INT_TYPE_APIC 0x2
/* Scalable MCA: */
@@ -54,6 +51,17 @@
static bool thresholding_irq_en;
+struct mce_amd_cpu_data {
+ mce_banks_t thr_intr_banks;
+ mce_banks_t dfr_intr_banks;
+
+ u32 thr_intr_en: 1,
+ dfr_intr_en: 1,
+ __resv: 30;
+};
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct mce_amd_cpu_data, mce_amd_data);
+
static const char * const th_names[] = {
"load_store",
"insn_fetch",
@@ -79,6 +87,8 @@ struct smca_bank {
const struct smca_hwid *hwid;
u32 id; /* Value of MCA_IPID[InstanceId]. */
u8 sysfs_id; /* Value used for sysfs name. */
+ u64 paddrv :1, /* Physical Address Valid bit in MCA_CONFIG */
+ __reserved :63;
};
static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks);
@@ -264,6 +274,7 @@ void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
static void smca_configure(unsigned int bank, unsigned int cpu)
{
+ struct mce_amd_cpu_data *data = this_cpu_ptr(&mce_amd_data);
u8 *bank_counts = this_cpu_ptr(smca_bank_counts);
const struct smca_hwid *s_hwid;
unsigned int i, hwid_mcatype;
@@ -294,11 +305,33 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
* APIC based interrupt. First, check that no interrupt has been
* set.
*/
- if ((low & BIT(5)) && !((high >> 5) & 0x3))
+ if ((low & BIT(5)) && !((high >> 5) & 0x3) && data->dfr_intr_en) {
+ __set_bit(bank, data->dfr_intr_banks);
high |= BIT(5);
+ }
+
+ /*
+ * SMCA Corrected Error Interrupt
+ *
+ * MCA_CONFIG[IntPresent] is bit 10, and tells us if the bank can
+ * send an MCA Thresholding interrupt without the OS initializing
+ * this feature. This can be used if the threshold limit is managed
+ * by the platform.
+ *
+ * MCA_CONFIG[IntEn] is bit 40 (8 in the high portion of the MSR).
+ * The OS should set this to inform the platform that the OS is ready
+ * to handle the MCA Thresholding interrupt.
+ */
+ if ((low & BIT(10)) && data->thr_intr_en) {
+ __set_bit(bank, data->thr_intr_banks);
+ high |= BIT(8);
+ }
this_cpu_ptr(mce_banks_array)[bank].lsb_in_status = !!(low & BIT(8));
+ if (low & MCI_CONFIG_PADDRV)
+ this_cpu_ptr(smca_banks)[bank].paddrv = 1;
+
wrmsr(smca_config, low, high);
}
@@ -368,6 +401,14 @@ static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
{
int msr = (hi & MASK_LVTOFF_HI) >> 20;
+ /*
+ * On SMCA CPUs, LVT offset is programmed at a different MSR, and
+ * the BIOS provides the value. The original field where LVT offset
+ * was set is reserved. Return early here:
+ */
+ if (mce_flags.smca)
+ return false;
+
if (apic < 0) {
pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
"for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
@@ -376,14 +417,6 @@ static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
}
if (apic != msr) {
- /*
- * On SMCA CPUs, LVT offset is programmed at a different MSR, and
- * the BIOS provides the value. The original field where LVT offset
- * was set is reserved. Return early here:
- */
- if (mce_flags.smca)
- return false;
-
pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
"for bank %d, block %d (MSR%08X=0x%x%08x)\n",
b->cpu, apic, b->bank, b->block, b->address, hi, lo);
@@ -443,6 +476,36 @@ static void threshold_restart_block(void *_tr)
wrmsr(tr->b->address, lo, hi);
}
+static void threshold_restart_bank(unsigned int bank, bool intr_en)
+{
+ struct threshold_bank **thr_banks = this_cpu_read(threshold_banks);
+ struct threshold_block *block, *tmp;
+ struct thresh_restart tr;
+
+ if (!thr_banks || !thr_banks[bank])
+ return;
+
+ memset(&tr, 0, sizeof(tr));
+
+ list_for_each_entry_safe(block, tmp, &thr_banks[bank]->miscj, miscj) {
+ tr.b = block;
+ tr.b->interrupt_enable = intr_en;
+ threshold_restart_block(&tr);
+ }
+}
+
+/* Try to use the threshold limit reported through APEI. */
+static u16 get_thr_limit(void)
+{
+ u32 thr_limit = mce_get_apei_thr_limit();
+
+ /* Fallback to old default if APEI limit is not available. */
+ if (!thr_limit)
+ return THRESHOLD_MAX;
+
+ return min(thr_limit, THRESHOLD_MAX);
+}
+
static void mce_threshold_block_init(struct threshold_block *b, int offset)
{
struct thresh_restart tr = {
@@ -451,7 +514,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset)
.lvt_off = offset,
};
- b->threshold_limit = THRESHOLD_MAX;
+ b->threshold_limit = get_thr_limit();
threshold_restart_block(&tr);
};
@@ -464,41 +527,6 @@ static int setup_APIC_mce_threshold(int reserved, int new)
return reserved;
}
-static int setup_APIC_deferred_error(int reserved, int new)
-{
- if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR,
- APIC_EILVT_MSG_FIX, 0))
- return new;
-
- return reserved;
-}
-
-static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
-{
- u32 low = 0, high = 0;
- int def_offset = -1, def_new;
-
- if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high))
- return;
-
- def_new = (low & MASK_DEF_LVTOFF) >> 4;
- if (!(low & MASK_DEF_LVTOFF)) {
- pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n");
- def_new = DEF_LVT_OFF;
- low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4);
- }
-
- def_offset = setup_APIC_deferred_error(def_offset, def_new);
- if ((def_offset == def_new) &&
- (deferred_error_int_vector != amd_deferred_error_interrupt))
- deferred_error_int_vector = amd_deferred_error_interrupt;
-
- if (!mce_flags.smca)
- low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;
-
- wrmsr(MSR_CU_DEF_ERR, low, high);
-}
-
static u32 get_block_address(u32 current_addr, u32 low, u32 high,
unsigned int bank, unsigned int block,
unsigned int cpu)
@@ -534,12 +562,10 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high,
return addr;
}
-static int
-prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
- int offset, u32 misc_high)
+static int prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
+ int offset, u32 misc_high)
{
unsigned int cpu = smp_processor_id();
- u32 smca_low, smca_high;
struct threshold_block b;
int new;
@@ -556,20 +582,13 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
if (!b.interrupt_capable)
goto done;
+ __set_bit(bank, this_cpu_ptr(&mce_amd_data)->thr_intr_banks);
b.interrupt_enable = 1;
- if (!mce_flags.smca) {
- new = (misc_high & MASK_LVTOFF_HI) >> 20;
- goto set_offset;
- }
-
- /* Gather LVT offset for thresholding: */
- if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high))
- goto out;
-
- new = (smca_low & SMCA_THR_LVT_OFF) >> 12;
+ if (mce_flags.smca)
+ goto done;
-set_offset:
+ new = (misc_high & MASK_LVTOFF_HI) >> 20;
offset = setup_APIC_mce_threshold(offset, new);
if (offset == new)
thresholding_irq_en = true;
@@ -577,7 +596,6 @@ set_offset:
done:
mce_threshold_block_init(&b, offset);
-out:
return offset;
}
@@ -668,6 +686,32 @@ static void amd_apply_cpu_quirks(struct cpuinfo_x86 *c)
mce_banks[0].ctl = 0;
}
+/*
+ * Enable the APIC LVT interrupt vectors once per-CPU. This should be done before hardware is
+ * ready to send interrupts.
+ *
+ * Individual error sources are enabled later during per-bank init.
+ */
+static void smca_enable_interrupt_vectors(void)
+{
+ struct mce_amd_cpu_data *data = this_cpu_ptr(&mce_amd_data);
+ u64 mca_intr_cfg, offset;
+
+ if (!mce_flags.smca || !mce_flags.succor)
+ return;
+
+ if (rdmsrq_safe(MSR_CU_DEF_ERR, &mca_intr_cfg))
+ return;
+
+ offset = (mca_intr_cfg & SMCA_THR_LVT_OFF) >> 12;
+ if (!setup_APIC_eilvt(offset, THRESHOLD_APIC_VECTOR, APIC_EILVT_MSG_FIX, 0))
+ data->thr_intr_en = 1;
+
+ offset = (mca_intr_cfg & MASK_DEF_LVTOFF) >> 4;
+ if (!setup_APIC_eilvt(offset, DEFERRED_ERROR_VECTOR, APIC_EILVT_MSG_FIX, 0))
+ data->dfr_intr_en = 1;
+}
+
/* cpu init entry point, called from mce.c with preempt off */
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
@@ -679,10 +723,16 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
mce_flags.amd_threshold = 1;
+ smca_enable_interrupt_vectors();
+
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
- if (mce_flags.smca)
+ if (mce_flags.smca) {
smca_configure(bank, cpu);
+ if (!this_cpu_ptr(&mce_amd_data)->thr_intr_en)
+ continue;
+ }
+
disable_err_thresholding(c, bank);
for (block = 0; block < NR_BLOCKS; ++block) {
@@ -703,9 +753,6 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
offset = prepare_threshold_block(bank, block, address, offset, high);
}
}
-
- if (mce_flags.succor)
- deferred_error_interrupt_enable(c);
}
void smca_bsp_init(void)
@@ -748,9 +795,9 @@ bool amd_mce_is_memory_error(struct mce *m)
}
/*
- * AMD systems do not have an explicit indicator that the value in MCA_ADDR is
- * a system physical address. Therefore, individual cases need to be detected.
- * Future cases and checks will be added as needed.
+ * Some AMD systems have an explicit indicator that the value in MCA_ADDR is a
+ * system physical address. Individual cases though, need to be detected for
+ * other systems. Future cases will be added as needed.
*
* 1) General case
* a) Assume address is not usable.
@@ -764,6 +811,8 @@ bool amd_mce_is_memory_error(struct mce *m)
* a) Reported in legacy bank 4 with extended error code (XEC) 8.
* b) MCA_STATUS[43] is *not* defined as poison in legacy bank 4. Therefore,
* this bit should not be checked.
+ * 4) MCI_STATUS_PADDRVAL is set
+ * a) Will provide a valid system physical address.
*
* NOTE: SMCA UMC memory errors fall into case #1.
*/
@@ -777,6 +826,9 @@ bool amd_mce_usable_address(struct mce *m)
return false;
}
+ if (this_cpu_ptr(smca_banks)[m->bank].paddrv)
+ return m->status & MCI_STATUS_PADDRV;
+
/* Check poison bit for all other bank types. */
if (m->status & MCI_STATUS_POISON)
return true;
@@ -785,37 +837,6 @@ bool amd_mce_usable_address(struct mce *m)
return false;
}
-static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
-{
- struct mce_hw_err err;
- struct mce *m = &err.m;
-
- mce_prep_record(&err);
-
- m->status = status;
- m->misc = misc;
- m->bank = bank;
- m->tsc = rdtsc();
-
- if (m->status & MCI_STATUS_ADDRV) {
- m->addr = addr;
-
- smca_extract_err_addr(m);
- }
-
- if (mce_flags.smca) {
- rdmsrq(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid);
-
- if (m->status & MCI_STATUS_SYNDV) {
- rdmsrq(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd);
- rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1);
- rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2);
- }
- }
-
- mce_log(&err);
-}
-
DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
{
trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
@@ -825,103 +846,20 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
apic_eoi();
}
-/*
- * Returns true if the logged error is deferred. False, otherwise.
- */
-static inline bool
-_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
-{
- u64 status, addr = 0;
-
- rdmsrq(msr_stat, status);
- if (!(status & MCI_STATUS_VAL))
- return false;
-
- if (status & MCI_STATUS_ADDRV)
- rdmsrq(msr_addr, addr);
-
- __log_error(bank, status, addr, misc);
-
- wrmsrq(msr_stat, 0);
-
- return status & MCI_STATUS_DEFERRED;
-}
-
-static bool _log_error_deferred(unsigned int bank, u32 misc)
-{
- if (!_log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
- mca_msr_reg(bank, MCA_ADDR), misc))
- return false;
-
- /*
- * Non-SMCA systems don't have MCA_DESTAT/MCA_DEADDR registers.
- * Return true here to avoid accessing these registers.
- */
- if (!mce_flags.smca)
- return true;
-
- /* Clear MCA_DESTAT if the deferred error was logged from MCA_STATUS. */
- wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
- return true;
-}
-
-/*
- * We have three scenarios for checking for Deferred errors:
- *
- * 1) Non-SMCA systems check MCA_STATUS and log error if found.
- * 2) SMCA systems check MCA_STATUS. If error is found then log it and also
- * clear MCA_DESTAT.
- * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and
- * log it.
- */
-static void log_error_deferred(unsigned int bank)
-{
- if (_log_error_deferred(bank, 0))
- return;
-
- /*
- * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check
- * for a valid error.
- */
- _log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank),
- MSR_AMD64_SMCA_MCx_DEADDR(bank), 0);
-}
-
/* APIC interrupt handler for deferred errors */
static void amd_deferred_error_interrupt(void)
{
- unsigned int bank;
-
- for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank)
- log_error_deferred(bank);
+ machine_check_poll(MCP_TIMESTAMP, &this_cpu_ptr(&mce_amd_data)->dfr_intr_banks);
}
-static void log_error_thresholding(unsigned int bank, u64 misc)
+void mce_amd_handle_storm(unsigned int bank, bool on)
{
- _log_error_deferred(bank, misc);
+ threshold_restart_bank(bank, on);
}
-static void log_and_reset_block(struct threshold_block *block)
+static void amd_reset_thr_limit(unsigned int bank)
{
- struct thresh_restart tr;
- u32 low = 0, high = 0;
-
- if (!block)
- return;
-
- if (rdmsr_safe(block->address, &low, &high))
- return;
-
- if (!(high & MASK_OVERFLOW_HI))
- return;
-
- /* Log the MCE which caused the threshold event. */
- log_error_thresholding(block->bank, ((u64)high << 32) | low);
-
- /* Reset threshold block after logging error. */
- memset(&tr, 0, sizeof(tr));
- tr.b = block;
- threshold_restart_block(&tr);
+ threshold_restart_bank(bank, true);
}
/*
@@ -930,33 +868,21 @@ static void log_and_reset_block(struct threshold_block *block)
*/
static void amd_threshold_interrupt(void)
{
- struct threshold_bank **bp = this_cpu_read(threshold_banks), *thr_bank;
- unsigned int bank, cpu = smp_processor_id();
- struct threshold_block *block, *tmp;
-
- /*
- * Validate that the threshold bank has been initialized already. The
- * handler is installed at boot time, but on a hotplug event the
- * interrupt might fire before the data has been initialized.
- */
- if (!bp)
- return;
-
- for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
- if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank)))
- continue;
-
- thr_bank = bp[bank];
- if (!thr_bank)
- continue;
-
- list_for_each_entry_safe(block, tmp, &thr_bank->miscj, miscj)
- log_and_reset_block(block);
- }
+ machine_check_poll(MCP_TIMESTAMP, &this_cpu_ptr(&mce_amd_data)->thr_intr_banks);
}
void amd_clear_bank(struct mce *m)
{
+ amd_reset_thr_limit(m->bank);
+
+ /* Clear MCA_DESTAT for all deferred errors even those logged in MCA_STATUS. */
+ if (m->status & MCI_STATUS_DEFERRED)
+ mce_wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank), 0);
+
+ /* Don't clear MCA_STATUS if MCA_DESTAT was used exclusively. */
+ if (m->kflags & MCE_CHECK_DFR_REGS)
+ return;
+
mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0);
}
@@ -1172,7 +1098,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb
b->address = address;
b->interrupt_enable = 0;
b->interrupt_capable = lvt_interrupt_supported(bank, high);
- b->threshold_limit = THRESHOLD_MAX;
+ b->threshold_limit = get_thr_limit();
if (b->interrupt_capable) {
default_attrs[2] = &interrupt_enable.attr;
@@ -1183,6 +1109,8 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb
list_add(&b->miscj, &tb->miscj);
+ mce_threshold_block_init(b, (high & MASK_LVTOFF_HI) >> 20);
+
err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b));
if (err)
goto out_free;
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 460e90a1a0b1..4aff14e04287 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -687,7 +687,10 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
m->misc = mce_rdmsrq(mca_msr_reg(i, MCA_MISC));
if (m->status & MCI_STATUS_ADDRV) {
- m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR));
+ if (m->kflags & MCE_CHECK_DFR_REGS)
+ m->addr = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DEADDR(i));
+ else
+ m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR));
/*
* Mask the reported address by the reported granularity.
@@ -715,6 +718,29 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
DEFINE_PER_CPU(unsigned, mce_poll_count);
/*
+ * We have three scenarios for checking for Deferred errors:
+ *
+ * 1) Non-SMCA systems check MCA_STATUS and log error if found.
+ * 2) SMCA systems check MCA_STATUS. If error is found then log it and also
+ * clear MCA_DESTAT.
+ * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and
+ * log it.
+ */
+static bool smca_should_log_poll_error(struct mce *m)
+{
+ if (m->status & MCI_STATUS_VAL)
+ return true;
+
+ m->status = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank));
+ if ((m->status & MCI_STATUS_VAL) && (m->status & MCI_STATUS_DEFERRED)) {
+ m->kflags |= MCE_CHECK_DFR_REGS;
+ return true;
+ }
+
+ return false;
+}
+
+/*
* Newer Intel systems that support software error
* recovery need to make additional checks. Other
* CPUs should skip over uncorrected errors, but log
@@ -740,6 +766,9 @@ static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err)
{
struct mce *m = &err->m;
+ if (mce_flags.smca)
+ return smca_should_log_poll_error(m);
+
/* If this entry is not valid, ignore it. */
if (!(m->status & MCI_STATUS_VAL))
return false;
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index b0e00ec5cc8c..a31cf984619c 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -67,6 +67,7 @@ void mce_track_storm(struct mce *mce);
void mce_inherit_storm(unsigned int bank);
bool mce_get_storm_mode(void);
void mce_set_storm_mode(bool storm);
+u32 mce_get_apei_thr_limit(void);
#else
static inline void cmci_storm_begin(unsigned int bank) {}
static inline void cmci_storm_end(unsigned int bank) {}
@@ -74,6 +75,7 @@ static inline void mce_track_storm(struct mce *mce) {}
static inline void mce_inherit_storm(unsigned int bank) {}
static inline bool mce_get_storm_mode(void) { return false; }
static inline void mce_set_storm_mode(bool storm) {}
+static inline u32 mce_get_apei_thr_limit(void) { return 0; }
#endif
/*
@@ -267,6 +269,7 @@ void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m);
#ifdef CONFIG_X86_MCE_AMD
void mce_threshold_create_device(unsigned int cpu);
void mce_threshold_remove_device(unsigned int cpu);
+void mce_amd_handle_storm(unsigned int bank, bool on);
extern bool amd_filter_mce(struct mce *m);
bool amd_mce_usable_address(struct mce *m);
void amd_clear_bank(struct mce *m);
@@ -299,6 +302,7 @@ void smca_bsp_init(void);
#else
static inline void mce_threshold_create_device(unsigned int cpu) { }
static inline void mce_threshold_remove_device(unsigned int cpu) { }
+static inline void mce_amd_handle_storm(unsigned int bank, bool on) { }
static inline bool amd_filter_mce(struct mce *m) { return false; }
static inline bool amd_mce_usable_address(struct mce *m) { return false; }
static inline void amd_clear_bank(struct mce *m) { }
diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c
index f4a007616468..0d13c9ffcba0 100644
--- a/arch/x86/kernel/cpu/mce/threshold.c
+++ b/arch/x86/kernel/cpu/mce/threshold.c
@@ -13,6 +13,19 @@
#include "internal.h"
+static u32 mce_apei_thr_limit;
+
+void mce_save_apei_thr_limit(u32 thr_limit)
+{
+ mce_apei_thr_limit = thr_limit;
+ pr_info("HEST corrected error threshold limit: %u\n", thr_limit);
+}
+
+u32 mce_get_apei_thr_limit(void)
+{
+ return mce_apei_thr_limit;
+}
+
static void default_threshold_interrupt(void)
{
pr_err("Unexpected threshold interrupt at vector %x\n",
@@ -63,6 +76,9 @@ static void mce_handle_storm(unsigned int bank, bool on)
case X86_VENDOR_INTEL:
mce_intel_handle_storm(bank, on);
break;
+ case X86_VENDOR_AMD:
+ mce_amd_handle_storm(bank, on);
+ break;
}
}
@@ -85,7 +101,8 @@ void cmci_storm_end(unsigned int bank)
{
struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
- __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ if (!mce_flags.amd_threshold)
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
storm->banks[bank].history = 0;
storm->banks[bank].in_storm_mode = false;
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 28ed8c089024..3821a985f4ff 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -186,60 +186,92 @@ static u32 cpuid_to_ucode_rev(unsigned int val)
return p.ucode_rev;
}
+static u32 get_cutoff_revision(u32 rev)
+{
+ switch (rev >> 8) {
+ case 0x80012: return 0x8001277; break;
+ case 0x80082: return 0x800820f; break;
+ case 0x83010: return 0x830107c; break;
+ case 0x86001: return 0x860010e; break;
+ case 0x86081: return 0x8608108; break;
+ case 0x87010: return 0x8701034; break;
+ case 0x8a000: return 0x8a0000a; break;
+ case 0xa0010: return 0xa00107a; break;
+ case 0xa0011: return 0xa0011da; break;
+ case 0xa0012: return 0xa001243; break;
+ case 0xa0082: return 0xa00820e; break;
+ case 0xa1011: return 0xa101153; break;
+ case 0xa1012: return 0xa10124e; break;
+ case 0xa1081: return 0xa108109; break;
+ case 0xa2010: return 0xa20102f; break;
+ case 0xa2012: return 0xa201212; break;
+ case 0xa4041: return 0xa404109; break;
+ case 0xa5000: return 0xa500013; break;
+ case 0xa6012: return 0xa60120a; break;
+ case 0xa7041: return 0xa704109; break;
+ case 0xa7052: return 0xa705208; break;
+ case 0xa7080: return 0xa708009; break;
+ case 0xa70c0: return 0xa70C009; break;
+ case 0xaa001: return 0xaa00116; break;
+ case 0xaa002: return 0xaa00218; break;
+ case 0xb0021: return 0xb002146; break;
+ case 0xb0081: return 0xb008111; break;
+ case 0xb1010: return 0xb101046; break;
+ case 0xb2040: return 0xb204031; break;
+ case 0xb4040: return 0xb404031; break;
+ case 0xb4041: return 0xb404101; break;
+ case 0xb6000: return 0xb600031; break;
+ case 0xb6080: return 0xb608031; break;
+ case 0xb7000: return 0xb700031; break;
+ default: break;
+ }
+ return 0;
+}
+
static bool need_sha_check(u32 cur_rev)
{
+ u32 cutoff;
+
if (!cur_rev) {
cur_rev = cpuid_to_ucode_rev(bsp_cpuid_1_eax);
pr_info_once("No current revision, generating the lowest one: 0x%x\n", cur_rev);
}
- switch (cur_rev >> 8) {
- case 0x80012: return cur_rev <= 0x8001277; break;
- case 0x80082: return cur_rev <= 0x800820f; break;
- case 0x83010: return cur_rev <= 0x830107c; break;
- case 0x86001: return cur_rev <= 0x860010e; break;
- case 0x86081: return cur_rev <= 0x8608108; break;
- case 0x87010: return cur_rev <= 0x8701034; break;
- case 0x8a000: return cur_rev <= 0x8a0000a; break;
- case 0xa0010: return cur_rev <= 0xa00107a; break;
- case 0xa0011: return cur_rev <= 0xa0011da; break;
- case 0xa0012: return cur_rev <= 0xa001243; break;
- case 0xa0082: return cur_rev <= 0xa00820e; break;
- case 0xa1011: return cur_rev <= 0xa101153; break;
- case 0xa1012: return cur_rev <= 0xa10124e; break;
- case 0xa1081: return cur_rev <= 0xa108109; break;
- case 0xa2010: return cur_rev <= 0xa20102f; break;
- case 0xa2012: return cur_rev <= 0xa201212; break;
- case 0xa4041: return cur_rev <= 0xa404109; break;
- case 0xa5000: return cur_rev <= 0xa500013; break;
- case 0xa6012: return cur_rev <= 0xa60120a; break;
- case 0xa7041: return cur_rev <= 0xa704109; break;
- case 0xa7052: return cur_rev <= 0xa705208; break;
- case 0xa7080: return cur_rev <= 0xa708009; break;
- case 0xa70c0: return cur_rev <= 0xa70C009; break;
- case 0xaa001: return cur_rev <= 0xaa00116; break;
- case 0xaa002: return cur_rev <= 0xaa00218; break;
- case 0xb0021: return cur_rev <= 0xb002146; break;
- case 0xb1010: return cur_rev <= 0xb101046; break;
- case 0xb2040: return cur_rev <= 0xb204031; break;
- case 0xb4040: return cur_rev <= 0xb404031; break;
- case 0xb6000: return cur_rev <= 0xb600031; break;
- case 0xb7000: return cur_rev <= 0xb700031; break;
- default: break;
- }
+ cutoff = get_cutoff_revision(cur_rev);
+ if (cutoff)
+ return cur_rev <= cutoff;
pr_info("You should not be seeing this. Please send the following couple of lines to x86-<at>-kernel.org\n");
pr_info("CPUID(1).EAX: 0x%x, current revision: 0x%x\n", bsp_cpuid_1_eax, cur_rev);
return true;
}
+static bool cpu_has_entrysign(void)
+{
+ unsigned int fam = x86_family(bsp_cpuid_1_eax);
+ unsigned int model = x86_model(bsp_cpuid_1_eax);
+
+ if (fam == 0x17 || fam == 0x19)
+ return true;
+
+ if (fam == 0x1a) {
+ if (model <= 0x2f ||
+ (0x40 <= model && model <= 0x4f) ||
+ (0x60 <= model && model <= 0x6f))
+ return true;
+ }
+
+ return false;
+}
+
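For reference, cpu_has_entrysign() above keys off the packed family/model value in bsp_cpuid_1_eax. A minimal userspace sketch of that extraction (a simplified re-implementation of the kernel's x86_family()/x86_model() helpers; the CPUID value below is made up purely for illustration):

/* Userspace sketch of the family/model decode cpu_has_entrysign() relies on. */
#include <stdio.h>

static unsigned int x86_family(unsigned int eax)
{
	unsigned int fam = (eax >> 8) & 0xf;

	if (fam == 0xf)
		fam += (eax >> 20) & 0xff;	/* add extended family */
	return fam;
}

static unsigned int x86_model(unsigned int eax)
{
	unsigned int fam = x86_family(eax);
	unsigned int model = (eax >> 4) & 0xf;

	if (fam >= 0x6)
		model += ((eax >> 16) & 0xf) << 4;	/* add extended model */
	return model;
}

int main(void)
{
	unsigned int eax = 0x00b00f21;	/* hypothetical Zen5-class CPUID(1).EAX */

	/* family 0x1a, model 0x02 -> inside the 0x00-0x2f entrysign range above */
	printf("family 0x%x model 0x%x\n", x86_family(eax), x86_model(eax));
	return 0;
}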
static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsigned int len)
{
struct patch_digest *pd = NULL;
u8 digest[SHA256_DIGEST_SIZE];
int i;
- if (x86_family(bsp_cpuid_1_eax) < 0x17)
+ if (!cpu_has_entrysign())
return true;
if (!need_sha_check(cur_rev))
@@ -473,6 +505,7 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size)
{
u8 family = x86_family(bsp_cpuid_1_eax);
struct microcode_header_amd *mc_hdr;
+ u32 cur_rev, cutoff, patch_rev;
u32 sh_psize;
u16 proc_id;
u8 patch_fam;
@@ -512,11 +545,32 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size)
proc_id = mc_hdr->processor_rev_id;
patch_fam = 0xf + (proc_id >> 12);
- ucode_dbg("Patch-ID 0x%08x: family: 0x%x\n", mc_hdr->patch_id, patch_fam);
-
if (patch_fam != family)
return 1;
+ cur_rev = get_patch_level();
+
+ /* No cutoff revision means old/unaffected by signing algorithm weakness => matches */
+ cutoff = get_cutoff_revision(cur_rev);
+ if (!cutoff)
+ goto ok;
+
+ patch_rev = mc_hdr->patch_id;
+
+ ucode_dbg("cur_rev: 0x%x, cutoff: 0x%x, patch_rev: 0x%x\n",
+ cur_rev, cutoff, patch_rev);
+
+ if (cur_rev <= cutoff && patch_rev <= cutoff)
+ goto ok;
+
+ if (cur_rev > cutoff && patch_rev > cutoff)
+ goto ok;
+
+ return 1;
+
+ok:
+ ucode_dbg("Patch-ID 0x%08x: family: 0x%x\n", mc_hdr->patch_id, patch_fam);
+
return 0;
}
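To make the cutoff check above concrete: a patch is rejected only when the currently loaded revision and the candidate patch revision sit on opposite sides of the cutoff. A small userspace sketch of the same predicate, using the 0xa0011da cutoff from get_cutoff_revision() and made-up revision values:

/* Userspace sketch of the "same side of the cutoff" rule in verify_patch(). */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool revs_match(uint32_t cur_rev, uint32_t patch_rev, uint32_t cutoff)
{
	if (!cutoff)		/* no cutoff: family/model is unaffected */
		return true;
	return (cur_rev <= cutoff) == (patch_rev <= cutoff);
}

int main(void)
{
	uint32_t cutoff = 0xa0011da;	/* the 0xa0011 cutoff from get_cutoff_revision() */

	/* Made-up revisions: old base + old patch, new base + new patch, mixed. */
	printf("%d\n", revs_match(0xa0011c0, 0xa0011d5, cutoff));	/* 1: accepted */
	printf("%d\n", revs_match(0xa0011db, 0xa0011e2, cutoff));	/* 1: accepted */
	printf("%d\n", revs_match(0xa0011c0, 0xa0011e2, cutoff));	/* 0: rejected */
	return 0;
}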
@@ -585,8 +639,6 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE);
- ucode_dbg("patch_id: 0x%x\n", mc->hdr.patch_id);
-
if (mc_patch_matches(mc, eq_id)) {
desc->psize = patch_size;
desc->mc = mc;
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index f75c140906d0..ccc83b0bf63c 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -136,7 +136,7 @@ bool __init microcode_loader_disabled(void)
return dis_ucode_ldr;
}
-static void early_parse_cmdline(void)
+static void __init early_parse_cmdline(void)
{
char cmd_buf[64] = {};
char *s, *p = cmd_buf;
@@ -589,6 +589,17 @@ static int load_late_stop_cpus(bool is_safe)
pr_err("You should switch to early loading, if possible.\n");
}
+ /*
+ * Pre-load the microcode image into a staging device. This
+ * process is preemptible and does not require stopping CPUs.
+ * Successful staging simplifies the subsequent late-loading
+ * process, reducing rendezvous time.
+ *
+ * Even if the transfer fails, the update will proceed as usual.
+ */
+ if (microcode_ops->use_staging)
+ microcode_ops->stage_microcode();
+
atomic_set(&late_cpus_in, num_online_cpus());
atomic_set(&offline_in_nmi, 0);
loops_per_usec = loops_per_jiffy / (TICK_NSEC / 1000);
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 371ca6eac00e..8744f3adc2a0 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -13,12 +13,15 @@
#define pr_fmt(fmt) "microcode: " fmt
#include <linux/earlycpio.h>
#include <linux/firmware.h>
+#include <linux/pci_ids.h>
#include <linux/uaccess.h>
#include <linux/initrd.h>
#include <linux/kernel.h>
+#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/uio.h>
+#include <linux/io.h>
#include <linux/mm.h>
#include <asm/cpu_device_id.h>
@@ -33,6 +36,38 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
#define UCODE_BSP_LOADED ((struct microcode_intel *)0x1UL)
+/* Defines for the microcode staging mailbox interface */
+#define MBOX_REG_NUM 4
+#define MBOX_REG_SIZE sizeof(u32)
+
+#define MBOX_CONTROL_OFFSET 0x0
+#define MBOX_STATUS_OFFSET 0x4
+#define MBOX_WRDATA_OFFSET 0x8
+#define MBOX_RDDATA_OFFSET 0xc
+
+#define MASK_MBOX_CTRL_ABORT BIT(0)
+#define MASK_MBOX_CTRL_GO BIT(31)
+
+#define MASK_MBOX_STATUS_ERROR BIT(2)
+#define MASK_MBOX_STATUS_READY BIT(31)
+
+#define MASK_MBOX_RESP_SUCCESS BIT(0)
+#define MASK_MBOX_RESP_PROGRESS BIT(1)
+#define MASK_MBOX_RESP_ERROR BIT(2)
+
+#define MBOX_CMD_LOAD 0x3
+#define MBOX_OBJ_STAGING 0xb
+#define MBOX_HEADER(size) ((PCI_VENDOR_ID_INTEL) | \
+ (MBOX_OBJ_STAGING << 16) | \
+ ((u64)((size) / sizeof(u32)) << 32))
+
+/* The size of each mailbox header */
+#define MBOX_HEADER_SIZE sizeof(u64)
+/* The size of staging hardware response */
+#define MBOX_RESPONSE_SIZE sizeof(u64)
+
+#define MBOX_XACTION_TIMEOUT_MS (10 * MSEC_PER_SEC)
+
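MBOX_HEADER() above packs the Intel PCI vendor ID into bits 0-15, the staging object type into bits 16-31, and the payload length in dwords into bits 32-63. A userspace sketch (constants copied from the defines above, a 4 KiB page size assumed) printing the header for a page-sized request and the fixed response header used later in fetch_next_offset():

/* Userspace sketch of the MBOX_HEADER() packing defined above. */
#include <stdint.h>
#include <stdio.h>

#define PCI_VENDOR_ID_INTEL	0x8086
#define MBOX_OBJ_STAGING	0xb
#define MBOX_HEADER(size)	((PCI_VENDOR_ID_INTEL) | \
				 (MBOX_OBJ_STAGING << 16) | \
				 ((uint64_t)((size) / sizeof(uint32_t)) << 32))

int main(void)
{
	/* Request: two 8-byte headers plus one 4096-byte chunk = 0x404 dwords. */
	printf("request header:  0x%llx\n",
	       (unsigned long long)MBOX_HEADER(2 * 8 + 4096));	/* 0x404000b8086 */
	/* Response: 8-byte header plus 8-byte payload (offset + status) = 4 dwords. */
	printf("response header: 0x%llx\n",
	       (unsigned long long)MBOX_HEADER(8 + 8));		/* 0x4000b8086 */
	return 0;
}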
/* Current microcode patch used in early patching on the APs. */
static struct microcode_intel *ucode_patch_va __read_mostly;
static struct microcode_intel *ucode_patch_late __read_mostly;
@@ -54,6 +89,23 @@ struct extended_sigtable {
struct extended_signature sigs[];
};
+/**
+ * struct staging_state - Track the current staging process state
+ *
+ * @mmio_base: MMIO base address for staging
+ * @ucode_len: Total size of the microcode image
+ * @chunk_size: Size of each data piece
+ * @bytes_sent: Total bytes transmitted so far
+ * @offset: Current offset in the microcode image
+ */
+struct staging_state {
+ void __iomem *mmio_base;
+ unsigned int ucode_len;
+ unsigned int chunk_size;
+ unsigned int bytes_sent;
+ unsigned int offset;
+};
+
#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
@@ -299,6 +351,298 @@ static __init struct microcode_intel *scan_microcode(void *data, size_t size,
return size ? NULL : patch;
}
+static inline u32 read_mbox_dword(void __iomem *mmio_base)
+{
+ u32 dword = readl(mmio_base + MBOX_RDDATA_OFFSET);
+
+ /* Acknowledge read completion to the staging hardware */
+ writel(0, mmio_base + MBOX_RDDATA_OFFSET);
+ return dword;
+}
+
+static inline void write_mbox_dword(void __iomem *mmio_base, u32 dword)
+{
+ writel(dword, mmio_base + MBOX_WRDATA_OFFSET);
+}
+
+static inline u64 read_mbox_header(void __iomem *mmio_base)
+{
+ u32 high, low;
+
+ low = read_mbox_dword(mmio_base);
+ high = read_mbox_dword(mmio_base);
+
+ return ((u64)high << 32) | low;
+}
+
+static inline void write_mbox_header(void __iomem *mmio_base, u64 value)
+{
+ write_mbox_dword(mmio_base, value);
+ write_mbox_dword(mmio_base, value >> 32);
+}
+
+static void write_mbox_data(void __iomem *mmio_base, u32 *chunk, unsigned int chunk_bytes)
+{
+ int i;
+
+ /*
+ * The MMIO space is mapped as Uncached (UC). Each write arrives
+ * at the device as an individual transaction in program order.
+ * The device can then reassemble the sequence accordingly.
+ */
+ for (i = 0; i < chunk_bytes / sizeof(u32); i++)
+ write_mbox_dword(mmio_base, chunk[i]);
+}
+
+/*
+ * Prepare for a new microcode transfer: reset hardware and record the
+ * image size.
+ */
+static void init_stage(struct staging_state *ss)
+{
+ ss->ucode_len = get_totalsize(&ucode_patch_late->hdr);
+
+ /*
+ * Abort any ongoing process, effectively resetting the device.
+ * Unlike regular mailbox data processing requests, this
+ * operation does not require a status check.
+ */
+ writel(MASK_MBOX_CTRL_ABORT, ss->mmio_base + MBOX_CONTROL_OFFSET);
+}
+
+/*
+ * Update the chunk size and decide whether another chunk can be sent.
+ * This accounts for remaining data and retry limits.
+ */
+static bool can_send_next_chunk(struct staging_state *ss, int *err)
+{
+ /* A page size or remaining bytes if this is the final chunk */
+ ss->chunk_size = min(PAGE_SIZE, ss->ucode_len - ss->offset);
+
+ /*
+ * Each microcode image is divided into chunks, each at most
+ * one page size. A 10-chunk image would typically require 10
+ * transactions.
+ *
+ * However, the hardware managing the mailbox has limited
+ * resources and may not cache the entire image, potentially
+ * requesting the same chunk multiple times.
+ *
+ * To tolerate this behavior, allow up to twice the expected
+ * number of transactions (i.e., a 10-chunk image can take up to
+ * 20 attempts).
+ *
+ * If the number of attempts exceeds this limit, treat it as
+ * exceeding the maximum allowed transfer size.
+ */
+ if (ss->bytes_sent + ss->chunk_size > ss->ucode_len * 2) {
+ *err = -EMSGSIZE;
+ return false;
+ }
+
+ *err = 0;
+ return true;
+}
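A quick sanity check of the retry budget described above, with a hypothetical 10-chunk (40 KiB) image: bytes_sent also counts re-sent chunks, so the loop stops after at most 20 full-page transactions before -EMSGSIZE is reported.

/* Userspace sketch of the retry budget used by can_send_next_chunk() above. */
#include <stdio.h>

int main(void)
{
	unsigned int ucode_len = 10 * 4096;	/* hypothetical 10-chunk image */
	unsigned int chunk_size = 4096;		/* one page per transaction */
	unsigned int bytes_sent = 0, transactions = 0;

	/* Even if the device re-requests chunks, stop once the 2x budget is spent. */
	while (bytes_sent + chunk_size <= ucode_len * 2) {
		bytes_sent += chunk_size;
		transactions++;
	}
	printf("transactions before -EMSGSIZE: %u\n", transactions);	/* prints 20 */
	return 0;
}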
+
+/*
+ * The hardware indicates completion by returning a sentinel end offset.
+ */
+static inline bool is_end_offset(u32 offset)
+{
+ return offset == UINT_MAX;
+}
+
+/*
+ * Determine whether staging is complete: either the hardware signaled
+ * the end offset, or no more transactions are permitted (retry limit
+ * reached).
+ */
+static inline bool staging_is_complete(struct staging_state *ss, int *err)
+{
+ return is_end_offset(ss->offset) || !can_send_next_chunk(ss, err);
+}
+
+/*
+ * Wait for the hardware to complete a transaction.
+ * Return 0 on success, or an error code on failure.
+ */
+static int wait_for_transaction(struct staging_state *ss)
+{
+ u32 timeout, status;
+
+ /* Allow time for hardware to complete the operation: */
+ for (timeout = 0; timeout < MBOX_XACTION_TIMEOUT_MS; timeout++) {
+ msleep(1);
+
+ status = readl(ss->mmio_base + MBOX_STATUS_OFFSET);
+ /* Break out early if the hardware is ready: */
+ if (status & MASK_MBOX_STATUS_READY)
+ break;
+ }
+
+ /* Check for explicit error response */
+ if (status & MASK_MBOX_STATUS_ERROR)
+ return -EIO;
+
+ /*
+ * Hardware has neither responded to the action nor signaled any
+ * error. Treat this as a timeout.
+ */
+ if (!(status & MASK_MBOX_STATUS_READY))
+ return -ETIMEDOUT;
+
+ return 0;
+}
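For comparison only: the same wait could arguably be expressed with the readl_poll_timeout() helper from <linux/iopoll.h>. This is a hedged sketch, not what the patch does; the open-coded loop above keeps the error-vs-timeout distinction explicit.

/* Alternative sketch, assuming <linux/iopoll.h> is included. */
static int wait_for_transaction_alt(struct staging_state *ss)
{
	u32 status;
	int ret;

	/* Poll the status register every 1 ms, up to MBOX_XACTION_TIMEOUT_MS. */
	ret = readl_poll_timeout(ss->mmio_base + MBOX_STATUS_OFFSET, status,
				 status & MASK_MBOX_STATUS_READY,
				 USEC_PER_MSEC, MBOX_XACTION_TIMEOUT_MS * USEC_PER_MSEC);

	/* An explicit error response takes precedence over a timeout. */
	if (status & MASK_MBOX_STATUS_ERROR)
		return -EIO;

	return ret;	/* 0, or -ETIMEDOUT if READY never appeared */
}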
+
+/*
+ * Transmit a chunk of the microcode image to the hardware.
+ * Return 0 on success, or an error code on failure.
+ */
+static int send_data_chunk(struct staging_state *ss, void *ucode_ptr)
+{
+ u32 *src_chunk = ucode_ptr + ss->offset;
+ u16 mbox_size;
+
+ /*
+ * Write a 'request' mailbox object in this order:
+ * 1. Mailbox header includes total size
+ * 2. Command header specifies the load operation
+ * 3. Data section contains a microcode chunk
+ *
+ * Thus, the mailbox size is two headers plus the chunk size.
+ */
+ mbox_size = MBOX_HEADER_SIZE * 2 + ss->chunk_size;
+ write_mbox_header(ss->mmio_base, MBOX_HEADER(mbox_size));
+ write_mbox_header(ss->mmio_base, MBOX_CMD_LOAD);
+ write_mbox_data(ss->mmio_base, src_chunk, ss->chunk_size);
+ ss->bytes_sent += ss->chunk_size;
+
+ /* Notify the hardware that the mailbox is ready for processing. */
+ writel(MASK_MBOX_CTRL_GO, ss->mmio_base + MBOX_CONTROL_OFFSET);
+
+ return wait_for_transaction(ss);
+}
+
+/*
+ * Retrieve the next offset from the hardware response.
+ * Return 0 on success, or an error code on failure.
+ */
+static int fetch_next_offset(struct staging_state *ss)
+{
+ const u64 expected_header = MBOX_HEADER(MBOX_HEADER_SIZE + MBOX_RESPONSE_SIZE);
+ u32 offset, status;
+ u64 header;
+
+ /*
+ * The 'response' mailbox returns three fields, in order:
+ * 1. Header
+ * 2. Next offset in the microcode image
+ * 3. Status flags
+ */
+ header = read_mbox_header(ss->mmio_base);
+ offset = read_mbox_dword(ss->mmio_base);
+ status = read_mbox_dword(ss->mmio_base);
+
+ /* All valid responses must start with the expected header. */
+ if (header != expected_header) {
+ pr_err_once("staging: invalid response header (0x%llx)\n", header);
+ return -EBADR;
+ }
+
+ /*
+ * Verify the offset: If not at the end marker, it must not
+ * exceed the microcode image length.
+ */
+ if (!is_end_offset(offset) && offset > ss->ucode_len) {
+ pr_err_once("staging: invalid offset (%u) past the image end (%u)\n",
+ offset, ss->ucode_len);
+ return -EINVAL;
+ }
+
+ /* Hardware may report errors explicitly in the status field */
+ if (status & MASK_MBOX_RESP_ERROR)
+ return -EPROTO;
+
+ ss->offset = offset;
+ return 0;
+}
+
+/*
+ * Handle the staging process using the mailbox MMIO interface. The
+ * microcode image is transferred in chunks until completion.
+ * Return 0 on success or an error code on failure.
+ */
+static int do_stage(u64 mmio_pa)
+{
+ struct staging_state ss = {};
+ int err;
+
+ ss.mmio_base = ioremap(mmio_pa, MBOX_REG_NUM * MBOX_REG_SIZE);
+ if (WARN_ON_ONCE(!ss.mmio_base))
+ return -EADDRNOTAVAIL;
+
+ init_stage(&ss);
+
+ /* Perform the staging process while within the retry limit */
+ while (!staging_is_complete(&ss, &err)) {
+ /* Send a chunk of microcode each time: */
+ err = send_data_chunk(&ss, ucode_patch_late);
+ if (err)
+ break;
+ /*
+ * Then, ask the hardware which piece of the image it
+ * needs next. The same piece may be sent more than once.
+ */
+ err = fetch_next_offset(&ss);
+ if (err)
+ break;
+ }
+
+ iounmap(ss.mmio_base);
+
+ return err;
+}
+
+static void stage_microcode(void)
+{
+ unsigned int pkg_id = UINT_MAX;
+ int cpu, err;
+ u64 mmio_pa;
+
+ if (!IS_ALIGNED(get_totalsize(&ucode_patch_late->hdr), sizeof(u32))) {
+ pr_err("Microcode image 32-bit misaligned (0x%x), staging failed.\n",
+ get_totalsize(&ucode_patch_late->hdr));
+ return;
+ }
+
+ lockdep_assert_cpus_held();
+
+ /*
+ * The MMIO address is unique per package, and all the SMT
+ * primary threads are online here. Track package IDs so that
+ * each package's MMIO space is staged only once.
+ */
+ for_each_cpu(cpu, cpu_primary_thread_mask) {
+ if (topology_logical_package_id(cpu) == pkg_id)
+ continue;
+
+ pkg_id = topology_logical_package_id(cpu);
+
+ err = rdmsrq_on_cpu(cpu, MSR_IA32_MCU_STAGING_MBOX_ADDR, &mmio_pa);
+ if (WARN_ON_ONCE(err))
+ return;
+
+ err = do_stage(mmio_pa);
+ if (err) {
+ pr_err("Error: staging failed (%d) for CPU%d at package %u.\n",
+ err, cpu, pkg_id);
+ return;
+ }
+ }
+
+ pr_info("Staging of patch revision 0x%x succeeded.\n", ucode_patch_late->hdr.rev);
+}
+
static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci,
struct microcode_intel *mc,
u32 *cur_rev)
@@ -627,6 +971,7 @@ static struct microcode_ops microcode_intel_ops = {
.collect_cpu_info = collect_cpu_info,
.apply_microcode = apply_microcode_late,
.finalize_late_load = finalize_late_load,
+ .stage_microcode = stage_microcode,
.use_nmi = IS_ENABLED(CONFIG_X86_64),
};
@@ -638,6 +983,18 @@ static __init void calc_llc_size_per_core(struct cpuinfo_x86 *c)
llc_size_per_core = (unsigned int)llc_size;
}
+static __init bool staging_available(void)
+{
+ u64 val;
+
+ val = x86_read_arch_cap_msr();
+ if (!(val & ARCH_CAP_MCU_ENUM))
+ return false;
+
+ rdmsrq(MSR_IA32_MCU_ENUMERATION, val);
+ return !!(val & MCU_STAGING);
+}
+
struct microcode_ops * __init init_intel_microcode(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -648,6 +1005,11 @@ struct microcode_ops * __init init_intel_microcode(void)
return NULL;
}
+ if (staging_available()) {
+ microcode_intel_ops.use_staging = true;
+ pr_info("Enabled staging feature.\n");
+ }
+
calc_llc_size_per_core(c);
return &microcode_intel_ops;
diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h
index ae8dbc2b908d..a10b547eda1e 100644
--- a/arch/x86/kernel/cpu/microcode/internal.h
+++ b/arch/x86/kernel/cpu/microcode/internal.h
@@ -31,10 +31,12 @@ struct microcode_ops {
* See also the "Synchronization" section in microcode_core.c.
*/
enum ucode_state (*apply_microcode)(int cpu);
+ void (*stage_microcode)(void);
int (*collect_cpu_info)(int cpu, struct cpu_signature *csig);
void (*finalize_late_load)(int result);
unsigned int nmi_safe : 1,
- use_nmi : 1;
+ use_nmi : 1,
+ use_staging : 1;
};
struct early_load_data {
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 8c18327eb10b..0863733858dc 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -89,7 +89,6 @@ static int mtrr_state_set;
u64 mtrr_tom2;
struct mtrr_state_type mtrr_state;
-EXPORT_SYMBOL_GPL(mtrr_state);
/* Reserved bits in the high portion of the MTRRphysBaseN MSR. */
u32 phys_hi_rsvd;
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 5655f253d929..2de3bd2f95d1 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -46,10 +46,6 @@ struct set_mtrr_context {
u32 ccr3;
};
-void set_mtrr_done(struct set_mtrr_context *ctxt);
-void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
-void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
-
void fill_mtrr_var_range(unsigned int index,
u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
bool get_mtrr_state(void);
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 06ca5a30140c..3792ab4819dc 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -274,6 +274,11 @@ static void rdt_get_cdp_config(int level)
rdt_resources_all[level].r_resctrl.cdp_capable = true;
}
+static void rdt_set_io_alloc_capable(struct rdt_resource *r)
+{
+ r->cache.io_alloc_capable = true;
+}
+
static void rdt_get_cdp_l3_config(void)
{
rdt_get_cdp_config(RDT_RESOURCE_L3);
@@ -719,6 +724,7 @@ enum {
RDT_FLAG_SMBA,
RDT_FLAG_BMEC,
RDT_FLAG_ABMC,
+ RDT_FLAG_SDCIAE,
};
#define RDT_OPT(idx, n, f) \
@@ -745,6 +751,7 @@ static struct rdt_options rdt_options[] __ro_after_init = {
RDT_OPT(RDT_FLAG_SMBA, "smba", X86_FEATURE_SMBA),
RDT_OPT(RDT_FLAG_BMEC, "bmec", X86_FEATURE_BMEC),
RDT_OPT(RDT_FLAG_ABMC, "abmc", X86_FEATURE_ABMC),
+ RDT_OPT(RDT_FLAG_SDCIAE, "sdciae", X86_FEATURE_SDCIAE),
};
#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options)
@@ -853,6 +860,8 @@ static __init bool get_rdt_alloc_resources(void)
rdt_get_cache_alloc_cfg(1, r);
if (rdt_cpu_has(X86_FEATURE_CDP_L3))
rdt_get_cdp_l3_config();
+ if (rdt_cpu_has(X86_FEATURE_SDCIAE))
+ rdt_set_io_alloc_capable(r);
ret = true;
}
if (rdt_cpu_has(X86_FEATURE_CAT_L2)) {
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index 1189c0df4ad7..b20e705606b8 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -91,3 +91,43 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
return hw_dom->ctrl_val[idx];
}
+
+bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r)
+{
+ return resctrl_to_arch_res(r)->sdciae_enabled;
+}
+
+static void resctrl_sdciae_set_one_amd(void *arg)
+{
+ bool *enable = arg;
+
+ if (*enable)
+ msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, SDCIAE_ENABLE_BIT);
+ else
+ msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, SDCIAE_ENABLE_BIT);
+}
+
+static void _resctrl_sdciae_enable(struct rdt_resource *r, bool enable)
+{
+ struct rdt_ctrl_domain *d;
+
+ /* Walking r->ctrl_domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ /* Update MSR_IA32_L3_QOS_EXT_CFG MSR on all the CPUs in all domains */
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list)
+ on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_sdciae_set_one_amd, &enable, 1);
+}
+
+int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+ if (hw_res->r_resctrl.cache.io_alloc_capable &&
+ hw_res->sdciae_enabled != enable) {
+ _resctrl_sdciae_enable(r, enable);
+ hw_res->sdciae_enabled = enable;
+ }
+
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 9f4c2f0aaf5c..4a916c84a322 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -46,6 +46,9 @@ struct arch_mbm_state {
#define ABMC_EXTENDED_EVT_ID BIT(31)
#define ABMC_EVT_ID BIT(0)
+/* Setting bit 1 in MSR_IA32_L3_QOS_EXT_CFG enables the SDCIAE feature. */
+#define SDCIAE_ENABLE_BIT 1
+
/**
* struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share
* a resource for a control function
@@ -112,6 +115,7 @@ struct msr_param {
* @mbm_width: Monitor width, to detect and correct for overflow.
* @cdp_enabled: CDP state of this resource
* @mbm_cntr_assign_enabled: ABMC feature is enabled
+ * @sdciae_enabled: SDCIAE feature (backing "io_alloc") is enabled.
*
* Members of this structure are either private to the architecture
* e.g. mbm_width, or accessed via helpers that provide abstraction. e.g.
@@ -126,6 +130,7 @@ struct rdt_hw_resource {
unsigned int mbm_width;
bool cdp_enabled;
bool mbm_cntr_assign_enabled;
+ bool sdciae_enabled;
};
static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r)
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index fe1a2aa53c16..dffcc8307500 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -361,6 +361,7 @@ static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0),
X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0),
X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0),
+ X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, 0),
{}
};
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index caa4dc885c21..cde4b6cd3471 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -43,7 +43,11 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 },
{ X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 },
{ X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 },
+ { X86_FEATURE_SGX_EUPDATESVN, CPUID_EAX, 10, 0x00000012, 0 },
{ X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 },
+ { X86_FEATURE_OVERFLOW_RECOV, CPUID_EBX, 0, 0x80000007, 0 },
+ { X86_FEATURE_SUCCOR, CPUID_EBX, 1, 0x80000007, 0 },
+ { X86_FEATURE_SMCA, CPUID_EBX, 3, 0x80000007, 0 },
{ X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
@@ -53,6 +57,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 },
{ X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 },
{ X86_FEATURE_ABMC, CPUID_EBX, 5, 0x80000020, 0 },
+ { X86_FEATURE_SDCIAE, CPUID_EBX, 6, 0x80000020, 0 },
{ X86_FEATURE_TSA_SQ_NO, CPUID_ECX, 1, 0x80000021, 0 },
{ X86_FEATURE_TSA_L1_NO, CPUID_ECX, 2, 0x80000021, 0 },
{ X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 },
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index 7f8d1e11dbee..79d6020dfe9c 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -14,7 +14,7 @@ u64 sgx_attributes_reserved_mask;
u64 sgx_xfrm_reserved_mask = ~0x3;
u32 sgx_misc_reserved_mask;
-static int sgx_open(struct inode *inode, struct file *file)
+static int __sgx_open(struct inode *inode, struct file *file)
{
struct sgx_encl *encl;
int ret;
@@ -41,6 +41,23 @@ static int sgx_open(struct inode *inode, struct file *file)
return 0;
}
+static int sgx_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = sgx_inc_usage_count();
+ if (ret)
+ return ret;
+
+ ret = __sgx_open(inode, file);
+ if (ret) {
+ sgx_dec_usage_count();
+ return ret;
+ }
+
+ return 0;
+}
+
static int sgx_release(struct inode *inode, struct file *file)
{
struct sgx_encl *encl = file->private_data;
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 308dbbae6c6e..cf149b9f4916 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -765,6 +765,7 @@ void sgx_encl_release(struct kref *ref)
WARN_ON_ONCE(encl->secs.epc_page);
kfree(encl);
+ sgx_dec_usage_count();
}
/*
diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
index 42a088a337c5..74be751199a4 100644
--- a/arch/x86/kernel/cpu/sgx/encls.h
+++ b/arch/x86/kernel/cpu/sgx/encls.h
@@ -233,4 +233,9 @@ static inline int __eaug(struct sgx_pageinfo *pginfo, void *addr)
return __encls_2(EAUG, pginfo, addr);
}
+/* Attempt to update CPUSVN at runtime. */
+static inline int __eupdatesvn(void)
+{
+ return __encls_ret_1(EUPDATESVN, "");
+}
#endif /* _X86_ENCLS_H */
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 2de01b379aa3..dc73194416ac 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -5,6 +5,7 @@
#include <linux/freezer.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
+#include <linux/kvm_types.h>
#include <linux/miscdevice.h>
#include <linux/node.h>
#include <linux/pagemap.h>
@@ -16,6 +17,7 @@
#include <linux/vmalloc.h>
#include <asm/msr.h>
#include <asm/sgx.h>
+#include <asm/archrandom.h>
#include "driver.h"
#include "encl.h"
#include "encls.h"
@@ -915,7 +917,107 @@ int sgx_set_attribute(unsigned long *allowed_attributes,
*allowed_attributes |= SGX_ATTR_PROVISIONKEY;
return 0;
}
-EXPORT_SYMBOL_GPL(sgx_set_attribute);
+EXPORT_SYMBOL_FOR_KVM(sgx_set_attribute);
+
+/* Counter to count the active SGX users */
+static int sgx_usage_count;
+
+/**
+ * sgx_update_svn() - Attempt to call ENCLS[EUPDATESVN].
+ *
+ * This instruction attempts to update CPUSVN to the
+ * currently loaded microcode update SVN and generate new
+ * cryptographic assets.
+ *
+ * Return:
+ * * %0: - Success or not supported
+ * * %-EAGAIN: - Can be safely retried, failure is due to lack of
+ *   entropy in RNG
+ * * %-EIO: - Unexpected error, retries are not advisable
+ */
+static int sgx_update_svn(void)
+{
+ int ret;
+
+ /*
+ * If EUPDATESVN is not available, it is ok to
+ * silently skip it to comply with legacy behavior.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_SGX_EUPDATESVN))
+ return 0;
+
+ /*
+ * EPC is guaranteed to be empty when there are no users.
+ * Ensure we are on our first user before proceeding further.
+ */
+ WARN(sgx_usage_count, "Elevated usage count when calling EUPDATESVN\n");
+
+ for (int i = 0; i < RDRAND_RETRY_LOOPS; i++) {
+ ret = __eupdatesvn();
+
+ /* Stop on success or unexpected errors: */
+ if (ret != SGX_INSUFFICIENT_ENTROPY)
+ break;
+ }
+
+ switch (ret) {
+ case 0:
+ /*
+ * SVN successfully updated.
+ * Let users know when the update was successful.
+ */
+ pr_info("SVN updated successfully\n");
+ return 0;
+ case SGX_NO_UPDATE:
+ /*
+ * SVN update failed since the current SVN is
+ * not newer than CPUSVN. This is the most
+ * common case and indicates no harm.
+ */
+ return 0;
+ case SGX_INSUFFICIENT_ENTROPY:
+ /*
+ * SVN update failed due to lack of entropy in DRNG.
+ * Indicate to userspace that it should retry.
+ */
+ return -EAGAIN;
+ default:
+ break;
+ }
+
+ /*
+ * EUPDATESVN was called while the EPC is empty; any other error
+ * code is unexpected.
+ */
+ ENCLS_WARN(ret, "EUPDATESVN");
+ return -EIO;
+}
+
+/* Mutex to ensure no concurrent EPC accesses during EUPDATESVN */
+static DEFINE_MUTEX(sgx_svn_lock);
+
+int sgx_inc_usage_count(void)
+{
+ int ret;
+
+ guard(mutex)(&sgx_svn_lock);
+
+ if (!sgx_usage_count) {
+ ret = sgx_update_svn();
+ if (ret)
+ return ret;
+ }
+
+ sgx_usage_count++;
+
+ return 0;
+}
+
+void sgx_dec_usage_count(void)
+{
+ guard(mutex)(&sgx_svn_lock);
+ sgx_usage_count--;
+}
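Because the -EAGAIN from sgx_update_svn() propagates through sgx_inc_usage_count() to the first open of the SGX device nodes, userspace can simply retry the open. A hedged sketch (the device path and retry policy below are illustrative, not mandated by the patch):

/* Userspace sketch: retry opening the SGX enclave device on EAGAIN. */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = -1;

	for (int i = 0; i < 10; i++) {
		fd = open("/dev/sgx_enclave", O_RDWR);
		if (fd >= 0 || errno != EAGAIN)
			break;
		usleep(1000);	/* brief pause before retrying on EAGAIN */
	}
	if (fd < 0) {
		perror("open");
		return 1;
	}
	close(fd);
	return 0;
}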
static int __init sgx_init(void)
{
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index d2dad21259a8..f5940393d9bd 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -102,6 +102,9 @@ static inline int __init sgx_vepc_init(void)
}
#endif
+int sgx_inc_usage_count(void);
+void sgx_dec_usage_count(void);
+
void sgx_update_lepubkeyhash(u64 *lepubkeyhash);
#endif /* _X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
index 7aaa3652e31d..8de1f1a755f2 100644
--- a/arch/x86/kernel/cpu/sgx/virt.c
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -5,6 +5,7 @@
* Copyright(c) 2021 Intel Corporation.
*/
+#include <linux/kvm_types.h>
#include <linux/miscdevice.h>
#include <linux/mm.h>
#include <linux/mman.h>
@@ -255,10 +256,11 @@ static int sgx_vepc_release(struct inode *inode, struct file *file)
xa_destroy(&vepc->page_array);
kfree(vepc);
+ sgx_dec_usage_count();
return 0;
}
-static int sgx_vepc_open(struct inode *inode, struct file *file)
+static int __sgx_vepc_open(struct inode *inode, struct file *file)
{
struct sgx_vepc *vepc;
@@ -273,6 +275,23 @@ static int sgx_vepc_open(struct inode *inode, struct file *file)
return 0;
}
+static int sgx_vepc_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = sgx_inc_usage_count();
+ if (ret)
+ return ret;
+
+ ret = __sgx_vepc_open(inode, file);
+ if (ret) {
+ sgx_dec_usage_count();
+ return ret;
+ }
+
+ return 0;
+}
+
static long sgx_vepc_ioctl(struct file *file,
unsigned int cmd, unsigned long arg)
{
@@ -363,7 +382,7 @@ int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
WARN_ON_ONCE(ret);
return 0;
}
-EXPORT_SYMBOL_GPL(sgx_virt_ecreate);
+EXPORT_SYMBOL_FOR_KVM(sgx_virt_ecreate);
static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
void __user *secs)
@@ -432,4 +451,4 @@ int sgx_virt_einit(void __user *sigstruct, void __user *token,
return ret;
}
-EXPORT_SYMBOL_GPL(sgx_virt_einit);
+EXPORT_SYMBOL_FOR_KVM(sgx_virt_einit);
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 6073a16628f9..f55ea3cdbf88 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -75,15 +75,11 @@ bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
return phys_id == (u64)cpuid_to_apicid[cpu];
}
-#ifdef CONFIG_SMP
static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid)
{
if (!(apicid & (__max_threads_per_core - 1)))
cpumask_set_cpu(cpu, &__cpu_primary_thread_mask);
}
-#else
-static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { }
-#endif
/*
* Convert the APIC ID to a domain level ID by masking out the low bits
diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c
index b5a5e1411469..71625795d711 100644
--- a/arch/x86/kernel/cpu/topology_common.c
+++ b/arch/x86/kernel/cpu/topology_common.c
@@ -16,6 +16,9 @@ EXPORT_SYMBOL_GPL(x86_topo_system);
unsigned int __amd_nodes_per_pkg __ro_after_init;
EXPORT_SYMBOL_GPL(__amd_nodes_per_pkg);
+/* CPUs which are the primary SMT threads */
+struct cpumask __cpu_primary_thread_mask __read_mostly;
+
void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom,
unsigned int shift, unsigned int ncpus)
{
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
index 49782724a943..209b5a22d880 100644
--- a/arch/x86/kernel/cpu/tsx.c
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -19,7 +19,17 @@
#undef pr_fmt
#define pr_fmt(fmt) "tsx: " fmt
-enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED;
+enum tsx_ctrl_states {
+ TSX_CTRL_AUTO,
+ TSX_CTRL_ENABLE,
+ TSX_CTRL_DISABLE,
+ TSX_CTRL_RTM_ALWAYS_ABORT,
+ TSX_CTRL_NOT_SUPPORTED,
+};
+
+static enum tsx_ctrl_states tsx_ctrl_state __ro_after_init =
+ IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO) ? TSX_CTRL_AUTO :
+ IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF) ? TSX_CTRL_DISABLE : TSX_CTRL_ENABLE;
static void tsx_disable(void)
{
@@ -156,11 +166,28 @@ static void tsx_dev_mode_disable(void)
}
}
-void __init tsx_init(void)
+static int __init tsx_parse_cmdline(char *str)
{
- char arg[5] = {};
- int ret;
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "on")) {
+ tsx_ctrl_state = TSX_CTRL_ENABLE;
+ } else if (!strcmp(str, "off")) {
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ } else if (!strcmp(str, "auto")) {
+ tsx_ctrl_state = TSX_CTRL_AUTO;
+ } else {
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ pr_err("invalid option, defaulting to off\n");
+ }
+
+ return 0;
+}
+early_param("tsx", tsx_parse_cmdline);
+void __init tsx_init(void)
+{
tsx_dev_mode_disable();
/*
@@ -194,27 +221,8 @@ void __init tsx_init(void)
return;
}
- ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg));
- if (ret >= 0) {
- if (!strcmp(arg, "on")) {
- tsx_ctrl_state = TSX_CTRL_ENABLE;
- } else if (!strcmp(arg, "off")) {
- tsx_ctrl_state = TSX_CTRL_DISABLE;
- } else if (!strcmp(arg, "auto")) {
- tsx_ctrl_state = x86_get_tsx_auto_mode();
- } else {
- tsx_ctrl_state = TSX_CTRL_DISABLE;
- pr_err("invalid option, defaulting to off\n");
- }
- } else {
- /* tsx= not provided */
- if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO))
- tsx_ctrl_state = x86_get_tsx_auto_mode();
- else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF))
- tsx_ctrl_state = TSX_CTRL_DISABLE;
- else
- tsx_ctrl_state = TSX_CTRL_ENABLE;
- }
+ if (tsx_ctrl_state == TSX_CTRL_AUTO)
+ tsx_ctrl_state = x86_get_tsx_auto_mode();
if (tsx_ctrl_state == TSX_CTRL_DISABLE) {
tsx_disable();
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 71ee20102a8a..b10684dedc58 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -181,8 +181,8 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
* in false positive reports. Disable instrumentation to avoid those.
*/
__no_kmsan_checks
-static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, const char *log_lvl)
+static void __show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, const char *log_lvl)
{
struct unwind_state state;
struct stack_info stack_info = {0};
@@ -303,6 +303,25 @@ next:
}
}
+static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, const char *log_lvl)
+{
+ /*
+ * Disable KASAN to avoid false positives while walking another
+ * task's stacks, as values on these stacks may change concurrently
+ * with task execution.
+ */
+ bool disable_kasan = task && task != current;
+
+ if (disable_kasan)
+ kasan_disable_current();
+
+ __show_trace_log_lvl(task, regs, stack, log_lvl);
+
+ if (disable_kasan)
+ kasan_enable_current();
+}
+
void show_stack(struct task_struct *task, unsigned long *sp,
const char *loglvl)
{
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index c3acbd26408b..b15b97d3cb52 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -16,6 +16,7 @@
#include <linux/firmware-map.h>
#include <linux/sort.h>
#include <linux/memory_hotplug.h>
+#include <linux/kvm_types.h>
#include <asm/e820/api.h>
#include <asm/setup.h>
@@ -95,7 +96,7 @@ bool e820__mapped_raw_any(u64 start, u64 end, enum e820_type type)
{
return _e820__mapped_any(e820_table_firmware, start, end, type);
}
-EXPORT_SYMBOL_GPL(e820__mapped_raw_any);
+EXPORT_SYMBOL_FOR_KVM(e820__mapped_raw_any);
bool e820__mapped_any(u64 start, u64 end, enum e820_type type)
{
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 1f71cc135e9a..da233f20ae6f 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -18,6 +18,7 @@
#include <uapi/asm/kvm.h>
#include <linux/hardirq.h>
+#include <linux/kvm_types.h>
#include <linux/pkeys.h>
#include <linux/vmalloc.h>
@@ -276,7 +277,7 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
return true;
}
-EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate);
+EXPORT_SYMBOL_FOR_KVM(fpu_alloc_guest_fpstate);
void fpu_free_guest_fpstate(struct fpu_guest *gfpu)
{
@@ -291,7 +292,7 @@ void fpu_free_guest_fpstate(struct fpu_guest *gfpu)
gfpu->fpstate = NULL;
vfree(fpstate);
}
-EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate);
+EXPORT_SYMBOL_FOR_KVM(fpu_free_guest_fpstate);
/*
* fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable
@@ -313,7 +314,7 @@ int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures)
return __xfd_enable_feature(xfeatures, guest_fpu);
}
-EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features);
+EXPORT_SYMBOL_FOR_KVM(fpu_enable_guest_xfd_features);
#ifdef CONFIG_X86_64
void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd)
@@ -324,7 +325,7 @@ void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd)
xfd_update_state(guest_fpu->fpstate);
fpregs_unlock();
}
-EXPORT_SYMBOL_GPL(fpu_update_guest_xfd);
+EXPORT_SYMBOL_FOR_KVM(fpu_update_guest_xfd);
/**
* fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state
@@ -348,7 +349,7 @@ void fpu_sync_guest_vmexit_xfd_state(void)
__this_cpu_write(xfd_state, fpstate->xfd);
}
}
-EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state);
+EXPORT_SYMBOL_FOR_KVM(fpu_sync_guest_vmexit_xfd_state);
#endif /* CONFIG_X86_64 */
int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest)
@@ -390,7 +391,7 @@ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest)
fpregs_unlock();
return 0;
}
-EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate);
+EXPORT_SYMBOL_FOR_KVM(fpu_swap_kvm_fpstate);
void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf,
unsigned int size, u64 xfeatures, u32 pkru)
@@ -409,7 +410,7 @@ void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf,
ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE;
}
}
-EXPORT_SYMBOL_GPL(fpu_copy_guest_fpstate_to_uabi);
+EXPORT_SYMBOL_FOR_KVM(fpu_copy_guest_fpstate_to_uabi);
int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
u64 xcr0, u32 *vpkru)
@@ -439,7 +440,7 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru);
}
-EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate);
+EXPORT_SYMBOL_FOR_KVM(fpu_copy_uabi_to_guest_fpstate);
#endif /* CONFIG_KVM */
void kernel_fpu_begin_mask(unsigned int kfpu_mask)
@@ -825,6 +826,9 @@ void fpu__clear_user_states(struct fpu *fpu)
!fpregs_state_valid(fpu, smp_processor_id()))
os_xrstor_supervisor(fpu->fpstate);
+ /* Ensure XFD state is in sync before reloading XSTATE */
+ xfd_update_state(fpu->fpstate);
+
/* Reset user states in registers. */
restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE);
@@ -854,7 +858,7 @@ void switch_fpu_return(void)
fpregs_restore_userregs();
}
-EXPORT_SYMBOL_GPL(switch_fpu_return);
+EXPORT_SYMBOL_FOR_KVM(switch_fpu_return);
void fpregs_lock_and_load(void)
{
@@ -889,7 +893,7 @@ void fpregs_assert_state_consistent(void)
WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id()));
}
-EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent);
+EXPORT_SYMBOL_FOR_KVM(fpregs_assert_state_consistent);
#endif
void fpregs_mark_activate(void)
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 28e4fd65c9da..48113c5193aa 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -8,6 +8,7 @@
#include <linux/compat.h>
#include <linux/cpu.h>
#include <linux/mman.h>
+#include <linux/kvm_types.h>
#include <linux/nospec.h>
#include <linux/pkeys.h>
#include <linux/seq_file.h>
@@ -1058,7 +1059,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
return __raw_xsave_addr(xsave, xfeature_nr);
}
-EXPORT_SYMBOL_GPL(get_xsave_addr);
+EXPORT_SYMBOL_FOR_KVM(get_xsave_addr);
/*
* Given an xstate feature nr, calculate where in the xsave buffer the state is.
@@ -1482,7 +1483,7 @@ void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeatu
if (addr)
memset(addr, 0, xstate_sizes[xfeature]);
}
-EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
+EXPORT_SYMBOL_FOR_KVM(fpstate_clear_xstate_component);
#endif
#ifdef CONFIG_X86_64
@@ -1818,7 +1819,7 @@ u64 xstate_get_guest_group_perm(void)
{
return xstate_get_group_perm(true);
}
-EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
+EXPORT_SYMBOL_FOR_KVM(xstate_get_guest_group_perm);
/**
* fpu_xstate_prctl - xstate permission operations
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 367da3638167..823dbdd0eb41 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -354,12 +354,17 @@ SYM_CODE_START(return_to_handler)
UNWIND_HINT_UNDEFINED
ANNOTATE_NOENDBR
+ /* Restore return_to_handler value that got eaten by previous ret instruction. */
+ subq $8, %rsp
+ UNWIND_HINT_FUNC
+
/* Save ftrace_regs for function exit context */
subq $(FRAME_SIZE), %rsp
movq %rax, RAX(%rsp)
movq %rdx, RDX(%rsp)
movq %rbp, RBP(%rsp)
+ movq %rsp, RSP(%rsp)
movq %rsp, %rdi
call ftrace_return_to_handler
@@ -368,7 +373,8 @@ SYM_CODE_START(return_to_handler)
movq RDX(%rsp), %rdx
movq RAX(%rsp), %rax
- addq $(FRAME_SIZE), %rsp
+ addq $(FRAME_SIZE) + 8, %rsp
+
/*
* Jump back to the old return address. This cannot be JMP_NOSPEC rdi
* since IBT would demand that contain ENDBR, which simply isn't so for
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index b01644c949b2..f846c15f21ca 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -24,6 +24,7 @@
#include <linux/percpu.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
+#include <linux/kvm_types.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/smp.h>
@@ -489,7 +490,7 @@ void hw_breakpoint_restore(void)
set_debugreg(DR6_RESERVED, 6);
set_debugreg(__this_cpu_read(cpu_dr7), 7);
}
-EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
+EXPORT_SYMBOL_FOR_KVM(hw_breakpoint_restore);
/*
* Handle debug exception notifications.
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 10721a125226..86f4e574de02 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -12,6 +12,7 @@
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/irq.h>
+#include <linux/kvm_types.h>
#include <asm/irq_stack.h>
#include <asm/apic.h>
@@ -361,7 +362,7 @@ void kvm_set_posted_intr_wakeup_handler(void (*handler)(void))
synchronize_rcu();
}
}
-EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler);
+EXPORT_SYMBOL_FOR_KVM(kvm_set_posted_intr_wakeup_handler);
/*
* Handler for POSTED_INTERRUPT_VECTOR.
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 3863d7709386..c1fac3a9fecc 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -141,7 +141,6 @@ bool can_boost(struct insn *insn, void *addr)
{
kprobe_opcode_t opcode;
insn_byte_t prefix;
- int i;
if (search_exception_tables((unsigned long)addr))
return false; /* Page fault may occur on this address. */
@@ -154,7 +153,7 @@ bool can_boost(struct insn *insn, void *addr)
if (insn->opcode.nbytes != 1)
return false;
- for_each_insn_prefix(insn, i, prefix) {
+ for_each_insn_prefix(insn, prefix) {
insn_attr_t attr;
attr = inat_get_opcode_attribute(prefix);
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 0aabd4c4e2c4..6f826a00eca2 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -103,7 +103,6 @@ static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
asm (
".pushsection .rodata\n"
- "optprobe_template_func:\n"
".global optprobe_template_entry\n"
"optprobe_template_entry:\n"
#ifdef CONFIG_X86_64
@@ -160,9 +159,6 @@ asm (
"optprobe_template_end:\n"
".popsection\n");
-void optprobe_template_func(void);
-STACK_FRAME_NON_STANDARD(optprobe_template_func);
-
#define TMPL_CLAC_IDX \
((long)optprobe_template_clac - (long)optprobe_template_entry)
#define TMPL_MOVE_IDX \
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index b67d7c59dca0..204765004c72 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -29,6 +29,7 @@
#include <linux/syscore_ops.h>
#include <linux/cc_platform.h>
#include <linux/efi.h>
+#include <linux/kvm_types.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
@@ -162,7 +163,7 @@ void kvm_async_pf_task_wait_schedule(u32 token)
}
finish_swait(&n.wq, &wait);
}
-EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule);
+EXPORT_SYMBOL_FOR_KVM(kvm_async_pf_task_wait_schedule);
static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
@@ -253,7 +254,7 @@ noinstr u32 kvm_read_and_reset_apf_flags(void)
return flags;
}
-EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags);
+EXPORT_SYMBOL_FOR_KVM(kvm_read_and_reset_apf_flags);
noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
{
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 0ffbae902e2f..11c45ce42694 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -97,6 +97,7 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
DEBUGP("%s relocate section %u to %u\n",
apply ? "Applying" : "Clearing",
relsec, sechdrs[relsec].sh_info);
+
for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
size_t size;
@@ -162,15 +163,17 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
if (apply) {
if (memcmp(loc, &zero, size)) {
- pr_err("x86/modules: Invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n",
- (int)ELF64_R_TYPE(rel[i].r_info), loc, val);
+ pr_err("x86/modules: Invalid relocation target, existing value is nonzero for sec %u, idx %u, type %d, loc %lx, val %llx\n",
+ relsec, i, (int)ELF64_R_TYPE(rel[i].r_info),
+ (unsigned long)loc, val);
return -ENOEXEC;
}
write(loc, &val, size);
} else {
if (memcmp(loc, &val, size)) {
- pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for type %d, loc %p, val %Lx\n",
- (int)ELF64_R_TYPE(rel[i].r_info), loc, val);
+ pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for sec %u, idx %u, type %d, loc %lx, val %llx\n",
+ relsec, i, (int)ELF64_R_TYPE(rel[i].r_info),
+ (unsigned long)loc, val);
return -ENOEXEC;
}
write(loc, &zero, size);
@@ -179,8 +182,8 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
return 0;
overflow:
- pr_err("overflow in relocation type %d val %Lx\n",
- (int)ELF64_R_TYPE(rel[i].r_info), val);
+ pr_err("overflow in relocation type %d val %llx sec %u idx %d\n",
+ (int)ELF64_R_TYPE(rel[i].r_info), val, relsec, i);
pr_err("`%s' likely not compiled with -mcmodel=kernel\n",
me->name);
return -ENOEXEC;
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index e17c16c54a37..4469c784eaa0 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -98,7 +98,7 @@ static int filter_write(u32 reg)
if (!__ratelimit(&fw_rs))
return 0;
- pr_warn("Write to unrecognized MSR 0x%x by %s (pid: %d).\n",
+ pr_warn("Write to unrecognized MSR 0x%x by %s (pid: %d), tainting CPU_OUT_OF_SPEC.\n",
reg, current->comm, current->pid);
pr_warn("See https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/about for details.\n");
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index be93ec7255bf..3d239ed12744 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -24,6 +24,7 @@
#include <linux/export.h>
#include <linux/atomic.h>
#include <linux/sched/clock.h>
+#include <linux/kvm_types.h>
#include <asm/cpu_entry_area.h>
#include <asm/traps.h>
@@ -613,9 +614,7 @@ DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx)
{
exc_nmi(regs);
}
-#if IS_MODULE(CONFIG_KVM_INTEL)
-EXPORT_SYMBOL_GPL(asm_exc_nmi_kvm_vmx);
-#endif
+EXPORT_SYMBOL_FOR_KVM(asm_exc_nmi_kvm_vmx);
#endif
#ifdef CONFIG_NMI_CHECK_CPU
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 52a5c03c353c..432c0a004c60 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -30,6 +30,7 @@
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/export.h>
+#include <linux/kvm_types.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
@@ -303,9 +304,7 @@ void current_save_fsgs(void)
save_fsgs(current);
local_irq_restore(flags);
}
-#if IS_ENABLED(CONFIG_KVM)
-EXPORT_SYMBOL_GPL(current_save_fsgs);
-#endif
+EXPORT_SYMBOL_FOR_KVM(current_save_fsgs);
static __always_inline void loadseg(enum which_selector which,
unsigned short sel)
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 964f6b0a3d68..6032fa9ec753 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -13,6 +13,7 @@
#include <linux/objtool.h>
#include <linux/pgtable.h>
#include <linux/kexec.h>
+#include <linux/kvm_types.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
@@ -541,7 +542,7 @@ void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback)
rcu_assign_pointer(cpu_emergency_virt_callback, callback);
}
-EXPORT_SYMBOL_GPL(cpu_emergency_register_virt_callback);
+EXPORT_SYMBOL_FOR_KVM(cpu_emergency_register_virt_callback);
void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback)
{
@@ -551,7 +552,7 @@ void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback)
rcu_assign_pointer(cpu_emergency_virt_callback, NULL);
synchronize_rcu();
}
-EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback);
+EXPORT_SYMBOL_FOR_KVM(cpu_emergency_unregister_virt_callback);
/*
* Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 11e20bb13aca..4ffba68dc57b 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -95,9 +95,12 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
/* Leave CR4 in %r13 to enable the right paging mode later. */
movq %cr4, %r13
- /* Disable global pages immediately to ensure this mapping is RWX */
+ /*
+ * Disable global pages immediately to ensure this mapping is RWX.
+ * Disable LASS before jumping to the identity mapped page.
+ */
movq %r13, %r12
- andq $~(X86_CR4_PGE), %r12
+ andq $~(X86_CR4_PGE | X86_CR4_LASS), %r12
movq %r12, %cr4
/* Save %rsp and CRs. */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index eb289abece23..5cd6950ab672 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -103,9 +103,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_core_map);
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
EXPORT_PER_CPU_SYMBOL(cpu_die_map);
-/* CPUs which are the primary SMT threads */
-struct cpumask __cpu_primary_thread_mask __read_mostly;
-
/* Representing CPUs for which sibling maps can be computed */
static cpumask_var_t cpu_sibling_setup_mask;
@@ -515,6 +512,76 @@ static void __init build_sched_topology(void)
set_sched_topology(topology);
}
+#ifdef CONFIG_NUMA
+static int sched_avg_remote_distance;
+static int avg_remote_numa_distance(void)
+{
+ int i, j;
+ int distance, nr_remote, total_distance;
+
+ if (sched_avg_remote_distance > 0)
+ return sched_avg_remote_distance;
+
+ nr_remote = 0;
+ total_distance = 0;
+ for_each_node_state(i, N_CPU) {
+ for_each_node_state(j, N_CPU) {
+ distance = node_distance(i, j);
+
+ if (distance >= REMOTE_DISTANCE) {
+ nr_remote++;
+ total_distance += distance;
+ }
+ }
+ }
+ if (nr_remote)
+ sched_avg_remote_distance = total_distance / nr_remote;
+ else
+ sched_avg_remote_distance = REMOTE_DISTANCE;
+
+ return sched_avg_remote_distance;
+}
+
+int arch_sched_node_distance(int from, int to)
+{
+ int d = node_distance(from, to);
+
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_GRANITERAPIDS_X:
+ case INTEL_ATOM_DARKMONT_X:
+
+ if (!x86_has_numa_in_package || topology_max_packages() == 1 ||
+ d < REMOTE_DISTANCE)
+ return d;
+
+ /*
+ * With SNC enabled, there can be too many distinct remote NUMA
+ * node distances, creating NUMA domain levels that mix local
+ * nodes with only part of the remote nodes.
+ *
+ * For the purpose of building sched domains, trim that finer
+ * distance tuning for NUMA nodes in the remote package and
+ * group all NUMA nodes of the remote package into the same
+ * sched group. This simplifies the NUMA domains and avoids
+ * extra NUMA levels spanning different remote NUMA nodes and
+ * local nodes.
+ *
+ * GNR and CWF systems are not expected to have more than 2
+ * packages or more than 2 hops between packages. A single
+ * average remote distance would not be appropriate with more
+ * than 2 packages, as the average distance to different remote
+ * packages could differ.
+ */
+ WARN_ONCE(topology_max_packages() > 2,
+ "sched: Expect only up to 2 packages for GNR or CWF, "
+ "but saw %d packages when building sched domains.",
+ topology_max_packages());
+
+ d = avg_remote_numa_distance();
+ }
+ return d;
+}
+#endif /* CONFIG_NUMA */
+
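For illustration, avg_remote_numa_distance() above collapses all cross-package SLIT distances into a single value. A userspace sketch with a made-up 4-node, 2-package SNC topology (LOCAL_DISTANCE 10 and REMOTE_DISTANCE 30, matching the kernel's defaults):

/* Userspace sketch of avg_remote_numa_distance() on a hypothetical SLIT. */
#include <stdio.h>

#define REMOTE_DISTANCE	30

static const int dist[4][4] = {
	{ 10, 12, 32, 34 },	/* nodes 0,1 in package 0 */
	{ 12, 10, 34, 32 },
	{ 32, 34, 10, 12 },	/* nodes 2,3 in package 1 */
	{ 34, 32, 12, 10 },
};

int main(void)
{
	int total = 0, nr_remote = 0;

	for (int i = 0; i < 4; i++) {
		for (int j = 0; j < 4; j++) {
			if (dist[i][j] >= REMOTE_DISTANCE) {
				nr_remote++;
				total += dist[i][j];
			}
		}
	}
	/* The 8 cross-package distances collapse to (4*32 + 4*34)/8 = 33. */
	printf("avg remote distance: %d\n", total / nr_remote);
	return 0;
}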
void set_cpu_sibling_map(int cpu)
{
bool has_smt = __max_threads_per_core > 1;
@@ -1328,11 +1395,7 @@ void __noreturn hlt_play_dead(void)
native_halt();
}
-/*
- * native_play_dead() is essentially a __noreturn function, but it can't
- * be marked as such as the compiler may complain about it.
- */
-void native_play_dead(void)
+void __noreturn native_play_dead(void)
{
if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
__update_spec_ctrl(0);
@@ -1351,7 +1414,7 @@ int native_cpu_disable(void)
return -ENOSYS;
}
-void native_play_dead(void)
+void __noreturn native_play_dead(void)
{
BUG();
}
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index 378c388d1b31..2892cdb14563 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -26,6 +26,11 @@ static const u8 xor5rax[] = { 0x2e, 0x2e, 0x2e, 0x31, 0xc0 };
static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc };
+/*
+ * ud1 (%edx),%rdi -- see __WARN_trap() / decode_bug()
+ */
+static const u8 warninsn[] = { 0x67, 0x48, 0x0f, 0xb9, 0x3a };
+
static u8 __is_Jcc(u8 *insn) /* Jcc.d32 */
{
u8 ret = 0;
@@ -69,7 +74,10 @@ static void __ref __static_call_transform(void *insn, enum insn_type type,
emulate = code;
code = &xor5rax;
}
-
+ if (func == &__WARN_trap) {
+ emulate = code;
+ code = &warninsn;
+ }
break;
case NOP:
@@ -128,7 +136,8 @@ static void __static_call_validate(u8 *insn, bool tail, bool tramp)
} else {
if (opcode == CALL_INSN_OPCODE ||
!memcmp(insn, x86_nops[5], 5) ||
- !memcmp(insn, xor5rax, 5))
+ !memcmp(insn, xor5rax, 5) ||
+ !memcmp(insn, warninsn, 5))
return;
}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 6b22611e69cc..bcf1dedc1d00 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -31,6 +31,7 @@
#include <linux/kexec.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
+#include <linux/static_call.h>
#include <linux/timer.h>
#include <linux/init.h>
#include <linux/bug.h>
@@ -102,25 +103,37 @@ __always_inline int is_valid_bugaddr(unsigned long addr)
* UBSan{0}: 67 0f b9 00 ud1 (%eax),%eax
* UBSan{10}: 67 0f b9 40 10 ud1 0x10(%eax),%eax
* static_call: 0f b9 cc ud1 %esp,%ecx
+ * __WARN_trap: 67 48 0f b9 3a ud1 (%edx),%reg
*
- * Notably UBSAN uses EAX, static_call uses ECX.
+ * Notably, since __WARN_trap can use all registers, the distinction between
+ * UD1 users is through R/M.
*/
__always_inline int decode_bug(unsigned long addr, s32 *imm, int *len)
{
unsigned long start = addr;
+ u8 v, reg, rm, rex = 0;
+ int type = BUG_UD1;
bool lock = false;
- u8 v;
if (addr < TASK_SIZE_MAX)
return BUG_NONE;
- v = *(u8 *)(addr++);
- if (v == INSN_ASOP)
+ for (;;) {
v = *(u8 *)(addr++);
+ if (v == INSN_ASOP)
+ continue;
- if (v == INSN_LOCK) {
- lock = true;
- v = *(u8 *)(addr++);
+ if (v == INSN_LOCK) {
+ lock = true;
+ continue;
+ }
+
+ if ((v & 0xf0) == 0x40) {
+ rex = v;
+ continue;
+ }
+
+ break;
}
switch (v) {
@@ -156,18 +169,33 @@ __always_inline int decode_bug(unsigned long addr, s32 *imm, int *len)
if (X86_MODRM_MOD(v) != 3 && X86_MODRM_RM(v) == 4)
addr++; /* SIB */
+ reg = X86_MODRM_REG(v) + 8*!!X86_REX_R(rex);
+ rm = X86_MODRM_RM(v) + 8*!!X86_REX_B(rex);
+
/* Decode immediate, if present */
switch (X86_MODRM_MOD(v)) {
case 0: if (X86_MODRM_RM(v) == 5)
- addr += 4; /* RIP + disp32 */
+ addr += 4; /* RIP + disp32 */
+
+ if (rm == 0) /* (%eax) */
+ type = BUG_UD1_UBSAN;
+
+ if (rm == 2) { /* (%edx) */
+ *imm = reg;
+ type = BUG_UD1_WARN;
+ }
break;
case 1: *imm = *(s8 *)addr;
addr += 1;
+ if (rm == 0) /* (%eax) */
+ type = BUG_UD1_UBSAN;
break;
case 2: *imm = *(s32 *)addr;
addr += 4;
+ if (rm == 0) /* (%eax) */
+ type = BUG_UD1_UBSAN;
break;
case 3: break;
@@ -176,12 +204,76 @@ __always_inline int decode_bug(unsigned long addr, s32 *imm, int *len)
/* record instruction length */
*len = addr - start;
- if (X86_MODRM_REG(v) == 0) /* EAX */
- return BUG_UD1_UBSAN;
+ return type;
+}
- return BUG_UD1;
+static inline unsigned long pt_regs_val(struct pt_regs *regs, int nr)
+{
+ int offset = pt_regs_offset(regs, nr);
+	if (WARN_ON_ONCE(offset < 0))
+ return 0;
+ return *((unsigned long *)((void *)regs + offset));
}
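For illustration, a self-contained userspace sketch of the classification rules above, with no kernel helpers: (%eax)-based UD1s are UBSAN, a displacement-free (%edx)-based UD1 is __WARN_trap and its ModRM REG field (extended by REX.R) names the register carrying the bug_entry pointer; everything else stays a plain UD1.

#include <stdint.h>
#include <stdio.h>

enum ud1_kind { UD1_PLAIN, UD1_UBSAN, UD1_WARN, NOT_UD1 };

/*
 * Skip the prefixes decode_bug() tolerates (ASOP, LOCK, REX), find the
 * 0f b9 opcode and classify by the ModRM byte.  REX.R/REX.B extend the
 * REG and R/M fields exactly as in the kernel code above.
 */
static enum ud1_kind classify_ud1(const uint8_t *p, int *warn_reg)
{
	uint8_t rex = 0, b, modrm;

	for (;;) {
		b = *p++;
		if (b == 0x67 || b == 0xf0)	/* ASOP / LOCK prefix */
			continue;
		if ((b & 0xf0) == 0x40) {	/* REX prefix */
			rex = b;
			continue;
		}
		break;
	}

	if (b != 0x0f || *p++ != 0xb9)		/* not a UD1 opcode */
		return NOT_UD1;

	modrm = *p;
	int mod = modrm >> 6;
	int reg = ((modrm >> 3) & 7) + 8 * !!(rex & 0x4);	/* REX.R */
	int rm  = (modrm & 7)        + 8 * !!(rex & 0x1);	/* REX.B */

	if (mod != 3 && rm == 0)		/* (%eax) forms: UBSAN */
		return UD1_UBSAN;
	if (mod == 0 && rm == 2) {		/* (%edx): __WARN_trap */
		*warn_reg = reg;		/* register holding the bug_entry */
		return UD1_WARN;
	}
	return UD1_PLAIN;
}

int main(void)
{
	const uint8_t warn[]  = { 0x67, 0x48, 0x0f, 0xb9, 0x3a };	/* ud1 (%edx),%rdi */
	const uint8_t ubsan[] = { 0x67, 0x0f, 0xb9, 0x40, 0x10 };	/* ud1 0x10(%eax),%eax */
	const uint8_t sc[]    = { 0x0f, 0xb9, 0xcc };			/* ud1 %esp,%ecx */
	int reg = -1;

	printf("warn:  %d (reg %d)\n", classify_ud1(warn, &reg), reg);
	printf("ubsan: %d\n", classify_ud1(ubsan, &reg));
	printf("sc:    %d\n", classify_ud1(sc, &reg));
	return 0;
}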
+#ifdef HAVE_ARCH_BUG_FORMAT_ARGS
+DEFINE_STATIC_CALL(WARN_trap, __WARN_trap);
+EXPORT_STATIC_CALL_TRAMP(WARN_trap);
+
+/*
+ * Create a va_list from an exception context.
+ */
+void *__warn_args(struct arch_va_list *args, struct pt_regs *regs)
+{
+ /*
+ * Register save area; populate with function call argument registers
+ */
+ args->regs[0] = regs->di;
+ args->regs[1] = regs->si;
+ args->regs[2] = regs->dx;
+ args->regs[3] = regs->cx;
+ args->regs[4] = regs->r8;
+ args->regs[5] = regs->r9;
+
+ /*
+ * From the ABI document:
+ *
+ * @gp_offset - the element holds the offset in bytes from
+ * reg_save_area to the place where the next available general purpose
+ * argument register is saved. In case all argument registers have
+ * been exhausted, it is set to the value 48 (6*8).
+ *
+ * @fp_offset - the element holds the offset in bytes from
+ * reg_save_area to the place where the next available floating point
+ * argument is saved. In case all argument registers have been
+	 * exhausted, it is set to the value 176 (6*8 + 8*16).
+ *
+ * @overflow_arg_area - this pointer is used to fetch arguments passed
+ * on the stack. It is initialized with the address of the first
+ * argument passed on the stack, if any, and then always updated to
+ * point to the start of the next argument on the stack.
+ *
+ * @reg_save_area - the element points to the start of the register
+ * save area.
+ *
+ * Notably the vararg starts with the second argument and there are no
+ * floating point arguments in the kernel.
+ */
+ args->args.gp_offset = 1*8;
+ args->args.fp_offset = 6*8 + 8*16;
+ args->args.reg_save_area = &args->regs;
+ args->args.overflow_arg_area = (void *)regs->sp;
+
+ /*
+ * If the exception came from __WARN_trap, there is a return
+ * address on the stack, skip that. This is why any __WARN_trap()
+ * caller must inhibit tail-call optimization.
+ */
+ if ((void *)regs->ip == &__WARN_trap)
+ args->args.overflow_arg_area += 8;
+
+ return &args->args;
+}
+#endif /* HAVE_ARCH_BUG_FORMAT_ARGS */
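For illustration, a userspace sketch of the va_list layout quoted above. sysv_va_list mirrors the x86-64 SysV __va_list_tag (a platform assumption, GCC/Clang on x86-64 only); with one named GP argument, gp_offset comes out as 8, matching the 1*8 used by __warn_args().

#include <stdarg.h>
#include <stdio.h>
#include <string.h>

/* Mirror of the x86-64 SysV __va_list_tag layout (platform assumption). */
struct sysv_va_list {
	unsigned int gp_offset;		/* next GP register slot in reg_save_area */
	unsigned int fp_offset;		/* next FP register slot in reg_save_area */
	void *overflow_arg_area;	/* next stack-passed argument */
	void *reg_save_area;		/* base of the register save area */
};

static void peek(const char *fmt, ...)
{
	struct sysv_va_list v;
	va_list ap;

	va_start(ap, fmt);
	memcpy(&v, ap, sizeof(v));	/* va_list decays to a pointer on x86-64 */

	/*
	 * One named GP argument (fmt) was consumed, so gp_offset == 1*8.
	 * fp_offset is 6*8 here (no named FP args); __warn_args() above
	 * forces the "exhausted" value 176 since the kernel has no FP varargs.
	 */
	printf("gp_offset=%u fp_offset=%u\n", v.gp_offset, v.fp_offset);

	/* The first variadic GP argument sits at reg_save_area + gp_offset. */
	printf("first vararg = %ld\n",
	       *(long *)((char *)v.reg_save_area + v.gp_offset));

	va_end(ap);
}

int main(void)
{
	peek("x=%ld", 42L);	/* expect gp_offset=8, first vararg 42 */
	return 0;
}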
static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
@@ -334,6 +426,11 @@ static noinstr bool handle_bug(struct pt_regs *regs)
raw_local_irq_enable();
switch (ud_type) {
+ case BUG_UD1_WARN:
+ if (report_bug_entry((void *)pt_regs_val(regs, ud_imm), regs) == BUG_TRAP_TYPE_WARN)
+ handled = true;
+ break;
+
case BUG_UD2:
if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
handled = true;
@@ -635,13 +732,23 @@ DEFINE_IDTENTRY(exc_bounds)
enum kernel_gp_hint {
GP_NO_HINT,
GP_NON_CANONICAL,
- GP_CANONICAL
+ GP_CANONICAL,
+ GP_LASS_VIOLATION,
+ GP_NULL_POINTER,
+};
+
+static const char * const kernel_gp_hint_help[] = {
+ [GP_NON_CANONICAL] = "probably for non-canonical address",
+ [GP_CANONICAL] = "maybe for address",
+ [GP_LASS_VIOLATION] = "probably LASS violation for address",
+ [GP_NULL_POINTER] = "kernel NULL pointer dereference",
};
/*
* When an uncaught #GP occurs, try to determine the memory address accessed by
* the instruction and return that address to the caller. Also, try to figure
- * out whether any part of the access to that address was non-canonical.
+ * out whether any part of the access to that address was non-canonical or
+ * across privilege levels.
*/
static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
unsigned long *addr)
@@ -663,14 +770,28 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
return GP_NO_HINT;
#ifdef CONFIG_X86_64
+ /* Operand is in the kernel half */
+ if (*addr >= ~__VIRTUAL_MASK)
+ return GP_CANONICAL;
+
+ /* The last byte of the operand is not in the user canonical half */
+ if (*addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK)
+ return GP_NON_CANONICAL;
+
/*
- * Check that:
- * - the operand is not in the kernel half
- * - the last byte of the operand is not in the user canonical half
+ * A NULL pointer dereference usually causes a #PF. However, it
+ * can result in a #GP when LASS is active. Provide the same
+ * hint in the rare case that the condition is hit without LASS.
*/
- if (*addr < ~__VIRTUAL_MASK &&
- *addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK)
- return GP_NON_CANONICAL;
+ if (*addr < PAGE_SIZE)
+ return GP_NULL_POINTER;
+
+ /*
+ * Assume that LASS caused the exception, because the address is
+ * canonical and in the user half.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_LASS))
+ return GP_LASS_VIOLATION;
#endif
return GP_CANONICAL;
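For illustration, a userspace sketch of the hint selection above. The mask and page size assume 4-level paging with 4K pages; "lass" stands in for cpu_feature_enabled(X86_FEATURE_LASS).

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SKETCH_VIRTUAL_MASK	((1UL << 47) - 1)	/* 4-level paging assumption */
#define SKETCH_PAGE_SIZE	4096UL

enum gp_hint { HINT_CANONICAL, HINT_NON_CANONICAL, HINT_LASS, HINT_NULL };

/* Same ordering of checks as get_kernel_gp_address() above. */
static enum gp_hint classify_gp_addr(uint64_t addr, unsigned int opnd_bytes,
				     bool lass)
{
	if (addr >= ~SKETCH_VIRTUAL_MASK)		/* kernel half */
		return HINT_CANONICAL;

	if (addr + opnd_bytes - 1 > SKETCH_VIRTUAL_MASK)	/* straddles the hole */
		return HINT_NON_CANONICAL;

	if (addr < SKETCH_PAGE_SIZE)			/* NULL-ish pointer */
		return HINT_NULL;

	return lass ? HINT_LASS : HINT_CANONICAL;
}

int main(void)
{
	printf("%d\n", classify_gp_addr(0x10, 8, true));		 /* HINT_NULL */
	printf("%d\n", classify_gp_addr(0x00007fffffffffff, 8, false));	 /* HINT_NON_CANONICAL */
	printf("%d\n", classify_gp_addr(0xffff888000000000, 8, true));	 /* HINT_CANONICAL */
	printf("%d\n", classify_gp_addr(0x0000700000000000, 8, true));	 /* HINT_LASS */
	return 0;
}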
@@ -833,9 +954,7 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
if (hint != GP_NO_HINT)
snprintf(desc, sizeof(desc), GPFSTR ", %s 0x%lx",
- (hint == GP_NON_CANONICAL) ? "probably for non-canonical address"
- : "maybe for address",
- gp_addr);
+ kernel_gp_hint_help[hint], gp_addr);
/*
* KASAN is interested only in the non-canonical case, clear it
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 87e749106dda..7d3e13e14eab 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -11,6 +11,7 @@
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/clocksource.h>
+#include <linux/kvm_types.h>
#include <linux/percpu.h>
#include <linux/timex.h>
#include <linux/static_key.h>
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 845aeaf36b8d..7be8e361ca55 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -17,6 +17,7 @@
#include <linux/kdebug.h>
#include <asm/processor.h>
#include <asm/insn.h>
+#include <asm/insn-eval.h>
#include <asm/mmu_context.h>
#include <asm/nops.h>
@@ -258,9 +259,8 @@ static volatile u32 good_2byte_insns[256 / 32] = {
static bool is_prefix_bad(struct insn *insn)
{
insn_byte_t p;
- int i;
- for_each_insn_prefix(insn, i, p) {
+ for_each_insn_prefix(insn, p) {
insn_attr_t attr;
attr = inat_get_opcode_attribute(p);
@@ -1158,35 +1158,12 @@ unlock:
mmap_write_unlock(mm);
}
-static bool insn_is_nop(struct insn *insn)
-{
- return insn->opcode.nbytes == 1 && insn->opcode.bytes[0] == 0x90;
-}
-
-static bool insn_is_nopl(struct insn *insn)
-{
- if (insn->opcode.nbytes != 2)
- return false;
-
- if (insn->opcode.bytes[0] != 0x0f || insn->opcode.bytes[1] != 0x1f)
- return false;
-
- if (!insn->modrm.nbytes)
- return false;
-
- if (X86_MODRM_REG(insn->modrm.bytes[0]) != 0)
- return false;
-
- /* 0f 1f /0 - NOPL */
- return true;
-}
-
static bool can_optimize(struct insn *insn, unsigned long vaddr)
{
if (!insn->x86_64 || insn->length != 5)
return false;
- if (!insn_is_nop(insn) && !insn_is_nopl(insn))
+ if (!insn_is_nop(insn))
return false;
/* We can't do cross page atomic writes yet. */
@@ -1426,19 +1403,14 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
{
u8 opc1 = OPCODE1(insn);
insn_byte_t p;
- int i;
- /* x86_nops[insn->length]; same as jmp with .offs = 0 */
- if (insn->length <= ASM_NOP_MAX &&
- !memcmp(insn->kaddr, x86_nops[insn->length], insn->length))
+ if (insn_is_nop(insn))
goto setup;
switch (opc1) {
case 0xeb: /* jmp 8 */
case 0xe9: /* jmp 32 */
break;
- case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */
- goto setup;
case 0xe8: /* call relative */
branch_clear_offset(auprobe, insn);
@@ -1463,7 +1435,7 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
* Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix.
* No one uses these insns, reject any branch insns with such prefix.
*/
- for_each_insn_prefix(insn, i, p) {
+ for_each_insn_prefix(insn, p) {
if (p == 0x66)
return -ENOTSUPP;
}
@@ -1819,3 +1791,35 @@ bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
else
return regs->sp <= ret->stack;
}
+
+/*
+ * Heuristic-based check whether the uprobe is installed at a function entry.
+ *
+ * Under the assumption that user code is compiled with frame pointers, a
+ * `push %rbp/%ebp` at the probed address is a good indicator that it is.
+ *
+ * Similarly, `endbr64` (in 64-bit mode) is also a common entry pattern.
+ * If we get this wrong, the captured stack trace might have one extra bogus
+ * entry, but the rest of the stack trace will still be meaningful.
+ */
+bool is_uprobe_at_func_entry(struct pt_regs *regs)
+{
+ struct arch_uprobe *auprobe;
+
+ if (!current->utask)
+ return false;
+
+ auprobe = current->utask->auprobe;
+ if (!auprobe)
+ return false;
+
+ /* push %rbp/%ebp */
+ if (auprobe->insn[0] == 0x55)
+ return true;
+
+ /* endbr64 (64-bit only) */
+ if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn))
+ return true;
+
+ return false;
+}
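For illustration, a userspace sketch of the same heuristic applied to raw instruction bytes. Unlike the kernel's is_endbr() helper, the sketch only matches the plain endbr64 encoding f3 0f 1e fa.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* push %rbp (0x55) or endbr64 at the probed address suggests a function entry. */
static bool looks_like_func_entry(const uint8_t *insn, bool is_64bit)
{
	static const uint8_t endbr64[] = { 0xf3, 0x0f, 0x1e, 0xfa };

	if (insn[0] == 0x55)			/* push %rbp / %ebp */
		return true;

	return is_64bit && !memcmp(insn, endbr64, sizeof(endbr64));
}

int main(void)
{
	const uint8_t prologue[] = { 0x55, 0x48, 0x89, 0xe5 };	/* push %rbp; mov %rsp,%rbp */
	const uint8_t cet[]      = { 0xf3, 0x0f, 0x1e, 0xfa };	/* endbr64 */
	const uint8_t other[]    = { 0x48, 0x89, 0xf7, 0x90 };	/* mov %rsi,%rdi; nop */

	printf("%d %d %d\n",
	       looks_like_func_entry(prologue, true),
	       looks_like_func_entry(cet, true),
	       looks_like_func_entry(other, true));
	return 0;
}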