diff options
Diffstat (limited to 'arch/x86/kernel')
41 files changed, 2502 insertions, 931 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0d2a6d953be9..bc184dd38d99 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -148,7 +148,7 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o -obj-$(CONFIG_CFI_CLANG) += cfi.o +obj-$(CONFIG_CFI) += cfi.o obj-$(CONFIG_CALL_THUNKS) += callthunks.o diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 7bde68247b5f..79ae9cb50019 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1170,7 +1170,7 @@ void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } #ifdef CONFIG_CFI_AUTO_DEFAULT # define __CFI_DEFAULT CFI_AUTO -#elif defined(CONFIG_CFI_CLANG) +#elif defined(CONFIG_CFI) # define __CFI_DEFAULT CFI_KCFI #else # define __CFI_DEFAULT CFI_OFF @@ -1182,7 +1182,7 @@ enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT; bool cfi_bhi __ro_after_init = false; #endif -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI u32 cfi_get_func_hash(void *func) { u32 hash; diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 52d1808ee360..581db89477f9 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -18,6 +18,7 @@ ifeq ($(CONFIG_X86_64),y) # APIC probe will depend on the listing order here obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o obj-$(CONFIG_X86_UV) += x2apic_uv_x.o +obj-$(CONFIG_AMD_SECURE_AVIC) += x2apic_savic.o obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o obj-y += apic_flat_64.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index d73ba5a7b623..680d305589a3 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -592,6 +592,8 @@ static void setup_APIC_timer(void) 0xF, ~0UL); } else clockevents_register_device(levt); + + apic_update_vector(smp_processor_id(), LOCAL_TIMER_VECTOR, true); 
} /* @@ -1168,6 +1170,9 @@ void disable_local_APIC(void) if (!apic_accessible()) return; + if (apic->teardown) + apic->teardown(); + apic_soft_disable(); #ifdef CONFIG_X86_32 @@ -1428,63 +1433,61 @@ union apic_ir { u32 regs[APIC_IR_REGS]; }; -static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr) +static bool apic_check_and_eoi_isr(union apic_ir *isr) { int i, bit; - /* Read the IRRs */ - for (i = 0; i < APIC_IR_REGS; i++) - irr->regs[i] = apic_read(APIC_IRR + i * 0x10); - /* Read the ISRs */ for (i = 0; i < APIC_IR_REGS; i++) isr->regs[i] = apic_read(APIC_ISR + i * 0x10); + /* If the ISR map is empty, nothing to do here. */ + if (bitmap_empty(isr->map, APIC_IR_BITS)) + return true; + /* - * If the ISR map is not empty. ACK the APIC and run another round - * to verify whether a pending IRR has been unblocked and turned - * into a ISR. + * There can be multiple ISR bits set when a high priority + * interrupt preempted a lower priority one. Issue an EOI for each + * set bit. The priority traversal order does not matter as there + * can't be new ISR bits raised at this point. What matters is that + * an EOI is issued for each ISR bit. */ - if (!bitmap_empty(isr->map, APIC_IR_BITS)) { - /* - * There can be multiple ISR bits set when a high priority - * interrupt preempted a lower priority one. Issue an ACK - * per set bit. - */ - for_each_set_bit(bit, isr->map, APIC_IR_BITS) - apic_eoi(); - return true; - } + for_each_set_bit(bit, isr->map, APIC_IR_BITS) + apic_eoi(); - return !bitmap_empty(irr->map, APIC_IR_BITS); + /* Reread the ISRs, they should be empty now */ + for (i = 0; i < APIC_IR_REGS; i++) + isr->regs[i] = apic_read(APIC_ISR + i * 0x10); + + return bitmap_empty(isr->map, APIC_IR_BITS); } /* - * After a crash, we no longer service the interrupts and a pending - * interrupt from previous kernel might still have ISR bit set. 
+ * If a CPU services an interrupt and crashes before issuing EOI to the + * local APIC, the corresponding ISR bit is still set when the crashing CPU + * jumps into a crash kernel. Read the ISR and issue an EOI for each set + * bit to acknowledge it as otherwise these slots would be locked forever + * waiting for an EOI. * - * Most probably by now the CPU has serviced that pending interrupt and it - * might not have done the apic_eoi() because it thought, interrupt - * came from i8259 as ExtInt. LAPIC did not get EOI so it does not clear - * the ISR bit and cpu thinks it has already serviced the interrupt. Hence - * a vector might get locked. It was noticed for timer irq (vector - * 0x31). Issue an extra EOI to clear ISR. + * If there are pending bits in the IRR, then they won't be converted into + * ISR bits as the CPU has interrupts disabled. They will be delivered once + * the CPU enables interrupts and there is nothing which can prevent that. * - * If there are pending IRR bits they turn into ISR bits after a higher - * priority ISR bit has been acked. + * In the worst case this results in spurious interrupt warnings. */ -static void apic_pending_intr_clear(void) +static void apic_clear_isr(void) { - union apic_ir irr, isr; + union apic_ir ir; unsigned int i; - /* 512 loops are way oversized and give the APIC a chance to obey. 
*/ - for (i = 0; i < 512; i++) { - if (!apic_check_and_ack(&irr, &isr)) - return; - } - /* Dump the IRR/ISR content if that failed */ - pr_warn("APIC: Stale IRR: %256pb ISR: %256pb\n", irr.map, isr.map); + if (!apic_check_and_eoi_isr(&ir)) + pr_warn("APIC: Stale ISR: %256pb\n", ir.map); + + for (i = 0; i < APIC_IR_REGS; i++) + ir.regs[i] = apic_read(APIC_IRR + i * 0x10); + + if (!bitmap_empty(ir.map, APIC_IR_BITS)) + pr_warn("APIC: Stale IRR: %256pb\n", ir.map); } /** @@ -1503,6 +1506,9 @@ static void setup_local_APIC(void) return; } + if (apic->setup) + apic->setup(); + /* * If this comes from kexec/kcrash the APIC might be enabled in * SPIV. Soft disable it before doing further initialization. @@ -1541,8 +1547,7 @@ static void setup_local_APIC(void) value |= 0x10; apic_write(APIC_TASKPRI, value); - /* Clear eventually stale ISR/IRR bits */ - apic_pending_intr_clear(); + apic_clear_isr(); /* * Now that we are all set up, enable the APIC diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index a947b46a8b64..bddc54465399 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -134,13 +134,20 @@ static void apic_update_irq_cfg(struct irq_data *irqd, unsigned int vector, apicd->hw_irq_cfg.vector = vector; apicd->hw_irq_cfg.dest_apicid = apic->calc_dest_apicid(cpu); + + apic_update_vector(cpu, vector, true); + irq_data_update_effective_affinity(irqd, cpumask_of(cpu)); - trace_vector_config(irqd->irq, vector, cpu, - apicd->hw_irq_cfg.dest_apicid); + trace_vector_config(irqd->irq, vector, cpu, apicd->hw_irq_cfg.dest_apicid); } -static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, - unsigned int newcpu) +static void apic_free_vector(unsigned int cpu, unsigned int vector, bool managed) +{ + apic_update_vector(cpu, vector, false); + irq_matrix_free(vector_matrix, cpu, vector, managed); +} + +static void chip_data_update(struct irq_data *irqd, unsigned int newvec, unsigned int newcpu) { struct 
apic_chip_data *apicd = apic_chip_data(irqd); struct irq_desc *desc = irq_data_to_desc(irqd); @@ -174,8 +181,7 @@ static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, apicd->prev_cpu = apicd->cpu; WARN_ON_ONCE(apicd->cpu == newcpu); } else { - irq_matrix_free(vector_matrix, apicd->cpu, apicd->vector, - managed); + apic_free_vector(apicd->cpu, apicd->vector, managed); } setnew: @@ -261,7 +267,7 @@ assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest) trace_vector_alloc(irqd->irq, vector, resvd, vector); if (vector < 0) return vector; - apic_update_vector(irqd, vector, cpu); + chip_data_update(irqd, vector, cpu); return 0; } @@ -337,7 +343,7 @@ assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest) trace_vector_alloc_managed(irqd->irq, vector, vector); if (vector < 0) return vector; - apic_update_vector(irqd, vector, cpu); + chip_data_update(irqd, vector, cpu); return 0; } @@ -357,7 +363,7 @@ static void clear_irq_vector(struct irq_data *irqd) apicd->prev_cpu); per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_SHUTDOWN; - irq_matrix_free(vector_matrix, apicd->cpu, vector, managed); + apic_free_vector(apicd->cpu, vector, managed); apicd->vector = 0; /* Clean up move in progress */ @@ -366,7 +372,7 @@ static void clear_irq_vector(struct irq_data *irqd) return; per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_SHUTDOWN; - irq_matrix_free(vector_matrix, apicd->prev_cpu, vector, managed); + apic_free_vector(apicd->prev_cpu, vector, managed); apicd->prev_vector = 0; apicd->move_in_progress = 0; hlist_del_init(&apicd->clist); @@ -905,7 +911,7 @@ static void free_moved_vector(struct apic_chip_data *apicd) * affinity mask comes online. 
*/ trace_vector_free_moved(apicd->irq, cpu, vector, managed); - irq_matrix_free(vector_matrix, cpu, vector, managed); + apic_free_vector(cpu, vector, managed); per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; hlist_del_init(&apicd->clist); apicd->prev_vector = 0; diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c new file mode 100644 index 000000000000..dbc5678bc3b6 --- /dev/null +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -0,0 +1,428 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Secure AVIC Support (SEV-SNP Guests) + * + * Copyright (C) 2024 Advanced Micro Devices, Inc. + * + * Author: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com> + */ + +#include <linux/cc_platform.h> +#include <linux/cpumask.h> +#include <linux/percpu-defs.h> +#include <linux/align.h> + +#include <asm/apic.h> +#include <asm/sev.h> + +#include "local.h" + +struct secure_avic_page { + u8 regs[PAGE_SIZE]; +} __aligned(PAGE_SIZE); + +static struct secure_avic_page __percpu *savic_page __ro_after_init; + +static int savic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return x2apic_enabled() && cc_platform_has(CC_ATTR_SNP_SECURE_AVIC); +} + +static inline void *get_reg_bitmap(unsigned int cpu, unsigned int offset) +{ + return &per_cpu_ptr(savic_page, cpu)->regs[offset]; +} + +static inline void update_vector(unsigned int cpu, unsigned int offset, + unsigned int vector, bool set) +{ + void *bitmap = get_reg_bitmap(cpu, offset); + + if (set) + apic_set_vector(vector, bitmap); + else + apic_clear_vector(vector, bitmap); +} + +#define SAVIC_ALLOWED_IRR 0x204 + +/* + * When Secure AVIC is enabled, RDMSR/WRMSR of the APIC registers + * result in #VC exception (for non-accelerated register accesses) + * with VMEXIT_AVIC_NOACCEL error code. The #VC exception handler + * can read/write the x2APIC register in the guest APIC backing page. 
+ * + * Since doing this would increase the latency of accessing x2APIC + * registers, instead of doing RDMSR/WRMSR based accesses and + * handling the APIC register reads/writes in the #VC exception handler, + * the read() and write() callbacks directly read/write the APIC register + * from/to the vCPU's APIC backing page. + */ +static u32 savic_read(u32 reg) +{ + void *ap = this_cpu_ptr(savic_page); + + switch (reg) { + case APIC_LVTT: + case APIC_TMICT: + case APIC_TMCCT: + case APIC_TDCR: + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVT0: + case APIC_LVT1: + case APIC_LVTERR: + return savic_ghcb_msr_read(reg); + case APIC_ID: + case APIC_LVR: + case APIC_TASKPRI: + case APIC_ARBPRI: + case APIC_PROCPRI: + case APIC_LDR: + case APIC_SPIV: + case APIC_ESR: + case APIC_EFEAT: + case APIC_ECTRL: + case APIC_SEOI: + case APIC_IER: + case APIC_EILVTn(0) ... APIC_EILVTn(3): + return apic_get_reg(ap, reg); + case APIC_ICR: + return (u32)apic_get_reg64(ap, reg); + case APIC_ISR ... APIC_ISR + 0x70: + case APIC_TMR ... APIC_TMR + 0x70: + if (WARN_ONCE(!IS_ALIGNED(reg, 16), + "APIC register read offset 0x%x not aligned at 16 bytes", reg)) + return 0; + return apic_get_reg(ap, reg); + /* IRR and ALLOWED_IRR offset range */ + case APIC_IRR ... APIC_IRR + 0x74: + /* + * Valid APIC_IRR/SAVIC_ALLOWED_IRR registers are at 16 bytes strides from + * their respective base offset. APIC_IRRs are in the range + * + * (0x200, 0x210, ..., 0x270) + * + * while the SAVIC_ALLOWED_IRR range starts 4 bytes later, in the range + * + * (0x204, 0x214, ..., 0x274). + * + * Filter out everything else. 
+ */ + if (WARN_ONCE(!(IS_ALIGNED(reg, 16) || + IS_ALIGNED(reg - 4, 16)), + "Misaligned APIC_IRR/ALLOWED_IRR APIC register read offset 0x%x", reg)) + return 0; + return apic_get_reg(ap, reg); + default: + pr_err("Error reading unknown Secure AVIC reg offset 0x%x\n", reg); + return 0; + } +} + +#define SAVIC_NMI_REQ 0x278 + +/* + * On WRMSR to APIC_SELF_IPI register by the guest, Secure AVIC hardware + * updates the APIC_IRR in the APIC backing page of the vCPU. In addition, + * hardware evaluates the new APIC_IRR update for interrupt injection to + * the vCPU. So, self IPIs are hardware-accelerated. + */ +static inline void self_ipi_reg_write(unsigned int vector) +{ + native_apic_msr_write(APIC_SELF_IPI, vector); +} + +static void send_ipi_dest(unsigned int cpu, unsigned int vector, bool nmi) +{ + if (nmi) + apic_set_reg(per_cpu_ptr(savic_page, cpu), SAVIC_NMI_REQ, 1); + else + update_vector(cpu, APIC_IRR, vector, true); +} + +static void send_ipi_allbut(unsigned int vector, bool nmi) +{ + unsigned int cpu, src_cpu; + + guard(irqsave)(); + + src_cpu = raw_smp_processor_id(); + + for_each_cpu(cpu, cpu_online_mask) { + if (cpu == src_cpu) + continue; + send_ipi_dest(cpu, vector, nmi); + } +} + +static inline void self_ipi(unsigned int vector, bool nmi) +{ + u32 icr_low = APIC_SELF_IPI | vector; + + if (nmi) + icr_low |= APIC_DM_NMI; + + native_x2apic_icr_write(icr_low, 0); +} + +static void savic_icr_write(u32 icr_low, u32 icr_high) +{ + unsigned int dsh, vector; + u64 icr_data; + bool nmi; + + dsh = icr_low & APIC_DEST_ALLBUT; + vector = icr_low & APIC_VECTOR_MASK; + nmi = ((icr_low & APIC_DM_FIXED_MASK) == APIC_DM_NMI); + + switch (dsh) { + case APIC_DEST_SELF: + self_ipi(vector, nmi); + break; + case APIC_DEST_ALLINC: + self_ipi(vector, nmi); + fallthrough; + case APIC_DEST_ALLBUT: + send_ipi_allbut(vector, nmi); + break; + default: + send_ipi_dest(icr_high, vector, nmi); + break; + } + + icr_data = ((u64)icr_high) << 32 | icr_low; + if (dsh != APIC_DEST_SELF) + 
savic_ghcb_msr_write(APIC_ICR, icr_data); + apic_set_reg64(this_cpu_ptr(savic_page), APIC_ICR, icr_data); +} + +static void savic_write(u32 reg, u32 data) +{ + void *ap = this_cpu_ptr(savic_page); + + switch (reg) { + case APIC_LVTT: + case APIC_TMICT: + case APIC_TDCR: + case APIC_LVT0: + case APIC_LVT1: + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVTERR: + savic_ghcb_msr_write(reg, data); + break; + case APIC_TASKPRI: + case APIC_EOI: + case APIC_SPIV: + case SAVIC_NMI_REQ: + case APIC_ESR: + case APIC_ECTRL: + case APIC_SEOI: + case APIC_IER: + case APIC_EILVTn(0) ... APIC_EILVTn(3): + apic_set_reg(ap, reg, data); + break; + case APIC_ICR: + savic_icr_write(data, 0); + break; + case APIC_SELF_IPI: + self_ipi_reg_write(data); + break; + /* ALLOWED_IRR offsets are writable */ + case SAVIC_ALLOWED_IRR ... SAVIC_ALLOWED_IRR + 0x70: + if (IS_ALIGNED(reg - 4, 16)) { + apic_set_reg(ap, reg, data); + break; + } + fallthrough; + default: + pr_err("Error writing unknown Secure AVIC reg offset 0x%x\n", reg); + } +} + +static void send_ipi(u32 dest, unsigned int vector, unsigned int dsh) +{ + unsigned int icr_low; + + icr_low = __prepare_ICR(dsh, vector, APIC_DEST_PHYSICAL); + savic_icr_write(icr_low, dest); +} + +static void savic_send_ipi(int cpu, int vector) +{ + u32 dest = per_cpu(x86_cpu_to_apicid, cpu); + + send_ipi(dest, vector, 0); +} + +static void send_ipi_mask(const struct cpumask *mask, unsigned int vector, bool excl_self) +{ + unsigned int cpu, this_cpu; + + guard(irqsave)(); + + this_cpu = raw_smp_processor_id(); + + for_each_cpu(cpu, mask) { + if (excl_self && cpu == this_cpu) + continue; + send_ipi(per_cpu(x86_cpu_to_apicid, cpu), vector, 0); + } +} + +static void savic_send_ipi_mask(const struct cpumask *mask, int vector) +{ + send_ipi_mask(mask, vector, false); +} + +static void savic_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) +{ + send_ipi_mask(mask, vector, true); +} + +static void savic_send_ipi_allbutself(int vector) +{ + 
send_ipi(0, vector, APIC_DEST_ALLBUT); +} + +static void savic_send_ipi_all(int vector) +{ + send_ipi(0, vector, APIC_DEST_ALLINC); +} + +static void savic_send_ipi_self(int vector) +{ + self_ipi_reg_write(vector); +} + +static void savic_update_vector(unsigned int cpu, unsigned int vector, bool set) +{ + update_vector(cpu, SAVIC_ALLOWED_IRR, vector, set); +} + +static void savic_eoi(void) +{ + unsigned int cpu; + int vec; + + cpu = raw_smp_processor_id(); + vec = apic_find_highest_vector(get_reg_bitmap(cpu, APIC_ISR)); + if (WARN_ONCE(vec == -1, "EOI write while no active interrupt in APIC_ISR")) + return; + + /* Is level-triggered interrupt? */ + if (apic_test_vector(vec, get_reg_bitmap(cpu, APIC_TMR))) { + update_vector(cpu, APIC_ISR, vec, false); + /* + * Propagate the EOI write to the hypervisor for level-triggered + * interrupts. Return to the guest from GHCB protocol event takes + * care of re-evaluating interrupt state. + */ + savic_ghcb_msr_write(APIC_EOI, 0); + } else { + /* + * Hardware clears APIC_ISR and re-evaluates the interrupt state + * to determine if there is any pending interrupt which can be + * delivered to CPU. + */ + native_apic_msr_eoi(); + } +} + +static void savic_teardown(void) +{ + /* Disable Secure AVIC */ + native_wrmsrq(MSR_AMD64_SAVIC_CONTROL, 0); + savic_unregister_gpa(NULL); +} + +static void savic_setup(void) +{ + void *ap = this_cpu_ptr(savic_page); + enum es_result res; + unsigned long gpa; + + /* + * Before Secure AVIC is enabled, APIC MSR reads are intercepted. + * APIC_ID MSR read returns the value from the hypervisor. + */ + apic_set_reg(ap, APIC_ID, native_apic_msr_read(APIC_ID)); + + gpa = __pa(ap); + + /* + * The NPT entry for a vCPU's APIC backing page must always be + * present when the vCPU is running in order for Secure AVIC to + * function. A VMEXIT_BUSY is returned on VMRUN and the vCPU cannot + * be resumed if the NPT entry for the APIC backing page is not + * present. 
Notify GPA of the vCPU's APIC backing page to the + * hypervisor by calling savic_register_gpa(). Before executing + * VMRUN, the hypervisor makes use of this information to make sure + * the APIC backing page is mapped in NPT. + */ + res = savic_register_gpa(gpa); + if (res != ES_OK) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); + + native_wrmsrq(MSR_AMD64_SAVIC_CONTROL, + gpa | MSR_AMD64_SAVIC_EN | MSR_AMD64_SAVIC_ALLOWEDNMI); +} + +static int savic_probe(void) +{ + if (!cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) + return 0; + + if (!x2apic_mode) { + pr_err("Secure AVIC enabled in non x2APIC mode\n"); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); + /* unreachable */ + } + + savic_page = alloc_percpu(struct secure_avic_page); + if (!savic_page) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); + + return 1; +} + +static struct apic apic_x2apic_savic __ro_after_init = { + + .name = "secure avic x2apic", + .probe = savic_probe, + .acpi_madt_oem_check = savic_acpi_madt_oem_check, + .setup = savic_setup, + .teardown = savic_teardown, + + .dest_mode_logical = false, + + .disable_esr = 0, + + .cpu_present_to_apicid = default_cpu_present_to_apicid, + + .max_apic_id = UINT_MAX, + .x2apic_set_max_apicid = true, + .get_apic_id = x2apic_get_apic_id, + + .calc_dest_apicid = apic_default_calc_apicid, + + .send_IPI = savic_send_ipi, + .send_IPI_mask = savic_send_ipi_mask, + .send_IPI_mask_allbutself = savic_send_ipi_mask_allbutself, + .send_IPI_allbutself = savic_send_ipi_allbutself, + .send_IPI_all = savic_send_ipi_all, + .send_IPI_self = savic_send_ipi_self, + + .nmi_to_offline_cpu = true, + + .read = savic_read, + .write = savic_write, + .eoi = savic_eoi, + .icr_read = native_x2apic_icr_read, + .icr_write = savic_icr_write, + + .update_vector = savic_update_vector, +}; + +apic_driver(apic_x2apic_savic); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 1e26179ff18c..2f8a58ef690e 100644 --- 
a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -58,6 +58,7 @@ obj-$(CONFIG_X86_SGX) += sgx/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o +obj-$(CONFIG_BHYVE_GUEST) += bhyve.o obj-$(CONFIG_ACRN_GUEST) += acrn.o obj-$(CONFIG_DEBUG_FS) += debugfs.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a5ece6ebe8a7..a6f88ca1a6b4 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1326,8 +1326,8 @@ static const char * const s5_reset_reason_txt[] = { static __init int print_s5_reset_status_mmio(void) { - unsigned long value; void __iomem *addr; + u32 value; int i; if (!cpu_feature_enabled(X86_FEATURE_ZEN)) @@ -1340,12 +1340,16 @@ static __init int print_s5_reset_status_mmio(void) value = ioread32(addr); iounmap(addr); + /* Value with "all bits set" is an error response and should be ignored. */ + if (value == U32_MAX) + return 0; + for (i = 0; i < ARRAY_SIZE(s5_reset_reason_txt); i++) { if (!(value & BIT(i))) continue; if (s5_reset_reason_txt[i]) { - pr_info("x86/amd: Previous system reset reason [0x%08lx]: %s\n", + pr_info("x86/amd: Previous system reset reason [0x%08x]: %s\n", value, s5_reset_reason_txt[i]); } } diff --git a/arch/x86/kernel/cpu/bhyve.c b/arch/x86/kernel/cpu/bhyve.c new file mode 100644 index 000000000000..f1a8ca3dd1ed --- /dev/null +++ b/arch/x86/kernel/cpu/bhyve.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FreeBSD Bhyve guest enlightenments + * + * Copyright © 2025 Amazon.com, Inc. or its affiliates. 
+ * + * Author: David Woodhouse <dwmw2@infradead.org> + */ + +#include <linux/init.h> +#include <linux/export.h> +#include <asm/processor.h> +#include <asm/hypervisor.h> + +static uint32_t bhyve_cpuid_base; +static uint32_t bhyve_cpuid_max; + +#define BHYVE_SIGNATURE "bhyve bhyve " + +#define CPUID_BHYVE_FEATURES 0x40000001 + +/* Features advertised in CPUID_BHYVE_FEATURES %eax */ + +/* MSI Extended Dest ID */ +#define CPUID_BHYVE_FEAT_EXT_DEST_ID (1UL << 0) + +static uint32_t __init bhyve_detect(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) + return 0; + + bhyve_cpuid_base = cpuid_base_hypervisor(BHYVE_SIGNATURE, 0); + if (!bhyve_cpuid_base) + return 0; + + bhyve_cpuid_max = cpuid_eax(bhyve_cpuid_base); + return bhyve_cpuid_max; +} + +static uint32_t bhyve_features(void) +{ + unsigned int cpuid_leaf = bhyve_cpuid_base | CPUID_BHYVE_FEATURES; + + if (bhyve_cpuid_max < cpuid_leaf) + return 0; + + return cpuid_eax(cpuid_leaf); +} + +static bool __init bhyve_ext_dest_id(void) +{ + return !!(bhyve_features() & CPUID_BHYVE_FEAT_EXT_DEST_ID); +} + +static bool __init bhyve_x2apic_available(void) +{ + return true; +} + +const struct hypervisor_x86 x86_hyper_bhyve __refconst = { + .name = "Bhyve", + .detect = bhyve_detect, + .init.init_platform = x86_init_noop, + .init.x2apic_available = bhyve_x2apic_available, + .init.msi_ext_dest_id = bhyve_ext_dest_id, +}; diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index b74bf937cd9f..6a526ae1fe99 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -96,6 +96,9 @@ static void __init its_update_mitigation(void); static void __init its_apply_mitigation(void); static void __init tsa_select_mitigation(void); static void __init tsa_apply_mitigation(void); +static void __init vmscape_select_mitigation(void); +static void __init vmscape_update_mitigation(void); +static void __init vmscape_apply_mitigation(void); /* The base value of the SPEC_CTRL MSR without task-specific bits set 
*/ u64 x86_spec_ctrl_base; @@ -105,6 +108,14 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); DEFINE_PER_CPU(u64, x86_spec_ctrl_current); EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current); +/* + * Set when the CPU has run a potentially malicious guest. An IBPB will + * be needed before running userspace. That IBPB will flush the branch + * predictor content. + */ +DEFINE_PER_CPU(bool, x86_ibpb_exit_to_user); +EXPORT_PER_CPU_SYMBOL_GPL(x86_ibpb_exit_to_user); + u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; static u64 __ro_after_init x86_arch_cap_msr; @@ -262,6 +273,7 @@ void __init cpu_select_mitigations(void) its_select_mitigation(); bhi_select_mitigation(); tsa_select_mitigation(); + vmscape_select_mitigation(); /* * After mitigations are selected, some may need to update their @@ -293,6 +305,7 @@ void __init cpu_select_mitigations(void) bhi_update_mitigation(); /* srso_update_mitigation() depends on retbleed_update_mitigation(). */ srso_update_mitigation(); + vmscape_update_mitigation(); spectre_v1_apply_mitigation(); spectre_v2_apply_mitigation(); @@ -310,6 +323,7 @@ void __init cpu_select_mitigations(void) its_apply_mitigation(); bhi_apply_mitigation(); tsa_apply_mitigation(); + vmscape_apply_mitigation(); } /* @@ -386,7 +400,6 @@ static bool __init should_mitigate_vuln(unsigned int bug) case X86_BUG_SPECTRE_V2: case X86_BUG_RETBLEED: - case X86_BUG_SRSO: case X86_BUG_L1TF: case X86_BUG_ITS: return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) || @@ -417,6 +430,13 @@ static bool __init should_mitigate_vuln(unsigned int bug) cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) || cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST) || (smt_mitigations != SMT_MITIGATIONS_OFF); + + case X86_BUG_SPEC_STORE_BYPASS: + return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER); + + case X86_BUG_VMSCAPE: + return cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST); + default: WARN(1, "Unknown bug %x\n", bug); return false; @@ -667,8 +687,7 @@ static const char 
* const mmio_strings[] = { static void __init mmio_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || - cpu_mitigations_off()) { + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { mmio_mitigation = MMIO_MITIGATION_OFF; return; } @@ -1069,10 +1088,8 @@ static void __init gds_select_mitigation(void) if (gds_mitigation == GDS_MITIGATION_AUTO) { if (should_mitigate_vuln(X86_BUG_GDS)) gds_mitigation = GDS_MITIGATION_FULL; - else { + else gds_mitigation = GDS_MITIGATION_OFF; - return; - } } /* No microcode */ @@ -1445,8 +1462,10 @@ static void __init retbleed_update_mitigation(void) retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; break; default: - if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) { pr_err(RETBLEED_INTEL_MSG); + retbleed_mitigation = RETBLEED_MITIGATION_NONE; + } } } @@ -1827,9 +1846,10 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_IBRS, }; -static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init = SPECTRE_V2_CMD_AUTO; +static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE; -enum spectre_v2_user_cmd { +enum spectre_v2_user_mitigation_cmd { SPECTRE_V2_USER_CMD_NONE, SPECTRE_V2_USER_CMD_AUTO, SPECTRE_V2_USER_CMD_FORCE, @@ -1839,6 +1859,9 @@ enum spectre_v2_user_cmd { SPECTRE_V2_USER_CMD_SECCOMP_IBPB, }; +static enum spectre_v2_user_mitigation_cmd spectre_v2_user_cmd __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? 
SPECTRE_V2_USER_CMD_AUTO : SPECTRE_V2_USER_CMD_NONE; + static const char * const spectre_v2_user_strings[] = { [SPECTRE_V2_USER_NONE] = "User space: Vulnerable", [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection", @@ -1847,50 +1870,31 @@ static const char * const spectre_v2_user_strings[] = { [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl", }; -static const struct { - const char *option; - enum spectre_v2_user_cmd cmd; - bool secure; -} v2_user_options[] __initconst = { - { "auto", SPECTRE_V2_USER_CMD_AUTO, false }, - { "off", SPECTRE_V2_USER_CMD_NONE, false }, - { "on", SPECTRE_V2_USER_CMD_FORCE, true }, - { "prctl", SPECTRE_V2_USER_CMD_PRCTL, false }, - { "prctl,ibpb", SPECTRE_V2_USER_CMD_PRCTL_IBPB, false }, - { "seccomp", SPECTRE_V2_USER_CMD_SECCOMP, false }, - { "seccomp,ibpb", SPECTRE_V2_USER_CMD_SECCOMP_IBPB, false }, -}; - -static void __init spec_v2_user_print_cond(const char *reason, bool secure) -{ - if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) - pr_info("spectre_v2_user=%s forced on command line.\n", reason); -} - -static enum spectre_v2_user_cmd __init spectre_v2_parse_user_cmdline(void) +static int __init spectre_v2_user_parse_cmdline(char *str) { - char arg[20]; - int ret, i; - - if (!IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2)) - return SPECTRE_V2_USER_CMD_NONE; - - ret = cmdline_find_option(boot_command_line, "spectre_v2_user", - arg, sizeof(arg)); - if (ret < 0) - return SPECTRE_V2_USER_CMD_AUTO; + if (!str) + return -EINVAL; - for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) { - if (match_option(arg, ret, v2_user_options[i].option)) { - spec_v2_user_print_cond(v2_user_options[i].option, - v2_user_options[i].secure); - return v2_user_options[i].cmd; - } - } + if (!strcmp(str, "auto")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_AUTO; + else if (!strcmp(str, "off")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_NONE; + else if (!strcmp(str, "on")) + spectre_v2_user_cmd = 
SPECTRE_V2_USER_CMD_FORCE; + else if (!strcmp(str, "prctl")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_PRCTL; + else if (!strcmp(str, "prctl,ibpb")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_PRCTL_IBPB; + else if (!strcmp(str, "seccomp")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_SECCOMP; + else if (!strcmp(str, "seccomp,ibpb")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_SECCOMP_IBPB; + else + pr_err("Ignoring unknown spectre_v2_user option (%s).", str); - pr_err("Unknown user space protection option (%s). Switching to default\n", arg); - return SPECTRE_V2_USER_CMD_AUTO; + return 0; } +early_param("spectre_v2_user", spectre_v2_user_parse_cmdline); static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) { @@ -1902,7 +1906,7 @@ static void __init spectre_v2_user_select_mitigation(void) if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) return; - switch (spectre_v2_parse_user_cmdline()) { + switch (spectre_v2_user_cmd) { case SPECTRE_V2_USER_CMD_NONE: return; case SPECTRE_V2_USER_CMD_FORCE: @@ -2030,119 +2034,61 @@ static void __init spectre_v2_user_apply_mitigation(void) static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", - [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", + [SPECTRE_V2_LFENCE] = "Vulnerable: LFENCE", [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS", [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE", [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines", [SPECTRE_V2_IBRS] = "Mitigation: IBRS", }; -static const struct { - const char *option; - enum spectre_v2_mitigation_cmd cmd; - bool secure; -} mitigation_options[] __initconst = { - { "off", SPECTRE_V2_CMD_NONE, false }, - { "on", SPECTRE_V2_CMD_FORCE, true }, - { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, - { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, - { "retpoline,lfence", 
SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, - { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, - { "eibrs", SPECTRE_V2_CMD_EIBRS, false }, - { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false }, - { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false }, - { "auto", SPECTRE_V2_CMD_AUTO, false }, - { "ibrs", SPECTRE_V2_CMD_IBRS, false }, -}; +static bool nospectre_v2 __ro_after_init; -static void __init spec_v2_print_cond(const char *reason, bool secure) +static int __init nospectre_v2_parse_cmdline(char *str) { - if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) - pr_info("%s selected on command line.\n", reason); + nospectre_v2 = true; + spectre_v2_cmd = SPECTRE_V2_CMD_NONE; + return 0; } +early_param("nospectre_v2", nospectre_v2_parse_cmdline); -static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) +static int __init spectre_v2_parse_cmdline(char *str) { - enum spectre_v2_mitigation_cmd cmd; - char arg[20]; - int ret, i; - - cmd = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE; - if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) - return SPECTRE_V2_CMD_NONE; - - ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); - if (ret < 0) - return cmd; - - for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { - if (!match_option(arg, ret, mitigation_options[i].option)) - continue; - cmd = mitigation_options[i].cmd; - break; - } - - if (i >= ARRAY_SIZE(mitigation_options)) { - pr_err("unknown option (%s). Switching to default mode\n", arg); - return cmd; - } - - if ((cmd == SPECTRE_V2_CMD_RETPOLINE || - cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || - cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || - cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || - cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && - !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) { - pr_err("%s selected but not compiled in. 
Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } - - if ((cmd == SPECTRE_V2_CMD_EIBRS || - cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || - cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && - !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { - pr_err("%s selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } - - if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || - cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) && - !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { - pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } - - if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) { - pr_err("%s selected but not compiled in. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } - - if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { - pr_err("%s selected but not Intel CPU. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } + if (!str) + return -EINVAL; - if (cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) { - pr_err("%s selected but CPU doesn't have IBRS. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } + if (nospectre_v2) + return 0; - if (cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) { - pr_err("%s selected but running as XenPV guest. 
Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; + if (!strcmp(str, "off")) { + spectre_v2_cmd = SPECTRE_V2_CMD_NONE; + } else if (!strcmp(str, "on")) { + spectre_v2_cmd = SPECTRE_V2_CMD_FORCE; + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2_USER); + } else if (!strcmp(str, "retpoline")) { + spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE; + } else if (!strcmp(str, "retpoline,amd") || + !strcmp(str, "retpoline,lfence")) { + spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE_LFENCE; + } else if (!strcmp(str, "retpoline,generic")) { + spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE_GENERIC; + } else if (!strcmp(str, "eibrs")) { + spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS; + } else if (!strcmp(str, "eibrs,lfence")) { + spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS_LFENCE; + } else if (!strcmp(str, "eibrs,retpoline")) { + spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS_RETPOLINE; + } else if (!strcmp(str, "auto")) { + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } else if (!strcmp(str, "ibrs")) { + spectre_v2_cmd = SPECTRE_V2_CMD_IBRS; + } else { + pr_err("Ignoring unknown spectre_v2 option (%s).\n", str); } - spec_v2_print_cond(mitigation_options[i].option, - mitigation_options[i].secure); - return cmd; + return 0; } +early_param("spectre_v2", spectre_v2_parse_cmdline); static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) { @@ -2291,10 +2237,6 @@ static void __init bhi_update_mitigation(void) { if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE) bhi_mitigation = BHI_MITIGATION_OFF; - - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && - spectre_v2_cmd == SPECTRE_V2_CMD_AUTO) - bhi_mitigation = BHI_MITIGATION_OFF; } static void __init bhi_apply_mitigation(void) @@ -2330,11 +2272,55 @@ static void __init bhi_apply_mitigation(void) static void __init spectre_v2_select_mitigation(void) { - spectre_v2_cmd = spectre_v2_parse_cmdline(); + if ((spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE || + spectre_v2_cmd == 
SPECTRE_V2_CMD_RETPOLINE_LFENCE || + spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && + !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) { + pr_err("RETPOLINE selected but not compiled in. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if ((spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { + pr_err("EIBRS selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if ((spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) && + !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("LFENCE selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) { + pr_err("IBRS selected but not compiled in. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && - (spectre_v2_cmd == SPECTRE_V2_CMD_NONE || spectre_v2_cmd == SPECTRE_V2_CMD_AUTO)) + if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + pr_err("IBRS selected but not Intel CPU. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) { + pr_err("IBRS selected but CPU doesn't have IBRS. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) { + pr_err("IBRS selected but running as XenPV guest. 
Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) { + spectre_v2_cmd = SPECTRE_V2_CMD_NONE; return; + } switch (spectre_v2_cmd) { case SPECTRE_V2_CMD_NONE: @@ -2537,101 +2523,11 @@ static void update_mds_branch_idle(void) } } -#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" -#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" -#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n" - -void cpu_bugs_smt_update(void) -{ - mutex_lock(&spec_ctrl_mutex); - - if (sched_smt_active() && unprivileged_ebpf_enabled() && - spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) - pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); - - switch (spectre_v2_user_stibp) { - case SPECTRE_V2_USER_NONE: - break; - case SPECTRE_V2_USER_STRICT: - case SPECTRE_V2_USER_STRICT_PREFERRED: - update_stibp_strict(); - break; - case SPECTRE_V2_USER_PRCTL: - case SPECTRE_V2_USER_SECCOMP: - update_indir_branch_cond(); - break; - } - - switch (mds_mitigation) { - case MDS_MITIGATION_FULL: - case MDS_MITIGATION_AUTO: - case MDS_MITIGATION_VMWERV: - if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) - pr_warn_once(MDS_MSG_SMT); - update_mds_branch_idle(); - break; - case MDS_MITIGATION_OFF: - break; - } - - switch (taa_mitigation) { - case TAA_MITIGATION_VERW: - case TAA_MITIGATION_AUTO: - case TAA_MITIGATION_UCODE_NEEDED: - if (sched_smt_active()) - pr_warn_once(TAA_MSG_SMT); - break; - case TAA_MITIGATION_TSX_DISABLED: - case TAA_MITIGATION_OFF: - break; - } - - switch (mmio_mitigation) { - case MMIO_MITIGATION_VERW: - case MMIO_MITIGATION_AUTO: 
- case MMIO_MITIGATION_UCODE_NEEDED: - if (sched_smt_active()) - pr_warn_once(MMIO_MSG_SMT); - break; - case MMIO_MITIGATION_OFF: - break; - } - - switch (tsa_mitigation) { - case TSA_MITIGATION_USER_KERNEL: - case TSA_MITIGATION_VM: - case TSA_MITIGATION_AUTO: - case TSA_MITIGATION_FULL: - /* - * TSA-SQ can potentially lead to info leakage between - * SMT threads. - */ - if (sched_smt_active()) - static_branch_enable(&cpu_buf_idle_clear); - else - static_branch_disable(&cpu_buf_idle_clear); - break; - case TSA_MITIGATION_NONE: - case TSA_MITIGATION_UCODE_NEEDED: - break; - } - - mutex_unlock(&spec_ctrl_mutex); -} - #undef pr_fmt #define pr_fmt(fmt) "Speculative Store Bypass: " fmt -static enum ssb_mitigation ssb_mode __ro_after_init = SPEC_STORE_BYPASS_NONE; - -/* The kernel command line selection */ -enum ssb_mitigation_cmd { - SPEC_STORE_BYPASS_CMD_NONE, - SPEC_STORE_BYPASS_CMD_AUTO, - SPEC_STORE_BYPASS_CMD_ON, - SPEC_STORE_BYPASS_CMD_PRCTL, - SPEC_STORE_BYPASS_CMD_SECCOMP, -}; +static enum ssb_mitigation ssb_mode __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SSB) ? 
SPEC_STORE_BYPASS_AUTO : SPEC_STORE_BYPASS_NONE; static const char * const ssb_strings[] = { [SPEC_STORE_BYPASS_NONE] = "Vulnerable", @@ -2640,89 +2536,61 @@ static const char * const ssb_strings[] = { [SPEC_STORE_BYPASS_SECCOMP] = "Mitigation: Speculative Store Bypass disabled via prctl and seccomp", }; -static const struct { - const char *option; - enum ssb_mitigation_cmd cmd; -} ssb_mitigation_options[] __initconst = { - { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ - { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ - { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ - { "prctl", SPEC_STORE_BYPASS_CMD_PRCTL }, /* Disable Speculative Store Bypass via prctl */ - { "seccomp", SPEC_STORE_BYPASS_CMD_SECCOMP }, /* Disable Speculative Store Bypass via prctl and seccomp */ -}; +static bool nossb __ro_after_init; -static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) +static int __init nossb_parse_cmdline(char *str) { - enum ssb_mitigation_cmd cmd; - char arg[20]; - int ret, i; - - cmd = IS_ENABLED(CONFIG_MITIGATION_SSB) ? - SPEC_STORE_BYPASS_CMD_AUTO : SPEC_STORE_BYPASS_CMD_NONE; - if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") || - cpu_mitigations_off()) { - return SPEC_STORE_BYPASS_CMD_NONE; - } else { - ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", - arg, sizeof(arg)); - if (ret < 0) - return cmd; + nossb = true; + ssb_mode = SPEC_STORE_BYPASS_NONE; + return 0; +} +early_param("nospec_store_bypass_disable", nossb_parse_cmdline); - for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) { - if (!match_option(arg, ret, ssb_mitigation_options[i].option)) - continue; +static int __init ssb_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; - cmd = ssb_mitigation_options[i].cmd; - break; - } + if (nossb) + return 0; - if (i >= ARRAY_SIZE(ssb_mitigation_options)) { - pr_err("unknown option (%s). 
Switching to default mode\n", arg); - return cmd; - } - } + if (!strcmp(str, "auto")) + ssb_mode = SPEC_STORE_BYPASS_AUTO; + else if (!strcmp(str, "on")) + ssb_mode = SPEC_STORE_BYPASS_DISABLE; + else if (!strcmp(str, "off")) + ssb_mode = SPEC_STORE_BYPASS_NONE; + else if (!strcmp(str, "prctl")) + ssb_mode = SPEC_STORE_BYPASS_PRCTL; + else if (!strcmp(str, "seccomp")) + ssb_mode = IS_ENABLED(CONFIG_SECCOMP) ? + SPEC_STORE_BYPASS_SECCOMP : SPEC_STORE_BYPASS_PRCTL; + else + pr_err("Ignoring unknown spec_store_bypass_disable option (%s).\n", + str); - return cmd; + return 0; } +early_param("spec_store_bypass_disable", ssb_parse_cmdline); static void __init ssb_select_mitigation(void) { - enum ssb_mitigation_cmd cmd; - - if (!boot_cpu_has(X86_FEATURE_SSBD)) - goto out; - - cmd = ssb_parse_cmdline(); - if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) && - (cmd == SPEC_STORE_BYPASS_CMD_NONE || - cmd == SPEC_STORE_BYPASS_CMD_AUTO)) + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) { + ssb_mode = SPEC_STORE_BYPASS_NONE; return; + } - switch (cmd) { - case SPEC_STORE_BYPASS_CMD_SECCOMP: - /* - * Choose prctl+seccomp as the default mode if seccomp is - * enabled. 
- */ - if (IS_ENABLED(CONFIG_SECCOMP)) - ssb_mode = SPEC_STORE_BYPASS_SECCOMP; - else + if (ssb_mode == SPEC_STORE_BYPASS_AUTO) { + if (should_mitigate_vuln(X86_BUG_SPEC_STORE_BYPASS)) ssb_mode = SPEC_STORE_BYPASS_PRCTL; - break; - case SPEC_STORE_BYPASS_CMD_ON: - ssb_mode = SPEC_STORE_BYPASS_DISABLE; - break; - case SPEC_STORE_BYPASS_CMD_AUTO: - case SPEC_STORE_BYPASS_CMD_PRCTL: - ssb_mode = SPEC_STORE_BYPASS_PRCTL; - break; - case SPEC_STORE_BYPASS_CMD_NONE: - break; + else + ssb_mode = SPEC_STORE_BYPASS_NONE; } -out: - if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) - pr_info("%s\n", ssb_strings[ssb_mode]); + if (!boot_cpu_has(X86_FEATURE_SSBD)) + ssb_mode = SPEC_STORE_BYPASS_NONE; + + pr_info("%s\n", ssb_strings[ssb_mode]); } static void __init ssb_apply_mitigation(void) @@ -2938,6 +2806,7 @@ static int ssb_prctl_get(struct task_struct *task) return PR_SPEC_DISABLE; case SPEC_STORE_BYPASS_SECCOMP: case SPEC_STORE_BYPASS_PRCTL: + case SPEC_STORE_BYPASS_AUTO: if (task_spec_ssb_force_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; if (task_spec_ssb_noexec(task)) @@ -3184,8 +3053,18 @@ static void __init srso_select_mitigation(void) } if (srso_mitigation == SRSO_MITIGATION_AUTO) { - if (should_mitigate_vuln(X86_BUG_SRSO)) { + /* + * Use safe-RET if user->kernel or guest->host protection is + * required. Otherwise the 'microcode' mitigation is sufficient + * to protect the user->user and guest->guest vectors. 
+ */ + if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) || + (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) && + !boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO))) { srso_mitigation = SRSO_MITIGATION_SAFE_RET; + } else if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) || + cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST)) { + srso_mitigation = SRSO_MITIGATION_MICROCODE; } else { srso_mitigation = SRSO_MITIGATION_NONE; return; @@ -3247,14 +3126,15 @@ ibpb_on_vmexit: static void __init srso_update_mitigation(void) { + if (!boot_cpu_has_bug(X86_BUG_SRSO)) + return; + /* If retbleed is using IBPB, that works for SRSO as well */ if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB && boot_cpu_has(X86_FEATURE_IBPB_BRTYPE)) srso_mitigation = SRSO_MITIGATION_IBPB; - if (boot_cpu_has_bug(X86_BUG_SRSO) && - !cpu_mitigations_off()) - pr_info("%s\n", srso_strings[srso_mitigation]); + pr_info("%s\n", srso_strings[srso_mitigation]); } static void __init srso_apply_mitigation(void) @@ -3315,8 +3195,187 @@ static void __init srso_apply_mitigation(void) } #undef pr_fmt +#define pr_fmt(fmt) "VMSCAPE: " fmt + +enum vmscape_mitigations { + VMSCAPE_MITIGATION_NONE, + VMSCAPE_MITIGATION_AUTO, + VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER, + VMSCAPE_MITIGATION_IBPB_ON_VMEXIT, +}; + +static const char * const vmscape_strings[] = { + [VMSCAPE_MITIGATION_NONE] = "Vulnerable", + /* [VMSCAPE_MITIGATION_AUTO] */ + [VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER] = "Mitigation: IBPB before exit to userspace", + [VMSCAPE_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT", +}; + +static enum vmscape_mitigations vmscape_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_VMSCAPE) ? 
VMSCAPE_MITIGATION_AUTO : VMSCAPE_MITIGATION_NONE; + +static int __init vmscape_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) { + vmscape_mitigation = VMSCAPE_MITIGATION_NONE; + } else if (!strcmp(str, "ibpb")) { + vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; + } else if (!strcmp(str, "force")) { + setup_force_cpu_bug(X86_BUG_VMSCAPE); + vmscape_mitigation = VMSCAPE_MITIGATION_AUTO; + } else { + pr_err("Ignoring unknown vmscape=%s option.\n", str); + } + + return 0; +} +early_param("vmscape", vmscape_parse_cmdline); + +static void __init vmscape_select_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_VMSCAPE) || + !boot_cpu_has(X86_FEATURE_IBPB)) { + vmscape_mitigation = VMSCAPE_MITIGATION_NONE; + return; + } + + if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) { + if (should_mitigate_vuln(X86_BUG_VMSCAPE)) + vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; + else + vmscape_mitigation = VMSCAPE_MITIGATION_NONE; + } +} + +static void __init vmscape_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_VMSCAPE)) + return; + + if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB || + srso_mitigation == SRSO_MITIGATION_IBPB_ON_VMEXIT) + vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_ON_VMEXIT; + + pr_info("%s\n", vmscape_strings[vmscape_mitigation]); +} + +static void __init vmscape_apply_mitigation(void) +{ + if (vmscape_mitigation == VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER) + setup_force_cpu_cap(X86_FEATURE_IBPB_EXIT_TO_USER); +} + +#undef pr_fmt #define pr_fmt(fmt) fmt +#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" +#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" +#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. 
See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n" +#define VMSCAPE_MSG_SMT "VMSCAPE: SMT on, STIBP is required for full protection. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/vmscape.html for more details.\n" + +void cpu_bugs_smt_update(void) +{ + mutex_lock(&spec_ctrl_mutex); + + if (sched_smt_active() && unprivileged_ebpf_enabled() && + spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) + pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); + + switch (spectre_v2_user_stibp) { + case SPECTRE_V2_USER_NONE: + break; + case SPECTRE_V2_USER_STRICT: + case SPECTRE_V2_USER_STRICT_PREFERRED: + update_stibp_strict(); + break; + case SPECTRE_V2_USER_PRCTL: + case SPECTRE_V2_USER_SECCOMP: + update_indir_branch_cond(); + break; + } + + switch (mds_mitigation) { + case MDS_MITIGATION_FULL: + case MDS_MITIGATION_AUTO: + case MDS_MITIGATION_VMWERV: + if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) + pr_warn_once(MDS_MSG_SMT); + update_mds_branch_idle(); + break; + case MDS_MITIGATION_OFF: + break; + } + + switch (taa_mitigation) { + case TAA_MITIGATION_VERW: + case TAA_MITIGATION_AUTO: + case TAA_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(TAA_MSG_SMT); + break; + case TAA_MITIGATION_TSX_DISABLED: + case TAA_MITIGATION_OFF: + break; + } + + switch (mmio_mitigation) { + case MMIO_MITIGATION_VERW: + case MMIO_MITIGATION_AUTO: + case MMIO_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(MMIO_MSG_SMT); + break; + case MMIO_MITIGATION_OFF: + break; + } + + switch (tsa_mitigation) { + case TSA_MITIGATION_USER_KERNEL: + case TSA_MITIGATION_VM: + case TSA_MITIGATION_AUTO: + case TSA_MITIGATION_FULL: + /* + * TSA-SQ can potentially lead to info leakage between + * SMT threads. 
+ */ + if (sched_smt_active()) + static_branch_enable(&cpu_buf_idle_clear); + else + static_branch_disable(&cpu_buf_idle_clear); + break; + case TSA_MITIGATION_NONE: + case TSA_MITIGATION_UCODE_NEEDED: + break; + } + + switch (vmscape_mitigation) { + case VMSCAPE_MITIGATION_NONE: + case VMSCAPE_MITIGATION_AUTO: + break; + case VMSCAPE_MITIGATION_IBPB_ON_VMEXIT: + case VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER: + /* + * Hypervisors can be attacked across-threads, warn for SMT when + * STIBP is not already enabled system-wide. + * + * Intel eIBRS (!AUTOIBRS) implies STIBP on. + */ + if (!sched_smt_active() || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED || + (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && + !boot_cpu_has(X86_FEATURE_AUTOIBRS))) + break; + pr_warn_once(VMSCAPE_MSG_SMT); + break; + } + + mutex_unlock(&spec_ctrl_mutex); +} + #ifdef CONFIG_SYSFS #define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion" @@ -3502,9 +3561,6 @@ static const char *spectre_bhi_state(void) static ssize_t spectre_v2_show_state(char *buf) { - if (spectre_v2_enabled == SPECTRE_V2_LFENCE) - return sysfs_emit(buf, "Vulnerable: LFENCE\n"); - if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); @@ -3562,6 +3618,11 @@ static ssize_t tsa_show_state(char *buf) return sysfs_emit(buf, "%s\n", tsa_strings[tsa_mitigation]); } +static ssize_t vmscape_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", vmscape_strings[vmscape_mitigation]); +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -3628,6 +3689,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_TSA: return tsa_show_state(buf); + case X86_BUG_VMSCAPE: + return vmscape_show_state(buf); + default: break; } @@ -3719,6 +3783,11 @@ ssize_t cpu_show_tsa(struct device 
*dev, struct device_attribute *attr, char *bu { return cpu_show_common(dev, attr, buf, X86_BUG_TSA); } + +ssize_t cpu_show_vmscape(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_VMSCAPE); +} #endif void __warn_thunk(void) diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index adfa7e8bb865..51a95b07831f 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -290,6 +290,22 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) } /* + * The max shared threads number comes from CPUID(0x4) EAX[25-14] with input + * ECX as cache index. Then right shift apicid by the number's order to get + * cache id for this cache node. + */ +static unsigned int get_cache_id(u32 apicid, const struct _cpuid4_info *id4) +{ + unsigned long num_threads_sharing; + int index_msb; + + num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + + return apicid >> index_msb; +} + +/* * AMD/Hygon CPUs may have multiple LLCs if L3 caches exist. */ @@ -312,18 +328,11 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id) * Newer families: LLC ID is calculated from the number * of threads sharing the L3 cache. */ - u32 eax, ebx, ecx, edx, num_sharing_cache = 0; u32 llc_index = find_num_cache_leaves(c) - 1; + struct _cpuid4_info id4 = {}; - cpuid_count(0x8000001d, llc_index, &eax, &ebx, &ecx, &edx); - if (eax) - num_sharing_cache = ((eax >> 14) & 0xfff) + 1; - - if (num_sharing_cache) { - int index_msb = get_count_order(num_sharing_cache); - - c->topo.llc_id = c->topo.apicid >> index_msb; - } + if (!amd_fill_cpuid4_info(llc_index, &id4)) + c->topo.llc_id = get_cache_id(c->topo.apicid, &id4); } } @@ -598,27 +607,12 @@ int init_cache_level(unsigned int cpu) return 0; } -/* - * The max shared threads number comes from CPUID(0x4) EAX[25-14] with input - * ECX as cache index. 
Then right shift apicid by the number's order to get - * cache id for this cache node. - */ -static void get_cache_id(int cpu, struct _cpuid4_info *id4) -{ - struct cpuinfo_x86 *c = &cpu_data(cpu); - unsigned long num_threads_sharing; - int index_msb; - - num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); - id4->id = c->topo.apicid >> index_msb; -} - int populate_cache_leaves(unsigned int cpu) { struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); struct cacheinfo *ci = this_cpu_ci->info_list; u8 cpu_vendor = boot_cpu_data.x86_vendor; + u32 apicid = cpu_data(cpu).topo.apicid; struct amd_northbridge *nb = NULL; struct _cpuid4_info id4 = {}; int idx, ret; @@ -628,7 +622,7 @@ int populate_cache_leaves(unsigned int cpu) if (ret) return ret; - get_cache_id(cpu, &id4); + id4.id = get_cache_id(apicid, &id4); if (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON) nb = amd_init_l3_cache(idx); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 34a054181c4d..c7d3512914ca 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1236,55 +1236,71 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define ITS_NATIVE_ONLY BIT(9) /* CPU is affected by Transient Scheduler Attacks */ #define TSA BIT(10) +/* CPU is affected by VMSCAPE */ +#define VMSCAPE BIT(11) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { - VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_HASWELL, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_HASWELL_L, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_HASWELL_G, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_HASWELL_X, X86_STEP_MAX, MMIO), - VULNBL_INTEL_STEPS(INTEL_BROADWELL_D, X86_STEP_MAX, MMIO), - VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO), - 
VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, 0x5, MMIO | RETBLEED | GDS), - VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS), - VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, 0xb, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS), - VULNBL_INTEL_STEPS(INTEL_KABYLAKE, 0xc, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS), - VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED), + VULNBL_INTEL_STEPS(INTEL_SANDYBRIDGE_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SANDYBRIDGE, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_HASWELL, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_HASWELL_L, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_HASWELL_G, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_HASWELL_X, X86_STEP_MAX, MMIO | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_BROADWELL_D, X86_STEP_MAX, MMIO | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, 0x5, MMIO | RETBLEED | GDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, 0xb, MMIO | 
RETBLEED | GDS | SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE, 0xc, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED | VMSCAPE), VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY), - VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), - VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED | ITS), - VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | VMSCAPE), VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED), VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY), - VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS), - VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS), - VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS), - VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS), - VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS), - VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS), + VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS | VMSCAPE), + 
VULNBL_INTEL_STEPS(INTEL_ALDERLAKE, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS | VMSCAPE), + VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_METEORLAKE_L, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ARROWLAKE_H, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ARROWLAKE, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ARROWLAKE_U, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_LUNARLAKE_M, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SAPPHIRERAPIDS_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_GRANITERAPIDS_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_EMERALDRAPIDS_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS | VMSCAPE), VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS), VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_D, X86_STEP_MAX, MMIO | RFDS), VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS), VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT, X86_STEP_MAX, RFDS), VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_D, X86_STEP_MAX, RFDS), VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_CRESTMONT_X, X86_STEP_MAX, VMSCAPE), VULNBL_AMD(0x15, RETBLEED), VULNBL_AMD(0x16, RETBLEED), - VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO), - VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO), - VULNBL_AMD(0x19, SRSO | TSA), - VULNBL_AMD(0x1a, SRSO), + VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO | VMSCAPE), + VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO | VMSCAPE), + VULNBL_AMD(0x19, SRSO | TSA | VMSCAPE), + VULNBL_AMD(0x1a, SRSO | VMSCAPE), {} }; @@ -1543,6 +1559,14 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) } } + /* + * Set the 
bug only on bare-metal. A nested hypervisor should already be + * deploying IBPB to isolate itself from nested guests. + */ + if (cpu_matches(cpu_vuln_blacklist, VMSCAPE) && + !boot_cpu_has(X86_FEATURE_HYPERVISOR)) + setup_force_cpu_bug(X86_BUG_VMSCAPE); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; @@ -1784,6 +1808,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_clear_cpu_cap(X86_FEATURE_LA57); detect_nopl(); + mca_bsp_init(c); } void __init init_cpu_devs(void) diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 2154f12766fb..1fda6c3a2b65 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -16,6 +16,7 @@ #include <asm/spec-ctrl.h> #include <asm/delay.h> #include <asm/msr.h> +#include <asm/resctrl.h> #include "cpu.h" @@ -117,6 +118,8 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c) x86_amd_ls_cfg_ssbd_mask = 1ULL << 10; } } + + resctrl_cpu_detect(c); } static void early_init_hygon(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 553bfbfc3a1b..f3e9219845e8 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -45,6 +45,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = #ifdef CONFIG_ACRN_GUEST &x86_hyper_acrn, #endif +#ifdef CONFIG_BHYVE_GUEST + &x86_hyper_bhyve, +#endif }; enum x86_hypervisor_type x86_hyper_type; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 076eaa41b8c8..98ae4c37c93e 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -262,7 +262,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - } else if ((c->x86_vfm >= INTEL_P4_PRESCOTT && c->x86_vfm <= INTEL_P4_WILLAMETTE) || + } else if ((c->x86_vfm >= INTEL_P4_PRESCOTT && c->x86_vfm <= INTEL_P4_CEDARMILL) || (c->x86_vfm >= 
INTEL_CORE_YONAH && c->x86_vfm <= INTEL_IVYBRIDGE)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); } diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 5c4eb28c3ac9..d6906442f49b 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -241,7 +241,8 @@ struct threshold_block { struct threshold_bank { struct kobject *kobj; - struct threshold_block *blocks; + /* List of threshold blocks within this MCA bank. */ + struct list_head miscj; }; static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); @@ -252,9 +253,6 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); */ static DEFINE_PER_CPU(u64, bank_map); -/* Map of banks that have more than MCA_MISC0 available. */ -static DEFINE_PER_CPU(u64, smca_misc_banks_map); - static void amd_threshold_interrupt(void); static void amd_deferred_error_interrupt(void); @@ -264,28 +262,6 @@ static void default_deferred_error_interrupt(void) } void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; -static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu) -{ - u32 low, high; - - /* - * For SMCA enabled processors, BLKPTR field of the first MISC register - * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4). 
- */ - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) - return; - - if (!(low & MCI_CONFIG_MCAX)) - return; - - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high)) - return; - - if (low & MASK_BLKPTR_LO) - per_cpu(smca_misc_banks_map, cpu) |= BIT_ULL(bank); - -} - static void smca_configure(unsigned int bank, unsigned int cpu) { u8 *bank_counts = this_cpu_ptr(smca_bank_counts); @@ -326,8 +302,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu) wrmsr(smca_config, low, high); } - smca_set_misc_banks_map(bank, cpu); - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { pr_warn("Failed to read MCA_IPID for bank %d\n", bank); return; @@ -419,8 +393,8 @@ static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) return true; }; -/* Reprogram MCx_MISC MSR behind this threshold bank. */ -static void threshold_restart_bank(void *_tr) +/* Reprogram MCx_MISC MSR behind this threshold block. */ +static void threshold_restart_block(void *_tr) { struct thresh_restart *tr = _tr; u32 hi, lo; @@ -478,7 +452,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset) }; b->threshold_limit = THRESHOLD_MAX; - threshold_restart_bank(&tr); + threshold_restart_block(&tr); }; static int setup_APIC_mce_threshold(int reserved, int new) @@ -525,18 +499,6 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } -static u32 smca_get_block_address(unsigned int bank, unsigned int block, - unsigned int cpu) -{ - if (!block) - return MSR_AMD64_SMCA_MCx_MISC(bank); - - if (!(per_cpu(smca_misc_banks_map, cpu) & BIT_ULL(bank))) - return 0; - - return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); -} - static u32 get_block_address(u32 current_addr, u32 low, u32 high, unsigned int bank, unsigned int block, unsigned int cpu) @@ -546,8 +508,15 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high, if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= 
NR_BLOCKS)) return addr; - if (mce_flags.smca) - return smca_get_block_address(bank, block, cpu); + if (mce_flags.smca) { + if (!block) + return MSR_AMD64_SMCA_MCx_MISC(bank); + + if (!(low & MASK_BLKPTR_LO)) + return 0; + + return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); + } /* Fall back to method we used for older processors: */ switch (block) { @@ -677,6 +646,28 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) wrmsrq(MSR_K7_HWCR, hwcr); } +static void amd_apply_cpu_quirks(struct cpuinfo_x86 *c) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + + /* This should be disabled by the BIOS, but isn't always */ + if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { + /* + * disable GART TBL walk error reporting, which + * trips off incorrectly with the IOMMU & 3ware + * & Cerberus: + */ + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); + } + + /* + * Various K7s with broken bank 0 around. Always disable + * by default. + */ + if (c->x86 == 6 && this_cpu_read(mce_num_banks)) + mce_banks[0].ctl = 0; +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { @@ -684,6 +675,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) u32 low = 0, high = 0, address = 0; int offset = -1; + amd_apply_cpu_quirks(c); + + mce_flags.amd_threshold = 1; for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { if (mce_flags.smca) @@ -714,6 +708,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) deferred_error_interrupt_enable(c); } +void smca_bsp_init(void) +{ + mce_threshold_vector = amd_threshold_interrupt; + deferred_error_int_vector = amd_deferred_error_interrupt; +} + /* * DRAM ECC errors are reported in the Northbridge (bank 4) with * Extended Error Code 8. @@ -921,7 +921,7 @@ static void log_and_reset_block(struct threshold_block *block) /* Reset threshold block after logging error. 
*/ memset(&tr, 0, sizeof(tr)); tr.b = block; - threshold_restart_bank(&tr); + threshold_restart_block(&tr); } /* @@ -930,9 +930,9 @@ static void log_and_reset_block(struct threshold_block *block) */ static void amd_threshold_interrupt(void) { - struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; - struct threshold_bank **bp = this_cpu_read(threshold_banks); + struct threshold_bank **bp = this_cpu_read(threshold_banks), *thr_bank; unsigned int bank, cpu = smp_processor_id(); + struct threshold_block *block, *tmp; /* * Validate that the threshold bank has been initialized already. The @@ -946,20 +946,20 @@ static void amd_threshold_interrupt(void) if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank))) continue; - first_block = bp[bank]->blocks; - if (!first_block) + thr_bank = bp[bank]; + if (!thr_bank) continue; - /* - * The first block is also the head of the list. Check it first - * before iterating over the rest. - */ - log_and_reset_block(first_block); - list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj) + list_for_each_entry_safe(block, tmp, &thr_bank->miscj, miscj) log_and_reset_block(block); } } +void amd_clear_bank(struct mce *m) +{ + mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0); +} + /* * Sysfs Interface */ @@ -995,7 +995,7 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) memset(&tr, 0, sizeof(tr)); tr.b = b; - if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + if (smp_call_function_single(b->cpu, threshold_restart_block, &tr, 1)) return -ENODEV; return size; @@ -1020,7 +1020,7 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) b->threshold_limit = new; tr.b = b; - if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + if (smp_call_function_single(b->cpu, threshold_restart_block, &tr, 1)) return -ENODEV; return size; @@ -1181,13 +1181,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb 
default_attrs[2] = NULL; } - INIT_LIST_HEAD(&b->miscj); - - /* This is safe as @tb is not visible yet */ - if (tb->blocks) - list_add(&b->miscj, &tb->blocks->miscj); - else - tb->blocks = b; + list_add(&b->miscj, &tb->miscj); err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b)); if (err) @@ -1238,6 +1232,8 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, goto out_free; } + INIT_LIST_HEAD(&b->miscj); + err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC)); if (err) goto out_kobj; @@ -1258,26 +1254,15 @@ static void threshold_block_release(struct kobject *kobj) kfree(to_block(kobj)); } -static void deallocate_threshold_blocks(struct threshold_bank *bank) +static void threshold_remove_bank(struct threshold_bank *bank) { struct threshold_block *pos, *tmp; - list_for_each_entry_safe(pos, tmp, &bank->blocks->miscj, miscj) { + list_for_each_entry_safe(pos, tmp, &bank->miscj, miscj) { list_del(&pos->miscj); kobject_put(&pos->kobj); } - kobject_put(&bank->blocks->kobj); -} - -static void threshold_remove_bank(struct threshold_bank *bank) -{ - if (!bank->blocks) - goto out_free; - - deallocate_threshold_blocks(bank); - -out_free: kobject_put(bank->kobj); kfree(bank); } @@ -1296,12 +1281,12 @@ static void __threshold_remove_device(struct threshold_bank **bp) kfree(bp); } -int mce_threshold_remove_device(unsigned int cpu) +void mce_threshold_remove_device(unsigned int cpu) { struct threshold_bank **bp = this_cpu_read(threshold_banks); if (!bp) - return 0; + return; /* * Clear the pointer before cleaning up, so that the interrupt won't @@ -1310,7 +1295,7 @@ int mce_threshold_remove_device(unsigned int cpu) this_cpu_write(threshold_banks, NULL); __threshold_remove_device(bp); - return 0; + return; } /** @@ -1324,36 +1309,34 @@ int mce_threshold_remove_device(unsigned int cpu) * thread running on @cpu. 
The callback is invoked on all CPUs which are * online when the callback is installed or during a real hotplug event. */ -int mce_threshold_create_device(unsigned int cpu) +void mce_threshold_create_device(unsigned int cpu) { unsigned int numbanks, bank; struct threshold_bank **bp; - int err; if (!mce_flags.amd_threshold) - return 0; + return; bp = this_cpu_read(threshold_banks); if (bp) - return 0; + return; numbanks = this_cpu_read(mce_num_banks); bp = kcalloc(numbanks, sizeof(*bp), GFP_KERNEL); if (!bp) - return -ENOMEM; + return; for (bank = 0; bank < numbanks; ++bank) { if (!(this_cpu_read(bank_map) & BIT_ULL(bank))) continue; - err = threshold_create_bank(bp, cpu, bank); - if (err) { + if (threshold_create_bank(bp, cpu, bank)) { __threshold_remove_device(bp); - return err; + return; } } this_cpu_write(threshold_banks, bp); if (thresholding_irq_en) mce_threshold_vector = amd_threshold_interrupt; - return 0; + return; } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 4da4eab56c81..460e90a1a0b1 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -423,7 +423,7 @@ noinstr u64 mce_rdmsrq(u32 msr) return EAX_EDX_VAL(val, low, high); } -static noinstr void mce_wrmsrq(u32 msr, u64 v) +noinstr void mce_wrmsrq(u32 msr, u64 v) { u32 low, high; @@ -715,6 +715,60 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i) DEFINE_PER_CPU(unsigned, mce_poll_count); /* + * Newer Intel systems that support software error + * recovery need to make additional checks. Other + * CPUs should skip over uncorrected errors, but log + * everything else. 
+ */ +static bool ser_should_log_poll_error(struct mce *m) +{ + /* Log "not enabled" (speculative) errors */ + if (!(m->status & MCI_STATUS_EN)) + return true; + + /* + * Log UCNA (SDM: 15.6.3 "UCR Error Classification") + * UC == 1 && PCC == 0 && S == 0 + */ + if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S)) + return true; + + return false; +} + +static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err) +{ + struct mce *m = &err->m; + + /* If this entry is not valid, ignore it. */ + if (!(m->status & MCI_STATUS_VAL)) + return false; + + /* + * If we are logging everything (at CPU online) or this + * is a corrected error, then we must log it. + */ + if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC)) + return true; + + if (mca_cfg.ser) + return ser_should_log_poll_error(m); + + if (m->status & MCI_STATUS_UC) + return false; + + return true; +} + +static void clear_bank(struct mce *m) +{ + if (m->cpuvendor == X86_VENDOR_AMD) + return amd_clear_bank(m); + + mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0); +} + +/* * Poll for corrected events or events that happened before reset. * Those are just logged through /dev/mcelog. * @@ -765,51 +819,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!mca_cfg.cmci_disabled) mce_track_storm(m); - /* If this entry is not valid, ignore it */ - if (!(m->status & MCI_STATUS_VAL)) + /* Verify that the error should be logged based on hardware conditions. */ + if (!should_log_poll_error(flags, &err)) continue; - /* - * If we are logging everything (at CPU online) or this - * is a corrected error, then we must log it. - */ - if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC)) - goto log_it; - - /* - * Newer Intel systems that support software error - * recovery need to make additional checks. Other - * CPUs should skip over uncorrected errors, but log - * everything else. 
- */ - if (!mca_cfg.ser) { - if (m->status & MCI_STATUS_UC) - continue; - goto log_it; - } - - /* Log "not enabled" (speculative) errors */ - if (!(m->status & MCI_STATUS_EN)) - goto log_it; - - /* - * Log UCNA (SDM: 15.6.3 "UCR Error Classification") - * UC == 1 && PCC == 0 && S == 0 - */ - if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S)) - goto log_it; - - /* - * Skip anything else. Presumption is that our read of this - * bank is racing with a machine check. Leave the log alone - * for do_machine_check() to deal with it. - */ - continue; - -log_it: - if (flags & MCP_DONTLOG) - goto clear_it; - mce_read_aux(&err, i); m->severity = mce_severity(m, NULL, NULL, false); /* @@ -826,10 +839,7 @@ log_it: mce_log(&err); clear_it: - /* - * Clear state for this bank. - */ - mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); + clear_bank(m); } /* @@ -1810,9 +1820,10 @@ static void __mcheck_cpu_mce_banks_init(void) struct mce_bank *b = &mce_banks[i]; /* - * Init them all, __mcheck_cpu_apply_quirks() is going to apply - * the required vendor quirks before - * __mcheck_cpu_init_clear_banks() does the final bank setup. + * Init them all by default. + * + * The required vendor quirks will be applied before + * __mcheck_cpu_init_prepare_banks() does the final bank setup. */ b->ctl = -1ULL; b->init = true; @@ -1840,69 +1851,34 @@ static void __mcheck_cpu_cap_init(void) this_cpu_write(mce_num_banks, b); __mcheck_cpu_mce_banks_init(); - - /* Use accurate RIP reporting if available. */ - if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) - mca_cfg.rip_msr = MSR_IA32_MCG_EIP; - - if (cap & MCG_SER_P) - mca_cfg.ser = 1; } static void __mcheck_cpu_init_generic(void) { - enum mcp_flags m_fl = 0; - mce_banks_t all_banks; u64 cap; - if (!mca_cfg.bootlog) - m_fl = MCP_DONTLOG; - - /* - * Log the machine checks left over from the previous reset. Log them - * only, do not start processing them. 
That will happen in mcheck_late_init() - * when all consumers have been registered on the notifier chain. - */ - bitmap_fill(all_banks, MAX_NR_BANKS); - machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks); - - cr4_set_bits(X86_CR4_MCE); - rdmsrq(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); } -static void __mcheck_cpu_init_clear_banks(void) +static void __mcheck_cpu_init_prepare_banks(void) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + u64 msrval; int i; - for (i = 0; i < this_cpu_read(mce_num_banks); i++) { - struct mce_bank *b = &mce_banks[i]; + /* + * Log the machine checks left over from the previous reset. Log them + * only, do not start processing them. That will happen in mcheck_late_init() + * when all consumers have been registered on the notifier chain. + */ + if (mca_cfg.bootlog) { + mce_banks_t all_banks; - if (!b->init) - continue; - wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl); - wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); + bitmap_fill(all_banks, MAX_NR_BANKS); + machine_check_poll(MCP_UC | MCP_QUEUE_LOG, &all_banks); } -} - -/* - * Do a final check to see if there are any unused/RAZ banks. - * - * This must be done after the banks have been initialized and any quirks have - * been applied. - * - * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs. - * Otherwise, a user who disables a bank will not be able to re-enable it - * without a system reboot. 
- */ -static void __mcheck_cpu_check_banks(void) -{ - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - u64 msrval; - int i; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { struct mce_bank *b = &mce_banks[i]; @@ -1910,25 +1886,16 @@ static void __mcheck_cpu_check_banks(void) if (!b->init) continue; + wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); + rdmsrq(mca_msr_reg(i, MCA_CTL), msrval); b->init = !!msrval; } } -static void apply_quirks_amd(struct cpuinfo_x86 *c) +static void amd_apply_global_quirks(struct cpuinfo_x86 *c) { - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - - /* This should be disabled by the BIOS, but isn't always */ - if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { - /* - * disable GART TBL walk error reporting, which - * trips off incorrectly with the IOMMU & 3ware - * & Cerberus: - */ - clear_bit(10, (unsigned long *)&mce_banks[4].ctl); - } - if (c->x86 < 0x11 && mca_cfg.bootlog < 0) { /* * Lots of broken BIOS around that don't clear them @@ -1938,13 +1905,6 @@ static void apply_quirks_amd(struct cpuinfo_x86 *c) } /* - * Various K7s with broken bank 0 around. Always disable - * by default. - */ - if (c->x86 == 6 && this_cpu_read(mce_num_banks)) - mce_banks[0].ctl = 0; - - /* * overflow_recov is supported for F15h Models 00h-0fh * even though we don't have a CPUID bit for it. */ @@ -1955,26 +1915,13 @@ static void apply_quirks_amd(struct cpuinfo_x86 *c) mce_flags.zen_ifu_quirk = 1; } -static void apply_quirks_intel(struct cpuinfo_x86 *c) +static void intel_apply_global_quirks(struct cpuinfo_x86 *c) { - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - /* Older CPUs (prior to family 6) don't need quirks. */ if (c->x86_vfm < INTEL_PENTIUM_PRO) return; /* - * SDM documents that on family 6 bank 0 should not be written - * because it aliases to another special BIOS controlled - * register. 
- * But it's not aliased anymore on model 0x1a+ - * Don't ignore bank 0 completely because there could be a - * valid event later, merely don't write CTL0. - */ - if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks)) - mce_banks[0].init = false; - - /* * All newer Intel systems support MCE broadcasting. Enable * synchronization with a one second timeout. */ @@ -1999,7 +1946,7 @@ static void apply_quirks_intel(struct cpuinfo_x86 *c) mce_flags.skx_repmov_quirk = 1; } -static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c) +static void zhaoxin_apply_global_quirks(struct cpuinfo_x86 *c) { /* * All newer Zhaoxin CPUs support MCE broadcasting. Enable @@ -2011,34 +1958,6 @@ static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c) } } -/* Add per CPU specific workarounds here */ -static bool __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) -{ - struct mca_config *cfg = &mca_cfg; - - switch (c->x86_vendor) { - case X86_VENDOR_UNKNOWN: - pr_info("unknown CPU type - not enabling MCE support\n"); - return false; - case X86_VENDOR_AMD: - apply_quirks_amd(c); - break; - case X86_VENDOR_INTEL: - apply_quirks_intel(c); - break; - case X86_VENDOR_ZHAOXIN: - apply_quirks_zhaoxin(c); - break; - } - - if (cfg->monarch_timeout < 0) - cfg->monarch_timeout = 0; - if (cfg->bootlog != 0) - cfg->panic_timeout = 30; - - return true; -} - static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) { if (c->x86 != 5) @@ -2060,19 +1979,6 @@ static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) return false; } -/* - * Init basic CPU features needed for early decoding of MCEs. 
- */ -static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) -{ - if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) { - mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); - mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); - mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); - mce_flags.amd_threshold = 1; - } -} - static void mce_centaur_feature_init(struct cpuinfo_x86 *c) { struct mca_config *cfg = &mca_cfg; @@ -2281,6 +2187,53 @@ DEFINE_IDTENTRY_RAW(exc_machine_check) } #endif +void mca_bsp_init(struct cpuinfo_x86 *c) +{ + u64 cap; + + if (!mce_available(c)) + return; + + if (c->x86_vendor == X86_VENDOR_UNKNOWN) { + mca_cfg.disabled = 1; + pr_info("unknown CPU type - not enabling MCE support\n"); + return; + } + + mce_flags.overflow_recov = cpu_feature_enabled(X86_FEATURE_OVERFLOW_RECOV); + mce_flags.succor = cpu_feature_enabled(X86_FEATURE_SUCCOR); + mce_flags.smca = cpu_feature_enabled(X86_FEATURE_SMCA); + + if (mce_flags.smca) + smca_bsp_init(); + + rdmsrq(MSR_IA32_MCG_CAP, cap); + + /* Use accurate RIP reporting if available. */ + if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) + mca_cfg.rip_msr = MSR_IA32_MCG_EIP; + + if (cap & MCG_SER_P) + mca_cfg.ser = 1; + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + amd_apply_global_quirks(c); + break; + case X86_VENDOR_INTEL: + intel_apply_global_quirks(c); + break; + case X86_VENDOR_ZHAOXIN: + zhaoxin_apply_global_quirks(c); + break; + } + + if (mca_cfg.monarch_timeout < 0) + mca_cfg.monarch_timeout = 0; + if (mca_cfg.bootlog != 0) + mca_cfg.panic_timeout = 30; +} + /* * Called for each booted CPU to set up machine checks. 
* Must be called with preempt off: @@ -2298,11 +2251,6 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_cap_init(); - if (!__mcheck_cpu_apply_quirks(c)) { - mca_cfg.disabled = 1; - return; - } - if (!mce_gen_pool_init()) { mca_cfg.disabled = 1; pr_emerg("Couldn't allocate MCE records pool!\n"); @@ -2311,12 +2259,11 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) mca_cfg.initialized = 1; - __mcheck_cpu_init_early(c); __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); - __mcheck_cpu_init_clear_banks(); - __mcheck_cpu_check_banks(); + __mcheck_cpu_init_prepare_banks(); __mcheck_cpu_setup_timer(); + cr4_set_bits(X86_CR4_MCE); } /* @@ -2483,7 +2430,8 @@ static void mce_syscore_resume(void) { __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info)); - __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_init_prepare_banks(); + cr4_set_bits(X86_CR4_MCE); } static struct syscore_ops mce_syscore_ops = { @@ -2501,8 +2449,9 @@ static void mce_cpu_restart(void *data) if (!mce_available(raw_cpu_ptr(&cpu_info))) return; __mcheck_cpu_init_generic(); - __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_init_prepare_banks(); __mcheck_cpu_init_timer(); + cr4_set_bits(X86_CR4_MCE); } /* Reinit MCEs after user configuration changes */ diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index 9b149b9c4109..4655223ba560 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -468,8 +468,26 @@ static void intel_imc_init(struct cpuinfo_x86 *c) } } +static void intel_apply_cpu_quirks(struct cpuinfo_x86 *c) +{ + /* + * SDM documents that on family 6 bank 0 should not be written + * because it aliases to another special BIOS controlled + * register. + * But it's not aliased anymore on model 0x1a+ + * Don't ignore bank 0 completely because there could be a + * valid event later, merely don't write CTL0. 
+ * + * Older CPUs (prior to family 6) can't reach this point and already + * return early due to the check of __mcheck_cpu_ancient_init(). + */ + if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks)) + this_cpu_ptr(mce_banks_array)[0].init = false; +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { + intel_apply_cpu_quirks(c); intel_init_cmci(); intel_init_lmce(); intel_imc_init(c); diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index b5ba598e54cb..b0e00ec5cc8c 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -265,8 +265,11 @@ void mce_prep_record_common(struct mce *m); void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m); #ifdef CONFIG_X86_MCE_AMD +void mce_threshold_create_device(unsigned int cpu); +void mce_threshold_remove_device(unsigned int cpu); extern bool amd_filter_mce(struct mce *m); bool amd_mce_usable_address(struct mce *m); +void amd_clear_bank(struct mce *m); /* * If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits @@ -292,10 +295,15 @@ static __always_inline void smca_extract_err_addr(struct mce *m) m->addr &= GENMASK_ULL(55, lsb); } +void smca_bsp_init(void); #else +static inline void mce_threshold_create_device(unsigned int cpu) { } +static inline void mce_threshold_remove_device(unsigned int cpu) { } static inline bool amd_filter_mce(struct mce *m) { return false; } static inline bool amd_mce_usable_address(struct mce *m) { return false; } +static inline void amd_clear_bank(struct mce *m) { } static inline void smca_extract_err_addr(struct mce *m) { } +static inline void smca_bsp_init(void) { } #endif #ifdef CONFIG_X86_ANCIENT_MCE @@ -313,6 +321,7 @@ static __always_inline void winchip_machine_check(struct pt_regs *regs) {} #endif noinstr u64 mce_rdmsrq(u32 msr); +noinstr void mce_wrmsrq(u32 msr, u64 v); static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg) { diff --git 
a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 097e39327942..cdce885e2fd5 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -171,8 +171,28 @@ static int cmp_id(const void *key, const void *elem) return 1; } +static u32 cpuid_to_ucode_rev(unsigned int val) +{ + union zen_patch_rev p = {}; + union cpuid_1_eax c; + + c.full = val; + + p.stepping = c.stepping; + p.model = c.model; + p.ext_model = c.ext_model; + p.ext_fam = c.ext_fam; + + return p.ucode_rev; +} + static bool need_sha_check(u32 cur_rev) { + if (!cur_rev) { + cur_rev = cpuid_to_ucode_rev(bsp_cpuid_1_eax); + pr_info_once("No current revision, generating the lowest one: 0x%x\n", cur_rev); + } + switch (cur_rev >> 8) { case 0x80012: return cur_rev <= 0x800126f; break; case 0x80082: return cur_rev <= 0x800820f; break; @@ -249,15 +269,6 @@ static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsi return true; } -static u32 get_patch_level(void) -{ - u32 rev, dummy __always_unused; - - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - - return rev; -} - static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val) { union zen_patch_rev p; @@ -275,6 +286,30 @@ static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val) return c; } +static u32 get_patch_level(void) +{ + u32 rev, dummy __always_unused; + + if (IS_ENABLED(CONFIG_MICROCODE_DBG)) { + int cpu = smp_processor_id(); + + if (!microcode_rev[cpu]) { + if (!base_rev) + base_rev = cpuid_to_ucode_rev(bsp_cpuid_1_eax); + + microcode_rev[cpu] = base_rev; + + ucode_dbg("CPU%d, base_rev: 0x%x\n", cpu, base_rev); + } + + return microcode_rev[cpu]; + } + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + + return rev; +} + static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig) { unsigned int i; @@ -304,13 +339,13 @@ static bool verify_container(const u8 *buf, size_t buf_size) u32 cont_magic; if (buf_size <= CONTAINER_HDR_SZ) { - pr_debug("Truncated 
microcode container header.\n"); + ucode_dbg("Truncated microcode container header.\n"); return false; } cont_magic = *(const u32 *)buf; if (cont_magic != UCODE_MAGIC) { - pr_debug("Invalid magic value (0x%08x).\n", cont_magic); + ucode_dbg("Invalid magic value (0x%08x).\n", cont_magic); return false; } @@ -335,8 +370,8 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size) cont_type = hdr[1]; if (cont_type != UCODE_EQUIV_CPU_TABLE_TYPE) { - pr_debug("Wrong microcode container equivalence table type: %u.\n", - cont_type); + ucode_dbg("Wrong microcode container equivalence table type: %u.\n", + cont_type); return false; } @@ -345,7 +380,7 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size) equiv_tbl_len = hdr[2]; if (equiv_tbl_len < sizeof(struct equiv_cpu_entry) || buf_size < equiv_tbl_len) { - pr_debug("Truncated equivalence table.\n"); + ucode_dbg("Truncated equivalence table.\n"); return false; } @@ -365,7 +400,7 @@ static bool __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize const u32 *hdr; if (buf_size < SECTION_HDR_SIZE) { - pr_debug("Truncated patch section.\n"); + ucode_dbg("Truncated patch section.\n"); return false; } @@ -374,13 +409,13 @@ static bool __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize p_size = hdr[1]; if (p_type != UCODE_UCODE_TYPE) { - pr_debug("Invalid type field (0x%x) in container file section header.\n", - p_type); + ucode_dbg("Invalid type field (0x%x) in container file section header.\n", + p_type); return false; } if (p_size < sizeof(struct microcode_header_amd)) { - pr_debug("Patch of size %u too short.\n", p_size); + ucode_dbg("Patch of size %u too short.\n", p_size); return false; } @@ -457,12 +492,12 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size) * size sh_psize, as the section claims. 
*/ if (buf_size < sh_psize) { - pr_debug("Patch of size %u truncated.\n", sh_psize); + ucode_dbg("Patch of size %u truncated.\n", sh_psize); return -1; } if (!__verify_patch_size(sh_psize, buf_size)) { - pr_debug("Per-family patch size mismatch.\n"); + ucode_dbg("Per-family patch size mismatch.\n"); return -1; } @@ -476,6 +511,9 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size) proc_id = mc_hdr->processor_rev_id; patch_fam = 0xf + (proc_id >> 12); + + ucode_dbg("Patch-ID 0x%08x: family: 0x%x\n", mc_hdr->patch_id, patch_fam); + if (patch_fam != family) return 1; @@ -546,9 +584,14 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) } mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE); + + ucode_dbg("patch_id: 0x%x\n", mc->hdr.patch_id); + if (mc_patch_matches(mc, eq_id)) { desc->psize = patch_size; desc->mc = mc; + + ucode_dbg(" match: size: %d\n", patch_size); } skip: @@ -619,8 +662,14 @@ static bool __apply_microcode_amd(struct microcode_amd *mc, u32 *cur_rev, invlpg(p_addr_end); } + if (IS_ENABLED(CONFIG_MICROCODE_DBG)) + microcode_rev[smp_processor_id()] = mc->hdr.patch_id; + /* verify patch application was successful */ *cur_rev = get_patch_level(); + + ucode_dbg("updated rev: 0x%x\n", *cur_rev); + if (*cur_rev != mc->hdr.patch_id) return false; @@ -749,8 +798,6 @@ static struct ucode_patch *cache_find_patch(struct ucode_cpu_info *uci, u16 equi n.equiv_cpu = equiv_cpu; n.patch_id = uci->cpu_sig.rev; - WARN_ON_ONCE(!n.patch_id); - list_for_each_entry(p, &microcode_cache, plist) if (patch_cpus_equivalent(p, &n, false)) return p; @@ -1008,7 +1055,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, patch->patch_id = mc_hdr->patch_id; patch->equiv_cpu = proc_id; - pr_debug("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n", + ucode_dbg("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n", __func__, patch->patch_id, proc_id); /* ... and add to cache. 
*/ @@ -1151,7 +1198,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device) snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); if (request_firmware_direct(&fw, (const char *)fw_name, device)) { - pr_debug("failed to load file %s\n", fw_name); + ucode_dbg("failed to load file %s\n", fw_name); goto out; } diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index b92e09a87c69..f75c140906d0 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -43,10 +43,19 @@ #include "internal.h" static struct microcode_ops *microcode_ops; -static bool dis_ucode_ldr = false; +static bool dis_ucode_ldr; bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV); -module_param(force_minrev, bool, S_IRUSR | S_IWUSR); + +/* + * Those below should be behind CONFIG_MICROCODE_DBG ifdeffery but in + * order to not uglify the code with ifdeffery and use IS_ENABLED() + * instead, leave them in. When microcode debugging is not enabled, + * those are meaningless anyway. + */ +/* base microcode revision for debugging */ +u32 base_rev; +u32 microcode_rev[NR_CPUS] = {}; /* * Synchronization. @@ -119,20 +128,48 @@ bool __init microcode_loader_disabled(void) * overwritten. 
*/ if (!cpuid_feature() || - native_cpuid_ecx(1) & BIT(31) || + ((native_cpuid_ecx(1) & BIT(31)) && + !IS_ENABLED(CONFIG_MICROCODE_DBG)) || amd_check_current_patch_level()) dis_ucode_ldr = true; return dis_ucode_ldr; } +static void early_parse_cmdline(void) +{ + char cmd_buf[64] = {}; + char *s, *p = cmd_buf; + + if (cmdline_find_option(boot_command_line, "microcode", cmd_buf, sizeof(cmd_buf)) > 0) { + while ((s = strsep(&p, ","))) { + if (IS_ENABLED(CONFIG_MICROCODE_DBG)) { + if (strstr(s, "base_rev=")) { + /* advance to the option arg */ + strsep(&s, "="); + if (kstrtouint(s, 16, &base_rev)) { ; } + } + } + + if (!strcmp("force_minrev", s)) + force_minrev = true; + + if (!strcmp(s, "dis_ucode_ldr")) + dis_ucode_ldr = true; + } + } + + /* old, compat option */ + if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0) + dis_ucode_ldr = true; +} + void __init load_ucode_bsp(void) { unsigned int cpuid_1_eax; bool intel = true; - if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0) - dis_ucode_ldr = true; + early_parse_cmdline(); if (microcode_loader_disabled()) return; diff --git a/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h index cb6e601701ab..2d48e6593540 100644 --- a/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h +++ b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h @@ -67,9 +67,8 @@ { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0008, .driver_data = 0x1000191 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0010, .driver_data = 0x2007006 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0020, .driver_data = 0x3000010 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0040, .driver_data = 0x4003605 }, -{ .flags = 
X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0080, .driver_data = 0x5003707 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0800, .driver_data = 0x7002904 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0080, .driver_data = 0x5003901 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0800, .driver_data = 0x7002b01 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0004, .driver_data = 0x1c }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0008, .driver_data = 0x700001c }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0010, .driver_data = 0xf00001a }, @@ -81,51 +80,62 @@ { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5f, .steppings = 0x0002, .driver_data = 0x3e }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x66, .steppings = 0x0008, .driver_data = 0x2a }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0020, .driver_data = 0xc0002f0 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0040, .driver_data = 0xd0003e7 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6c, .steppings = 0x0002, .driver_data = 0x10002b0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0040, .driver_data = 0xd000404 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, 
.model = 0x6c, .steppings = 0x0002, .driver_data = 0x10002d0 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0002, .driver_data = 0x42 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0100, .driver_data = 0x24 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7e, .steppings = 0x0020, .driver_data = 0xc6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0100, .driver_data = 0x26 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7e, .steppings = 0x0020, .driver_data = 0xca }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8a, .steppings = 0x0002, .driver_data = 0x33 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0002, .driver_data = 0xb8 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0004, .driver_data = 0x38 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8d, .steppings = 0x0002, .driver_data = 0x52 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0002, .driver_data = 0xbc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0004, .driver_data = 0x3c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8d, .steppings = 0x0002, .driver_data = 0x56 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0200, .driver_data = 0xf6 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 
0x6, .model = 0x8e, .steppings = 0x0400, .driver_data = 0xf6 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0800, .driver_data = 0xf6 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x1000, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0100, .driver_data = 0x2c000390 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0080, .driver_data = 0x2b000603 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0040, .driver_data = 0x2c000390 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0020, .driver_data = 0x2c000390 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0010, .driver_data = 0x2c000390 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x1000, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0010, .driver_data = 0x2c0003f7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0020, .driver_data = 0x2c0003f7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0040, .driver_data = 0x2c0003f7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0080, .driver_data = 0x2b000639 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0100, .driver_data = 0x2c0003f7 }, { .flags = 
X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x96, .steppings = 0x0002, .driver_data = 0x1a }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0004, .driver_data = 0x37 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0020, .driver_data = 0x37 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0004, .driver_data = 0x37 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0020, .driver_data = 0x37 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0008, .driver_data = 0x435 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0010, .driver_data = 0x435 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0004, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0020, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0008, .driver_data = 0x437 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0010, .driver_data = 0x437 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9c, .steppings = 0x0001, .driver_data = 0x24000026 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0200, .driver_data = 0xf8 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0400, .driver_data = 0xf8 }, 
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0400, .driver_data = 0xfa }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0800, .driver_data = 0xf6 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x1000, .driver_data = 0xf8 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x2000, .driver_data = 0x100 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0004, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0008, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0020, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0001, .driver_data = 0xfe }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0002, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa7, .steppings = 0x0002, .driver_data = 0x62 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaa, .steppings = 0x0010, .driver_data = 0x20 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0002, .driver_data = 0x12b }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0004, .driver_data = 0x4123 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0008, .driver_data = 
0x4123 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0100, .driver_data = 0x4123 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbe, .steppings = 0x0001, .driver_data = 0x1a }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0004, .driver_data = 0x21000283 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0002, .driver_data = 0x21000283 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x2000, .driver_data = 0x104 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0004, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0008, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0020, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0001, .driver_data = 0x102 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0002, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa7, .steppings = 0x0002, .driver_data = 0x64 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaa, .steppings = 0x0010, .driver_data = 0x24 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xad, .steppings = 0x0002, .driver_data = 0xa0000d1 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaf, 
.steppings = 0x0008, .driver_data = 0x3000341 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb5, .steppings = 0x0001, .driver_data = 0xa }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0002, .driver_data = 0x12f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0010, .driver_data = 0x12f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0004, .driver_data = 0x4128 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0008, .driver_data = 0x4128 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0100, .driver_data = 0x4128 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbd, .steppings = 0x0002, .driver_data = 0x11f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbe, .steppings = 0x0001, .driver_data = 0x1d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0004, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0020, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0040, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0080, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc5, .steppings = 0x0004, .driver_data = 0x118 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 
0x6, .model = 0xc6, .steppings = 0x0004, .driver_data = 0x118 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc6, .steppings = 0x0010, .driver_data = 0x118 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xca, .steppings = 0x0004, .driver_data = 0x118 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0002, .driver_data = 0x210002a9 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0004, .driver_data = 0x210002a9 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0080, .driver_data = 0x12 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0400, .driver_data = 0x15 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x01, .steppings = 0x0004, .driver_data = 0x2e }, diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h index 50a9702ae4e2..ae8dbc2b908d 100644 --- a/arch/x86/kernel/cpu/microcode/internal.h +++ b/arch/x86/kernel/cpu/microcode/internal.h @@ -44,6 +44,9 @@ struct early_load_data { extern struct early_load_data early_data; extern struct ucode_cpu_info ucode_cpu_info[]; +extern u32 microcode_rev[NR_CPUS]; +extern u32 base_rev; + struct cpio_data find_microcode_in_initrd(const char *path); #define MAX_UCODE_COUNT 128 @@ -122,4 +125,10 @@ static inline void reload_ucode_intel(void) { } static inline struct microcode_ops *init_intel_microcode(void) { return NULL; } #endif /* !CONFIG_CPU_SUP_INTEL */ +#define ucode_dbg(fmt, ...) 
\ +({ \ + if (IS_ENABLED(CONFIG_MICROCODE_DBG)) \ + pr_info(fmt, ##__VA_ARGS__); \ +}) + #endif /* _X86_MICROCODE_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 187d527ef73b..06ca5a30140c 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -107,7 +107,7 @@ u32 resctrl_arch_system_num_rmid_idx(void) struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */ - return r->num_rmid; + return r->mon.num_rmid; } struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) @@ -365,8 +365,10 @@ static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom) static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom) { - kfree(hw_dom->arch_mbm_total); - kfree(hw_dom->arch_mbm_local); + int idx; + + for_each_mbm_idx(idx) + kfree(hw_dom->arch_mbm_states[idx]); kfree(hw_dom); } @@ -400,25 +402,27 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain * */ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom) { - size_t tsize; - - if (resctrl_arch_is_mbm_total_enabled()) { - tsize = sizeof(*hw_dom->arch_mbm_total); - hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL); - if (!hw_dom->arch_mbm_total) - return -ENOMEM; - } - if (resctrl_arch_is_mbm_local_enabled()) { - tsize = sizeof(*hw_dom->arch_mbm_local); - hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL); - if (!hw_dom->arch_mbm_local) { - kfree(hw_dom->arch_mbm_total); - hw_dom->arch_mbm_total = NULL; - return -ENOMEM; - } + size_t tsize = sizeof(*hw_dom->arch_mbm_states[0]); + enum resctrl_event_id eventid; + int idx; + + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + hw_dom->arch_mbm_states[idx] = kcalloc(num_rmid, tsize, GFP_KERNEL); + if (!hw_dom->arch_mbm_states[idx]) + goto 
cleanup; } return 0; +cleanup: + for_each_mbm_idx(idx) { + kfree(hw_dom->arch_mbm_states[idx]); + hw_dom->arch_mbm_states[idx] = NULL; + } + + return -ENOMEM; } static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope) @@ -516,6 +520,9 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) d = container_of(hdr, struct rdt_mon_domain, hdr); cpumask_set_cpu(cpu, &d->hdr.cpu_mask); + /* Update the mbm_assign_mode state for the CPU if supported */ + if (r->mon.mbm_cntr_assignable) + resctrl_arch_mbm_cntr_assign_set_one(r); return; } @@ -535,9 +542,13 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) d->ci_id = ci->id; cpumask_set_cpu(cpu, &d->hdr.cpu_mask); + /* Update the mbm_assign_mode state for the CPU if supported */ + if (r->mon.mbm_cntr_assignable) + resctrl_arch_mbm_cntr_assign_set_one(r); + arch_mon_domain_online(r, d); - if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) { + if (arch_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) { mon_domain_free(hw_dom); return; } @@ -707,6 +718,7 @@ enum { RDT_FLAG_MBA, RDT_FLAG_SMBA, RDT_FLAG_BMEC, + RDT_FLAG_ABMC, }; #define RDT_OPT(idx, n, f) \ @@ -732,6 +744,7 @@ static struct rdt_options rdt_options[] __ro_after_init = { RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA), RDT_OPT(RDT_FLAG_SMBA, "smba", X86_FEATURE_SMBA), RDT_OPT(RDT_FLAG_BMEC, "bmec", X86_FEATURE_BMEC), + RDT_OPT(RDT_FLAG_ABMC, "abmc", X86_FEATURE_ABMC), }; #define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) @@ -863,15 +876,24 @@ static __init bool get_rdt_alloc_resources(void) static __init bool get_rdt_mon_resources(void) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + bool ret = false; - if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) - rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); - if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) - rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID); - if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) - rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID); + if 
(rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) { + resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID); + ret = true; + } + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); + ret = true; + } + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); + ret = true; + } + if (rdt_cpu_has(X86_FEATURE_ABMC)) + ret = true; - if (!rdt_mon_features) + if (!ret) return false; return !rdt_get_mon_l3_config(r); @@ -965,7 +987,7 @@ static enum cpuhp_state rdt_online; /* Runs once on the BSP during boot. */ void resctrl_cpu_detect(struct cpuinfo_x86 *c) { - if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { + if (!cpu_has(c, X86_FEATURE_CQM_LLC) && !cpu_has(c, X86_FEATURE_ABMC)) { c->x86_cache_max_rmid = -1; c->x86_cache_occ_scale = -1; c->x86_cache_mbm_width_offset = -1; @@ -977,7 +999,8 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || - cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { + cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL) || + cpu_has(c, X86_FEATURE_ABMC)) { u32 eax, ebx, ecx, edx; /* QoS sub-leaf, EAX=0Fh, ECX=1 */ diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 5e3c41b36437..9f4c2f0aaf5c 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -37,6 +37,15 @@ struct arch_mbm_state { u64 prev_msr; }; +/* Setting bit 0 in L3_QOS_EXT_CFG enables the ABMC feature. */ +#define ABMC_ENABLE_BIT 0 + +/* + * Qos Event Identifiers. 
+ */ +#define ABMC_EXTENDED_EVT_ID BIT(31) +#define ABMC_EVT_ID BIT(0) + /** * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share * a resource for a control function @@ -54,15 +63,15 @@ struct rdt_hw_ctrl_domain { * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share * a resource for a monitor function * @d_resctrl: Properties exposed to the resctrl file system - * @arch_mbm_total: arch private state for MBM total bandwidth - * @arch_mbm_local: arch private state for MBM local bandwidth + * @arch_mbm_states: Per-event pointer to the MBM event's saved state. + * An MBM event's state is an array of struct arch_mbm_state + * indexed by RMID on x86. * * Members of this structure are accessed via helpers that provide abstraction. */ struct rdt_hw_mon_domain { struct rdt_mon_domain d_resctrl; - struct arch_mbm_state *arch_mbm_total; - struct arch_mbm_state *arch_mbm_local; + struct arch_mbm_state *arch_mbm_states[QOS_NUM_L3_MBM_EVENTS]; }; static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctrl_domain *r) @@ -102,6 +111,7 @@ struct msr_param { * @mon_scale: cqm counter * mon_scale = occupancy in bytes * @mbm_width: Monitor width, to detect and correct for overflow. * @cdp_enabled: CDP state of this resource + * @mbm_cntr_assign_enabled: ABMC feature is enabled * * Members of this structure are either private to the architecture * e.g. mbm_width, or accessed via helpers that provide abstraction. e.g. @@ -115,6 +125,7 @@ struct rdt_hw_resource { unsigned int mon_scale; unsigned int mbm_width; bool cdp_enabled; + bool mbm_cntr_assign_enabled; }; static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r) @@ -159,6 +170,42 @@ union cpuid_0x10_x_edx { unsigned int full; }; +/* + * ABMC counters are configured by writing to MSR_IA32_L3_QOS_ABMC_CFG. + * + * @bw_type : Event configuration that represents the memory + * transactions being tracked by the @cntr_id. 
+ * @bw_src : Bandwidth source (RMID or CLOSID). + * @reserved1 : Reserved. + * @is_clos : @bw_src field is a CLOSID (not an RMID). + * @cntr_id : Counter identifier. + * @reserved : Reserved. + * @cntr_en : Counting enable bit. + * @cfg_en : Configuration enable bit. + * + * Configuration and counting: + * Counter can be configured across multiple writes to MSR. Configuration + * is applied only when @cfg_en = 1. Counter @cntr_id is reset when the + * configuration is applied. + * @cfg_en = 1, @cntr_en = 0 : Apply @cntr_id configuration but do not + * count events. + * @cfg_en = 1, @cntr_en = 1 : Apply @cntr_id configuration and start + * counting events. + */ +union l3_qos_abmc_cfg { + struct { + unsigned long bw_type :32, + bw_src :12, + reserved1: 3, + is_clos : 1, + cntr_id : 5, + reserved : 9, + cntr_en : 1, + cfg_en : 1; + } split; + unsigned long full; +}; + void rdt_ctrl_update(void *arg); int rdt_get_mon_l3_config(struct rdt_resource *r); @@ -168,5 +215,6 @@ bool rdt_cpu_has(int flag); void __init intel_rdt_mbm_apply_quirk(void); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); +void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index c261558276cd..c8945610d455 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -31,11 +31,6 @@ */ bool rdt_mon_capable; -/* - * Global to indicate which monitoring events are enabled. 
- */ -unsigned int rdt_mon_features; - #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) static int snc_nodes_per_l3_cache = 1; @@ -135,7 +130,7 @@ static int logical_rmid_to_physical_rmid(int cpu, int lrmid) if (snc_nodes_per_l3_cache == 1) return lrmid; - return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid; + return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->mon.num_rmid; } static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val) @@ -166,18 +161,14 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_do u32 rmid, enum resctrl_event_id eventid) { - switch (eventid) { - case QOS_L3_OCCUP_EVENT_ID: - return NULL; - case QOS_L3_MBM_TOTAL_EVENT_ID: - return &hw_dom->arch_mbm_total[rmid]; - case QOS_L3_MBM_LOCAL_EVENT_ID: - return &hw_dom->arch_mbm_local[rmid]; - default: - /* Never expect to get here */ - WARN_ON_ONCE(1); + struct arch_mbm_state *state; + + if (!resctrl_is_mbm_event(eventid)) return NULL; - } + + state = hw_dom->arch_mbm_states[MBM_STATE_IDX(eventid)]; + + return state ? 
&state[rmid] : NULL; } void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, @@ -206,14 +197,16 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) { struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); - - if (resctrl_arch_is_mbm_total_enabled()) - memset(hw_dom->arch_mbm_total, 0, - sizeof(*hw_dom->arch_mbm_total) * r->num_rmid); - - if (resctrl_arch_is_mbm_local_enabled()) - memset(hw_dom->arch_mbm_local, 0, - sizeof(*hw_dom->arch_mbm_local) * r->num_rmid); + enum resctrl_event_id eventid; + int idx; + + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + memset(hw_dom->arch_mbm_states[idx], 0, + sizeof(*hw_dom->arch_mbm_states[0]) * r->mon.num_rmid); + } } static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) @@ -224,15 +217,33 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) return chunks >> shift; } +static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 rmid, enum resctrl_event_id eventid, u64 msr_val) +{ + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + struct arch_mbm_state *am; + u64 chunks; + + am = get_arch_mbm_state(hw_dom, rmid, eventid); + if (am) { + am->chunks += mbm_overflow_count(am->prev_msr, msr_val, + hw_res->mbm_width); + chunks = get_corrected_mbm_count(rmid, am->chunks); + am->prev_msr = msr_val; + } else { + chunks = msr_val; + } + + return chunks * hw_res->mon_scale; +} + int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, u32 unused, u32 rmid, enum resctrl_event_id eventid, u64 *val, void *ignored) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); int cpu = 
cpumask_any(&d->hdr.cpu_mask); - struct arch_mbm_state *am; - u64 msr_val, chunks; + u64 msr_val; u32 prmid; int ret; @@ -243,17 +254,76 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, if (ret) return ret; + *val = get_corrected_val(r, d, rmid, eventid, msr_val); + + return 0; +} + +static int __cntr_id_read(u32 cntr_id, u64 *val) +{ + u64 msr_val; + + /* + * QM_EVTSEL Register definition: + * ======================================================= + * Bits Mnemonic Description + * ======================================================= + * 63:44 -- Reserved + * 43:32 RMID RMID or counter ID in ABMC mode + * when reading an MBM event + * 31 ExtendedEvtID Extended Event Identifier + * 30:8 -- Reserved + * 7:0 EvtID Event Identifier + * ======================================================= + * The contents of a specific counter can be read by setting the + * following fields in QM_EVTSEL.ExtendedEvtID(=1) and + * QM_EVTSEL.EvtID = L3CacheABMC (=1) and setting QM_EVTSEL.RMID + * to the desired counter ID. Reading the QM_CTR then returns the + * contents of the specified counter. The RMID_VAL_ERROR bit is set + * if the counter configuration is invalid, or if an invalid counter + * ID is set in the QM_EVTSEL.RMID field. The RMID_VAL_UNAVAIL bit + * is set if the counter data is unavailable. 
+ */ + wrmsr(MSR_IA32_QM_EVTSEL, ABMC_EXTENDED_EVT_ID | ABMC_EVT_ID, cntr_id); + rdmsrl(MSR_IA32_QM_CTR, msr_val); + + if (msr_val & RMID_VAL_ERROR) + return -EIO; + if (msr_val & RMID_VAL_UNAVAIL) + return -EINVAL; + + *val = msr_val; + return 0; +} + +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 unused, u32 rmid, int cntr_id, + enum resctrl_event_id eventid) +{ + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct arch_mbm_state *am; + am = get_arch_mbm_state(hw_dom, rmid, eventid); if (am) { - am->chunks += mbm_overflow_count(am->prev_msr, msr_val, - hw_res->mbm_width); - chunks = get_corrected_mbm_count(rmid, am->chunks); - am->prev_msr = msr_val; - } else { - chunks = msr_val; + memset(am, 0, sizeof(*am)); + + /* Record any initial, non-zero count value. */ + __cntr_id_read(cntr_id, &am->prev_msr); } +} + +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 unused, u32 rmid, int cntr_id, + enum resctrl_event_id eventid, u64 *val) +{ + u64 msr_val; + int ret; + + ret = __cntr_id_read(cntr_id, &msr_val); + if (ret) + return ret; - *val = chunks * hw_res->mon_scale; + *val = get_corrected_val(r, d, rmid, eventid, msr_val); return 0; } @@ -346,12 +416,13 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); unsigned int threshold; + u32 eax, ebx, ecx, edx; snc_nodes_per_l3_cache = snc_get_config(); resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024; hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache; - r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; + r->mon.num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; hw_res->mbm_width = MBM_CNTR_WIDTH_BASE; if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) @@ -366,7 +437,7 @@ int __init 
rdt_get_mon_l3_config(struct rdt_resource *r) * * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. */ - threshold = resctrl_rmid_realloc_limit / r->num_rmid; + threshold = resctrl_rmid_realloc_limit / r->mon.num_rmid; /* * Because num_rmid may not be a power of two, round the value @@ -375,12 +446,17 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) */ resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold); - if (rdt_cpu_has(X86_FEATURE_BMEC)) { - u32 eax, ebx, ecx, edx; - + if (rdt_cpu_has(X86_FEATURE_BMEC) || rdt_cpu_has(X86_FEATURE_ABMC)) { /* Detect list of bandwidth sources that can be tracked */ cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx); - r->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; + r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; + } + + if (rdt_cpu_has(X86_FEATURE_ABMC)) { + r->mon.mbm_cntr_assignable = true; + cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx); + r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1; + hw_res->mbm_cntr_assign_enabled = true; } r->mon_capable = true; @@ -401,3 +477,91 @@ void __init intel_rdt_mbm_apply_quirk(void) mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold; mbm_cf = mbm_cf_table[cf_index].cf; } + +static void resctrl_abmc_set_one_amd(void *arg) +{ + bool *enable = arg; + + if (*enable) + msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT); + else + msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT); +} + +/* + * ABMC enable/disable requires update of L3_QOS_EXT_CFG MSR on all the CPUs + * associated with all monitor domains. 
+ */ +static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable) +{ + struct rdt_mon_domain *d; + + lockdep_assert_cpus_held(); + + list_for_each_entry(d, &r->mon_domains, hdr.list) { + on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_abmc_set_one_amd, + &enable, 1); + resctrl_arch_reset_rmid_all(r, d); + } +} + +int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + + if (r->mon.mbm_cntr_assignable && + hw_res->mbm_cntr_assign_enabled != enable) { + _resctrl_abmc_enable(r, enable); + hw_res->mbm_cntr_assign_enabled = enable; + } + + return 0; +} + +bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) +{ + return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled; +} + +static void resctrl_abmc_config_one_amd(void *info) +{ + union l3_qos_abmc_cfg *abmc_cfg = info; + + wrmsrl(MSR_IA32_L3_QOS_ABMC_CFG, abmc_cfg->full); +} + +/* + * Send an IPI to the domain to assign the counter to RMID, event pair. + */ +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign) +{ + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + union l3_qos_abmc_cfg abmc_cfg = { 0 }; + struct arch_mbm_state *am; + + abmc_cfg.split.cfg_en = 1; + abmc_cfg.split.cntr_en = assign ? 1 : 0; + abmc_cfg.split.cntr_id = cntr_id; + abmc_cfg.split.bw_src = rmid; + if (assign) + abmc_cfg.split.bw_type = resctrl_get_mon_evt_cfg(evtid); + + smp_call_function_any(&d->hdr.cpu_mask, resctrl_abmc_config_one_amd, &abmc_cfg, 1); + + /* + * The hardware counter is reset (because cfg_en == 1) so there is no + * need to record initial non-zero counts. 
+ */ + am = get_arch_mbm_state(hw_dom, rmid, evtid); + if (am) + memset(am, 0, sizeof(*am)); +} + +void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + + resctrl_abmc_set_one_amd(&hw_res->mbm_cntr_assign_enabled); +} diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index b4a1f6732a3a..4cee6213d667 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -48,8 +48,10 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_AMD_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, + { X86_FEATURE_COHERENCY_SFW_NO, CPUID_EBX, 31, 0x8000001f, 0 }, { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, + { X86_FEATURE_ABMC, CPUID_EBX, 5, 0x80000020, 0 }, { X86_FEATURE_TSA_SQ_NO, CPUID_ECX, 1, 0x80000021, 0 }, { X86_FEATURE_TSA_L1_NO, CPUID_ECX, 2, 0x80000021, 0 }, { X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 }, diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index e35ccdc84910..6073a16628f9 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -372,6 +372,19 @@ unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_uni return topo_unit_count(lvlid, at_level, apic_maps[which_units].map); } +#ifdef CONFIG_SMP +int topology_get_primary_thread(unsigned int cpu) +{ + u32 apic_id = cpuid_to_apicid[cpu]; + + /* + * Get the core domain level APIC id, which is the primary thread + * and return the CPU number assigned to it. 
+ */ + return topo_lookup_cpuid(topo_apicid(apic_id, TOPO_CORE_DOMAIN)); +} +#endif + #ifdef CONFIG_ACPI_HOTPLUG_CPU /** * topology_hotplug_apic - Handle a physical hotplugged APIC after boot diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index 843b1655ab45..6ac097e13106 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -59,7 +59,7 @@ static void store_node(struct topo_scan *tscan, u16 nr_nodes, u16 node_id) tscan->amd_node_id = node_id; } -static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext) +static bool parse_8000_001e(struct topo_scan *tscan) { struct { // eax @@ -81,20 +81,25 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext) cpuid_leaf(0x8000001e, &leaf); - tscan->c->topo.initial_apicid = leaf.ext_apic_id; - /* - * If leaf 0xb is available, then the domain shifts are set - * already and nothing to do here. Only valid for family >= 0x17. + * If leaf 0xb/0x26 is available, then the APIC ID and the domain + * shifts are set already. */ - if (!has_topoext && tscan->c->x86 >= 0x17) { + if (!cpu_feature_enabled(X86_FEATURE_XTOPOLOGY)) { + tscan->c->topo.initial_apicid = leaf.ext_apic_id; + /* - * Leaf 0x80000008 set the CORE domain shift already. - * Update the SMT domain, but do not propagate it. + * Leaf 0x80000008 sets the CORE domain shift but not the + * SMT domain shift. On CPUs with family >= 0x17, there + * might be hyperthreads. */ - unsigned int nthreads = leaf.core_nthreads + 1; + if (tscan->c->x86 >= 0x17) { + /* Update the SMT domain, but do not propagate it. 
*/ + unsigned int nthreads = leaf.core_nthreads + 1; - topology_update_dom(tscan, TOPO_SMT_DOMAIN, get_count_order(nthreads), nthreads); + topology_update_dom(tscan, TOPO_SMT_DOMAIN, + get_count_order(nthreads), nthreads); + } } store_node(tscan, leaf.nnodes_per_socket + 1, leaf.node_id); @@ -158,11 +163,12 @@ static void topoext_fixup(struct topo_scan *tscan) c->x86 != 0x15 || c->x86_model < 0x10 || c->x86_model > 0x6f) return; - if (msr_set_bit(0xc0011005, 54) <= 0) + if (msr_set_bit(MSR_AMD64_CPUID_EXT_FEAT, + MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT) <= 0) return; - rdmsrq(0xc0011005, msrval); - if (msrval & BIT_64(54)) { + rdmsrq(MSR_AMD64_CPUID_EXT_FEAT, msrval); + if (msrval & MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT) { set_cpu_cap(c, X86_FEATURE_TOPOEXT); pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); } @@ -170,27 +176,27 @@ static void topoext_fixup(struct topo_scan *tscan) static void parse_topology_amd(struct topo_scan *tscan) { - bool has_topoext = false; + if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) + tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); /* - * If the extended topology leaf 0x8000_001e is available - * try to get SMT, CORE, TILE, and DIE shifts from extended + * Try to get SMT, CORE, TILE, and DIE shifts from extended * CPUID leaf 0x8000_0026 on supported processors first. If * extended CPUID leaf 0x8000_0026 is not supported, try to - * get SMT and CORE shift from leaf 0xb first, then try to - * get the CORE shift from leaf 0x8000_0008. + * get SMT and CORE shift from leaf 0xb. If either leaf is + * available, cpu_parse_topology_ext() will return true. + * + * If XTOPOLOGY leaves (0x26/0xb) are not available, try to + * get the CORE shift from leaf 0x8000_0008 first. 
*/ - if (cpu_feature_enabled(X86_FEATURE_TOPOEXT)) - has_topoext = cpu_parse_topology_ext(tscan); - - if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) - tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); - - if (!has_topoext && !parse_8000_0008(tscan)) + if (!cpu_parse_topology_ext(tscan) && !parse_8000_0008(tscan)) return; - /* Prefer leaf 0x8000001e if available */ - if (parse_8000_001e(tscan, has_topoext)) + /* + * Prefer leaf 0x8000001e if available to get the SMT shift and + * the initial APIC ID if XTOPOLOGY leaves are not available. + */ + if (parse_8000_001e(tscan)) return; /* Try the NODEID MSR */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index aefd412a23dc..1f71cc135e9a 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -631,7 +631,7 @@ static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp) } /* Clone current's FPU state on fork */ -int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, +int fpu_clone(struct task_struct *dst, u64 clone_flags, bool minimal, unsigned long ssp) { /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 12ed75c1b567..28e4fd65c9da 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1881,19 +1881,20 @@ long fpu_xstate_prctl(int option, unsigned long arg2) #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 - * use in the task. + * use in the task. Report -1 if no AVX-512 usage. */ static void avx512_status(struct seq_file *m, struct task_struct *task) { - unsigned long timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp); - long delta; + unsigned long timestamp; + long delta = -1; - if (!timestamp) { - /* - * Report -1 if no AVX512 usage - */ - delta = -1; - } else { + /* AVX-512 usage is not tracked for kernel threads. Don't report anything. 
*/ + if (task->flags & (PF_KTHREAD | PF_USER_WORKER)) + return; + + timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp); + + if (timestamp) { delta = (long)(jiffies - timestamp); /* * Cap to LONG_MAX if time difference > LONG_MAX diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 533fcf5636fc..fd28b53dbac5 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -52,10 +52,13 @@ SYM_PIC_ALIAS(next_early_pgt); pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); unsigned int __pgtable_l5_enabled __ro_after_init; +SYM_PIC_ALIAS(__pgtable_l5_enabled); unsigned int pgdir_shift __ro_after_init = 39; EXPORT_SYMBOL(pgdir_shift); +SYM_PIC_ALIAS(pgdir_shift); unsigned int ptrs_per_p4d __ro_after_init = 1; EXPORT_SYMBOL(ptrs_per_p4d); +SYM_PIC_ALIAS(ptrs_per_p4d); unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4; EXPORT_SYMBOL(page_offset_base); @@ -316,5 +319,5 @@ void early_setup_idt(void) handler = vc_boot_ghcb; } - startup_64_load_idt(handler); + __pi_startup_64_load_idt(handler); } diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 76743dfad6ab..80ef5d386b03 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -61,7 +61,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE) * any particular GDT layout, because we load our own as soon as we * can. */ -__HEAD + __INIT SYM_CODE_START(startup_32) movl pa(initial_stack),%ecx @@ -136,6 +136,9 @@ SYM_CODE_END(startup_32) * If cpu hotplug is not supported then this code can go in init section * which will be freed later */ +#ifdef CONFIG_HOTPLUG_CPU + .text +#endif SYM_FUNC_START(startup_32_smp) cld movl $(__BOOT_DS),%eax diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 3e9b3a3bd039..21816b48537c 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -33,7 +33,7 @@ * because we need identity-mapped pages. 
*/ - __HEAD + __INIT .code64 SYM_CODE_START_NOALIGN(startup_64) UNWIND_HINT_END_OF_STACK @@ -71,7 +71,7 @@ SYM_CODE_START_NOALIGN(startup_64) xorl %edx, %edx wrmsr - call startup_64_setup_gdt_idt + call __pi_startup_64_setup_gdt_idt /* Now switch to __KERNEL_CS so IRET works reliably */ pushq $__KERNEL_CS @@ -91,7 +91,7 @@ SYM_CODE_START_NOALIGN(startup_64) * subsequent code. Pass the boot_params pointer as the first argument. */ movq %r15, %rdi - call sme_enable + call __pi_sme_enable #endif /* Sanitize CPU configuration */ @@ -111,7 +111,7 @@ SYM_CODE_START_NOALIGN(startup_64) * programmed into CR3. */ movq %r15, %rsi - call __startup_64 + call __pi___startup_64 /* Form the CR3 value being sure to include the CR3 modifier */ leaq early_top_pgt(%rip), %rcx @@ -562,7 +562,7 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb) /* Call C handler */ movq %rsp, %rdi movq ORIG_RAX(%rsp), %rsi - call do_vc_no_ghcb + call __pi_do_vc_no_ghcb /* Unwind pt_regs */ POP_REGS diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 6079d15dab8c..3863d7709386 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -339,7 +339,7 @@ static bool can_probe(unsigned long paddr) if (is_exception_insn(&insn)) return false; - if (IS_ENABLED(CONFIG_CFI_CLANG)) { + if (IS_ENABLED(CONFIG_CFI)) { /* * The compiler generates the following instruction sequence * for indirect call checks and cfi.c decodes this; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 1b7960cf6eb0..e3a3987b0c4f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -159,7 +159,7 @@ __visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs, int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { - unsigned long clone_flags = args->flags; + u64 clone_flags = args->flags; unsigned long sp = args->stack; unsigned long tls = args->tls; struct inactive_task_frame *frame; diff --git 
a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index 2ddf23387c7e..978232b6d48d 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -191,7 +191,7 @@ void reset_thread_features(void) current->thread.features_locked = 0; } -unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags, +unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, u64 clone_flags, unsigned long stack_size) { struct thread_shstk *shstk = &tsk->thread.shstk; @@ -246,6 +246,46 @@ static unsigned long get_user_shstk_addr(void) return ssp; } +int shstk_pop(u64 *val) +{ + int ret = 0; + u64 ssp; + + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return -ENOTSUPP; + + fpregs_lock_and_load(); + + rdmsrq(MSR_IA32_PL3_SSP, ssp); + if (val && get_user(*val, (__user u64 *)ssp)) + ret = -EFAULT; + else + wrmsrq(MSR_IA32_PL3_SSP, ssp + SS_FRAME_SIZE); + fpregs_unlock(); + + return ret; +} + +int shstk_push(u64 val) +{ + u64 ssp; + int ret; + + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return -ENOTSUPP; + + fpregs_lock_and_load(); + + rdmsrq(MSR_IA32_PL3_SSP, ssp); + ssp -= SS_FRAME_SIZE; + ret = write_user_shstk_64((__user void *)ssp, val); + if (!ret) + wrmsrq(MSR_IA32_PL3_SSP, ssp); + fpregs_unlock(); + + return ret; +} + #define SHSTK_DATA_BIT BIT(63) static int put_shstk_data(u64 __user *addr, u64 data) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 33e166f6ab12..eb289abece23 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -479,14 +479,14 @@ static int x86_cluster_flags(void) static bool x86_has_numa_in_package; static struct sched_domain_topology_level x86_topology[] = { - SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), #ifdef CONFIG_SCHED_CLUSTER - SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS), + SDTL_INIT(tl_cls_mask, x86_cluster_flags, CLS), #endif #ifdef CONFIG_SCHED_MC - SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC), + 
SDTL_INIT(tl_mc_mask, x86_core_flags, MC), #endif - SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG), + SDTL_INIT(tl_pkg_mask, x86_sched_itmt_flags, PKG), { NULL }, }; diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 5a4b21389b1d..d432f3824f0c 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -156,15 +156,26 @@ static int identify_insn(struct insn *insn) if (!insn->modrm.nbytes) return -EINVAL; - /* All the instructions of interest start with 0x0f. */ - if (insn->opcode.bytes[0] != 0xf) + /* The instructions of interest have 2-byte opcodes: 0F 00 or 0F 01. */ + if (insn->opcode.nbytes < 2 || insn->opcode.bytes[0] != 0xf) return -EINVAL; if (insn->opcode.bytes[1] == 0x1) { switch (X86_MODRM_REG(insn->modrm.value)) { case 0: + /* The reg form of 0F 01 /0 encodes VMX instructions. */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) + return -EINVAL; + return UMIP_INST_SGDT; case 1: + /* + * The reg form of 0F 01 /1 encodes MONITOR/MWAIT, + * STAC/CLAC, and ENCLS. + */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) + return -EINVAL; + return UMIP_INST_SIDT; case 4: return UMIP_INST_SMSW; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 6d383839e839..845aeaf36b8d 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -18,6 +18,7 @@ #include <asm/processor.h> #include <asm/insn.h> #include <asm/mmu_context.h> +#include <asm/nops.h> /* Post-execution fixups. 
*/ @@ -310,25 +311,32 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool #ifdef CONFIG_X86_64 +struct uretprobe_syscall_args { + unsigned long r11; + unsigned long cx; + unsigned long ax; +}; + asm ( ".pushsection .rodata\n" ".global uretprobe_trampoline_entry\n" "uretprobe_trampoline_entry:\n" - "pushq %rax\n" - "pushq %rcx\n" - "pushq %r11\n" - "movq $" __stringify(__NR_uretprobe) ", %rax\n" + "push %rax\n" + "push %rcx\n" + "push %r11\n" + "mov $" __stringify(__NR_uretprobe) ", %rax\n" "syscall\n" ".global uretprobe_syscall_check\n" "uretprobe_syscall_check:\n" - "popq %r11\n" - "popq %rcx\n" - - /* The uretprobe syscall replaces stored %rax value with final + "pop %r11\n" + "pop %rcx\n" + /* + * The uretprobe syscall replaces stored %rax value with final * return address, so we don't restore %rax in here and just * call ret. */ - "retq\n" + "ret\n" + "int3\n" ".global uretprobe_trampoline_end\n" "uretprobe_trampoline_end:\n" ".popsection\n" @@ -338,7 +346,7 @@ extern u8 uretprobe_trampoline_entry[]; extern u8 uretprobe_trampoline_end[]; extern u8 uretprobe_syscall_check[]; -void *arch_uprobe_trampoline(unsigned long *psize) +void *arch_uretprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; struct pt_regs *regs = task_pt_regs(current); @@ -365,7 +373,8 @@ static unsigned long trampoline_check_ip(unsigned long tramp) SYSCALL_DEFINE0(uretprobe) { struct pt_regs *regs = task_pt_regs(current); - unsigned long err, ip, sp, r11_cx_ax[3], tramp; + struct uretprobe_syscall_args args; + unsigned long err, ip, sp, tramp; /* If there's no trampoline, we are called from wrong place. 
*/ tramp = uprobe_get_trampoline_vaddr(); @@ -376,15 +385,15 @@ SYSCALL_DEFINE0(uretprobe) if (unlikely(regs->ip != trampoline_check_ip(tramp))) goto sigill; - err = copy_from_user(r11_cx_ax, (void __user *)regs->sp, sizeof(r11_cx_ax)); + err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args)); if (err) goto sigill; /* expose the "right" values of r11/cx/ax/sp to uprobe_consumer/s */ - regs->r11 = r11_cx_ax[0]; - regs->cx = r11_cx_ax[1]; - regs->ax = r11_cx_ax[2]; - regs->sp += sizeof(r11_cx_ax); + regs->r11 = args.r11; + regs->cx = args.cx; + regs->ax = args.ax; + regs->sp += sizeof(args); regs->orig_ax = -1; ip = regs->ip; @@ -400,21 +409,21 @@ SYSCALL_DEFINE0(uretprobe) */ if (regs->sp != sp || shstk_is_enabled()) return regs->ax; - regs->sp -= sizeof(r11_cx_ax); + regs->sp -= sizeof(args); /* for the case uprobe_consumer has changed r11/cx */ - r11_cx_ax[0] = regs->r11; - r11_cx_ax[1] = regs->cx; + args.r11 = regs->r11; + args.cx = regs->cx; /* * ax register is passed through as return value, so we can use * its space on stack for ip value and jump to it through the * trampoline's ret instruction */ - r11_cx_ax[2] = regs->ip; + args.ax = regs->ip; regs->ip = ip; - err = copy_to_user((void __user *)regs->sp, r11_cx_ax, sizeof(r11_cx_ax)); + err = copy_to_user((void __user *)regs->sp, &args, sizeof(args)); if (err) goto sigill; @@ -608,6 +617,581 @@ static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) *sr = utask->autask.saved_scratch_register; } } + +static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) +{ + return -EPERM; +} + +static struct page *tramp_mapping_pages[2] __ro_after_init; + +static struct vm_special_mapping tramp_mapping = { + .name = "[uprobes-trampoline]", + .mremap = tramp_mremap, + .pages = tramp_mapping_pages, +}; + +struct uprobe_trampoline { + struct hlist_node node; + unsigned long vaddr; +}; + +static bool is_reachable_by_call(unsigned long vtramp, unsigned long 
vaddr) +{ + long delta = (long)(vaddr + 5 - vtramp); + + return delta >= INT_MIN && delta <= INT_MAX; +} + +static unsigned long find_nearest_trampoline(unsigned long vaddr) +{ + struct vm_unmapped_area_info info = { + .length = PAGE_SIZE, + .align_mask = ~PAGE_MASK, + }; + unsigned long low_limit, high_limit; + unsigned long low_tramp, high_tramp; + unsigned long call_end = vaddr + 5; + + if (check_add_overflow(call_end, INT_MIN, &low_limit)) + low_limit = PAGE_SIZE; + + high_limit = call_end + INT_MAX; + + /* Search up from the caller address. */ + info.low_limit = call_end; + info.high_limit = min(high_limit, TASK_SIZE); + high_tramp = vm_unmapped_area(&info); + + /* Search down from the caller address. */ + info.low_limit = max(low_limit, PAGE_SIZE); + info.high_limit = call_end; + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + low_tramp = vm_unmapped_area(&info); + + if (IS_ERR_VALUE(high_tramp) && IS_ERR_VALUE(low_tramp)) + return -ENOMEM; + if (IS_ERR_VALUE(high_tramp)) + return low_tramp; + if (IS_ERR_VALUE(low_tramp)) + return high_tramp; + + /* Return address that's closest to the caller address. 
+ */ + if (call_end - low_tramp < high_tramp - call_end) + return low_tramp; + return high_tramp; +} + +static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr) +{ + struct pt_regs *regs = task_pt_regs(current); + struct mm_struct *mm = current->mm; + struct uprobe_trampoline *tramp; + struct vm_area_struct *vma; + + if (!user_64bit_mode(regs)) + return NULL; + + vaddr = find_nearest_trampoline(vaddr); + if (IS_ERR_VALUE(vaddr)) + return NULL; + + tramp = kzalloc(sizeof(*tramp), GFP_KERNEL); + if (unlikely(!tramp)) + return NULL; + + tramp->vaddr = vaddr; + vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE, + VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO, + &tramp_mapping); + if (IS_ERR(vma)) { + kfree(tramp); + return NULL; + } + return tramp; +} + +static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new) +{ + struct uprobes_state *state = &current->mm->uprobes_state; + struct uprobe_trampoline *tramp = NULL; + + if (vaddr > TASK_SIZE || vaddr < PAGE_SIZE) + return NULL; + + hlist_for_each_entry(tramp, &state->head_tramps, node) { + if (is_reachable_by_call(tramp->vaddr, vaddr)) { + *new = false; + return tramp; + } + } + + tramp = create_uprobe_trampoline(vaddr); + if (!tramp) + return NULL; + + *new = true; + hlist_add_head(&tramp->node, &state->head_tramps); + return tramp; +} + +static void destroy_uprobe_trampoline(struct uprobe_trampoline *tramp) +{ + /* + * We do not unmap and release uprobe trampoline page itself, + * because there's no easy way to make sure none of the threads + * is still inside the trampoline. 
+ */ + hlist_del(&tramp->node); + kfree(tramp); +} + +void arch_uprobe_init_state(struct mm_struct *mm) +{ + INIT_HLIST_HEAD(&mm->uprobes_state.head_tramps); +} + +void arch_uprobe_clear_state(struct mm_struct *mm) +{ + struct uprobes_state *state = &mm->uprobes_state; + struct uprobe_trampoline *tramp; + struct hlist_node *n; + + hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node) + destroy_uprobe_trampoline(tramp); +} + +static bool __in_uprobe_trampoline(unsigned long ip) +{ + struct vm_area_struct *vma = vma_lookup(current->mm, ip); + + return vma && vma_is_special_mapping(vma, &tramp_mapping); +} + +static bool in_uprobe_trampoline(unsigned long ip) +{ + struct mm_struct *mm = current->mm; + bool found, retry = true; + unsigned int seq; + + rcu_read_lock(); + if (mmap_lock_speculate_try_begin(mm, &seq)) { + found = __in_uprobe_trampoline(ip); + retry = mmap_lock_speculate_retry(mm, seq); + } + rcu_read_unlock(); + + if (retry) { + mmap_read_lock(mm); + found = __in_uprobe_trampoline(ip); + mmap_read_unlock(mm); + } + return found; +} + +/* + * See uprobe syscall trampoline; the call to the trampoline will push + * the return address on the stack, the trampoline itself then pushes + * cx, r11 and ax. + */ +struct uprobe_syscall_args { + unsigned long ax; + unsigned long r11; + unsigned long cx; + unsigned long retaddr; +}; + +SYSCALL_DEFINE0(uprobe) +{ + struct pt_regs *regs = task_pt_regs(current); + struct uprobe_syscall_args args; + unsigned long ip, sp, sret; + int err; + + /* Allow execution only from uprobe trampolines. 
*/ + if (!in_uprobe_trampoline(regs->ip)) + return -ENXIO; + + err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args)); + if (err) + goto sigill; + + ip = regs->ip; + + /* + * expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus: + * - adjust ip to the probe address, call saved next instruction address + * - adjust sp to the probe's stack frame (check trampoline code) + */ + regs->ax = args.ax; + regs->r11 = args.r11; + regs->cx = args.cx; + regs->ip = args.retaddr - 5; + regs->sp += sizeof(args); + regs->orig_ax = -1; + + sp = regs->sp; + + err = shstk_pop((u64 *)&sret); + if (err == -EFAULT || (!err && sret != args.retaddr)) + goto sigill; + + handle_syscall_uprobe(regs, regs->ip); + + /* + * Some of the uprobe consumers has changed sp, we can do nothing, + * just return via iret. + */ + if (regs->sp != sp) { + /* skip the trampoline call */ + if (args.retaddr - 5 == regs->ip) + regs->ip += 5; + return regs->ax; + } + + regs->sp -= sizeof(args); + + /* for the case uprobe_consumer has changed ax/r11/cx */ + args.ax = regs->ax; + args.r11 = regs->r11; + args.cx = regs->cx; + + /* keep return address unless we are instructed otherwise */ + if (args.retaddr - 5 != regs->ip) + args.retaddr = regs->ip; + + if (shstk_push(args.retaddr) == -EFAULT) + goto sigill; + + regs->ip = ip; + + err = copy_to_user((void __user *)regs->sp, &args, sizeof(args)); + if (err) + goto sigill; + + /* ensure sysret, see do_syscall_64() */ + regs->r11 = regs->flags; + regs->cx = regs->ip; + return 0; + +sigill: + force_sig(SIGILL); + return -1; +} + +asm ( + ".pushsection .rodata\n" + ".balign " __stringify(PAGE_SIZE) "\n" + "uprobe_trampoline_entry:\n" + "push %rcx\n" + "push %r11\n" + "push %rax\n" + "mov $" __stringify(__NR_uprobe) ", %rax\n" + "syscall\n" + "pop %rax\n" + "pop %r11\n" + "pop %rcx\n" + "ret\n" + "int3\n" + ".balign " __stringify(PAGE_SIZE) "\n" + ".popsection\n" +); + +extern u8 uprobe_trampoline_entry[]; + +static int __init 
arch_uprobes_init(void) +{ + tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry); + return 0; +} + +late_initcall(arch_uprobes_init); + +enum { + EXPECT_SWBP, + EXPECT_CALL, +}; + +struct write_opcode_ctx { + unsigned long base; + int expect; +}; + +static int is_call_insn(uprobe_opcode_t *insn) +{ + return *insn == CALL_INSN_OPCODE; +} + +/* + * Verification callback used by int3_update uprobe_write calls to make sure + * the underlying instruction is as expected - either int3 or call. + */ +static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode, + int nbytes, void *data) +{ + struct write_opcode_ctx *ctx = data; + uprobe_opcode_t old_opcode[5]; + + uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5); + + switch (ctx->expect) { + case EXPECT_SWBP: + if (is_swbp_insn(&old_opcode[0])) + return 1; + break; + case EXPECT_CALL: + if (is_call_insn(&old_opcode[0])) + return 1; + break; + } + + return -1; +} + +/* + * Modify multi-byte instructions by using INT3 breakpoints on SMP. + * We completely avoid using stop_machine() here, and achieve the + * synchronization using INT3 breakpoints and SMP cross-calls. + * (borrowed comment from smp_text_poke_batch_finish) + * + * The way it is done: + * - Add an INT3 trap to the address that will be patched + * - SMP sync all CPUs + * - Update all but the first byte of the patched range + * - SMP sync all CPUs + * - Replace the first byte (INT3) by the first byte of the replacing opcode + * - SMP sync all CPUs + */ +static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr, char *insn, bool optimize) +{ + uprobe_opcode_t int3 = UPROBE_SWBP_INSN; + struct write_opcode_ctx ctx = { + .base = vaddr, + }; + int err; + + /* + * Write int3 trap. + * + * The swbp_optimize path comes with breakpoint already installed, + * so we can skip this step for optimize == true. 
+ */ + if (!optimize) { + ctx.expect = EXPECT_CALL; + err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn, + true /* is_register */, false /* do_update_ref_ctr */, + &ctx); + if (err) + return err; + } + + smp_text_poke_sync_each_cpu(); + + /* Write all but the first byte of the patched range. */ + ctx.expect = EXPECT_SWBP; + err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, 4, verify_insn, + true /* is_register */, false /* do_update_ref_ctr */, + &ctx); + if (err) + return err; + + smp_text_poke_sync_each_cpu(); + + /* + * Write first byte. + * + * The swbp_unoptimize needs to finish uprobe removal together + * with ref_ctr update, using uprobe_write with proper flags. + */ + err = uprobe_write(auprobe, vma, vaddr, insn, 1, verify_insn, + optimize /* is_register */, !optimize /* do_update_ref_ctr */, + &ctx); + if (err) + return err; + + smp_text_poke_sync_each_cpu(); + return 0; +} + /* Optimize path: generate a CALL_INSN_OPCODE instruction from vaddr to the trampoline address tramp into a 5-byte buffer, then install it with int3_update() called with optimize = true. Returns the int3_update() result. */ +static int swbp_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr, unsigned long tramp) +{ + u8 call[5]; + + __text_gen_insn(call, CALL_INSN_OPCODE, (const void *) vaddr, + (const void *) tramp, CALL_INSN_SIZE); + return int3_update(auprobe, vma, vaddr, call, true /* optimize */); +} + /* Unoptimize path: hand auprobe->insn to int3_update() with optimize = false so it is written back at vaddr. Returns the int3_update() result. */ +static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + return int3_update(auprobe, vma, vaddr, auprobe->insn, false /* optimize */); +} + /* Copy len bytes located at vaddr in mm into dst. The page is obtained with get_user_page_vma_remote() and released with put_page() once copied. Returns 0 on success, or PTR_ERR() of the page lookup on failure. */ +static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len) +{ + unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD; + struct vm_area_struct *vma; + struct page *page; + + page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); + if (IS_ERR(page)) + return PTR_ERR(page); + uprobe_copy_from_page(page, vaddr, dst, len); + put_page(page); + return 0; +} + /* Return true when insn is a call instruction according to is_call_insn() and the address vaddr + 5 + raddr passes __in_uprobe_trampoline(). The packed struct overlays the buffer as one opcode byte followed by an s32 relative field. */ +static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr) +{ + struct __packed __arch_relative_insn { + u8 op; + s32 raddr; + } *call = (struct 
__arch_relative_insn *) insn; + + if (!is_call_insn(insn)) + return false; + return __in_uprobe_trampoline(vaddr + 5 + call->raddr); +} + /* Read 5 bytes at vaddr from mm and check them with __is_optimized(). Returns the copy_from_vaddr() error if the read fails, otherwise the __is_optimized() result as 0 or 1. */ +static int is_optimized(struct mm_struct *mm, unsigned long vaddr) +{ + uprobe_opcode_t insn[5]; + int err; + + err = copy_from_vaddr(mm, vaddr, &insn, 5); + if (err) + return err; + return __is_optimized((uprobe_opcode_t *)&insn, vaddr); +} + /* True only when ARCH_UPROBE_FLAG_CAN_OPTIMIZE is set and ARCH_UPROBE_FLAG_OPTIMIZE_FAIL is clear in auprobe->flags. */ +static bool should_optimize(struct arch_uprobe *auprobe) +{ + return !test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) && + test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); +} + /* Write UPROBE_SWBP_INSN at vaddr through uprobe_write_opcode(). When should_optimize() holds, is_optimized() is consulted first: an error from it is returned as is, and an already optimized location is left alone with a return of 0. */ +int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + if (should_optimize(auprobe)) { + /* + * We could race with another thread that already optimized the probe, + * so let's not overwrite it with int3 again in this case. + */ + int ret = is_optimized(vma->vm_mm, vaddr); + if (ret < 0) + return ret; + if (ret) + return 0; + } + return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, + true /* is_register */); +} + /* Put the original instruction back at vaddr. When ARCH_UPROBE_FLAG_CAN_OPTIMIZE is set and is_optimized() reports the location as optimized, go through swbp_unoptimize() and warn once if it fails; otherwise write the first opcode of auprobe->insn with uprobe_write_opcode(). */ +int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) { + int ret = is_optimized(vma->vm_mm, vaddr); + if (ret < 0) + return ret; + if (ret) { + ret = swbp_unoptimize(auprobe, vma, vaddr); + WARN_ON_ONCE(ret); + return ret; + } + } + return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn, + false /* is_register */); +} + /* Find the vma for vaddr, obtain a trampoline through get_uprobe_trampoline() and patch the call in with swbp_optimize(). If patching fails, the trampoline is destroyed only when the new flag handed to get_uprobe_trampoline() ended up set. Returns -EINVAL when the vma or trampoline lookup fails, otherwise the swbp_optimize() result. */ +static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm, + unsigned long vaddr) +{ + struct uprobe_trampoline *tramp; + struct vm_area_struct *vma; + bool new = false; + int err = 0; + + vma = find_vma(mm, vaddr); + if (!vma) + return -EINVAL; + tramp = get_uprobe_trampoline(vaddr, &new); + if (!tramp) + return -EINVAL; + err = swbp_optimize(auprobe, vma, vaddr, tramp->vaddr); + if (WARN_ON_ONCE(err) && new) + destroy_uprobe_trampoline(tramp); + 
return err; +} + /* Attempt to optimize the uprobe at vaddr in current->mm. Returns early without taking any lock when should_optimize() is false; everything after that check runs between mmap_write_lock() and mmap_write_unlock(). */ +void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ + struct mm_struct *mm = current->mm; + uprobe_opcode_t insn[5]; + + if (!should_optimize(auprobe)) + return; + + mmap_write_lock(mm); + + /* + * Check if some other thread already optimized the uprobe for us, + * if it's the case just go away silently. + */ + if (copy_from_vaddr(mm, vaddr, &insn, 5)) + goto unlock; + if (!is_swbp_insn((uprobe_opcode_t*) &insn)) + goto unlock; + + /* + * If we fail to optimize the uprobe we set the fail bit so the + * above should_optimize will fail from now on. + */ + if (__arch_uprobe_optimize(auprobe, mm, vaddr)) + set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags); + +unlock: + mmap_write_unlock(mm); +} + /* Matches an instruction whose opcode is the single byte 0x90. */ +static bool insn_is_nop(struct insn *insn) +{ + return insn->opcode.nbytes == 1 && insn->opcode.bytes[0] == 0x90; +} + /* Matches an instruction with the two-byte opcode 0f 1f that carries a ModRM byte whose reg field is 0. */ +static bool insn_is_nopl(struct insn *insn) +{ + if (insn->opcode.nbytes != 2) + return false; + + if (insn->opcode.bytes[0] != 0x0f || insn->opcode.bytes[1] != 0x1f) + return false; + + if (!insn->modrm.nbytes) + return false; + + if (X86_MODRM_REG(insn->modrm.bytes[0]) != 0) + return false; + + /* 0f 1f /0 - NOPL */ + return true; +} + /* An instruction qualifies only if insn->x86_64 is set, its length is 5, it is accepted by insn_is_nop() or insn_is_nopl(), and at least 5 bytes remain in the page starting at vaddr. */ +static bool can_optimize(struct insn *insn, unsigned long vaddr) +{ + if (!insn->x86_64 || insn->length != 5) + return false; + + if (!insn_is_nop(insn) && !insn_is_nopl(insn)) + return false; + + /* We can't do cross page atomic writes yet. 
*/ + return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5; +} #else /* 32-bit: */ /* * No RIP-relative addressing on 32-bit @@ -621,6 +1205,10 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { } +static bool can_optimize(struct insn *insn, unsigned long vaddr) +{ + return false; +} #endif /* CONFIG_X86_64 */ struct uprobe_xol_ops { @@ -979,14 +1567,17 @@ static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) */ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { - struct insn insn; u8 fix_ip_or_call = UPROBE_FIX_IP; + struct insn insn; int ret; ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm)); if (ret) return ret; + if (can_optimize(&insn, addr)) + set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); + ret = branch_setup_xol_ops(auprobe, &insn); if (ret != -ENOSYS) return ret; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 4fa0be732af1..d7af4a64c211 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -160,11 +160,6 @@ SECTIONS } :text = 0xcccccccc - /* bootstrapping code */ - .head.text : AT(ADDR(.head.text) - LOAD_OFFSET) { - HEAD_TEXT - } :text = 0xcccccccc - /* End of text section, which should occupy whole number of pages */ _etext = .; . = ALIGN(PAGE_SIZE); @@ -227,6 +222,8 @@ SECTIONS */ .altinstr_aux : AT(ADDR(.altinstr_aux) - LOAD_OFFSET) { *(.altinstr_aux) + . = ALIGN(PAGE_SIZE); + __inittext_end = .; } INIT_DATA_SECTION(16) @@ -535,3 +532,5 @@ xen_elfnote_entry_value = xen_elfnote_phys32_entry_value = ABSOLUTE(xen_elfnote_phys32_entry) + ABSOLUTE(pvh_start_xen - LOAD_OFFSET); #endif + +#include "../boot/startup/exports.h" |