Diffstat (limited to 'arch')
148 files changed, 4212 insertions, 2411 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index 97642c08a124..5440616f0774 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -41,6 +41,44 @@ config HOTPLUG_SMT config SMT_NUM_THREADS_DYNAMIC bool +config ARCH_SUPPORTS_SCHED_SMT + bool + +config ARCH_SUPPORTS_SCHED_CLUSTER + bool + +config ARCH_SUPPORTS_SCHED_MC + bool + +config SCHED_SMT + bool "SMT (Hyperthreading) scheduler support" + depends on ARCH_SUPPORTS_SCHED_SMT + default y + help + Improves the CPU scheduler's decision making when dealing with + MultiThreading at a cost of slightly increased overhead in some + places. If unsure say N here. + +config SCHED_CLUSTER + bool "Cluster scheduler support" + depends on ARCH_SUPPORTS_SCHED_CLUSTER + default y + help + Cluster scheduler support improves the CPU scheduler's decision + making when dealing with machines that have clusters of CPUs. + Cluster usually means a couple of CPUs which are placed closely + by sharing mid-level caches, last-level cache tags or internal + busses. + +config SCHED_MC + bool "Multi-Core Cache (MC) scheduler support" + depends on ARCH_SUPPORTS_SCHED_MC + default y + help + Multi-core scheduler support improves the CPU scheduler's decision + making when dealing with multi-core CPU chips at a cost of slightly + increased overhead in some places. If unsure say N here. + # Selected by HOTPLUG_CORE_SYNC_DEAD or HOTPLUG_CORE_SYNC_FULL config HOTPLUG_CORE_SYNC bool @@ -1619,7 +1657,7 @@ config HAVE_SPARSE_SYSCALL_NR related optimizations for a given architecture. config ARCH_HAS_VDSO_ARCH_DATA - depends on GENERIC_VDSO_DATA_STORE + depends on HAVE_GENERIC_VDSO bool config ARCH_HAS_VDSO_TIME_DATA @@ -1740,6 +1778,10 @@ config ARCH_VMLINUX_NEEDS_RELOCS relocations preserved. This is used by some architectures to construct bespoke relocation tables for KASLR. +# Select if architecture uses the common generic TIF bits +config HAVE_GENERIC_TIF_BITS + bool + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/arch/alpha/kernel/asm-offsets.c b/arch/alpha/kernel/asm-offsets.c index e9dad60b147f..1ebb05890499 100644 --- a/arch/alpha/kernel/asm-offsets.c +++ b/arch/alpha/kernel/asm-offsets.c @@ -4,6 +4,7 @@ * This code generates raw asm output which is post-processed to extract * and format the required data. */ +#define COMPILE_OFFSETS #include <linux/types.h> #include <linux/stddef.h> diff --git a/arch/arc/kernel/asm-offsets.c b/arch/arc/kernel/asm-offsets.c index f77deb799175..2978da85fcb6 100644 --- a/arch/arc/kernel/asm-offsets.c +++ b/arch/arc/kernel/asm-offsets.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) */ +#define COMPILE_OFFSETS #include <linux/sched.h> #include <linux/mm.h> diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 36ab8625be72..358057001859 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -941,28 +941,14 @@ config IRQSTACKS config ARM_CPU_TOPOLOGY bool "Support cpu topology definition" depends on SMP && CPU_V7 + select ARCH_SUPPORTS_SCHED_MC + select ARCH_SUPPORTS_SCHED_SMT default y help Support ARM cpu topology definition. The MPIDR register defines affinity between processors which is then used to describe the cpu topology of an ARM System. -config SCHED_MC - bool "Multi-core scheduler support" - depends on ARM_CPU_TOPOLOGY - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. If unsure say N here. 
- -config SCHED_SMT - bool "SMT scheduler support" - depends on ARM_CPU_TOPOLOGY - help - Improves the CPU scheduler's decision making when dealing with - MultiThreading at a cost of slightly increased overhead in some - places. If unsure say N here. - config HAVE_ARM_SCU bool help diff --git a/arch/arm/include/asm/vdso/vsyscall.h b/arch/arm/include/asm/vdso/vsyscall.h index 4e7226ad02ec..ff1c729af05f 100644 --- a/arch/arm/include/asm/vdso/vsyscall.h +++ b/arch/arm/include/asm/vdso/vsyscall.h @@ -7,8 +7,6 @@ #include <vdso/datapage.h> #include <asm/cacheflush.h> -extern bool cntvct_ok; - static __always_inline void __arch_sync_vdso_time_data(struct vdso_time_data *vdata) { diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index 123f4a8ef446..2101938d27fc 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -7,6 +7,8 @@ * This code generates raw asm output which is post-processed to extract * and format the required data. */ +#define COMPILE_OFFSETS + #include <linux/compiler.h> #include <linux/sched.h> #include <linux/mm.h> diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index 325448ffbba0..e38a30477f3d 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -54,11 +54,9 @@ struct elfinfo { char *dynstr; /* ptr to .dynstr section */ }; -/* Cached result of boot-time check for whether the arch timer exists, - * and if so, whether the virtual counter is useable. +/* Boot-time check for whether the arch timer exists, and if so, + * whether the virtual counter is usable. */ -bool cntvct_ok __ro_after_init; - static bool __init cntvct_functional(void) { struct device_node *np; @@ -159,7 +157,7 @@ static void __init patch_vdso(void *ehdr) * want programs to incur the slight additional overhead of * dispatching through the VDSO only to fall back to syscalls. 
*/ - if (!cntvct_ok) { + if (!cntvct_functional()) { vdso_nullpatch_one(&einfo, "__vdso_gettimeofday"); vdso_nullpatch_one(&einfo, "__vdso_clock_gettime"); vdso_nullpatch_one(&einfo, "__vdso_clock_gettime64"); @@ -197,8 +195,6 @@ static int __init vdso_init(void) vdso_total_pages = VDSO_NR_PAGES; /* for the data/vvar pages */ vdso_total_pages += text_pages; - cntvct_ok = cntvct_functional(); - patch_vdso(vdso_start); return 0; diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index 5c1023a6d78c..7b27ee9482b3 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -926,9 +926,7 @@ config VDSO default y if ARM_ARCH_TIMER select HAVE_GENERIC_VDSO select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_32 select GENERIC_GETTIMEOFDAY - select GENERIC_VDSO_DATA_STORE help Place in the process address space an ELF shared object providing fast implementations of gettimeofday and diff --git a/arch/arm/probes/uprobes/core.c b/arch/arm/probes/uprobes/core.c index 885e0c5e8c20..3d96fb41d624 100644 --- a/arch/arm/probes/uprobes/core.c +++ b/arch/arm/probes/uprobes/core.c @@ -30,7 +30,7 @@ int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { return uprobe_write_opcode(auprobe, vma, vaddr, - __opcode_to_mem_arm(auprobe->bpinsn)); + __opcode_to_mem_arm(auprobe->bpinsn), true); } bool arch_uprobe_ignore(struct arch_uprobe *auprobe, struct pt_regs *regs) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f828781aa840..b3e13f67d598 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -108,6 +108,9 @@ config ARM64 select ARCH_SUPPORTS_PER_VMA_LOCK select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_RT + select ARCH_SUPPORTS_SCHED_SMT + select ARCH_SUPPORTS_SCHED_CLUSTER + select ARCH_SUPPORTS_SCHED_MC select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT select ARCH_WANT_DEFAULT_BPF_JIT @@ -163,8 +166,6 @@ config ARM64 select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY - select GENERIC_VDSO_DATA_STORE - select GENERIC_VDSO_TIME_NS select HARDIRQS_SW_RESEND select HAS_IOPORT select HAVE_MOVE_PMD @@ -1507,29 +1508,6 @@ config CPU_LITTLE_ENDIAN endchoice -config SCHED_MC - bool "Multi-core scheduler support" - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. If unsure say N here. - -config SCHED_CLUSTER - bool "Cluster scheduler support" - help - Cluster scheduler support improves the CPU scheduler's decision - making when dealing with machines that have clusters of CPUs. - Cluster usually means a couple of CPUs which are placed closely - by sharing mid-level caches, last-level cache tags or internal - busses. - -config SCHED_SMT - bool "SMT scheduler support" - help - Improves the CPU scheduler's decision making when dealing with - MultiThreading at a cost of slightly increased overhead in some - places. If unsure say N here. 
- config NR_CPUS int "Maximum number of CPUs (2-4096)" range 2 4096 @@ -1770,7 +1748,6 @@ config COMPAT_VDSO bool "Enable vDSO for 32-bit applications" depends on !CPU_BIG_ENDIAN depends on (CC_IS_CLANG && LD_IS_LLD) || "$(CROSS_COMPILE_COMPAT)" != "" - select GENERIC_COMPAT_VDSO default y help Place in the process address space of 32-bit applications an diff --git a/arch/arm64/include/asm/vdso/compat_barrier.h b/arch/arm64/include/asm/vdso/compat_barrier.h index 3ac35f4a667c..6d75e03d3827 100644 --- a/arch/arm64/include/asm/vdso/compat_barrier.h +++ b/arch/arm64/include/asm/vdso/compat_barrier.h @@ -7,11 +7,10 @@ #ifndef __ASSEMBLY__ /* - * Warning: This code is meant to be used with - * ENABLE_COMPAT_VDSO only. + * Warning: This code is meant to be used from the compat vDSO only. */ -#ifndef ENABLE_COMPAT_VDSO -#error This header is meant to be used with ENABLE_COMPAT_VDSO only +#ifdef __arch64__ +#error This header is meant to be used with from the compat vDSO only #endif #ifdef dmb diff --git a/arch/arm64/include/asm/vdso/compat_gettimeofday.h b/arch/arm64/include/asm/vdso/compat_gettimeofday.h index d60ea7a72a9c..7d1a116549b1 100644 --- a/arch/arm64/include/asm/vdso/compat_gettimeofday.h +++ b/arch/arm64/include/asm/vdso/compat_gettimeofday.h @@ -2,8 +2,8 @@ /* * Copyright (C) 2018 ARM Limited */ -#ifndef __ASM_VDSO_GETTIMEOFDAY_H -#define __ASM_VDSO_GETTIMEOFDAY_H +#ifndef __ASM_VDSO_COMPAT_GETTIMEOFDAY_H +#define __ASM_VDSO_COMPAT_GETTIMEOFDAY_H #ifndef __ASSEMBLY__ @@ -163,4 +163,4 @@ static inline bool vdso_clocksource_ok(const struct vdso_clock *vc) #endif /* !__ASSEMBLY__ */ -#endif /* __ASM_VDSO_GETTIMEOFDAY_H */ +#endif /* __ASM_VDSO_COMPAT_GETTIMEOFDAY_H */ diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h index da1ab8759592..c59e84105b43 100644 --- a/arch/arm64/include/asm/vdso/gettimeofday.h +++ b/arch/arm64/include/asm/vdso/gettimeofday.h @@ -5,6 +5,8 @@ #ifndef __ASM_VDSO_GETTIMEOFDAY_H #define __ASM_VDSO_GETTIMEOFDAY_H +#ifdef __aarch64__ + #ifndef __ASSEMBLY__ #include <asm/alternative.h> @@ -96,4 +98,10 @@ static __always_inline const struct vdso_time_data *__arch_get_vdso_u_time_data( #endif /* !__ASSEMBLY__ */ +#else /* !__aarch64__ */ + +#include "compat_gettimeofday.h" + +#endif /* __aarch64__ */ + #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 30d4bbe68661..b6367ff3a49c 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -6,6 +6,7 @@ * 2001-2002 Keith Owens * Copyright (C) 2012 ARM Ltd. 
*/ +#define COMPILE_OFFSETS #include <linux/arm_sdei.h> #include <linux/sched.h> diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index 5de4deaf4299..ffa3536581f6 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -57,7 +57,6 @@ VDSO_CAFLAGS += -DDISABLE_BRANCH_PROFILING VDSO_CAFLAGS += -march=armv8-a VDSO_CFLAGS := $(VDSO_CAFLAGS) -VDSO_CFLAGS += -DENABLE_COMPAT_VDSO=1 # KBUILD_CFLAGS from top-level Makefile VDSO_CFLAGS += -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ -fno-strict-aliasing -fno-common \ diff --git a/arch/arm64/net/Makefile b/arch/arm64/net/Makefile index 5c540efb7d9b..3ae382bfca87 100644 --- a/arch/arm64/net/Makefile +++ b/arch/arm64/net/Makefile @@ -2,4 +2,4 @@ # # ARM64 networking code # -obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o +obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o bpf_timed_may_goto.o diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 28996e0a9b00..ab83089c3d8f 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1066,19 +1066,53 @@ static void build_epilogue(struct jit_ctx *ctx, bool was_classic) emit(A64_RET(A64_LR), ctx); } -#define BPF_FIXUP_OFFSET_MASK GENMASK(26, 0) +/* + * Metadata encoding for exception handling in JITed code. + * + * Format of `fixup` field in `struct exception_table_entry`: + * + * Bit layout of `fixup` (32-bit): + * + * +-----------+--------+-----------+-----------+----------+ + * | 31-27 | 26-22 | 21 | 20-16 | 15-0 | + * | | | | | | + * | FIXUP_REG | Unused | ARENA_ACC | ARENA_REG | OFFSET | + * +-----------+--------+-----------+-----------+----------+ + * + * - OFFSET (16 bits): Offset used to compute address for Load/Store instruction. + * - ARENA_REG (5 bits): Register that is used to calculate the address for load/store when + * accessing the arena region. + * - ARENA_ACCESS (1 bit): This bit is set when the faulting instruction accessed the arena region. + * - FIXUP_REG (5 bits): Destination register for the load instruction (cleared on fault) or set to + * DONT_CLEAR if it is a store instruction. 
+ */ + +#define BPF_FIXUP_OFFSET_MASK GENMASK(15, 0) +#define BPF_FIXUP_ARENA_REG_MASK GENMASK(20, 16) +#define BPF_ARENA_ACCESS BIT(21) #define BPF_FIXUP_REG_MASK GENMASK(31, 27) #define DONT_CLEAR 5 /* Unused ARM64 register from BPF's POV */ bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs) { - off_t offset = FIELD_GET(BPF_FIXUP_OFFSET_MASK, ex->fixup); int dst_reg = FIELD_GET(BPF_FIXUP_REG_MASK, ex->fixup); + s16 off = FIELD_GET(BPF_FIXUP_OFFSET_MASK, ex->fixup); + int arena_reg = FIELD_GET(BPF_FIXUP_ARENA_REG_MASK, ex->fixup); + bool is_arena = !!(ex->fixup & BPF_ARENA_ACCESS); + bool is_write = (dst_reg == DONT_CLEAR); + unsigned long addr; + + if (is_arena) { + addr = regs->regs[arena_reg] + off; + bpf_prog_report_arena_violation(is_write, addr, regs->pc); + } if (dst_reg != DONT_CLEAR) regs->regs[dst_reg] = 0; - regs->pc = (unsigned long)&ex->fixup - offset; + /* Skip the faulting instruction */ + regs->pc += AARCH64_INSN_SIZE; + return true; } @@ -1088,7 +1122,9 @@ static int add_exception_handler(const struct bpf_insn *insn, int dst_reg) { off_t ins_offset; - off_t fixup_offset; + s16 off = insn->off; + bool is_arena; + int arena_reg; unsigned long pc; struct exception_table_entry *ex; @@ -1097,11 +1133,16 @@ static int add_exception_handler(const struct bpf_insn *insn, return 0; if (BPF_MODE(insn->code) != BPF_PROBE_MEM && - BPF_MODE(insn->code) != BPF_PROBE_MEMSX && - BPF_MODE(insn->code) != BPF_PROBE_MEM32 && - BPF_MODE(insn->code) != BPF_PROBE_ATOMIC) + BPF_MODE(insn->code) != BPF_PROBE_MEMSX && + BPF_MODE(insn->code) != BPF_PROBE_MEM32 && + BPF_MODE(insn->code) != BPF_PROBE_MEM32SX && + BPF_MODE(insn->code) != BPF_PROBE_ATOMIC) return 0; + is_arena = (BPF_MODE(insn->code) == BPF_PROBE_MEM32) || + (BPF_MODE(insn->code) == BPF_PROBE_MEM32SX) || + (BPF_MODE(insn->code) == BPF_PROBE_ATOMIC); + if (!ctx->prog->aux->extable || WARN_ON_ONCE(ctx->exentry_idx >= ctx->prog->aux->num_exentries)) return -EINVAL; @@ -1120,22 +1161,6 @@ static int add_exception_handler(const struct bpf_insn *insn, return -ERANGE; /* - * Since the extable follows the program, the fixup offset is always - * negative and limited to BPF_JIT_REGION_SIZE. Store a positive value - * to keep things simple, and put the destination register in the upper - * bits. We don't need to worry about buildtime or runtime sort - * modifying the upper bits because the table is already sorted, and - * isn't part of the main exception table. - * - * The fixup_offset is set to the next instruction from the instruction - * that may fault. The execution will jump to this after handling the - * fault. - */ - fixup_offset = (long)&ex->fixup - (pc + AARCH64_INSN_SIZE); - if (!FIELD_FIT(BPF_FIXUP_OFFSET_MASK, fixup_offset)) - return -ERANGE; - - /* * The offsets above have been calculated using the RO buffer but we * need to use the R/W buffer for writes. * switch ex to rw buffer for writing. @@ -1147,8 +1172,26 @@ static int add_exception_handler(const struct bpf_insn *insn, if (BPF_CLASS(insn->code) != BPF_LDX) dst_reg = DONT_CLEAR; - ex->fixup = FIELD_PREP(BPF_FIXUP_OFFSET_MASK, fixup_offset) | - FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg); + ex->fixup = FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg); + + if (is_arena) { + ex->fixup |= BPF_ARENA_ACCESS; + /* + * insn->src_reg/dst_reg holds the address in the arena region with upper 32-bits + * being zero because of a preceding addr_space_cast(r<n>, 0x0, 0x1) instruction. 
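As a reading aid for the new exception-table encoding above, here is a small standalone sketch (plain userspace C, not kernel code) that packs and unpacks a fixup word the same way the JIT and ex_handler_bpf() do. The GENMASK32/FIELD_*32 helpers are local stand-ins for the kernel's GENMASK(), FIELD_PREP() and FIELD_GET(), and the register numbers and offset are made-up example values.

/* Standalone sketch, not kernel code: mirrors how the arm64 JIT packs the
 * 32-bit fixup word and how ex_handler_bpf() unpacks it at fault time.
 */
#include <stdint.h>
#include <stdio.h>

#define GENMASK32(h, l)         ((0xffffffffu >> (31 - (h))) & ~((1u << (l)) - 1u))
#define FIELD_PREP32(mask, val) (((uint32_t)(val) << __builtin_ctz(mask)) & (mask))
#define FIELD_GET32(mask, reg)  (((reg) & (mask)) >> __builtin_ctz(mask))

#define BPF_FIXUP_OFFSET_MASK    GENMASK32(15, 0)
#define BPF_FIXUP_ARENA_REG_MASK GENMASK32(20, 16)
#define BPF_ARENA_ACCESS         (1u << 21)
#define BPF_FIXUP_REG_MASK       GENMASK32(31, 27)
#define DONT_CLEAR               5	/* store: no destination register to clear */

int main(void)
{
	/* encode: arena load into x7, address base in x2, 16-bit offset -8 */
	uint32_t fixup = FIELD_PREP32(BPF_FIXUP_REG_MASK, 7) |
			 BPF_ARENA_ACCESS |
			 FIELD_PREP32(BPF_FIXUP_ARENA_REG_MASK, 2) |
			 FIELD_PREP32(BPF_FIXUP_OFFSET_MASK, (uint16_t)-8);

	/* decode: what ex_handler_bpf() recovers when the access faults */
	int     dst_reg   = FIELD_GET32(BPF_FIXUP_REG_MASK, fixup);
	int     arena_reg = FIELD_GET32(BPF_FIXUP_ARENA_REG_MASK, fixup);
	int16_t off       = (int16_t)FIELD_GET32(BPF_FIXUP_OFFSET_MASK, fixup);
	int     is_arena  = !!(fixup & BPF_ARENA_ACCESS);
	int     is_write  = (dst_reg == DONT_CLEAR);

	printf("dst=x%d arena_reg=x%d off=%d arena=%d write=%d\n",
	       dst_reg, arena_reg, (int)off, is_arena, is_write);
	return 0;
}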
+ * This address is adjusted with the addition of arena_vm_start (see the + * implementation of BPF_PROBE_MEM32 and BPF_PROBE_ATOMIC) before being used for the + * memory access. Pass the reg holding the unmodified 32-bit address to + * ex_handler_bpf. + */ + if (BPF_CLASS(insn->code) == BPF_LDX) + arena_reg = bpf2a64[insn->src_reg]; + else + arena_reg = bpf2a64[insn->dst_reg]; + + ex->fixup |= FIELD_PREP(BPF_FIXUP_OFFSET_MASK, off) | + FIELD_PREP(BPF_FIXUP_ARENA_REG_MASK, arena_reg); + } ex->type = EX_TYPE_BPF; @@ -1558,7 +1601,13 @@ emit_cond_jmp: if (ret < 0) return ret; emit_call(func_addr, ctx); - emit(A64_MOV(1, r0, A64_R(0)), ctx); + /* + * Call to arch_bpf_timed_may_goto() is emitted by the + * verifier and called with custom calling convention with + * first argument and return value in BPF_REG_AX (x9). + */ + if (func_addr != (u64)arch_bpf_timed_may_goto) + emit(A64_MOV(1, r0, A64_R(0)), ctx); break; } /* tail call */ @@ -1612,7 +1661,11 @@ emit_cond_jmp: case BPF_LDX | BPF_PROBE_MEM32 | BPF_H: case BPF_LDX | BPF_PROBE_MEM32 | BPF_W: case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW: - if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) { + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_B: + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_H: + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_W: + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32 || + BPF_MODE(insn->code) == BPF_PROBE_MEM32SX) { emit(A64_ADD(1, tmp2, src, arena_vm_base), ctx); src = tmp2; } @@ -1624,7 +1677,8 @@ emit_cond_jmp: off_adj = off; } sign_extend = (BPF_MODE(insn->code) == BPF_MEMSX || - BPF_MODE(insn->code) == BPF_PROBE_MEMSX); + BPF_MODE(insn->code) == BPF_PROBE_MEMSX || + BPF_MODE(insn->code) == BPF_PROBE_MEM32SX); switch (BPF_SIZE(code)) { case BPF_W: if (is_lsi_offset(off_adj, 2)) { @@ -1832,9 +1886,11 @@ emit_cond_jmp: if (ret) return ret; - ret = add_exception_handler(insn, ctx, dst); - if (ret) - return ret; + if (BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) { + ret = add_exception_handler(insn, ctx, dst); + if (ret) + return ret; + } break; default: @@ -2767,7 +2823,6 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, goto out; } - bpf_flush_icache(ro_image, ro_image + size); out: kvfree(image); return ret; @@ -3038,6 +3093,11 @@ bool bpf_jit_bypass_spec_v4(void) return true; } +bool bpf_jit_supports_timed_may_goto(void) +{ + return true; +} + bool bpf_jit_inlines_helper_call(s32 imm) { switch (imm) { @@ -3064,8 +3124,7 @@ void bpf_jit_free(struct bpf_prog *prog) * before freeing it. */ if (jit_data) { - bpf_arch_text_copy(&jit_data->ro_header->size, &jit_data->header->size, - sizeof(jit_data->header->size)); + bpf_jit_binary_pack_finalize(jit_data->ro_header, jit_data->header); kfree(jit_data); } prog->bpf_func -= cfi_get_offset(); diff --git a/arch/arm64/net/bpf_timed_may_goto.S b/arch/arm64/net/bpf_timed_may_goto.S new file mode 100644 index 000000000000..894cfcd7b241 --- /dev/null +++ b/arch/arm64/net/bpf_timed_may_goto.S @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025 Puranjay Mohan <puranjay@kernel.org> */ + +#include <linux/linkage.h> + +SYM_FUNC_START(arch_bpf_timed_may_goto) + /* Allocate stack space and emit frame record */ + stp x29, x30, [sp, #-64]! + mov x29, sp + + /* Save BPF registers R0 - R5 (x7, x0-x4)*/ + stp x7, x0, [sp, #16] + stp x1, x2, [sp, #32] + stp x3, x4, [sp, #48] + + /* + * Stack depth was passed in BPF_REG_AX (x9), add it to the BPF_FP + * (x25) to get the pointer to count and timestamp and pass it as the + * first argument in x0. 
+ * + * Before generating the call to arch_bpf_timed_may_goto, the verifier + * generates a load instruction using FP, i.e. REG_AX = *(u64 *)(FP - + * stack_off_cnt), so BPF_REG_FP (x25) is always set up by the arm64 + * jit in this case. + */ + add x0, x9, x25 + bl bpf_check_timed_may_goto + /* BPF_REG_AX(x9) will be stored into count, so move return value to it. */ + mov x9, x0 + + /* Restore BPF registers R0 - R5 (x7, x0-x4) */ + ldp x7, x0, [sp, #16] + ldp x1, x2, [sp, #32] + ldp x3, x4, [sp, #48] + + /* Restore FP and LR */ + ldp x29, x30, [sp], #64 + + ret +SYM_FUNC_END(arch_bpf_timed_may_goto) diff --git a/arch/csky/kernel/asm-offsets.c b/arch/csky/kernel/asm-offsets.c index d1e903579473..5525c8e7e1d9 100644 --- a/arch/csky/kernel/asm-offsets.c +++ b/arch/csky/kernel/asm-offsets.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. +#define COMPILE_OFFSETS #include <linux/sched.h> #include <linux/kernel_stat.h> diff --git a/arch/hexagon/kernel/asm-offsets.c b/arch/hexagon/kernel/asm-offsets.c index 03a7063f9456..50eea9fa6f13 100644 --- a/arch/hexagon/kernel/asm-offsets.c +++ b/arch/hexagon/kernel/asm-offsets.c @@ -8,6 +8,7 @@ * * Copyright (c) 2010-2012, The Linux Foundation. All rights reserved. */ +#define COMPILE_OFFSETS #include <linux/compat.h> #include <linux/types.h> diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 0631a6b11281..e6225539579f 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -70,6 +70,8 @@ config LOONGARCH select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_SUPPORTS_RT + select ARCH_SUPPORTS_SCHED_SMT if SMP + select ARCH_SUPPORTS_SCHED_MC if SMP select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_MEMTEST @@ -108,8 +110,6 @@ config LOONGARCH select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_DATA_STORE - select GENERIC_VDSO_TIME_NS select GPIOLIB select HAS_IOPORT select HAVE_ARCH_AUDITSYSCALL @@ -140,6 +140,7 @@ config LOONGARCH select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS if !ARCH_STRICT_ALIGN select HAVE_EXIT_THREAD + select HAVE_GENERIC_TIF_BITS select HAVE_GUP_FAST select HAVE_FTRACE_GRAPH_FUNC select HAVE_FUNCTION_ARG_ACCESS_API @@ -452,23 +453,6 @@ config EFI_STUB This kernel feature allows the kernel to be loaded directly by EFI firmware without the use of a bootloader. -config SCHED_SMT - bool "SMT scheduler support" - depends on SMP - default y - help - Improves scheduler's performance when there are multiple - threads in one physical core. - -config SCHED_MC - bool "Multi-core scheduler support" - depends on SMP - default y - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. 
- config SMP bool "Multi-Processing support" help diff --git a/arch/loongarch/include/asm/thread_info.h b/arch/loongarch/include/asm/thread_info.h index 9dfa2ef00816..4d7117fcdc78 100644 --- a/arch/loongarch/include/asm/thread_info.h +++ b/arch/loongarch/include/asm/thread_info.h @@ -65,50 +65,42 @@ register unsigned long current_stack_pointer __asm__("$sp"); * access * - pending work-to-be-done flags are in LSW * - other flags in MSW + * + * Tell the generic TIF infrastructure which special bits loongarch supports */ -#define TIF_NEED_RESCHED 0 /* rescheduling necessary */ -#define TIF_NEED_RESCHED_LAZY 1 /* lazy rescheduling necessary */ -#define TIF_SIGPENDING 2 /* signal pending */ -#define TIF_NOTIFY_RESUME 3 /* callback before returning to user */ -#define TIF_NOTIFY_SIGNAL 4 /* signal notifications exist */ -#define TIF_RESTORE_SIGMASK 5 /* restore signal mask in do_signal() */ -#define TIF_NOHZ 6 /* in adaptive nohz mode */ -#define TIF_UPROBE 7 /* breakpointed or singlestepping */ -#define TIF_USEDFPU 8 /* FPU was used by this task this quantum (SMP) */ -#define TIF_USEDSIMD 9 /* SIMD has been used this quantum */ -#define TIF_MEMDIE 10 /* is terminating due to OOM killer */ -#define TIF_FIXADE 11 /* Fix address errors in software */ -#define TIF_LOGADE 12 /* Log address errors to syslog */ -#define TIF_32BIT_REGS 13 /* 32-bit general purpose registers */ -#define TIF_32BIT_ADDR 14 /* 32-bit address space */ -#define TIF_LOAD_WATCH 15 /* If set, load watch registers */ -#define TIF_SINGLESTEP 16 /* Single Step */ -#define TIF_LSX_CTX_LIVE 17 /* LSX context must be preserved */ -#define TIF_LASX_CTX_LIVE 18 /* LASX context must be preserved */ -#define TIF_USEDLBT 19 /* LBT was used by this task this quantum (SMP) */ -#define TIF_LBT_CTX_LIVE 20 /* LBT context must be preserved */ -#define TIF_PATCH_PENDING 21 /* pending live patching update */ +#define HAVE_TIF_NEED_RESCHED_LAZY +#define HAVE_TIF_RESTORE_SIGMASK + +#include <asm-generic/thread_info_tif.h> + +/* Architecture specific bits */ +#define TIF_NOHZ 16 /* in adaptive nohz mode */ +#define TIF_USEDFPU 17 /* FPU was used by this task this quantum (SMP) */ +#define TIF_USEDSIMD 18 /* SIMD has been used this quantum */ +#define TIF_FIXADE 19 /* Fix address errors in software */ +#define TIF_LOGADE 20 /* Log address errors to syslog */ +#define TIF_32BIT_REGS 21 /* 32-bit general purpose registers */ +#define TIF_32BIT_ADDR 22 /* 32-bit address space */ +#define TIF_LOAD_WATCH 23 /* If set, load watch registers */ +#define TIF_SINGLESTEP 24 /* Single Step */ +#define TIF_LSX_CTX_LIVE 25 /* LSX context must be preserved */ +#define TIF_LASX_CTX_LIVE 26 /* LASX context must be preserved */ +#define TIF_USEDLBT 27 /* LBT was used by this task this quantum (SMP) */ +#define TIF_LBT_CTX_LIVE 28 /* LBT context must be preserved */ -#define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) -#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY) -#define _TIF_SIGPENDING (1<<TIF_SIGPENDING) -#define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) -#define _TIF_NOTIFY_SIGNAL (1<<TIF_NOTIFY_SIGNAL) -#define _TIF_NOHZ (1<<TIF_NOHZ) -#define _TIF_UPROBE (1<<TIF_UPROBE) -#define _TIF_USEDFPU (1<<TIF_USEDFPU) -#define _TIF_USEDSIMD (1<<TIF_USEDSIMD) -#define _TIF_FIXADE (1<<TIF_FIXADE) -#define _TIF_LOGADE (1<<TIF_LOGADE) -#define _TIF_32BIT_REGS (1<<TIF_32BIT_REGS) -#define _TIF_32BIT_ADDR (1<<TIF_32BIT_ADDR) -#define _TIF_LOAD_WATCH (1<<TIF_LOAD_WATCH) -#define _TIF_SINGLESTEP (1<<TIF_SINGLESTEP) -#define _TIF_LSX_CTX_LIVE (1<<TIF_LSX_CTX_LIVE) 
-#define _TIF_LASX_CTX_LIVE (1<<TIF_LASX_CTX_LIVE) -#define _TIF_USEDLBT (1<<TIF_USEDLBT) -#define _TIF_LBT_CTX_LIVE (1<<TIF_LBT_CTX_LIVE) -#define _TIF_PATCH_PENDING (1<<TIF_PATCH_PENDING) +#define _TIF_NOHZ BIT(TIF_NOHZ) +#define _TIF_USEDFPU BIT(TIF_USEDFPU) +#define _TIF_USEDSIMD BIT(TIF_USEDSIMD) +#define _TIF_FIXADE BIT(TIF_FIXADE) +#define _TIF_LOGADE BIT(TIF_LOGADE) +#define _TIF_32BIT_REGS BIT(TIF_32BIT_REGS) +#define _TIF_32BIT_ADDR BIT(TIF_32BIT_ADDR) +#define _TIF_LOAD_WATCH BIT(TIF_LOAD_WATCH) +#define _TIF_SINGLESTEP BIT(TIF_SINGLESTEP) +#define _TIF_LSX_CTX_LIVE BIT(TIF_LSX_CTX_LIVE) +#define _TIF_LASX_CTX_LIVE BIT(TIF_LASX_CTX_LIVE) +#define _TIF_USEDLBT BIT(TIF_USEDLBT) +#define _TIF_LBT_CTX_LIVE BIT(TIF_LBT_CTX_LIVE) #endif /* __KERNEL__ */ #endif /* _ASM_THREAD_INFO_H */ diff --git a/arch/loongarch/kernel/asm-offsets.c b/arch/loongarch/kernel/asm-offsets.c index db1e4bb26b6a..3017c7157600 100644 --- a/arch/loongarch/kernel/asm-offsets.c +++ b/arch/loongarch/kernel/asm-offsets.c @@ -4,6 +4,8 @@ * * Copyright (C) 2020-2022 Loongson Technology Corporation Limited */ +#define COMPILE_OFFSETS + #include <linux/types.h> #include <linux/sched.h> #include <linux/mm.h> diff --git a/arch/loongarch/kernel/time.c b/arch/loongarch/kernel/time.c index f3092f2de8b5..6fb92cc1a4c9 100644 --- a/arch/loongarch/kernel/time.c +++ b/arch/loongarch/kernel/time.c @@ -112,8 +112,6 @@ static int arch_timer_starting(unsigned int cpu) static int arch_timer_dying(unsigned int cpu) { - constant_set_state_shutdown(this_cpu_ptr(&constant_clockevent_device)); - /* Clear Timer Interrupt */ write_csr_tintclear(CSR_TINTCLR_TI); diff --git a/arch/m68k/kernel/asm-offsets.c b/arch/m68k/kernel/asm-offsets.c index 906d73230537..67a1990f9d74 100644 --- a/arch/m68k/kernel/asm-offsets.c +++ b/arch/m68k/kernel/asm-offsets.c @@ -9,6 +9,7 @@ * #defines from the assembly-language output. */ +#define COMPILE_OFFSETS #define ASM_OFFSETS_C #include <linux/stddef.h> diff --git a/arch/microblaze/kernel/asm-offsets.c b/arch/microblaze/kernel/asm-offsets.c index 104c3ac5f30c..b4b67d58e7f6 100644 --- a/arch/microblaze/kernel/asm-offsets.c +++ b/arch/microblaze/kernel/asm-offsets.c @@ -7,6 +7,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. */ +#define COMPILE_OFFSETS #include <linux/init.h> #include <linux/stddef.h> diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index caf508f6e9ec..608e01ed6cff 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -51,7 +51,6 @@ config MIPS select GENERIC_SMP_IDLE_THREAD select GENERIC_IDLE_POLL_SETUP select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_DATA_STORE select GUP_GET_PXX_LOW_HIGH if CPU_MIPS32 && PHYS_ADDR_T_64BIT select HAS_IOPORT if !NO_IOPORT_MAP || ISA select HAVE_ARCH_COMPILER_H @@ -2223,7 +2222,7 @@ config MIPS_MT_SMP select SMP select SMP_UP select SYS_SUPPORTS_SMP - select SYS_SUPPORTS_SCHED_SMT + select ARCH_SUPPORTS_SCHED_SMT select MIPS_PERF_SHARED_TC_COUNTERS help This is a kernel model which is known as SMVP. This is supported @@ -2235,18 +2234,6 @@ config MIPS_MT_SMP config MIPS_MT bool -config SCHED_SMT - bool "SMT (multithreading) scheduler support" - depends on SYS_SUPPORTS_SCHED_SMT - default n - help - SMT scheduler support improves the CPU scheduler's decision making - when dealing with MIPS MT enabled cores at a cost of slightly - increased overhead in some places. If unsure say N here. 
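Stepping back to the thread_info.h conversion a little further up (loongarch here, riscv later in this diff): after the change an architecture only declares which optional generic TIF bits it uses, pulls the common flag definitions from asm-generic/thread_info_tif.h, and numbers its private bits from 16 upwards. A minimal sketch of the resulting layout, condensed from the loongarch hunk above (not a new architecture):

/* Tell the generic header which optional bits this architecture wants. */
#define HAVE_TIF_NEED_RESCHED_LAZY
#define HAVE_TIF_RESTORE_SIGMASK

#include <asm-generic/thread_info_tif.h>	/* common TIF_* and _TIF_* flags */

/* Architecture-specific bits start above the generic ones. */
#define TIF_NOHZ	16	/* in adaptive nohz mode */
#define TIF_USEDFPU	17	/* FPU was used by this task this quantum (SMP) */

#define _TIF_NOHZ	BIT(TIF_NOHZ)
#define _TIF_USEDFPU	BIT(TIF_USEDFPU)

The riscv hunk below follows the same pattern, with its compat and vector-restore bits at 16 and 17.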
- -config SYS_SUPPORTS_SCHED_SMT - bool - config SYS_SUPPORTS_MULTITHREADING bool @@ -2318,7 +2305,7 @@ config MIPS_CPS select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU select SYNC_R4K if (CEVT_R4K || CSRC_R4K) select SYS_SUPPORTS_HOTPLUG_CPU - select SYS_SUPPORTS_SCHED_SMT if CPU_MIPSR6 + select ARCH_SUPPORTS_SCHED_SMT if CPU_MIPSR6 select SYS_SUPPORTS_SMP select WEAK_ORDERING select GENERIC_IRQ_MIGRATION if HOTPLUG_CPU diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index 1e29efcba46e..5debd9a3854a 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -9,6 +9,8 @@ * Kevin Kissell, kevink@mips.com and Carsten Langgaard, carstenl@mips.com * Copyright (C) 2000 MIPS Technologies, Inc. */ +#define COMPILE_OFFSETS + #include <linux/compat.h> #include <linux/types.h> #include <linux/sched.h> diff --git a/arch/nios2/kernel/asm-offsets.c b/arch/nios2/kernel/asm-offsets.c index e3d9b7b6fb48..88190b503ce5 100644 --- a/arch/nios2/kernel/asm-offsets.c +++ b/arch/nios2/kernel/asm-offsets.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2011 Tobias Klauser <tklauser@distanz.ch> */ +#define COMPILE_OFFSETS #include <linux/stddef.h> #include <linux/sched.h> diff --git a/arch/openrisc/kernel/asm-offsets.c b/arch/openrisc/kernel/asm-offsets.c index 710651d5aaae..3cc826f2216b 100644 --- a/arch/openrisc/kernel/asm-offsets.c +++ b/arch/openrisc/kernel/asm-offsets.c @@ -18,6 +18,7 @@ * compile this file to assembler, and then extract the * #defines from the assembly-language output. */ +#define COMPILE_OFFSETS #include <linux/signal.h> #include <linux/sched.h> diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 2efa4b08b7b8..0940c162f1f7 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -44,6 +44,7 @@ config PARISC select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_SMP_IDLE_THREAD select GENERIC_ARCH_TOPOLOGY if SMP + select ARCH_SUPPORTS_SCHED_MC if SMP && PA8X00 select GENERIC_CPU_DEVICES if !SMP select GENERIC_LIB_DEVMEM_IS_ALLOWED select SYSCTL_ARCH_UNALIGN_ALLOW @@ -319,14 +320,6 @@ config SMP If you don't know what to do here, say N. -config SCHED_MC - bool "Multi-core scheduler support" - depends on GENERIC_ARCH_TOPOLOGY && PA8X00 - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. If unsure say N here. 
- config IRQSTACKS bool "Use separate kernel stacks when processing interrupts" default y diff --git a/arch/parisc/kernel/asm-offsets.c b/arch/parisc/kernel/asm-offsets.c index 757816a7bd4b..9abfe65492c6 100644 --- a/arch/parisc/kernel/asm-offsets.c +++ b/arch/parisc/kernel/asm-offsets.c @@ -13,6 +13,7 @@ * Copyright (C) 2002 Randolph Chung <tausq with parisc-linux.org> * Copyright (C) 2003 James Bottomley <jejb at parisc-linux.org> */ +#define COMPILE_OFFSETS #include <linux/types.h> #include <linux/sched.h> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 325c1171894d..22173a897670 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -170,6 +170,9 @@ config PPC select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx + select ARCH_SUPPORTS_SCHED_MC if SMP + select ARCH_SUPPORTS_SCHED_SMT if PPC64 && SMP + select SCHED_MC if ARCH_SUPPORTS_SCHED_MC select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if PPC64 select ARCH_USE_MEMTEST @@ -207,8 +210,6 @@ config PPC select GENERIC_PCI_IOMAP if PCI select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_DATA_STORE - select GENERIC_VDSO_TIME_NS select HAS_IOPORT if PCI select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HUGE_VMALLOC if HAVE_ARCH_HUGE_VMAP @@ -965,14 +966,6 @@ config PPC_PROT_SAO_LPAR config PPC_COPRO_BASE bool -config SCHED_SMT - bool "SMT (Hyperthreading) scheduler support" - depends on PPC64 && SMP - help - SMT scheduler support improves the CPU scheduler's decision making - when dealing with POWER5 cpus at a cost of slightly increased - overhead in some places. If unsure say N here. - config PPC_DENORMALISATION bool "PowerPC denormalisation exception handling" depends on PPC_BOOK3S_64 diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index da15b5efe807..f19ca44512d1 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -131,6 +131,8 @@ static inline int cpu_to_coregroup_id(int cpu) #ifdef CONFIG_SMP #include <asm/cputable.h> +struct cpumask *cpu_coregroup_mask(int cpu); + #ifdef CONFIG_PPC64 #include <asm/smp.h> diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index b3048f6d3822..a4bc80b30410 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -8,6 +8,7 @@ * compile this file to assembler, and then extract the * #defines from the assembly-language output. */ +#define COMPILE_OFFSETS #include <linux/compat.h> #include <linux/signal.h> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index f59e4b9cc207..68edb66c2964 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1028,19 +1028,19 @@ static int powerpc_shared_proc_flags(void) * We can't just pass cpu_l2_cache_mask() directly because * returns a non-const pointer and the compiler barfs on that. 
*/ -static const struct cpumask *shared_cache_mask(int cpu) +static const struct cpumask *tl_cache_mask(struct sched_domain_topology_level *tl, int cpu) { return per_cpu(cpu_l2_cache_map, cpu); } #ifdef CONFIG_SCHED_SMT -static const struct cpumask *smallcore_smt_mask(int cpu) +static const struct cpumask *tl_smallcore_smt_mask(struct sched_domain_topology_level *tl, int cpu) { return cpu_smallcore_mask(cpu); } #endif -static struct cpumask *cpu_coregroup_mask(int cpu) +struct cpumask *cpu_coregroup_mask(int cpu) { return per_cpu(cpu_coregroup_map, cpu); } @@ -1054,11 +1054,6 @@ static bool has_coregroup_support(void) return coregroup_enabled; } -static const struct cpumask *cpu_mc_mask(int cpu) -{ - return cpu_coregroup_mask(cpu); -} - static int __init init_big_cores(void) { int cpu; @@ -1448,7 +1443,7 @@ static bool update_mask_by_l2(int cpu, cpumask_var_t *mask) return false; } - cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu)); + cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu)); /* Update l2-cache mask with all the CPUs that are part of submask */ or_cpumasks_related(cpu, cpu, submask_fn, cpu_l2_cache_mask); @@ -1538,7 +1533,7 @@ static void update_coregroup_mask(int cpu, cpumask_var_t *mask) return; } - cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu)); + cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu)); /* Update coregroup mask with all the CPUs that are part of submask */ or_cpumasks_related(cpu, cpu, submask_fn, cpu_coregroup_mask); @@ -1601,7 +1596,7 @@ static void add_cpu_to_masks(int cpu) /* If chip_id is -1; limit the cpu_core_mask to within PKG */ if (chip_id == -1) - cpumask_and(mask, mask, cpu_cpu_mask(cpu)); + cpumask_and(mask, mask, cpu_node_mask(cpu)); for_each_cpu(i, mask) { if (chip_id == cpu_to_chip_id(i)) { @@ -1701,22 +1696,22 @@ static void __init build_sched_topology(void) if (has_big_cores) { pr_info("Big cores detected but using small core scheduling\n"); powerpc_topology[i++] = - SDTL_INIT(smallcore_smt_mask, powerpc_smt_flags, SMT); + SDTL_INIT(tl_smallcore_smt_mask, powerpc_smt_flags, SMT); } else { - powerpc_topology[i++] = SDTL_INIT(cpu_smt_mask, powerpc_smt_flags, SMT); + powerpc_topology[i++] = SDTL_INIT(tl_smt_mask, powerpc_smt_flags, SMT); } #endif if (shared_caches) { powerpc_topology[i++] = - SDTL_INIT(shared_cache_mask, powerpc_shared_cache_flags, CACHE); + SDTL_INIT(tl_cache_mask, powerpc_shared_cache_flags, CACHE); } if (has_coregroup_support()) { powerpc_topology[i++] = - SDTL_INIT(cpu_mc_mask, powerpc_shared_proc_flags, MC); + SDTL_INIT(tl_mc_mask, powerpc_shared_proc_flags, MC); } - powerpc_topology[i++] = SDTL_INIT(cpu_cpu_mask, powerpc_shared_proc_flags, PKG); + powerpc_topology[i++] = SDTL_INIT(tl_pkg_mask, powerpc_shared_proc_flags, PKG); /* There must be one trailing NULL entry left. 
*/ BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1); diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 28d00744cbb5..2181dde50d6e 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -53,7 +53,7 @@ config RISCV select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN - select ARCH_HAS_VDSO_ARCH_DATA if GENERIC_VDSO_DATA_STORE + select ARCH_HAS_VDSO_ARCH_DATA if HAVE_GENERIC_VDSO select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_KEEP_MEMBLOCK if ACPI select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if 64BIT && MMU @@ -74,6 +74,7 @@ config RISCV select ARCH_SUPPORTS_PER_VMA_LOCK if MMU select ARCH_SUPPORTS_RT select ARCH_SUPPORTS_SHADOW_CALL_STACK if HAVE_SHADOW_CALL_STACK + select ARCH_SUPPORTS_SCHED_MC if SMP select ARCH_USE_CMPXCHG_LOCKREF if 64BIT select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS @@ -109,7 +110,7 @@ config RISCV select GENERIC_CPU_VULNERABILITIES select GENERIC_EARLY_IOREMAP select GENERIC_ENTRY - select GENERIC_GETTIMEOFDAY if HAVE_GENERIC_VDSO + select GENERIC_GETTIMEOFDAY if HAVE_GENERIC_VDSO && 64BIT select GENERIC_IDLE_POLL_SETUP select GENERIC_IOREMAP if MMU select GENERIC_IRQ_IPI if SMP @@ -122,9 +123,7 @@ config RISCV select GENERIC_PCI_IOMAP select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD - select GENERIC_TIME_VSYSCALL if MMU && 64BIT - select GENERIC_VDSO_DATA_STORE if MMU - select GENERIC_VDSO_TIME_NS if HAVE_GENERIC_VDSO + select GENERIC_TIME_VSYSCALL if GENERIC_GETTIMEOFDAY select HARDIRQS_SW_RESEND select HAS_IOPORT if MMU select HAVE_ALIGNED_STRUCT_PAGE @@ -163,11 +162,12 @@ config RISCV select HAVE_FUNCTION_GRAPH_FREGS select HAVE_FUNCTION_TRACER if !XIP_KERNEL && HAVE_DYNAMIC_FTRACE select HAVE_EBPF_JIT if MMU + select HAVE_GENERIC_TIF_BITS select HAVE_GUP_FAST if MMU select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUNCTION_ERROR_INJECTION select HAVE_GCC_PLUGINS - select HAVE_GENERIC_VDSO if MMU && 64BIT + select HAVE_GENERIC_VDSO if MMU select HAVE_IRQ_TIME_ACCOUNTING select HAVE_KERNEL_BZIP2 if !XIP_KERNEL && !EFI_ZBOOT select HAVE_KERNEL_GZIP if !XIP_KERNEL && !EFI_ZBOOT @@ -223,7 +223,7 @@ config RISCV select THREAD_INFO_IN_TASK select TRACE_IRQFLAGS_SUPPORT select UACCESS_MEMCPY if !MMU - select VDSO_GETRANDOM if HAVE_GENERIC_VDSO + select VDSO_GETRANDOM if HAVE_GENERIC_VDSO && 64BIT select USER_STACKTRACE_SUPPORT select ZONE_DMA32 if 64BIT @@ -455,14 +455,6 @@ config SMP If you don't know what to do here, say N. -config SCHED_MC - bool "Multi-core scheduler support" - depends on SMP - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. If unsure say N here. 
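One detail of the powerpc scheduler-topology hunk above that is easy to miss: the per-level cpumask callbacks now take the sched_domain_topology_level they belong to in addition to the CPU, and the table is assembled from SDTL_INIT() entries, with generic helpers such as tl_pkg_mask() used directly. A hedged sketch of what a single arch-defined level could look like under that signature; arch_l2_siblings, tl_arch_l2_mask() and arch_l2_flags() are illustrative placeholders, not symbols from this diff.

/* Sketch only: one arch-defined topology level with the new callback
 * signature seen in the powerpc hunk.
 */
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/sched/topology.h>

static DEFINE_PER_CPU(cpumask_var_t, arch_l2_siblings);

/* mask callbacks now receive the topology level as well as the CPU */
static const struct cpumask *
tl_arch_l2_mask(struct sched_domain_topology_level *tl, int cpu)
{
	return per_cpu(arch_l2_siblings, cpu);
}

static int arch_l2_flags(void)
{
	return SD_SHARE_LLC;	/* example flag for a shared-cache level */
}

static struct sched_domain_topology_level arch_topology[] = {
	SDTL_INIT(tl_arch_l2_mask, arch_l2_flags, CACHE),
	SDTL_INIT(tl_pkg_mask, NULL, PKG),	/* generic package-wide mask */
	{ NULL, },
};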
- config NR_CPUS int "Maximum number of CPUs (2-512)" depends on SMP @@ -716,7 +708,6 @@ config TOOLCHAIN_HAS_ZACAS config RISCV_ISA_ZACAS bool "Zacas extension support for atomic CAS" - depends on TOOLCHAIN_HAS_ZACAS depends on RISCV_ALTERNATIVE default y help diff --git a/arch/riscv/boot/dts/sophgo/sg2042.dtsi b/arch/riscv/boot/dts/sophgo/sg2042.dtsi index b3e4d3c18fdc..6430c6e25c00 100644 --- a/arch/riscv/boot/dts/sophgo/sg2042.dtsi +++ b/arch/riscv/boot/dts/sophgo/sg2042.dtsi @@ -190,7 +190,7 @@ reg-names = "clr", "doorbell"; msi-controller; #msi-cells = <0>; - msi-ranges = <&intc 64 IRQ_TYPE_LEVEL_HIGH 32>; + msi-ranges = <&intc 64 IRQ_TYPE_EDGE_RISING 32>; }; rpgate: clock-controller@7030010368 { diff --git a/arch/riscv/boot/dts/sophgo/sg2044.dtsi b/arch/riscv/boot/dts/sophgo/sg2044.dtsi index 6ec955744b0c..320c4d1d08e6 100644 --- a/arch/riscv/boot/dts/sophgo/sg2044.dtsi +++ b/arch/riscv/boot/dts/sophgo/sg2044.dtsi @@ -214,7 +214,7 @@ reg-names = "clr", "doorbell"; #msi-cells = <0>; msi-controller; - msi-ranges = <&intc 352 IRQ_TYPE_LEVEL_HIGH 512>; + msi-ranges = <&intc 352 IRQ_TYPE_EDGE_RISING 512>; status = "disabled"; }; diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h index 80bd52363c68..122e1485d39a 100644 --- a/arch/riscv/include/asm/cmpxchg.h +++ b/arch/riscv/include/asm/cmpxchg.h @@ -134,6 +134,7 @@ ({ \ if (IS_ENABLED(CONFIG_RISCV_ISA_ZABHA) && \ IS_ENABLED(CONFIG_RISCV_ISA_ZACAS) && \ + IS_ENABLED(CONFIG_TOOLCHAIN_HAS_ZACAS) && \ riscv_has_extension_unlikely(RISCV_ISA_EXT_ZABHA) && \ riscv_has_extension_unlikely(RISCV_ISA_EXT_ZACAS)) { \ r = o; \ @@ -181,6 +182,7 @@ r, p, co, o, n) \ ({ \ if (IS_ENABLED(CONFIG_RISCV_ISA_ZACAS) && \ + IS_ENABLED(CONFIG_TOOLCHAIN_HAS_ZACAS) && \ riscv_has_extension_unlikely(RISCV_ISA_EXT_ZACAS)) { \ r = o; \ \ @@ -316,7 +318,7 @@ arch_cmpxchg_release((ptr), (o), (n)); \ }) -#if defined(CONFIG_64BIT) && defined(CONFIG_RISCV_ISA_ZACAS) +#if defined(CONFIG_64BIT) && defined(CONFIG_RISCV_ISA_ZACAS) && defined(CONFIG_TOOLCHAIN_HAS_ZACAS) #define system_has_cmpxchg128() riscv_has_extension_unlikely(RISCV_ISA_EXT_ZACAS) @@ -352,7 +354,7 @@ union __u128_halves { #define arch_cmpxchg128_local(ptr, o, n) \ __arch_cmpxchg128((ptr), (o), (n), "") -#endif /* CONFIG_64BIT && CONFIG_RISCV_ISA_ZACAS */ +#endif /* CONFIG_64BIT && CONFIG_RISCV_ISA_ZACAS && CONFIG_TOOLCHAIN_HAS_ZACAS */ #ifdef CONFIG_RISCV_ISA_ZAWRS /* diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h index c33d8b7dd488..836d80dd2921 100644 --- a/arch/riscv/include/asm/thread_info.h +++ b/arch/riscv/include/asm/thread_info.h @@ -107,23 +107,18 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); * - pending work-to-be-done flags are in lowest half-word * - other flags in upper half-word(s) */ -#define TIF_NEED_RESCHED 0 /* rescheduling necessary */ -#define TIF_NEED_RESCHED_LAZY 1 /* Lazy rescheduling needed */ -#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ -#define TIF_SIGPENDING 3 /* signal pending */ -#define TIF_RESTORE_SIGMASK 4 /* restore signal mask in do_signal() */ -#define TIF_MEMDIE 5 /* is terminating due to OOM killer */ -#define TIF_NOTIFY_SIGNAL 9 /* signal notifications exist */ -#define TIF_UPROBE 10 /* uprobe breakpoint or singlestep */ -#define TIF_32BIT 11 /* compat-mode 32bit process */ -#define TIF_RISCV_V_DEFER_RESTORE 12 /* restore Vector before returing to user */ - -#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) -#define _TIF_NEED_RESCHED_LAZY (1 
<< TIF_NEED_RESCHED_LAZY) -#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) -#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) -#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) -#define _TIF_UPROBE (1 << TIF_UPROBE) -#define _TIF_RISCV_V_DEFER_RESTORE (1 << TIF_RISCV_V_DEFER_RESTORE) + +/* + * Tell the generic TIF infrastructure which bits riscv supports + */ +#define HAVE_TIF_NEED_RESCHED_LAZY +#define HAVE_TIF_RESTORE_SIGMASK + +#include <asm-generic/thread_info_tif.h> + +#define TIF_32BIT 16 /* compat-mode 32bit process */ +#define TIF_RISCV_V_DEFER_RESTORE 17 /* restore Vector before returing to user */ + +#define _TIF_RISCV_V_DEFER_RESTORE BIT(TIF_RISCV_V_DEFER_RESTORE) #endif /* _ASM_RISCV_THREAD_INFO_H */ diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c index 6e8c0d6feae9..7d42d3b8a32a 100644 --- a/arch/riscv/kernel/asm-offsets.c +++ b/arch/riscv/kernel/asm-offsets.c @@ -3,6 +3,7 @@ * Copyright (C) 2012 Regents of the University of California * Copyright (C) 2017 SiFive */ +#define COMPILE_OFFSETS #include <linux/kbuild.h> #include <linux/mm.h> diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index f90cce7a3ace..14235e58c539 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -290,6 +290,7 @@ static void __init riscv_spinlock_init(void) if (IS_ENABLED(CONFIG_RISCV_ISA_ZABHA) && IS_ENABLED(CONFIG_RISCV_ISA_ZACAS) && + IS_ENABLED(CONFIG_TOOLCHAIN_HAS_ZACAS) && riscv_isa_extension_available(NULL, ZABHA) && riscv_isa_extension_available(NULL, ZACAS)) { using_ext = "using Zabha"; diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h index e7b032dfd17f..632ced07bca4 100644 --- a/arch/riscv/net/bpf_jit.h +++ b/arch/riscv/net/bpf_jit.h @@ -13,21 +13,15 @@ #include <linux/filter.h> #include <asm/cacheflush.h> +/* verify runtime detection extension status */ +#define rv_ext_enabled(ext) \ + (IS_ENABLED(CONFIG_RISCV_ISA_##ext) && riscv_has_extension_likely(RISCV_ISA_EXT_##ext)) + static inline bool rvc_enabled(void) { return IS_ENABLED(CONFIG_RISCV_ISA_C); } -static inline bool rvzba_enabled(void) -{ - return IS_ENABLED(CONFIG_RISCV_ISA_ZBA) && riscv_has_extension_likely(RISCV_ISA_EXT_ZBA); -} - -static inline bool rvzbb_enabled(void) -{ - return IS_ENABLED(CONFIG_RISCV_ISA_ZBB) && riscv_has_extension_likely(RISCV_ISA_EXT_ZBB); -} - enum { RV_REG_ZERO = 0, /* The constant value 0 */ RV_REG_RA = 1, /* Return address */ @@ -84,6 +78,8 @@ struct rv_jit_context { int epilogue_offset; int *offset; /* BPF to RV */ int nexentries; + int ex_insn_off; + int ex_jmp_off; unsigned long flags; int stack_size; u64 arena_vm_start; @@ -757,6 +753,17 @@ static inline u16 rvc_swsp(u32 imm8, u8 rs2) return rv_css_insn(0x6, imm, rs2, 0x2); } +/* RVZACAS instructions. */ +static inline u32 rvzacas_amocas_w(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl) +{ + return rv_amo_insn(0x5, aq, rl, rs2, rs1, 2, rd, 0x2f); +} + +static inline u32 rvzacas_amocas_d(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl) +{ + return rv_amo_insn(0x5, aq, rl, rs2, rs1, 3, rd, 0x2f); +} + /* RVZBA instructions. 
*/ static inline u32 rvzba_sh2add(u8 rd, u8 rs1, u8 rs2) { @@ -1123,7 +1130,7 @@ static inline void emit_sw(u8 rs1, s32 off, u8 rs2, struct rv_jit_context *ctx) static inline void emit_sh2add(u8 rd, u8 rs1, u8 rs2, struct rv_jit_context *ctx) { - if (rvzba_enabled()) { + if (rv_ext_enabled(ZBA)) { emit(rvzba_sh2add(rd, rs1, rs2), ctx); return; } @@ -1134,7 +1141,7 @@ static inline void emit_sh2add(u8 rd, u8 rs1, u8 rs2, struct rv_jit_context *ctx static inline void emit_sh3add(u8 rd, u8 rs1, u8 rs2, struct rv_jit_context *ctx) { - if (rvzba_enabled()) { + if (rv_ext_enabled(ZBA)) { emit(rvzba_sh3add(rd, rs1, rs2), ctx); return; } @@ -1184,7 +1191,7 @@ static inline void emit_subw(u8 rd, u8 rs1, u8 rs2, struct rv_jit_context *ctx) static inline void emit_sextb(u8 rd, u8 rs, struct rv_jit_context *ctx) { - if (rvzbb_enabled()) { + if (rv_ext_enabled(ZBB)) { emit(rvzbb_sextb(rd, rs), ctx); return; } @@ -1195,7 +1202,7 @@ static inline void emit_sextb(u8 rd, u8 rs, struct rv_jit_context *ctx) static inline void emit_sexth(u8 rd, u8 rs, struct rv_jit_context *ctx) { - if (rvzbb_enabled()) { + if (rv_ext_enabled(ZBB)) { emit(rvzbb_sexth(rd, rs), ctx); return; } @@ -1211,7 +1218,7 @@ static inline void emit_sextw(u8 rd, u8 rs, struct rv_jit_context *ctx) static inline void emit_zexth(u8 rd, u8 rs, struct rv_jit_context *ctx) { - if (rvzbb_enabled()) { + if (rv_ext_enabled(ZBB)) { emit(rvzbb_zexth(rd, rs), ctx); return; } @@ -1222,7 +1229,7 @@ static inline void emit_zexth(u8 rd, u8 rs, struct rv_jit_context *ctx) static inline void emit_zextw(u8 rd, u8 rs, struct rv_jit_context *ctx) { - if (rvzba_enabled()) { + if (rv_ext_enabled(ZBA)) { emit(rvzba_zextw(rd, rs), ctx); return; } @@ -1233,7 +1240,7 @@ static inline void emit_zextw(u8 rd, u8 rs, struct rv_jit_context *ctx) static inline void emit_bswap(u8 rd, s32 imm, struct rv_jit_context *ctx) { - if (rvzbb_enabled()) { + if (rv_ext_enabled(ZBB)) { int bits = 64 - imm; emit(rvzbb_rev8(rd, rd), ctx); @@ -1289,6 +1296,35 @@ out_be: emit_mv(rd, RV_REG_T2, ctx); } +static inline void emit_cmpxchg(u8 rd, u8 rs, u8 r0, bool is64, struct rv_jit_context *ctx) +{ + int jmp_offset; + + if (rv_ext_enabled(ZACAS)) { + ctx->ex_insn_off = ctx->ninsns; + emit(is64 ? rvzacas_amocas_d(r0, rs, rd, 1, 1) : + rvzacas_amocas_w(r0, rs, rd, 1, 1), ctx); + ctx->ex_jmp_off = ctx->ninsns; + if (!is64) + emit_zextw(r0, r0, ctx); + return; + } + + if (is64) + emit_mv(RV_REG_T2, r0, ctx); + else + emit_addiw(RV_REG_T2, r0, 0, ctx); + emit(is64 ? rv_lr_d(r0, 0, rd, 0, 0) : + rv_lr_w(r0, 0, rd, 0, 0), ctx); + jmp_offset = ninsns_rvoff(8); + emit(rv_bne(RV_REG_T2, r0, jmp_offset >> 1), ctx); + emit(is64 ? 
rv_sc_d(RV_REG_T3, rs, rd, 0, 1) : + rv_sc_w(RV_REG_T3, rs, rd, 0, 1), ctx); + jmp_offset = ninsns_rvoff(-6); + emit(rv_bne(RV_REG_T3, 0, jmp_offset >> 1), ctx); + emit_fence_rw_rw(ctx); +} + #endif /* __riscv_xlen == 64 */ void bpf_jit_build_prologue(struct rv_jit_context *ctx, bool is_subprog); diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 206b2b8552a7..45cbc7c6fe49 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -473,138 +473,92 @@ static inline void emit_kcfi(u32 hash, struct rv_jit_context *ctx) emit(hash, ctx); } -static int emit_load_8(bool sign_ext, u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +static void emit_ldx_insn(u8 rd, s16 off, u8 rs, u8 size, bool sign_ext, + struct rv_jit_context *ctx) { - int insns_start; - - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lb(rd, off, rs), ctx); - else - emit(rv_lbu(rd, off, rs), ctx); - return ctx->ninsns - insns_start; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lb(rd, 0, RV_REG_T1), ctx); - else - emit(rv_lbu(rd, 0, RV_REG_T1), ctx); - return ctx->ninsns - insns_start; -} - -static int emit_load_16(bool sign_ext, u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) -{ - int insns_start; - - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lh(rd, off, rs), ctx); - else - emit(rv_lhu(rd, off, rs), ctx); - return ctx->ninsns - insns_start; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lh(rd, 0, RV_REG_T1), ctx); - else - emit(rv_lhu(rd, 0, RV_REG_T1), ctx); - return ctx->ninsns - insns_start; -} - -static int emit_load_32(bool sign_ext, u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) -{ - int insns_start; - - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lw(rd, off, rs), ctx); - else - emit(rv_lwu(rd, off, rs), ctx); - return ctx->ninsns - insns_start; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lw(rd, 0, RV_REG_T1), ctx); - else - emit(rv_lwu(rd, 0, RV_REG_T1), ctx); - return ctx->ninsns - insns_start; -} - -static int emit_load_64(bool sign_ext, u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) -{ - int insns_start; - - if (is_12b_int(off)) { - insns_start = ctx->ninsns; + switch (size) { + case BPF_B: + emit(sign_ext ? rv_lb(rd, off, rs) : rv_lbu(rd, off, rs), ctx); + break; + case BPF_H: + emit(sign_ext ? rv_lh(rd, off, rs) : rv_lhu(rd, off, rs), ctx); + break; + case BPF_W: + emit(sign_ext ? 
rv_lw(rd, off, rs) : rv_lwu(rd, off, rs), ctx); + break; + case BPF_DW: emit_ld(rd, off, rs, ctx); - return ctx->ninsns - insns_start; + break; } - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); - insns_start = ctx->ninsns; - emit_ld(rd, 0, RV_REG_T1, ctx); - return ctx->ninsns - insns_start; } -static void emit_store_8(u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +static void emit_stx_insn(u8 rd, s16 off, u8 rs, u8 size, struct rv_jit_context *ctx) { - if (is_12b_int(off)) { + switch (size) { + case BPF_B: emit(rv_sb(rd, off, rs), ctx); - return; + break; + case BPF_H: + emit(rv_sh(rd, off, rs), ctx); + break; + case BPF_W: + emit_sw(rd, off, rs, ctx); + break; + case BPF_DW: + emit_sd(rd, off, rs, ctx); + break; } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - emit(rv_sb(RV_REG_T1, 0, rs), ctx); } -static void emit_store_16(u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +static void emit_ldx(u8 rd, s16 off, u8 rs, u8 size, bool sign_ext, + struct rv_jit_context *ctx) { if (is_12b_int(off)) { - emit(rv_sh(rd, off, rs), ctx); + ctx->ex_insn_off = ctx->ninsns; + emit_ldx_insn(rd, off, rs, size, sign_ext, ctx); + ctx->ex_jmp_off = ctx->ninsns; return; } emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - emit(rv_sh(RV_REG_T1, 0, rs), ctx); + emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); + ctx->ex_insn_off = ctx->ninsns; + emit_ldx_insn(rd, 0, RV_REG_T1, size, sign_ext, ctx); + ctx->ex_jmp_off = ctx->ninsns; } -static void emit_store_32(u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +static void emit_st(u8 rd, s16 off, s32 imm, u8 size, struct rv_jit_context *ctx) { + emit_imm(RV_REG_T1, imm, ctx); if (is_12b_int(off)) { - emit_sw(rd, off, rs, ctx); + ctx->ex_insn_off = ctx->ninsns; + emit_stx_insn(rd, off, RV_REG_T1, size, ctx); + ctx->ex_jmp_off = ctx->ninsns; return; } - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - emit_sw(RV_REG_T1, 0, rs, ctx); + emit_imm(RV_REG_T2, off, ctx); + emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); + ctx->ex_insn_off = ctx->ninsns; + emit_stx_insn(RV_REG_T2, 0, RV_REG_T1, size, ctx); + ctx->ex_jmp_off = ctx->ninsns; } -static void emit_store_64(u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +static void emit_stx(u8 rd, s16 off, u8 rs, u8 size, struct rv_jit_context *ctx) { if (is_12b_int(off)) { - emit_sd(rd, off, rs, ctx); + ctx->ex_insn_off = ctx->ninsns; + emit_stx_insn(rd, off, rs, size, ctx); + ctx->ex_jmp_off = ctx->ninsns; return; } emit_imm(RV_REG_T1, off, ctx); emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - emit_sd(RV_REG_T1, 0, rs, ctx); + ctx->ex_insn_off = ctx->ninsns; + emit_stx_insn(RV_REG_T1, 0, rs, size, ctx); + ctx->ex_jmp_off = ctx->ninsns; } static int emit_atomic_ld_st(u8 rd, u8 rs, const struct bpf_insn *insn, @@ -617,20 +571,12 @@ static int emit_atomic_ld_st(u8 rd, u8 rs, const struct bpf_insn *insn, switch (imm) { /* dst_reg = load_acquire(src_reg + off16) */ case BPF_LOAD_ACQ: - switch (BPF_SIZE(code)) { - case BPF_B: - emit_load_8(false, rd, off, rs, ctx); - break; - case BPF_H: - emit_load_16(false, rd, off, rs, ctx); - break; - case BPF_W: - emit_load_32(false, rd, off, rs, ctx); - break; - case BPF_DW: - emit_load_64(false, rd, off, rs, ctx); - break; + if (BPF_MODE(code) == BPF_PROBE_ATOMIC) { + emit_add(RV_REG_T2, rs, RV_REG_ARENA, ctx); + rs = RV_REG_T2; } + + emit_ldx(rd, off, rs, BPF_SIZE(code), false, ctx); emit_fence_r_rw(ctx); /* If our next insn is a redundant zext, return 1 to tell @@ -641,21 +587,13 @@ 
static int emit_atomic_ld_st(u8 rd, u8 rs, const struct bpf_insn *insn, break; /* store_release(dst_reg + off16, src_reg) */ case BPF_STORE_REL: - emit_fence_rw_w(ctx); - switch (BPF_SIZE(code)) { - case BPF_B: - emit_store_8(rd, off, rs, ctx); - break; - case BPF_H: - emit_store_16(rd, off, rs, ctx); - break; - case BPF_W: - emit_store_32(rd, off, rs, ctx); - break; - case BPF_DW: - emit_store_64(rd, off, rs, ctx); - break; + if (BPF_MODE(code) == BPF_PROBE_ATOMIC) { + emit_add(RV_REG_T2, rd, RV_REG_ARENA, ctx); + rd = RV_REG_T2; } + + emit_fence_rw_w(ctx); + emit_stx(rd, off, rs, BPF_SIZE(code), ctx); break; default: pr_err_once("bpf-jit: invalid atomic load/store opcode %02x\n", imm); @@ -668,17 +606,15 @@ static int emit_atomic_ld_st(u8 rd, u8 rs, const struct bpf_insn *insn, static int emit_atomic_rmw(u8 rd, u8 rs, const struct bpf_insn *insn, struct rv_jit_context *ctx) { - u8 r0, code = insn->code; + u8 code = insn->code; s16 off = insn->off; s32 imm = insn->imm; - int jmp_offset; - bool is64; + bool is64 = BPF_SIZE(code) == BPF_DW; if (BPF_SIZE(code) != BPF_W && BPF_SIZE(code) != BPF_DW) { pr_err_once("bpf-jit: 1- and 2-byte RMW atomics are not supported\n"); return -EINVAL; } - is64 = BPF_SIZE(code) == BPF_DW; if (off) { if (is_12b_int(off)) { @@ -690,72 +626,82 @@ static int emit_atomic_rmw(u8 rd, u8 rs, const struct bpf_insn *insn, rd = RV_REG_T1; } + if (BPF_MODE(code) == BPF_PROBE_ATOMIC) { + emit_add(RV_REG_T1, rd, RV_REG_ARENA, ctx); + rd = RV_REG_T1; + } + switch (imm) { /* lock *(u32/u64 *)(dst_reg + off16) <op>= src_reg */ case BPF_ADD: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rv_amoadd_d(RV_REG_ZERO, rs, rd, 0, 0) : rv_amoadd_w(RV_REG_ZERO, rs, rd, 0, 0), ctx); + ctx->ex_jmp_off = ctx->ninsns; break; case BPF_AND: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rv_amoand_d(RV_REG_ZERO, rs, rd, 0, 0) : rv_amoand_w(RV_REG_ZERO, rs, rd, 0, 0), ctx); + ctx->ex_jmp_off = ctx->ninsns; break; case BPF_OR: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rv_amoor_d(RV_REG_ZERO, rs, rd, 0, 0) : rv_amoor_w(RV_REG_ZERO, rs, rd, 0, 0), ctx); + ctx->ex_jmp_off = ctx->ninsns; break; case BPF_XOR: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rv_amoxor_d(RV_REG_ZERO, rs, rd, 0, 0) : rv_amoxor_w(RV_REG_ZERO, rs, rd, 0, 0), ctx); + ctx->ex_jmp_off = ctx->ninsns; break; /* src_reg = atomic_fetch_<op>(dst_reg + off16, src_reg) */ case BPF_ADD | BPF_FETCH: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rv_amoadd_d(rs, rs, rd, 1, 1) : rv_amoadd_w(rs, rs, rd, 1, 1), ctx); + ctx->ex_jmp_off = ctx->ninsns; if (!is64) emit_zextw(rs, rs, ctx); break; case BPF_AND | BPF_FETCH: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rv_amoand_d(rs, rs, rd, 1, 1) : rv_amoand_w(rs, rs, rd, 1, 1), ctx); + ctx->ex_jmp_off = ctx->ninsns; if (!is64) emit_zextw(rs, rs, ctx); break; case BPF_OR | BPF_FETCH: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rv_amoor_d(rs, rs, rd, 1, 1) : rv_amoor_w(rs, rs, rd, 1, 1), ctx); + ctx->ex_jmp_off = ctx->ninsns; if (!is64) emit_zextw(rs, rs, ctx); break; case BPF_XOR | BPF_FETCH: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? rv_amoxor_d(rs, rs, rd, 1, 1) : rv_amoxor_w(rs, rs, rd, 1, 1), ctx); + ctx->ex_jmp_off = ctx->ninsns; if (!is64) emit_zextw(rs, rs, ctx); break; /* src_reg = atomic_xchg(dst_reg + off16, src_reg); */ case BPF_XCHG: + ctx->ex_insn_off = ctx->ninsns; emit(is64 ? 
rv_amoswap_d(rs, rs, rd, 1, 1) : rv_amoswap_w(rs, rs, rd, 1, 1), ctx); + ctx->ex_jmp_off = ctx->ninsns; if (!is64) emit_zextw(rs, rs, ctx); break; /* r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg); */ case BPF_CMPXCHG: - r0 = bpf_to_rv_reg(BPF_REG_0, ctx); - if (is64) - emit_mv(RV_REG_T2, r0, ctx); - else - emit_addiw(RV_REG_T2, r0, 0, ctx); - emit(is64 ? rv_lr_d(r0, 0, rd, 0, 0) : - rv_lr_w(r0, 0, rd, 0, 0), ctx); - jmp_offset = ninsns_rvoff(8); - emit(rv_bne(RV_REG_T2, r0, jmp_offset >> 1), ctx); - emit(is64 ? rv_sc_d(RV_REG_T3, rs, rd, 0, 1) : - rv_sc_w(RV_REG_T3, rs, rd, 0, 1), ctx); - jmp_offset = ninsns_rvoff(-6); - emit(rv_bne(RV_REG_T3, 0, jmp_offset >> 1), ctx); - emit_fence_rw_rw(ctx); + emit_cmpxchg(rd, rs, regmap[BPF_REG_0], is64, ctx); break; default: pr_err_once("bpf-jit: invalid atomic RMW opcode %02x\n", imm); @@ -765,6 +711,39 @@ static int emit_atomic_rmw(u8 rd, u8 rs, const struct bpf_insn *insn, return 0; } +/* + * Sign-extend the register if necessary + */ +static int sign_extend(u8 rd, u8 rs, u8 sz, bool sign, struct rv_jit_context *ctx) +{ + if (!sign && (sz == 1 || sz == 2)) { + if (rd != rs) + emit_mv(rd, rs, ctx); + return 0; + } + + switch (sz) { + case 1: + emit_sextb(rd, rs, ctx); + break; + case 2: + emit_sexth(rd, rs, ctx); + break; + case 4: + emit_sextw(rd, rs, ctx); + break; + case 8: + if (rd != rs) + emit_mv(rd, rs, ctx); + break; + default: + pr_err("bpf-jit: invalid size %d for sign_extend\n", sz); + return -EINVAL; + } + + return 0; +} + #define BPF_FIXUP_OFFSET_MASK GENMASK(26, 0) #define BPF_FIXUP_REG_MASK GENMASK(31, 27) #define REG_DONT_CLEAR_MARKER 0 /* RV_REG_ZERO unused in pt_regmap */ @@ -783,9 +762,8 @@ bool ex_handler_bpf(const struct exception_table_entry *ex, } /* For accesses to BTF pointers, add an entry to the exception table */ -static int add_exception_handler(const struct bpf_insn *insn, - struct rv_jit_context *ctx, - int dst_reg, int insn_len) +static int add_exception_handler(const struct bpf_insn *insn, int dst_reg, + struct rv_jit_context *ctx) { struct exception_table_entry *ex; unsigned long pc; @@ -793,21 +771,23 @@ static int add_exception_handler(const struct bpf_insn *insn, off_t fixup_offset; if (!ctx->insns || !ctx->ro_insns || !ctx->prog->aux->extable || - (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX && - BPF_MODE(insn->code) != BPF_PROBE_MEM32)) + ctx->ex_insn_off <= 0 || ctx->ex_jmp_off <= 0) return 0; - if (WARN_ON_ONCE(ctx->nexentries >= ctx->prog->aux->num_exentries)) - return -EINVAL; + if (BPF_MODE(insn->code) != BPF_PROBE_MEM && + BPF_MODE(insn->code) != BPF_PROBE_MEMSX && + BPF_MODE(insn->code) != BPF_PROBE_MEM32 && + BPF_MODE(insn->code) != BPF_PROBE_ATOMIC) + return 0; - if (WARN_ON_ONCE(insn_len > ctx->ninsns)) + if (WARN_ON_ONCE(ctx->nexentries >= ctx->prog->aux->num_exentries)) return -EINVAL; - if (WARN_ON_ONCE(!rvc_enabled() && insn_len == 1)) + if (WARN_ON_ONCE(ctx->ex_insn_off > ctx->ninsns || ctx->ex_jmp_off > ctx->ninsns)) return -EINVAL; ex = &ctx->prog->aux->extable[ctx->nexentries]; - pc = (unsigned long)&ctx->ro_insns[ctx->ninsns - insn_len]; + pc = (unsigned long)&ctx->ro_insns[ctx->ex_insn_off]; /* * This is the relative offset of the instruction that may fault from @@ -831,7 +811,7 @@ static int add_exception_handler(const struct bpf_insn *insn, * that may fault. The execution will jump to this after handling the * fault. 
*/ - fixup_offset = (long)&ex->fixup - (pc + insn_len * sizeof(u16)); + fixup_offset = (long)&ex->fixup - (long)&ctx->ro_insns[ctx->ex_jmp_off]; if (!FIELD_FIT(BPF_FIXUP_OFFSET_MASK, fixup_offset)) return -ERANGE; @@ -848,6 +828,8 @@ static int add_exception_handler(const struct bpf_insn *insn, FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg); ex->type = EX_TYPE_BPF; + ctx->ex_insn_off = 0; + ctx->ex_jmp_off = 0; ctx->nexentries++; return 0; } @@ -1079,10 +1061,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, stack_size += 16; save_ret = flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET); - if (save_ret) { + if (save_ret) stack_size += 16; /* Save both A5 (BPF R0) and A0 */ - retval_off = stack_size; - } + retval_off = stack_size; stack_size += nr_arg_slots * 8; args_off = stack_size; @@ -1226,8 +1207,15 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, restore_args(min_t(int, nr_arg_slots, RV_MAX_REG_ARGS), args_off, ctx); if (save_ret) { - emit_ld(RV_REG_A0, -retval_off, RV_REG_FP, ctx); emit_ld(regmap[BPF_REG_0], -(retval_off - 8), RV_REG_FP, ctx); + if (is_struct_ops) { + ret = sign_extend(RV_REG_A0, regmap[BPF_REG_0], m->ret_size, + m->ret_flags & BTF_FMODEL_SIGNED_ARG, ctx); + if (ret) + goto out; + } else { + emit_ld(RV_REG_A0, -retval_off, RV_REG_FP, ctx); + } } emit_ld(RV_REG_S1, -sreg_off, RV_REG_FP, ctx); @@ -1320,7 +1308,6 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, goto out; } - bpf_flush_icache(ro_image, ro_image_end); out: kvfree(image); return ret < 0 ? ret : size; @@ -1857,7 +1844,6 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW: { bool sign_ext; - int insn_len; sign_ext = BPF_MODE(insn->code) == BPF_MEMSX || BPF_MODE(insn->code) == BPF_PROBE_MEMSX; @@ -1867,22 +1853,9 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, rs = RV_REG_T2; } - switch (BPF_SIZE(code)) { - case BPF_B: - insn_len = emit_load_8(sign_ext, rd, off, rs, ctx); - break; - case BPF_H: - insn_len = emit_load_16(sign_ext, rd, off, rs, ctx); - break; - case BPF_W: - insn_len = emit_load_32(sign_ext, rd, off, rs, ctx); - break; - case BPF_DW: - insn_len = emit_load_64(sign_ext, rd, off, rs, ctx); - break; - } + emit_ldx(rd, off, rs, BPF_SIZE(code), sign_ext, ctx); - ret = add_exception_handler(insn, ctx, rd, insn_len); + ret = add_exception_handler(insn, rd, ctx); if (ret) return ret; @@ -1890,238 +1863,73 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, return 1; break; } + /* speculation barrier */ case BPF_ST | BPF_NOSPEC: break; /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_B: - emit_imm(RV_REG_T1, imm, ctx); - if (is_12b_int(off)) { - emit(rv_sb(rd, off, RV_REG_T1), ctx); - break; - } - - emit_imm(RV_REG_T2, off, ctx); - emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - emit(rv_sb(RV_REG_T2, 0, RV_REG_T1), ctx); - break; - case BPF_ST | BPF_MEM | BPF_H: - emit_imm(RV_REG_T1, imm, ctx); - if (is_12b_int(off)) { - emit(rv_sh(rd, off, RV_REG_T1), ctx); - break; - } - - emit_imm(RV_REG_T2, off, ctx); - emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - emit(rv_sh(RV_REG_T2, 0, RV_REG_T1), ctx); - break; case BPF_ST | BPF_MEM | BPF_W: - emit_imm(RV_REG_T1, imm, ctx); - if (is_12b_int(off)) { - emit_sw(rd, off, RV_REG_T1, ctx); - break; - } - - emit_imm(RV_REG_T2, off, ctx); - emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - emit_sw(RV_REG_T2, 0, RV_REG_T1, ctx); - break; case BPF_ST | BPF_MEM 
| BPF_DW: - emit_imm(RV_REG_T1, imm, ctx); - if (is_12b_int(off)) { - emit_sd(rd, off, RV_REG_T1, ctx); - break; - } - - emit_imm(RV_REG_T2, off, ctx); - emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx); - break; - + /* ST | PROBE_MEM32: *(size *)(dst + RV_REG_ARENA + off) = imm */ case BPF_ST | BPF_PROBE_MEM32 | BPF_B: case BPF_ST | BPF_PROBE_MEM32 | BPF_H: case BPF_ST | BPF_PROBE_MEM32 | BPF_W: case BPF_ST | BPF_PROBE_MEM32 | BPF_DW: - { - int insn_len, insns_start; - - emit_add(RV_REG_T3, rd, RV_REG_ARENA, ctx); - rd = RV_REG_T3; - - /* Load imm to a register then store it */ - emit_imm(RV_REG_T1, imm, ctx); - - switch (BPF_SIZE(code)) { - case BPF_B: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit(rv_sb(rd, off, RV_REG_T1), ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T2, off, ctx); - emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - insns_start = ctx->ninsns; - emit(rv_sb(RV_REG_T2, 0, RV_REG_T1), ctx); - insn_len = ctx->ninsns - insns_start; - break; - case BPF_H: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit(rv_sh(rd, off, RV_REG_T1), ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T2, off, ctx); - emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - insns_start = ctx->ninsns; - emit(rv_sh(RV_REG_T2, 0, RV_REG_T1), ctx); - insn_len = ctx->ninsns - insns_start; - break; - case BPF_W: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit_sw(rd, off, RV_REG_T1, ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T2, off, ctx); - emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - insns_start = ctx->ninsns; - emit_sw(RV_REG_T2, 0, RV_REG_T1, ctx); - insn_len = ctx->ninsns - insns_start; - break; - case BPF_DW: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit_sd(rd, off, RV_REG_T1, ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T2, off, ctx); - emit_add(RV_REG_T2, RV_REG_T2, rd, ctx); - insns_start = ctx->ninsns; - emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx); - insn_len = ctx->ninsns - insns_start; - break; + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) { + emit_add(RV_REG_T3, rd, RV_REG_ARENA, ctx); + rd = RV_REG_T3; } - ret = add_exception_handler(insn, ctx, REG_DONT_CLEAR_MARKER, - insn_len); + emit_st(rd, off, imm, BPF_SIZE(code), ctx); + + ret = add_exception_handler(insn, REG_DONT_CLEAR_MARKER, ctx); if (ret) return ret; - break; - } /* STX: *(size *)(dst + off) = src */ case BPF_STX | BPF_MEM | BPF_B: - emit_store_8(rd, off, rs, ctx); - break; case BPF_STX | BPF_MEM | BPF_H: - emit_store_16(rd, off, rs, ctx); - break; case BPF_STX | BPF_MEM | BPF_W: - emit_store_32(rd, off, rs, ctx); - break; case BPF_STX | BPF_MEM | BPF_DW: - emit_store_64(rd, off, rs, ctx); + /* STX | PROBE_MEM32: *(size *)(dst + RV_REG_ARENA + off) = src */ + case BPF_STX | BPF_PROBE_MEM32 | BPF_B: + case BPF_STX | BPF_PROBE_MEM32 | BPF_H: + case BPF_STX | BPF_PROBE_MEM32 | BPF_W: + case BPF_STX | BPF_PROBE_MEM32 | BPF_DW: + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) { + emit_add(RV_REG_T2, rd, RV_REG_ARENA, ctx); + rd = RV_REG_T2; + } + + emit_stx(rd, off, rs, BPF_SIZE(code), ctx); + + ret = add_exception_handler(insn, REG_DONT_CLEAR_MARKER, ctx); + if (ret) + return ret; break; + + /* Atomics */ case BPF_STX | BPF_ATOMIC | BPF_B: case BPF_STX | BPF_ATOMIC | BPF_H: case BPF_STX | BPF_ATOMIC | BPF_W: case BPF_STX | BPF_ATOMIC | BPF_DW: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_B: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_H: + case BPF_STX | 
BPF_PROBE_ATOMIC | BPF_W: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_DW: if (bpf_atomic_is_load_store(insn)) ret = emit_atomic_ld_st(rd, rs, insn, ctx); else ret = emit_atomic_rmw(rd, rs, insn, ctx); - if (ret) - return ret; - break; - case BPF_STX | BPF_PROBE_MEM32 | BPF_B: - case BPF_STX | BPF_PROBE_MEM32 | BPF_H: - case BPF_STX | BPF_PROBE_MEM32 | BPF_W: - case BPF_STX | BPF_PROBE_MEM32 | BPF_DW: - { - int insn_len, insns_start; - - emit_add(RV_REG_T2, rd, RV_REG_ARENA, ctx); - rd = RV_REG_T2; - - switch (BPF_SIZE(code)) { - case BPF_B: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit(rv_sb(rd, off, rs), ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - insns_start = ctx->ninsns; - emit(rv_sb(RV_REG_T1, 0, rs), ctx); - insn_len = ctx->ninsns - insns_start; - break; - case BPF_H: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit(rv_sh(rd, off, rs), ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - insns_start = ctx->ninsns; - emit(rv_sh(RV_REG_T1, 0, rs), ctx); - insn_len = ctx->ninsns - insns_start; - break; - case BPF_W: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit_sw(rd, off, rs, ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - insns_start = ctx->ninsns; - emit_sw(RV_REG_T1, 0, rs, ctx); - insn_len = ctx->ninsns - insns_start; - break; - case BPF_DW: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit_sd(rd, off, rs, ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - insns_start = ctx->ninsns; - emit_sd(RV_REG_T1, 0, rs, ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - ret = add_exception_handler(insn, ctx, REG_DONT_CLEAR_MARKER, - insn_len); + ret = ret ?: add_exception_handler(insn, REG_DONT_CLEAR_MARKER, ctx); if (ret) return ret; - break; - } default: pr_err("bpf-jit: unknown opcode %02x\n", code); @@ -2249,6 +2057,25 @@ bool bpf_jit_supports_arena(void) return true; } +bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) +{ + if (in_arena) { + switch (insn->code) { + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (insn->imm == BPF_CMPXCHG) + return rv_ext_enabled(ZACAS); + break; + case BPF_LDX | BPF_MEMSX | BPF_B: + case BPF_LDX | BPF_MEMSX | BPF_H: + case BPF_LDX | BPF_MEMSX | BPF_W: + return false; + } + } + + return true; +} + bool bpf_jit_supports_percpu_insn(void) { return true; diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 22862ce7ec68..2414ee3ff002 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -174,8 +174,6 @@ config S390 select GENERIC_GETTIMEOFDAY select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL - select GENERIC_VDSO_DATA_STORE - select GENERIC_VDSO_TIME_NS select GENERIC_IOREMAP if PCI select HAVE_ALIGNED_STRUCT_PAGE select HAVE_ARCH_AUDITSYSCALL @@ -206,6 +204,7 @@ config S390 select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_EBPF_JIT if HAVE_MARCH_Z196_FEATURES select HAVE_EFFICIENT_UNALIGNED_ACCESS + select HAVE_GENERIC_TIF_BITS select HAVE_GUP_FAST select HAVE_FENTRY select HAVE_FTRACE_GRAPH_FUNC @@ -554,15 +553,11 @@ config NODES_SHIFT depends on NUMA default "1" -config SCHED_SMT - def_bool n - -config SCHED_MC - def_bool n - config SCHED_TOPOLOGY def_bool y prompt "Topology 
scheduler support" + select ARCH_SUPPORTS_SCHED_SMT + select ARCH_SUPPORTS_SCHED_MC select SCHED_SMT select SCHED_MC help diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index f6ed2c8192c8..7878e9bfbf07 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -56,49 +56,31 @@ void arch_setup_new_exec(void); /* * thread information flags bit numbers + * + * Tell the generic TIF infrastructure which special bits s390 supports */ -#define TIF_NOTIFY_RESUME 0 /* callback before returning to user */ -#define TIF_SIGPENDING 1 /* signal pending */ -#define TIF_NEED_RESCHED 2 /* rescheduling necessary */ -#define TIF_NEED_RESCHED_LAZY 3 /* lazy rescheduling needed */ -#define TIF_UPROBE 4 /* breakpointed or single-stepping */ -#define TIF_PATCH_PENDING 5 /* pending live patching update */ -#define TIF_ASCE_PRIMARY 6 /* primary asce is kernel asce */ -#define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */ -#define TIF_GUARDED_STORAGE 8 /* load guarded storage control block */ -#define TIF_ISOLATE_BP_GUEST 9 /* Run KVM guests with isolated BP */ -#define TIF_PER_TRAP 10 /* Need to handle PER trap on exit to usermode */ -#define TIF_31BIT 16 /* 32bit process */ -#define TIF_MEMDIE 17 /* is terminating due to OOM killer */ -#define TIF_RESTORE_SIGMASK 18 /* restore signal mask in do_signal() */ -#define TIF_SINGLE_STEP 19 /* This task is single stepped */ -#define TIF_BLOCK_STEP 20 /* This task is block stepped */ -#define TIF_UPROBE_SINGLESTEP 21 /* This task is uprobe single stepped */ -#define TIF_SYSCALL_TRACE 24 /* syscall trace active */ -#define TIF_SYSCALL_AUDIT 25 /* syscall auditing active */ -#define TIF_SECCOMP 26 /* secure computing */ -#define TIF_SYSCALL_TRACEPOINT 27 /* syscall tracepoint instrumentation */ +#define HAVE_TIF_NEED_RESCHED_LAZY +#define HAVE_TIF_RESTORE_SIGMASK + +#include <asm-generic/thread_info_tif.h> + +/* Architecture specific bits */ +#define TIF_ASCE_PRIMARY 16 /* primary asce is kernel asce */ +#define TIF_GUARDED_STORAGE 17 /* load guarded storage control block */ +#define TIF_ISOLATE_BP_GUEST 18 /* Run KVM guests with isolated BP */ +#define TIF_PER_TRAP 19 /* Need to handle PER trap on exit to usermode */ +#define TIF_31BIT 20 /* 32bit process */ +#define TIF_SINGLE_STEP 21 /* This task is single stepped */ +#define TIF_BLOCK_STEP 22 /* This task is block stepped */ +#define TIF_UPROBE_SINGLESTEP 23 /* This task is uprobe single stepped */ -#define _TIF_NOTIFY_RESUME BIT(TIF_NOTIFY_RESUME) -#define _TIF_SIGPENDING BIT(TIF_SIGPENDING) -#define _TIF_NEED_RESCHED BIT(TIF_NEED_RESCHED) -#define _TIF_NEED_RESCHED_LAZY BIT(TIF_NEED_RESCHED_LAZY) -#define _TIF_UPROBE BIT(TIF_UPROBE) -#define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING) #define _TIF_ASCE_PRIMARY BIT(TIF_ASCE_PRIMARY) -#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL) #define _TIF_GUARDED_STORAGE BIT(TIF_GUARDED_STORAGE) #define _TIF_ISOLATE_BP_GUEST BIT(TIF_ISOLATE_BP_GUEST) #define _TIF_PER_TRAP BIT(TIF_PER_TRAP) #define _TIF_31BIT BIT(TIF_31BIT) -#define _TIF_MEMDIE BIT(TIF_MEMDIE) -#define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK) #define _TIF_SINGLE_STEP BIT(TIF_SINGLE_STEP) #define _TIF_BLOCK_STEP BIT(TIF_BLOCK_STEP) #define _TIF_UPROBE_SINGLESTEP BIT(TIF_UPROBE_SINGLESTEP) -#define _TIF_SYSCALL_TRACE BIT(TIF_SYSCALL_TRACE) -#define _TIF_SYSCALL_AUDIT BIT(TIF_SYSCALL_AUDIT) -#define _TIF_SECCOMP BIT(TIF_SECCOMP) -#define _TIF_SYSCALL_TRACEPOINT BIT(TIF_SYSCALL_TRACEPOINT) #endif /* _ASM_THREAD_INFO_H */ diff --git 
a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 95ecad9c7d7d..a8915663e917 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -4,6 +4,7 @@ * This code generates raw asm output which is post-processed to extract * and format the required data. */ +#define COMPILE_OFFSETS #include <linux/kbuild.h> #include <linux/sched.h> diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 46569b8e47dd..1594c80e9bc4 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -509,33 +509,27 @@ int topology_cpu_init(struct cpu *cpu) return rc; } -static const struct cpumask *cpu_thread_mask(int cpu) -{ - return &cpu_topology[cpu].thread_mask; -} - - const struct cpumask *cpu_coregroup_mask(int cpu) { return &cpu_topology[cpu].core_mask; } -static const struct cpumask *cpu_book_mask(int cpu) +static const struct cpumask *tl_book_mask(struct sched_domain_topology_level *tl, int cpu) { return &cpu_topology[cpu].book_mask; } -static const struct cpumask *cpu_drawer_mask(int cpu) +static const struct cpumask *tl_drawer_mask(struct sched_domain_topology_level *tl, int cpu) { return &cpu_topology[cpu].drawer_mask; } static struct sched_domain_topology_level s390_topology[] = { - SDTL_INIT(cpu_thread_mask, cpu_smt_flags, SMT), - SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), - SDTL_INIT(cpu_book_mask, NULL, BOOK), - SDTL_INIT(cpu_drawer_mask, NULL, DRAWER), - SDTL_INIT(cpu_cpu_mask, NULL, PKG), + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), + SDTL_INIT(tl_mc_mask, cpu_core_flags, MC), + SDTL_INIT(tl_book_mask, NULL, BOOK), + SDTL_INIT(tl_drawer_mask, NULL, DRAWER), + SDTL_INIT(tl_pkg_mask, NULL, PKG), { NULL, }, }; diff --git a/arch/s390/net/Makefile b/arch/s390/net/Makefile index 8cab6deb0403..9275cf63192a 100644 --- a/arch/s390/net/Makefile +++ b/arch/s390/net/Makefile @@ -2,5 +2,5 @@ # # Arch-specific network modules # -obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o +obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o bpf_timed_may_goto.o obj-$(CONFIG_HAVE_PNETID) += pnet.o diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index bb17efe29d65..cf461d76e9da 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -675,20 +675,6 @@ static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp) } while (0) /* - * Call r1 either directly or via __s390_indirect_jump_r1 thunk - */ -static void call_r1(struct bpf_jit *jit) -{ - if (nospec_uses_trampoline()) - /* brasl %r14,__s390_indirect_jump_r1 */ - EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, - __s390_indirect_jump_r1); - else - /* basr %r14,%r1 */ - EMIT2(0x0d00, REG_14, REG_1); -} - -/* * Function epilogue */ static void bpf_jit_epilogue(struct bpf_jit *jit) @@ -1790,20 +1776,21 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, REG_SET_SEEN(BPF_REG_5); jit->seen |= SEEN_FUNC; + /* * Copy the tail call counter to where the callee expects it. - * - * Note 1: The callee can increment the tail call counter, but - * we do not load it back, since the x86 JIT does not do this - * either. - * - * Note 2: We assume that the verifier does not let us call the - * main program, which clears the tail call counter on entry. 
*/ - /* mvc tail_call_cnt(4,%r15),frame_off+tail_call_cnt(%r15) */ - _EMIT6(0xd203f000 | offsetof(struct prog_frame, tail_call_cnt), - 0xf000 | (jit->frame_off + - offsetof(struct prog_frame, tail_call_cnt))); + + if (insn->src_reg == BPF_PSEUDO_CALL) + /* + * mvc tail_call_cnt(4,%r15), + * frame_off+tail_call_cnt(%r15) + */ + _EMIT6(0xd203f000 | offsetof(struct prog_frame, + tail_call_cnt), + 0xf000 | (jit->frame_off + + offsetof(struct prog_frame, + tail_call_cnt))); /* Sign-extend the kfunc arguments. */ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { @@ -1819,12 +1806,38 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, } } - /* lgrl %w1,func */ - EMIT6_PCREL_RILB(0xc4080000, REG_W1, _EMIT_CONST_U64(func)); - /* %r1() */ - call_r1(jit); - /* lgr %b0,%r2: load return value into %b0 */ - EMIT4(0xb9040000, BPF_REG_0, REG_2); + if ((void *)func == arch_bpf_timed_may_goto) { + /* + * arch_bpf_timed_may_goto() has a special ABI: the + * parameters are in BPF_REG_AX and BPF_REG_10; the + * return value is in BPF_REG_AX; and all GPRs except + * REG_W0, REG_W1, and BPF_REG_AX are callee-saved. + */ + + /* brasl %r0,func */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_0, (void *)func); + } else { + /* brasl %r14,func */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, (void *)func); + /* lgr %b0,%r2: load return value into %b0 */ + EMIT4(0xb9040000, BPF_REG_0, REG_2); + } + + /* + * Copy the potentially updated tail call counter back. + */ + + if (insn->src_reg == BPF_PSEUDO_CALL) + /* + * mvc frame_off+tail_call_cnt(%r15), + * tail_call_cnt(4,%r15) + */ + _EMIT6(0xd203f000 | (jit->frame_off + + offsetof(struct prog_frame, + tail_call_cnt)), + 0xf000 | offsetof(struct prog_frame, + tail_call_cnt)); + break; } case BPF_JMP | BPF_TAIL_CALL: { @@ -2517,14 +2530,12 @@ static int invoke_bpf_prog(struct bpf_tramp_jit *tjit, * goto skip; */ - /* %r1 = __bpf_prog_enter */ - load_imm64(jit, REG_1, (u64)bpf_trampoline_enter(p)); /* %r2 = p */ load_imm64(jit, REG_2, (u64)p); /* la %r3,run_ctx_off(%r15) */ EMIT4_DISP(0x41000000, REG_3, REG_15, tjit->run_ctx_off); - /* %r1() */ - call_r1(jit); + /* brasl %r14,__bpf_prog_enter */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, bpf_trampoline_enter(p)); /* ltgr %r7,%r2 */ EMIT4(0xb9020000, REG_7, REG_2); /* brcl 8,skip */ @@ -2535,15 +2546,13 @@ static int invoke_bpf_prog(struct bpf_tramp_jit *tjit, * retval = bpf_func(args, p->insnsi); */ - /* %r1 = p->bpf_func */ - load_imm64(jit, REG_1, (u64)p->bpf_func); /* la %r2,bpf_args_off(%r15) */ EMIT4_DISP(0x41000000, REG_2, REG_15, tjit->bpf_args_off); /* %r3 = p->insnsi */ if (!p->jited) load_imm64(jit, REG_3, (u64)p->insnsi); - /* %r1() */ - call_r1(jit); + /* brasl %r14,p->bpf_func */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, p->bpf_func); /* stg %r2,retval_off(%r15) */ if (save_ret) { if (sign_extend(jit, REG_2, m->ret_size, m->ret_flags)) @@ -2560,16 +2569,14 @@ static int invoke_bpf_prog(struct bpf_tramp_jit *tjit, * __bpf_prog_exit(p, start, &run_ctx); */ - /* %r1 = __bpf_prog_exit */ - load_imm64(jit, REG_1, (u64)bpf_trampoline_exit(p)); /* %r2 = p */ load_imm64(jit, REG_2, (u64)p); /* lgr %r3,%r7 */ EMIT4(0xb9040000, REG_3, REG_7); /* la %r4,run_ctx_off(%r15) */ EMIT4_DISP(0x41000000, REG_4, REG_15, tjit->run_ctx_off); - /* %r1() */ - call_r1(jit); + /* brasl %r14,__bpf_prog_exit */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, bpf_trampoline_exit(p)); return 0; } @@ -2729,9 +2736,6 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, /* lgr %r8,%r0 */ EMIT4(0xb9040000, REG_8, 
REG_0); - } else { - /* %r8 = func_addr + S390X_PATCH_SIZE */ - load_imm64(jit, REG_8, (u64)func_addr + S390X_PATCH_SIZE); } /* @@ -2757,12 +2761,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, * __bpf_tramp_enter(im); */ - /* %r1 = __bpf_tramp_enter */ - load_imm64(jit, REG_1, (u64)__bpf_tramp_enter); /* %r2 = im */ load_imm64(jit, REG_2, (u64)im); - /* %r1() */ - call_r1(jit); + /* brasl %r14,__bpf_tramp_enter */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, __bpf_tramp_enter); } for (i = 0; i < fentry->nr_links; i++) @@ -2815,13 +2817,25 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, /* mvc tail_call_cnt(4,%r15),tccnt_off(%r15) */ _EMIT6(0xd203f000 | offsetof(struct prog_frame, tail_call_cnt), 0xf000 | tjit->tccnt_off); - /* lgr %r1,%r8 */ - EMIT4(0xb9040000, REG_1, REG_8); - /* %r1() */ - call_r1(jit); + if (flags & BPF_TRAMP_F_ORIG_STACK) { + if (nospec_uses_trampoline()) + /* brasl %r14,__s390_indirect_jump_r8 */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, + __s390_indirect_jump_r8); + else + /* basr %r14,%r8 */ + EMIT2(0x0d00, REG_14, REG_8); + } else { + /* brasl %r14,func_addr+S390X_PATCH_SIZE */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, + func_addr + S390X_PATCH_SIZE); + } /* stg %r2,retval_off(%r15) */ EMIT6_DISP_LH(0xe3000000, 0x0024, REG_2, REG_0, REG_15, tjit->retval_off); + /* mvc tccnt_off(%r15),tail_call_cnt(4,%r15) */ + _EMIT6(0xd203f000 | tjit->tccnt_off, + 0xf000 | offsetof(struct prog_frame, tail_call_cnt)); im->ip_after_call = jit->prg_buf + jit->prg; @@ -2846,12 +2860,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, * __bpf_tramp_exit(im); */ - /* %r1 = __bpf_tramp_exit */ - load_imm64(jit, REG_1, (u64)__bpf_tramp_exit); /* %r2 = im */ load_imm64(jit, REG_2, (u64)im); - /* %r1() */ - call_r1(jit); + /* brasl %r14,__bpf_tramp_exit */ + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, __bpf_tramp_exit); } /* lmg %r2,%rN,reg_args_off(%r15) */ @@ -2860,7 +2872,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, REG_2 + (nr_reg_args - 1), REG_15, tjit->reg_args_off); /* lgr %r1,%r8 */ - if (!(flags & BPF_TRAMP_F_SKIP_FRAME)) + if (!(flags & BPF_TRAMP_F_SKIP_FRAME) && + (flags & BPF_TRAMP_F_ORIG_STACK)) EMIT4(0xb9040000, REG_1, REG_8); /* lmg %r7,%r8,r7_r8_off(%r15) */ EMIT6_DISP_LH(0xeb000000, 0x0004, REG_7, REG_8, REG_15, @@ -2879,9 +2892,12 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, EMIT4_IMM(0xa70b0000, REG_15, tjit->stack_size); if (flags & BPF_TRAMP_F_SKIP_FRAME) EMIT_JUMP_REG(14); - else + else if (flags & BPF_TRAMP_F_ORIG_STACK) EMIT_JUMP_REG(1); - + else + /* brcl 0xf,func_addr+S390X_PATCH_SIZE */ + EMIT6_PCREL_RILC_PTR(0xc0040000, 0xf, + func_addr + S390X_PATCH_SIZE); return 0; } @@ -2951,6 +2967,11 @@ bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) case BPF_STX | BPF_ATOMIC | BPF_DW: if (bpf_atomic_is_load_store(insn)) return false; + break; + case BPF_LDX | BPF_MEMSX | BPF_B: + case BPF_LDX | BPF_MEMSX | BPF_H: + case BPF_LDX | BPF_MEMSX | BPF_W: + return false; } return true; } @@ -2989,3 +3010,8 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *, u64, u64, u64), prev_addr = addr; } } + +bool bpf_jit_supports_timed_may_goto(void) +{ + return true; +} diff --git a/arch/s390/net/bpf_timed_may_goto.S b/arch/s390/net/bpf_timed_may_goto.S new file mode 100644 index 000000000000..06f567a460d7 --- /dev/null +++ b/arch/s390/net/bpf_timed_may_goto.S @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/export.h> 
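The assembly stub defined in this new file lets JITed s390 programs call bpf_check_timed_may_goto() while keeping the JIT's live registers intact; its special ABI (arguments in %r12 and %r13, result in %r12, return address in %r0) is spelled out in the function body further down. Stripped of the register save/restore, the stub reduces to the C below. The struct name and prototype are an assumption carried over from the generic BPF core used by the existing x86 support, shown only to illustrate the contract.

#include <linux/types.h>

/* Assumed prototype of the core-kernel helper the stub branches to. */
struct bpf_timed_may_goto;
u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p);

/*
 * C-level contract of arch_bpf_timed_may_goto(): %r12 + %r13 form the
 * pointer to the per-loop state on the BPF stack, and the helper's
 * return value travels back to the JITed code in %r12.
 */
static u64 arch_bpf_timed_may_goto_contract(u64 r12, u64 r13)
{
	return bpf_check_timed_may_goto((struct bpf_timed_may_goto *)(unsigned long)(r12 + r13));
}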
+#include <linux/linkage.h> +#include <asm/asm-offsets.h> +#include <asm/nospec-insn.h> + +#define R2_OFF 0 +#define R5_OFF (R2_OFF + (5 - 2 + 1) * 8) +#define R14_OFF (R5_OFF + 8) +#define RETADDR_OFF (R14_OFF + 8) +#define R15_OFF (RETADDR_OFF + 8) +#define BACKCHAIN_OFF (R15_OFF + 8) +#define FRAME_SIZE (BACKCHAIN_OFF + 8) +#define FRAME_OFF (STACK_FRAME_OVERHEAD - FRAME_SIZE) +#if (FRAME_OFF + BACKCHAIN_OFF) != __SF_BACKCHAIN +#error Stack frame layout calculation is broken +#endif + + GEN_BR_THUNK %r1 + +SYM_FUNC_START(arch_bpf_timed_may_goto) + /* + * This function has a special ABI: the parameters are in %r12 and + * %r13; the return value is in %r12; all GPRs except %r0, %r1, and + * %r12 are callee-saved; and the return address is in %r0. + */ + stmg %r2,%r5,FRAME_OFF+R2_OFF(%r15) + stg %r14,FRAME_OFF+R14_OFF(%r15) + stg %r0,FRAME_OFF+RETADDR_OFF(%r15) + stg %r15,FRAME_OFF+R15_OFF(%r15) + lgr %r1,%r15 + lay %r15,-FRAME_SIZE(%r15) + stg %r1,__SF_BACKCHAIN(%r15) + + lay %r2,0(%r12,%r13) + brasl %r14,bpf_check_timed_may_goto + lgr %r12,%r2 + + lg %r15,FRAME_SIZE+FRAME_OFF+R15_OFF(%r15) + lmg %r2,%r5,FRAME_OFF+R2_OFF(%r15) + lg %r14,FRAME_OFF+R14_OFF(%r15) + lg %r1,FRAME_OFF+RETADDR_OFF(%r15) + BR_EX %r1 +SYM_FUNC_END(arch_bpf_timed_may_goto) diff --git a/arch/sh/kernel/asm-offsets.c b/arch/sh/kernel/asm-offsets.c index a0322e832845..429b6a763146 100644 --- a/arch/sh/kernel/asm-offsets.c +++ b/arch/sh/kernel/asm-offsets.c @@ -8,6 +8,7 @@ * compile this file to assembler, and then extract the * #defines from the assembly-language output. */ +#define COMPILE_OFFSETS #include <linux/stddef.h> #include <linux/types.h> diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 7b595092cbfb..a630d373e645 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -110,6 +110,8 @@ config SPARC64 select HAVE_SETUP_PER_CPU_AREA select NEED_PER_CPU_EMBED_FIRST_CHUNK select NEED_PER_CPU_PAGE_FIRST_CHUNK + select ARCH_SUPPORTS_SCHED_SMT if SMP + select ARCH_SUPPORTS_SCHED_MC if SMP config ARCH_PROC_KCORE_TEXT def_bool y @@ -288,24 +290,6 @@ if SPARC64 || COMPILE_TEST source "kernel/power/Kconfig" endif -config SCHED_SMT - bool "SMT (Hyperthreading) scheduler support" - depends on SPARC64 && SMP - default y - help - SMT scheduler support improves the CPU scheduler's decision making - when dealing with SPARC cpus at a cost of slightly increased overhead - in some places. If unsure say N here. - -config SCHED_MC - bool "Multi-core scheduler support" - depends on SPARC64 && SMP - default y - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. If unsure say N here. - config CMDLINE_BOOL bool "Default bootloader kernel arguments" depends on SPARC64 diff --git a/arch/sparc/kernel/asm-offsets.c b/arch/sparc/kernel/asm-offsets.c index 3d9b9855dce9..6e660bde48dd 100644 --- a/arch/sparc/kernel/asm-offsets.c +++ b/arch/sparc/kernel/asm-offsets.c @@ -10,6 +10,7 @@ * * On sparc, thread_info data is static and TI_XXX offsets are computed by hand. 
*/ +#define COMPILE_OFFSETS #include <linux/sched.h> #include <linux/mm_types.h> diff --git a/arch/um/kernel/asm-offsets.c b/arch/um/kernel/asm-offsets.c index 1fb12235ab9c..a69873aa697f 100644 --- a/arch/um/kernel/asm-offsets.c +++ b/arch/um/kernel/asm-offsets.c @@ -1 +1,3 @@ +#define COMPILE_OFFSETS + #include <sysdep/kernel-offsets.h> diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index f7fb3d88c57b..36b985d0e7bf 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -3,6 +3,8 @@ # Branch profiling isn't noinstr-safe. Disable it for arch/x86/* subdir-ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING +obj-y += boot/startup/ + obj-$(CONFIG_ARCH_HAS_CC_PLATFORM) += coco/ obj-y += entry/ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 052a5f811203..75f3de70df51 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -14,7 +14,6 @@ config X86_32 select ARCH_WANT_IPC_PARSE_VERSION select CLKSRC_I8253 select CLONE_BACKWARDS - select GENERIC_VDSO_32 select HAVE_DEBUG_STACKOVERFLOW select KMAP_LOCAL select MODULES_USE_ELF_REL @@ -182,8 +181,6 @@ config X86 select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY - select GENERIC_VDSO_DATA_STORE - select GENERIC_VDSO_TIME_NS select GENERIC_VDSO_OVERFLOW_PROTECT select GUP_GET_PXX_LOW_HIGH if X86_PAE select HARDIRQS_SW_RESEND @@ -239,6 +236,7 @@ config X86 select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_EISA if X86_32 select HAVE_EXIT_THREAD + select HAVE_GENERIC_TIF_BITS select HAVE_GUP_FAST select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE select HAVE_FTRACE_GRAPH_FUNC if HAVE_FUNCTION_GRAPH_TRACER @@ -330,6 +328,10 @@ config X86 imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE select ARCH_SUPPORTS_PT_RECLAIM if X86_64 + select ARCH_SUPPORTS_SCHED_SMT if SMP + select SCHED_SMT if SMP + select ARCH_SUPPORTS_SCHED_CLUSTER if SMP + select ARCH_SUPPORTS_SCHED_MC if SMP config INSTRUCTION_DECODER def_bool y @@ -483,6 +485,19 @@ config X86_X2APIC If in doubt, say Y. +config AMD_SECURE_AVIC + bool "AMD Secure AVIC" + depends on AMD_MEM_ENCRYPT && X86_X2APIC + help + Enable this to get AMD Secure AVIC support on guests that have this feature. + + AMD Secure AVIC provides hardware acceleration for performance sensitive + APIC accesses and support for managing guest owned APIC state for SEV-SNP + guests. Secure AVIC does not support xAPIC mode. It has functional + dependency on x2apic being enabled in the guest. + + If you don't know what to do here, say N. + config X86_POSTED_MSI bool "Enable MSI and MSI-x delivery by posted interrupts" depends on X86_64 && IRQ_REMAP @@ -879,6 +894,15 @@ config ACRN_GUEST IOT with small footprint and real-time features. More details can be found in https://projectacrn.org/. +config BHYVE_GUEST + bool "Bhyve (BSD Hypervisor) Guest support" + depends on X86_64 + help + This option allows to run Linux to recognise when it is running as a + guest in the Bhyve hypervisor, and to support more than 255 vCPUs when + when doing so. More details about Bhyve can be found at https://bhyve.org + and https://wiki.freebsd.org/bhyve/. + config INTEL_TDX_GUEST bool "Intel TDX (Trust Domain Extensions) - Guest Support" depends on X86_64 && CPU_SUP_INTEL @@ -1031,29 +1055,6 @@ config NR_CPUS This is purely to save memory: each supported CPU adds about 8KB to the kernel image. 
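Both s390 (in the thread_info.h hunk earlier) and x86 (via the HAVE_GENERIC_TIF_BITS select above) now take the common thread-info flag numbers from asm-generic/thread_info_tif.h and keep only their private bits. A compressed sketch of the pattern, modelled on the s390 hunk; the flag name at the end is illustrative, not an existing kernel symbol.

/* Tell the generic header which optional bits this architecture uses. */
#define HAVE_TIF_NEED_RESCHED_LAZY
#define HAVE_TIF_RESTORE_SIGMASK

#include <asm-generic/thread_info_tif.h>	/* common TIF_* / _TIF_* bits */

/* Arch-private bits live above the generic range (s390 starts them at 16). */
#define TIF_EXAMPLE_TRAP	19			/* hypothetical arch flag */
#define _TIF_EXAMPLE_TRAP	BIT(TIF_EXAMPLE_TRAP)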
-config SCHED_CLUSTER - bool "Cluster scheduler support" - depends on SMP - default y - help - Cluster scheduler support improves the CPU scheduler's decision - making when dealing with machines that have clusters of CPUs. - Cluster usually means a couple of CPUs which are placed closely - by sharing mid-level caches, last-level cache tags or internal - busses. - -config SCHED_SMT - def_bool y if SMP - -config SCHED_MC - def_bool y - prompt "Multi-core scheduler support" - depends on SMP - help - Multi-core scheduler support improves the CPU scheduler's decision - making when dealing with multi-core CPU chips at a cost of slightly - increased overhead in some places. If unsure say N here. - config SCHED_MC_PRIO bool "CPU core priorities scheduler support" depends on SCHED_MC @@ -1340,7 +1341,7 @@ config MICROCODE_LATE_LOADING use this at your own risk. Late loading taints the kernel unless the microcode header indicates that it is safe for late loading via the minimal revision check. This minimal revision check can be enforced on - the kernel command line with "microcode.minrev=Y". + the kernel command line with "microcode=force_minrev". config MICROCODE_LATE_FORCE_MINREV bool "Enforce late microcode loading minimal revision check" @@ -1356,10 +1357,22 @@ config MICROCODE_LATE_FORCE_MINREV revision check fails. This minimal revision check can also be controlled via the - "microcode.minrev" parameter on the kernel command line. + "microcode=force_minrev" parameter on the kernel command line. If unsure say Y. +config MICROCODE_DBG + bool "Enable microcode loader debugging" + default n + depends on MICROCODE + help + Enable code which allows for debugging the microcode loader in + a guest. Meaning the patch loading is simulated but everything else + related to patch parsing and handling is done as on baremetal with + the purpose of debugging solely the software side of things. + + You almost certainly want to say n here. 
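The help texts above switch the documented control from the old "microcode.minrev=Y" spelling to a combined "microcode=force_minrev" command-line option, and MICROCODE_DBG adds a guest-debugging mode for the loader. The parser itself is not part of this diff; the fragment below only illustrates the comma-separated style such an option usually takes, and every identifier in it is hypothetical rather than the loader's real code.

#include <stdbool.h>
#include <string.h>

static bool force_minrev;	/* hypothetical flags, not the loader's state */
static bool dbg;

/* Illustrative parsing of a value such as "force_minrev" or
 * "force_minrev,dbg" handed in via microcode=<options>. */
static void parse_microcode_arg(char *arg)
{
	char *tok;

	for (tok = strtok(arg, ","); tok; tok = strtok(NULL, ",")) {
		if (!strcmp(tok, "force_minrev"))
			force_minrev = true;
		else if (!strcmp(tok, "dbg"))
			dbg = true;
	}
}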
+ config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" help @@ -1753,11 +1766,7 @@ config X86_UMIP config CC_HAS_IBT # GCC >= 9 and binutils >= 2.29 # Retpoline check to work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93654 - # Clang/LLVM >= 14 - # https://github.com/llvm/llvm-project/commit/e0b89df2e0f0130881bf6c39bf31d7f6aac00e0f - # https://github.com/llvm/llvm-project/commit/dfcf69770bc522b9e411c66454934a37c1f35332 - def_bool ((CC_IS_GCC && $(cc-option, -fcf-protection=branch -mindirect-branch-register)) || \ - (CC_IS_CLANG && CLANG_VERSION >= 140000)) && \ + def_bool ((CC_IS_GCC && $(cc-option, -fcf-protection=branch -mindirect-branch-register)) || CC_IS_CLANG) && \ $(as-instr,endbr64) config X86_CET @@ -1769,8 +1778,6 @@ config X86_KERNEL_IBT prompt "Indirect Branch Tracking" def_bool y depends on X86_64 && CC_HAS_IBT && HAVE_OBJTOOL - # https://github.com/llvm/llvm-project/commit/9d7001eba9c4cb311e03cd8cdc231f9e579f2d0f - depends on !LD_IS_LLD || LLD_VERSION >= 140000 select OBJTOOL select X86_CET help diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1913d342969b..4db7e4bf69f5 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -13,8 +13,8 @@ else endif ifdef CONFIG_CC_IS_GCC -RETPOLINE_CFLAGS := $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register) -RETPOLINE_VDSO_CFLAGS := $(call cc-option,-mindirect-branch=thunk-inline -mindirect-branch-register) +RETPOLINE_CFLAGS := -mindirect-branch=thunk-extern -mindirect-branch-register +RETPOLINE_VDSO_CFLAGS := -mindirect-branch=thunk-inline -mindirect-branch-register endif ifdef CONFIG_CC_IS_CLANG RETPOLINE_CFLAGS := -mretpoline-external-thunk @@ -37,10 +37,11 @@ export RETPOLINE_VDSO_CFLAGS # For gcc stack alignment is specified with -mpreferred-stack-boundary, # clang has the option -mstack-alignment for that purpose. -ifneq ($(call cc-option, -mpreferred-stack-boundary=4),) +ifdef CONFIG_CC_IS_GCC cc_stack_align4 := -mpreferred-stack-boundary=2 cc_stack_align8 := -mpreferred-stack-boundary=3 -else ifneq ($(call cc-option, -mstack-alignment=16),) +endif +ifdef CONFIG_CC_IS_CLANG cc_stack_align4 := -mstack-alignment=4 cc_stack_align8 := -mstack-alignment=8 endif @@ -83,19 +84,7 @@ KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-av # CC_FLAGS_FPU := -msse -msse2 ifdef CONFIG_CC_IS_GCC -# Stack alignment mismatch, proceed with caution. -# GCC < 7.1 cannot compile code using `double` and -mpreferred-stack-boundary=3 -# (8B stack alignment). -# See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 -# -# The "-msse" in the first argument is there so that the -# -mpreferred-stack-boundary=3 build error: -# -# -mpreferred-stack-boundary=3 is not between 4 and 12 -# -# can be triggered. Otherwise gcc doesn't complain. CC_FLAGS_FPU += -mhard-float -CC_FLAGS_FPU += $(call cc-option,-msse -mpreferred-stack-boundary=3,-mpreferred-stack-boundary=4) endif ifeq ($(CONFIG_X86_KERNEL_IBT),y) @@ -159,7 +148,7 @@ else # Don't autogenerate traditional x87 instructions KBUILD_CFLAGS += -mno-80387 - KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387) + KBUILD_CFLAGS += -mno-fp-ret-in-387 # By default gcc and clang use a stack alignment of 16 bytes for x86. # However the standard kernel entry on x86-64 leaves the stack on an @@ -171,7 +160,7 @@ else KBUILD_CFLAGS += $(cc_stack_align8) # Use -mskip-rax-setup if supported. 
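CC_HAS_IBT above now reduces to the GCC option probe plus CC_IS_CLANG, and the LLD version check under X86_KERNEL_IBT is gone. What the probe tests is the compiler's ability to plant ENDBR landing pads at indirect-branch targets when -fcf-protection=branch (or the Clang equivalent) is in effect. A small stand-alone illustration; compiled with that flag, the generated code for add_one() should begin with endbr64.

/* Taking a function's address makes it an indirect-branch target, so the
 * compiler emits an endbr64 landing pad at its entry under IBT. */
typedef int (*handler_t)(int);

static int add_one(int x)
{
	return x + 1;
}

int call_via_pointer(int x)
{
	handler_t fn = add_one;	/* address taken: add_one gets a landing pad */

	return fn(x);		/* indirect call checked by the hardware */
}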
- KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) + KBUILD_CFLAGS += -mskip-rax-setup ifdef CONFIG_X86_NATIVE_CPU KBUILD_CFLAGS += -march=native @@ -286,7 +275,6 @@ archprepare: $(cpufeaturemasks.hdr) ### # Kernel objects -core-y += arch/x86/boot/startup/ libs-y += arch/x86/lib/ # drivers-y are linked after core-y diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 3a38fdcdb9bd..74657589264d 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -73,7 +73,7 @@ LDFLAGS_vmlinux += -T hostprogs := mkpiggy HOST_EXTRACFLAGS += -I$(srctree)/tools/include -sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABbCDGRSTtVW] \(_text\|__start_rodata\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' +sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABbCDGRSTtVW] \(_text\|__start_rodata\|_sinittext\|__inittext_end\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' quiet_cmd_voffset = VOFFSET $@ cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 94b5991da001..0f41ca0e52c0 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -332,6 +332,8 @@ static size_t parse_elf(void *output) } const unsigned long kernel_text_size = VO___start_rodata - VO__text; +const unsigned long kernel_inittext_offset = VO__sinittext - VO__text; +const unsigned long kernel_inittext_size = VO___inittext_end - VO__sinittext; const unsigned long kernel_total_size = VO__end - VO__text; static u8 boot_heap[BOOT_HEAP_SIZE] __aligned(4); diff --git a/arch/x86/boot/compressed/sev-handle-vc.c b/arch/x86/boot/compressed/sev-handle-vc.c index 89dd02de2a0f..7530ad8b768b 100644 --- a/arch/x86/boot/compressed/sev-handle-vc.c +++ b/arch/x86/boot/compressed/sev-handle-vc.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "misc.h" +#include "error.h" #include "sev.h" #include <linux/kernel.h> @@ -14,6 +15,8 @@ #include <asm/fpu/xcr.h> #define __BOOT_COMPRESSED +#undef __init +#define __init /* Basic instruction decoding support needed */ #include "../../lib/inat.c" diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index fd1b67dfea22..6e5c32a53d03 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -32,102 +32,47 @@ struct ghcb *boot_ghcb; #undef __init #define __init -#undef __head -#define __head - #define __BOOT_COMPRESSED -extern struct svsm_ca *boot_svsm_caa; -extern u64 boot_svsm_caa_pa; - -struct svsm_ca *svsm_get_caa(void) -{ - return boot_svsm_caa; -} - -u64 svsm_get_caa_pa(void) -{ - return boot_svsm_caa_pa; -} - -int svsm_perform_call_protocol(struct svsm_call *call); - u8 snp_vmpl; +u16 ghcb_version; + +u64 boot_svsm_caa_pa; /* Include code for early handlers */ #include "../../boot/startup/sev-shared.c" -int svsm_perform_call_protocol(struct svsm_call *call) -{ - struct ghcb *ghcb; - int ret; - - if (boot_ghcb) - ghcb = boot_ghcb; - else - ghcb = NULL; - - do { - ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call) - : svsm_perform_msr_protocol(call); - } while (ret == -EAGAIN); - - return ret; -} - static bool sev_snp_enabled(void) { return sev_status & MSR_AMD64_SEV_SNP_ENABLED; } -static void __page_state_change(unsigned long paddr, enum psc_op op) -{ - u64 val, msr; - - /* - * If private -> shared then invalidate the page before requesting the - * state change in the RMP table. 
- */ - if (op == SNP_PAGE_STATE_SHARED) - pvalidate_4k_page(paddr, paddr, false); - - /* Save the current GHCB MSR value */ - msr = sev_es_rd_ghcb_msr(); - - /* Issue VMGEXIT to change the page state in RMP table. */ - sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); - VMGEXIT(); - - /* Read the response of the VMGEXIT. */ - val = sev_es_rd_ghcb_msr(); - if ((GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) || GHCB_MSR_PSC_RESP_VAL(val)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); - - /* Restore the GHCB MSR value */ - sev_es_wr_ghcb_msr(msr); - - /* - * Now that page state is changed in the RMP table, validate it so that it is - * consistent with the RMP entry. - */ - if (op == SNP_PAGE_STATE_PRIVATE) - pvalidate_4k_page(paddr, paddr, true); -} - void snp_set_page_private(unsigned long paddr) { + struct psc_desc d = { + SNP_PAGE_STATE_PRIVATE, + (struct svsm_ca *)boot_svsm_caa_pa, + boot_svsm_caa_pa + }; + if (!sev_snp_enabled()) return; - __page_state_change(paddr, SNP_PAGE_STATE_PRIVATE); + __page_state_change(paddr, paddr, &d); } void snp_set_page_shared(unsigned long paddr) { + struct psc_desc d = { + SNP_PAGE_STATE_SHARED, + (struct svsm_ca *)boot_svsm_caa_pa, + boot_svsm_caa_pa + }; + if (!sev_snp_enabled()) return; - __page_state_change(paddr, SNP_PAGE_STATE_SHARED); + __page_state_change(paddr, paddr, &d); } bool early_setup_ghcb(void) @@ -152,8 +97,14 @@ bool early_setup_ghcb(void) void snp_accept_memory(phys_addr_t start, phys_addr_t end) { + struct psc_desc d = { + SNP_PAGE_STATE_PRIVATE, + (struct svsm_ca *)boot_svsm_caa_pa, + boot_svsm_caa_pa + }; + for (phys_addr_t pa = start; pa < end; pa += PAGE_SIZE) - __page_state_change(pa, SNP_PAGE_STATE_PRIVATE); + __page_state_change(pa, pa, &d); } void sev_es_shutdown_ghcb(void) @@ -235,15 +186,23 @@ bool sev_es_check_ghcb_fault(unsigned long address) MSR_AMD64_SNP_VMSA_REG_PROT | \ MSR_AMD64_SNP_RESERVED_BIT13 | \ MSR_AMD64_SNP_RESERVED_BIT15 | \ + MSR_AMD64_SNP_SECURE_AVIC | \ MSR_AMD64_SNP_RESERVED_MASK) +#ifdef CONFIG_AMD_SECURE_AVIC +#define SNP_FEATURE_SECURE_AVIC MSR_AMD64_SNP_SECURE_AVIC +#else +#define SNP_FEATURE_SECURE_AVIC 0 +#endif + /* * SNP_FEATURES_PRESENT is the mask of SNP features that are implemented * by the guest kernel. As and when a new feature is implemented in the * guest kernel, a corresponding bit should be added to the mask. */ #define SNP_FEATURES_PRESENT (MSR_AMD64_SNP_DEBUG_SWAP | \ - MSR_AMD64_SNP_SECURE_TSC) + MSR_AMD64_SNP_SECURE_TSC | \ + SNP_FEATURE_SECURE_AVIC) u64 snp_get_unsupported_features(u64 status) { @@ -347,7 +306,7 @@ static bool early_snp_init(struct boot_params *bp) * running at VMPL0. The CA will be used to communicate with the * SVSM and request its services. */ - svsm_setup_ca(cc_info); + svsm_setup_ca(cc_info, rip_rel_ptr(&boot_ghcb_page)); /* * Pass run-time kernel a pointer to CC info via boot_params so EFI @@ -391,6 +350,8 @@ static int sev_check_cpu_support(void) if (!(eax & BIT(1))) return -ENODEV; + sev_snp_needs_sfw = !(ebx & BIT(31)); + return ebx & 0x3f; } @@ -453,30 +414,16 @@ void sev_enable(struct boot_params *bp) */ if (sev_status & MSR_AMD64_SEV_SNP_ENABLED) { u64 hv_features; - int ret; hv_features = get_hv_features(); if (!(hv_features & GHCB_HV_FT_SNP)) sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); /* - * Enforce running at VMPL0 or with an SVSM. - * - * Use RMPADJUST (see the rmpadjust() function for a description of - * what the instruction does) to update the VMPL1 permissions of a - * page. 
If the guest is running at VMPL0, this will succeed. If the - * guest is running at any other VMPL, this will fail. Linux SNP guests - * only ever run at a single VMPL level so permission mask changes of a - * lesser-privileged VMPL are a don't-care. - */ - ret = rmpadjust((unsigned long)&boot_ghcb_page, RMP_PG_SIZE_4K, 1); - - /* - * Running at VMPL0 is not required if an SVSM is present and the hypervisor - * supports the required SVSM GHCB events. + * Running at VMPL0 is required unless an SVSM is present and + * the hypervisor supports the required SVSM GHCB events. */ - if (ret && - !(snp_vmpl && (hv_features & GHCB_HV_FT_SNP_MULTI_VMPL))) + if (snp_vmpl && !(hv_features & GHCB_HV_FT_SNP_MULTI_VMPL)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0); } @@ -550,7 +497,6 @@ bool early_is_sevsnp_guest(void) /* Obtain the address of the calling area to use */ boot_rdmsr(MSR_SVSM_CAA, &m); - boot_svsm_caa = (void *)m.q; boot_svsm_caa_pa = m.q; /* diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c index 63e037e94e4c..916bac09b464 100644 --- a/arch/x86/boot/cpuflags.c +++ b/arch/x86/boot/cpuflags.c @@ -106,18 +106,5 @@ void get_cpuflags(void) cpuid(0x80000001, &ignored, &ignored, &cpu.flags[6], &cpu.flags[1]); } - - if (max_amd_level >= 0x8000001f) { - u32 ebx; - - /* - * The X86_FEATURE_COHERENCY_SFW_NO feature bit is in - * the virtualization flags entry (word 8) and set by - * scattered.c, so the bit needs to be explicitly set. - */ - cpuid(0x8000001f, &ignored, &ebx, &ignored, &ignored); - if (ebx & BIT(31)) - set_bit(X86_FEATURE_COHERENCY_SFW_NO, cpu.flags); - } } } diff --git a/arch/x86/boot/startup/Makefile b/arch/x86/boot/startup/Makefile index b514f7e81332..e8fdf020b422 100644 --- a/arch/x86/boot/startup/Makefile +++ b/arch/x86/boot/startup/Makefile @@ -4,6 +4,7 @@ KBUILD_AFLAGS += -D__DISABLE_EXPORTS KBUILD_CFLAGS += -D__DISABLE_EXPORTS -mcmodel=small -fPIC \ -Os -DDISABLE_BRANCH_PROFILING \ $(DISABLE_STACKLEAK_PLUGIN) \ + $(DISABLE_LATENT_ENTROPY_PLUGIN) \ -fno-stack-protector -D__NO_FORTIFY \ -fno-jump-tables \ -include $(srctree)/include/linux/hidden.h @@ -19,6 +20,7 @@ KCOV_INSTRUMENT := n obj-$(CONFIG_X86_64) += gdt_idt.o map_kernel.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += sme.o sev-startup.o +pi-objs := $(patsubst %.o,$(obj)/%.o,$(obj-y)) lib-$(CONFIG_X86_64) += la57toggle.o lib-$(CONFIG_EFI_MIXED) += efi-mixed.o @@ -28,3 +30,23 @@ lib-$(CONFIG_EFI_MIXED) += efi-mixed.o # to be linked into the decompressor or the EFI stub but not vmlinux # $(patsubst %.o,$(obj)/%.o,$(lib-y)): OBJECT_FILES_NON_STANDARD := y + +# +# Invoke objtool for each object individually to check for absolute +# relocations, even if other objtool actions are being deferred. +# +$(pi-objs): objtool-enabled = 1 +$(pi-objs): objtool-args = $(if $(delay-objtool),,$(objtool-args-y)) --noabs + +# +# Confine the startup code by prefixing all symbols with __pi_ (for position +# independent). This ensures that startup code can only call other startup +# code, or code that has explicitly been made accessible to it via a symbol +# alias. 
+# +$(obj)/%.pi.o: OBJCOPYFLAGS := --prefix-symbols=__pi_ +$(obj)/%.pi.o: $(obj)/%.o FORCE + $(call if_changed,objcopy) + +targets += $(obj-y) +obj-y := $(patsubst %.o,%.pi.o,$(obj-y)) diff --git a/arch/x86/boot/startup/exports.h b/arch/x86/boot/startup/exports.h new file mode 100644 index 000000000000..01d2363dc445 --- /dev/null +++ b/arch/x86/boot/startup/exports.h @@ -0,0 +1,14 @@ + +/* + * The symbols below are functions that are implemented by the startup code, + * but called at runtime by the SEV code residing in the core kernel. + */ +PROVIDE(early_set_pages_state = __pi_early_set_pages_state); +PROVIDE(early_snp_set_memory_private = __pi_early_snp_set_memory_private); +PROVIDE(early_snp_set_memory_shared = __pi_early_snp_set_memory_shared); +PROVIDE(get_hv_features = __pi_get_hv_features); +PROVIDE(sev_es_terminate = __pi_sev_es_terminate); +PROVIDE(snp_cpuid = __pi_snp_cpuid); +PROVIDE(snp_cpuid_get_table = __pi_snp_cpuid_get_table); +PROVIDE(svsm_issue_call = __pi_svsm_issue_call); +PROVIDE(svsm_process_result_codes = __pi_svsm_process_result_codes); diff --git a/arch/x86/boot/startup/gdt_idt.c b/arch/x86/boot/startup/gdt_idt.c index a3112a69b06a..d16102abdaec 100644 --- a/arch/x86/boot/startup/gdt_idt.c +++ b/arch/x86/boot/startup/gdt_idt.c @@ -24,7 +24,7 @@ static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data; /* This may run while still in the direct mapping */ -void __head startup_64_load_idt(void *vc_handler) +void startup_64_load_idt(void *vc_handler) { struct desc_ptr desc = { .address = (unsigned long)rip_rel_ptr(bringup_idt_table), @@ -46,7 +46,7 @@ void __head startup_64_load_idt(void *vc_handler) /* * Setup boot CPU state needed before kernel switches to virtual addresses. */ -void __head startup_64_setup_gdt_idt(void) +void __init startup_64_setup_gdt_idt(void) { struct gdt_page *gp = rip_rel_ptr((void *)(__force unsigned long)&gdt_page); void *handler = NULL; diff --git a/arch/x86/boot/startup/map_kernel.c b/arch/x86/boot/startup/map_kernel.c index 332dbe6688c4..83ba98d61572 100644 --- a/arch/x86/boot/startup/map_kernel.c +++ b/arch/x86/boot/startup/map_kernel.c @@ -30,7 +30,7 @@ static inline bool check_la57_support(void) return true; } -static unsigned long __head sme_postprocess_startup(struct boot_params *bp, +static unsigned long __init sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd, unsigned long p2v_offset) { @@ -84,7 +84,7 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, * the 1:1 mapping of memory. Kernel virtual addresses can be determined by * subtracting p2v_offset from the RIP-relative address. */ -unsigned long __head __startup_64(unsigned long p2v_offset, +unsigned long __init __startup_64(unsigned long p2v_offset, struct boot_params *bp) { pmd_t (*early_pgts)[PTRS_PER_PMD] = rip_rel_ptr(early_dynamic_pgts); diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index a34cd19796f9..4e22ffd73516 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -12,35 +12,12 @@ #include <asm/setup_data.h> #ifndef __BOOT_COMPRESSED -#define error(v) pr_err(v) #define has_cpuflag(f) boot_cpu_has(f) #else #undef WARN #define WARN(condition, format...) (!!(condition)) -#undef vc_forward_exception -#define vc_forward_exception(c) panic("SNP: Hypervisor requested exception\n") #endif -/* - * SVSM related information: - * During boot, the page tables are set up as identity mapped and later - * changed to use kernel virtual addresses. 
Maintain separate virtual and - * physical addresses for the CAA to allow SVSM functions to be used during - * early boot, both with identity mapped virtual addresses and proper kernel - * virtual addresses. - */ -struct svsm_ca *boot_svsm_caa __ro_after_init; -u64 boot_svsm_caa_pa __ro_after_init; - -/* - * Since feature negotiation related variables are set early in the boot - * process they must reside in the .data section so as not to be zeroed - * out when the .bss section is later cleared. - * - * GHCB protocol version negotiated with the hypervisor. - */ -static u16 ghcb_version __ro_after_init; - /* Copy of the SNP firmware's CPUID page. */ static struct snp_cpuid_table cpuid_table_copy __ro_after_init; @@ -54,17 +31,9 @@ static u32 cpuid_std_range_max __ro_after_init; static u32 cpuid_hyp_range_max __ro_after_init; static u32 cpuid_ext_range_max __ro_after_init; -bool __init sev_es_check_cpu_features(void) -{ - if (!has_cpuflag(X86_FEATURE_RDRAND)) { - error("RDRAND instruction not supported - no trusted source of randomness available\n"); - return false; - } +bool sev_snp_needs_sfw; - return true; -} - -void __head __noreturn +void __noreturn sev_es_terminate(unsigned int set, unsigned int reason) { u64 val = GHCB_MSR_TERM_REQ; @@ -83,7 +52,7 @@ sev_es_terminate(unsigned int set, unsigned int reason) /* * The hypervisor features are available from GHCB version 2 onward. */ -u64 get_hv_features(void) +u64 __init get_hv_features(void) { u64 val; @@ -100,72 +69,7 @@ u64 get_hv_features(void) return GHCB_MSR_HV_FT_RESP_VAL(val); } -void snp_register_ghcb_early(unsigned long paddr) -{ - unsigned long pfn = paddr >> PAGE_SHIFT; - u64 val; - - sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn)); - VMGEXIT(); - - val = sev_es_rd_ghcb_msr(); - - /* If the response GPA is not ours then abort the guest */ - if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) || - (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER); -} - -bool sev_es_negotiate_protocol(void) -{ - u64 val; - - /* Do the GHCB protocol version negotiation */ - sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - - if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP) - return false; - - if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN || - GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX) - return false; - - ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX); - - return true; -} - -static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - u32 ret; - - ret = ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0); - if (!ret) - return ES_OK; - - if (ret == 1) { - u64 info = ghcb->save.sw_exit_info_2; - unsigned long v = info & SVM_EVTINJ_VEC_MASK; - - /* Check if exception information from hypervisor is sane. */ - if ((info & SVM_EVTINJ_VALID) && - ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) && - ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) { - ctxt->fi.vector = v; - - if (info & SVM_EVTINJ_VALID_ERR) - ctxt->fi.error_code = info >> 32; - - return ES_EXCEPTION; - } - } - - return ES_VMM_ERROR; -} - -static inline int svsm_process_result_codes(struct svsm_call *call) +int svsm_process_result_codes(struct svsm_call *call) { switch (call->rax_out) { case SVSM_SUCCESS: @@ -193,7 +97,7 @@ static inline int svsm_process_result_codes(struct svsm_call *call) * - RAX specifies the SVSM protocol/callid as input and the return code * as output. 
*/ -static __always_inline void svsm_issue_call(struct svsm_call *call, u8 *pending) +void svsm_issue_call(struct svsm_call *call, u8 *pending) { register unsigned long rax asm("rax") = call->rax; register unsigned long rcx asm("rcx") = call->rcx; @@ -216,7 +120,7 @@ static __always_inline void svsm_issue_call(struct svsm_call *call, u8 *pending) call->r9_out = r9; } -static int svsm_perform_msr_protocol(struct svsm_call *call) +int svsm_perform_msr_protocol(struct svsm_call *call) { u8 pending = 0; u64 val, resp; @@ -247,63 +151,6 @@ static int svsm_perform_msr_protocol(struct svsm_call *call) return svsm_process_result_codes(call); } -static int svsm_perform_ghcb_protocol(struct ghcb *ghcb, struct svsm_call *call) -{ - struct es_em_ctxt ctxt; - u8 pending = 0; - - vc_ghcb_invalidate(ghcb); - - /* - * Fill in protocol and format specifiers. This can be called very early - * in the boot, so use rip-relative references as needed. - */ - ghcb->protocol_version = ghcb_version; - ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; - - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_SNP_RUN_VMPL); - ghcb_set_sw_exit_info_1(ghcb, 0); - ghcb_set_sw_exit_info_2(ghcb, 0); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - - svsm_issue_call(call, &pending); - - if (pending) - return -EINVAL; - - switch (verify_exception_info(ghcb, &ctxt)) { - case ES_OK: - break; - case ES_EXCEPTION: - vc_forward_exception(&ctxt); - fallthrough; - default: - return -EINVAL; - } - - return svsm_process_result_codes(call); -} - -enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, - struct es_em_ctxt *ctxt, - u64 exit_code, u64 exit_info_1, - u64 exit_info_2) -{ - /* Fill in protocol and format specifiers */ - ghcb->protocol_version = ghcb_version; - ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; - - ghcb_set_sw_exit_code(ghcb, exit_code); - ghcb_set_sw_exit_info_1(ghcb, exit_info_1); - ghcb_set_sw_exit_info_2(ghcb, exit_info_2); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); - - return verify_exception_info(ghcb, ctxt); -} - static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg) { u64 val; @@ -342,44 +189,7 @@ static int __sev_cpuid_hv_msr(struct cpuid_leaf *leaf) return ret; } -static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) -{ - u32 cr4 = native_read_cr4(); - int ret; - - ghcb_set_rax(ghcb, leaf->fn); - ghcb_set_rcx(ghcb, leaf->subfn); - - if (cr4 & X86_CR4_OSXSAVE) - /* Safe to read xcr0 */ - ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); - else - /* xgetbv will cause #UD - use reset value for xcr0 */ - ghcb_set_xcr0(ghcb, 1); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && - ghcb_rbx_is_valid(ghcb) && - ghcb_rcx_is_valid(ghcb) && - ghcb_rdx_is_valid(ghcb))) - return ES_VMM_ERROR; - leaf->eax = ghcb->save.rax; - leaf->ebx = ghcb->save.rbx; - leaf->ecx = ghcb->save.rcx; - leaf->edx = ghcb->save.rdx; - - return ES_OK; -} - -static int sev_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) -{ - return ghcb ? __sev_cpuid_hv_ghcb(ghcb, ctxt, leaf) - : __sev_cpuid_hv_msr(leaf); -} /* * This may be called early while still running on the initial identity @@ -412,7 +222,7 @@ const struct snp_cpuid_table *snp_cpuid_get_table(void) * * Return: XSAVE area size on success, 0 otherwise. 
*/ -static u32 __head snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) +static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) { const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); u64 xfeatures_found = 0; @@ -448,7 +258,7 @@ static u32 __head snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) return xsave_size; } -static bool __head +static bool snp_cpuid_get_validated_func(struct cpuid_leaf *leaf) { const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); @@ -484,21 +294,21 @@ snp_cpuid_get_validated_func(struct cpuid_leaf *leaf) return false; } -static void snp_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) +static void snp_cpuid_hv_msr(void *ctx, struct cpuid_leaf *leaf) { - if (sev_cpuid_hv(ghcb, ctxt, leaf)) + if (__sev_cpuid_hv_msr(leaf)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); } -static int __head -snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, - struct cpuid_leaf *leaf) +static int +snp_cpuid_postprocess(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), + void *ctx, struct cpuid_leaf *leaf) { struct cpuid_leaf leaf_hv = *leaf; switch (leaf->fn) { case 0x1: - snp_cpuid_hv(ghcb, ctxt, &leaf_hv); + cpuid_fn(ctx, &leaf_hv); /* initial APIC ID */ leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0)); @@ -517,7 +327,7 @@ snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, break; case 0xB: leaf_hv.subfn = 0; - snp_cpuid_hv(ghcb, ctxt, &leaf_hv); + cpuid_fn(ctx, &leaf_hv); /* extended APIC ID */ leaf->edx = leaf_hv.edx; @@ -565,7 +375,7 @@ snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, } break; case 0x8000001E: - snp_cpuid_hv(ghcb, ctxt, &leaf_hv); + cpuid_fn(ctx, &leaf_hv); /* extended APIC ID */ leaf->eax = leaf_hv.eax; @@ -586,8 +396,8 @@ snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value * should be treated as fatal by caller. */ -int __head -snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) +int snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), + void *ctx, struct cpuid_leaf *leaf) { const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); @@ -621,7 +431,7 @@ snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) return 0; } - return snp_cpuid_postprocess(ghcb, ctxt, leaf); + return snp_cpuid_postprocess(cpuid_fn, ctx, leaf); } /* @@ -629,7 +439,7 @@ snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) * page yet, so it only supports the MSR based communication with the * hypervisor and only the CPUID exit-code. */ -void __head do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) +void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) { unsigned int subfn = lower_bits(regs->cx, 32); unsigned int fn = lower_bits(regs->ax, 32); @@ -648,13 +458,24 @@ void __head do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) leaf.fn = fn; leaf.subfn = subfn; - ret = snp_cpuid(NULL, NULL, &leaf); + /* + * If SNP is active, then snp_cpuid() uses the CPUID table to obtain the + * CPUID values (with possible HV interaction during post-processing of + * the values). But if SNP is not active (no CPUID table present), then + * snp_cpuid() returns -EOPNOTSUPP so that an SEV-ES guest can call the + * HV to obtain the CPUID information. 
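
The snp_cpuid() hunks in this file replace the (ghcb, ctxt) parameters with a cpuid_fn callback plus an opaque context pointer, so the same post-processing logic can be driven either by the MSR protocol in early boot or by a GHCB-based helper in the #VC path. A minimal sketch of that callback-injection pattern, with toy types standing in for the kernel's structures:

/* Toy stand-ins only; not the kernel's definitions. */
struct toy_cpuid_leaf {
	unsigned int fn, subfn;
	unsigned int eax, ebx, ecx, edx;
};

typedef void (*toy_cpuid_fn)(void *ctx, struct toy_cpuid_leaf *leaf);

/* Early-boot flavour: no context, talks to the HV via the MSR protocol. */
static void toy_cpuid_msr(void *ctx, struct toy_cpuid_leaf *leaf)
{
	(void)ctx;
	(void)leaf;	/* ... one MSR-protocol request per register ... */
}

/* Runtime flavour: the context carries the GHCB and exception state. */
struct toy_cpuid_ctx { void *ghcb; void *ctxt; };

static void toy_cpuid_ghcb(void *ctx, struct toy_cpuid_leaf *leaf)
{
	struct toy_cpuid_ctx *c = ctx;

	(void)c;
	(void)leaf;	/* ... fill the GHCB and VMGEXIT ... */
}

/* Post-processing only ever sees the callback, never the transport. */
static int toy_snp_cpuid(toy_cpuid_fn fn, void *ctx, struct toy_cpuid_leaf *leaf)
{
	struct toy_cpuid_leaf hv_leaf = *leaf;

	fn(ctx, &hv_leaf);		/* consult the hypervisor where needed */
	leaf->ebx = hv_leaf.ebx;	/* e.g. merge APIC ID bits */
	return 0;
}

/*
 * Usage, mirroring the two call sites:
 *   early boot:  toy_snp_cpuid(toy_cpuid_msr, NULL, &leaf);
 *   #VC handler: struct toy_cpuid_ctx c = { ghcb, ctxt };
 *                toy_snp_cpuid(toy_cpuid_ghcb, &c, &leaf);
 */
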
+ */ + ret = snp_cpuid(snp_cpuid_hv_msr, NULL, &leaf); if (!ret) goto cpuid_done; if (ret != -EOPNOTSUPP) goto fail; + /* + * This is reached by a SEV-ES guest and needs to invoke the HV for + * the CPUID data. + */ if (__sev_cpuid_hv_msr(&leaf)) goto fail; @@ -705,7 +526,7 @@ struct cc_setup_data { * Search for a Confidential Computing blob passed in as a setup_data entry * via the Linux Boot Protocol. */ -static __head +static __init struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) { struct cc_setup_data *sd = NULL; @@ -733,7 +554,7 @@ struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) * mapping needs to be updated in sync with all the changes to virtual memory * layout and related mapping facilities throughout the boot process. */ -static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info) +static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info) { const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table; int i; @@ -761,13 +582,24 @@ static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info) } } -static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) +static int svsm_call_msr_protocol(struct svsm_call *call) +{ + int ret; + + do { + ret = svsm_perform_msr_protocol(call); + } while (ret == -EAGAIN); + + return ret; +} + +static void svsm_pval_4k_page(unsigned long paddr, bool validate, + struct svsm_ca *caa, u64 caa_pa) { struct svsm_pvalidate_call *pc; struct svsm_call call = {}; unsigned long flags; u64 pc_pa; - int ret; /* * This can be called very early in the boot, use native functions in @@ -775,10 +607,10 @@ static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) */ flags = native_local_irq_save(); - call.caa = svsm_get_caa(); + call.caa = caa; pc = (struct svsm_pvalidate_call *)call.caa->svsm_buffer; - pc_pa = svsm_get_caa_pa() + offsetof(struct svsm_ca, svsm_buffer); + pc_pa = caa_pa + offsetof(struct svsm_ca, svsm_buffer); pc->num_entries = 1; pc->cur_index = 0; @@ -792,20 +624,24 @@ static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) call.rax = SVSM_CORE_CALL(SVSM_CORE_PVALIDATE); call.rcx = pc_pa; - ret = svsm_perform_call_protocol(&call); - if (ret) + /* + * Use the MSR protocol exclusively, so that this code is usable in + * startup code where VA/PA translations of the GHCB page's address may + * be problematic. + */ + if (svsm_call_msr_protocol(&call)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); native_local_irq_restore(flags); } -static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, - bool validate) +static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, + bool validate, struct svsm_ca *caa, u64 caa_pa) { int ret; if (snp_vmpl) { - svsm_pval_4k_page(paddr, validate); + svsm_pval_4k_page(paddr, validate, caa, caa_pa); } else { ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); if (ret) @@ -816,15 +652,51 @@ static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, * If validating memory (making it private) and affected by the * cache-coherency vulnerability, perform the cache eviction mitigation. 
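
pvalidate_4k_page() now receives the calling area and its physical address explicitly, and (as the hunk continues below) the has_cpuflag() check is replaced by the sev_snp_needs_sfw flag latched from CPUID in sme_enable(). The sketch below illustrates the dispatch only: delegate to the SVSM when not running at VMPL0, otherwise issue PVALIDATE directly, then apply the eviction mitigation when making a page private. All names are toy stand-ins, not the kernel's.

#include <stdbool.h>

static int  toy_vmpl;		/* non-zero: an SVSM owns VMPL0 */
static bool toy_needs_sfw;	/* latched from CPUID during SME/SEV setup */

static void toy_svsm_pvalidate(unsigned long paddr, bool validate) { (void)paddr; (void)validate; }
static int  toy_pvalidate(unsigned long vaddr, bool validate) { (void)vaddr; (void)validate; return 0; }
static void toy_evict_cache(void *va) { (void)va; }
static void toy_terminate(void) { for (;;) ; }

static void toy_pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, bool validate)
{
	if (toy_vmpl) {
		/* Not at VMPL0: ask the SVSM to (in)validate on our behalf. */
		toy_svsm_pvalidate(paddr, validate);
	} else {
		if (toy_pvalidate(vaddr, validate))
			toy_terminate();
	}

	/* The mitigation only matters when making the page private. */
	if (validate && toy_needs_sfw)
		toy_evict_cache((void *)vaddr);
}
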
*/ - if (validate && !has_cpuflag(X86_FEATURE_COHERENCY_SFW_NO)) + if (validate && sev_snp_needs_sfw) sev_evict_cache((void *)vaddr, 1); } +static void __page_state_change(unsigned long vaddr, unsigned long paddr, + const struct psc_desc *desc) +{ + u64 val, msr; + + /* + * If private -> shared then invalidate the page before requesting the + * state change in the RMP table. + */ + if (desc->op == SNP_PAGE_STATE_SHARED) + pvalidate_4k_page(vaddr, paddr, false, desc->ca, desc->caa_pa); + + /* Save the current GHCB MSR value */ + msr = sev_es_rd_ghcb_msr(); + + /* Issue VMGEXIT to change the page state in RMP table. */ + sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, desc->op)); + VMGEXIT(); + + /* Read the response of the VMGEXIT. */ + val = sev_es_rd_ghcb_msr(); + if ((GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) || GHCB_MSR_PSC_RESP_VAL(val)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); + + /* Restore the GHCB MSR value */ + sev_es_wr_ghcb_msr(msr); + + /* + * Now that page state is changed in the RMP table, validate it so that it is + * consistent with the RMP entry. + */ + if (desc->op == SNP_PAGE_STATE_PRIVATE) + pvalidate_4k_page(vaddr, paddr, true, desc->ca, desc->caa_pa); +} + /* * Maintain the GPA of the SVSM Calling Area (CA) in order to utilize the SVSM * services needed when not running in VMPL0. */ -static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info) +static bool __init svsm_setup_ca(const struct cc_blob_sev_info *cc_info, + void *page) { struct snp_secrets_page *secrets_page; struct snp_cpuid_table *cpuid_table; @@ -847,7 +719,7 @@ static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info) * routine is running identity mapped when called, both by the decompressor * code and the early kernel code. */ - if (!rmpadjust((unsigned long)rip_rel_ptr(&boot_ghcb_page), RMP_PG_SIZE_4K, 1)) + if (!rmpadjust((unsigned long)page, RMP_PG_SIZE_4K, 1)) return false; /* @@ -875,11 +747,6 @@ static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info) if (caa & (PAGE_SIZE - 1)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CAA); - /* - * The CA is identity mapped when this routine is called, both by the - * decompressor code and the early kernel code. - */ - boot_svsm_caa = (struct svsm_ca *)caa; boot_svsm_caa_pa = caa; /* Advertise the SVSM presence via CPUID. */ diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c index 0b7e3b950183..09725428d3e6 100644 --- a/arch/x86/boot/startup/sev-startup.c +++ b/arch/x86/boot/startup/sev-startup.c @@ -41,143 +41,14 @@ #include <asm/cpuid/api.h> #include <asm/cmdline.h> -/* For early boot hypervisor communication in SEV-ES enabled guests */ -struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); - -/* - * Needs to be in the .data section because we need it NULL before bss is - * cleared - */ -struct ghcb *boot_ghcb __section(".data"); - -/* Bitmap of SEV features supported by the hypervisor */ -u64 sev_hv_features __ro_after_init; - -/* Secrets page physical address from the CC blob */ -u64 sev_secrets_pa __ro_after_init; - -/* For early boot SVSM communication */ -struct svsm_ca boot_svsm_ca_page __aligned(PAGE_SIZE); - -DEFINE_PER_CPU(struct svsm_ca *, svsm_caa); -DEFINE_PER_CPU(u64, svsm_caa_pa); - -/* - * Nothing shall interrupt this code path while holding the per-CPU - * GHCB. The backup GHCB is only for NMIs interrupting this path. - * - * Callers must disable local interrupts around it. 
- */ -noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) -{ - struct sev_es_runtime_data *data; - struct ghcb *ghcb; - - WARN_ON(!irqs_disabled()); - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - if (unlikely(data->ghcb_active)) { - /* GHCB is already in use - save its contents */ - - if (unlikely(data->backup_ghcb_active)) { - /* - * Backup-GHCB is also already in use. There is no way - * to continue here so just kill the machine. To make - * panic() work, mark GHCBs inactive so that messages - * can be printed out. - */ - data->ghcb_active = false; - data->backup_ghcb_active = false; - - instrumentation_begin(); - panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); - instrumentation_end(); - } - - /* Mark backup_ghcb active before writing to it */ - data->backup_ghcb_active = true; - - state->ghcb = &data->backup_ghcb; - - /* Backup GHCB content */ - *state->ghcb = *ghcb; - } else { - state->ghcb = NULL; - data->ghcb_active = true; - } - - return ghcb; -} - /* Include code shared with pre-decompression boot stage */ #include "sev-shared.c" -noinstr void __sev_put_ghcb(struct ghcb_state *state) -{ - struct sev_es_runtime_data *data; - struct ghcb *ghcb; - - WARN_ON(!irqs_disabled()); - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - if (state->ghcb) { - /* Restore GHCB from Backup */ - *ghcb = *state->ghcb; - data->backup_ghcb_active = false; - state->ghcb = NULL; - } else { - /* - * Invalidate the GHCB so a VMGEXIT instruction issued - * from userspace won't appear to be valid. - */ - vc_ghcb_invalidate(ghcb); - data->ghcb_active = false; - } -} - -int svsm_perform_call_protocol(struct svsm_call *call) -{ - struct ghcb_state state; - unsigned long flags; - struct ghcb *ghcb; - int ret; - - /* - * This can be called very early in the boot, use native functions in - * order to avoid paravirt issues. - */ - flags = native_local_irq_save(); - - if (sev_cfg.ghcbs_initialized) - ghcb = __sev_get_ghcb(&state); - else if (boot_ghcb) - ghcb = boot_ghcb; - else - ghcb = NULL; - - do { - ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call) - : svsm_perform_msr_protocol(call); - } while (ret == -EAGAIN); - - if (sev_cfg.ghcbs_initialized) - __sev_put_ghcb(&state); - - native_local_irq_restore(flags); - - return ret; -} - -void __head +void early_set_pages_state(unsigned long vaddr, unsigned long paddr, - unsigned long npages, enum psc_op op) + unsigned long npages, const struct psc_desc *desc) { unsigned long paddr_end; - u64 val; vaddr = vaddr & PAGE_MASK; @@ -185,42 +56,22 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr, paddr_end = paddr + (npages << PAGE_SHIFT); while (paddr < paddr_end) { - /* Page validation must be rescinded before changing to shared */ - if (op == SNP_PAGE_STATE_SHARED) - pvalidate_4k_page(vaddr, paddr, false); - - /* - * Use the MSR protocol because this function can be called before - * the GHCB is established. 
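
early_set_pages_state() now walks the range one 4K page at a time and hands each page to __page_state_change(), which keeps the required ordering: rescind validation before a private-to-shared request, issue the MSR-protocol PSC request, then validate after a shared-to-private request. A rough standalone sketch of that ordering and loop, with toy stubs and an assumed 4K page size:

#define TOY_PAGE_SIZE	4096UL
#define TOY_PAGE_MASK	(~(TOY_PAGE_SIZE - 1))

enum toy_psc_op { TOY_PSC_PRIVATE, TOY_PSC_SHARED };

static void toy_rmp_pvalidate(unsigned long vaddr, unsigned long paddr, int validate)
{ (void)vaddr; (void)paddr; (void)validate; }
static void toy_psc_msr_request(unsigned long paddr, enum toy_psc_op op)
{ (void)paddr; (void)op; }

static void toy_page_state_change(unsigned long vaddr, unsigned long paddr, enum toy_psc_op op)
{
	/* Private -> shared: the guest must rescind validation first. */
	if (op == TOY_PSC_SHARED)
		toy_rmp_pvalidate(vaddr, paddr, 0);

	/* Ask the hypervisor to flip the RMP entry (MSR protocol). */
	toy_psc_msr_request(paddr, op);

	/* Shared -> private: validate only after the RMP entry changed. */
	if (op == TOY_PSC_PRIVATE)
		toy_rmp_pvalidate(vaddr, paddr, 1);
}

static void toy_set_pages_state(unsigned long vaddr, unsigned long paddr,
				unsigned long npages, enum toy_psc_op op)
{
	unsigned long paddr_end = paddr + npages * TOY_PAGE_SIZE;

	vaddr &= TOY_PAGE_MASK;
	paddr &= TOY_PAGE_MASK;

	while (paddr < paddr_end) {
		toy_page_state_change(vaddr, paddr, op);
		vaddr += TOY_PAGE_SIZE;
		paddr += TOY_PAGE_SIZE;
	}
}
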
- */ - sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); - VMGEXIT(); - - val = sev_es_rd_ghcb_msr(); - - if (GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) - goto e_term; - - if (GHCB_MSR_PSC_RESP_VAL(val)) - goto e_term; - - /* Page validation must be performed after changing to private */ - if (op == SNP_PAGE_STATE_PRIVATE) - pvalidate_4k_page(vaddr, paddr, true); + __page_state_change(vaddr, paddr, desc); vaddr += PAGE_SIZE; paddr += PAGE_SIZE; } - - return; - -e_term: - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); } -void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, +void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages) { + struct psc_desc d = { + SNP_PAGE_STATE_PRIVATE, + rip_rel_ptr(&boot_svsm_ca_page), + boot_svsm_caa_pa + }; + /* * This can be invoked in early boot while running identity mapped, so * use an open coded check for SNP instead of using cc_platform_has(). @@ -234,12 +85,18 @@ void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long padd * Ask the hypervisor to mark the memory pages as private in the RMP * table. */ - early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE); + early_set_pages_state(vaddr, paddr, npages, &d); } -void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, +void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned long npages) { + struct psc_desc d = { + SNP_PAGE_STATE_SHARED, + rip_rel_ptr(&boot_svsm_ca_page), + boot_svsm_caa_pa + }; + /* * This can be invoked in early boot while running identity mapped, so * use an open coded check for SNP instead of using cc_platform_has(). @@ -250,7 +107,7 @@ void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr return; /* Ask hypervisor to mark the memory pages shared in the RMP table. */ - early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED); + early_set_pages_state(vaddr, paddr, npages, &d); } /* @@ -266,7 +123,7 @@ void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr * * Scan for the blob in that order. */ -static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) +static struct cc_blob_sev_info *__init find_cc_blob(struct boot_params *bp) { struct cc_blob_sev_info *cc_info; @@ -287,15 +144,15 @@ static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) found_cc_info: if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) - snp_abort(); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); return cc_info; } -static __head void svsm_setup(struct cc_blob_sev_info *cc_info) +static void __init svsm_setup(struct cc_blob_sev_info *cc_info) { + struct snp_secrets_page *secrets = (void *)cc_info->secrets_phys; struct svsm_call call = {}; - int ret; u64 pa; /* @@ -303,7 +160,7 @@ static __head void svsm_setup(struct cc_blob_sev_info *cc_info) * running at VMPL0. The CA will be used to communicate with the * SVSM to perform the SVSM services. */ - if (!svsm_setup_ca(cc_info)) + if (!svsm_setup_ca(cc_info, rip_rel_ptr(&boot_svsm_ca_page))) return; /* @@ -315,25 +172,25 @@ static __head void svsm_setup(struct cc_blob_sev_info *cc_info) pa = (u64)rip_rel_ptr(&boot_svsm_ca_page); /* - * Switch over to the boot SVSM CA while the current CA is still - * addressable. There is no GHCB at this point so use the MSR protocol. 
+ * Switch over to the boot SVSM CA while the current CA is still 1:1 + * mapped and thus addressable with VA == PA. There is no GHCB at this + * point so use the MSR protocol. * * SVSM_CORE_REMAP_CA call: * RAX = 0 (Protocol=0, CallID=0) * RCX = New CA GPA */ - call.caa = svsm_get_caa(); + call.caa = (struct svsm_ca *)secrets->svsm_caa; call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA); call.rcx = pa; - ret = svsm_perform_call_protocol(&call); - if (ret) + + if (svsm_call_msr_protocol(&call)) sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CA_REMAP_FAIL); - boot_svsm_caa = (struct svsm_ca *)pa; boot_svsm_caa_pa = pa; } -bool __head snp_init(struct boot_params *bp) +bool __init snp_init(struct boot_params *bp) { struct cc_blob_sev_info *cc_info; @@ -361,8 +218,3 @@ bool __head snp_init(struct boot_params *bp) return true; } - -void __head __noreturn snp_abort(void) -{ - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); -} diff --git a/arch/x86/boot/startup/sme.c b/arch/x86/boot/startup/sme.c index 70ea1748c0a7..e7ea65f3f1d6 100644 --- a/arch/x86/boot/startup/sme.c +++ b/arch/x86/boot/startup/sme.c @@ -91,7 +91,7 @@ struct sme_populate_pgd_data { */ static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch"); -static void __head sme_clear_pgd(struct sme_populate_pgd_data *ppd) +static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) { unsigned long pgd_start, pgd_end, pgd_size; pgd_t *pgd_p; @@ -106,7 +106,7 @@ static void __head sme_clear_pgd(struct sme_populate_pgd_data *ppd) memset(pgd_p, 0, pgd_size); } -static pud_t __head *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) +static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) { pgd_t *pgd; p4d_t *p4d; @@ -143,7 +143,7 @@ static pud_t __head *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) return pud; } -static void __head sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) +static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) { pud_t *pud; pmd_t *pmd; @@ -159,7 +159,7 @@ static void __head sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags)); } -static void __head sme_populate_pgd(struct sme_populate_pgd_data *ppd) +static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) { pud_t *pud; pmd_t *pmd; @@ -185,7 +185,7 @@ static void __head sme_populate_pgd(struct sme_populate_pgd_data *ppd) set_pte(pte, __pte(ppd->paddr | ppd->pte_flags)); } -static void __head __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) +static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) { while (ppd->vaddr < ppd->vaddr_end) { sme_populate_pgd_large(ppd); @@ -195,7 +195,7 @@ static void __head __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) } } -static void __head __sme_map_range_pte(struct sme_populate_pgd_data *ppd) +static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd) { while (ppd->vaddr < ppd->vaddr_end) { sme_populate_pgd(ppd); @@ -205,7 +205,7 @@ static void __head __sme_map_range_pte(struct sme_populate_pgd_data *ppd) } } -static void __head __sme_map_range(struct sme_populate_pgd_data *ppd, +static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, pmdval_t pmd_flags, pteval_t pte_flags) { unsigned long vaddr_end; @@ -229,22 +229,22 @@ static void __head __sme_map_range(struct sme_populate_pgd_data *ppd, __sme_map_range_pte(ppd); } -static void __head sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) +static void __init 
sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) { __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC); } -static void __head sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) +static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) { __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC); } -static void __head sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) +static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) { __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP); } -static unsigned long __head sme_pgtable_calc(unsigned long len) +static unsigned long __init sme_pgtable_calc(unsigned long len) { unsigned long entries = 0, tables = 0; @@ -281,7 +281,7 @@ static unsigned long __head sme_pgtable_calc(unsigned long len) return entries + tables; } -void __head sme_encrypt_kernel(struct boot_params *bp) +void __init sme_encrypt_kernel(struct boot_params *bp) { unsigned long workarea_start, workarea_end, workarea_len; unsigned long execute_start, execute_end, execute_len; @@ -485,7 +485,7 @@ void __head sme_encrypt_kernel(struct boot_params *bp) native_write_cr3(__native_read_cr3()); } -void __head sme_enable(struct boot_params *bp) +void __init sme_enable(struct boot_params *bp) { unsigned int eax, ebx, ecx, edx; unsigned long feature_mask; @@ -521,6 +521,7 @@ void __head sme_enable(struct boot_params *bp) return; me_mask = 1UL << (ebx & 0x3f); + sev_snp_needs_sfw = !(ebx & BIT(31)); /* Check the SEV MSR whether SEV or SME is enabled */ sev_status = msr = native_rdmsrq(MSR_AMD64_SEV); @@ -531,7 +532,7 @@ void __head sme_enable(struct boot_params *bp) * enablement abort the guest. */ if (snp_en ^ !!(msr & MSR_AMD64_SEV_SNP_ENABLED)) - snp_abort(); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); /* Check if memory encryption is enabled */ if (feature_mask == AMD_SME_BIT) { @@ -567,7 +568,6 @@ void __head sme_enable(struct boot_params *bp) #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* Local version for startup code, which never operates on user page tables */ -__weak pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { return pgd; diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index d4610af68114..989ca9f72ba3 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -104,6 +104,9 @@ static bool noinstr amd_cc_platform_has(enum cc_attr attr) case CC_ATTR_HOST_SEV_SNP: return cc_flags.host_sev_snp; + case CC_ATTR_SNP_SECURE_AVIC: + return sev_status & MSR_AMD64_SNP_SECURE_AVIC; + default: return false; } diff --git a/arch/x86/coco/sev/Makefile b/arch/x86/coco/sev/Makefile index 342d79f0ab6a..3b8ae214a6a6 100644 --- a/arch/x86/coco/sev/Makefile +++ b/arch/x86/coco/sev/Makefile @@ -1,10 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += core.o sev-nmi.o vc-handle.o +obj-y += core.o noinstr.o vc-handle.o # Clang 14 and older may fail to respect __no_sanitize_undefined when inlining -UBSAN_SANITIZE_sev-nmi.o := n +UBSAN_SANITIZE_noinstr.o := n # GCC may fail to respect __no_sanitize_address or __no_kcsan when inlining -KASAN_SANITIZE_sev-nmi.o := n -KCSAN_SANITIZE_sev-nmi.o := n +KASAN_SANITIZE_noinstr.o := n +KCSAN_SANITIZE_noinstr.o := n diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 14ef5908fb27..9ae3b11754e6 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -46,6 +46,48 @@ #include <asm/cmdline.h> #include <asm/msr.h> +/* Bitmap of SEV features supported by the hypervisor */ +u64 sev_hv_features __ro_after_init; 
+SYM_PIC_ALIAS(sev_hv_features); + +/* Secrets page physical address from the CC blob */ +u64 sev_secrets_pa __ro_after_init; +SYM_PIC_ALIAS(sev_secrets_pa); + +/* For early boot SVSM communication */ +struct svsm_ca boot_svsm_ca_page __aligned(PAGE_SIZE); +SYM_PIC_ALIAS(boot_svsm_ca_page); + +/* + * SVSM related information: + * During boot, the page tables are set up as identity mapped and later + * changed to use kernel virtual addresses. Maintain separate virtual and + * physical addresses for the CAA to allow SVSM functions to be used during + * early boot, both with identity mapped virtual addresses and proper kernel + * virtual addresses. + */ +u64 boot_svsm_caa_pa __ro_after_init; +SYM_PIC_ALIAS(boot_svsm_caa_pa); + +DEFINE_PER_CPU(struct svsm_ca *, svsm_caa); +DEFINE_PER_CPU(u64, svsm_caa_pa); + +static inline struct svsm_ca *svsm_get_caa(void) +{ + if (sev_cfg.use_cas) + return this_cpu_read(svsm_caa); + else + return rip_rel_ptr(&boot_svsm_ca_page); +} + +static inline u64 svsm_get_caa_pa(void) +{ + if (sev_cfg.use_cas) + return this_cpu_read(svsm_caa_pa); + else + return boot_svsm_caa_pa; +} + /* AP INIT values as documented in the APM2 section "Processor Initialization State" */ #define AP_INIT_CS_LIMIT 0xffff #define AP_INIT_DS_LIMIT 0xffff @@ -79,6 +121,7 @@ static const char * const sev_status_feat_names[] = { [MSR_AMD64_SNP_IBS_VIRT_BIT] = "IBSVirt", [MSR_AMD64_SNP_VMSA_REG_PROT_BIT] = "VMSARegProt", [MSR_AMD64_SNP_SMT_PROT_BIT] = "SMTProt", + [MSR_AMD64_SNP_SECURE_AVIC_BIT] = "SecureAVIC", }; /* @@ -100,6 +143,26 @@ DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); */ u8 snp_vmpl __ro_after_init; EXPORT_SYMBOL_GPL(snp_vmpl); +SYM_PIC_ALIAS(snp_vmpl); + +/* + * Since feature negotiation related variables are set early in the boot + * process they must reside in the .data section so as not to be zeroed + * out when the .bss section is later cleared. + * + * GHCB protocol version negotiated with the hypervisor. + */ +u16 ghcb_version __ro_after_init; +SYM_PIC_ALIAS(ghcb_version); + +/* For early boot hypervisor communication in SEV-ES enabled guests */ +static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); + +/* + * Needs to be in the .data section because we need it NULL before bss is + * cleared + */ +struct ghcb *boot_ghcb __section(".data"); static u64 __init get_snp_jump_table_addr(void) { @@ -154,6 +217,73 @@ static u64 __init get_jump_table_addr(void) return ret; } +static int svsm_perform_ghcb_protocol(struct ghcb *ghcb, struct svsm_call *call) +{ + struct es_em_ctxt ctxt; + u8 pending = 0; + + vc_ghcb_invalidate(ghcb); + + /* + * Fill in protocol and format specifiers. This can be called very early + * in the boot, so use rip-relative references as needed. 
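
The svsm_get_caa()/svsm_get_caa_pa() helpers above select the per-CPU calling area once sev_cfg.use_cas is set and fall back to the statically allocated boot CA page before per-CPU setup (and, per the alloc_runtime_data() hunk later in this file, CPU 0 simply reuses the boot page). A minimal sketch of that selection, with plain globals standing in for the per-CPU machinery:

#include <stdbool.h>
#include <stdint.h>

struct toy_ca { char buffer[4096]; };

static struct toy_ca toy_boot_ca;		/* used before per-CPU setup */
static uint64_t      toy_boot_ca_pa;

static bool          toy_use_percpu;		/* set once per-CPU CAs exist */
static struct toy_ca *toy_this_cpu_ca;		/* stand-in for this_cpu_read() */
static uint64_t      toy_this_cpu_ca_pa;

static struct toy_ca *toy_get_caa(void)
{
	return toy_use_percpu ? toy_this_cpu_ca : &toy_boot_ca;
}

static uint64_t toy_get_caa_pa(void)
{
	return toy_use_percpu ? toy_this_cpu_ca_pa : toy_boot_ca_pa;
}
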
+ */ + ghcb->protocol_version = ghcb_version; + ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; + + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_SNP_RUN_VMPL); + ghcb_set_sw_exit_info_1(ghcb, 0); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + + svsm_issue_call(call, &pending); + + if (pending) + return -EINVAL; + + switch (verify_exception_info(ghcb, &ctxt)) { + case ES_OK: + break; + case ES_EXCEPTION: + vc_forward_exception(&ctxt); + fallthrough; + default: + return -EINVAL; + } + + return svsm_process_result_codes(call); +} + +static int svsm_perform_call_protocol(struct svsm_call *call) +{ + struct ghcb_state state; + unsigned long flags; + struct ghcb *ghcb; + int ret; + + flags = native_local_irq_save(); + + if (sev_cfg.ghcbs_initialized) + ghcb = __sev_get_ghcb(&state); + else if (boot_ghcb) + ghcb = boot_ghcb; + else + ghcb = NULL; + + do { + ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call) + : __pi_svsm_perform_msr_protocol(call); + } while (ret == -EAGAIN); + + if (sev_cfg.ghcbs_initialized) + __sev_put_ghcb(&state); + + native_local_irq_restore(flags); + + return ret; +} + static inline void __pval_terminate(u64 pfn, bool action, unsigned int page_size, int ret, u64 svsm_ret) { @@ -531,8 +661,11 @@ static void set_pages_state(unsigned long vaddr, unsigned long npages, int op) unsigned long vaddr_end; /* Use the MSR protocol when a GHCB is not available. */ - if (!boot_ghcb) - return early_set_pages_state(vaddr, __pa(vaddr), npages, op); + if (!boot_ghcb) { + struct psc_desc d = { op, svsm_get_caa(), svsm_get_caa_pa() }; + + return early_set_pages_state(vaddr, __pa(vaddr), npages, &d); + } vaddr = vaddr & PAGE_MASK; vaddr_end = vaddr + (npages << PAGE_SHIFT); @@ -973,6 +1106,9 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip, unsigned vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT; vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT; + if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) + vmsa->vintr_ctrl |= V_GIF_MASK | V_NMI_ENABLE_MASK; + /* SVME must be set. */ vmsa->efer = EFER_SVME; @@ -1107,6 +1243,105 @@ int __init sev_es_efi_map_ghcbs_cas(pgd_t *pgd) return 0; } +u64 savic_ghcb_msr_read(u32 reg) +{ + u64 msr = APIC_BASE_MSR + (reg >> 4); + struct pt_regs regs = { .cx = msr }; + struct es_em_ctxt ctxt = { .regs = ®s }; + struct ghcb_state state; + enum es_result res; + struct ghcb *ghcb; + + guard(irqsave)(); + + ghcb = __sev_get_ghcb(&state); + vc_ghcb_invalidate(ghcb); + + res = sev_es_ghcb_handle_msr(ghcb, &ctxt, false); + if (res != ES_OK) { + pr_err("Secure AVIC MSR (0x%llx) read returned error (%d)\n", msr, res); + /* MSR read failures are treated as fatal errors */ + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); + } + + __sev_put_ghcb(&state); + + return regs.ax | regs.dx << 32; +} + +void savic_ghcb_msr_write(u32 reg, u64 value) +{ + u64 msr = APIC_BASE_MSR + (reg >> 4); + struct pt_regs regs = { + .cx = msr, + .ax = lower_32_bits(value), + .dx = upper_32_bits(value) + }; + struct es_em_ctxt ctxt = { .regs = ®s }; + struct ghcb_state state; + enum es_result res; + struct ghcb *ghcb; + + guard(irqsave)(); + + ghcb = __sev_get_ghcb(&state); + vc_ghcb_invalidate(ghcb); + + res = sev_es_ghcb_handle_msr(ghcb, &ctxt, true); + if (res != ES_OK) { + pr_err("Secure AVIC MSR (0x%llx) write returned error (%d)\n", msr, res); + /* MSR writes should never fail. 
Any failure is fatal error for SNP guest */ + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); + } + + __sev_put_ghcb(&state); +} + +enum es_result savic_register_gpa(u64 gpa) +{ + struct ghcb_state state; + struct es_em_ctxt ctxt; + enum es_result res; + struct ghcb *ghcb; + + guard(irqsave)(); + + ghcb = __sev_get_ghcb(&state); + vc_ghcb_invalidate(ghcb); + + ghcb_set_rax(ghcb, SVM_VMGEXIT_SAVIC_SELF_GPA); + ghcb_set_rbx(ghcb, gpa); + res = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_SAVIC, + SVM_VMGEXIT_SAVIC_REGISTER_GPA, 0); + + __sev_put_ghcb(&state); + + return res; +} + +enum es_result savic_unregister_gpa(u64 *gpa) +{ + struct ghcb_state state; + struct es_em_ctxt ctxt; + enum es_result res; + struct ghcb *ghcb; + + guard(irqsave)(); + + ghcb = __sev_get_ghcb(&state); + vc_ghcb_invalidate(ghcb); + + ghcb_set_rax(ghcb, SVM_VMGEXIT_SAVIC_SELF_GPA); + res = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_SAVIC, + SVM_VMGEXIT_SAVIC_UNREGISTER_GPA, 0); + if (gpa && res == ES_OK) + *gpa = ghcb->save.rbx; + + __sev_put_ghcb(&state); + + return res; +} + static void snp_register_per_cpu_ghcb(void) { struct sev_es_runtime_data *data; @@ -1233,7 +1468,8 @@ static void __init alloc_runtime_data(int cpu) struct svsm_ca *caa; /* Allocate the SVSM CA page if an SVSM is present */ - caa = memblock_alloc_or_panic(sizeof(*caa), PAGE_SIZE); + caa = cpu ? memblock_alloc_or_panic(sizeof(*caa), PAGE_SIZE) + : &boot_svsm_ca_page; per_cpu(svsm_caa, cpu) = caa; per_cpu(svsm_caa_pa, cpu) = __pa(caa); @@ -1287,32 +1523,9 @@ void __init sev_es_init_vc_handling(void) init_ghcb(cpu); } - /* If running under an SVSM, switch to the per-cpu CA */ - if (snp_vmpl) { - struct svsm_call call = {}; - unsigned long flags; - int ret; - - local_irq_save(flags); - - /* - * SVSM_CORE_REMAP_CA call: - * RAX = 0 (Protocol=0, CallID=0) - * RCX = New CA GPA - */ - call.caa = svsm_get_caa(); - call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA); - call.rcx = this_cpu_read(svsm_caa_pa); - ret = svsm_perform_call_protocol(&call); - if (ret) - panic("Can't remap the SVSM CA, ret=%d, rax_out=0x%llx\n", - ret, call.rax_out); - + if (snp_vmpl) sev_cfg.use_cas = true; - local_irq_restore(flags); - } - sev_es_setup_play_dead(); /* Secondary CPUs use the runtime #VC handler */ @@ -1590,15 +1803,6 @@ void sev_show_status(void) pr_cont("\n"); } -void __init snp_update_svsm_ca(void) -{ - if (!snp_vmpl) - return; - - /* Update the CAA to a proper kernel address */ - boot_svsm_caa = &boot_svsm_ca_page; -} - #ifdef CONFIG_SYSFS static ssize_t vmpl_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) diff --git a/arch/x86/coco/sev/sev-nmi.c b/arch/x86/coco/sev/noinstr.c index d8dfaddfb367..b527eafb6312 100644 --- a/arch/x86/coco/sev/sev-nmi.c +++ b/arch/x86/coco/sev/noinstr.c @@ -106,3 +106,77 @@ void noinstr __sev_es_nmi_complete(void) __sev_put_ghcb(&state); } + +/* + * Nothing shall interrupt this code path while holding the per-CPU + * GHCB. The backup GHCB is only for NMIs interrupting this path. + * + * Callers must disable local interrupts around it. + */ +noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + WARN_ON(!irqs_disabled()); + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (unlikely(data->ghcb_active)) { + /* GHCB is already in use - save its contents */ + + if (unlikely(data->backup_ghcb_active)) { + /* + * Backup-GHCB is also already in use. There is no way + * to continue here so just kill the machine. 
To make + * panic() work, mark GHCBs inactive so that messages + * can be printed out. + */ + data->ghcb_active = false; + data->backup_ghcb_active = false; + + instrumentation_begin(); + panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); + instrumentation_end(); + } + + /* Mark backup_ghcb active before writing to it */ + data->backup_ghcb_active = true; + + state->ghcb = &data->backup_ghcb; + + /* Backup GHCB content */ + *state->ghcb = *ghcb; + } else { + state->ghcb = NULL; + data->ghcb_active = true; + } + + return ghcb; +} + +noinstr void __sev_put_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + WARN_ON(!irqs_disabled()); + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (state->ghcb) { + /* Restore GHCB from Backup */ + *ghcb = *state->ghcb; + data->backup_ghcb_active = false; + state->ghcb = NULL; + } else { + /* + * Invalidate the GHCB so a VMGEXIT instruction issued + * from userspace won't appear to be valid. + */ + vc_ghcb_invalidate(ghcb); + data->ghcb_active = false; + } +} diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c index c3b4acbde0d8..7fc136a35334 100644 --- a/arch/x86/coco/sev/vc-handle.c +++ b/arch/x86/coco/sev/vc-handle.c @@ -351,6 +351,8 @@ fault: } #define sev_printk(fmt, ...) printk(fmt, ##__VA_ARGS__) +#define error(v) +#define has_cpuflag(f) boot_cpu_has(f) #include "vc-shared.c" @@ -402,14 +404,10 @@ static enum es_result __vc_handle_secure_tsc_msrs(struct es_em_ctxt *ctxt, bool return ES_OK; } -static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +enum es_result sev_es_ghcb_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt, bool write) { struct pt_regs *regs = ctxt->regs; enum es_result ret; - bool write; - - /* Is it a WRMSR? */ - write = ctxt->insn.opcode.bytes[1] == 0x30; switch (regs->cx) { case MSR_SVSM_CAA: @@ -419,6 +417,15 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) if (sev_status & MSR_AMD64_SNP_SECURE_TSC) return __vc_handle_secure_tsc_msrs(ctxt, write); break; + case MSR_AMD64_SAVIC_CONTROL: + /* + * AMD64_SAVIC_CONTROL should not be intercepted when + * Secure AVIC is enabled. Terminate the Secure AVIC guest + * if the interception is enabled. + */ + if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) + return ES_VMM_ERROR; + break; default: break; } @@ -439,6 +446,11 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ret; } +static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + return sev_es_ghcb_handle_msr(ghcb, ctxt, ctxt->insn.opcode.bytes[1] == 0x30); +} + static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) { int trapnr = ctxt->fi.vector; diff --git a/arch/x86/coco/sev/vc-shared.c b/arch/x86/coco/sev/vc-shared.c index 2c0ab0fdc060..9b01c9ad81be 100644 --- a/arch/x86/coco/sev/vc-shared.c +++ b/arch/x86/coco/sev/vc-shared.c @@ -409,15 +409,109 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ret; } +enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + u32 ret; + + ret = ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0); + if (!ret) + return ES_OK; + + if (ret == 1) { + u64 info = ghcb->save.sw_exit_info_2; + unsigned long v = info & SVM_EVTINJ_VEC_MASK; + + /* Check if exception information from hypervisor is sane. 
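
The __sev_get_ghcb()/__sev_put_ghcb() pair moved into noinstr.c keeps the per-CPU GHCB usable from an NMI that interrupts a #VC handler: one level of nesting is absorbed by copying the live GHCB into a backup before handing it out again, the backup is restored on put, and a second level of nesting is fatal. A toy sketch of that save/restore discipline (types and storage invented for the example):

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

struct toy_ghcb  { char page[4096]; };
struct toy_state { struct toy_ghcb *saved; };

static struct toy_ghcb toy_ghcb_page, toy_backup_ghcb;
static bool toy_ghcb_active, toy_backup_active;

static struct toy_ghcb *toy_get_ghcb(struct toy_state *state)
{
	if (toy_ghcb_active) {
		if (toy_backup_active)
			abort();		/* both in use: nothing left to hand out */

		toy_backup_active = true;
		state->saved = &toy_backup_ghcb;
		*state->saved = toy_ghcb_page;	/* preserve the interrupted user's data */
	} else {
		state->saved = NULL;
		toy_ghcb_active = true;
	}

	return &toy_ghcb_page;
}

static void toy_put_ghcb(struct toy_state *state)
{
	if (state->saved) {
		toy_ghcb_page = *state->saved;	/* restore the interrupted user's data */
		toy_backup_active = false;
		state->saved = NULL;
	} else {
		memset(&toy_ghcb_page, 0, sizeof(toy_ghcb_page));
		toy_ghcb_active = false;
	}
}
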
*/ + if ((info & SVM_EVTINJ_VALID) && + ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) && + ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) { + ctxt->fi.vector = v; + + if (info & SVM_EVTINJ_VALID_ERR) + ctxt->fi.error_code = info >> 32; + + return ES_EXCEPTION; + } + } + + return ES_VMM_ERROR; +} + +enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + u64 exit_code, u64 exit_info_1, + u64 exit_info_2) +{ + /* Fill in protocol and format specifiers */ + ghcb->protocol_version = ghcb_version; + ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; + + ghcb_set_sw_exit_code(ghcb, exit_code); + ghcb_set_sw_exit_info_1(ghcb, exit_info_1); + ghcb_set_sw_exit_info_2(ghcb, exit_info_2); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + return verify_exception_info(ghcb, ctxt); +} + +static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) +{ + u32 cr4 = native_read_cr4(); + int ret; + + ghcb_set_rax(ghcb, leaf->fn); + ghcb_set_rcx(ghcb, leaf->subfn); + + if (cr4 & X86_CR4_OSXSAVE) + /* Safe to read xcr0 */ + ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); + else + /* xgetbv will cause #UD - use reset value for xcr0 */ + ghcb_set_xcr0(ghcb, 1); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && + ghcb_rbx_is_valid(ghcb) && + ghcb_rcx_is_valid(ghcb) && + ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + leaf->eax = ghcb->save.rax; + leaf->ebx = ghcb->save.rbx; + leaf->ecx = ghcb->save.rcx; + leaf->edx = ghcb->save.rdx; + + return ES_OK; +} + +struct cpuid_ctx { + struct ghcb *ghcb; + struct es_em_ctxt *ctxt; +}; + +static void snp_cpuid_hv_ghcb(void *p, struct cpuid_leaf *leaf) +{ + struct cpuid_ctx *ctx = p; + + if (__sev_cpuid_hv_ghcb(ctx->ghcb, ctx->ctxt, leaf)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); +} + static int vc_handle_cpuid_snp(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { + struct cpuid_ctx ctx = { ghcb, ctxt }; struct pt_regs *regs = ctxt->regs; struct cpuid_leaf leaf; int ret; leaf.fn = regs->ax; leaf.subfn = regs->cx; - ret = snp_cpuid(ghcb, ctxt, &leaf); + ret = snp_cpuid(snp_cpuid_hv_ghcb, &ctx, &leaf); if (!ret) { regs->ax = leaf.eax; regs->bx = leaf.ebx; @@ -502,3 +596,50 @@ static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, return ES_OK; } + +void snp_register_ghcb_early(unsigned long paddr) +{ + unsigned long pfn = paddr >> PAGE_SHIFT; + u64 val; + + sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn)); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + + /* If the response GPA is not ours then abort the guest */ + if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) || + (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER); +} + +bool __init sev_es_check_cpu_features(void) +{ + if (!has_cpuflag(X86_FEATURE_RDRAND)) { + error("RDRAND instruction not supported - no trusted source of randomness available\n"); + return false; + } + + return true; +} + +bool sev_es_negotiate_protocol(void) +{ + u64 val; + + /* Do the GHCB protocol version negotiation */ + sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + + if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP) + return false; + + if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN || + GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX) + return false; + + ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX); + + return true; +} diff --git 
a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 92cf0fe2291e..ced2a1deecd7 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -345,6 +345,7 @@ 333 common io_pgetevents sys_io_pgetevents 334 common rseq sys_rseq 335 common uretprobe sys_uretprobe +336 common uprobe sys_uprobe # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal sys_pidfd_send_signal diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 7610f26dfbd9..745caa6c15a3 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2069,13 +2069,15 @@ static void _x86_pmu_read(struct perf_event *event) void x86_pmu_show_pmu_cap(struct pmu *pmu) { - pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.cntval_bits); - pr_info("... generic registers: %d\n", x86_pmu_num_counters(pmu)); - pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); - pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %d\n", x86_pmu_num_counters_fixed(pmu)); - pr_info("... event mask: %016Lx\n", hybrid(pmu, intel_ctrl)); + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.cntval_bits); + pr_info("... generic counters: %d\n", x86_pmu_num_counters(pmu)); + pr_info("... generic bitmap: %016llx\n", hybrid(pmu, cntr_mask64)); + pr_info("... fixed-purpose counters: %d\n", x86_pmu_num_counters_fixed(pmu)); + pr_info("... fixed-purpose bitmap: %016llx\n", hybrid(pmu, fixed_cntr_mask64)); + pr_info("... value mask: %016llx\n", x86_pmu.cntval_mask); + pr_info("... max period: %016llx\n", x86_pmu.max_period); + pr_info("... global_ctrl mask: %016llx\n", hybrid(pmu, intel_ctrl)); } static int __init init_hw_perf_events(void) diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 61da6b8a3d51..cbac54cb3a9e 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -643,4 +643,4 @@ static __init int bts_init(void) return perf_pmu_register(&bts_pmu, "intel_bts", -1); } -arch_initcall(bts_init); +early_initcall(bts_init); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index c2fb729c270e..28f5468a6ea3 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2845,8 +2845,8 @@ static void intel_pmu_enable_fixed(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - u64 mask, bits = 0; int idx = hwc->idx; + u64 bits = 0; if (is_topdown_idx(idx)) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -2885,14 +2885,10 @@ static void intel_pmu_enable_fixed(struct perf_event *event) idx -= INTEL_PMC_IDX_FIXED; bits = intel_fixed_bits_by_idx(idx, bits); - mask = intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK); - - if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) { + if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) bits |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE); - mask |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE); - } - cpuc->fixed_ctrl_val &= ~mask; + cpuc->fixed_ctrl_val &= ~intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK); cpuc->fixed_ctrl_val |= bits; } @@ -2997,7 +2993,8 @@ static void intel_pmu_acr_late_setup(struct cpu_hw_events *cpuc) if (event->group_leader != leader->group_leader) break; for_each_set_bit(idx, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) { - 
if (WARN_ON_ONCE(i + idx > cpuc->n_events)) + if (i + idx >= cpuc->n_events || + !is_acr_event_group(cpuc->event_list[i + idx])) return; __set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw.config1); } @@ -5318,9 +5315,9 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) 0, x86_pmu_num_counters(&pmu->pmu), 0, 0); if (pmu->intel_cap.perf_metrics) - pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; + pmu->intel_ctrl |= GLOBAL_CTRL_EN_PERF_METRICS; else - pmu->intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS); + pmu->intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS; intel_pmu_check_event_constraints(pmu->event_constraints, pmu->cntr_mask64, @@ -5455,7 +5452,7 @@ static void intel_pmu_cpu_starting(int cpu) rdmsrq(MSR_IA32_PERF_CAPABILITIES, perf_cap.capabilities); if (!perf_cap.perf_metrics) { x86_pmu.intel_cap.perf_metrics = 0; - x86_pmu.intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS); + x86_pmu.intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS; } } @@ -7789,7 +7786,7 @@ __init int intel_pmu_init(void) } if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics) - x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; + x86_pmu.intel_ctrl |= GLOBAL_CTRL_EN_PERF_METRICS; if (x86_pmu.intel_cap.pebs_timing_info) x86_pmu.flags |= PMU_FL_RETIRE_LATENCY; diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 07ba4935e873..a26e66d66444 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -305,6 +305,8 @@ struct apic { /* Probe, setup and smpboot functions */ int (*probe)(void); + void (*setup)(void); + void (*teardown)(void); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); void (*init_apic_ldr)(void); @@ -317,6 +319,8 @@ struct apic { /* wakeup secondary CPU using 64-bit wakeup point */ int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip, unsigned int cpu); + void (*update_vector)(unsigned int cpu, unsigned int vector, bool set); + char *name; }; @@ -470,6 +474,12 @@ static __always_inline bool apic_id_valid(u32 apic_id) return apic_id <= apic->max_apic_id; } +static __always_inline void apic_update_vector(unsigned int cpu, unsigned int vector, bool set) +{ + if (apic->update_vector) + apic->update_vector(cpu, vector, set); +} + #else /* CONFIG_X86_LOCAL_APIC */ static inline u32 apic_read(u32 reg) { return 0; } @@ -481,6 +491,7 @@ static inline void apic_wait_icr_idle(void) { } static inline u32 safe_apic_wait_icr_idle(void) { return 0; } static inline void apic_native_eoi(void) { WARN_ON_ONCE(1); } static inline void apic_setup_apic_calls(void) { } +static inline void apic_update_vector(unsigned int cpu, unsigned int vector, bool set) { } #define apic_update_callback(_callback, _fn) do { } while (0) diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 094106b6a538..be39a543fbe5 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -135,6 +135,8 @@ #define APIC_TDR_DIV_128 0xA #define APIC_EFEAT 0x400 #define APIC_ECTRL 0x410 +#define APIC_SEOI 0x420 +#define APIC_IER 0x480 #define APIC_EILVTn(n) (0x500 + 0x10 * n) #define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */ #define APIC_EILVT_NR_AMD_10H 4 diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 02b23aa78955..f7b67cb73915 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -82,6 +82,8 @@ #ifndef __ASSEMBLER__ extern unsigned int output_len; extern const unsigned long kernel_text_size; +extern const unsigned long 
kernel_inittext_offset; +extern const unsigned long kernel_inittext_size; extern const unsigned long kernel_total_size; unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr, diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 751ca35386b0..b2a562217d3f 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -496,6 +496,7 @@ #define X86_FEATURE_TSA_L1_NO (21*32+12) /* AMD CPU not vulnerable to TSA-L1 */ #define X86_FEATURE_CLEAR_CPU_BUF_VM (21*32+13) /* Clear CPU buffers using VERW before VMRUN */ #define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */ +#define X86_FEATURE_ABMC (21*32+15) /* Assignable Bandwidth Monitoring Counters */ /* * BUG word(s) diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index e41cbf2ec41d..9ad86a7d13f6 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -30,6 +30,7 @@ enum x86_hypervisor_type { X86_HYPER_KVM, X86_HYPER_JAILHOUSE, X86_HYPER_ACRN, + X86_HYPER_BHYVE, }; #ifdef CONFIG_HYPERVISOR_GUEST @@ -64,6 +65,7 @@ extern const struct hypervisor_x86 x86_hyper_xen_pv; extern const struct hypervisor_x86 x86_hyper_kvm; extern const struct hypervisor_x86 x86_hyper_jailhouse; extern const struct hypervisor_x86 x86_hyper_acrn; +extern const struct hypervisor_x86 x86_hyper_bhyve; extern struct hypervisor_x86 x86_hyper_xen_hvm; extern bool nopv; diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h index 97f341777db5..1b3060a3425c 100644 --- a/arch/x86/include/asm/inat.h +++ b/arch/x86/include/asm/inat.h @@ -37,6 +37,8 @@ #define INAT_PFX_EVEX 15 /* EVEX prefix */ /* x86-64 REX2 prefix */ #define INAT_PFX_REX2 16 /* 0xD5 */ +/* AMD XOP prefix */ +#define INAT_PFX_XOP 17 /* 0x8F */ #define INAT_LSTPFX_MAX 3 #define INAT_LGCPFX_MAX 11 @@ -77,6 +79,7 @@ #define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3)) #define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4)) #define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5)) +#define INAT_XOPOK INAT_VEXOK #define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6)) #define INAT_EVEXONLY (1 << (INAT_FLAG_OFFS + 7)) #define INAT_NO_REX2 (1 << (INAT_FLAG_OFFS + 8)) @@ -111,6 +114,8 @@ extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm, extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, insn_byte_t vex_pp); +extern insn_attr_t inat_get_xop_attribute(insn_byte_t opcode, + insn_byte_t map_select); /* Attribute checking functions */ static inline int inat_is_legacy_prefix(insn_attr_t attr) @@ -164,6 +169,11 @@ static inline int inat_is_vex3_prefix(insn_attr_t attr) return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3; } +static inline int inat_is_xop_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_XOP; +} + static inline int inat_is_escape(insn_attr_t attr) { return attr & INAT_ESC_MASK; @@ -229,6 +239,11 @@ static inline int inat_accept_vex(insn_attr_t attr) return attr & INAT_VEXOK; } +static inline int inat_accept_xop(insn_attr_t attr) +{ + return attr & INAT_XOPOK; +} + static inline int inat_must_vex(insn_attr_t attr) { return attr & (INAT_VEXONLY | INAT_EVEXONLY); diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 5a68e9db6518..01ccdd168df0 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -2,12 +2,6 @@ #ifndef _ASM_X86_INIT_H #define _ASM_X86_INIT_H -#if defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 170000 
-#define __head __section(".head.text") __no_sanitize_undefined __no_stack_protector -#else -#define __head __section(".head.text") __no_sanitize_undefined __no_kstack_erase -#endif - struct x86_mapping_info { void *(*alloc_pgt_page)(void *); /* allocate buf for page table */ void (*free_pgt_page)(void *, void *); /* free buf for page table */ diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 7152ea809e6a..091f88c8254d 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -71,7 +71,10 @@ struct insn { * prefixes.bytes[3]: last prefix */ struct insn_field rex_prefix; /* REX prefix */ - struct insn_field vex_prefix; /* VEX prefix */ + union { + struct insn_field vex_prefix; /* VEX prefix */ + struct insn_field xop_prefix; /* XOP prefix */ + }; struct insn_field opcode; /* * opcode.bytes[0]: opcode1 * opcode.bytes[1]: opcode2 @@ -135,6 +138,17 @@ struct insn { #define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ +/* XOP bit fields */ +#define X86_XOP_R(xop) ((xop) & 0x80) /* XOP Byte2 */ +#define X86_XOP_X(xop) ((xop) & 0x40) /* XOP Byte2 */ +#define X86_XOP_B(xop) ((xop) & 0x20) /* XOP Byte2 */ +#define X86_XOP_M(xop) ((xop) & 0x1f) /* XOP Byte2 */ +#define X86_XOP_W(xop) ((xop) & 0x80) /* XOP Byte3 */ +#define X86_XOP_V(xop) ((xop) & 0x78) /* XOP Byte3 */ +#define X86_XOP_L(xop) ((xop) & 0x04) /* XOP Byte3 */ +#define X86_XOP_P(xop) ((xop) & 0x03) /* XOP Byte3 */ +#define X86_XOP_M_MIN 0x08 /* Min of XOP.M */ +#define X86_XOP_M_MAX 0x1f /* Max of XOP.M */ extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); extern int insn_get_prefixes(struct insn *insn); @@ -178,7 +192,7 @@ static inline insn_byte_t insn_rex2_m_bit(struct insn *insn) return X86_REX2_M(insn->rex_prefix.bytes[1]); } -static inline int insn_is_avx(struct insn *insn) +static inline int insn_is_avx_or_xop(struct insn *insn) { if (!insn->prefixes.got) insn_get_prefixes(insn); @@ -192,6 +206,22 @@ static inline int insn_is_evex(struct insn *insn) return (insn->vex_prefix.nbytes == 4); } +/* If we already know this is AVX/XOP encoded */ +static inline int avx_insn_is_xop(struct insn *insn) +{ + insn_attr_t attr = inat_get_opcode_attribute(insn->vex_prefix.bytes[0]); + + return inat_is_xop_prefix(attr); +} + +static inline int insn_is_xop(struct insn *insn) +{ + if (!insn_is_avx_or_xop(insn)) + return 0; + + return avx_insn_is_xop(insn); +} + static inline int insn_has_emulate_prefix(struct insn *insn) { return !!insn->emulate_prefix_size; @@ -222,11 +252,26 @@ static inline insn_byte_t insn_vex_w_bit(struct insn *insn) return X86_VEX_W(insn->vex_prefix.bytes[2]); } +static inline insn_byte_t insn_xop_map_bits(struct insn *insn) +{ + if (insn->xop_prefix.nbytes < 3) /* XOP is 3 bytes */ + return 0; + return X86_XOP_M(insn->xop_prefix.bytes[1]); +} + +static inline insn_byte_t insn_xop_p_bits(struct insn *insn) +{ + return X86_XOP_P(insn->vex_prefix.bytes[2]); +} + /* Get the last prefix id from last prefix or VEX prefix */ static inline int insn_last_prefix_id(struct insn *insn) { - if (insn_is_avx(insn)) + if (insn_is_avx_or_xop(insn)) { + if (avx_insn_is_xop(insn)) + return insn_xop_p_bits(insn); return insn_vex_p_bits(insn); /* VEX_p is a SIMD prefix id */ + } if (insn->prefixes.bytes[3]) return inat_get_last_prefix_id(insn->prefixes.bytes[3]); diff --git a/arch/x86/include/asm/intel-family.h 
b/arch/x86/include/asm/intel-family.h index e345dbdf933e..f32a0eca2ae5 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -51,7 +51,7 @@ #define INTEL_PENTIUM_MMX IFM(5, 0x04) /* P55C */ #define INTEL_QUARK_X1000 IFM(5, 0x09) /* Quark X1000 SoC */ -/* Family 6 */ +/* Family 6, 18, 19 */ #define INTEL_PENTIUM_PRO IFM(6, 0x01) #define INTEL_PENTIUM_II_KLAMATH IFM(6, 0x03) #define INTEL_PENTIUM_III_DESCHUTES IFM(6, 0x05) @@ -126,6 +126,8 @@ #define INTEL_GRANITERAPIDS_X IFM(6, 0xAD) /* Redwood Cove */ #define INTEL_GRANITERAPIDS_D IFM(6, 0xAE) +#define INTEL_DIAMONDRAPIDS_X IFM(19, 0x01) /* Panther Cove */ + #define INTEL_BARTLETTLAKE IFM(6, 0xD7) /* Raptor Cove */ /* "Hybrid" Processors (P-Core/E-Core) */ @@ -203,9 +205,6 @@ #define INTEL_P4_PRESCOTT_2M IFM(15, 0x04) #define INTEL_P4_CEDARMILL IFM(15, 0x06) /* Also Xeon Dempsey */ -/* Family 19 */ -#define INTEL_PANTHERCOVE_X IFM(19, 0x01) /* Diamond Rapids */ - /* * Intel CPU core types * diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 6c77c03139f7..31e3cb550fb3 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -241,12 +241,14 @@ struct cper_ia_proc_ctx; #ifdef CONFIG_X86_MCE int mcheck_init(void); +void mca_bsp_init(struct cpuinfo_x86 *c); void mcheck_cpu_init(struct cpuinfo_x86 *c); void mcheck_cpu_clear(struct cpuinfo_x86 *c); int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id); #else static inline int mcheck_init(void) { return 0; } +static inline void mca_bsp_init(struct cpuinfo_x86 *c) {} static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {} static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, @@ -290,8 +292,7 @@ DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); enum mcp_flags { MCP_TIMESTAMP = BIT(0), /* log time stamp */ MCP_UC = BIT(1), /* log uncorrected errors */ - MCP_DONTLOG = BIT(2), /* only clear, don't log */ - MCP_QUEUE_LOG = BIT(3), /* only queue to genpool */ + MCP_QUEUE_LOG = BIT(2), /* only queue to genpool */ }; void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); @@ -371,15 +372,9 @@ enum smca_bank_types { extern bool amd_mce_is_memory_error(struct mce *m); -extern int mce_threshold_create_device(unsigned int cpu); -extern int mce_threshold_remove_device(unsigned int cpu); - void mce_amd_feature_init(struct cpuinfo_x86 *c); enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank); #else - -static inline int mce_threshold_create_device(unsigned int cpu) { return 0; }; -static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; }; static inline bool amd_mce_is_memory_error(struct mce *m) { return false; }; static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } #endif diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b65c3ba5fa14..718a55d82fe4 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -315,12 +315,14 @@ #define PERF_CAP_PT_IDX 16 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 -#define PERF_CAP_PEBS_TRAP BIT_ULL(6) -#define PERF_CAP_ARCH_REG BIT_ULL(7) -#define PERF_CAP_PEBS_FORMAT 0xf00 -#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) -#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ - PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) +#define PERF_CAP_PEBS_TRAP BIT_ULL(6) +#define PERF_CAP_ARCH_REG BIT_ULL(7) +#define PERF_CAP_PEBS_FORMAT 0xf00 +#define 
PERF_CAP_PEBS_BASELINE BIT_ULL(14) +#define PERF_CAP_PEBS_TIMING_INFO BIT_ULL(17) +#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ + PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \ + PERF_CAP_PEBS_TIMING_INFO) #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) @@ -631,6 +633,11 @@ #define MSR_AMD_PPIN 0xc00102f1 #define MSR_AMD64_CPUID_FN_7 0xc0011002 #define MSR_AMD64_CPUID_FN_1 0xc0011004 + +#define MSR_AMD64_CPUID_EXT_FEAT 0xc0011005 +#define MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT 54 +#define MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT BIT_ULL(MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT) + #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 #define MSR_AMD64_TW_CFG 0xc0011023 @@ -699,8 +706,15 @@ #define MSR_AMD64_SNP_VMSA_REG_PROT BIT_ULL(MSR_AMD64_SNP_VMSA_REG_PROT_BIT) #define MSR_AMD64_SNP_SMT_PROT_BIT 17 #define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT) -#define MSR_AMD64_SNP_RESV_BIT 18 +#define MSR_AMD64_SNP_SECURE_AVIC_BIT 18 +#define MSR_AMD64_SNP_SECURE_AVIC BIT_ULL(MSR_AMD64_SNP_SECURE_AVIC_BIT) +#define MSR_AMD64_SNP_RESV_BIT 19 #define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT) +#define MSR_AMD64_SAVIC_CONTROL 0xc0010138 +#define MSR_AMD64_SAVIC_EN_BIT 0 +#define MSR_AMD64_SAVIC_EN BIT_ULL(MSR_AMD64_SAVIC_EN_BIT) +#define MSR_AMD64_SAVIC_ALLOWEDNMI_BIT 1 +#define MSR_AMD64_SAVIC_ALLOWEDNMI BIT_ULL(MSR_AMD64_SAVIC_ALLOWEDNMI_BIT) #define MSR_AMD64_RMP_BASE 0xc0010132 #define MSR_AMD64_RMP_END 0xc0010133 #define MSR_AMD64_RMP_CFG 0xc0010136 @@ -1223,6 +1237,8 @@ /* - AMD: */ #define MSR_IA32_MBA_BW_BASE 0xc0000200 #define MSR_IA32_SMBA_BW_BASE 0xc0000280 +#define MSR_IA32_L3_QOS_ABMC_CFG 0xc00003fd +#define MSR_IA32_L3_QOS_EXT_CFG 0xc00003ff #define MSR_IA32_EVT_CFG_BASE 0xc0000400 /* AMD-V MSRs */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index e29f82466f43..08ed5a2e46a5 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -514,6 +514,7 @@ enum spectre_v2_user_mitigation { /* The Speculative Store Bypass disable variants */ enum ssb_mitigation { SPEC_STORE_BYPASS_NONE, + SPEC_STORE_BYPASS_AUTO, SPEC_STORE_BYPASS_DISABLE, SPEC_STORE_BYPASS_PRCTL, SPEC_STORE_BYPASS_SECCOMP, diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 70d1d94aca7e..49a4d442f3fc 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -35,7 +35,6 @@ #define ARCH_PERFMON_EVENTSEL_EQ (1ULL << 36) #define ARCH_PERFMON_EVENTSEL_UMASK2 (0xFFULL << 40) -#define INTEL_FIXED_BITS_MASK 0xFULL #define INTEL_FIXED_BITS_STRIDE 4 #define INTEL_FIXED_0_KERNEL (1ULL << 0) #define INTEL_FIXED_0_USER (1ULL << 1) @@ -48,6 +47,11 @@ #define ICL_EVENTSEL_ADAPTIVE (1ULL << 34) #define ICL_FIXED_0_ADAPTIVE (1ULL << 32) +#define INTEL_FIXED_BITS_MASK \ + (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER | \ + INTEL_FIXED_0_ANYTHREAD | INTEL_FIXED_0_ENABLE_PMI | \ + ICL_FIXED_0_ADAPTIVE) + #define intel_fixed_bits_by_idx(_idx, _bits) \ ((_bits) << ((_idx) * INTEL_FIXED_BITS_STRIDE)) @@ -430,7 +434,7 @@ static inline bool is_topdown_idx(int idx) #define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(GLOBAL_STATUS_TRACE_TOPAPMI_BIT) #define GLOBAL_STATUS_PERF_METRICS_OVF_BIT 48 -#define GLOBAL_CTRL_EN_PERF_METRICS 48 +#define GLOBAL_CTRL_EN_PERF_METRICS BIT_ULL(48) /* * We model guest LBR event tracing as another fixed-mode PMC like BTS. 
* diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index feb93b50e990..575f8408a9e7 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -44,7 +44,6 @@ DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state); extern bool rdt_alloc_capable; extern bool rdt_mon_capable; -extern unsigned int rdt_mon_features; DECLARE_STATIC_KEY_FALSE(rdt_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); @@ -84,21 +83,6 @@ static inline void resctrl_arch_disable_mon(void) static_branch_dec_cpuslocked(&rdt_enable_key); } -static inline bool resctrl_arch_is_llc_occupancy_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); -} - -static inline bool resctrl_arch_is_mbm_total_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID)); -} - -static inline bool resctrl_arch_is_mbm_local_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID)); -} - /* * __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR * diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 77d8f49b92bd..f59ae7186940 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -244,7 +244,7 @@ static inline unsigned long vdso_encode_cpunode(int cpu, unsigned long node) static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node) { - unsigned int p; + unsigned long p; /* * Load CPU and node number from the GDT. LSL is faster than RDTSCP @@ -254,10 +254,10 @@ static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node) * * If RDPID is available, use it. */ - alternative_io ("lsl %[seg],%[p]", - ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */ + alternative_io ("lsl %[seg],%k[p]", + "rdpid %[p]", X86_FEATURE_RDPID, - [p] "=a" (p), [seg] "r" (__CPUNODE_SEG)); + [p] "=r" (p), [seg] "r" (__CPUNODE_SEG)); if (cpu) *cpu = (p & VDSO_CPUNODE_MASK); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 692af46603a1..914eb32581c7 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -53,6 +53,7 @@ extern void i386_reserve_resources(void); extern unsigned long __startup_64(unsigned long p2v_offset, struct boot_params *bp); extern void startup_64_setup_gdt_idt(void); extern void startup_64_load_idt(void *vc_handler); +extern void __pi_startup_64_load_idt(void *vc_handler); extern void early_setup_idt(void); extern void __init do_early_exception(struct pt_regs *regs, int trapnr); diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 0020d77a0800..01a6e4dbe423 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -208,6 +208,7 @@ struct snp_psc_desc { #define GHCB_TERM_SVSM_CAA 9 /* SVSM is present but CAA is not page aligned */ #define GHCB_TERM_SECURE_TSC 10 /* Secure TSC initialization failed */ #define GHCB_TERM_SVSM_CA_REMAP_FAIL 11 /* SVSM is present but CA could not be remapped */ +#define GHCB_TERM_SAVIC_FAIL 12 /* Secure AVIC-specific failure */ #define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK) diff --git a/arch/x86/include/asm/sev-internal.h b/arch/x86/include/asm/sev-internal.h index 3dfd306d1c9e..c58c47c68ab6 100644 --- a/arch/x86/include/asm/sev-internal.h +++ b/arch/x86/include/asm/sev-internal.h @@ -2,7 +2,6 @@ #define DR7_RESET_VALUE 0x400 -extern struct ghcb boot_ghcb_page; extern u64 sev_hv_features; extern u64 sev_secrets_pa; @@ -56,31 +55,15 @@ DECLARE_PER_CPU(struct sev_es_runtime_data*, runtime_data); 
DECLARE_PER_CPU(struct sev_es_save_area *, sev_vmsa); void early_set_pages_state(unsigned long vaddr, unsigned long paddr, - unsigned long npages, enum psc_op op); + unsigned long npages, const struct psc_desc *desc); DECLARE_PER_CPU(struct svsm_ca *, svsm_caa); DECLARE_PER_CPU(u64, svsm_caa_pa); -extern struct svsm_ca *boot_svsm_caa; extern u64 boot_svsm_caa_pa; -static __always_inline struct svsm_ca *svsm_get_caa(void) -{ - if (sev_cfg.use_cas) - return this_cpu_read(svsm_caa); - else - return boot_svsm_caa; -} - -static __always_inline u64 svsm_get_caa_pa(void) -{ - if (sev_cfg.use_cas) - return this_cpu_read(svsm_caa_pa); - else - return boot_svsm_caa_pa; -} - -int svsm_perform_call_protocol(struct svsm_call *call); +enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt); +void vc_forward_exception(struct es_em_ctxt *ctxt); static inline u64 sev_es_rd_ghcb_msr(void) { @@ -97,9 +80,8 @@ static __always_inline void sev_es_wr_ghcb_msr(u64 val) native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); } -void snp_register_ghcb_early(unsigned long paddr); -bool sev_es_negotiate_protocol(void); -bool sev_es_check_cpu_features(void); +enum es_result sev_es_ghcb_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt, bool write); + u64 get_hv_features(void); const struct snp_cpuid_table *snp_cpuid_get_table(void); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 465b19fd1a2d..f9046c4b9a2b 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -503,6 +503,7 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) } void setup_ghcb(void); +void snp_register_ghcb_early(unsigned long paddr); void early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages); void early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, @@ -511,14 +512,12 @@ void snp_set_memory_shared(unsigned long vaddr, unsigned long npages); void snp_set_memory_private(unsigned long vaddr, unsigned long npages); void snp_set_wakeup_secondary_cpu(void); bool snp_init(struct boot_params *bp); -void __noreturn snp_abort(void); void snp_dmi_setup(void); int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input); void snp_accept_memory(phys_addr_t start, phys_addr_t end); u64 snp_get_unsupported_features(u64 status); u64 sev_get_status(void); void sev_show_status(void); -void snp_update_svsm_ca(void); int prepare_pte_enc(struct pte_enc_desc *d); void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot); void snp_kexec_finish(void); @@ -533,6 +532,10 @@ int snp_svsm_vtpm_send_command(u8 *buffer); void __init snp_secure_tsc_prepare(void); void __init snp_secure_tsc_init(void); +enum es_result savic_register_gpa(u64 gpa); +enum es_result savic_unregister_gpa(u64 *gpa); +u64 savic_ghcb_msr_read(u32 reg); +void savic_ghcb_msr_write(u32 reg, u64 value); static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) { @@ -540,8 +543,6 @@ static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } -void vc_forward_exception(struct es_em_ctxt *ctxt); - /* I/O parameters for CPUID-related helpers */ struct cpuid_leaf { u32 fn; @@ -552,7 +553,13 @@ struct cpuid_leaf { u32 edx; }; -int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf); +int svsm_perform_msr_protocol(struct svsm_call *call); +int __pi_svsm_perform_msr_protocol(struct svsm_call 
*call); +int snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), + void *ctx, struct cpuid_leaf *leaf); + +void svsm_issue_call(struct svsm_call *call, u8 *pending); +int svsm_process_result_codes(struct svsm_call *call); void __noreturn sev_es_terminate(unsigned int set, unsigned int reason); enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, @@ -560,7 +567,18 @@ enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, u64 exit_code, u64 exit_info_1, u64 exit_info_2); +bool sev_es_negotiate_protocol(void); +bool sev_es_check_cpu_features(void); + +extern u16 ghcb_version; extern struct ghcb *boot_ghcb; +extern bool sev_snp_needs_sfw; + +struct psc_desc { + enum psc_op op; + struct svsm_ca *ca; + u64 caa_pa; +}; static inline void sev_evict_cache(void *va, int npages) { @@ -600,7 +618,6 @@ static inline void snp_set_memory_shared(unsigned long vaddr, unsigned long npag static inline void snp_set_memory_private(unsigned long vaddr, unsigned long npages) { } static inline void snp_set_wakeup_secondary_cpu(void) { } static inline bool snp_init(struct boot_params *bp) { return false; } -static inline void snp_abort(void) { } static inline void snp_dmi_setup(void) { } static inline int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input) { @@ -610,7 +627,6 @@ static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } static inline u64 snp_get_unsupported_features(u64 status) { return 0; } static inline u64 sev_get_status(void) { return 0; } static inline void sev_show_status(void) { } -static inline void snp_update_svsm_ca(void) { } static inline int prepare_pte_enc(struct pte_enc_desc *d) { return 0; } static inline void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot) { } static inline void snp_kexec_finish(void) { } @@ -624,6 +640,10 @@ static inline int snp_svsm_vtpm_send_command(u8 *buffer) { return -ENODEV; } static inline void __init snp_secure_tsc_prepare(void) { } static inline void __init snp_secure_tsc_init(void) { } static inline void sev_evict_cache(void *va, int npages) {} +static inline enum es_result savic_register_gpa(u64 gpa) { return ES_UNSUPPORTED; } +static inline enum es_result savic_unregister_gpa(u64 *gpa) { return ES_UNSUPPORTED; } +static inline void savic_ghcb_msr_write(u32 reg, u64 value) { } +static inline u64 savic_ghcb_msr_read(u32 reg) { return 0; } #endif /* CONFIG_AMD_MEM_ENCRYPT */ @@ -635,9 +655,13 @@ void snp_dump_hva_rmpentry(unsigned long address); int psmash(u64 pfn); int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable); int rmp_make_shared(u64 pfn, enum pg_level level); -void snp_leak_pages(u64 pfn, unsigned int npages); +void __snp_leak_pages(u64 pfn, unsigned int npages, bool dump_rmp); void kdump_sev_callback(void); void snp_fixup_e820_tables(void); +static inline void snp_leak_pages(u64 pfn, unsigned int pages) +{ + __snp_leak_pages(pfn, pages, true); +} #else static inline bool snp_probe_rmptable_info(void) { return false; } static inline int snp_rmptable_init(void) { return -ENOSYS; } @@ -650,6 +674,7 @@ static inline int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 as return -ENODEV; } static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENODEV; } +static inline void __snp_leak_pages(u64 pfn, unsigned int npages, bool dump_rmp) {} static inline void snp_leak_pages(u64 pfn, unsigned int npages) {} static inline void kdump_sev_callback(void) { } static inline void snp_fixup_e820_tables(void) 
{} diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h index 0f50e0125943..fc7dcec58fd4 100644 --- a/arch/x86/include/asm/shstk.h +++ b/arch/x86/include/asm/shstk.h @@ -23,6 +23,8 @@ int setup_signal_shadow_stack(struct ksignal *ksig); int restore_signal_shadow_stack(void); int shstk_update_last_frame(unsigned long val); bool shstk_is_enabled(void); +int shstk_pop(u64 *val); +int shstk_push(u64 val); #else static inline long shstk_prctl(struct task_struct *task, int option, unsigned long arg2) { return -EINVAL; } @@ -35,6 +37,8 @@ static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; } static inline int restore_signal_shadow_stack(void) { return 0; } static inline int shstk_update_last_frame(unsigned long val) { return 0; } static inline bool shstk_is_enabled(void) { return false; } +static inline int shstk_pop(u64 *val) { return -ENOTSUPP; } +static inline int shstk_push(u64 val) { return -ENOTSUPP; } #endif /* CONFIG_X86_USER_SHADOW_STACK */ #endif /* __ASSEMBLER__ */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 9282465eea21..e71e0e8362ed 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -80,56 +80,42 @@ struct thread_info { #endif /* - * thread information flags - * - these are process state flags that various assembly files - * may need to access + * Tell the generic TIF infrastructure which bits x86 supports */ -#define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ -#define TIF_SIGPENDING 2 /* signal pending */ -#define TIF_NEED_RESCHED 3 /* rescheduling necessary */ -#define TIF_NEED_RESCHED_LAZY 4 /* Lazy rescheduling needed */ -#define TIF_SINGLESTEP 5 /* reenable singlestep on user return*/ -#define TIF_SSBD 6 /* Speculative store bypass disable */ -#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ -#define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */ -#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ -#define TIF_UPROBE 12 /* breakpointed or singlestepping */ -#define TIF_PATCH_PENDING 13 /* pending live patching update */ -#define TIF_NEED_FPU_LOAD 14 /* load FPU on return to userspace */ -#define TIF_NOCPUID 15 /* CPUID is not accessible in userland */ -#define TIF_NOTSC 16 /* TSC is not accessible in userland */ -#define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */ -#define TIF_MEMDIE 20 /* is terminating due to OOM killer */ -#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ +#define HAVE_TIF_NEED_RESCHED_LAZY +#define HAVE_TIF_POLLING_NRFLAG +#define HAVE_TIF_SINGLESTEP + +#include <asm-generic/thread_info_tif.h> + +/* Architecture specific TIF space starts at 16 */ +#define TIF_SSBD 16 /* Speculative store bypass disable */ +#define TIF_SPEC_IB 17 /* Indirect branch speculation mitigation */ +#define TIF_SPEC_L1D_FLUSH 18 /* Flush L1D on mm switches (processes) */ +#define TIF_NEED_FPU_LOAD 19 /* load FPU on return to userspace */ +#define TIF_NOCPUID 20 /* CPUID is not accessible in userland */ +#define TIF_NOTSC 21 /* TSC is not accessible in userland */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ #define TIF_SPEC_FORCE_UPDATE 23 /* Force speculation MSR update in context switch */ #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ -#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ +#define TIF_SINGLESTEP 25 /* reenable singlestep on user return*/ +#define TIF_BLOCKSTEP 26 /* set when we want 
DEBUGCTLMSR_BTF */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ -#define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ - -#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) -#define _TIF_SIGPENDING (1 << TIF_SIGPENDING) -#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) -#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) -#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) -#define _TIF_SSBD (1 << TIF_SSBD) -#define _TIF_SPEC_IB (1 << TIF_SPEC_IB) -#define _TIF_SPEC_L1D_FLUSH (1 << TIF_SPEC_L1D_FLUSH) -#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) -#define _TIF_UPROBE (1 << TIF_UPROBE) -#define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) -#define _TIF_NEED_FPU_LOAD (1 << TIF_NEED_FPU_LOAD) -#define _TIF_NOCPUID (1 << TIF_NOCPUID) -#define _TIF_NOTSC (1 << TIF_NOTSC) -#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) -#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) -#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) -#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) -#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) -#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) -#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) -#define _TIF_ADDR32 (1 << TIF_ADDR32) +#define TIF_ADDR32 28 /* 32-bit address space on 64 bits */ + +#define _TIF_SSBD BIT(TIF_SSBD) +#define _TIF_SPEC_IB BIT(TIF_SPEC_IB) +#define _TIF_SPEC_L1D_FLUSH BIT(TIF_SPEC_L1D_FLUSH) +#define _TIF_NEED_FPU_LOAD BIT(TIF_NEED_FPU_LOAD) +#define _TIF_NOCPUID BIT(TIF_NOCPUID) +#define _TIF_NOTSC BIT(TIF_NOTSC) +#define _TIF_IO_BITMAP BIT(TIF_IO_BITMAP) +#define _TIF_SPEC_FORCE_UPDATE BIT(TIF_SPEC_FORCE_UPDATE) +#define _TIF_FORCED_TF BIT(TIF_FORCED_TF) +#define _TIF_BLOCKSTEP BIT(TIF_BLOCKSTEP) +#define _TIF_SINGLESTEP BIT(TIF_SINGLESTEP) +#define _TIF_LAZY_MMU_UPDATES BIT(TIF_LAZY_MMU_UPDATES) +#define _TIF_ADDR32 BIT(TIF_ADDR32) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW_BASE \ diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 678fb546f0a7..1ee2e5115955 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -20,6 +20,11 @@ typedef u8 uprobe_opcode_t; #define UPROBE_SWBP_INSN 0xcc #define UPROBE_SWBP_INSN_SIZE 1 +enum { + ARCH_UPROBE_FLAG_CAN_OPTIMIZE = 0, + ARCH_UPROBE_FLAG_OPTIMIZE_FAIL = 1, +}; + struct uprobe_xol_ops; struct arch_uprobe { @@ -45,6 +50,8 @@ struct arch_uprobe { u8 ilen; } push; }; + + unsigned long flags; }; struct arch_uprobe_task { diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index 9c640a521a67..650e3256ea7d 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -118,6 +118,10 @@ #define SVM_VMGEXIT_AP_CREATE 1 #define SVM_VMGEXIT_AP_DESTROY 2 #define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018 +#define SVM_VMGEXIT_SAVIC 0x8000001a +#define SVM_VMGEXIT_SAVIC_REGISTER_GPA 0 +#define SVM_VMGEXIT_SAVIC_UNREGISTER_GPA 1 +#define SVM_VMGEXIT_SAVIC_SELF_GPA ~0ULL #define SVM_VMGEXIT_HV_FEATURES 0x8000fffd #define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe #define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \ diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 52d1808ee360..581db89477f9 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -18,6 +18,7 @@ ifeq ($(CONFIG_X86_64),y) # APIC probe will depend on the listing order here obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o obj-$(CONFIG_X86_UV) += x2apic_uv_x.o +obj-$(CONFIG_AMD_SECURE_AVIC) += x2apic_savic.o 
obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o obj-y += apic_flat_64.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index d73ba5a7b623..680d305589a3 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -592,6 +592,8 @@ static void setup_APIC_timer(void) 0xF, ~0UL); } else clockevents_register_device(levt); + + apic_update_vector(smp_processor_id(), LOCAL_TIMER_VECTOR, true); } /* @@ -1168,6 +1170,9 @@ void disable_local_APIC(void) if (!apic_accessible()) return; + if (apic->teardown) + apic->teardown(); + apic_soft_disable(); #ifdef CONFIG_X86_32 @@ -1428,63 +1433,61 @@ union apic_ir { u32 regs[APIC_IR_REGS]; }; -static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr) +static bool apic_check_and_eoi_isr(union apic_ir *isr) { int i, bit; - /* Read the IRRs */ - for (i = 0; i < APIC_IR_REGS; i++) - irr->regs[i] = apic_read(APIC_IRR + i * 0x10); - /* Read the ISRs */ for (i = 0; i < APIC_IR_REGS; i++) isr->regs[i] = apic_read(APIC_ISR + i * 0x10); + /* If the ISR map empty, nothing to do here. */ + if (bitmap_empty(isr->map, APIC_IR_BITS)) + return true; + /* - * If the ISR map is not empty. ACK the APIC and run another round - * to verify whether a pending IRR has been unblocked and turned - * into a ISR. + * There can be multiple ISR bits set when a high priority + * interrupt preempted a lower priority one. Issue an EOI for each + * set bit. The priority traversal order does not matter as there + * can't be new ISR bits raised at this point. What matters is that + * an EOI is issued for each ISR bit. */ - if (!bitmap_empty(isr->map, APIC_IR_BITS)) { - /* - * There can be multiple ISR bits set when a high priority - * interrupt preempted a lower priority one. Issue an ACK - * per set bit. - */ - for_each_set_bit(bit, isr->map, APIC_IR_BITS) - apic_eoi(); - return true; - } + for_each_set_bit(bit, isr->map, APIC_IR_BITS) + apic_eoi(); - return !bitmap_empty(irr->map, APIC_IR_BITS); + /* Reread the ISRs, they should be empty now */ + for (i = 0; i < APIC_IR_REGS; i++) + isr->regs[i] = apic_read(APIC_ISR + i * 0x10); + + return bitmap_empty(isr->map, APIC_IR_BITS); } /* - * After a crash, we no longer service the interrupts and a pending - * interrupt from previous kernel might still have ISR bit set. + * If a CPU services an interrupt and crashes before issuing EOI to the + * local APIC, the corresponding ISR bit is still set when the crashing CPU + * jumps into a crash kernel. Read the ISR and issue an EOI for each set + * bit to acknowledge it as otherwise these slots would be locked forever + * waiting for an EOI. * - * Most probably by now the CPU has serviced that pending interrupt and it - * might not have done the apic_eoi() because it thought, interrupt - * came from i8259 as ExtInt. LAPIC did not get EOI so it does not clear - * the ISR bit and cpu thinks it has already serviced the interrupt. Hence - * a vector might get locked. It was noticed for timer irq (vector - * 0x31). Issue an extra EOI to clear ISR. + * If there are pending bits in the IRR, then they won't be converted into + * ISR bits as the CPU has interrupts disabled. They will be delivered once + * the CPU enables interrupts and there is nothing which can prevent that. * - * If there are pending IRR bits they turn into ISR bits after a higher - * priority ISR bit has been acked. + * In the worst case this results in spurious interrupt warnings. 
*/ -static void apic_pending_intr_clear(void) +static void apic_clear_isr(void) { - union apic_ir irr, isr; + union apic_ir ir; unsigned int i; - /* 512 loops are way oversized and give the APIC a chance to obey. */ - for (i = 0; i < 512; i++) { - if (!apic_check_and_ack(&irr, &isr)) - return; - } - /* Dump the IRR/ISR content if that failed */ - pr_warn("APIC: Stale IRR: %256pb ISR: %256pb\n", irr.map, isr.map); + if (!apic_check_and_eoi_isr(&ir)) + pr_warn("APIC: Stale ISR: %256pb\n", ir.map); + + for (i = 0; i < APIC_IR_REGS; i++) + ir.regs[i] = apic_read(APIC_IRR + i * 0x10); + + if (!bitmap_empty(ir.map, APIC_IR_BITS)) + pr_warn("APIC: Stale IRR: %256pb\n", ir.map); } /** @@ -1503,6 +1506,9 @@ static void setup_local_APIC(void) return; } + if (apic->setup) + apic->setup(); + /* * If this comes from kexec/kcrash the APIC might be enabled in * SPIV. Soft disable it before doing further initialization. @@ -1541,8 +1547,7 @@ static void setup_local_APIC(void) value |= 0x10; apic_write(APIC_TASKPRI, value); - /* Clear eventually stale ISR/IRR bits */ - apic_pending_intr_clear(); + apic_clear_isr(); /* * Now that we are all set up, enable the APIC diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index a947b46a8b64..bddc54465399 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -134,13 +134,20 @@ static void apic_update_irq_cfg(struct irq_data *irqd, unsigned int vector, apicd->hw_irq_cfg.vector = vector; apicd->hw_irq_cfg.dest_apicid = apic->calc_dest_apicid(cpu); + + apic_update_vector(cpu, vector, true); + irq_data_update_effective_affinity(irqd, cpumask_of(cpu)); - trace_vector_config(irqd->irq, vector, cpu, - apicd->hw_irq_cfg.dest_apicid); + trace_vector_config(irqd->irq, vector, cpu, apicd->hw_irq_cfg.dest_apicid); } -static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, - unsigned int newcpu) +static void apic_free_vector(unsigned int cpu, unsigned int vector, bool managed) +{ + apic_update_vector(cpu, vector, false); + irq_matrix_free(vector_matrix, cpu, vector, managed); +} + +static void chip_data_update(struct irq_data *irqd, unsigned int newvec, unsigned int newcpu) { struct apic_chip_data *apicd = apic_chip_data(irqd); struct irq_desc *desc = irq_data_to_desc(irqd); @@ -174,8 +181,7 @@ static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, apicd->prev_cpu = apicd->cpu; WARN_ON_ONCE(apicd->cpu == newcpu); } else { - irq_matrix_free(vector_matrix, apicd->cpu, apicd->vector, - managed); + apic_free_vector(apicd->cpu, apicd->vector, managed); } setnew: @@ -261,7 +267,7 @@ assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest) trace_vector_alloc(irqd->irq, vector, resvd, vector); if (vector < 0) return vector; - apic_update_vector(irqd, vector, cpu); + chip_data_update(irqd, vector, cpu); return 0; } @@ -337,7 +343,7 @@ assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest) trace_vector_alloc_managed(irqd->irq, vector, vector); if (vector < 0) return vector; - apic_update_vector(irqd, vector, cpu); + chip_data_update(irqd, vector, cpu); return 0; } @@ -357,7 +363,7 @@ static void clear_irq_vector(struct irq_data *irqd) apicd->prev_cpu); per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_SHUTDOWN; - irq_matrix_free(vector_matrix, apicd->cpu, vector, managed); + apic_free_vector(apicd->cpu, vector, managed); apicd->vector = 0; /* Clean up move in progress */ @@ -366,7 +372,7 @@ static void clear_irq_vector(struct irq_data *irqd) return; 
per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_SHUTDOWN; - irq_matrix_free(vector_matrix, apicd->prev_cpu, vector, managed); + apic_free_vector(apicd->prev_cpu, vector, managed); apicd->prev_vector = 0; apicd->move_in_progress = 0; hlist_del_init(&apicd->clist); @@ -905,7 +911,7 @@ static void free_moved_vector(struct apic_chip_data *apicd) * affinity mask comes online. */ trace_vector_free_moved(apicd->irq, cpu, vector, managed); - irq_matrix_free(vector_matrix, cpu, vector, managed); + apic_free_vector(cpu, vector, managed); per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; hlist_del_init(&apicd->clist); apicd->prev_vector = 0; diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c new file mode 100644 index 000000000000..dbc5678bc3b6 --- /dev/null +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -0,0 +1,428 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Secure AVIC Support (SEV-SNP Guests) + * + * Copyright (C) 2024 Advanced Micro Devices, Inc. + * + * Author: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com> + */ + +#include <linux/cc_platform.h> +#include <linux/cpumask.h> +#include <linux/percpu-defs.h> +#include <linux/align.h> + +#include <asm/apic.h> +#include <asm/sev.h> + +#include "local.h" + +struct secure_avic_page { + u8 regs[PAGE_SIZE]; +} __aligned(PAGE_SIZE); + +static struct secure_avic_page __percpu *savic_page __ro_after_init; + +static int savic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return x2apic_enabled() && cc_platform_has(CC_ATTR_SNP_SECURE_AVIC); +} + +static inline void *get_reg_bitmap(unsigned int cpu, unsigned int offset) +{ + return &per_cpu_ptr(savic_page, cpu)->regs[offset]; +} + +static inline void update_vector(unsigned int cpu, unsigned int offset, + unsigned int vector, bool set) +{ + void *bitmap = get_reg_bitmap(cpu, offset); + + if (set) + apic_set_vector(vector, bitmap); + else + apic_clear_vector(vector, bitmap); +} + +#define SAVIC_ALLOWED_IRR 0x204 + +/* + * When Secure AVIC is enabled, RDMSR/WRMSR of the APIC registers + * result in #VC exception (for non-accelerated register accesses) + * with VMEXIT_AVIC_NOACCEL error code. The #VC exception handler + * can read/write the x2APIC register in the guest APIC backing page. + * + * Since doing this would increase the latency of accessing x2APIC + * registers, instead of doing RDMSR/WRMSR based accesses and + * handling the APIC register reads/writes in the #VC exception handler, + * the read() and write() callbacks directly read/write the APIC register + * from/to the vCPU's APIC backing page. + */ +static u32 savic_read(u32 reg) +{ + void *ap = this_cpu_ptr(savic_page); + + switch (reg) { + case APIC_LVTT: + case APIC_TMICT: + case APIC_TMCCT: + case APIC_TDCR: + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVT0: + case APIC_LVT1: + case APIC_LVTERR: + return savic_ghcb_msr_read(reg); + case APIC_ID: + case APIC_LVR: + case APIC_TASKPRI: + case APIC_ARBPRI: + case APIC_PROCPRI: + case APIC_LDR: + case APIC_SPIV: + case APIC_ESR: + case APIC_EFEAT: + case APIC_ECTRL: + case APIC_SEOI: + case APIC_IER: + case APIC_EILVTn(0) ... APIC_EILVTn(3): + return apic_get_reg(ap, reg); + case APIC_ICR: + return (u32)apic_get_reg64(ap, reg); + case APIC_ISR ... APIC_ISR + 0x70: + case APIC_TMR ... APIC_TMR + 0x70: + if (WARN_ONCE(!IS_ALIGNED(reg, 16), + "APIC register read offset 0x%x not aligned at 16 bytes", reg)) + return 0; + return apic_get_reg(ap, reg); + /* IRR and ALLOWED_IRR offset range */ + case APIC_IRR ... 
APIC_IRR + 0x74: + /* + * Valid APIC_IRR/SAVIC_ALLOWED_IRR registers are at 16 bytes strides from + * their respective base offset. APIC_IRRs are in the range + * + * (0x200, 0x210, ..., 0x270) + * + * while the SAVIC_ALLOWED_IRR range starts 4 bytes later, in the range + * + * (0x204, 0x214, ..., 0x274). + * + * Filter out everything else. + */ + if (WARN_ONCE(!(IS_ALIGNED(reg, 16) || + IS_ALIGNED(reg - 4, 16)), + "Misaligned APIC_IRR/ALLOWED_IRR APIC register read offset 0x%x", reg)) + return 0; + return apic_get_reg(ap, reg); + default: + pr_err("Error reading unknown Secure AVIC reg offset 0x%x\n", reg); + return 0; + } +} + +#define SAVIC_NMI_REQ 0x278 + +/* + * On WRMSR to APIC_SELF_IPI register by the guest, Secure AVIC hardware + * updates the APIC_IRR in the APIC backing page of the vCPU. In addition, + * hardware evaluates the new APIC_IRR update for interrupt injection to + * the vCPU. So, self IPIs are hardware-accelerated. + */ +static inline void self_ipi_reg_write(unsigned int vector) +{ + native_apic_msr_write(APIC_SELF_IPI, vector); +} + +static void send_ipi_dest(unsigned int cpu, unsigned int vector, bool nmi) +{ + if (nmi) + apic_set_reg(per_cpu_ptr(savic_page, cpu), SAVIC_NMI_REQ, 1); + else + update_vector(cpu, APIC_IRR, vector, true); +} + +static void send_ipi_allbut(unsigned int vector, bool nmi) +{ + unsigned int cpu, src_cpu; + + guard(irqsave)(); + + src_cpu = raw_smp_processor_id(); + + for_each_cpu(cpu, cpu_online_mask) { + if (cpu == src_cpu) + continue; + send_ipi_dest(cpu, vector, nmi); + } +} + +static inline void self_ipi(unsigned int vector, bool nmi) +{ + u32 icr_low = APIC_SELF_IPI | vector; + + if (nmi) + icr_low |= APIC_DM_NMI; + + native_x2apic_icr_write(icr_low, 0); +} + +static void savic_icr_write(u32 icr_low, u32 icr_high) +{ + unsigned int dsh, vector; + u64 icr_data; + bool nmi; + + dsh = icr_low & APIC_DEST_ALLBUT; + vector = icr_low & APIC_VECTOR_MASK; + nmi = ((icr_low & APIC_DM_FIXED_MASK) == APIC_DM_NMI); + + switch (dsh) { + case APIC_DEST_SELF: + self_ipi(vector, nmi); + break; + case APIC_DEST_ALLINC: + self_ipi(vector, nmi); + fallthrough; + case APIC_DEST_ALLBUT: + send_ipi_allbut(vector, nmi); + break; + default: + send_ipi_dest(icr_high, vector, nmi); + break; + } + + icr_data = ((u64)icr_high) << 32 | icr_low; + if (dsh != APIC_DEST_SELF) + savic_ghcb_msr_write(APIC_ICR, icr_data); + apic_set_reg64(this_cpu_ptr(savic_page), APIC_ICR, icr_data); +} + +static void savic_write(u32 reg, u32 data) +{ + void *ap = this_cpu_ptr(savic_page); + + switch (reg) { + case APIC_LVTT: + case APIC_TMICT: + case APIC_TDCR: + case APIC_LVT0: + case APIC_LVT1: + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVTERR: + savic_ghcb_msr_write(reg, data); + break; + case APIC_TASKPRI: + case APIC_EOI: + case APIC_SPIV: + case SAVIC_NMI_REQ: + case APIC_ESR: + case APIC_ECTRL: + case APIC_SEOI: + case APIC_IER: + case APIC_EILVTn(0) ... APIC_EILVTn(3): + apic_set_reg(ap, reg, data); + break; + case APIC_ICR: + savic_icr_write(data, 0); + break; + case APIC_SELF_IPI: + self_ipi_reg_write(data); + break; + /* ALLOWED_IRR offsets are writable */ + case SAVIC_ALLOWED_IRR ... 
SAVIC_ALLOWED_IRR + 0x70: + if (IS_ALIGNED(reg - 4, 16)) { + apic_set_reg(ap, reg, data); + break; + } + fallthrough; + default: + pr_err("Error writing unknown Secure AVIC reg offset 0x%x\n", reg); + } +} + +static void send_ipi(u32 dest, unsigned int vector, unsigned int dsh) +{ + unsigned int icr_low; + + icr_low = __prepare_ICR(dsh, vector, APIC_DEST_PHYSICAL); + savic_icr_write(icr_low, dest); +} + +static void savic_send_ipi(int cpu, int vector) +{ + u32 dest = per_cpu(x86_cpu_to_apicid, cpu); + + send_ipi(dest, vector, 0); +} + +static void send_ipi_mask(const struct cpumask *mask, unsigned int vector, bool excl_self) +{ + unsigned int cpu, this_cpu; + + guard(irqsave)(); + + this_cpu = raw_smp_processor_id(); + + for_each_cpu(cpu, mask) { + if (excl_self && cpu == this_cpu) + continue; + send_ipi(per_cpu(x86_cpu_to_apicid, cpu), vector, 0); + } +} + +static void savic_send_ipi_mask(const struct cpumask *mask, int vector) +{ + send_ipi_mask(mask, vector, false); +} + +static void savic_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) +{ + send_ipi_mask(mask, vector, true); +} + +static void savic_send_ipi_allbutself(int vector) +{ + send_ipi(0, vector, APIC_DEST_ALLBUT); +} + +static void savic_send_ipi_all(int vector) +{ + send_ipi(0, vector, APIC_DEST_ALLINC); +} + +static void savic_send_ipi_self(int vector) +{ + self_ipi_reg_write(vector); +} + +static void savic_update_vector(unsigned int cpu, unsigned int vector, bool set) +{ + update_vector(cpu, SAVIC_ALLOWED_IRR, vector, set); +} + +static void savic_eoi(void) +{ + unsigned int cpu; + int vec; + + cpu = raw_smp_processor_id(); + vec = apic_find_highest_vector(get_reg_bitmap(cpu, APIC_ISR)); + if (WARN_ONCE(vec == -1, "EOI write while no active interrupt in APIC_ISR")) + return; + + /* Is level-triggered interrupt? */ + if (apic_test_vector(vec, get_reg_bitmap(cpu, APIC_TMR))) { + update_vector(cpu, APIC_ISR, vec, false); + /* + * Propagate the EOI write to the hypervisor for level-triggered + * interrupts. Return to the guest from GHCB protocol event takes + * care of re-evaluating interrupt state. + */ + savic_ghcb_msr_write(APIC_EOI, 0); + } else { + /* + * Hardware clears APIC_ISR and re-evaluates the interrupt state + * to determine if there is any pending interrupt which can be + * delivered to CPU. + */ + native_apic_msr_eoi(); + } +} + +static void savic_teardown(void) +{ + /* Disable Secure AVIC */ + native_wrmsrq(MSR_AMD64_SAVIC_CONTROL, 0); + savic_unregister_gpa(NULL); +} + +static void savic_setup(void) +{ + void *ap = this_cpu_ptr(savic_page); + enum es_result res; + unsigned long gpa; + + /* + * Before Secure AVIC is enabled, APIC MSR reads are intercepted. + * APIC_ID MSR read returns the value from the hypervisor. + */ + apic_set_reg(ap, APIC_ID, native_apic_msr_read(APIC_ID)); + + gpa = __pa(ap); + + /* + * The NPT entry for a vCPU's APIC backing page must always be + * present when the vCPU is running in order for Secure AVIC to + * function. A VMEXIT_BUSY is returned on VMRUN and the vCPU cannot + * be resumed if the NPT entry for the APIC backing page is not + * present. Notify GPA of the vCPU's APIC backing page to the + * hypervisor by calling savic_register_gpa(). Before executing + * VMRUN, the hypervisor makes use of this information to make sure + * the APIC backing page is mapped in NPT. 
+ */ + res = savic_register_gpa(gpa); + if (res != ES_OK) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); + + native_wrmsrq(MSR_AMD64_SAVIC_CONTROL, + gpa | MSR_AMD64_SAVIC_EN | MSR_AMD64_SAVIC_ALLOWEDNMI); +} + +static int savic_probe(void) +{ + if (!cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) + return 0; + + if (!x2apic_mode) { + pr_err("Secure AVIC enabled in non x2APIC mode\n"); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); + /* unreachable */ + } + + savic_page = alloc_percpu(struct secure_avic_page); + if (!savic_page) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL); + + return 1; +} + +static struct apic apic_x2apic_savic __ro_after_init = { + + .name = "secure avic x2apic", + .probe = savic_probe, + .acpi_madt_oem_check = savic_acpi_madt_oem_check, + .setup = savic_setup, + .teardown = savic_teardown, + + .dest_mode_logical = false, + + .disable_esr = 0, + + .cpu_present_to_apicid = default_cpu_present_to_apicid, + + .max_apic_id = UINT_MAX, + .x2apic_set_max_apicid = true, + .get_apic_id = x2apic_get_apic_id, + + .calc_dest_apicid = apic_default_calc_apicid, + + .send_IPI = savic_send_ipi, + .send_IPI_mask = savic_send_ipi_mask, + .send_IPI_mask_allbutself = savic_send_ipi_mask_allbutself, + .send_IPI_allbutself = savic_send_ipi_allbutself, + .send_IPI_all = savic_send_ipi_all, + .send_IPI_self = savic_send_ipi_self, + + .nmi_to_offline_cpu = true, + + .read = savic_read, + .write = savic_write, + .eoi = savic_eoi, + .icr_read = native_x2apic_icr_read, + .icr_write = savic_icr_write, + + .update_vector = savic_update_vector, +}; + +apic_driver(apic_x2apic_savic); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 1e26179ff18c..2f8a58ef690e 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -58,6 +58,7 @@ obj-$(CONFIG_X86_SGX) += sgx/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o +obj-$(CONFIG_BHYVE_GUEST) += bhyve.o obj-$(CONFIG_ACRN_GUEST) += acrn.o obj-$(CONFIG_DEBUG_FS) += debugfs.o diff --git a/arch/x86/kernel/cpu/bhyve.c b/arch/x86/kernel/cpu/bhyve.c new file mode 100644 index 000000000000..f1a8ca3dd1ed --- /dev/null +++ b/arch/x86/kernel/cpu/bhyve.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FreeBSD Bhyve guest enlightenments + * + * Copyright © 2025 Amazon.com, Inc. or its affiliates. 
+ * + * Author: David Woodhouse <dwmw2@infradead.org> + */ + +#include <linux/init.h> +#include <linux/export.h> +#include <asm/processor.h> +#include <asm/hypervisor.h> + +static uint32_t bhyve_cpuid_base; +static uint32_t bhyve_cpuid_max; + +#define BHYVE_SIGNATURE "bhyve bhyve " + +#define CPUID_BHYVE_FEATURES 0x40000001 + +/* Features advertised in CPUID_BHYVE_FEATURES %eax */ + +/* MSI Extended Dest ID */ +#define CPUID_BHYVE_FEAT_EXT_DEST_ID (1UL << 0) + +static uint32_t __init bhyve_detect(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) + return 0; + + bhyve_cpuid_base = cpuid_base_hypervisor(BHYVE_SIGNATURE, 0); + if (!bhyve_cpuid_base) + return 0; + + bhyve_cpuid_max = cpuid_eax(bhyve_cpuid_base); + return bhyve_cpuid_max; +} + +static uint32_t bhyve_features(void) +{ + unsigned int cpuid_leaf = bhyve_cpuid_base | CPUID_BHYVE_FEATURES; + + if (bhyve_cpuid_max < cpuid_leaf) + return 0; + + return cpuid_eax(cpuid_leaf); +} + +static bool __init bhyve_ext_dest_id(void) +{ + return !!(bhyve_features() & CPUID_BHYVE_FEAT_EXT_DEST_ID); +} + +static bool __init bhyve_x2apic_available(void) +{ + return true; +} + +const struct hypervisor_x86 x86_hyper_bhyve __refconst = { + .name = "Bhyve", + .detect = bhyve_detect, + .init.init_platform = x86_init_noop, + .init.x2apic_available = bhyve_x2apic_available, + .init.msi_ext_dest_id = bhyve_ext_dest_id, +}; diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 36dcfc5105be..6a526ae1fe99 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -434,6 +434,9 @@ static bool __init should_mitigate_vuln(unsigned int bug) case X86_BUG_SPEC_STORE_BYPASS: return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER); + case X86_BUG_VMSCAPE: + return cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST); + default: WARN(1, "Unknown bug %x\n", bug); return false; @@ -684,8 +687,7 @@ static const char * const mmio_strings[] = { static void __init mmio_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || - cpu_mitigations_off()) { + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { mmio_mitigation = MMIO_MITIGATION_OFF; return; } @@ -1460,8 +1462,10 @@ static void __init retbleed_update_mitigation(void) retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; break; default: - if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) { pr_err(RETBLEED_INTEL_MSG); + retbleed_mitigation = RETBLEED_MITIGATION_NONE; + } } } @@ -1842,9 +1846,10 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_IBRS, }; -static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init = SPECTRE_V2_CMD_AUTO; +static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE; -enum spectre_v2_user_cmd { +enum spectre_v2_user_mitigation_cmd { SPECTRE_V2_USER_CMD_NONE, SPECTRE_V2_USER_CMD_AUTO, SPECTRE_V2_USER_CMD_FORCE, @@ -1854,6 +1859,9 @@ enum spectre_v2_user_cmd { SPECTRE_V2_USER_CMD_SECCOMP_IBPB, }; +static enum spectre_v2_user_mitigation_cmd spectre_v2_user_cmd __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? 
SPECTRE_V2_USER_CMD_AUTO : SPECTRE_V2_USER_CMD_NONE; + static const char * const spectre_v2_user_strings[] = { [SPECTRE_V2_USER_NONE] = "User space: Vulnerable", [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection", @@ -1862,50 +1870,31 @@ static const char * const spectre_v2_user_strings[] = { [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl", }; -static const struct { - const char *option; - enum spectre_v2_user_cmd cmd; - bool secure; -} v2_user_options[] __initconst = { - { "auto", SPECTRE_V2_USER_CMD_AUTO, false }, - { "off", SPECTRE_V2_USER_CMD_NONE, false }, - { "on", SPECTRE_V2_USER_CMD_FORCE, true }, - { "prctl", SPECTRE_V2_USER_CMD_PRCTL, false }, - { "prctl,ibpb", SPECTRE_V2_USER_CMD_PRCTL_IBPB, false }, - { "seccomp", SPECTRE_V2_USER_CMD_SECCOMP, false }, - { "seccomp,ibpb", SPECTRE_V2_USER_CMD_SECCOMP_IBPB, false }, -}; - -static void __init spec_v2_user_print_cond(const char *reason, bool secure) -{ - if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) - pr_info("spectre_v2_user=%s forced on command line.\n", reason); -} - -static enum spectre_v2_user_cmd __init spectre_v2_parse_user_cmdline(void) +static int __init spectre_v2_user_parse_cmdline(char *str) { - char arg[20]; - int ret, i; - - if (!IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2)) - return SPECTRE_V2_USER_CMD_NONE; - - ret = cmdline_find_option(boot_command_line, "spectre_v2_user", - arg, sizeof(arg)); - if (ret < 0) - return SPECTRE_V2_USER_CMD_AUTO; + if (!str) + return -EINVAL; - for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) { - if (match_option(arg, ret, v2_user_options[i].option)) { - spec_v2_user_print_cond(v2_user_options[i].option, - v2_user_options[i].secure); - return v2_user_options[i].cmd; - } - } + if (!strcmp(str, "auto")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_AUTO; + else if (!strcmp(str, "off")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_NONE; + else if (!strcmp(str, "on")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_FORCE; + else if (!strcmp(str, "prctl")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_PRCTL; + else if (!strcmp(str, "prctl,ibpb")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_PRCTL_IBPB; + else if (!strcmp(str, "seccomp")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_SECCOMP; + else if (!strcmp(str, "seccomp,ibpb")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_SECCOMP_IBPB; + else + pr_err("Ignoring unknown spectre_v2_user option (%s).", str); - pr_err("Unknown user space protection option (%s). 
Switching to default\n", arg); - return SPECTRE_V2_USER_CMD_AUTO; + return 0; } +early_param("spectre_v2_user", spectre_v2_user_parse_cmdline); static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) { @@ -1917,7 +1906,7 @@ static void __init spectre_v2_user_select_mitigation(void) if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) return; - switch (spectre_v2_parse_user_cmdline()) { + switch (spectre_v2_user_cmd) { case SPECTRE_V2_USER_CMD_NONE: return; case SPECTRE_V2_USER_CMD_FORCE: @@ -2045,119 +2034,61 @@ static void __init spectre_v2_user_apply_mitigation(void) static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", - [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", + [SPECTRE_V2_LFENCE] = "Vulnerable: LFENCE", [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS", [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE", [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines", [SPECTRE_V2_IBRS] = "Mitigation: IBRS", }; -static const struct { - const char *option; - enum spectre_v2_mitigation_cmd cmd; - bool secure; -} mitigation_options[] __initconst = { - { "off", SPECTRE_V2_CMD_NONE, false }, - { "on", SPECTRE_V2_CMD_FORCE, true }, - { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, - { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, - { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, - { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, - { "eibrs", SPECTRE_V2_CMD_EIBRS, false }, - { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false }, - { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false }, - { "auto", SPECTRE_V2_CMD_AUTO, false }, - { "ibrs", SPECTRE_V2_CMD_IBRS, false }, -}; +static bool nospectre_v2 __ro_after_init; -static void __init spec_v2_print_cond(const char *reason, bool secure) +static int __init nospectre_v2_parse_cmdline(char *str) { - if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) - pr_info("%s selected on command line.\n", reason); + nospectre_v2 = true; + spectre_v2_cmd = SPECTRE_V2_CMD_NONE; + return 0; } +early_param("nospectre_v2", nospectre_v2_parse_cmdline); -static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) +static int __init spectre_v2_parse_cmdline(char *str) { - enum spectre_v2_mitigation_cmd cmd; - char arg[20]; - int ret, i; - - cmd = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE; - if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) - return SPECTRE_V2_CMD_NONE; - - ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); - if (ret < 0) - return cmd; - - for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { - if (!match_option(arg, ret, mitigation_options[i].option)) - continue; - cmd = mitigation_options[i].cmd; - break; - } - - if (i >= ARRAY_SIZE(mitigation_options)) { - pr_err("unknown option (%s). Switching to default mode\n", arg); - return cmd; - } - - if ((cmd == SPECTRE_V2_CMD_RETPOLINE || - cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || - cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || - cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || - cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && - !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) { - pr_err("%s selected but not compiled in. 
Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } - - if ((cmd == SPECTRE_V2_CMD_EIBRS || - cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || - cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && - !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { - pr_err("%s selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } - - if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || - cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) && - !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { - pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } - - if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) { - pr_err("%s selected but not compiled in. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } - - if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { - pr_err("%s selected but not Intel CPU. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } + if (!str) + return -EINVAL; - if (cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) { - pr_err("%s selected but CPU doesn't have IBRS. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } + if (nospectre_v2) + return 0; - if (cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) { - pr_err("%s selected but running as XenPV guest. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; + if (!strcmp(str, "off")) { + spectre_v2_cmd = SPECTRE_V2_CMD_NONE; + } else if (!strcmp(str, "on")) { + spectre_v2_cmd = SPECTRE_V2_CMD_FORCE; + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2_USER); + } else if (!strcmp(str, "retpoline")) { + spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE; + } else if (!strcmp(str, "retpoline,amd") || + !strcmp(str, "retpoline,lfence")) { + spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE_LFENCE; + } else if (!strcmp(str, "retpoline,generic")) { + spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE_GENERIC; + } else if (!strcmp(str, "eibrs")) { + spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS; + } else if (!strcmp(str, "eibrs,lfence")) { + spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS_LFENCE; + } else if (!strcmp(str, "eibrs,retpoline")) { + spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS_RETPOLINE; + } else if (!strcmp(str, "auto")) { + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } else if (!strcmp(str, "ibrs")) { + spectre_v2_cmd = SPECTRE_V2_CMD_IBRS; + } else { + pr_err("Ignoring unknown spectre_v2 option (%s).", str); } - spec_v2_print_cond(mitigation_options[i].option, - mitigation_options[i].secure); - return cmd; + return 0; } +early_param("spectre_v2", spectre_v2_parse_cmdline); static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) { @@ -2306,10 +2237,6 @@ static void __init bhi_update_mitigation(void) { if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE) bhi_mitigation = BHI_MITIGATION_OFF; - - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && - spectre_v2_cmd == SPECTRE_V2_CMD_AUTO) - bhi_mitigation = BHI_MITIGATION_OFF; } static void __init bhi_apply_mitigation(void) @@ -2345,11 +2272,55 @@ static void __init bhi_apply_mitigation(void) static void __init spectre_v2_select_mitigation(void) { - spectre_v2_cmd = spectre_v2_parse_cmdline(); + if ((spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE || + spectre_v2_cmd == 
SPECTRE_V2_CMD_RETPOLINE_LFENCE || + spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && + !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) { + pr_err("RETPOLINE selected but not compiled in. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if ((spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { + pr_err("EIBRS selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if ((spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) && + !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("LFENCE selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) { + pr_err("IBRS selected but not compiled in. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + pr_err("IBRS selected but not Intel CPU. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) { + pr_err("IBRS selected but CPU doesn't have IBRS. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) { + pr_err("IBRS selected but running as XenPV guest. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && - (spectre_v2_cmd == SPECTRE_V2_CMD_NONE || spectre_v2_cmd == SPECTRE_V2_CMD_AUTO)) + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) { + spectre_v2_cmd = SPECTRE_V2_CMD_NONE; return; + } switch (spectre_v2_cmd) { case SPECTRE_V2_CMD_NONE: @@ -2555,16 +2526,8 @@ static void update_mds_branch_idle(void) #undef pr_fmt #define pr_fmt(fmt) "Speculative Store Bypass: " fmt -static enum ssb_mitigation ssb_mode __ro_after_init = SPEC_STORE_BYPASS_NONE; - -/* The kernel command line selection */ -enum ssb_mitigation_cmd { - SPEC_STORE_BYPASS_CMD_NONE, - SPEC_STORE_BYPASS_CMD_AUTO, - SPEC_STORE_BYPASS_CMD_ON, - SPEC_STORE_BYPASS_CMD_PRCTL, - SPEC_STORE_BYPASS_CMD_SECCOMP, -}; +static enum ssb_mitigation ssb_mode __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SSB) ? 
SPEC_STORE_BYPASS_AUTO : SPEC_STORE_BYPASS_NONE; static const char * const ssb_strings[] = { [SPEC_STORE_BYPASS_NONE] = "Vulnerable", @@ -2573,94 +2536,61 @@ static const char * const ssb_strings[] = { [SPEC_STORE_BYPASS_SECCOMP] = "Mitigation: Speculative Store Bypass disabled via prctl and seccomp", }; -static const struct { - const char *option; - enum ssb_mitigation_cmd cmd; -} ssb_mitigation_options[] __initconst = { - { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ - { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ - { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ - { "prctl", SPEC_STORE_BYPASS_CMD_PRCTL }, /* Disable Speculative Store Bypass via prctl */ - { "seccomp", SPEC_STORE_BYPASS_CMD_SECCOMP }, /* Disable Speculative Store Bypass via prctl and seccomp */ -}; +static bool nossb __ro_after_init; -static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) +static int __init nossb_parse_cmdline(char *str) { - enum ssb_mitigation_cmd cmd; - char arg[20]; - int ret, i; - - cmd = IS_ENABLED(CONFIG_MITIGATION_SSB) ? - SPEC_STORE_BYPASS_CMD_AUTO : SPEC_STORE_BYPASS_CMD_NONE; - if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") || - cpu_mitigations_off()) { - return SPEC_STORE_BYPASS_CMD_NONE; - } else { - ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", - arg, sizeof(arg)); - if (ret < 0) - return cmd; + nossb = true; + ssb_mode = SPEC_STORE_BYPASS_NONE; + return 0; +} +early_param("nospec_store_bypass_disable", nossb_parse_cmdline); - for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) { - if (!match_option(arg, ret, ssb_mitigation_options[i].option)) - continue; +static int __init ssb_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; - cmd = ssb_mitigation_options[i].cmd; - break; - } + if (nossb) + return 0; - if (i >= ARRAY_SIZE(ssb_mitigation_options)) { - pr_err("unknown option (%s). Switching to default mode\n", arg); - return cmd; - } - } + if (!strcmp(str, "auto")) + ssb_mode = SPEC_STORE_BYPASS_AUTO; + else if (!strcmp(str, "on")) + ssb_mode = SPEC_STORE_BYPASS_DISABLE; + else if (!strcmp(str, "off")) + ssb_mode = SPEC_STORE_BYPASS_NONE; + else if (!strcmp(str, "prctl")) + ssb_mode = SPEC_STORE_BYPASS_PRCTL; + else if (!strcmp(str, "seccomp")) + ssb_mode = IS_ENABLED(CONFIG_SECCOMP) ? + SPEC_STORE_BYPASS_SECCOMP : SPEC_STORE_BYPASS_PRCTL; + else + pr_err("Ignoring unknown spec_store_bypass_disable option (%s).\n", + str); - return cmd; + return 0; } +early_param("spec_store_bypass_disable", ssb_parse_cmdline); static void __init ssb_select_mitigation(void) { - enum ssb_mitigation_cmd cmd; - - if (!boot_cpu_has(X86_FEATURE_SSBD)) - goto out; - - cmd = ssb_parse_cmdline(); - if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) && - (cmd == SPEC_STORE_BYPASS_CMD_NONE || - cmd == SPEC_STORE_BYPASS_CMD_AUTO)) + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) { + ssb_mode = SPEC_STORE_BYPASS_NONE; return; + } - switch (cmd) { - case SPEC_STORE_BYPASS_CMD_SECCOMP: - /* - * Choose prctl+seccomp as the default mode if seccomp is - * enabled. 
- */ - if (IS_ENABLED(CONFIG_SECCOMP)) - ssb_mode = SPEC_STORE_BYPASS_SECCOMP; - else - ssb_mode = SPEC_STORE_BYPASS_PRCTL; - break; - case SPEC_STORE_BYPASS_CMD_ON: - ssb_mode = SPEC_STORE_BYPASS_DISABLE; - break; - case SPEC_STORE_BYPASS_CMD_AUTO: + if (ssb_mode == SPEC_STORE_BYPASS_AUTO) { if (should_mitigate_vuln(X86_BUG_SPEC_STORE_BYPASS)) ssb_mode = SPEC_STORE_BYPASS_PRCTL; else ssb_mode = SPEC_STORE_BYPASS_NONE; - break; - case SPEC_STORE_BYPASS_CMD_PRCTL: - ssb_mode = SPEC_STORE_BYPASS_PRCTL; - break; - case SPEC_STORE_BYPASS_CMD_NONE: - break; } -out: - if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) - pr_info("%s\n", ssb_strings[ssb_mode]); + if (!boot_cpu_has(X86_FEATURE_SSBD)) + ssb_mode = SPEC_STORE_BYPASS_NONE; + + pr_info("%s\n", ssb_strings[ssb_mode]); } static void __init ssb_apply_mitigation(void) @@ -2876,6 +2806,7 @@ static int ssb_prctl_get(struct task_struct *task) return PR_SPEC_DISABLE; case SPEC_STORE_BYPASS_SECCOMP: case SPEC_STORE_BYPASS_PRCTL: + case SPEC_STORE_BYPASS_AUTO: if (task_spec_ssb_force_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; if (task_spec_ssb_noexec(task)) @@ -3195,14 +3126,15 @@ ibpb_on_vmexit: static void __init srso_update_mitigation(void) { + if (!boot_cpu_has_bug(X86_BUG_SRSO)) + return; + /* If retbleed is using IBPB, that works for SRSO as well */ if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB && boot_cpu_has(X86_FEATURE_IBPB_BRTYPE)) srso_mitigation = SRSO_MITIGATION_IBPB; - if (boot_cpu_has_bug(X86_BUG_SRSO) && - !cpu_mitigations_off()) - pr_info("%s\n", srso_strings[srso_mitigation]); + pr_info("%s\n", srso_strings[srso_mitigation]); } static void __init srso_apply_mitigation(void) @@ -3304,15 +3236,18 @@ early_param("vmscape", vmscape_parse_cmdline); static void __init vmscape_select_mitigation(void) { - if (cpu_mitigations_off() || - !boot_cpu_has_bug(X86_BUG_VMSCAPE) || + if (!boot_cpu_has_bug(X86_BUG_VMSCAPE) || !boot_cpu_has(X86_FEATURE_IBPB)) { vmscape_mitigation = VMSCAPE_MITIGATION_NONE; return; } - if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) - vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; + if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) { + if (should_mitigate_vuln(X86_BUG_VMSCAPE)) + vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; + else + vmscape_mitigation = VMSCAPE_MITIGATION_NONE; + } } static void __init vmscape_update_mitigation(void) @@ -3626,9 +3561,6 @@ static const char *spectre_bhi_state(void) static ssize_t spectre_v2_show_state(char *buf) { - if (spectre_v2_enabled == SPECTRE_V2_LFENCE) - return sysfs_emit(buf, "Vulnerable: LFENCE\n"); - if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index adfa7e8bb865..51a95b07831f 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -290,6 +290,22 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) } /* + * The max shared threads number comes from CPUID(0x4) EAX[25-14] with input + * ECX as cache index. Then right shift apicid by the number's order to get + * cache id for this cache node. 
+ */ +static unsigned int get_cache_id(u32 apicid, const struct _cpuid4_info *id4) +{ + unsigned long num_threads_sharing; + int index_msb; + + num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + + return apicid >> index_msb; +} + +/* * AMD/Hygon CPUs may have multiple LLCs if L3 caches exist. */ @@ -312,18 +328,11 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id) * Newer families: LLC ID is calculated from the number * of threads sharing the L3 cache. */ - u32 eax, ebx, ecx, edx, num_sharing_cache = 0; u32 llc_index = find_num_cache_leaves(c) - 1; + struct _cpuid4_info id4 = {}; - cpuid_count(0x8000001d, llc_index, &eax, &ebx, &ecx, &edx); - if (eax) - num_sharing_cache = ((eax >> 14) & 0xfff) + 1; - - if (num_sharing_cache) { - int index_msb = get_count_order(num_sharing_cache); - - c->topo.llc_id = c->topo.apicid >> index_msb; - } + if (!amd_fill_cpuid4_info(llc_index, &id4)) + c->topo.llc_id = get_cache_id(c->topo.apicid, &id4); } } @@ -598,27 +607,12 @@ int init_cache_level(unsigned int cpu) return 0; } -/* - * The max shared threads number comes from CPUID(0x4) EAX[25-14] with input - * ECX as cache index. Then right shift apicid by the number's order to get - * cache id for this cache node. - */ -static void get_cache_id(int cpu, struct _cpuid4_info *id4) -{ - struct cpuinfo_x86 *c = &cpu_data(cpu); - unsigned long num_threads_sharing; - int index_msb; - - num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); - id4->id = c->topo.apicid >> index_msb; -} - int populate_cache_leaves(unsigned int cpu) { struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); struct cacheinfo *ci = this_cpu_ci->info_list; u8 cpu_vendor = boot_cpu_data.x86_vendor; + u32 apicid = cpu_data(cpu).topo.apicid; struct amd_northbridge *nb = NULL; struct _cpuid4_info id4 = {}; int idx, ret; @@ -628,7 +622,7 @@ int populate_cache_leaves(unsigned int cpu) if (ret) return ret; - get_cache_id(cpu, &id4); + id4.id = get_cache_id(apicid, &id4); if (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON) nb = amd_init_l3_cache(idx); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f98ec9c7fc07..c7d3512914ca 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1808,6 +1808,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_clear_cpu_cap(X86_FEATURE_LA57); detect_nopl(); + mca_bsp_init(c); } void __init init_cpu_devs(void) diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 553bfbfc3a1b..f3e9219845e8 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -45,6 +45,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = #ifdef CONFIG_ACRN_GUEST &x86_hyper_acrn, #endif +#ifdef CONFIG_BHYVE_GUEST + &x86_hyper_bhyve, +#endif }; enum x86_hypervisor_type x86_hyper_type; diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 5c4eb28c3ac9..d6906442f49b 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -241,7 +241,8 @@ struct threshold_block { struct threshold_bank { struct kobject *kobj; - struct threshold_block *blocks; + /* List of threshold blocks within this MCA bank. 
*/ + struct list_head miscj; }; static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); @@ -252,9 +253,6 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); */ static DEFINE_PER_CPU(u64, bank_map); -/* Map of banks that have more than MCA_MISC0 available. */ -static DEFINE_PER_CPU(u64, smca_misc_banks_map); - static void amd_threshold_interrupt(void); static void amd_deferred_error_interrupt(void); @@ -264,28 +262,6 @@ static void default_deferred_error_interrupt(void) } void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; -static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu) -{ - u32 low, high; - - /* - * For SMCA enabled processors, BLKPTR field of the first MISC register - * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4). - */ - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) - return; - - if (!(low & MCI_CONFIG_MCAX)) - return; - - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high)) - return; - - if (low & MASK_BLKPTR_LO) - per_cpu(smca_misc_banks_map, cpu) |= BIT_ULL(bank); - -} - static void smca_configure(unsigned int bank, unsigned int cpu) { u8 *bank_counts = this_cpu_ptr(smca_bank_counts); @@ -326,8 +302,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu) wrmsr(smca_config, low, high); } - smca_set_misc_banks_map(bank, cpu); - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { pr_warn("Failed to read MCA_IPID for bank %d\n", bank); return; @@ -419,8 +393,8 @@ static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) return true; }; -/* Reprogram MCx_MISC MSR behind this threshold bank. */ -static void threshold_restart_bank(void *_tr) +/* Reprogram MCx_MISC MSR behind this threshold block. 
*/ +static void threshold_restart_block(void *_tr) { struct thresh_restart *tr = _tr; u32 hi, lo; @@ -478,7 +452,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset) }; b->threshold_limit = THRESHOLD_MAX; - threshold_restart_bank(&tr); + threshold_restart_block(&tr); }; static int setup_APIC_mce_threshold(int reserved, int new) @@ -525,18 +499,6 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } -static u32 smca_get_block_address(unsigned int bank, unsigned int block, - unsigned int cpu) -{ - if (!block) - return MSR_AMD64_SMCA_MCx_MISC(bank); - - if (!(per_cpu(smca_misc_banks_map, cpu) & BIT_ULL(bank))) - return 0; - - return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); -} - static u32 get_block_address(u32 current_addr, u32 low, u32 high, unsigned int bank, unsigned int block, unsigned int cpu) @@ -546,8 +508,15 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high, if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS)) return addr; - if (mce_flags.smca) - return smca_get_block_address(bank, block, cpu); + if (mce_flags.smca) { + if (!block) + return MSR_AMD64_SMCA_MCx_MISC(bank); + + if (!(low & MASK_BLKPTR_LO)) + return 0; + + return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); + } /* Fall back to method we used for older processors: */ switch (block) { @@ -677,6 +646,28 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) wrmsrq(MSR_K7_HWCR, hwcr); } +static void amd_apply_cpu_quirks(struct cpuinfo_x86 *c) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + + /* This should be disabled by the BIOS, but isn't always */ + if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { + /* + * disable GART TBL walk error reporting, which + * trips off incorrectly with the IOMMU & 3ware + * & Cerberus: + */ + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); + } + + /* + * Various K7s with broken bank 0 around. Always disable + * by default. + */ + if (c->x86 == 6 && this_cpu_read(mce_num_banks)) + mce_banks[0].ctl = 0; +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { @@ -684,6 +675,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) u32 low = 0, high = 0, address = 0; int offset = -1; + amd_apply_cpu_quirks(c); + + mce_flags.amd_threshold = 1; for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { if (mce_flags.smca) @@ -714,6 +708,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) deferred_error_interrupt_enable(c); } +void smca_bsp_init(void) +{ + mce_threshold_vector = amd_threshold_interrupt; + deferred_error_int_vector = amd_deferred_error_interrupt; +} + /* * DRAM ECC errors are reported in the Northbridge (bank 4) with * Extended Error Code 8. @@ -921,7 +921,7 @@ static void log_and_reset_block(struct threshold_block *block) /* Reset threshold block after logging error. 
*/ memset(&tr, 0, sizeof(tr)); tr.b = block; - threshold_restart_bank(&tr); + threshold_restart_block(&tr); } /* @@ -930,9 +930,9 @@ static void log_and_reset_block(struct threshold_block *block) */ static void amd_threshold_interrupt(void) { - struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; - struct threshold_bank **bp = this_cpu_read(threshold_banks); + struct threshold_bank **bp = this_cpu_read(threshold_banks), *thr_bank; unsigned int bank, cpu = smp_processor_id(); + struct threshold_block *block, *tmp; /* * Validate that the threshold bank has been initialized already. The @@ -946,20 +946,20 @@ static void amd_threshold_interrupt(void) if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank))) continue; - first_block = bp[bank]->blocks; - if (!first_block) + thr_bank = bp[bank]; + if (!thr_bank) continue; - /* - * The first block is also the head of the list. Check it first - * before iterating over the rest. - */ - log_and_reset_block(first_block); - list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj) + list_for_each_entry_safe(block, tmp, &thr_bank->miscj, miscj) log_and_reset_block(block); } } +void amd_clear_bank(struct mce *m) +{ + mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0); +} + /* * Sysfs Interface */ @@ -995,7 +995,7 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) memset(&tr, 0, sizeof(tr)); tr.b = b; - if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + if (smp_call_function_single(b->cpu, threshold_restart_block, &tr, 1)) return -ENODEV; return size; @@ -1020,7 +1020,7 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) b->threshold_limit = new; tr.b = b; - if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + if (smp_call_function_single(b->cpu, threshold_restart_block, &tr, 1)) return -ENODEV; return size; @@ -1181,13 +1181,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb default_attrs[2] = NULL; } - INIT_LIST_HEAD(&b->miscj); - - /* This is safe as @tb is not visible yet */ - if (tb->blocks) - list_add(&b->miscj, &tb->blocks->miscj); - else - tb->blocks = b; + list_add(&b->miscj, &tb->miscj); err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b)); if (err) @@ -1238,6 +1232,8 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, goto out_free; } + INIT_LIST_HEAD(&b->miscj); + err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC)); if (err) goto out_kobj; @@ -1258,26 +1254,15 @@ static void threshold_block_release(struct kobject *kobj) kfree(to_block(kobj)); } -static void deallocate_threshold_blocks(struct threshold_bank *bank) +static void threshold_remove_bank(struct threshold_bank *bank) { struct threshold_block *pos, *tmp; - list_for_each_entry_safe(pos, tmp, &bank->blocks->miscj, miscj) { + list_for_each_entry_safe(pos, tmp, &bank->miscj, miscj) { list_del(&pos->miscj); kobject_put(&pos->kobj); } - kobject_put(&bank->blocks->kobj); -} - -static void threshold_remove_bank(struct threshold_bank *bank) -{ - if (!bank->blocks) - goto out_free; - - deallocate_threshold_blocks(bank); - -out_free: kobject_put(bank->kobj); kfree(bank); } @@ -1296,12 +1281,12 @@ static void __threshold_remove_device(struct threshold_bank **bp) kfree(bp); } -int mce_threshold_remove_device(unsigned int cpu) +void mce_threshold_remove_device(unsigned int cpu) { struct threshold_bank **bp = this_cpu_read(threshold_banks); if (!bp) - return 0; + 
return; /* * Clear the pointer before cleaning up, so that the interrupt won't @@ -1310,7 +1295,7 @@ int mce_threshold_remove_device(unsigned int cpu) this_cpu_write(threshold_banks, NULL); __threshold_remove_device(bp); - return 0; + return; } /** @@ -1324,36 +1309,34 @@ int mce_threshold_remove_device(unsigned int cpu) * thread running on @cpu. The callback is invoked on all CPUs which are * online when the callback is installed or during a real hotplug event. */ -int mce_threshold_create_device(unsigned int cpu) +void mce_threshold_create_device(unsigned int cpu) { unsigned int numbanks, bank; struct threshold_bank **bp; - int err; if (!mce_flags.amd_threshold) - return 0; + return; bp = this_cpu_read(threshold_banks); if (bp) - return 0; + return; numbanks = this_cpu_read(mce_num_banks); bp = kcalloc(numbanks, sizeof(*bp), GFP_KERNEL); if (!bp) - return -ENOMEM; + return; for (bank = 0; bank < numbanks; ++bank) { if (!(this_cpu_read(bank_map) & BIT_ULL(bank))) continue; - err = threshold_create_bank(bp, cpu, bank); - if (err) { + if (threshold_create_bank(bp, cpu, bank)) { __threshold_remove_device(bp); - return err; + return; } } this_cpu_write(threshold_banks, bp); if (thresholding_irq_en) mce_threshold_vector = amd_threshold_interrupt; - return 0; + return; } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 4da4eab56c81..460e90a1a0b1 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -423,7 +423,7 @@ noinstr u64 mce_rdmsrq(u32 msr) return EAX_EDX_VAL(val, low, high); } -static noinstr void mce_wrmsrq(u32 msr, u64 v) +noinstr void mce_wrmsrq(u32 msr, u64 v) { u32 low, high; @@ -715,6 +715,60 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i) DEFINE_PER_CPU(unsigned, mce_poll_count); /* + * Newer Intel systems that support software error + * recovery need to make additional checks. Other + * CPUs should skip over uncorrected errors, but log + * everything else. + */ +static bool ser_should_log_poll_error(struct mce *m) +{ + /* Log "not enabled" (speculative) errors */ + if (!(m->status & MCI_STATUS_EN)) + return true; + + /* + * Log UCNA (SDM: 15.6.3 "UCR Error Classification") + * UC == 1 && PCC == 0 && S == 0 + */ + if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S)) + return true; + + return false; +} + +static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err) +{ + struct mce *m = &err->m; + + /* If this entry is not valid, ignore it. */ + if (!(m->status & MCI_STATUS_VAL)) + return false; + + /* + * If we are logging everything (at CPU online) or this + * is a corrected error, then we must log it. + */ + if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC)) + return true; + + if (mca_cfg.ser) + return ser_should_log_poll_error(m); + + if (m->status & MCI_STATUS_UC) + return false; + + return true; +} + +static void clear_bank(struct mce *m) +{ + if (m->cpuvendor == X86_VENDOR_AMD) + return amd_clear_bank(m); + + mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0); +} + +/* * Poll for corrected events or events that happened before reset. * Those are just logged through /dev/mcelog. * @@ -765,51 +819,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!mca_cfg.cmci_disabled) mce_track_storm(m); - /* If this entry is not valid, ignore it */ - if (!(m->status & MCI_STATUS_VAL)) + /* Verify that the error should be logged based on hardware conditions. 
*/ + if (!should_log_poll_error(flags, &err)) continue; - /* - * If we are logging everything (at CPU online) or this - * is a corrected error, then we must log it. - */ - if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC)) - goto log_it; - - /* - * Newer Intel systems that support software error - * recovery need to make additional checks. Other - * CPUs should skip over uncorrected errors, but log - * everything else. - */ - if (!mca_cfg.ser) { - if (m->status & MCI_STATUS_UC) - continue; - goto log_it; - } - - /* Log "not enabled" (speculative) errors */ - if (!(m->status & MCI_STATUS_EN)) - goto log_it; - - /* - * Log UCNA (SDM: 15.6.3 "UCR Error Classification") - * UC == 1 && PCC == 0 && S == 0 - */ - if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S)) - goto log_it; - - /* - * Skip anything else. Presumption is that our read of this - * bank is racing with a machine check. Leave the log alone - * for do_machine_check() to deal with it. - */ - continue; - -log_it: - if (flags & MCP_DONTLOG) - goto clear_it; - mce_read_aux(&err, i); m->severity = mce_severity(m, NULL, NULL, false); /* @@ -826,10 +839,7 @@ log_it: mce_log(&err); clear_it: - /* - * Clear state for this bank. - */ - mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); + clear_bank(m); } /* @@ -1810,9 +1820,10 @@ static void __mcheck_cpu_mce_banks_init(void) struct mce_bank *b = &mce_banks[i]; /* - * Init them all, __mcheck_cpu_apply_quirks() is going to apply - * the required vendor quirks before - * __mcheck_cpu_init_clear_banks() does the final bank setup. + * Init them all by default. + * + * The required vendor quirks will be applied before + * __mcheck_cpu_init_prepare_banks() does the final bank setup. */ b->ctl = -1ULL; b->init = true; @@ -1840,69 +1851,34 @@ static void __mcheck_cpu_cap_init(void) this_cpu_write(mce_num_banks, b); __mcheck_cpu_mce_banks_init(); - - /* Use accurate RIP reporting if available. */ - if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) - mca_cfg.rip_msr = MSR_IA32_MCG_EIP; - - if (cap & MCG_SER_P) - mca_cfg.ser = 1; } static void __mcheck_cpu_init_generic(void) { - enum mcp_flags m_fl = 0; - mce_banks_t all_banks; u64 cap; - if (!mca_cfg.bootlog) - m_fl = MCP_DONTLOG; - - /* - * Log the machine checks left over from the previous reset. Log them - * only, do not start processing them. That will happen in mcheck_late_init() - * when all consumers have been registered on the notifier chain. - */ - bitmap_fill(all_banks, MAX_NR_BANKS); - machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks); - - cr4_set_bits(X86_CR4_MCE); - rdmsrq(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); } -static void __mcheck_cpu_init_clear_banks(void) +static void __mcheck_cpu_init_prepare_banks(void) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + u64 msrval; int i; - for (i = 0; i < this_cpu_read(mce_num_banks); i++) { - struct mce_bank *b = &mce_banks[i]; + /* + * Log the machine checks left over from the previous reset. Log them + * only, do not start processing them. That will happen in mcheck_late_init() + * when all consumers have been registered on the notifier chain. + */ + if (mca_cfg.bootlog) { + mce_banks_t all_banks; - if (!b->init) - continue; - wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl); - wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); + bitmap_fill(all_banks, MAX_NR_BANKS); + machine_check_poll(MCP_UC | MCP_QUEUE_LOG, &all_banks); } -} - -/* - * Do a final check to see if there are any unused/RAZ banks. 
- * - * This must be done after the banks have been initialized and any quirks have - * been applied. - * - * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs. - * Otherwise, a user who disables a bank will not be able to re-enable it - * without a system reboot. - */ -static void __mcheck_cpu_check_banks(void) -{ - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - u64 msrval; - int i; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { struct mce_bank *b = &mce_banks[i]; @@ -1910,25 +1886,16 @@ static void __mcheck_cpu_check_banks(void) if (!b->init) continue; + wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); + rdmsrq(mca_msr_reg(i, MCA_CTL), msrval); b->init = !!msrval; } } -static void apply_quirks_amd(struct cpuinfo_x86 *c) +static void amd_apply_global_quirks(struct cpuinfo_x86 *c) { - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - - /* This should be disabled by the BIOS, but isn't always */ - if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { - /* - * disable GART TBL walk error reporting, which - * trips off incorrectly with the IOMMU & 3ware - * & Cerberus: - */ - clear_bit(10, (unsigned long *)&mce_banks[4].ctl); - } - if (c->x86 < 0x11 && mca_cfg.bootlog < 0) { /* * Lots of broken BIOS around that don't clear them @@ -1938,13 +1905,6 @@ static void apply_quirks_amd(struct cpuinfo_x86 *c) } /* - * Various K7s with broken bank 0 around. Always disable - * by default. - */ - if (c->x86 == 6 && this_cpu_read(mce_num_banks)) - mce_banks[0].ctl = 0; - - /* * overflow_recov is supported for F15h Models 00h-0fh * even though we don't have a CPUID bit for it. */ @@ -1955,26 +1915,13 @@ static void apply_quirks_amd(struct cpuinfo_x86 *c) mce_flags.zen_ifu_quirk = 1; } -static void apply_quirks_intel(struct cpuinfo_x86 *c) +static void intel_apply_global_quirks(struct cpuinfo_x86 *c) { - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - /* Older CPUs (prior to family 6) don't need quirks. */ if (c->x86_vfm < INTEL_PENTIUM_PRO) return; /* - * SDM documents that on family 6 bank 0 should not be written - * because it aliases to another special BIOS controlled - * register. - * But it's not aliased anymore on model 0x1a+ - * Don't ignore bank 0 completely because there could be a - * valid event later, merely don't write CTL0. - */ - if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks)) - mce_banks[0].init = false; - - /* * All newer Intel systems support MCE broadcasting. Enable * synchronization with a one second timeout. */ @@ -1999,7 +1946,7 @@ static void apply_quirks_intel(struct cpuinfo_x86 *c) mce_flags.skx_repmov_quirk = 1; } -static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c) +static void zhaoxin_apply_global_quirks(struct cpuinfo_x86 *c) { /* * All newer Zhaoxin CPUs support MCE broadcasting. 
Enable @@ -2011,34 +1958,6 @@ static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c) } } -/* Add per CPU specific workarounds here */ -static bool __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) -{ - struct mca_config *cfg = &mca_cfg; - - switch (c->x86_vendor) { - case X86_VENDOR_UNKNOWN: - pr_info("unknown CPU type - not enabling MCE support\n"); - return false; - case X86_VENDOR_AMD: - apply_quirks_amd(c); - break; - case X86_VENDOR_INTEL: - apply_quirks_intel(c); - break; - case X86_VENDOR_ZHAOXIN: - apply_quirks_zhaoxin(c); - break; - } - - if (cfg->monarch_timeout < 0) - cfg->monarch_timeout = 0; - if (cfg->bootlog != 0) - cfg->panic_timeout = 30; - - return true; -} - static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) { if (c->x86 != 5) @@ -2060,19 +1979,6 @@ static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) return false; } -/* - * Init basic CPU features needed for early decoding of MCEs. - */ -static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) -{ - if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) { - mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); - mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); - mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); - mce_flags.amd_threshold = 1; - } -} - static void mce_centaur_feature_init(struct cpuinfo_x86 *c) { struct mca_config *cfg = &mca_cfg; @@ -2281,6 +2187,53 @@ DEFINE_IDTENTRY_RAW(exc_machine_check) } #endif +void mca_bsp_init(struct cpuinfo_x86 *c) +{ + u64 cap; + + if (!mce_available(c)) + return; + + if (c->x86_vendor == X86_VENDOR_UNKNOWN) { + mca_cfg.disabled = 1; + pr_info("unknown CPU type - not enabling MCE support\n"); + return; + } + + mce_flags.overflow_recov = cpu_feature_enabled(X86_FEATURE_OVERFLOW_RECOV); + mce_flags.succor = cpu_feature_enabled(X86_FEATURE_SUCCOR); + mce_flags.smca = cpu_feature_enabled(X86_FEATURE_SMCA); + + if (mce_flags.smca) + smca_bsp_init(); + + rdmsrq(MSR_IA32_MCG_CAP, cap); + + /* Use accurate RIP reporting if available. */ + if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) + mca_cfg.rip_msr = MSR_IA32_MCG_EIP; + + if (cap & MCG_SER_P) + mca_cfg.ser = 1; + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + amd_apply_global_quirks(c); + break; + case X86_VENDOR_INTEL: + intel_apply_global_quirks(c); + break; + case X86_VENDOR_ZHAOXIN: + zhaoxin_apply_global_quirks(c); + break; + } + + if (mca_cfg.monarch_timeout < 0) + mca_cfg.monarch_timeout = 0; + if (mca_cfg.bootlog != 0) + mca_cfg.panic_timeout = 30; +} + /* * Called for each booted CPU to set up machine checks. 
* Must be called with preempt off: @@ -2298,11 +2251,6 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_cap_init(); - if (!__mcheck_cpu_apply_quirks(c)) { - mca_cfg.disabled = 1; - return; - } - if (!mce_gen_pool_init()) { mca_cfg.disabled = 1; pr_emerg("Couldn't allocate MCE records pool!\n"); @@ -2311,12 +2259,11 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) mca_cfg.initialized = 1; - __mcheck_cpu_init_early(c); __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); - __mcheck_cpu_init_clear_banks(); - __mcheck_cpu_check_banks(); + __mcheck_cpu_init_prepare_banks(); __mcheck_cpu_setup_timer(); + cr4_set_bits(X86_CR4_MCE); } /* @@ -2483,7 +2430,8 @@ static void mce_syscore_resume(void) { __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info)); - __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_init_prepare_banks(); + cr4_set_bits(X86_CR4_MCE); } static struct syscore_ops mce_syscore_ops = { @@ -2501,8 +2449,9 @@ static void mce_cpu_restart(void *data) if (!mce_available(raw_cpu_ptr(&cpu_info))) return; __mcheck_cpu_init_generic(); - __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_init_prepare_banks(); __mcheck_cpu_init_timer(); + cr4_set_bits(X86_CR4_MCE); } /* Reinit MCEs after user configuration changes */ diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index 9b149b9c4109..4655223ba560 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -468,8 +468,26 @@ static void intel_imc_init(struct cpuinfo_x86 *c) } } +static void intel_apply_cpu_quirks(struct cpuinfo_x86 *c) +{ + /* + * SDM documents that on family 6 bank 0 should not be written + * because it aliases to another special BIOS controlled + * register. + * But it's not aliased anymore on model 0x1a+ + * Don't ignore bank 0 completely because there could be a + * valid event later, merely don't write CTL0. + * + * Older CPUs (prior to family 6) can't reach this point and already + * return early due to the check of __mcheck_cpu_ancient_init(). 
+ */ + if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks)) + this_cpu_ptr(mce_banks_array)[0].init = false; +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { + intel_apply_cpu_quirks(c); intel_init_cmci(); intel_init_lmce(); intel_imc_init(c); diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index b5ba598e54cb..b0e00ec5cc8c 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -265,8 +265,11 @@ void mce_prep_record_common(struct mce *m); void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m); #ifdef CONFIG_X86_MCE_AMD +void mce_threshold_create_device(unsigned int cpu); +void mce_threshold_remove_device(unsigned int cpu); extern bool amd_filter_mce(struct mce *m); bool amd_mce_usable_address(struct mce *m); +void amd_clear_bank(struct mce *m); /* * If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits @@ -292,10 +295,15 @@ static __always_inline void smca_extract_err_addr(struct mce *m) m->addr &= GENMASK_ULL(55, lsb); } +void smca_bsp_init(void); #else +static inline void mce_threshold_create_device(unsigned int cpu) { } +static inline void mce_threshold_remove_device(unsigned int cpu) { } static inline bool amd_filter_mce(struct mce *m) { return false; } static inline bool amd_mce_usable_address(struct mce *m) { return false; } +static inline void amd_clear_bank(struct mce *m) { } static inline void smca_extract_err_addr(struct mce *m) { } +static inline void smca_bsp_init(void) { } #endif #ifdef CONFIG_X86_ANCIENT_MCE @@ -313,6 +321,7 @@ static __always_inline void winchip_machine_check(struct pt_regs *regs) {} #endif noinstr u64 mce_rdmsrq(u32 msr); +noinstr void mce_wrmsrq(u32 msr, u64 v); static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg) { diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 514f63340880..cdce885e2fd5 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -269,15 +269,6 @@ static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsi return true; } -static u32 get_patch_level(void) -{ - u32 rev, dummy __always_unused; - - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - - return rev; -} - static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val) { union zen_patch_rev p; @@ -295,6 +286,30 @@ static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val) return c; } +static u32 get_patch_level(void) +{ + u32 rev, dummy __always_unused; + + if (IS_ENABLED(CONFIG_MICROCODE_DBG)) { + int cpu = smp_processor_id(); + + if (!microcode_rev[cpu]) { + if (!base_rev) + base_rev = cpuid_to_ucode_rev(bsp_cpuid_1_eax); + + microcode_rev[cpu] = base_rev; + + ucode_dbg("CPU%d, base_rev: 0x%x\n", cpu, base_rev); + } + + return microcode_rev[cpu]; + } + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + + return rev; +} + static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig) { unsigned int i; @@ -324,13 +339,13 @@ static bool verify_container(const u8 *buf, size_t buf_size) u32 cont_magic; if (buf_size <= CONTAINER_HDR_SZ) { - pr_debug("Truncated microcode container header.\n"); + ucode_dbg("Truncated microcode container header.\n"); return false; } cont_magic = *(const u32 *)buf; if (cont_magic != UCODE_MAGIC) { - pr_debug("Invalid magic value (0x%08x).\n", cont_magic); + ucode_dbg("Invalid magic value (0x%08x).\n", cont_magic); return false; } @@ -355,8 +370,8 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size) 
cont_type = hdr[1]; if (cont_type != UCODE_EQUIV_CPU_TABLE_TYPE) { - pr_debug("Wrong microcode container equivalence table type: %u.\n", - cont_type); + ucode_dbg("Wrong microcode container equivalence table type: %u.\n", + cont_type); return false; } @@ -365,7 +380,7 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size) equiv_tbl_len = hdr[2]; if (equiv_tbl_len < sizeof(struct equiv_cpu_entry) || buf_size < equiv_tbl_len) { - pr_debug("Truncated equivalence table.\n"); + ucode_dbg("Truncated equivalence table.\n"); return false; } @@ -385,7 +400,7 @@ static bool __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize const u32 *hdr; if (buf_size < SECTION_HDR_SIZE) { - pr_debug("Truncated patch section.\n"); + ucode_dbg("Truncated patch section.\n"); return false; } @@ -394,13 +409,13 @@ static bool __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize p_size = hdr[1]; if (p_type != UCODE_UCODE_TYPE) { - pr_debug("Invalid type field (0x%x) in container file section header.\n", - p_type); + ucode_dbg("Invalid type field (0x%x) in container file section header.\n", + p_type); return false; } if (p_size < sizeof(struct microcode_header_amd)) { - pr_debug("Patch of size %u too short.\n", p_size); + ucode_dbg("Patch of size %u too short.\n", p_size); return false; } @@ -477,12 +492,12 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size) * size sh_psize, as the section claims. */ if (buf_size < sh_psize) { - pr_debug("Patch of size %u truncated.\n", sh_psize); + ucode_dbg("Patch of size %u truncated.\n", sh_psize); return -1; } if (!__verify_patch_size(sh_psize, buf_size)) { - pr_debug("Per-family patch size mismatch.\n"); + ucode_dbg("Per-family patch size mismatch.\n"); return -1; } @@ -496,6 +511,9 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size) proc_id = mc_hdr->processor_rev_id; patch_fam = 0xf + (proc_id >> 12); + + ucode_dbg("Patch-ID 0x%08x: family: 0x%x\n", mc_hdr->patch_id, patch_fam); + if (patch_fam != family) return 1; @@ -566,9 +584,14 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) } mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE); + + ucode_dbg("patch_id: 0x%x\n", mc->hdr.patch_id); + if (mc_patch_matches(mc, eq_id)) { desc->psize = patch_size; desc->mc = mc; + + ucode_dbg(" match: size: %d\n", patch_size); } skip: @@ -639,8 +662,14 @@ static bool __apply_microcode_amd(struct microcode_amd *mc, u32 *cur_rev, invlpg(p_addr_end); } + if (IS_ENABLED(CONFIG_MICROCODE_DBG)) + microcode_rev[smp_processor_id()] = mc->hdr.patch_id; + /* verify patch application was successful */ *cur_rev = get_patch_level(); + + ucode_dbg("updated rev: 0x%x\n", *cur_rev); + if (*cur_rev != mc->hdr.patch_id) return false; @@ -1026,7 +1055,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, patch->patch_id = mc_hdr->patch_id; patch->equiv_cpu = proc_id; - pr_debug("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n", + ucode_dbg("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n", __func__, patch->patch_id, proc_id); /* ... and add to cache. 
*/ @@ -1169,7 +1198,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device) snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); if (request_firmware_direct(&fw, (const char *)fw_name, device)) { - pr_debug("failed to load file %s\n", fw_name); + ucode_dbg("failed to load file %s\n", fw_name); goto out; } diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index b92e09a87c69..f75c140906d0 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -43,10 +43,19 @@ #include "internal.h" static struct microcode_ops *microcode_ops; -static bool dis_ucode_ldr = false; +static bool dis_ucode_ldr; bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV); -module_param(force_minrev, bool, S_IRUSR | S_IWUSR); + +/* + * Those below should be behind CONFIG_MICROCODE_DBG ifdeffery but in + * order to not uglify the code with ifdeffery and use IS_ENABLED() + * instead, leave them in. When microcode debugging is not enabled, + * those are meaningless anyway. + */ +/* base microcode revision for debugging */ +u32 base_rev; +u32 microcode_rev[NR_CPUS] = {}; /* * Synchronization. @@ -119,20 +128,48 @@ bool __init microcode_loader_disabled(void) * overwritten. */ if (!cpuid_feature() || - native_cpuid_ecx(1) & BIT(31) || + ((native_cpuid_ecx(1) & BIT(31)) && + !IS_ENABLED(CONFIG_MICROCODE_DBG)) || amd_check_current_patch_level()) dis_ucode_ldr = true; return dis_ucode_ldr; } +static void early_parse_cmdline(void) +{ + char cmd_buf[64] = {}; + char *s, *p = cmd_buf; + + if (cmdline_find_option(boot_command_line, "microcode", cmd_buf, sizeof(cmd_buf)) > 0) { + while ((s = strsep(&p, ","))) { + if (IS_ENABLED(CONFIG_MICROCODE_DBG)) { + if (strstr(s, "base_rev=")) { + /* advance to the option arg */ + strsep(&s, "="); + if (kstrtouint(s, 16, &base_rev)) { ; } + } + } + + if (!strcmp("force_minrev", s)) + force_minrev = true; + + if (!strcmp(s, "dis_ucode_ldr")) + dis_ucode_ldr = true; + } + } + + /* old, compat option */ + if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0) + dis_ucode_ldr = true; +} + void __init load_ucode_bsp(void) { unsigned int cpuid_1_eax; bool intel = true; - if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0) - dis_ucode_ldr = true; + early_parse_cmdline(); if (microcode_loader_disabled()) return; diff --git a/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h index cb6e601701ab..2d48e6593540 100644 --- a/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h +++ b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h @@ -67,9 +67,8 @@ { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0008, .driver_data = 0x1000191 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0010, .driver_data = 0x2007006 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0020, .driver_data = 0x3000010 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0040, .driver_data = 0x4003605 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0080, .driver_data = 0x5003707 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 
0x0800, .driver_data = 0x7002904 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0080, .driver_data = 0x5003901 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0800, .driver_data = 0x7002b01 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0004, .driver_data = 0x1c }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0008, .driver_data = 0x700001c }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0010, .driver_data = 0xf00001a }, @@ -81,51 +80,62 @@ { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5f, .steppings = 0x0002, .driver_data = 0x3e }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x66, .steppings = 0x0008, .driver_data = 0x2a }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0020, .driver_data = 0xc0002f0 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0040, .driver_data = 0xd0003e7 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6c, .steppings = 0x0002, .driver_data = 0x10002b0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0040, .driver_data = 0xd000404 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6c, .steppings = 0x0002, .driver_data = 0x10002d0 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0002, .driver_data = 0x42 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0100, .driver_data = 0x24 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7e, .steppings = 0x0020, .driver_data = 0xc6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0100, .driver_data = 0x26 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7e, .steppings = 0x0020, .driver_data = 0xca }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8a, .steppings = 0x0002, .driver_data = 0x33 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0002, .driver_data = 0xb8 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0004, .driver_data = 0x38 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8d, .steppings = 0x0002, .driver_data = 0x52 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0002, .driver_data = 0xbc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0004, .driver_data = 0x3c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8d, .steppings = 0x0002, .driver_data = 0x56 }, { .flags = 
X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0200, .driver_data = 0xf6 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0400, .driver_data = 0xf6 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0800, .driver_data = 0xf6 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x1000, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0100, .driver_data = 0x2c000390 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0080, .driver_data = 0x2b000603 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0040, .driver_data = 0x2c000390 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0020, .driver_data = 0x2c000390 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0010, .driver_data = 0x2c000390 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x1000, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0010, .driver_data = 0x2c0003f7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0020, .driver_data = 0x2c0003f7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0040, .driver_data = 0x2c0003f7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0080, .driver_data = 0x2b000639 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0100, .driver_data = 0x2c0003f7 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x96, .steppings = 0x0002, .driver_data = 0x1a }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0004, .driver_data = 0x37 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0020, .driver_data = 0x37 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0004, .driver_data = 0x37 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0020, .driver_data = 0x37 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0008, .driver_data = 0x435 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0010, .driver_data = 0x435 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0004, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0020, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, 
.family = 0x6, .model = 0x9a, .steppings = 0x0008, .driver_data = 0x437 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0010, .driver_data = 0x437 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9c, .steppings = 0x0001, .driver_data = 0x24000026 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0200, .driver_data = 0xf8 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0400, .driver_data = 0xf8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0400, .driver_data = 0xfa }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0800, .driver_data = 0xf6 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x1000, .driver_data = 0xf8 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x2000, .driver_data = 0x100 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0004, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0008, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0020, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0001, .driver_data = 0xfe }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0002, .driver_data = 0xfc }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa7, .steppings = 0x0002, .driver_data = 0x62 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaa, .steppings = 0x0010, .driver_data = 0x20 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0002, .driver_data = 0x12b }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0004, .driver_data = 0x4123 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0008, .driver_data = 0x4123 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0100, .driver_data = 0x4123 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbe, .steppings = 0x0001, .driver_data = 0x1a }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0004, .driver_data = 0x21000283 }, -{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0002, .driver_data = 0x21000283 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x2000, .driver_data = 0x104 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0004, .driver_data = 0x100 }, +{ .flags = 
X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0008, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0020, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0001, .driver_data = 0x102 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0002, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa7, .steppings = 0x0002, .driver_data = 0x64 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaa, .steppings = 0x0010, .driver_data = 0x24 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xad, .steppings = 0x0002, .driver_data = 0xa0000d1 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaf, .steppings = 0x0008, .driver_data = 0x3000341 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb5, .steppings = 0x0001, .driver_data = 0xa }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0002, .driver_data = 0x12f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0010, .driver_data = 0x12f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0004, .driver_data = 0x4128 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0008, .driver_data = 0x4128 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0100, .driver_data = 0x4128 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbd, .steppings = 0x0002, .driver_data = 0x11f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbe, .steppings = 0x0001, .driver_data = 0x1d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0004, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0020, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0040, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0080, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc5, .steppings = 0x0004, .driver_data = 0x118 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc6, .steppings = 0x0004, .driver_data = 0x118 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc6, .steppings = 0x0010, .driver_data = 0x118 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xca, .steppings = 0x0004, .driver_data = 0x118 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, 
.steppings = 0x0002, .driver_data = 0x210002a9 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0004, .driver_data = 0x210002a9 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0080, .driver_data = 0x12 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0400, .driver_data = 0x15 }, { .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x01, .steppings = 0x0004, .driver_data = 0x2e }, diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h index 50a9702ae4e2..ae8dbc2b908d 100644 --- a/arch/x86/kernel/cpu/microcode/internal.h +++ b/arch/x86/kernel/cpu/microcode/internal.h @@ -44,6 +44,9 @@ struct early_load_data { extern struct early_load_data early_data; extern struct ucode_cpu_info ucode_cpu_info[]; +extern u32 microcode_rev[NR_CPUS]; +extern u32 base_rev; + struct cpio_data find_microcode_in_initrd(const char *path); #define MAX_UCODE_COUNT 128 @@ -122,4 +125,10 @@ static inline void reload_ucode_intel(void) { } static inline struct microcode_ops *init_intel_microcode(void) { return NULL; } #endif /* !CONFIG_CPU_SUP_INTEL */ +#define ucode_dbg(fmt, ...) \ +({ \ + if (IS_ENABLED(CONFIG_MICROCODE_DBG)) \ + pr_info(fmt, ##__VA_ARGS__); \ +}) + #endif /* _X86_MICROCODE_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 187d527ef73b..06ca5a30140c 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -107,7 +107,7 @@ u32 resctrl_arch_system_num_rmid_idx(void) struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; /* RMID are independent numbers for x86. 
num_rmid_idx == num_rmid */ - return r->num_rmid; + return r->mon.num_rmid; } struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) @@ -365,8 +365,10 @@ static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom) static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom) { - kfree(hw_dom->arch_mbm_total); - kfree(hw_dom->arch_mbm_local); + int idx; + + for_each_mbm_idx(idx) + kfree(hw_dom->arch_mbm_states[idx]); kfree(hw_dom); } @@ -400,25 +402,27 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain * */ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom) { - size_t tsize; - - if (resctrl_arch_is_mbm_total_enabled()) { - tsize = sizeof(*hw_dom->arch_mbm_total); - hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL); - if (!hw_dom->arch_mbm_total) - return -ENOMEM; - } - if (resctrl_arch_is_mbm_local_enabled()) { - tsize = sizeof(*hw_dom->arch_mbm_local); - hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL); - if (!hw_dom->arch_mbm_local) { - kfree(hw_dom->arch_mbm_total); - hw_dom->arch_mbm_total = NULL; - return -ENOMEM; - } + size_t tsize = sizeof(*hw_dom->arch_mbm_states[0]); + enum resctrl_event_id eventid; + int idx; + + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + hw_dom->arch_mbm_states[idx] = kcalloc(num_rmid, tsize, GFP_KERNEL); + if (!hw_dom->arch_mbm_states[idx]) + goto cleanup; } return 0; +cleanup: + for_each_mbm_idx(idx) { + kfree(hw_dom->arch_mbm_states[idx]); + hw_dom->arch_mbm_states[idx] = NULL; + } + + return -ENOMEM; } static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope) @@ -516,6 +520,9 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) d = container_of(hdr, struct rdt_mon_domain, hdr); cpumask_set_cpu(cpu, &d->hdr.cpu_mask); + /* Update the mbm_assign_mode state for the CPU if supported */ + if (r->mon.mbm_cntr_assignable) + resctrl_arch_mbm_cntr_assign_set_one(r); return; } @@ -535,9 +542,13 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) d->ci_id = ci->id; cpumask_set_cpu(cpu, &d->hdr.cpu_mask); + /* Update the mbm_assign_mode state for the CPU if supported */ + if (r->mon.mbm_cntr_assignable) + resctrl_arch_mbm_cntr_assign_set_one(r); + arch_mon_domain_online(r, d); - if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) { + if (arch_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) { mon_domain_free(hw_dom); return; } @@ -707,6 +718,7 @@ enum { RDT_FLAG_MBA, RDT_FLAG_SMBA, RDT_FLAG_BMEC, + RDT_FLAG_ABMC, }; #define RDT_OPT(idx, n, f) \ @@ -732,6 +744,7 @@ static struct rdt_options rdt_options[] __ro_after_init = { RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA), RDT_OPT(RDT_FLAG_SMBA, "smba", X86_FEATURE_SMBA), RDT_OPT(RDT_FLAG_BMEC, "bmec", X86_FEATURE_BMEC), + RDT_OPT(RDT_FLAG_ABMC, "abmc", X86_FEATURE_ABMC), }; #define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) @@ -863,15 +876,24 @@ static __init bool get_rdt_alloc_resources(void) static __init bool get_rdt_mon_resources(void) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + bool ret = false; - if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) - rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); - if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) - rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID); - if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) - rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID); + if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) { + 
resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID); + ret = true; + } + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); + ret = true; + } + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); + ret = true; + } + if (rdt_cpu_has(X86_FEATURE_ABMC)) + ret = true; - if (!rdt_mon_features) + if (!ret) return false; return !rdt_get_mon_l3_config(r); @@ -965,7 +987,7 @@ static enum cpuhp_state rdt_online; /* Runs once on the BSP during boot. */ void resctrl_cpu_detect(struct cpuinfo_x86 *c) { - if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { + if (!cpu_has(c, X86_FEATURE_CQM_LLC) && !cpu_has(c, X86_FEATURE_ABMC)) { c->x86_cache_max_rmid = -1; c->x86_cache_occ_scale = -1; c->x86_cache_mbm_width_offset = -1; @@ -977,7 +999,8 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || - cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { + cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL) || + cpu_has(c, X86_FEATURE_ABMC)) { u32 eax, ebx, ecx, edx; /* QoS sub-leaf, EAX=0Fh, ECX=1 */ diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 5e3c41b36437..9f4c2f0aaf5c 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -37,6 +37,15 @@ struct arch_mbm_state { u64 prev_msr; }; +/* Setting bit 0 in L3_QOS_EXT_CFG enables the ABMC feature. */ +#define ABMC_ENABLE_BIT 0 + +/* + * Qos Event Identifiers. + */ +#define ABMC_EXTENDED_EVT_ID BIT(31) +#define ABMC_EVT_ID BIT(0) + /** * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share * a resource for a control function @@ -54,15 +63,15 @@ struct rdt_hw_ctrl_domain { * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share * a resource for a monitor function * @d_resctrl: Properties exposed to the resctrl file system - * @arch_mbm_total: arch private state for MBM total bandwidth - * @arch_mbm_local: arch private state for MBM local bandwidth + * @arch_mbm_states: Per-event pointer to the MBM event's saved state. + * An MBM event's state is an array of struct arch_mbm_state + * indexed by RMID on x86. * * Members of this structure are accessed via helpers that provide abstraction. */ struct rdt_hw_mon_domain { struct rdt_mon_domain d_resctrl; - struct arch_mbm_state *arch_mbm_total; - struct arch_mbm_state *arch_mbm_local; + struct arch_mbm_state *arch_mbm_states[QOS_NUM_L3_MBM_EVENTS]; }; static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctrl_domain *r) @@ -102,6 +111,7 @@ struct msr_param { * @mon_scale: cqm counter * mon_scale = occupancy in bytes * @mbm_width: Monitor width, to detect and correct for overflow. * @cdp_enabled: CDP state of this resource + * @mbm_cntr_assign_enabled: ABMC feature is enabled * * Members of this structure are either private to the architecture * e.g. mbm_width, or accessed via helpers that provide abstraction. e.g. @@ -115,6 +125,7 @@ struct rdt_hw_resource { unsigned int mon_scale; unsigned int mbm_width; bool cdp_enabled; + bool mbm_cntr_assign_enabled; }; static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r) @@ -159,6 +170,42 @@ union cpuid_0x10_x_edx { unsigned int full; }; +/* + * ABMC counters are configured by writing to MSR_IA32_L3_QOS_ABMC_CFG. + * + * @bw_type : Event configuration that represents the memory + * transactions being tracked by the @cntr_id. 
+ * @bw_src : Bandwidth source (RMID or CLOSID). + * @reserved1 : Reserved. + * @is_clos : @bw_src field is a CLOSID (not an RMID). + * @cntr_id : Counter identifier. + * @reserved : Reserved. + * @cntr_en : Counting enable bit. + * @cfg_en : Configuration enable bit. + * + * Configuration and counting: + * Counter can be configured across multiple writes to MSR. Configuration + * is applied only when @cfg_en = 1. Counter @cntr_id is reset when the + * configuration is applied. + * @cfg_en = 1, @cntr_en = 0 : Apply @cntr_id configuration but do not + * count events. + * @cfg_en = 1, @cntr_en = 1 : Apply @cntr_id configuration and start + * counting events. + */ +union l3_qos_abmc_cfg { + struct { + unsigned long bw_type :32, + bw_src :12, + reserved1: 3, + is_clos : 1, + cntr_id : 5, + reserved : 9, + cntr_en : 1, + cfg_en : 1; + } split; + unsigned long full; +}; + void rdt_ctrl_update(void *arg); int rdt_get_mon_l3_config(struct rdt_resource *r); @@ -168,5 +215,6 @@ bool rdt_cpu_has(int flag); void __init intel_rdt_mbm_apply_quirk(void); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); +void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index c261558276cd..c8945610d455 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -31,11 +31,6 @@ */ bool rdt_mon_capable; -/* - * Global to indicate which monitoring events are enabled. - */ -unsigned int rdt_mon_features; - #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) static int snc_nodes_per_l3_cache = 1; @@ -135,7 +130,7 @@ static int logical_rmid_to_physical_rmid(int cpu, int lrmid) if (snc_nodes_per_l3_cache == 1) return lrmid; - return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid; + return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->mon.num_rmid; } static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val) @@ -166,18 +161,14 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_do u32 rmid, enum resctrl_event_id eventid) { - switch (eventid) { - case QOS_L3_OCCUP_EVENT_ID: - return NULL; - case QOS_L3_MBM_TOTAL_EVENT_ID: - return &hw_dom->arch_mbm_total[rmid]; - case QOS_L3_MBM_LOCAL_EVENT_ID: - return &hw_dom->arch_mbm_local[rmid]; - default: - /* Never expect to get here */ - WARN_ON_ONCE(1); + struct arch_mbm_state *state; + + if (!resctrl_is_mbm_event(eventid)) return NULL; - } + + state = hw_dom->arch_mbm_states[MBM_STATE_IDX(eventid)]; + + return state ? 
&state[rmid] : NULL; } void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, @@ -206,14 +197,16 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) { struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); - - if (resctrl_arch_is_mbm_total_enabled()) - memset(hw_dom->arch_mbm_total, 0, - sizeof(*hw_dom->arch_mbm_total) * r->num_rmid); - - if (resctrl_arch_is_mbm_local_enabled()) - memset(hw_dom->arch_mbm_local, 0, - sizeof(*hw_dom->arch_mbm_local) * r->num_rmid); + enum resctrl_event_id eventid; + int idx; + + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + memset(hw_dom->arch_mbm_states[idx], 0, + sizeof(*hw_dom->arch_mbm_states[0]) * r->mon.num_rmid); + } } static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) @@ -224,15 +217,33 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) return chunks >> shift; } +static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 rmid, enum resctrl_event_id eventid, u64 msr_val) +{ + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + struct arch_mbm_state *am; + u64 chunks; + + am = get_arch_mbm_state(hw_dom, rmid, eventid); + if (am) { + am->chunks += mbm_overflow_count(am->prev_msr, msr_val, + hw_res->mbm_width); + chunks = get_corrected_mbm_count(rmid, am->chunks); + am->prev_msr = msr_val; + } else { + chunks = msr_val; + } + + return chunks * hw_res->mon_scale; +} + int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, u32 unused, u32 rmid, enum resctrl_event_id eventid, u64 *val, void *ignored) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); int cpu = cpumask_any(&d->hdr.cpu_mask); - struct arch_mbm_state *am; - u64 msr_val, chunks; + u64 msr_val; u32 prmid; int ret; @@ -243,17 +254,76 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, if (ret) return ret; + *val = get_corrected_val(r, d, rmid, eventid, msr_val); + + return 0; +} + +static int __cntr_id_read(u32 cntr_id, u64 *val) +{ + u64 msr_val; + + /* + * QM_EVTSEL Register definition: + * ======================================================= + * Bits Mnemonic Description + * ======================================================= + * 63:44 -- Reserved + * 43:32 RMID RMID or counter ID in ABMC mode + * when reading an MBM event + * 31 ExtendedEvtID Extended Event Identifier + * 30:8 -- Reserved + * 7:0 EvtID Event Identifier + * ======================================================= + * The contents of a specific counter can be read by setting the + * following fields in QM_EVTSEL.ExtendedEvtID(=1) and + * QM_EVTSEL.EvtID = L3CacheABMC (=1) and setting QM_EVTSEL.RMID + * to the desired counter ID. Reading the QM_CTR then returns the + * contents of the specified counter. The RMID_VAL_ERROR bit is set + * if the counter configuration is invalid, or if an invalid counter + * ID is set in the QM_EVTSEL.RMID field. The RMID_VAL_UNAVAIL bit + * is set if the counter data is unavailable. 
+ */ + wrmsr(MSR_IA32_QM_EVTSEL, ABMC_EXTENDED_EVT_ID | ABMC_EVT_ID, cntr_id); + rdmsrl(MSR_IA32_QM_CTR, msr_val); + + if (msr_val & RMID_VAL_ERROR) + return -EIO; + if (msr_val & RMID_VAL_UNAVAIL) + return -EINVAL; + + *val = msr_val; + return 0; +} + +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 unused, u32 rmid, int cntr_id, + enum resctrl_event_id eventid) +{ + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct arch_mbm_state *am; + am = get_arch_mbm_state(hw_dom, rmid, eventid); if (am) { - am->chunks += mbm_overflow_count(am->prev_msr, msr_val, - hw_res->mbm_width); - chunks = get_corrected_mbm_count(rmid, am->chunks); - am->prev_msr = msr_val; - } else { - chunks = msr_val; + memset(am, 0, sizeof(*am)); + + /* Record any initial, non-zero count value. */ + __cntr_id_read(cntr_id, &am->prev_msr); } +} + +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 unused, u32 rmid, int cntr_id, + enum resctrl_event_id eventid, u64 *val) +{ + u64 msr_val; + int ret; + + ret = __cntr_id_read(cntr_id, &msr_val); + if (ret) + return ret; - *val = chunks * hw_res->mon_scale; + *val = get_corrected_val(r, d, rmid, eventid, msr_val); return 0; } @@ -346,12 +416,13 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); unsigned int threshold; + u32 eax, ebx, ecx, edx; snc_nodes_per_l3_cache = snc_get_config(); resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024; hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache; - r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; + r->mon.num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; hw_res->mbm_width = MBM_CNTR_WIDTH_BASE; if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) @@ -366,7 +437,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) * * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. */ - threshold = resctrl_rmid_realloc_limit / r->num_rmid; + threshold = resctrl_rmid_realloc_limit / r->mon.num_rmid; /* * Because num_rmid may not be a power of two, round the value @@ -375,12 +446,17 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) */ resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold); - if (rdt_cpu_has(X86_FEATURE_BMEC)) { - u32 eax, ebx, ecx, edx; - + if (rdt_cpu_has(X86_FEATURE_BMEC) || rdt_cpu_has(X86_FEATURE_ABMC)) { /* Detect list of bandwidth sources that can be tracked */ cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx); - r->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; + r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; + } + + if (rdt_cpu_has(X86_FEATURE_ABMC)) { + r->mon.mbm_cntr_assignable = true; + cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx); + r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1; + hw_res->mbm_cntr_assign_enabled = true; } r->mon_capable = true; @@ -401,3 +477,91 @@ void __init intel_rdt_mbm_apply_quirk(void) mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold; mbm_cf = mbm_cf_table[cf_index].cf; } + +static void resctrl_abmc_set_one_amd(void *arg) +{ + bool *enable = arg; + + if (*enable) + msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT); + else + msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT); +} + +/* + * ABMC enable/disable requires update of L3_QOS_EXT_CFG MSR on all the CPUs + * associated with all monitor domains. 
+ */ +static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable) +{ + struct rdt_mon_domain *d; + + lockdep_assert_cpus_held(); + + list_for_each_entry(d, &r->mon_domains, hdr.list) { + on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_abmc_set_one_amd, + &enable, 1); + resctrl_arch_reset_rmid_all(r, d); + } +} + +int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + + if (r->mon.mbm_cntr_assignable && + hw_res->mbm_cntr_assign_enabled != enable) { + _resctrl_abmc_enable(r, enable); + hw_res->mbm_cntr_assign_enabled = enable; + } + + return 0; +} + +bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) +{ + return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled; +} + +static void resctrl_abmc_config_one_amd(void *info) +{ + union l3_qos_abmc_cfg *abmc_cfg = info; + + wrmsrl(MSR_IA32_L3_QOS_ABMC_CFG, abmc_cfg->full); +} + +/* + * Send an IPI to the domain to assign the counter to RMID, event pair. + */ +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign) +{ + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + union l3_qos_abmc_cfg abmc_cfg = { 0 }; + struct arch_mbm_state *am; + + abmc_cfg.split.cfg_en = 1; + abmc_cfg.split.cntr_en = assign ? 1 : 0; + abmc_cfg.split.cntr_id = cntr_id; + abmc_cfg.split.bw_src = rmid; + if (assign) + abmc_cfg.split.bw_type = resctrl_get_mon_evt_cfg(evtid); + + smp_call_function_any(&d->hdr.cpu_mask, resctrl_abmc_config_one_amd, &abmc_cfg, 1); + + /* + * The hardware counter is reset (because cfg_en == 1) so there is no + * need to record initial non-zero counts. + */ + am = get_arch_mbm_state(hw_dom, rmid, evtid); + if (am) + memset(am, 0, sizeof(*am)); +} + +void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + + resctrl_abmc_set_one_amd(&hw_res->mbm_cntr_assign_enabled); +} diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 6b868afb26c3..4cee6213d667 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -51,6 +51,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_COHERENCY_SFW_NO, CPUID_EBX, 31, 0x8000001f, 0 }, { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, + { X86_FEATURE_ABMC, CPUID_EBX, 5, 0x80000020, 0 }, { X86_FEATURE_TSA_SQ_NO, CPUID_ECX, 1, 0x80000021, 0 }, { X86_FEATURE_TSA_L1_NO, CPUID_ECX, 2, 0x80000021, 0 }, { X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 }, diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index c79ebbb639cb..6ac097e13106 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -59,7 +59,7 @@ static void store_node(struct topo_scan *tscan, u16 nr_nodes, u16 node_id) tscan->amd_node_id = node_id; } -static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext) +static bool parse_8000_001e(struct topo_scan *tscan) { struct { // eax @@ -85,7 +85,7 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext) * If leaf 0xb/0x26 is available, then the APIC ID and the domain * shifts are set already. 
*/ - if (!has_topoext) { + if (!cpu_feature_enabled(X86_FEATURE_XTOPOLOGY)) { tscan->c->topo.initial_apicid = leaf.ext_apic_id; /* @@ -163,11 +163,12 @@ static void topoext_fixup(struct topo_scan *tscan) c->x86 != 0x15 || c->x86_model < 0x10 || c->x86_model > 0x6f) return; - if (msr_set_bit(0xc0011005, 54) <= 0) + if (msr_set_bit(MSR_AMD64_CPUID_EXT_FEAT, + MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT) <= 0) return; - rdmsrq(0xc0011005, msrval); - if (msrval & BIT_64(54)) { + rdmsrq(MSR_AMD64_CPUID_EXT_FEAT, msrval); + if (msrval & MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT) { set_cpu_cap(c, X86_FEATURE_TOPOEXT); pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); } @@ -175,30 +176,27 @@ static void topoext_fixup(struct topo_scan *tscan) static void parse_topology_amd(struct topo_scan *tscan) { + if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) + tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); + /* * Try to get SMT, CORE, TILE, and DIE shifts from extended * CPUID leaf 0x8000_0026 on supported processors first. If * extended CPUID leaf 0x8000_0026 is not supported, try to * get SMT and CORE shift from leaf 0xb. If either leaf is * available, cpu_parse_topology_ext() will return true. - */ - bool has_xtopology = cpu_parse_topology_ext(tscan); - - if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) - tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); - - /* + * * If XTOPOLOGY leaves (0x26/0xb) are not available, try to * get the CORE shift from leaf 0x8000_0008 first. */ - if (!has_xtopology && !parse_8000_0008(tscan)) + if (!cpu_parse_topology_ext(tscan) && !parse_8000_0008(tscan)) return; /* * Prefer leaf 0x8000001e if available to get the SMT shift and * the initial APIC ID if XTOPOLOGY leaves are not available. */ - if (parse_8000_001e(tscan, has_xtopology)) + if (parse_8000_001e(tscan)) return; /* Try the NODEID MSR */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 533fcf5636fc..fd28b53dbac5 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -52,10 +52,13 @@ SYM_PIC_ALIAS(next_early_pgt); pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); unsigned int __pgtable_l5_enabled __ro_after_init; +SYM_PIC_ALIAS(__pgtable_l5_enabled); unsigned int pgdir_shift __ro_after_init = 39; EXPORT_SYMBOL(pgdir_shift); +SYM_PIC_ALIAS(pgdir_shift); unsigned int ptrs_per_p4d __ro_after_init = 1; EXPORT_SYMBOL(ptrs_per_p4d); +SYM_PIC_ALIAS(ptrs_per_p4d); unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4; EXPORT_SYMBOL(page_offset_base); @@ -316,5 +319,5 @@ void early_setup_idt(void) handler = vc_boot_ghcb; } - startup_64_load_idt(handler); + __pi_startup_64_load_idt(handler); } diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 76743dfad6ab..80ef5d386b03 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -61,7 +61,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE) * any particular GDT layout, because we load our own as soon as we * can. 
*/ -__HEAD + __INIT SYM_CODE_START(startup_32) movl pa(initial_stack),%ecx @@ -136,6 +136,9 @@ SYM_CODE_END(startup_32) * If cpu hotplug is not supported then this code can go in init section * which will be freed later */ +#ifdef CONFIG_HOTPLUG_CPU + .text +#endif SYM_FUNC_START(startup_32_smp) cld movl $(__BOOT_DS),%eax diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 3e9b3a3bd039..21816b48537c 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -33,7 +33,7 @@ * because we need identity-mapped pages. */ - __HEAD + __INIT .code64 SYM_CODE_START_NOALIGN(startup_64) UNWIND_HINT_END_OF_STACK @@ -71,7 +71,7 @@ SYM_CODE_START_NOALIGN(startup_64) xorl %edx, %edx wrmsr - call startup_64_setup_gdt_idt + call __pi_startup_64_setup_gdt_idt /* Now switch to __KERNEL_CS so IRET works reliably */ pushq $__KERNEL_CS @@ -91,7 +91,7 @@ SYM_CODE_START_NOALIGN(startup_64) * subsequent code. Pass the boot_params pointer as the first argument. */ movq %r15, %rdi - call sme_enable + call __pi_sme_enable #endif /* Sanitize CPU configuration */ @@ -111,7 +111,7 @@ SYM_CODE_START_NOALIGN(startup_64) * programmed into CR3. */ movq %r15, %rsi - call __startup_64 + call __pi___startup_64 /* Form the CR3 value being sure to include the CR3 modifier */ leaq early_top_pgt(%rip), %rcx @@ -562,7 +562,7 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb) /* Call C handler */ movq %rsp, %rdi movq ORIG_RAX(%rsp), %rsi - call do_vc_no_ghcb + call __pi_do_vc_no_ghcb /* Unwind pt_regs */ POP_REGS diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index 5eba6c5a6775..978232b6d48d 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -246,6 +246,46 @@ static unsigned long get_user_shstk_addr(void) return ssp; } +int shstk_pop(u64 *val) +{ + int ret = 0; + u64 ssp; + + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return -ENOTSUPP; + + fpregs_lock_and_load(); + + rdmsrq(MSR_IA32_PL3_SSP, ssp); + if (val && get_user(*val, (__user u64 *)ssp)) + ret = -EFAULT; + else + wrmsrq(MSR_IA32_PL3_SSP, ssp + SS_FRAME_SIZE); + fpregs_unlock(); + + return ret; +} + +int shstk_push(u64 val) +{ + u64 ssp; + int ret; + + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return -ENOTSUPP; + + fpregs_lock_and_load(); + + rdmsrq(MSR_IA32_PL3_SSP, ssp); + ssp -= SS_FRAME_SIZE; + ret = write_user_shstk_64((__user void *)ssp, val); + if (!ret) + wrmsrq(MSR_IA32_PL3_SSP, ssp); + fpregs_unlock(); + + return ret; +} + #define SHSTK_DATA_BIT BIT(63) static int put_shstk_data(u64 __user *addr, u64 data) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 33e166f6ab12..eb289abece23 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -479,14 +479,14 @@ static int x86_cluster_flags(void) static bool x86_has_numa_in_package; static struct sched_domain_topology_level x86_topology[] = { - SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), #ifdef CONFIG_SCHED_CLUSTER - SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS), + SDTL_INIT(tl_cls_mask, x86_cluster_flags, CLS), #endif #ifdef CONFIG_SCHED_MC - SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC), + SDTL_INIT(tl_mc_mask, x86_core_flags, MC), #endif - SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG), + SDTL_INIT(tl_pkg_mask, x86_sched_itmt_flags, PKG), { NULL }, }; diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 5a4b21389b1d..d432f3824f0c 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -156,15 +156,26 @@ static int 
identify_insn(struct insn *insn) if (!insn->modrm.nbytes) return -EINVAL; - /* All the instructions of interest start with 0x0f. */ - if (insn->opcode.bytes[0] != 0xf) + /* The instructions of interest have 2-byte opcodes: 0F 00 or 0F 01. */ + if (insn->opcode.nbytes < 2 || insn->opcode.bytes[0] != 0xf) return -EINVAL; if (insn->opcode.bytes[1] == 0x1) { switch (X86_MODRM_REG(insn->modrm.value)) { case 0: + /* The reg form of 0F 01 /0 encodes VMX instructions. */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) + return -EINVAL; + return UMIP_INST_SGDT; case 1: + /* + * The reg form of 0F 01 /1 encodes MONITOR/MWAIT, + * STAC/CLAC, and ENCLS. + */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) + return -EINVAL; + return UMIP_INST_SIDT; case 4: return UMIP_INST_SMSW; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 6d383839e839..845aeaf36b8d 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -18,6 +18,7 @@ #include <asm/processor.h> #include <asm/insn.h> #include <asm/mmu_context.h> +#include <asm/nops.h> /* Post-execution fixups. */ @@ -310,25 +311,32 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool #ifdef CONFIG_X86_64 +struct uretprobe_syscall_args { + unsigned long r11; + unsigned long cx; + unsigned long ax; +}; + asm ( ".pushsection .rodata\n" ".global uretprobe_trampoline_entry\n" "uretprobe_trampoline_entry:\n" - "pushq %rax\n" - "pushq %rcx\n" - "pushq %r11\n" - "movq $" __stringify(__NR_uretprobe) ", %rax\n" + "push %rax\n" + "push %rcx\n" + "push %r11\n" + "mov $" __stringify(__NR_uretprobe) ", %rax\n" "syscall\n" ".global uretprobe_syscall_check\n" "uretprobe_syscall_check:\n" - "popq %r11\n" - "popq %rcx\n" - - /* The uretprobe syscall replaces stored %rax value with final + "pop %r11\n" + "pop %rcx\n" + /* + * The uretprobe syscall replaces stored %rax value with final * return address, so we don't restore %rax in here and just * call ret. */ - "retq\n" + "ret\n" + "int3\n" ".global uretprobe_trampoline_end\n" "uretprobe_trampoline_end:\n" ".popsection\n" @@ -338,7 +346,7 @@ extern u8 uretprobe_trampoline_entry[]; extern u8 uretprobe_trampoline_end[]; extern u8 uretprobe_syscall_check[]; -void *arch_uprobe_trampoline(unsigned long *psize) +void *arch_uretprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; struct pt_regs *regs = task_pt_regs(current); @@ -365,7 +373,8 @@ static unsigned long trampoline_check_ip(unsigned long tramp) SYSCALL_DEFINE0(uretprobe) { struct pt_regs *regs = task_pt_regs(current); - unsigned long err, ip, sp, r11_cx_ax[3], tramp; + struct uretprobe_syscall_args args; + unsigned long err, ip, sp, tramp; /* If there's no trampoline, we are called from wrong place. 
*/ tramp = uprobe_get_trampoline_vaddr(); @@ -376,15 +385,15 @@ SYSCALL_DEFINE0(uretprobe) if (unlikely(regs->ip != trampoline_check_ip(tramp))) goto sigill; - err = copy_from_user(r11_cx_ax, (void __user *)regs->sp, sizeof(r11_cx_ax)); + err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args)); if (err) goto sigill; /* expose the "right" values of r11/cx/ax/sp to uprobe_consumer/s */ - regs->r11 = r11_cx_ax[0]; - regs->cx = r11_cx_ax[1]; - regs->ax = r11_cx_ax[2]; - regs->sp += sizeof(r11_cx_ax); + regs->r11 = args.r11; + regs->cx = args.cx; + regs->ax = args.ax; + regs->sp += sizeof(args); regs->orig_ax = -1; ip = regs->ip; @@ -400,21 +409,21 @@ SYSCALL_DEFINE0(uretprobe) */ if (regs->sp != sp || shstk_is_enabled()) return regs->ax; - regs->sp -= sizeof(r11_cx_ax); + regs->sp -= sizeof(args); /* for the case uprobe_consumer has changed r11/cx */ - r11_cx_ax[0] = regs->r11; - r11_cx_ax[1] = regs->cx; + args.r11 = regs->r11; + args.cx = regs->cx; /* * ax register is passed through as return value, so we can use * its space on stack for ip value and jump to it through the * trampoline's ret instruction */ - r11_cx_ax[2] = regs->ip; + args.ax = regs->ip; regs->ip = ip; - err = copy_to_user((void __user *)regs->sp, r11_cx_ax, sizeof(r11_cx_ax)); + err = copy_to_user((void __user *)regs->sp, &args, sizeof(args)); if (err) goto sigill; @@ -608,6 +617,581 @@ static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) *sr = utask->autask.saved_scratch_register; } } + +static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) +{ + return -EPERM; +} + +static struct page *tramp_mapping_pages[2] __ro_after_init; + +static struct vm_special_mapping tramp_mapping = { + .name = "[uprobes-trampoline]", + .mremap = tramp_mremap, + .pages = tramp_mapping_pages, +}; + +struct uprobe_trampoline { + struct hlist_node node; + unsigned long vaddr; +}; + +static bool is_reachable_by_call(unsigned long vtramp, unsigned long vaddr) +{ + long delta = (long)(vaddr + 5 - vtramp); + + return delta >= INT_MIN && delta <= INT_MAX; +} + +static unsigned long find_nearest_trampoline(unsigned long vaddr) +{ + struct vm_unmapped_area_info info = { + .length = PAGE_SIZE, + .align_mask = ~PAGE_MASK, + }; + unsigned long low_limit, high_limit; + unsigned long low_tramp, high_tramp; + unsigned long call_end = vaddr + 5; + + if (check_add_overflow(call_end, INT_MIN, &low_limit)) + low_limit = PAGE_SIZE; + + high_limit = call_end + INT_MAX; + + /* Search up from the caller address. */ + info.low_limit = call_end; + info.high_limit = min(high_limit, TASK_SIZE); + high_tramp = vm_unmapped_area(&info); + + /* Search down from the caller address. */ + info.low_limit = max(low_limit, PAGE_SIZE); + info.high_limit = call_end; + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + low_tramp = vm_unmapped_area(&info); + + if (IS_ERR_VALUE(high_tramp) && IS_ERR_VALUE(low_tramp)) + return -ENOMEM; + if (IS_ERR_VALUE(high_tramp)) + return low_tramp; + if (IS_ERR_VALUE(low_tramp)) + return high_tramp; + + /* Return address that's closest to the caller address. 
 */ + if (call_end - low_tramp < high_tramp - call_end) + return low_tramp; + return high_tramp; +} + +static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr) +{ + struct pt_regs *regs = task_pt_regs(current); + struct mm_struct *mm = current->mm; + struct uprobe_trampoline *tramp; + struct vm_area_struct *vma; + + if (!user_64bit_mode(regs)) + return NULL; + + vaddr = find_nearest_trampoline(vaddr); + if (IS_ERR_VALUE(vaddr)) + return NULL; + + tramp = kzalloc(sizeof(*tramp), GFP_KERNEL); + if (unlikely(!tramp)) + return NULL; + + tramp->vaddr = vaddr; + vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE, + VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO, + &tramp_mapping); + if (IS_ERR(vma)) { + kfree(tramp); + return NULL; + } + return tramp; +} + +static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new) +{ + struct uprobes_state *state = &current->mm->uprobes_state; + struct uprobe_trampoline *tramp = NULL; + + if (vaddr > TASK_SIZE || vaddr < PAGE_SIZE) + return NULL; + + hlist_for_each_entry(tramp, &state->head_tramps, node) { + if (is_reachable_by_call(tramp->vaddr, vaddr)) { + *new = false; + return tramp; + } + } + + tramp = create_uprobe_trampoline(vaddr); + if (!tramp) + return NULL; + + *new = true; + hlist_add_head(&tramp->node, &state->head_tramps); + return tramp; +} + +static void destroy_uprobe_trampoline(struct uprobe_trampoline *tramp) +{ + /* + * We do not unmap and release uprobe trampoline page itself, + * because there's no easy way to make sure none of the threads + * is still inside the trampoline. + */ + hlist_del(&tramp->node); + kfree(tramp); +} + +void arch_uprobe_init_state(struct mm_struct *mm) +{ + INIT_HLIST_HEAD(&mm->uprobes_state.head_tramps); +} + +void arch_uprobe_clear_state(struct mm_struct *mm) +{ + struct uprobes_state *state = &mm->uprobes_state; + struct uprobe_trampoline *tramp; + struct hlist_node *n; + + hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node) + destroy_uprobe_trampoline(tramp); +} + +static bool __in_uprobe_trampoline(unsigned long ip) +{ + struct vm_area_struct *vma = vma_lookup(current->mm, ip); + + return vma && vma_is_special_mapping(vma, &tramp_mapping); +} + +static bool in_uprobe_trampoline(unsigned long ip) +{ + struct mm_struct *mm = current->mm; + bool found, retry = true; + unsigned int seq; + + rcu_read_lock(); + if (mmap_lock_speculate_try_begin(mm, &seq)) { + found = __in_uprobe_trampoline(ip); + retry = mmap_lock_speculate_retry(mm, seq); + } + rcu_read_unlock(); + + if (retry) { + mmap_read_lock(mm); + found = __in_uprobe_trampoline(ip); + mmap_read_unlock(mm); + } + return found; +} + +/* + * See uprobe syscall trampoline; the call to the trampoline will push + * the return address on the stack, the trampoline itself then pushes + * cx, r11 and ax. + */ +struct uprobe_syscall_args { + unsigned long ax; + unsigned long r11; + unsigned long cx; + unsigned long retaddr; +}; + +SYSCALL_DEFINE0(uprobe) +{ + struct pt_regs *regs = task_pt_regs(current); + struct uprobe_syscall_args args; + unsigned long ip, sp, sret; + int err; + + /* Allow execution only from uprobe trampolines. 
*/ + if (!in_uprobe_trampoline(regs->ip)) + return -ENXIO; + + err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args)); + if (err) + goto sigill; + + ip = regs->ip; + + /* + * expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus: + * - adjust ip to the probe address, call saved next instruction address + * - adjust sp to the probe's stack frame (check trampoline code) + */ + regs->ax = args.ax; + regs->r11 = args.r11; + regs->cx = args.cx; + regs->ip = args.retaddr - 5; + regs->sp += sizeof(args); + regs->orig_ax = -1; + + sp = regs->sp; + + err = shstk_pop((u64 *)&sret); + if (err == -EFAULT || (!err && sret != args.retaddr)) + goto sigill; + + handle_syscall_uprobe(regs, regs->ip); + + /* + * Some of the uprobe consumers has changed sp, we can do nothing, + * just return via iret. + */ + if (regs->sp != sp) { + /* skip the trampoline call */ + if (args.retaddr - 5 == regs->ip) + regs->ip += 5; + return regs->ax; + } + + regs->sp -= sizeof(args); + + /* for the case uprobe_consumer has changed ax/r11/cx */ + args.ax = regs->ax; + args.r11 = regs->r11; + args.cx = regs->cx; + + /* keep return address unless we are instructed otherwise */ + if (args.retaddr - 5 != regs->ip) + args.retaddr = regs->ip; + + if (shstk_push(args.retaddr) == -EFAULT) + goto sigill; + + regs->ip = ip; + + err = copy_to_user((void __user *)regs->sp, &args, sizeof(args)); + if (err) + goto sigill; + + /* ensure sysret, see do_syscall_64() */ + regs->r11 = regs->flags; + regs->cx = regs->ip; + return 0; + +sigill: + force_sig(SIGILL); + return -1; +} + +asm ( + ".pushsection .rodata\n" + ".balign " __stringify(PAGE_SIZE) "\n" + "uprobe_trampoline_entry:\n" + "push %rcx\n" + "push %r11\n" + "push %rax\n" + "mov $" __stringify(__NR_uprobe) ", %rax\n" + "syscall\n" + "pop %rax\n" + "pop %r11\n" + "pop %rcx\n" + "ret\n" + "int3\n" + ".balign " __stringify(PAGE_SIZE) "\n" + ".popsection\n" +); + +extern u8 uprobe_trampoline_entry[]; + +static int __init arch_uprobes_init(void) +{ + tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry); + return 0; +} + +late_initcall(arch_uprobes_init); + +enum { + EXPECT_SWBP, + EXPECT_CALL, +}; + +struct write_opcode_ctx { + unsigned long base; + int expect; +}; + +static int is_call_insn(uprobe_opcode_t *insn) +{ + return *insn == CALL_INSN_OPCODE; +} + +/* + * Verification callback used by int3_update uprobe_write calls to make sure + * the underlying instruction is as expected - either int3 or call. + */ +static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode, + int nbytes, void *data) +{ + struct write_opcode_ctx *ctx = data; + uprobe_opcode_t old_opcode[5]; + + uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5); + + switch (ctx->expect) { + case EXPECT_SWBP: + if (is_swbp_insn(&old_opcode[0])) + return 1; + break; + case EXPECT_CALL: + if (is_call_insn(&old_opcode[0])) + return 1; + break; + } + + return -1; +} + +/* + * Modify multi-byte instructions by using INT3 breakpoints on SMP. + * We completely avoid using stop_machine() here, and achieve the + * synchronization using INT3 breakpoints and SMP cross-calls. 
+ * (borrowed comment from smp_text_poke_batch_finish) + * + * The way it is done: + * - Add an INT3 trap to the address that will be patched + * - SMP sync all CPUs + * - Update all but the first byte of the patched range + * - SMP sync all CPUs + * - Replace the first byte (INT3) by the first byte of the replacing opcode + * - SMP sync all CPUs + */ +static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr, char *insn, bool optimize) +{ + uprobe_opcode_t int3 = UPROBE_SWBP_INSN; + struct write_opcode_ctx ctx = { + .base = vaddr, + }; + int err; + + /* + * Write int3 trap. + * + * The swbp_optimize path comes with breakpoint already installed, + * so we can skip this step for optimize == true. + */ + if (!optimize) { + ctx.expect = EXPECT_CALL; + err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn, + true /* is_register */, false /* do_update_ref_ctr */, + &ctx); + if (err) + return err; + } + + smp_text_poke_sync_each_cpu(); + + /* Write all but the first byte of the patched range. */ + ctx.expect = EXPECT_SWBP; + err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, 4, verify_insn, + true /* is_register */, false /* do_update_ref_ctr */, + &ctx); + if (err) + return err; + + smp_text_poke_sync_each_cpu(); + + /* + * Write first byte. + * + * The swbp_unoptimize needs to finish uprobe removal together + * with ref_ctr update, using uprobe_write with proper flags. + */ + err = uprobe_write(auprobe, vma, vaddr, insn, 1, verify_insn, + optimize /* is_register */, !optimize /* do_update_ref_ctr */, + &ctx); + if (err) + return err; + + smp_text_poke_sync_each_cpu(); + return 0; +} + +static int swbp_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr, unsigned long tramp) +{ + u8 call[5]; + + __text_gen_insn(call, CALL_INSN_OPCODE, (const void *) vaddr, + (const void *) tramp, CALL_INSN_SIZE); + return int3_update(auprobe, vma, vaddr, call, true /* optimize */); +} + +static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + return int3_update(auprobe, vma, vaddr, auprobe->insn, false /* optimize */); +} + +static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len) +{ + unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD; + struct vm_area_struct *vma; + struct page *page; + + page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); + if (IS_ERR(page)) + return PTR_ERR(page); + uprobe_copy_from_page(page, vaddr, dst, len); + put_page(page); + return 0; +} + +static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr) +{ + struct __packed __arch_relative_insn { + u8 op; + s32 raddr; + } *call = (struct __arch_relative_insn *) insn; + + if (!is_call_insn(insn)) + return false; + return __in_uprobe_trampoline(vaddr + 5 + call->raddr); +} + +static int is_optimized(struct mm_struct *mm, unsigned long vaddr) +{ + uprobe_opcode_t insn[5]; + int err; + + err = copy_from_vaddr(mm, vaddr, &insn, 5); + if (err) + return err; + return __is_optimized((uprobe_opcode_t *)&insn, vaddr); +} + +static bool should_optimize(struct arch_uprobe *auprobe) +{ + return !test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) && + test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); +} + +int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + if (should_optimize(auprobe)) { + /* + * We could race with another thread that already optimized the probe, + * so let's not overwrite it 
with int3 again in this case. + */ + int ret = is_optimized(vma->vm_mm, vaddr); + if (ret < 0) + return ret; + if (ret) + return 0; + } + return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, + true /* is_register */); +} + +int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) { + int ret = is_optimized(vma->vm_mm, vaddr); + if (ret < 0) + return ret; + if (ret) { + ret = swbp_unoptimize(auprobe, vma, vaddr); + WARN_ON_ONCE(ret); + return ret; + } + } + return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn, + false /* is_register */); +} + +static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm, + unsigned long vaddr) +{ + struct uprobe_trampoline *tramp; + struct vm_area_struct *vma; + bool new = false; + int err = 0; + + vma = find_vma(mm, vaddr); + if (!vma) + return -EINVAL; + tramp = get_uprobe_trampoline(vaddr, &new); + if (!tramp) + return -EINVAL; + err = swbp_optimize(auprobe, vma, vaddr, tramp->vaddr); + if (WARN_ON_ONCE(err) && new) + destroy_uprobe_trampoline(tramp); + return err; +} + +void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ + struct mm_struct *mm = current->mm; + uprobe_opcode_t insn[5]; + + if (!should_optimize(auprobe)) + return; + + mmap_write_lock(mm); + + /* + * Check if some other thread already optimized the uprobe for us, + * if it's the case just go away silently. + */ + if (copy_from_vaddr(mm, vaddr, &insn, 5)) + goto unlock; + if (!is_swbp_insn((uprobe_opcode_t*) &insn)) + goto unlock; + + /* + * If we fail to optimize the uprobe we set the fail bit so the + * above should_optimize will fail from now on. + */ + if (__arch_uprobe_optimize(auprobe, mm, vaddr)) + set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags); + +unlock: + mmap_write_unlock(mm); +} + +static bool insn_is_nop(struct insn *insn) +{ + return insn->opcode.nbytes == 1 && insn->opcode.bytes[0] == 0x90; +} + +static bool insn_is_nopl(struct insn *insn) +{ + if (insn->opcode.nbytes != 2) + return false; + + if (insn->opcode.bytes[0] != 0x0f || insn->opcode.bytes[1] != 0x1f) + return false; + + if (!insn->modrm.nbytes) + return false; + + if (X86_MODRM_REG(insn->modrm.bytes[0]) != 0) + return false; + + /* 0f 1f /0 - NOPL */ + return true; +} + +static bool can_optimize(struct insn *insn, unsigned long vaddr) +{ + if (!insn->x86_64 || insn->length != 5) + return false; + + if (!insn_is_nop(insn) && !insn_is_nopl(insn)) + return false; + + /* We can't do cross page atomic writes yet. 
*/ + return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5; +} #else /* 32-bit: */ /* * No RIP-relative addressing on 32-bit @@ -621,6 +1205,10 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { } +static bool can_optimize(struct insn *insn, unsigned long vaddr) +{ + return false; +} #endif /* CONFIG_X86_64 */ struct uprobe_xol_ops { @@ -979,14 +1567,17 @@ static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) */ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { - struct insn insn; u8 fix_ip_or_call = UPROBE_FIX_IP; + struct insn insn; int ret; ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm)); if (ret) return ret; + if (can_optimize(&insn, addr)) + set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); + ret = branch_setup_xol_ops(auprobe, &insn); if (ret != -ENOSYS) return ret; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 4fa0be732af1..d7af4a64c211 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -160,11 +160,6 @@ SECTIONS } :text = 0xcccccccc - /* bootstrapping code */ - .head.text : AT(ADDR(.head.text) - LOAD_OFFSET) { - HEAD_TEXT - } :text = 0xcccccccc - /* End of text section, which should occupy whole number of pages */ _etext = .; . = ALIGN(PAGE_SIZE); @@ -227,6 +222,8 @@ SECTIONS */ .altinstr_aux : AT(ADDR(.altinstr_aux) - LOAD_OFFSET) { *(.altinstr_aux) + . = ALIGN(PAGE_SIZE); + __inittext_end = .; } INIT_DATA_SECTION(16) @@ -535,3 +532,5 @@ xen_elfnote_entry_value = xen_elfnote_phys32_entry_value = ABSOLUTE(xen_elfnote_phys32_entry) + ABSOLUTE(pvh_start_xen - LOAD_OFFSET); #endif + +#include "../boot/startup/exports.h" diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index ad89d0bd6005..103604c4b33b 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -13,7 +13,7 @@ #define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \ MSR_IA32_MISC_ENABLE_BTS_UNAVAIL) -/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ +/* retrieve a fixed counter bits out of IA32_FIXED_CTR_CTRL */ #define fixed_ctrl_field(ctrl_reg, idx) \ (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK) diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c index b0f3b2a62ae2..a5cafd402cfd 100644 --- a/arch/x86/lib/inat.c +++ b/arch/x86/lib/inat.c @@ -81,3 +81,16 @@ insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, return table[opcode]; } +insn_attr_t inat_get_xop_attribute(insn_byte_t opcode, insn_byte_t map_select) +{ + const insn_attr_t *table; + + if (map_select < X86_XOP_M_MIN || map_select > X86_XOP_M_MAX) + return 0; + map_select -= X86_XOP_M_MIN; + /* At first, this checks the master table */ + table = inat_xop_tables[map_select]; + if (!table) + return 0; + return table[opcode]; +} diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 149a57e334ab..225af1399c9d 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -200,12 +200,15 @@ found: } insn->rex_prefix.got = 1; - /* Decode VEX prefix */ + /* Decode VEX/XOP prefix */ b = peek_next(insn_byte_t, insn); - attr = inat_get_opcode_attribute(b); - if (inat_is_vex_prefix(attr)) { + if (inat_is_vex_prefix(attr) || inat_is_xop_prefix(attr)) { insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1); - if (!insn->x86_64) { + + if (inat_is_xop_prefix(attr) && X86_MODRM_REG(b2) == 0) { + /* Grp1A.0 is always 
POP Ev */ + goto vex_end; + } else if (!insn->x86_64) { /* * In 32-bits mode, if the [7:6] bits (mod bits of * ModRM) on the second byte are not 11b, it is @@ -226,13 +229,13 @@ found: if (insn->x86_64 && X86_VEX_W(b2)) /* VEX.W overrides opnd_size */ insn->opnd_bytes = 8; - } else if (inat_is_vex3_prefix(attr)) { + } else if (inat_is_vex3_prefix(attr) || inat_is_xop_prefix(attr)) { b2 = peek_nbyte_next(insn_byte_t, insn, 2); insn_set_byte(&insn->vex_prefix, 2, b2); insn->vex_prefix.nbytes = 3; insn->next_byte += 3; if (insn->x86_64 && X86_VEX_W(b2)) - /* VEX.W overrides opnd_size */ + /* VEX.W/XOP.W overrides opnd_size */ insn->opnd_bytes = 8; } else { /* @@ -288,9 +291,22 @@ int insn_get_opcode(struct insn *insn) insn_set_byte(opcode, 0, op); opcode->nbytes = 1; - /* Check if there is VEX prefix or not */ - if (insn_is_avx(insn)) { + /* Check if there is VEX/XOP prefix or not */ + if (insn_is_avx_or_xop(insn)) { insn_byte_t m, p; + + /* XOP prefix has different encoding */ + if (unlikely(avx_insn_is_xop(insn))) { + m = insn_xop_map_bits(insn); + insn->attr = inat_get_xop_attribute(op, m); + if (!inat_accept_xop(insn->attr)) { + insn->attr = 0; + return -EINVAL; + } + /* XOP has only 1 byte for opcode */ + goto end; + } + m = insn_vex_m_bits(insn); p = insn_vex_p_bits(insn); insn->attr = inat_get_avx_attribute(op, m, p); @@ -383,7 +399,8 @@ int insn_get_modrm(struct insn *insn) pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_group_attribute(mod, pfx_id, insn->attr); - if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) { + if (insn_is_avx_or_xop(insn) && !inat_accept_vex(insn->attr) && + !inat_accept_xop(insn->attr)) { /* Bad insn */ insn->attr = 0; return -EINVAL; diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index d78d769a02bd..f513d33b6d37 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -15,7 +15,6 @@ .section .text..__x86.indirect_thunk - .macro POLINE reg ANNOTATE_INTRA_FUNCTION_CALL call .Ldo_rop_\@ @@ -73,6 +72,7 @@ SYM_CODE_END(__x86_indirect_thunk_array) #undef GEN #ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING + .macro CALL_THUNK reg .align RETPOLINE_THUNK_SIZE @@ -126,7 +126,45 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array) #define GEN(reg) __EXPORT_THUNK(__x86_indirect_jump_thunk_ ## reg) #include <asm/GEN-for-each-reg.h> #undef GEN -#endif + +#endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */ + +#ifdef CONFIG_MITIGATION_ITS + +.macro ITS_THUNK reg + +/* + * If CFI paranoid is used then the ITS thunk starts with opcodes (0xea; jne 1b) + * that complete the fineibt_paranoid caller sequence. 
+ */ +1: .byte 0xea +SYM_INNER_LABEL(__x86_indirect_paranoid_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_UNDEFINED + ANNOTATE_NOENDBR + jne 1b +SYM_INNER_LABEL(__x86_indirect_its_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_UNDEFINED + ANNOTATE_NOENDBR + ANNOTATE_RETPOLINE_SAFE + jmp *%\reg + int3 + .align 32, 0xcc /* fill to the end of the line */ + .skip 32 - (__x86_indirect_its_thunk_\reg - 1b), 0xcc /* skip to the next upper half */ +.endm + +/* ITS mitigation requires thunks be aligned to upper half of cacheline */ +.align 64, 0xcc +.skip 29, 0xcc + +#define GEN(reg) ITS_THUNK reg +#include <asm/GEN-for-each-reg.h> +#undef GEN + + .align 64, 0xcc +SYM_FUNC_ALIAS(__x86_indirect_its_thunk_array, __x86_indirect_its_thunk_rax) +SYM_CODE_END(__x86_indirect_its_thunk_array) + +#endif /* CONFIG_MITIGATION_ITS */ #ifdef CONFIG_MITIGATION_RETHUNK @@ -370,39 +408,6 @@ SYM_FUNC_END(call_depth_return_thunk) #ifdef CONFIG_MITIGATION_ITS -.macro ITS_THUNK reg - -/* - * If CFI paranoid is used then the ITS thunk starts with opcodes (0xea; jne 1b) - * that complete the fineibt_paranoid caller sequence. - */ -1: .byte 0xea -SYM_INNER_LABEL(__x86_indirect_paranoid_thunk_\reg, SYM_L_GLOBAL) - UNWIND_HINT_UNDEFINED - ANNOTATE_NOENDBR - jne 1b -SYM_INNER_LABEL(__x86_indirect_its_thunk_\reg, SYM_L_GLOBAL) - UNWIND_HINT_UNDEFINED - ANNOTATE_NOENDBR - ANNOTATE_RETPOLINE_SAFE - jmp *%\reg - int3 - .align 32, 0xcc /* fill to the end of the line */ - .skip 32 - (__x86_indirect_its_thunk_\reg - 1b), 0xcc /* skip to the next upper half */ -.endm - -/* ITS mitigation requires thunks be aligned to upper half of cacheline */ -.align 64, 0xcc -.skip 29, 0xcc - -#define GEN(reg) ITS_THUNK reg -#include <asm/GEN-for-each-reg.h> -#undef GEN - - .align 64, 0xcc -SYM_FUNC_ALIAS(__x86_indirect_its_thunk_array, __x86_indirect_its_thunk_rax) -SYM_CODE_END(__x86_indirect_its_thunk_array) - .align 64, 0xcc .skip 32, 0xcc SYM_CODE_START(its_return_thunk) diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 262f7ca1fb95..2a4e69ecc2de 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -27,6 +27,11 @@ # (evo): this opcode is changed by EVEX prefix (EVEX opcode) # (v): this opcode requires VEX prefix. # (v1): this opcode only supports 128bit VEX. +# (xop): this opcode accepts XOP prefix. 
+# +# XOP Superscripts +# (W=0): this opcode requires XOP.W == 0 +# (W=1): this opcode requires XOP.W == 1 # # Last Prefix Superscripts # - (66): the last prefix is 0x66 @@ -194,7 +199,7 @@ AVXcode: 8c: MOV Ev,Sw 8d: LEA Gv,M 8e: MOV Sw,Ew -8f: Grp1A (1A) | POP Ev (d64) +8f: Grp1A (1A) | POP Ev (d64) | XOP (Prefix) # 0x90 - 0x9f 90: NOP | PAUSE (F3) | XCHG r8,rAX 91: XCHG rCX/r9,rAX @@ -1106,6 +1111,84 @@ AVXcode: 7 f8: URDMSR Rq,Id (F2),(v1),(11B) | UWRMSR Id,Rq (F3),(v1),(11B) EndTable +# From AMD64 Architecture Programmer's Manual Vol3, Appendix A.1.5 +Table: XOP map 8h +Referrer: +XOPcode: 0 +85: VPMACSSWW Vo,Ho,Wo,Lo +86: VPMACSSWD Vo,Ho,Wo,Lo +87: VPMACSSDQL Vo,Ho,Wo,Lo +8e: VPMACSSDD Vo,Ho,Wo,Lo +8f: VPMACSSDQH Vo,Ho,Wo,Lo +95: VPMACSWW Vo,Ho,Wo,Lo +96: VPMACSWD Vo,Ho,Wo,Lo +97: VPMACSDQL Vo,Ho,Wo,Lo +9e: VPMACSDD Vo,Ho,Wo,Lo +9f: VPMACSDQH Vo,Ho,Wo,Lo +a2: VPCMOV Vx,Hx,Wx,Lx (W=0) | VPCMOV Vx,Hx,Lx,Wx (W=1) +a3: VPPERM Vo,Ho,Wo,Lo (W=0) | VPPERM Vo,Ho,Lo,Wo (W=1) +a6: VPMADCSSWD Vo,Ho,Wo,Lo +b6: VPMADCSWD Vo,Ho,Wo,Lo +c0: VPROTB Vo,Wo,Ib +c1: VPROTW Vo,Wo,Ib +c2: VPROTD Vo,Wo,Ib +c3: VPROTQ Vo,Wo,Ib +cc: VPCOMccB Vo,Ho,Wo,Ib +cd: VPCOMccW Vo,Ho,Wo,Ib +ce: VPCOMccD Vo,Ho,Wo,Ib +cf: VPCOMccQ Vo,Ho,Wo,Ib +ec: VPCOMccUB Vo,Ho,Wo,Ib +ed: VPCOMccUW Vo,Ho,Wo,Ib +ee: VPCOMccUD Vo,Ho,Wo,Ib +ef: VPCOMccUQ Vo,Ho,Wo,Ib +EndTable + +Table: XOP map 9h +Referrer: +XOPcode: 1 +01: GrpXOP1 +02: GrpXOP2 +12: GrpXOP3 +80: VFRCZPS Vx,Wx +81: VFRCZPD Vx,Wx +82: VFRCZSS Vq,Wss +83: VFRCZSD Vq,Wsd +90: VPROTB Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +91: VPROTW Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +92: VPROTD Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +93: VPROTQ Vo,Wo,Ho (W=0) | VPROTB Vo,Ho,Wo (W=1) +94: VPSHLB Vo,Wo,Ho (W=0) | VPSHLB Vo,Ho,Wo (W=1) +95: VPSHLW Vo,Wo,Ho (W=0) | VPSHLW Vo,Ho,Wo (W=1) +96: VPSHLD Vo,Wo,Ho (W=0) | VPSHLD Vo,Ho,Wo (W=1) +97: VPSHLQ Vo,Wo,Ho (W=0) | VPSHLQ Vo,Ho,Wo (W=1) +98: VPSHAB Vo,Wo,Ho (W=0) | VPSHAB Vo,Ho,Wo (W=1) +99: VPSHAW Vo,Wo,Ho (W=0) | VPSHAW Vo,Ho,Wo (W=1) +9a: VPSHAD Vo,Wo,Ho (W=0) | VPSHAD Vo,Ho,Wo (W=1) +9b: VPSHAQ Vo,Wo,Ho (W=0) | VPSHAQ Vo,Ho,Wo (W=1) +c1: VPHADDBW Vo,Wo +c2: VPHADDBD Vo,Wo +c3: VPHADDBQ Vo,Wo +c6: VPHADDWD Vo,Wo +c7: VPHADDWQ Vo,Wo +cb: VPHADDDQ Vo,Wo +d1: VPHADDUBWD Vo,Wo +d2: VPHADDUBD Vo,Wo +d3: VPHADDUBQ Vo,Wo +d6: VPHADDUWD Vo,Wo +d7: VPHADDUWQ Vo,Wo +db: VPHADDUDQ Vo,Wo +e1: VPHSUBBW Vo,Wo +e2: VPHSUBWD Vo,Wo +e3: VPHSUBDQ Vo,Wo +EndTable + +Table: XOP map Ah +Referrer: +XOPcode: 2 +10: BEXTR Gy,Ey,Id +12: GrpXOP4 +EndTable + GrpTable: Grp1 0: ADD 1: OR @@ -1320,3 +1403,29 @@ GrpTable: GrpRNG 4: xcrypt-cfb 5: xcrypt-ofb EndTable + +# GrpXOP1-4 is shown in AMD APM Vol.3 Appendix A as XOP group #1-4 +GrpTable: GrpXOP1 +1: BLCFILL By,Ey (xop) +2: BLSFILL By,Ey (xop) +3: BLCS By,Ey (xop) +4: TZMSK By,Ey (xop) +5: BLCIC By,Ey (xop) +6: BLSIC By,Ey (xop) +7: T1MSKC By,Ey (xop) +EndTable + +GrpTable: GrpXOP2 +1: BLCMSK By,Ey (xop) +6: BLCI By,Ey (xop) +EndTable + +GrpTable: GrpXOP3 +0: LLWPCB Ry (xop) +1: SLWPCB Ry (xop) +EndTable + +GrpTable: GrpXOP4 +0: LWPINS By,Ed,Id (xop) +1: LWPVAL By,Ed,Id (xop) +EndTable diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index faf3a13fb6ba..2f8c32173972 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -536,12 +536,6 @@ void __init sme_early_init(void) x86_init.resources.dmi_setup = snp_dmi_setup; } - /* - * Switch the SVSM CA mapping (if active) from identity mapped to - * kernel mapped. 
- */ - snp_update_svsm_ca(); - if (sev_status & MSR_AMD64_SNP_SECURE_TSC) setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); } diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index f8a33b25ae86..edbf9c998848 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -16,7 +16,7 @@ .text .code64 -SYM_FUNC_START(sme_encrypt_execute) +SYM_FUNC_START(__pi_sme_encrypt_execute) /* * Entry parameters: @@ -69,9 +69,9 @@ SYM_FUNC_START(sme_encrypt_execute) ANNOTATE_UNRET_SAFE ret int3 -SYM_FUNC_END(sme_encrypt_execute) +SYM_FUNC_END(__pi_sme_encrypt_execute) -SYM_FUNC_START(__enc_copy) +SYM_FUNC_START_LOCAL(__enc_copy) ANNOTATE_NOENDBR /* * Routine used to encrypt memory in place. diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 7e3fca164620..fc13306af15f 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -8,6 +8,7 @@ #include <linux/netdevice.h> #include <linux/filter.h> #include <linux/if_vlan.h> +#include <linux/bitfield.h> #include <linux/bpf.h> #include <linux/memory.h> #include <linux/sort.h> @@ -1151,11 +1152,38 @@ static void emit_ldx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 i *pprog = prog; } +static void emit_ldsx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off) +{ + u8 *prog = *pprog; + + switch (size) { + case BPF_B: + /* movsx rax, byte ptr [rax + r12 + off] */ + EMIT3(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x0F, 0xBE); + break; + case BPF_H: + /* movsx rax, word ptr [rax + r12 + off] */ + EMIT3(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x0F, 0xBF); + break; + case BPF_W: + /* movsx rax, dword ptr [rax + r12 + off] */ + EMIT2(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x63); + break; + } + emit_insn_suffix_SIB(&prog, src_reg, dst_reg, index_reg, off); + *pprog = prog; +} + static void emit_ldx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { emit_ldx_index(pprog, size, dst_reg, src_reg, X86_REG_R12, off); } +static void emit_ldsx_r12(u8 **prog, u32 size, u32 dst_reg, u32 src_reg, int off) +{ + emit_ldsx_index(prog, size, dst_reg, src_reg, X86_REG_R12, off); +} + /* STX: *(u8*)(dst_reg + off) = src_reg */ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { @@ -1388,16 +1416,67 @@ static int emit_atomic_ld_st_index(u8 **pprog, u32 atomic_op, u32 size, return 0; } +/* + * Metadata encoding for exception handling in JITed code. + * + * Format of `fixup` and `data` fields in `struct exception_table_entry`: + * + * Bit layout of `fixup` (32-bit): + * + * +-----------+--------+-----------+---------+----------+ + * | 31 | 30-24 | 23-16 | 15-8 | 7-0 | + * | | | | | | + * | ARENA_ACC | Unused | ARENA_REG | DST_REG | INSN_LEN | + * +-----------+--------+-----------+---------+----------+ + * + * - INSN_LEN (8 bits): Length of faulting insn (max x86 insn = 15 bytes (fits in 8 bits)). + * - DST_REG (8 bits): Offset of dst_reg from reg2pt_regs[] (max offset = 112 (fits in 8 bits)). + * This is set to DONT_CLEAR if the insn is a store. + * - ARENA_REG (8 bits): Offset of the register that is used to calculate the + * address for load/store when accessing the arena region. + * - ARENA_ACCESS (1 bit): This bit is set when the faulting instruction accessed the arena region. 
+ * + * Bit layout of `data` (32-bit): + * + * +--------------+--------+--------------+ + * | 31-16 | 15-8 | 7-0 | + * | | | | + * | ARENA_OFFSET | Unused | EX_TYPE_BPF | + * +--------------+--------+--------------+ + * + * - ARENA_OFFSET (16 bits): Offset used to calculate the address for load/store when + * accessing the arena region. + */ + #define DONT_CLEAR 1 +#define FIXUP_INSN_LEN_MASK GENMASK(7, 0) +#define FIXUP_REG_MASK GENMASK(15, 8) +#define FIXUP_ARENA_REG_MASK GENMASK(23, 16) +#define FIXUP_ARENA_ACCESS BIT(31) +#define DATA_ARENA_OFFSET_MASK GENMASK(31, 16) bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) { - u32 reg = x->fixup >> 8; + u32 reg = FIELD_GET(FIXUP_REG_MASK, x->fixup); + u32 insn_len = FIELD_GET(FIXUP_INSN_LEN_MASK, x->fixup); + bool is_arena = !!(x->fixup & FIXUP_ARENA_ACCESS); + bool is_write = (reg == DONT_CLEAR); + unsigned long addr; + s16 off; + u32 arena_reg; + + if (is_arena) { + arena_reg = FIELD_GET(FIXUP_ARENA_REG_MASK, x->fixup); + off = FIELD_GET(DATA_ARENA_OFFSET_MASK, x->data); + addr = *(unsigned long *)((void *)regs + arena_reg) + off; + bpf_prog_report_arena_violation(is_write, addr, regs->ip); + } /* jump over faulting load and clear dest register */ if (reg != DONT_CLEAR) *(unsigned long *)((void *)regs + reg) = 0; - regs->ip += x->fixup & 0xff; + regs->ip += insn_len; + return true; } @@ -2057,19 +2136,27 @@ st: if (is_imm8(insn->off)) case BPF_LDX | BPF_PROBE_MEM32 | BPF_H: case BPF_LDX | BPF_PROBE_MEM32 | BPF_W: case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW: + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_B: + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_H: + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_W: case BPF_STX | BPF_PROBE_MEM32 | BPF_B: case BPF_STX | BPF_PROBE_MEM32 | BPF_H: case BPF_STX | BPF_PROBE_MEM32 | BPF_W: case BPF_STX | BPF_PROBE_MEM32 | BPF_DW: start_of_ldx = prog; - if (BPF_CLASS(insn->code) == BPF_LDX) - emit_ldx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); - else + if (BPF_CLASS(insn->code) == BPF_LDX) { + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32SX) + emit_ldsx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + else + emit_ldx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + } else { emit_stx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + } populate_extable: { struct exception_table_entry *ex; u8 *_insn = image + proglen + (start_of_ldx - temp); + u32 arena_reg, fixup_reg; s64 delta; if (!bpf_prog->aux->extable) @@ -2089,8 +2176,29 @@ populate_extable: ex->data = EX_TYPE_BPF; - ex->fixup = (prog - start_of_ldx) | - ((BPF_CLASS(insn->code) == BPF_LDX ? reg2pt_regs[dst_reg] : DONT_CLEAR) << 8); + /* + * src_reg/dst_reg holds the address in the arena region with upper + * 32-bits being zero because of a preceding addr_space_cast(r<n>, + * 0x0, 0x1) instruction. This address is adjusted with the addition + * of arena_vm_start (see the implementation of BPF_PROBE_MEM32 and + * BPF_PROBE_ATOMIC) before being used for the memory access. Pass + * the reg holding the unmodified 32-bit address to + * ex_handler_bpf(). 
+ */ + if (BPF_CLASS(insn->code) == BPF_LDX) { + arena_reg = reg2pt_regs[src_reg]; + fixup_reg = reg2pt_regs[dst_reg]; + } else { + arena_reg = reg2pt_regs[dst_reg]; + fixup_reg = DONT_CLEAR; + } + + ex->fixup = FIELD_PREP(FIXUP_INSN_LEN_MASK, prog - start_of_ldx) | + FIELD_PREP(FIXUP_ARENA_REG_MASK, arena_reg) | + FIELD_PREP(FIXUP_REG_MASK, fixup_reg); + ex->fixup |= FIXUP_ARENA_ACCESS; + + ex->data |= FIELD_PREP(DATA_ARENA_OFFSET_MASK, insn->off); } break; @@ -2208,7 +2316,8 @@ populate_extable: * End result: x86 insn "mov rbx, qword ptr [rax+0x14]" * of 4 bytes will be ignored and rbx will be zero inited. */ - ex->fixup = (prog - start_of_ldx) | (reg2pt_regs[dst_reg] << 8); + ex->fixup = FIELD_PREP(FIXUP_INSN_LEN_MASK, prog - start_of_ldx) | + FIELD_PREP(FIXUP_REG_MASK, reg2pt_regs[dst_reg]); } break; diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S index 1d78e5631bb8..344030c1a81d 100644 --- a/arch/x86/platform/pvh/head.S +++ b/arch/x86/platform/pvh/head.S @@ -24,7 +24,7 @@ #include <asm/nospec-branch.h> #include <xen/interface/elfnote.h> - __HEAD + __INIT /* * Entry point for PVH guests. diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index 2c19d7fc8a85..7ea1b75e59b7 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -21,6 +21,7 @@ function clear_vars() { eid = -1 # escape id gid = -1 # group id aid = -1 # AVX id + xopid = -1 # XOP id tname = "" } @@ -39,9 +40,11 @@ BEGIN { ggid = 1 geid = 1 gaid = 0 + gxopid = 0 delete etable delete gtable delete atable + delete xoptable opnd_expr = "^[A-Za-z/]" ext_expr = "^\\(" @@ -61,6 +64,7 @@ BEGIN { imm_flag["Ob"] = "INAT_MOFFSET" imm_flag["Ov"] = "INAT_MOFFSET" imm_flag["Lx"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" + imm_flag["Lo"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])" force64_expr = "\\([df]64\\)" @@ -87,6 +91,8 @@ BEGIN { evexonly_expr = "\\(ev\\)" # (es) is the same as (ev) but also "SCALABLE" i.e. 
W and pp determine operand size evex_scalable_expr = "\\(es\\)" + # All opcodes in XOP table or with (xop) superscript accept XOP prefix + xopok_expr = "\\(xop\\)" prefix_expr = "\\(Prefix\\)" prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" @@ -106,6 +112,7 @@ BEGIN { prefix_num["VEX+2byte"] = "INAT_PFX_VEX3" prefix_num["EVEX"] = "INAT_PFX_EVEX" prefix_num["REX2"] = "INAT_PFX_REX2" + prefix_num["XOP"] = "INAT_PFX_XOP" clear_vars() } @@ -147,6 +154,7 @@ function array_size(arr, i,c) { if (NF != 1) { # AVX/escape opcode table aid = $2 + xopid = -1 if (gaid <= aid) gaid = aid + 1 if (tname == "") # AVX only opcode table @@ -156,6 +164,20 @@ function array_size(arr, i,c) { tname = "inat_primary_table" } +/^XOPcode:/ { + if (NF != 1) { + # XOP opcode table + xopid = $2 + aid = -1 + if (gxopid <= xopid) + gxopid = xopid + 1 + if (tname == "") # XOP only opcode table + tname = sprintf("inat_xop_table_%d", $2) + } + if (xopid == -1 && eid == -1) # primary opcode table + tname = "inat_primary_table" +} + /^GrpTable:/ { print "/* " $0 " */" if (!($2 in group)) @@ -206,6 +228,8 @@ function print_table(tbl,name,fmt,n) etable[eid,0] = tname if (aid >= 0) atable[aid,0] = tname + else if (xopid >= 0) + xoptable[xopid] = tname } if (array_size(lptable1) != 0) { print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]", @@ -347,6 +371,8 @@ function convert_operands(count,opnd, i,j,imm,mod) flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY") else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr)) flags = add_flags(flags, "INAT_VEXOK") + else if (match(ext, xopok_expr) || xopid >= 0) + flags = add_flags(flags, "INAT_XOPOK") # check prefixes if (match(ext, prefix_expr)) { @@ -413,6 +439,14 @@ END { print " ["i"]["j"] = "atable[i,j]"," print "};\n" + print "/* XOP opcode map array */" + print "const insn_attr_t * const inat_xop_tables[X86_XOP_M_MAX - X86_XOP_M_MIN + 1]" \ + " = {" + for (i = 0; i < gxopid; i++) + if (xoptable[i]) + print " ["i"] = "xoptable[i]"," + print "};" + print "#else /* !__BOOT_COMPRESSED */\n" print "/* Escape opcode map array */" @@ -430,6 +464,10 @@ END { "[INAT_LSTPFX_MAX + 1];" print "" + print "/* XOP opcode map array */" + print "static const insn_attr_t *inat_xop_tables[X86_XOP_M_MAX - X86_XOP_M_MIN + 1];" + print "" + print "static void inat_init_tables(void)" print "{" @@ -455,6 +493,12 @@ END { if (atable[i,j]) print "\tinat_avx_tables["i"]["j"] = "atable[i,j]";" + print "" + print "\t/* Print XOP opcode map array */" + for (i = 0; i < gxopid; i++) + if (xoptable[i]) + print "\tinat_xop_tables["i"] = "xoptable[i]";" + print "}" print "#endif" } diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 5778bc498415..e5a2b9a912d1 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -740,10 +740,10 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel, static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, const char *symname) { - int headtext = !strcmp(sec_name(sec->shdr.sh_info), ".head.text"); unsigned r_type = ELF64_R_TYPE(rel->r_info); ElfW(Addr) offset = rel->r_offset; int shn_abs = (sym->st_shndx == SHN_ABS) && !is_reloc(S_REL, symname); + if (sym->st_shndx == SHN_UNDEF) return 0; @@ -783,12 +783,6 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, break; } - if (headtext) { - die("Absolute reference to symbol '%s' not permitted in .head.text\n", - symname); - break; - } - /* * Relocation offsets for 64 bit kernels are output * as 32 bits and sign extended back to 64 
bits when diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index 942372e69b4d..ee643a6cd691 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -1029,7 +1029,7 @@ int rmp_make_shared(u64 pfn, enum pg_level level) } EXPORT_SYMBOL_GPL(rmp_make_shared); -void snp_leak_pages(u64 pfn, unsigned int npages) +void __snp_leak_pages(u64 pfn, unsigned int npages, bool dump_rmp) { struct page *page = pfn_to_page(pfn); @@ -1052,14 +1052,15 @@ void snp_leak_pages(u64 pfn, unsigned int npages) (PageHead(page) && compound_nr(page) <= npages)) list_add_tail(&page->buddy_list, &snp_leaked_pages_list); - dump_rmpentry(pfn); + if (dump_rmp) + dump_rmpentry(pfn); snp_nr_leaked_pages++; pfn++; page++; } spin_unlock(&snp_leaked_pages_list_lock); } -EXPORT_SYMBOL_GPL(snp_leak_pages); +EXPORT_SYMBOL_GPL(__snp_leak_pages); void kdump_sev_callback(void) { diff --git a/arch/xtensa/kernel/asm-offsets.c b/arch/xtensa/kernel/asm-offsets.c index da38de20ae59..cfbced95e944 100644 --- a/arch/xtensa/kernel/asm-offsets.c +++ b/arch/xtensa/kernel/asm-offsets.c @@ -11,6 +11,7 @@ * * Chris Zankel <chris@zankel.net> */ +#define COMPILE_OFFSETS #include <asm/processor.h> #include <asm/coprocessor.h> |
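
The exception-table metadata added to arch/x86/net/bpf_jit_comp.c above packs several fields into the 32-bit `fixup` and `data` words. As a quick illustration of that bit layout (not part of the patch), here is a standalone C sketch that packs and unpacks the same fields using plain shifts in place of the kernel's FIELD_PREP()/FIELD_GET() helpers. The mask values mirror the GENMASK()/BIT() definitions in the patch; the sample values (5-byte insn, pt_regs offsets 40 and 112, insn->off of 0x20) are made up purely for illustration.

#include <stdint.h>
#include <stdio.h>

/* Mask values mirroring the GENMASK()/BIT() definitions in the patch. */
#define FIXUP_INSN_LEN_MASK     0x000000ffu	/* GENMASK(7, 0)   */
#define FIXUP_REG_MASK          0x0000ff00u	/* GENMASK(15, 8)  */
#define FIXUP_ARENA_REG_MASK    0x00ff0000u	/* GENMASK(23, 16) */
#define FIXUP_ARENA_ACCESS      0x80000000u	/* BIT(31)         */
#define DATA_ARENA_OFFSET_MASK  0xffff0000u	/* GENMASK(31, 16) */

/* Pack the JIT-side metadata the way ex->fixup is assembled in the patch. */
static uint32_t pack_fixup(uint32_t insn_len, uint32_t fixup_reg, uint32_t arena_reg)
{
	return (insn_len  << 0)  |	/* FIELD_PREP(FIXUP_INSN_LEN_MASK, ...)  */
	       (fixup_reg << 8)  |	/* FIELD_PREP(FIXUP_REG_MASK, ...)       */
	       (arena_reg << 16) |	/* FIELD_PREP(FIXUP_ARENA_REG_MASK, ...) */
	       FIXUP_ARENA_ACCESS;	/* mark this as an arena access          */
}

int main(void)
{
	/* Hypothetical faulting arena load: 5-byte insn, dst_reg saved at
	 * pt_regs offset 40, arena address register at offset 112, and an
	 * insn->off of 0x20 carried in the upper half of 'data'. */
	uint32_t fixup = pack_fixup(5, 40, 112);
	uint32_t data  = 0x1u | (0x20u << 16);	/* EX_TYPE_BPF | arena offset */

	/* Unpacking, as the fixup handler would do with FIELD_GET(). */
	printf("insn_len   = %u\n", fixup & FIXUP_INSN_LEN_MASK);
	printf("reg offset = %u\n", (fixup & FIXUP_REG_MASK) >> 8);
	printf("arena reg  = %u\n", (fixup & FIXUP_ARENA_REG_MASK) >> 16);
	printf("is_arena   = %d\n", !!(fixup & FIXUP_ARENA_ACCESS));
	printf("arena off  = %u\n", (data & DATA_ARENA_OFFSET_MASK) >> 16);
	return 0;
}

The real code keeps the <linux/bitfield.h> helpers rather than open-coded shifts, which also gives compile-time checking that each value fits its mask.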