diff options
| -rw-r--r-- | Documentation/virt/kvm/api.rst | 28 | ||||
| -rw-r--r-- | arch/x86/include/asm/cpufeatures.h | 1 | ||||
| -rw-r--r-- | arch/x86/include/asm/kvm_host.h | 9 | ||||
| -rw-r--r-- | arch/x86/include/uapi/asm/kvm.h | 6 | ||||
| -rw-r--r-- | arch/x86/kvm/Makefile | 49 | ||||
| -rw-r--r-- | arch/x86/kvm/cpuid.c | 75 | ||||
| -rw-r--r-- | arch/x86/kvm/cpuid.h | 12 | ||||
| -rw-r--r-- | arch/x86/kvm/ioapic.c | 2 | ||||
| -rw-r--r-- | arch/x86/kvm/lapic.c | 77 | ||||
| -rw-r--r-- | arch/x86/kvm/lapic.h | 2 | ||||
| -rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 11 | ||||
| -rw-r--r-- | arch/x86/kvm/pmu.c | 2 | ||||
| -rw-r--r-- | arch/x86/kvm/reverse_cpuid.h | 19 | ||||
| -rw-r--r-- | arch/x86/kvm/svm/svm.c | 3 | ||||
| -rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 3 | ||||
| -rw-r--r-- | arch/x86/kvm/x86.c | 81 | ||||
| -rw-r--r-- | arch/x86/kvm/x86.h | 15 |
17 files changed, 326 insertions, 69 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 49f043246f95..095095ae01dd 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7908,8 +7908,10 @@ Will return -EBUSY if a VCPU has already been created. Valid feature flags in args[0] are:: - #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) - #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) + #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) + #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) + #define KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST (1ULL << 2) + #define KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST (1ULL << 3) Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC, @@ -7922,6 +7924,28 @@ as a broadcast even in x2APIC mode in order to support physical x2APIC without interrupt remapping. This is undesirable in logical mode, where 0xff represents CPUs 0-7 in cluster 0. +Setting KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST instructs KVM to enable +Suppress EOI Broadcasts. KVM will advertise support for Suppress EOI +Broadcast to the guest and suppress LAPIC EOI broadcasts when the guest +sets the Suppress EOI Broadcast bit in the SPIV register. This flag is +supported only when using a split IRQCHIP. + +Setting KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST disables support for +Suppress EOI Broadcasts entirely, i.e. instructs KVM to NOT advertise +support to the guest. + +Modern VMMs should either enable KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST +or KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST. If not, legacy quirky +behavior will be used by KVM: in split IRQCHIP mode, KVM will advertise +support for Suppress EOI Broadcasts but not actually suppress EOI +broadcasts; for in-kernel IRQCHIP mode, KVM will not advertise support for +Suppress EOI Broadcasts. + +Setting both KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST and +KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST will fail with an EINVAL error, +as will setting KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST without a split +IRCHIP. + 7.8 KVM_CAP_S390_USER_INSTR0 ---------------------------- diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 81f7b3b91986..c01fdde465de 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -326,6 +326,7 @@ #define X86_FEATURE_AMX_FP16 (12*32+21) /* AMX fp16 Support */ #define X86_FEATURE_AVX_IFMA (12*32+23) /* Support for VPMADD52[H,L]UQ */ #define X86_FEATURE_LAM (12*32+26) /* "lam" Linear Address Masking */ +#define X86_FEATURE_MOVRS (12*32+31) /* MOVRS instructions */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0353d8b6988c..91c26f159d89 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -784,6 +784,8 @@ enum kvm_only_cpuid_leafs { CPUID_24_0_EBX, CPUID_8000_0021_ECX, CPUID_7_1_ECX, + CPUID_1E_1_EAX, + CPUID_24_1_ECX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, @@ -1234,6 +1236,12 @@ enum kvm_irqchip_mode { KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ }; +enum kvm_suppress_eoi_broadcast_mode { + KVM_SUPPRESS_EOI_BROADCAST_QUIRKED, /* Legacy behavior */ + KVM_SUPPRESS_EOI_BROADCAST_ENABLED, /* Enable Suppress EOI broadcast */ + KVM_SUPPRESS_EOI_BROADCAST_DISABLED /* Disable Suppress EOI broadcast */ +}; + struct kvm_x86_msr_filter { u8 count; bool default_allow:1; @@ -1483,6 +1491,7 @@ struct kvm_arch { bool x2apic_format; bool x2apic_broadcast_quirk_disabled; + enum kvm_suppress_eoi_broadcast_mode suppress_eoi_broadcast_mode; bool has_mapped_host_mmio; bool guest_can_read_msr_platform_info; diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index b2c928c5965d..846a63215ce1 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -916,8 +916,10 @@ struct kvm_sev_snp_launch_finish { __u64 pad1[4]; }; -#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) -#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) +#define KVM_X2APIC_API_USE_32BIT_IDS _BITULL(0) +#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK _BITULL(1) +#define KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST _BITULL(2) +#define KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST _BITULL(3) struct kvm_hyperv_eventfd { __u32 conn_id; diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index c4b8950c7abe..77337c37324b 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -47,3 +47,52 @@ $(obj)/kvm-asm-offsets.h: $(obj)/kvm-asm-offsets.s FORCE targets += kvm-asm-offsets.s clean-files += kvm-asm-offsets.h + + +# Fail the build if there is unexpected EXPORT_SYMBOL_GPL (or EXPORT_SYMBOL) +# usage. All KVM-internal exports should use EXPORT_SYMBOL_FOR_KVM_INTERNAL. +# Only a handful of exports intended for other modules (VFIO, KVMGT) should +# use EXPORT_SYMBOL_GPL, and EXPORT_SYMBOL should never be used. +ifdef CONFIG_KVM_X86 +# Search recursively for whole words and print line numbers. Filter out the +# allowed set of exports, i.e. those that are intended for external usage. +exports_grep_trailer := --include='*.[ch]' -nrw $(srctree)/virt/kvm $(srctree)/arch/x86/kvm | \ + grep -v -e kvm_page_track_register_notifier \ + -e kvm_page_track_unregister_notifier \ + -e kvm_write_track_add_gfn \ + -e kvm_write_track_remove_gfn \ + -e kvm_get_kvm \ + -e kvm_get_kvm_safe \ + -e kvm_put_kvm + +# Force grep to emit a goofy group separator that can in turn be replaced with +# the above newline macro (newlines in Make are a nightmare). Note, grep only +# prints the group separator when N lines of context are requested via -C, +# a.k.a. --NUM. Simply request zero lines. Print the separator only after +# filtering out expected exports to avoid extra newlines in the error message. +define get_kvm_exports +$(shell grep "$(1)" -C0 $(exports_grep_trailer) | grep "$(1)" -C0 --group-separator="!SEP!") +endef + +define check_kvm_exports +nr_kvm_exports := $(shell grep "$(1)" $(exports_grep_trailer) | wc -l) + +ifneq (0,$$(nr_kvm_exports)) +$$(error ERROR ***\ +$$(newline)found $$(nr_kvm_exports) unwanted occurrences of $(1):\ +$$(newline) $(subst !SEP!,$$(newline) ,$(call get_kvm_exports,$(1)))\ +$$(newline)in directories:\ +$$(newline) $(srctree)/arch/x86/kvm\ +$$(newline) $(srctree)/virt/kvm\ +$$(newline)Use EXPORT_SYMBOL_FOR_KVM_INTERNAL, not $(1)) +endif # nr_kvm_exports != 0 +undefine nr_kvm_exports +endef # check_kvm_exports + +$(eval $(call check_kvm_exports,EXPORT_SYMBOL_GPL)) +$(eval $(call check_kvm_exports,EXPORT_SYMBOL)) + +undefine check_kvm_exports +undefine get_kvm_exports +undefine exports_grep_trailer +endif # CONFIG_KVM_X86 diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c590a5bd3196..7fe4e58a6ebf 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -36,6 +36,9 @@ u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_caps); +bool kvm_is_configuring_cpu_caps __read_mostly; +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_is_configuring_cpu_caps); + struct cpuid_xstate_sizes { u32 eax; u32 ebx; @@ -534,17 +537,20 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, BUILD_BUG_ON(sizeof(vcpu_caps) != sizeof(vcpu->arch.cpu_caps)); /* - * KVM does not correctly handle changing guest CPUID after KVM_RUN, as - * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't - * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page - * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with - * the core vCPU model on the fly. It would've been better to forbid any - * KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately - * some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do + * KVM does not correctly handle changing guest CPUID after KVM_RUN or + * while L2 is active, as MAXPHYADDR, GBPAGES support, AMD reserved bit + * behavior, etc. aren't tracked in kvm_mmu_page_role, and L2 state + * can't be adjusted (without breaking L2 in some way). As a result, + * KVM may reuse SPs/SPTEs and/or run L2 with bad/misconfigured state. + * + * In practice, no sane VMM mucks with the core vCPU model on the fly. + * It would've been better to forbid any KVM_SET_CPUID{,2} calls after + * KVM_RUN or KVM_SET_NESTED_STATE altogether, but unfortunately some + * VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do * KVM_SET_CPUID{,2} again. To support this legacy behavior, check * whether the supplied CPUID data is equal to what's already set. */ - if (kvm_vcpu_has_run(vcpu)) { + if (!kvm_can_set_cpuid_and_feature_msrs(vcpu)) { r = kvm_cpuid_check_equal(vcpu, e2, nent); if (r) goto err; @@ -823,10 +829,13 @@ do { \ /* DS is defined by ptrace-abi.h on 32-bit builds. */ #undef DS -void kvm_set_cpu_caps(void) +void kvm_initialize_cpu_caps(void) { memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps)); + WARN_ON_ONCE(kvm_is_configuring_cpu_caps); + kvm_is_configuring_cpu_caps = true; + BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) > sizeof(boot_cpu_data.x86_capability)); @@ -1025,6 +1034,7 @@ void kvm_set_cpu_caps(void) F(AMX_FP16), F(AVX_IFMA), F(LAM), + F(MOVRS), ); kvm_cpu_cap_init(CPUID_7_1_ECX, @@ -1063,12 +1073,27 @@ void kvm_set_cpu_caps(void) SCATTERED_F(SGX_EDECCSSA), ); + kvm_cpu_cap_init(CPUID_1E_1_EAX, + F(AMX_INT8_ALIAS), + F(AMX_BF16_ALIAS), + F(AMX_COMPLEX_ALIAS), + F(AMX_FP16_ALIAS), + F(AMX_FP8), + F(AMX_TF32), + F(AMX_AVX512), + F(AMX_MOVRS), + ); + kvm_cpu_cap_init(CPUID_24_0_EBX, F(AVX10_128), F(AVX10_256), F(AVX10_512), ); + kvm_cpu_cap_init(CPUID_24_1_ECX, + F(AVX10_VNNI_INT), + ); + kvm_cpu_cap_init(CPUID_8000_0001_ECX, F(LAHF_LM), F(CMP_LEGACY), @@ -1270,7 +1295,7 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_clear(X86_FEATURE_RDPID); } } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cpu_caps); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_initialize_cpu_caps); #undef F #undef SCATTERED_F @@ -1624,6 +1649,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } + + max_idx = entry->eax = min(entry->eax, 1u); + + /* KVM only supports up to 0x1e.0x1, capped above via min(). */ + if (max_idx >= 1) { + entry = do_host_cpuid(array, function, 1); + if (!entry) + goto out; + + cpuid_entry_override(entry, CPUID_1E_1_EAX); + entry->ebx = 0; + entry->ecx = 0; + entry->edx = 0; + } break; case 0x24: { u8 avx10_version; @@ -1633,18 +1672,30 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; } + max_idx = entry->eax = min(entry->eax, 1u); /* * The AVX10 version is encoded in EBX[7:0]. Note, the version * is guaranteed to be >=1 if AVX10 is supported. Note #2, the * version needs to be captured before overriding EBX features! */ - avx10_version = min_t(u8, entry->ebx & 0xff, 1); + avx10_version = min_t(u8, entry->ebx & 0xff, 2); cpuid_entry_override(entry, CPUID_24_0_EBX); entry->ebx |= avx10_version; - entry->eax = 0; entry->ecx = 0; entry->edx = 0; + + /* KVM only supports up to 0x24.0x1, capped above via min(). */ + if (max_idx >= 1) { + entry = do_host_cpuid(array, function, 1); + if (!entry) + goto out; + + cpuid_entry_override(entry, CPUID_24_1_ECX); + entry->eax = 0; + entry->ebx = 0; + entry->edx = 0; + } break; } case KVM_CPUID_SIGNATURE: { diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index d3f5ae15a7ca..039b8e6f40ba 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -8,7 +8,15 @@ #include <uapi/asm/kvm_para.h> extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; -void kvm_set_cpu_caps(void); +extern bool kvm_is_configuring_cpu_caps __read_mostly; + +void kvm_initialize_cpu_caps(void); + +static inline void kvm_finalize_cpu_caps(void) +{ + WARN_ON_ONCE(!kvm_is_configuring_cpu_caps); + kvm_is_configuring_cpu_caps = false; +} void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(struct kvm_cpuid_entry2 *entries, @@ -188,6 +196,7 @@ static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature) { unsigned int x86_leaf = __feature_leaf(x86_feature); + WARN_ON_ONCE(!kvm_is_configuring_cpu_caps); kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature); } @@ -195,6 +204,7 @@ static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature) { unsigned int x86_leaf = __feature_leaf(x86_feature); + WARN_ON_ONCE(!kvm_is_configuring_cpu_caps); kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature); } diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 2c2783296aed..a26fa4222f29 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -561,7 +561,7 @@ static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu, spin_lock(&ioapic->lock); if (trigger_mode != IOAPIC_LEVEL_TRIG || - kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) + kvm_lapic_suppress_eoi_broadcast(apic)) return; ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1597dd0b0cc6..738ec3c1b0b5 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -105,6 +105,63 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector) apic_test_vector(vector, apic->regs + APIC_IRR); } +static bool kvm_lapic_advertise_suppress_eoi_broadcast(struct kvm *kvm) +{ + switch (kvm->arch.suppress_eoi_broadcast_mode) { + case KVM_SUPPRESS_EOI_BROADCAST_ENABLED: + return true; + case KVM_SUPPRESS_EOI_BROADCAST_DISABLED: + return false; + case KVM_SUPPRESS_EOI_BROADCAST_QUIRKED: + /* + * The default in-kernel I/O APIC emulates the 82093AA and does not + * implement an EOI register. Some guests (e.g. Windows with the + * Hyper-V role enabled) disable LAPIC EOI broadcast without + * checking the I/O APIC version, which can cause level-triggered + * interrupts to never be EOI'd. + * + * To avoid this, KVM doesn't advertise Suppress EOI Broadcast + * support when using the default in-kernel I/O APIC. + * + * Historically, in split IRQCHIP mode, KVM always advertised + * Suppress EOI Broadcast support but did not actually suppress + * EOIs, resulting in quirky behavior. + */ + return !ioapic_in_kernel(kvm); + default: + WARN_ON_ONCE(1); + return false; + } +} + +bool kvm_lapic_suppress_eoi_broadcast(struct kvm_lapic *apic) +{ + struct kvm *kvm = apic->vcpu->kvm; + + if (!(kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) + return false; + + switch (kvm->arch.suppress_eoi_broadcast_mode) { + case KVM_SUPPRESS_EOI_BROADCAST_ENABLED: + return true; + case KVM_SUPPRESS_EOI_BROADCAST_DISABLED: + return false; + case KVM_SUPPRESS_EOI_BROADCAST_QUIRKED: + /* + * Historically, in split IRQCHIP mode, KVM ignored the suppress + * EOI broadcast bit set by the guest and broadcasts EOIs to the + * userspace I/O APIC. For In-kernel I/O APIC, the support itself + * is not advertised, can only be enabled via KVM_SET_APIC_STATE, + * and KVM's I/O APIC doesn't emulate Directed EOIs; but if the + * feature is enabled, it is respected (with odd behavior). + */ + return ioapic_in_kernel(kvm); + default: + WARN_ON_ONCE(1); + return false; + } +} + __read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu); EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_has_noapic_vcpu); @@ -554,15 +611,9 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) v = APIC_VERSION | ((apic->nr_lvt_entries - 1) << 16); - /* - * KVM emulates 82093AA datasheet (with in-kernel IOAPIC implementation) - * which doesn't have EOI register; Some buggy OSes (e.g. Windows with - * Hyper-V role) disable EOI broadcast in lapic not checking for IOAPIC - * version first and level-triggered interrupts never get EOIed in - * IOAPIC. - */ + if (guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) && - !ioapic_in_kernel(vcpu->kvm)) + kvm_lapic_advertise_suppress_eoi_broadcast(vcpu->kvm)) v |= APIC_LVR_DIRECTED_EOI; kvm_lapic_set_reg(apic, APIC_LVR, v); } @@ -1517,6 +1568,15 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) /* Request a KVM exit to inform the userspace IOAPIC. */ if (irqchip_split(apic->vcpu->kvm)) { + /* + * Don't exit to userspace if the guest has enabled Directed + * EOI, a.k.a. Suppress EOI Broadcasts, in which case the local + * APIC doesn't broadcast EOIs (the guest must EOI the target + * I/O APIC(s) directly). + */ + if (kvm_lapic_suppress_eoi_broadcast(apic)) + return; + apic->vcpu->arch.pending_ioapic_eoi = vector; kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu); return; @@ -3498,7 +3558,6 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu) * wait-for-SIPI (WFS). */ if (!kvm_apic_init_sipi_allowed(vcpu)) { - WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED); clear_bit(KVM_APIC_SIPI, &apic->pending_events); return 0; } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 282b9b7da98c..e5f5a222eced 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -231,6 +231,8 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); +bool kvm_lapic_suppress_eoi_broadcast(struct kvm_lapic *apic); + void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu); void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq, diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 02c450686b4a..3911ac9bddfd 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4521,7 +4521,10 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, arch.gfn = fault->gfn; arch.error_code = fault->error_code; arch.direct_map = vcpu->arch.mmu->root_role.direct; - arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu); + if (arch.direct_map) + arch.cr3 = (unsigned long)INVALID_GPA; + else + arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu); return kvm_setup_async_pf(vcpu, fault->addr, kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch); @@ -6031,11 +6034,7 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.nested_mmu.cpu_role.ext.valid = 0; kvm_mmu_reset_context(vcpu); - /* - * Changing guest CPUID after KVM_RUN is forbidden, see the comment in - * kvm_arch_vcpu_ioctl(). - */ - KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm); + KVM_BUG_ON(!kvm_can_set_cpuid_and_feature_msrs(vcpu), vcpu->kvm); } void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 487ad19a236e..ff20b4102173 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -853,7 +853,7 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm)) + if (KVM_BUG_ON(!kvm_can_set_cpuid_and_feature_msrs(vcpu), vcpu->kvm)) return; /* diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 81b4a7acf72e..657f5f743ed9 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -44,11 +44,28 @@ #define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4) #define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5) +/* + * Intel-defined sub-features, CPUID level 0x0000001E:1 (EAX). Note, several + * of the bits are aliases to features of the same name that are enumerated via + * various CPUID.0x7 sub-leafs. + */ +#define X86_FEATURE_AMX_INT8_ALIAS KVM_X86_FEATURE(CPUID_1E_1_EAX, 0) +#define X86_FEATURE_AMX_BF16_ALIAS KVM_X86_FEATURE(CPUID_1E_1_EAX, 1) +#define X86_FEATURE_AMX_COMPLEX_ALIAS KVM_X86_FEATURE(CPUID_1E_1_EAX, 2) +#define X86_FEATURE_AMX_FP16_ALIAS KVM_X86_FEATURE(CPUID_1E_1_EAX, 3) +#define X86_FEATURE_AMX_FP8 KVM_X86_FEATURE(CPUID_1E_1_EAX, 4) +#define X86_FEATURE_AMX_TF32 KVM_X86_FEATURE(CPUID_1E_1_EAX, 6) +#define X86_FEATURE_AMX_AVX512 KVM_X86_FEATURE(CPUID_1E_1_EAX, 7) +#define X86_FEATURE_AMX_MOVRS KVM_X86_FEATURE(CPUID_1E_1_EAX, 8) + /* Intel-defined sub-features, CPUID level 0x00000024:0 (EBX) */ #define X86_FEATURE_AVX10_128 KVM_X86_FEATURE(CPUID_24_0_EBX, 16) #define X86_FEATURE_AVX10_256 KVM_X86_FEATURE(CPUID_24_0_EBX, 17) #define X86_FEATURE_AVX10_512 KVM_X86_FEATURE(CPUID_24_0_EBX, 18) +/* Intel-defined sub-features, CPUID level 0x00000024:1 (ECX) */ +#define X86_FEATURE_AVX10_VNNI_INT KVM_X86_FEATURE(CPUID_24_1_ECX, 2) + /* CPUID level 0x80000007 (EDX). */ #define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8) @@ -90,6 +107,8 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX}, [CPUID_8000_0021_ECX] = {0x80000021, 0, CPUID_ECX}, [CPUID_7_1_ECX] = { 7, 1, CPUID_ECX}, + [CPUID_1E_1_EAX] = { 0x1e, 1, CPUID_EAX}, + [CPUID_24_1_ECX] = { 0x24, 1, CPUID_ECX}, }; /* diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8b0ac67becae..9ee74c57bd51 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5259,7 +5259,7 @@ static __init void svm_adjust_mmio_mask(void) static __init void svm_set_cpu_caps(void) { - kvm_set_cpu_caps(); + kvm_initialize_cpu_caps(); kvm_caps.supported_perf_cap = 0; @@ -5343,6 +5343,7 @@ static __init void svm_set_cpu_caps(void) kvm_cpu_cap_clear(X86_FEATURE_MSR_IMM); kvm_setup_xss_caps(); + kvm_finalize_cpu_caps(); } static __init int svm_hardware_setup(void) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 530981a42c96..edf12bf58578 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8039,7 +8039,7 @@ static __init u64 vmx_get_perf_capabilities(void) static __init void vmx_set_cpu_caps(void) { - kvm_set_cpu_caps(); + kvm_initialize_cpu_caps(); /* CPUID 0x1 */ if (nested) @@ -8098,6 +8098,7 @@ static __init void vmx_set_cpu_caps(void) } kvm_setup_xss_caps(); + kvm_finalize_cpu_caps(); } static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1f66c7441272..08a6d6e20e9b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -121,8 +121,10 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE -#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ - KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) +#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ + KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK | \ + KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST | \ + KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST) static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); @@ -2314,13 +2316,14 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) u64 val; /* - * Disallow writes to immutable feature MSRs after KVM_RUN. KVM does - * not support modifying the guest vCPU model on the fly, e.g. changing - * the nVMX capabilities while L2 is running is nonsensical. Allow - * writes of the same value, e.g. to allow userspace to blindly stuff - * all MSRs when emulating RESET. + * Reject writes to immutable feature MSRs if the vCPU model is frozen, + * as KVM doesn't support modifying the guest vCPU model on the fly, + * e.g. changing the VMX capabilities MSRs while L2 is active is + * nonsensical. Allow writes of the same value, e.g. so that userspace + * can blindly stuff all MSRs when emulating RESET. */ - if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index) && + if (!kvm_can_set_cpuid_and_feature_msrs(vcpu) && + kvm_is_immutable_feature_msr(index) && (do_get_msr(vcpu, index, &val) || *data != val)) return -EINVAL; @@ -4096,47 +4099,47 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_WALL_CLOCK_NEW: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; vcpu->kvm->arch.wall_clock = data; kvm_write_wall_clock(vcpu->kvm, data, 0); break; case MSR_KVM_WALL_CLOCK: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; vcpu->kvm->arch.wall_clock = data; kvm_write_wall_clock(vcpu->kvm, data, 0); break; case MSR_KVM_SYSTEM_TIME_NEW: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; kvm_write_system_time(vcpu, data, false, msr_info->host_initiated); break; case MSR_KVM_SYSTEM_TIME: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; kvm_write_system_time(vcpu, data, true, msr_info->host_initiated); break; case MSR_KVM_ASYNC_PF_EN: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; if (kvm_pv_enable_async_pf(vcpu, data)) return 1; break; case MSR_KVM_ASYNC_PF_INT: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; if (kvm_pv_enable_async_pf_int(vcpu, data)) return 1; break; case MSR_KVM_ASYNC_PF_ACK: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; if (data & 0x1) { /* * Pairs with the smp_mb__after_atomic() in @@ -4149,7 +4152,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_STEAL_TIME: if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; if (unlikely(!sched_info_on())) return 1; @@ -4167,7 +4170,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_PV_EOI_EN: if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8))) return 1; @@ -4175,7 +4178,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_KVM_POLL_CONTROL: if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; /* only enable bit supported */ if (data & (-1ULL << 1)) @@ -4476,61 +4479,61 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_WALL_CLOCK: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->kvm->arch.wall_clock; break; case MSR_KVM_WALL_CLOCK_NEW: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->kvm->arch.wall_clock; break; case MSR_KVM_SYSTEM_TIME: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.time; break; case MSR_KVM_SYSTEM_TIME_NEW: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.time; break; case MSR_KVM_ASYNC_PF_EN: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.apf.msr_en_val; break; case MSR_KVM_ASYNC_PF_INT: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.apf.msr_int_val; break; case MSR_KVM_ASYNC_PF_ACK: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = 0; break; case MSR_KVM_STEAL_TIME: if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.st.msr_val; break; case MSR_KVM_PV_EOI_EN: if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.pv_eoi.msr_val; break; case MSR_KVM_POLL_CONTROL: if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) - return 1; + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.msr_kvm_poll_control; break; @@ -4931,6 +4934,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_X2APIC_API: r = KVM_X2APIC_API_VALID_FLAGS; + if (kvm && !irqchip_split(kvm)) + r &= ~KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST; break; case KVM_CAP_NESTED_STATE: r = kvm_x86_ops.nested_ops->get_state ? @@ -6748,11 +6753,24 @@ split_irqchip_unlock: if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS) break; + if ((cap->args[0] & KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST) && + (cap->args[0] & KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST)) + break; + + if ((cap->args[0] & KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST) && + !irqchip_split(kvm)) + break; + if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS) kvm->arch.x2apic_format = true; if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) kvm->arch.x2apic_broadcast_quirk_disabled = true; + if (cap->args[0] & KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST) + kvm->arch.suppress_eoi_broadcast_mode = KVM_SUPPRESS_EOI_BROADCAST_ENABLED; + if (cap->args[0] & KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST) + kvm->arch.suppress_eoi_broadcast_mode = KVM_SUPPRESS_EOI_BROADCAST_DISABLED; + r = 0; break; case KVM_CAP_X86_DISABLE_EXITS: @@ -11609,8 +11627,7 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu)) { int r = kvm_check_nested_events(vcpu); - WARN_ON_ONCE(r == -EBUSY); - if (r < 0) + if (r < 0 && r != -EBUSY) return 0; } @@ -12158,9 +12175,11 @@ static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) return; if (is_pae_paging(vcpu)) { + kvm_vcpu_srcu_read_lock(vcpu); for (i = 0 ; i < 4 ; i++) sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i); sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID; + kvm_vcpu_srcu_read_unlock(vcpu); } } @@ -13320,7 +13339,7 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm) #endif kvm_mmu_pre_destroy_vm(kvm); - static_call_cond(kvm_x86_vm_pre_destroy)(kvm); + kvm_x86_call(vm_pre_destroy)(kvm); } void kvm_arch_destroy_vm(struct kvm *kvm) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 00de24f55b1f..ff20e62d98c6 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -172,9 +172,20 @@ static inline void kvm_nested_vmexit_handle_ibrs(struct kvm_vcpu *vcpu) indirect_branch_prediction_barrier(); } -static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu) +/* + * Disallow modifying CPUID and feature MSRs, which affect the core virtual CPU + * model exposed to the guest and virtualized by KVM, if the vCPU has already + * run or is in guest mode (L2). In both cases, KVM has already consumed the + * current virtual CPU model, and doesn't support "unwinding" to react to the + * new model. + * + * Note, the only way is_guest_mode() can be true with 'last_vmentry_cpu == -1' + * is if userspace sets CPUID and feature MSRs (to enable VMX/SVM), then sets + * nested state, and then attempts to set CPUID and/or feature MSRs *again*. + */ +static inline bool kvm_can_set_cpuid_and_feature_msrs(struct kvm_vcpu *vcpu) { - return vcpu->arch.last_vmentry_cpu != -1; + return vcpu->arch.last_vmentry_cpu == -1 && !is_guest_mode(vcpu); } static inline void kvm_set_mp_state(struct kvm_vcpu *vcpu, int mp_state) |
