summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- Documentation/virt/kvm/api.rst | 28
-rw-r--r-- arch/x86/include/asm/cpufeatures.h | 1
-rw-r--r-- arch/x86/include/asm/kvm_host.h | 9
-rw-r--r-- arch/x86/include/uapi/asm/kvm.h | 6
-rw-r--r-- arch/x86/kvm/Makefile | 49
-rw-r--r-- arch/x86/kvm/cpuid.c | 75
-rw-r--r-- arch/x86/kvm/cpuid.h | 12
-rw-r--r-- arch/x86/kvm/ioapic.c | 2
-rw-r--r-- arch/x86/kvm/lapic.c | 77
-rw-r--r-- arch/x86/kvm/lapic.h | 2
-rw-r--r-- arch/x86/kvm/mmu/mmu.c | 11
-rw-r--r-- arch/x86/kvm/pmu.c | 2
-rw-r--r-- arch/x86/kvm/reverse_cpuid.h | 19
-rw-r--r-- arch/x86/kvm/svm/svm.c | 3
-rw-r--r-- arch/x86/kvm/vmx/vmx.c | 3
-rw-r--r-- arch/x86/kvm/x86.c | 81
-rw-r--r-- arch/x86/kvm/x86.h | 15
17 files changed, 326 insertions, 69 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 49f043246f95..095095ae01dd 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -7908,8 +7908,10 @@ Will return -EBUSY if a VCPU has already been created.
Valid feature flags in args[0] are::
- #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0)
- #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1)
+ #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0)
+ #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1)
+ #define KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST (1ULL << 2)
+ #define KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST (1ULL << 3)
Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of
KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC,
@@ -7922,6 +7924,28 @@ as a broadcast even in x2APIC mode in order to support physical x2APIC
without interrupt remapping. This is undesirable in logical mode,
where 0xff represents CPUs 0-7 in cluster 0.
+Setting KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST instructs KVM to enable
+Suppress EOI Broadcasts. KVM will advertise support for Suppress EOI
+Broadcast to the guest and suppress LAPIC EOI broadcasts when the guest
+sets the Suppress EOI Broadcast bit in the SPIV register. This flag is
+supported only when using a split IRQCHIP.
+
+Setting KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST disables support for
+Suppress EOI Broadcasts entirely, i.e. instructs KVM to NOT advertise
+support to the guest.
+
+Modern VMMs should set either KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST or
+KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST. If neither flag is set, KVM uses
+legacy quirky behavior: with a split IRQCHIP, KVM advertises support for
+Suppress EOI Broadcasts but does not actually suppress EOI broadcasts; with
+an in-kernel IRQCHIP, KVM does not advertise support for Suppress EOI
+Broadcasts.
+
+Setting both KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST and
+KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST will fail with an EINVAL error,
+as will setting KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST without a split
+IRQCHIP.
+
7.8 KVM_CAP_S390_USER_INSTR0
----------------------------
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 81f7b3b91986..c01fdde465de 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -326,6 +326,7 @@
#define X86_FEATURE_AMX_FP16 (12*32+21) /* AMX fp16 Support */
#define X86_FEATURE_AVX_IFMA (12*32+23) /* Support for VPMADD52[H,L]UQ */
#define X86_FEATURE_LAM (12*32+26) /* "lam" Linear Address Masking */
+#define X86_FEATURE_MOVRS (12*32+31) /* MOVRS instructions */
/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
#define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0353d8b6988c..91c26f159d89 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -784,6 +784,8 @@ enum kvm_only_cpuid_leafs {
CPUID_24_0_EBX,
CPUID_8000_0021_ECX,
CPUID_7_1_ECX,
+ CPUID_1E_1_EAX,
+ CPUID_24_1_ECX,
NR_KVM_CPU_CAPS,
NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
@@ -1234,6 +1236,12 @@ enum kvm_irqchip_mode {
KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */
};
+enum kvm_suppress_eoi_broadcast_mode {
+ KVM_SUPPRESS_EOI_BROADCAST_QUIRKED, /* Legacy behavior */
+ KVM_SUPPRESS_EOI_BROADCAST_ENABLED, /* Enable Suppress EOI broadcast */
+ KVM_SUPPRESS_EOI_BROADCAST_DISABLED /* Disable Suppress EOI broadcast */
+};
+
struct kvm_x86_msr_filter {
u8 count;
bool default_allow:1;
@@ -1483,6 +1491,7 @@ struct kvm_arch {
bool x2apic_format;
bool x2apic_broadcast_quirk_disabled;
+ enum kvm_suppress_eoi_broadcast_mode suppress_eoi_broadcast_mode;
bool has_mapped_host_mmio;
bool guest_can_read_msr_platform_info;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index b2c928c5965d..846a63215ce1 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -916,8 +916,10 @@ struct kvm_sev_snp_launch_finish {
__u64 pad1[4];
};
-#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0)
-#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1)
+#define KVM_X2APIC_API_USE_32BIT_IDS _BITULL(0)
+#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK _BITULL(1)
+#define KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST _BITULL(2)
+#define KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST _BITULL(3)
struct kvm_hyperv_eventfd {
__u32 conn_id;
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index c4b8950c7abe..77337c37324b 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -47,3 +47,52 @@ $(obj)/kvm-asm-offsets.h: $(obj)/kvm-asm-offsets.s FORCE
targets += kvm-asm-offsets.s
clean-files += kvm-asm-offsets.h
+
+
+# Fail the build if there is unexpected EXPORT_SYMBOL_GPL (or EXPORT_SYMBOL)
+# usage. All KVM-internal exports should use EXPORT_SYMBOL_FOR_KVM_INTERNAL.
+# Only a handful of exports intended for other modules (VFIO, KVMGT) should
+# use EXPORT_SYMBOL_GPL, and EXPORT_SYMBOL should never be used.
+ifdef CONFIG_KVM_X86
+# Search recursively for whole words and print line numbers. Filter out the
+# allowed set of exports, i.e. those that are intended for external usage.
+exports_grep_trailer := --include='*.[ch]' -nrw $(srctree)/virt/kvm $(srctree)/arch/x86/kvm | \
+ grep -v -e kvm_page_track_register_notifier \
+ -e kvm_page_track_unregister_notifier \
+ -e kvm_write_track_add_gfn \
+ -e kvm_write_track_remove_gfn \
+ -e kvm_get_kvm \
+ -e kvm_get_kvm_safe \
+ -e kvm_put_kvm
+
+# Force grep to emit a goofy group separator that can in turn be replaced with
+# the above newline macro (newlines in Make are a nightmare). Note, grep only
+# prints the group separator when N lines of context are requested via -C,
+# a.k.a. --NUM. Simply request zero lines. Print the separator only after
+# filtering out expected exports to avoid extra newlines in the error message.
+define get_kvm_exports
+$(shell grep "$(1)" -C0 $(exports_grep_trailer) | grep "$(1)" -C0 --group-separator="!SEP!")
+endef
+
+define check_kvm_exports
+nr_kvm_exports := $(shell grep "$(1)" $(exports_grep_trailer) | wc -l)
+
+ifneq (0,$$(nr_kvm_exports))
+$$(error ERROR ***\
+$$(newline)found $$(nr_kvm_exports) unwanted occurrences of $(1):\
+$$(newline) $(subst !SEP!,$$(newline) ,$(call get_kvm_exports,$(1)))\
+$$(newline)in directories:\
+$$(newline) $(srctree)/arch/x86/kvm\
+$$(newline) $(srctree)/virt/kvm\
+$$(newline)Use EXPORT_SYMBOL_FOR_KVM_INTERNAL, not $(1))
+endif # nr_kvm_exports != 0
+undefine nr_kvm_exports
+endef # check_kvm_exports
+
+$(eval $(call check_kvm_exports,EXPORT_SYMBOL_GPL))
+$(eval $(call check_kvm_exports,EXPORT_SYMBOL))
+
+undefine check_kvm_exports
+undefine get_kvm_exports
+undefine exports_grep_trailer
+endif # CONFIG_KVM_X86
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index c590a5bd3196..7fe4e58a6ebf 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -36,6 +36,9 @@
u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_caps);
+bool kvm_is_configuring_cpu_caps __read_mostly;
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_is_configuring_cpu_caps);
+
struct cpuid_xstate_sizes {
u32 eax;
u32 ebx;
@@ -534,17 +537,20 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
BUILD_BUG_ON(sizeof(vcpu_caps) != sizeof(vcpu->arch.cpu_caps));
/*
- * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
- * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
- * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
- * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
- * the core vCPU model on the fly. It would've been better to forbid any
- * KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately
- * some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do
+ * KVM does not correctly handle changing guest CPUID after KVM_RUN or
+ * while L2 is active, as MAXPHYADDR, GBPAGES support, AMD reserved bit
+ * behavior, etc. aren't tracked in kvm_mmu_page_role, and L2 state
+ * can't be adjusted (without breaking L2 in some way). As a result,
+ * KVM may reuse SPs/SPTEs and/or run L2 with bad/misconfigured state.
+ *
+ * In practice, no sane VMM mucks with the core vCPU model on the fly.
+ * It would've been better to forbid any KVM_SET_CPUID{,2} calls after
+ * KVM_RUN or KVM_SET_NESTED_STATE altogether, but unfortunately some
+ * VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do
* KVM_SET_CPUID{,2} again. To support this legacy behavior, check
* whether the supplied CPUID data is equal to what's already set.
*/
- if (kvm_vcpu_has_run(vcpu)) {
+ if (!kvm_can_set_cpuid_and_feature_msrs(vcpu)) {
r = kvm_cpuid_check_equal(vcpu, e2, nent);
if (r)
goto err;
@@ -823,10 +829,13 @@ do { \
/* DS is defined by ptrace-abi.h on 32-bit builds. */
#undef DS
-void kvm_set_cpu_caps(void)
+void kvm_initialize_cpu_caps(void)
{
memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps));
+ WARN_ON_ONCE(kvm_is_configuring_cpu_caps);
+ kvm_is_configuring_cpu_caps = true;
+
BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) >
sizeof(boot_cpu_data.x86_capability));
@@ -1025,6 +1034,7 @@ void kvm_set_cpu_caps(void)
F(AMX_FP16),
F(AVX_IFMA),
F(LAM),
+ F(MOVRS),
);
kvm_cpu_cap_init(CPUID_7_1_ECX,
@@ -1063,12 +1073,27 @@ void kvm_set_cpu_caps(void)
SCATTERED_F(SGX_EDECCSSA),
);
+ kvm_cpu_cap_init(CPUID_1E_1_EAX,
+ F(AMX_INT8_ALIAS),
+ F(AMX_BF16_ALIAS),
+ F(AMX_COMPLEX_ALIAS),
+ F(AMX_FP16_ALIAS),
+ F(AMX_FP8),
+ F(AMX_TF32),
+ F(AMX_AVX512),
+ F(AMX_MOVRS),
+ );
+
kvm_cpu_cap_init(CPUID_24_0_EBX,
F(AVX10_128),
F(AVX10_256),
F(AVX10_512),
);
+ kvm_cpu_cap_init(CPUID_24_1_ECX,
+ F(AVX10_VNNI_INT),
+ );
+
kvm_cpu_cap_init(CPUID_8000_0001_ECX,
F(LAHF_LM),
F(CMP_LEGACY),
@@ -1270,7 +1295,7 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_clear(X86_FEATURE_RDPID);
}
}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cpu_caps);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_initialize_cpu_caps);
#undef F
#undef SCATTERED_F
@@ -1624,6 +1649,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
break;
}
+
+ max_idx = entry->eax = min(entry->eax, 1u);
+
+ /* KVM only supports up to 0x1e.0x1, capped above via min(). */
+ if (max_idx >= 1) {
+ entry = do_host_cpuid(array, function, 1);
+ if (!entry)
+ goto out;
+
+ cpuid_entry_override(entry, CPUID_1E_1_EAX);
+ entry->ebx = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ }
break;
case 0x24: {
u8 avx10_version;
@@ -1633,18 +1672,30 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
break;
}
+ max_idx = entry->eax = min(entry->eax, 1u);
/*
* The AVX10 version is encoded in EBX[7:0]. Note, the version
* is guaranteed to be >=1 if AVX10 is supported. Note #2, the
* version needs to be captured before overriding EBX features!
*/
- avx10_version = min_t(u8, entry->ebx & 0xff, 1);
+ avx10_version = min_t(u8, entry->ebx & 0xff, 2);
cpuid_entry_override(entry, CPUID_24_0_EBX);
entry->ebx |= avx10_version;
- entry->eax = 0;
entry->ecx = 0;
entry->edx = 0;
+
+ /* KVM only supports up to 0x24.0x1, capped above via min(). */
+ if (max_idx >= 1) {
+ entry = do_host_cpuid(array, function, 1);
+ if (!entry)
+ goto out;
+
+ cpuid_entry_override(entry, CPUID_24_1_ECX);
+ entry->eax = 0;
+ entry->ebx = 0;
+ entry->edx = 0;
+ }
break;
}
case KVM_CPUID_SIGNATURE: {
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index d3f5ae15a7ca..039b8e6f40ba 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -8,7 +8,15 @@
#include <uapi/asm/kvm_para.h>
extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
-void kvm_set_cpu_caps(void);
+extern bool kvm_is_configuring_cpu_caps __read_mostly;
+
+void kvm_initialize_cpu_caps(void);
+
+static inline void kvm_finalize_cpu_caps(void)
+{
+ WARN_ON_ONCE(!kvm_is_configuring_cpu_caps);
+ kvm_is_configuring_cpu_caps = false;
+}
void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu);
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(struct kvm_cpuid_entry2 *entries,
@@ -188,6 +196,7 @@ static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
{
unsigned int x86_leaf = __feature_leaf(x86_feature);
+ WARN_ON_ONCE(!kvm_is_configuring_cpu_caps);
kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature);
}
@@ -195,6 +204,7 @@ static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature)
{
unsigned int x86_leaf = __feature_leaf(x86_feature);
+ WARN_ON_ONCE(!kvm_is_configuring_cpu_caps);
kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature);
}
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 2c2783296aed..a26fa4222f29 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -561,7 +561,7 @@ static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu,
spin_lock(&ioapic->lock);
if (trigger_mode != IOAPIC_LEVEL_TRIG ||
- kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)
+ kvm_lapic_suppress_eoi_broadcast(apic))
return;
ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 1597dd0b0cc6..738ec3c1b0b5 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -105,6 +105,63 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
apic_test_vector(vector, apic->regs + APIC_IRR);
}
+static bool kvm_lapic_advertise_suppress_eoi_broadcast(struct kvm *kvm)
+{
+ switch (kvm->arch.suppress_eoi_broadcast_mode) {
+ case KVM_SUPPRESS_EOI_BROADCAST_ENABLED:
+ return true;
+ case KVM_SUPPRESS_EOI_BROADCAST_DISABLED:
+ return false;
+ case KVM_SUPPRESS_EOI_BROADCAST_QUIRKED:
+ /*
+ * The default in-kernel I/O APIC emulates the 82093AA and does not
+ * implement an EOI register. Some guests (e.g. Windows with the
+ * Hyper-V role enabled) disable LAPIC EOI broadcast without
+ * checking the I/O APIC version, which can cause level-triggered
+ * interrupts to never be EOI'd.
+ *
+ * To avoid this, KVM doesn't advertise Suppress EOI Broadcast
+ * support when using the default in-kernel I/O APIC.
+ *
+ * Historically, in split IRQCHIP mode, KVM always advertised
+ * Suppress EOI Broadcast support but did not actually suppress
+ * EOIs, resulting in quirky behavior.
+ */
+ return !ioapic_in_kernel(kvm);
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+}
+
+bool kvm_lapic_suppress_eoi_broadcast(struct kvm_lapic *apic)
+{
+ struct kvm *kvm = apic->vcpu->kvm;
+
+ if (!(kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
+ return false;
+
+ switch (kvm->arch.suppress_eoi_broadcast_mode) {
+ case KVM_SUPPRESS_EOI_BROADCAST_ENABLED:
+ return true;
+ case KVM_SUPPRESS_EOI_BROADCAST_DISABLED:
+ return false;
+ case KVM_SUPPRESS_EOI_BROADCAST_QUIRKED:
+ /*
+ * Historically, in split IRQCHIP mode, KVM ignored the suppress
+ * EOI broadcast bit set by the guest and broadcast EOIs to the
+ * userspace I/O APIC. For the in-kernel I/O APIC, support is not
+ * advertised and can only be enabled via KVM_SET_LAPIC, and KVM's
+ * I/O APIC doesn't emulate Directed EOIs; but if the feature is
+ * enabled, it is respected (with odd behavior).
+ */
+ return ioapic_in_kernel(kvm);
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+}
+
__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_has_noapic_vcpu);
@@ -554,15 +611,9 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
v = APIC_VERSION | ((apic->nr_lvt_entries - 1) << 16);
- /*
- * KVM emulates 82093AA datasheet (with in-kernel IOAPIC implementation)
- * which doesn't have EOI register; Some buggy OSes (e.g. Windows with
- * Hyper-V role) disable EOI broadcast in lapic not checking for IOAPIC
- * version first and level-triggered interrupts never get EOIed in
- * IOAPIC.
- */
+
if (guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) &&
- !ioapic_in_kernel(vcpu->kvm))
+ kvm_lapic_advertise_suppress_eoi_broadcast(vcpu->kvm))
v |= APIC_LVR_DIRECTED_EOI;
kvm_lapic_set_reg(apic, APIC_LVR, v);
}
@@ -1517,6 +1568,15 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
/* Request a KVM exit to inform the userspace IOAPIC. */
if (irqchip_split(apic->vcpu->kvm)) {
+ /*
+ * Don't exit to userspace if the guest has enabled Directed
+ * EOI, a.k.a. Suppress EOI Broadcasts, in which case the local
+ * APIC doesn't broadcast EOIs (the guest must EOI the target
+ * I/O APIC(s) directly).
+ */
+ if (kvm_lapic_suppress_eoi_broadcast(apic))
+ return;
+
apic->vcpu->arch.pending_ioapic_eoi = vector;
kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu);
return;
@@ -3498,7 +3558,6 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
* wait-for-SIPI (WFS).
*/
if (!kvm_apic_init_sipi_allowed(vcpu)) {
- WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED);
clear_bit(KVM_APIC_SIPI, &apic->pending_events);
return 0;
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 282b9b7da98c..e5f5a222eced 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -231,6 +231,8 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
+bool kvm_lapic_suppress_eoi_broadcast(struct kvm_lapic *apic);
+
void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu);
void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq,
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 02c450686b4a..3911ac9bddfd 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4521,7 +4521,10 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu,
arch.gfn = fault->gfn;
arch.error_code = fault->error_code;
arch.direct_map = vcpu->arch.mmu->root_role.direct;
- arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu);
+ if (arch.direct_map)
+ arch.cr3 = (unsigned long)INVALID_GPA;
+ else
+ arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu);
return kvm_setup_async_pf(vcpu, fault->addr,
kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch);
@@ -6031,11 +6034,7 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
vcpu->arch.nested_mmu.cpu_role.ext.valid = 0;
kvm_mmu_reset_context(vcpu);
- /*
- * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
- * kvm_arch_vcpu_ioctl().
- */
- KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm);
+ KVM_BUG_ON(!kvm_can_set_cpuid_and_feature_msrs(vcpu), vcpu->kvm);
}
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 487ad19a236e..ff20b4102173 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -853,7 +853,7 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
+ if (KVM_BUG_ON(!kvm_can_set_cpuid_and_feature_msrs(vcpu), vcpu->kvm))
return;
/*
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
index 81b4a7acf72e..657f5f743ed9 100644
--- a/arch/x86/kvm/reverse_cpuid.h
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -44,11 +44,28 @@
#define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4)
#define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5)
+/*
+ * Intel-defined sub-features, CPUID level 0x0000001E:1 (EAX). Note, several
+ * of the bits are aliases to features of the same name that are enumerated via
+ * various CPUID.0x7 sub-leafs.
+ */
+#define X86_FEATURE_AMX_INT8_ALIAS KVM_X86_FEATURE(CPUID_1E_1_EAX, 0)
+#define X86_FEATURE_AMX_BF16_ALIAS KVM_X86_FEATURE(CPUID_1E_1_EAX, 1)
+#define X86_FEATURE_AMX_COMPLEX_ALIAS KVM_X86_FEATURE(CPUID_1E_1_EAX, 2)
+#define X86_FEATURE_AMX_FP16_ALIAS KVM_X86_FEATURE(CPUID_1E_1_EAX, 3)
+#define X86_FEATURE_AMX_FP8 KVM_X86_FEATURE(CPUID_1E_1_EAX, 4)
+#define X86_FEATURE_AMX_TF32 KVM_X86_FEATURE(CPUID_1E_1_EAX, 6)
+#define X86_FEATURE_AMX_AVX512 KVM_X86_FEATURE(CPUID_1E_1_EAX, 7)
+#define X86_FEATURE_AMX_MOVRS KVM_X86_FEATURE(CPUID_1E_1_EAX, 8)
+
/* Intel-defined sub-features, CPUID level 0x00000024:0 (EBX) */
#define X86_FEATURE_AVX10_128 KVM_X86_FEATURE(CPUID_24_0_EBX, 16)
#define X86_FEATURE_AVX10_256 KVM_X86_FEATURE(CPUID_24_0_EBX, 17)
#define X86_FEATURE_AVX10_512 KVM_X86_FEATURE(CPUID_24_0_EBX, 18)
+/* Intel-defined sub-features, CPUID level 0x00000024:1 (ECX) */
+#define X86_FEATURE_AVX10_VNNI_INT KVM_X86_FEATURE(CPUID_24_1_ECX, 2)
+
/* CPUID level 0x80000007 (EDX). */
#define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8)
@@ -90,6 +107,8 @@ static const struct cpuid_reg reverse_cpuid[] = {
[CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX},
[CPUID_8000_0021_ECX] = {0x80000021, 0, CPUID_ECX},
[CPUID_7_1_ECX] = { 7, 1, CPUID_ECX},
+ [CPUID_1E_1_EAX] = { 0x1e, 1, CPUID_EAX},
+ [CPUID_24_1_ECX] = { 0x24, 1, CPUID_ECX},
};
/*
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 8b0ac67becae..9ee74c57bd51 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -5259,7 +5259,7 @@ static __init void svm_adjust_mmio_mask(void)
static __init void svm_set_cpu_caps(void)
{
- kvm_set_cpu_caps();
+ kvm_initialize_cpu_caps();
kvm_caps.supported_perf_cap = 0;
@@ -5343,6 +5343,7 @@ static __init void svm_set_cpu_caps(void)
kvm_cpu_cap_clear(X86_FEATURE_MSR_IMM);
kvm_setup_xss_caps();
+ kvm_finalize_cpu_caps();
}
static __init int svm_hardware_setup(void)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 530981a42c96..edf12bf58578 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -8039,7 +8039,7 @@ static __init u64 vmx_get_perf_capabilities(void)
static __init void vmx_set_cpu_caps(void)
{
- kvm_set_cpu_caps();
+ kvm_initialize_cpu_caps();
/* CPUID 0x1 */
if (nested)
@@ -8098,6 +8098,7 @@ static __init void vmx_set_cpu_caps(void)
}
kvm_setup_xss_caps();
+ kvm_finalize_cpu_caps();
}
static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1f66c7441272..08a6d6e20e9b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -121,8 +121,10 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE
-#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
- KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
+ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK | \
+ KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST | \
+ KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST)
static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
@@ -2314,13 +2316,14 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
u64 val;
/*
- * Disallow writes to immutable feature MSRs after KVM_RUN. KVM does
- * not support modifying the guest vCPU model on the fly, e.g. changing
- * the nVMX capabilities while L2 is running is nonsensical. Allow
- * writes of the same value, e.g. to allow userspace to blindly stuff
- * all MSRs when emulating RESET.
+ * Reject writes to immutable feature MSRs if the vCPU model is frozen,
+ * as KVM doesn't support modifying the guest vCPU model on the fly,
+ * e.g. changing the VMX capabilities MSRs while L2 is active is
+ * nonsensical. Allow writes of the same value, e.g. so that userspace
+ * can blindly stuff all MSRs when emulating RESET.
*/
- if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index) &&
+ if (!kvm_can_set_cpuid_and_feature_msrs(vcpu) &&
+ kvm_is_immutable_feature_msr(index) &&
(do_get_msr(vcpu, index, &val) || *data != val))
return -EINVAL;
@@ -4096,47 +4099,47 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_KVM_WALL_CLOCK_NEW:
if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
- return 1;
+ return KVM_MSR_RET_UNSUPPORTED;
vcpu->kvm->arch.wall_clock = data;
kvm_write_wall_clock(vcpu->kvm, data, 0);
break;
case MSR_KVM_WALL_CLOCK:
if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
- return 1;
+ return KVM_MSR_RET_UNSUPPORTED;
vcpu->kvm->arch.wall_clock = data;
kvm_write_wall_clock(vcpu->kvm, data, 0);
break;
case MSR_KVM_SYSTEM_TIME_NEW:
if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
- return 1;
+ return KVM_MSR_RET_UNSUPPORTED;
kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
break;
case MSR_KVM_SYSTEM_TIME:
if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
- return 1;
+ return KVM_MSR_RET_UNSUPPORTED;
kvm_write_system_time(vcpu, data, true, msr_info->host_initiated);
break;
case MSR_KVM_ASYNC_PF_EN:
if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
- return 1;
+ return KVM_MSR_RET_UNSUPPORTED;
if (kvm_pv_enable_async_pf(vcpu, data))
return 1;
break;
case MSR_KVM_ASYNC_PF_INT:
if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
- return 1;
+ return KVM_MSR_RET_UNSUPPORTED;
if (kvm_pv_enable_async_pf_int(vcpu, data))
return 1;
break;
case MSR_KVM_ASYNC_PF_ACK:
if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
- return 1;
+ return KVM_MSR_RET_UNSUPPORTED;
if (data & 0x1) {
/*
* Pairs with the smp_mb__after_atomic() in
@@ -4149,7 +4152,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_KVM_STEAL_TIME:
if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
- return 1;
+ return KVM_MSR_RET_UNSUPPORTED;
if (unlikely(!sched_info_on()))
return 1;
@@ -4167,7 +4170,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_KVM_PV_EOI_EN:
if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
- return 1;
+ return KVM_MSR_RET_UNSUPPORTED;
if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8)))
return 1;
@@ -4175,7 +4178,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_KVM_POLL_CONTROL:
if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
- return 1;
+ return KVM_MSR_RET_UNSUPPORTED;
/* only enable bit supported */
if (data & (-1ULL << 1))
@@ -4476,61 +4479,61 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_KVM_WALL_CLOCK:
if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
- return 1;
+ return KVM_MSR_RET_UNSUPPORTED;
msr_info->data = vcpu->kvm->arch.wall_clock;
break;
case MSR_KVM_WALL_CLOCK_NEW:
if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
- return 1;
+ return KVM_MSR_RET_UNSUPPORTED;
msr_info->data = vcpu->kvm->arch.wall_clock;
break;
case MSR_KVM_SYSTEM_TIME:
if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
- return 1;
+ return KVM_MSR_RET_UNSUPPORTED;
msr_info->data = vcpu->arch.time;
break;
case MSR_KVM_SYSTEM_TIME_NEW:
if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
- return 1;
+ return KVM_MSR_RET_UNSUPPORTED;
msr_info->data = vcpu->arch.time;
break;
case MSR_KVM_ASYNC_PF_EN:
if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
- return 1;
+ return KVM_MSR_RET_UNSUPPORTED;
msr_info->data = vcpu->arch.apf.msr_en_val;
break;
case MSR_KVM_ASYNC_PF_INT:
if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
- return 1;
+ return KVM_MSR_RET_UNSUPPORTED;
msr_info->data = vcpu->arch.apf.msr_int_val;
break;
case MSR_KVM_ASYNC_PF_ACK:
if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
- return 1;
+ return KVM_MSR_RET_UNSUPPORTED;
msr_info->data = 0;
break;
case MSR_KVM_STEAL_TIME:
if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
- return 1;
+ return KVM_MSR_RET_UNSUPPORTED;
msr_info->data = vcpu->arch.st.msr_val;
break;
case MSR_KVM_PV_EOI_EN:
if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
- return 1;
+ return KVM_MSR_RET_UNSUPPORTED;
msr_info->data = vcpu->arch.pv_eoi.msr_val;
break;
case MSR_KVM_POLL_CONTROL:
if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
- return 1;
+ return KVM_MSR_RET_UNSUPPORTED;
msr_info->data = vcpu->arch.msr_kvm_poll_control;
break;
@@ -4931,6 +4934,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
case KVM_CAP_X2APIC_API:
r = KVM_X2APIC_API_VALID_FLAGS;
+ if (kvm && !irqchip_split(kvm))
+ r &= ~KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST;
break;
case KVM_CAP_NESTED_STATE:
r = kvm_x86_ops.nested_ops->get_state ?
@@ -6748,11 +6753,24 @@ split_irqchip_unlock:
if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
break;
+ if ((cap->args[0] & KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST) &&
+ (cap->args[0] & KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST))
+ break;
+
+ if ((cap->args[0] & KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST) &&
+ !irqchip_split(kvm))
+ break;
+
if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
kvm->arch.x2apic_format = true;
if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
kvm->arch.x2apic_broadcast_quirk_disabled = true;
+ if (cap->args[0] & KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST)
+ kvm->arch.suppress_eoi_broadcast_mode = KVM_SUPPRESS_EOI_BROADCAST_ENABLED;
+ if (cap->args[0] & KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST)
+ kvm->arch.suppress_eoi_broadcast_mode = KVM_SUPPRESS_EOI_BROADCAST_DISABLED;
+
r = 0;
break;
case KVM_CAP_X86_DISABLE_EXITS:
@@ -11609,8 +11627,7 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu)
if (is_guest_mode(vcpu)) {
int r = kvm_check_nested_events(vcpu);
- WARN_ON_ONCE(r == -EBUSY);
- if (r < 0)
+ if (r < 0 && r != -EBUSY)
return 0;
}
@@ -12158,9 +12175,11 @@ static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
return;
if (is_pae_paging(vcpu)) {
+ kvm_vcpu_srcu_read_lock(vcpu);
for (i = 0 ; i < 4 ; i++)
sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i);
sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
+ kvm_vcpu_srcu_read_unlock(vcpu);
}
}
@@ -13320,7 +13339,7 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm)
#endif
kvm_mmu_pre_destroy_vm(kvm);
- static_call_cond(kvm_x86_vm_pre_destroy)(kvm);
+ kvm_x86_call(vm_pre_destroy)(kvm);
}
void kvm_arch_destroy_vm(struct kvm *kvm)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 00de24f55b1f..ff20e62d98c6 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -172,9 +172,20 @@ static inline void kvm_nested_vmexit_handle_ibrs(struct kvm_vcpu *vcpu)
indirect_branch_prediction_barrier();
}
-static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu)
+/*
+ * Disallow modifying CPUID and feature MSRs, which affect the core virtual CPU
+ * model exposed to the guest and virtualized by KVM, if the vCPU has already
+ * run or is in guest mode (L2). In both cases, KVM has already consumed the
+ * current virtual CPU model, and doesn't support "unwinding" to react to the
+ * new model.
+ *
+ * Note, the only way is_guest_mode() can be true with 'last_vmentry_cpu == -1'
+ * is if userspace sets CPUID and feature MSRs (to enable VMX/SVM), then sets
+ * nested state, and then attempts to set CPUID and/or feature MSRs *again*.
+ */
+static inline bool kvm_can_set_cpuid_and_feature_msrs(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.last_vmentry_cpu != -1;
+ return vcpu->arch.last_vmentry_cpu == -1 && !is_guest_mode(vcpu);
}
static inline void kvm_set_mp_state(struct kvm_vcpu *vcpu, int mp_state)