From cfbe371194d1f342bdd88f87a9b36407d1ec0f52 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 12 Jan 2026 15:28:05 -0800 Subject: KVM: SVM: Check vCPU ID against max x2AVIC ID if and only if x2AVIC is enabled When allocating the AVIC backing page, only check one of the max AVIC vs. x2AVIC ID based on whether or not x2AVIC is enabled. Doing so fixes a bug where KVM incorrectly inhibits AVIC if x2AVIC is _disabled_ and any vCPU with a non-zero APIC ID is created, as x2avic_max_physical_id is left '0' when x2AVIC is disabled. Fixes: 940fc47cfb0d ("KVM: SVM: Add AVIC support for 4k vCPUs in x2AVIC mode") Cc: stable@vger.kernel.org Cc: Naveen N Rao (AMD) Cc: Suravee Suthikulpanit Reviewed-by: Naveen N Rao (AMD) Link: https://patch.msgid.link/20260112232805.1512361-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 6b77b2033208..0f6c8596719b 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -376,6 +376,7 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) static int avic_init_backing_page(struct kvm_vcpu *vcpu) { + u32 max_id = x2avic_enabled ? x2avic_max_physical_id : AVIC_MAX_PHYSICAL_ID; struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); struct vcpu_svm *svm = to_svm(vcpu); u32 id = vcpu->vcpu_id; @@ -388,8 +389,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) * avic_vcpu_load() expects to be called if and only if the vCPU has * fully initialized AVIC. */ - if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) || - (id > x2avic_max_physical_id)) { + if (id > max_id) { kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG); vcpu->arch.apic->apicv_active = false; return 0; -- cgit v1.2.3 From b4d37cdb77a0015f51fee083598fa227cc07aaf1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jan 2026 09:46:05 -0800 Subject: KVM: Don't clobber irqfd routing type when deassigning irqfd When deassigning a KVM_IRQFD, don't clobber the irqfd's copy of the IRQ's routing entry as doing so breaks kvm_arch_irq_bypass_del_producer() on x86 and arm64, which explicitly look for KVM_IRQ_ROUTING_MSI. Instead, to handle a concurrent routing update, verify that the irqfd is still active before consuming the routing information. As evidenced by the x86 and arm64 bugs, and another bug in kvm_arch_update_irqfd_routing() (see below), clobbering the entry type without notifying arch code is surprising and error prone. As a bonus, checking that the irqfd is active provides a convenient location for documenting _why_ KVM must not consume the routing entry for an irqfd that is in the process of being deassigned: once the irqfd is deleted from the list (which happens *before* the eventfd is detached), it will no longer receive updates via kvm_irq_routing_update(), and so KVM could deliver an event using stale routing information (relative to KVM_SET_GSI_ROUTING returning to userspace). As an even better bonus, explicitly checking for the irqfd being active fixes a similar bug to the one the clobbering is trying to prevent: if an irqfd is deactivated, and then its routing is changed, kvm_irq_routing_update() won't invoke kvm_arch_update_irqfd_routing() (because the irqfd isn't in the list). And so if the irqfd is in bypass mode, IRQs will continue to be posted using the old routing information. As for kvm_arch_irq_bypass_del_producer(), clobbering the routing type results in KVM incorrectly keeping the IRQ in bypass mode, which is especially problematic on AMD as KVM tracks IRQs that are being posted to a vCPU in a list whose lifetime is tied to the irqfd. Without the help of KASAN to detect use-after-free, the most common sympton on AMD is a NULL pointer deref in amd_iommu_update_ga() due to the memory for irqfd structure being re-allocated and zeroed, resulting in irqfd->irq_bypass_data being NULL when read by avic_update_iommu_vcpu_affinity(): BUG: kernel NULL pointer dereference, address: 0000000000000018 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 40cf2b9067 P4D 40cf2b9067 PUD 408362a067 PMD 0 Oops: Oops: 0000 [#1] SMP CPU: 6 UID: 0 PID: 40383 Comm: vfio_irq_test Tainted: G U W O 6.19.0-smp--5dddc257e6b2-irqfd #31 NONE Tainted: [U]=USER, [W]=WARN, [O]=OOT_MODULE Hardware name: Google, Inc. Arcadia_IT_80/Arcadia_IT_80, BIOS 34.78.2-0 09/05/2025 RIP: 0010:amd_iommu_update_ga+0x19/0xe0 Call Trace: avic_update_iommu_vcpu_affinity+0x3d/0x90 [kvm_amd] __avic_vcpu_load+0xf4/0x130 [kvm_amd] kvm_arch_vcpu_load+0x89/0x210 [kvm] vcpu_load+0x30/0x40 [kvm] kvm_arch_vcpu_ioctl_run+0x45/0x620 [kvm] kvm_vcpu_ioctl+0x571/0x6a0 [kvm] __se_sys_ioctl+0x6d/0xb0 do_syscall_64+0x6f/0x9d0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x46893b ---[ end trace 0000000000000000 ]--- If AVIC is inhibited when the irfd is deassigned, the bug will manifest as list corruption, e.g. on the next irqfd assignment. list_add corruption. next->prev should be prev (ffff8d474d5cd588), but was 0000000000000000. (next=ffff8d8658f86530). ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:31! Oops: invalid opcode: 0000 [#1] SMP CPU: 128 UID: 0 PID: 80818 Comm: vfio_irq_test Tainted: G U W O 6.19.0-smp--f19dc4d680ba-irqfd #28 NONE Tainted: [U]=USER, [W]=WARN, [O]=OOT_MODULE Hardware name: Google, Inc. Arcadia_IT_80/Arcadia_IT_80, BIOS 34.78.2-0 09/05/2025 RIP: 0010:__list_add_valid_or_report+0x97/0xc0 Call Trace: avic_pi_update_irte+0x28e/0x2b0 [kvm_amd] kvm_pi_update_irte+0xbf/0x190 [kvm] kvm_arch_irq_bypass_add_producer+0x72/0x90 [kvm] irq_bypass_register_consumer+0xcd/0x170 [irqbypass] kvm_irqfd+0x4c6/0x540 [kvm] kvm_vm_ioctl+0x118/0x5d0 [kvm] __se_sys_ioctl+0x6d/0xb0 do_syscall_64+0x6f/0x9d0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 ---[ end trace 0000000000000000 ]--- On Intel and arm64, the bug is less noisy, as the end result is that the device keeps posting IRQs to the vCPU even after it's been deassigned. Note, the worst of the breakage can be traced back to commit cb210737675e ("KVM: Pass new routing entries and irqfd when updating IRTEs"), as before that commit KVM would pull the routing information from the per-VM routing table. But as above, similar bugs have existed since support for IRQ bypass was added. E.g. if a routing change finished before irq_shutdown() invoked kvm_arch_irq_bypass_del_producer(), VMX and SVM would see stale routing information and potentially leave the irqfd in bypass mode. Alternatively, x86 could be fixed by explicitly checking irq_bypass_vcpu instead of irq_entry.type in kvm_arch_irq_bypass_del_producer(), and arm64 could be modified to utilize irq_bypass_vcpu in a similar manner. But (a) that wouldn't fix the routing updates bug, and (b) fixing core code doesn't preclude x86 (or arm64) from adding such code as a sanity check (spoiler alert). Fixes: f70c20aaf141 ("KVM: Add an arch specific hooks in 'struct kvm_kernel_irqfd'") Fixes: cb210737675e ("KVM: Pass new routing entries and irqfd when updating IRTEs") Fixes: a0d7e2fc61ab ("KVM: arm64: vgic-v4: Only attempt vLPI mapping for actual MSIs") Cc: stable@vger.kernel.org Cc: Marc Zyngier Cc: Oliver Upton Link: https://patch.msgid.link/20260113174606.104978-2-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/eventfd.c | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 0e8b5277be3b..a369b20d47f0 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -157,21 +157,28 @@ irqfd_shutdown(struct work_struct *work) } -/* assumes kvm->irqfds.lock is held */ -static bool -irqfd_is_active(struct kvm_kernel_irqfd *irqfd) +static bool irqfd_is_active(struct kvm_kernel_irqfd *irqfd) { + /* + * Assert that either irqfds.lock or SRCU is held, as irqfds.lock must + * be held to prevent false positives (on the irqfd being active), and + * while false negatives are impossible as irqfds are never added back + * to the list once they're deactivated, the caller must at least hold + * SRCU to guard against routing changes if the irqfd is deactivated. + */ + lockdep_assert_once(lockdep_is_held(&irqfd->kvm->irqfds.lock) || + srcu_read_lock_held(&irqfd->kvm->irq_srcu)); + return list_empty(&irqfd->list) ? false : true; } /* * Mark the irqfd as inactive and schedule it for removal - * - * assumes kvm->irqfds.lock is held */ -static void -irqfd_deactivate(struct kvm_kernel_irqfd *irqfd) +static void irqfd_deactivate(struct kvm_kernel_irqfd *irqfd) { + lockdep_assert_held(&irqfd->kvm->irqfds.lock); + BUG_ON(!irqfd_is_active(irqfd)); list_del_init(&irqfd->list); @@ -217,8 +224,15 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) seq = read_seqcount_begin(&irqfd->irq_entry_sc); irq = irqfd->irq_entry; } while (read_seqcount_retry(&irqfd->irq_entry_sc, seq)); - /* An event has been signaled, inject an interrupt */ - if (kvm_arch_set_irq_inatomic(&irq, kvm, + + /* + * An event has been signaled, inject an interrupt unless the + * irqfd is being deassigned (isn't active), in which case the + * routing information may be stale (once the irqfd is removed + * from the list, it will stop receiving routing updates). + */ + if (unlikely(!irqfd_is_active(irqfd)) || + kvm_arch_set_irq_inatomic(&irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, false) == -EWOULDBLOCK) schedule_work(&irqfd->inject); @@ -585,18 +599,8 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args) spin_lock_irq(&kvm->irqfds.lock); list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { - if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) { - /* - * This clearing of irq_entry.type is needed for when - * another thread calls kvm_irq_routing_update before - * we flush workqueue below (we synchronize with - * kvm_irq_routing_update using irqfds.lock). - */ - write_seqcount_begin(&irqfd->irq_entry_sc); - irqfd->irq_entry.type = 0; - write_seqcount_end(&irqfd->irq_entry_sc); + if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) irqfd_deactivate(irqfd); - } } spin_unlock_irq(&kvm->irqfds.lock); -- cgit v1.2.3 From ef3719e33e6649164382c629d58704b828f56079 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 13 Jan 2026 09:46:06 -0800 Subject: KVM: x86: Assert that non-MSI doesn't have bypass vCPU when deleting producer When disconnecting a non-MSI irqfd from an IRQ bypass producer, WARN if the irqfd is configured for IRQ bypass and set its IRTE back to remapped mode to harden against kernel/KVM bugs (keeping the irqfd in bypass mode is often fatal to the host). Deactivating an irqfd (removing it from the list of irqfds), updating irqfd routes, and the code in question are all mutually exclusive (all run under irqfds.lock). If an irqfd is configured for bypass, and the irqfd is deassigned at the same time IRQ routing is updated (to change the routing to non-MSI), then either kvm_arch_update_irqfd_routing() should process the irqfd routing change and put the IRTE into remapped mode (routing update "wins"), or kvm_arch_irq_bypass_del_producer() should see the MSI routing info (deactivation "wins"). Link: https://patch.msgid.link/20260113174606.104978-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/irq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 7cc8950005b6..4c7688670c2d 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -514,7 +514,8 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, */ spin_lock_irq(&kvm->irqfds.lock); - if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) { + if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI || + WARN_ON_ONCE(irqfd->irq_bypass_vcpu)) { ret = kvm_pi_update_irte(irqfd, NULL); if (ret) pr_info("irq bypass consumer (eventfd %p) unregistration fails: %d\n", -- cgit v1.2.3 From e396a74222654486d6ab45dca5d0c54c408b8b91 Mon Sep 17 00:00:00 2001 From: Zhiquan Li Date: Thu, 22 Jan 2026 13:35:50 +0800 Subject: KVM: selftests: Add -U_FORTIFY_SOURCE to avoid some unpredictable test failures Some distributions (such as Ubuntu) configure GCC so that _FORTIFY_SOURCE is automatically enabled at -O1 or above. This results in some fortified version of definitions of standard library functions are included. While linker resolves the symbols, the fortified versions might override the definitions in lib/string_override.c and reference to those PLT entries in GLIBC. This is not a problem for the code in host, but it is a disaster for the guest code. E.g., if build and run x86/nested_emulation_test on Ubuntu 24.04 will encounter a L1 #PF due to memset() reference to __memset_chk@plt. The option -fno-builtin-memset is not helpful here, because those fortified versions are not built-in but some definitions which are included by header, they are for different intentions. In order to eliminate the unpredictable behaviors may vary depending on the linker and platform, add the "-U_FORTIFY_SOURCE" into CFLAGS to prevent from introducing the fortified definitions. Signed-off-by: Zhiquan Li Link: https://patch.msgid.link/20260122053551.548229-1-zhiquan_li@163.com Fixes: 6b6f71484bf4 ("KVM: selftests: Implement memcmp(), memcpy(), and memset() for guest use") Cc: stable@vger.kernel.org [sean: tag for stable] Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index ba5c2b643efa..d45bf4ccb3bf 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -251,6 +251,7 @@ LINUX_TOOL_INCLUDE = $(top_srcdir)/tools/include LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \ -Wno-gnu-variable-sized-type-not-at-end -MD -MP -DCONFIG_64BIT \ + -U_FORTIFY_SOURCE \ -fno-builtin-memcmp -fno-builtin-memcpy \ -fno-builtin-memset -fno-builtin-strnlen \ -fno-stack-protector -fno-PIE -fno-strict-aliasing \ -- cgit v1.2.3 From f8ade833b733ae0b72e87ac6d2202a1afbe3eb4a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 27 Jan 2026 17:43:08 -0800 Subject: KVM: x86: Explicitly configure supported XSS from {svm,vmx}_set_cpu_caps() Explicitly configure KVM's supported XSS as part of each vendor's setup flow to fix a bug where clearing SHSTK and IBT in kvm_cpu_caps, e.g. due to lack of CET XFEATURE support, makes kvm-intel.ko unloadable when nested VMX is enabled, i.e. when nested=1. The late clearing results in nested_vmx_setup_{entry,exit}_ctls() clearing VM_{ENTRY,EXIT}_LOAD_CET_STATE when nested_vmx_setup_ctls_msrs() runs during the CPU compatibility checks, ultimately leading to a mismatched VMCS config due to the reference config having the CET bits set, but every CPU's "local" config having the bits cleared. Note, kvm_caps.supported_{xcr0,xss} are unconditionally initialized by kvm_x86_vendor_init(), before calling into vendor code, and not referenced between ops->hardware_setup() and their current/old location. Fixes: 69cc3e886582 ("KVM: x86: Add XSS support for CET_KERNEL and CET_USER") Cc: stable@vger.kernel.org Cc: Mathias Krause Cc: John Allen Cc: Rick Edgecombe Cc: Chao Gao Cc: Binbin Wu Cc: Xiaoyao Li Reviewed-by: Xiaoyao Li Reviewed-by: Binbin Wu Link: https://patch.msgid.link/20260128014310.3255561-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 2 ++ arch/x86/kvm/vmx/vmx.c | 2 ++ arch/x86/kvm/x86.c | 30 +++++++++++++++++------------- arch/x86/kvm/x86.h | 2 ++ 4 files changed, 23 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 24d59ccfa40d..4394be40fe78 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5284,6 +5284,8 @@ static __init void svm_set_cpu_caps(void) */ kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT); kvm_cpu_cap_clear(X86_FEATURE_MSR_IMM); + + kvm_setup_xss_caps(); } static __init int svm_hardware_setup(void) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6b96f7aea20b..8c94241fbcca 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8051,6 +8051,8 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_clear(X86_FEATURE_SHSTK); kvm_cpu_cap_clear(X86_FEATURE_IBT); } + + kvm_setup_xss_caps(); } static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 63afdb6bb078..72d37c8930ad 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9953,6 +9953,23 @@ static struct notifier_block pvclock_gtod_notifier = { }; #endif +void kvm_setup_xss_caps(void) +{ + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) + kvm_caps.supported_xss = 0; + + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && + !kvm_cpu_cap_has(X86_FEATURE_IBT)) + kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; + + if ((kvm_caps.supported_xss & XFEATURE_MASK_CET_ALL) != XFEATURE_MASK_CET_ALL) { + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); + kvm_cpu_cap_clear(X86_FEATURE_IBT); + kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; + } +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_setup_xss_caps); + static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) { memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); @@ -10125,19 +10142,6 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (!tdp_enabled) kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; - if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) - kvm_caps.supported_xss = 0; - - if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && - !kvm_cpu_cap_has(X86_FEATURE_IBT)) - kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; - - if ((kvm_caps.supported_xss & XFEATURE_MASK_CET_ALL) != XFEATURE_MASK_CET_ALL) { - kvm_cpu_cap_clear(X86_FEATURE_SHSTK); - kvm_cpu_cap_clear(X86_FEATURE_IBT); - kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; - } - if (kvm_caps.has_tsc_control) { /* * Make sure the user can only configure tsc_khz values that diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index fdab0ad49098..00de24f55b1f 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -471,6 +471,8 @@ extern struct kvm_host_values kvm_host; extern bool enable_pmu; +void kvm_setup_xss_caps(void); + /* * Get a filtered version of KVM's supported XCR0 that strips out dynamic * features for which the current process doesn't (yet) have permission to use. -- cgit v1.2.3