From ff083a2d972f56bebfd82409ca62e5dfce950961 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:22 +0000 Subject: perf: Protect perf_guest_cbs with RCU Protect perf_guest_cbs with RCU to fix multiple possible errors. Luckily, all paths that read perf_guest_cbs already require RCU protection, e.g. to protect the callback chains, so only the direct perf_guest_cbs touchpoints need to be modified. Bug #1 is a simple lack of WRITE_ONCE/READ_ONCE behavior to ensure perf_guest_cbs isn't reloaded between a !NULL check and a dereference. Fixed via the READ_ONCE() in rcu_dereference(). Bug #2 is that on weakly-ordered architectures, updates to the callbacks themselves are not guaranteed to be visible before the pointer is made visible to readers. Fixed by the smp_store_release() in rcu_assign_pointer() when the new pointer is non-NULL. Bug #3 is that, because the callbacks are global, it's possible for readers to run in parallel with an unregisters, and thus a module implementing the callbacks can be unloaded while readers are in flight, resulting in a use-after-free. Fixed by a synchronize_rcu() call when unregistering callbacks. Bug #1 escaped notice because it's extremely unlikely a compiler will reload perf_guest_cbs in this sequence. perf_guest_cbs does get reloaded for future derefs, e.g. for ->is_user_mode(), but the ->is_in_guest() guard all but guarantees the consumer will win the race, e.g. to nullify perf_guest_cbs, KVM has to completely exit the guest and teardown down all VMs before KVM start its module unload / unregister sequence. This also makes it all but impossible to encounter bug #3. Bug #2 has not been a problem because all architectures that register callbacks are strongly ordered and/or have a static set of callbacks. But with help, unloading kvm_intel can trigger bug #1 e.g. wrapping perf_guest_cbs with READ_ONCE in perf_misc_flags() while spamming kvm_intel module load/unload leads to: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP CPU: 6 PID: 1825 Comm: stress Not tainted 5.14.0-rc2+ #459 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:perf_misc_flags+0x1c/0x70 Call Trace: perf_prepare_sample+0x53/0x6b0 perf_event_output_forward+0x67/0x160 __perf_event_overflow+0x52/0xf0 handle_pmi_common+0x207/0x300 intel_pmu_handle_irq+0xcf/0x410 perf_event_nmi_handler+0x28/0x50 nmi_handle+0xc7/0x260 default_do_nmi+0x6b/0x170 exc_nmi+0x103/0x130 asm_exc_nmi+0x76/0xbf Fixes: 39447b386c84 ("perf: Enhance perf to allow for guest statistic collection from host") Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20211111020738.2512932-2-seanjc@google.com --- kernel/events/core.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 523106a506ee..c552e1bfcaea 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6526,18 +6526,25 @@ static void perf_pending_event(struct irq_work *entry) * Later on, we might change it to a list if there is * another virtualization implementation supporting the callbacks. */ -struct perf_guest_info_callbacks *perf_guest_cbs; +struct perf_guest_info_callbacks __rcu *perf_guest_cbs; int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { - perf_guest_cbs = cbs; + if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs))) + return -EBUSY; + + rcu_assign_pointer(perf_guest_cbs, cbs); return 0; } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { - perf_guest_cbs = NULL; + if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs)) + return -EINVAL; + + rcu_assign_pointer(perf_guest_cbs, NULL); + synchronize_rcu(); return 0; } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); -- cgit v1.3 From 2934e3d09350c1a7ca2433fbeabfcd831e48a575 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:25 +0000 Subject: perf: Stop pretending that perf can handle multiple guest callbacks Drop the 'int' return value from the perf (un)register callbacks helpers and stop pretending perf can support multiple callbacks. The 'int' returns are not future proofing anything as none of the callers take action on an error. It's also not obvious that there will ever be co-tenant hypervisors, and if there are, that allowing multiple callbacks to be registered is desirable or even correct. Opportunistically rename callbacks=>cbs in the affected declarations to match their definitions. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20211111020738.2512932-5-seanjc@google.com --- arch/arm64/include/asm/kvm_host.h | 4 ++-- arch/arm64/kvm/perf.c | 8 ++++---- include/linux/perf_event.h | 12 ++++++------ kernel/events/core.c | 15 ++++----------- 4 files changed, 16 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 2a5f7f38006f..f680f303ba7c 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -675,8 +675,8 @@ unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len); int kvm_handle_mmio_return(struct kvm_vcpu *vcpu); int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa); -int kvm_perf_init(void); -int kvm_perf_teardown(void); +void kvm_perf_init(void); +void kvm_perf_teardown(void); long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu); gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c index c84fe24b2ea1..a0d660cf889e 100644 --- a/arch/arm64/kvm/perf.c +++ b/arch/arm64/kvm/perf.c @@ -48,12 +48,12 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { .get_guest_ip = kvm_get_guest_ip, }; -int kvm_perf_init(void) +void kvm_perf_init(void) { - return perf_register_guest_info_callbacks(&kvm_guest_cbs); + perf_register_guest_info_callbacks(&kvm_guest_cbs); } -int kvm_perf_teardown(void) +void kvm_perf_teardown(void) { - return perf_unregister_guest_info_callbacks(&kvm_guest_cbs); + perf_unregister_guest_info_callbacks(&kvm_guest_cbs); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 318c489b735b..98c204488496 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1252,8 +1252,8 @@ static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void) */ return rcu_dereference(perf_guest_cbs); } -extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); -extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); +extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); +extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); @@ -1497,10 +1497,10 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } static inline void perf_bp_event(struct perf_event *event, void *data) { } -static inline int perf_register_guest_info_callbacks -(struct perf_guest_info_callbacks *callbacks) { return 0; } -static inline int perf_unregister_guest_info_callbacks -(struct perf_guest_info_callbacks *callbacks) { return 0; } +static inline void perf_register_guest_info_callbacks +(struct perf_guest_info_callbacks *cbs) { } +static inline void perf_unregister_guest_info_callbacks +(struct perf_guest_info_callbacks *cbs) { } static inline void perf_event_mmap(struct vm_area_struct *vma) { } diff --git a/kernel/events/core.c b/kernel/events/core.c index c552e1bfcaea..17e5b20762c5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6521,31 +6521,24 @@ static void perf_pending_event(struct irq_work *entry) perf_swevent_put_recursion_context(rctx); } -/* - * We assume there is only KVM supporting the callbacks. - * Later on, we might change it to a list if there is - * another virtualization implementation supporting the callbacks. - */ struct perf_guest_info_callbacks __rcu *perf_guest_cbs; -int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs))) - return -EBUSY; + return; rcu_assign_pointer(perf_guest_cbs, cbs); - return 0; } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); -int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs)) - return -EINVAL; + return; rcu_assign_pointer(perf_guest_cbs, NULL); synchronize_rcu(); - return 0; } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); -- cgit v1.3 From 2aef6f306b39bbe74e2287d6e2ee07c4867d87d0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:29 +0000 Subject: perf: Force architectures to opt-in to guest callbacks Introduce GUEST_PERF_EVENTS and require architectures to select it to allow registering and using guest callbacks in perf. This will hopefully make it more difficult for new architectures to add useless "support" for guest callbacks, e.g. via copy+paste. Stubbing out the helpers has the happy bonus of avoiding a load of perf_guest_cbs when GUEST_PERF_EVENTS=n on arm64/x86. Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20211111020738.2512932-9-seanjc@google.com --- arch/arm64/kvm/Kconfig | 1 + arch/x86/kvm/Kconfig | 1 + arch/x86/xen/Kconfig | 1 + include/linux/perf_event.h | 6 ++++++ init/Kconfig | 4 ++++ kernel/events/core.c | 2 ++ 6 files changed, 15 insertions(+) (limited to 'kernel') diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 8ffcbe29395e..e9761d84f982 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -39,6 +39,7 @@ menuconfig KVM select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_VCPU_RUN_PID_CHANGE select SCHED_INFO + select GUEST_PERF_EVENTS if PERF_EVENTS help Support hosting virtualized guest machines. diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 619186138176..47bdbe705a76 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -36,6 +36,7 @@ config KVM select KVM_MMIO select SCHED_INFO select PERF_EVENTS + select GUEST_PERF_EVENTS select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_NO_POLL diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 6bcd3d8ca6ac..85246dd9faa1 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -23,6 +23,7 @@ config XEN_PV select PARAVIRT_XXL select XEN_HAVE_PVMMU select XEN_HAVE_VPMU + select GUEST_PERF_EVENTS help Support running as a Xen PV guest. diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 346d5aff5804..ea47ef616ee0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1242,6 +1242,7 @@ extern void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags); +#ifdef CONFIG_GUEST_PERF_EVENTS extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void) { @@ -1280,6 +1281,11 @@ static inline unsigned int perf_guest_handle_intel_pt_intr(void) } extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); +#else +static inline unsigned int perf_guest_state(void) { return 0; } +static inline unsigned long perf_guest_get_ip(void) { return 0; } +static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return 0; } +#endif /* CONFIG_GUEST_PERF_EVENTS */ extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); diff --git a/init/Kconfig b/init/Kconfig index 036b750e8d8a..72d40b3b5805 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1804,6 +1804,10 @@ config HAVE_PERF_EVENTS help See tools/perf/design.txt for details. +config GUEST_PERF_EVENTS + bool + depends on HAVE_PERF_EVENTS + config PERF_USE_VMALLOC bool help diff --git a/kernel/events/core.c b/kernel/events/core.c index 17e5b20762c5..5a3502cd5362 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6521,6 +6521,7 @@ static void perf_pending_event(struct irq_work *entry) perf_swevent_put_recursion_context(rctx); } +#ifdef CONFIG_GUEST_PERF_EVENTS struct perf_guest_info_callbacks __rcu *perf_guest_cbs; void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) @@ -6541,6 +6542,7 @@ void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) synchronize_rcu(); } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); +#endif static void perf_output_sample_regs(struct perf_output_handle *handle, -- cgit v1.3 From 87b940a0675e25261f022ac3e53e0dfff9cdb995 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:30 +0000 Subject: perf/core: Use static_call to optimize perf_guest_info_callbacks Use static_call to optimize perf's guest callbacks on arm64 and x86, which are now the only architectures that define the callbacks. Use DEFINE_STATIC_CALL_RET0 as the default/NULL for all guest callbacks, as the callback semantics are that a return value '0' means "not in guest". static_call obviously avoids the overhead of CONFIG_RETPOLINE=y, but is also advantageous versus other solutions, e.g. per-cpu callbacks, in that a per-cpu memory load is not needed to detect the !guest case. Based on code from Peter and Like. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20211111020738.2512932-10-seanjc@google.com --- include/linux/perf_event.h | 34 ++++++++-------------------------- kernel/events/core.c | 15 +++++++++++++++ 2 files changed, 23 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ea47ef616ee0..0ac7d867ca0c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1244,40 +1244,22 @@ extern void perf_event_bpf_event(struct bpf_prog *prog, #ifdef CONFIG_GUEST_PERF_EVENTS extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; -static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void) -{ - /* - * Callbacks are RCU-protected and must be READ_ONCE to avoid reloading - * the callbacks between a !NULL check and dereferences, to ensure - * pending stores/changes to the callback pointers are visible before a - * non-NULL perf_guest_cbs is visible to readers, and to prevent a - * module from unloading callbacks while readers are active. - */ - return rcu_dereference(perf_guest_cbs); -} + +DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state); +DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip); +DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); + static inline unsigned int perf_guest_state(void) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - - return guest_cbs ? guest_cbs->state() : 0; + return static_call(__perf_guest_state)(); } static inline unsigned long perf_guest_get_ip(void) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - - /* - * Arbitrarily return '0' in the unlikely scenario that the callbacks - * are unregistered between checking guest state and getting the IP. - */ - return guest_cbs ? guest_cbs->get_ip() : 0; + return static_call(__perf_guest_get_ip)(); } static inline unsigned int perf_guest_handle_intel_pt_intr(void) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - - if (guest_cbs && guest_cbs->handle_intel_pt_intr) - return guest_cbs->handle_intel_pt_intr(); - return 0; + return static_call(__perf_guest_handle_intel_pt_intr)(); } extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); diff --git a/kernel/events/core.c b/kernel/events/core.c index 5a3502cd5362..3b3297a57228 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6524,12 +6524,23 @@ static void perf_pending_event(struct irq_work *entry) #ifdef CONFIG_GUEST_PERF_EVENTS struct perf_guest_info_callbacks __rcu *perf_guest_cbs; +DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state); +DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip); +DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); + void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs))) return; rcu_assign_pointer(perf_guest_cbs, cbs); + static_call_update(__perf_guest_state, cbs->state); + static_call_update(__perf_guest_get_ip, cbs->get_ip); + + /* Implementing ->handle_intel_pt_intr is optional. */ + if (cbs->handle_intel_pt_intr) + static_call_update(__perf_guest_handle_intel_pt_intr, + cbs->handle_intel_pt_intr); } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); @@ -6539,6 +6550,10 @@ void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) return; rcu_assign_pointer(perf_guest_cbs, NULL); + static_call_update(__perf_guest_state, (void *)&__static_call_return0); + static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0); + static_call_update(__perf_guest_handle_intel_pt_intr, + (void *)&__static_call_return0); synchronize_rcu(); } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); -- cgit v1.3