From 7b3c92b85a65c2db1f542265bc98e1f9e3056eba Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 4 Jul 2019 15:13:23 -0700 Subject: sched/core: Convert get_task_struct() to return the task Returning the pointer that was passed in allows us to write slightly more idiomatic code. Convert a few users. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190704221323.24290-1-willy@infradead.org Signed-off-by: Ingo Molnar --- kernel/trace/trace_sched_wakeup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 743b2b520d34..5e43b9664eca 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -579,8 +579,7 @@ probe_wakeup(void *ignore, struct task_struct *p) else tracing_dl = 0; - wakeup_task = p; - get_task_struct(wakeup_task); + wakeup_task = get_task_struct(p); local_save_flags(flags); -- cgit v1.2.3 From 6c77221df96177da0520847ce91e33f539fb8b2d Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 30 Jul 2019 22:08:50 +0800 Subject: fgraph: Remove redundant ftrace_graph_notrace_addr() test We already have tested it before. The second one should be removed. With this change, the performance should have little improvement. Link: http://lkml.kernel.org/r/20190730140850.7927-1-changbin.du@gmail.com Cc: stable@vger.kernel.org Fixes: 9cd2992f2d6c ("fgraph: Have set_graph_notrace only affect function_graph tracer") Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_functions_graph.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 69ebf3c2f1b5..78af97163147 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -137,6 +137,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) return 0; + /* + * Do not trace a function if it's filtered by set_graph_notrace. + * Make the index of ret stack negative to indicate that it should + * ignore further functions. But it needs its own ret stack entry + * to recover the original index in order to continue tracing after + * returning from the function. + */ if (ftrace_graph_notrace_addr(trace->func)) { trace_recursion_set(TRACE_GRAPH_NOTRACE_BIT); /* @@ -155,16 +162,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) if (ftrace_graph_ignore_irqs()) return 0; - /* - * Do not trace a function if it's filtered by set_graph_notrace. - * Make the index of ret stack negative to indicate that it should - * ignore further functions. But it needs its own ret stack entry - * to recover the original index in order to continue tracing after - * returning from the function. - */ - if (ftrace_graph_notrace_addr(trace->func)) - return 1; - /* * Stop here if tracing_threshold is set. We only write function return * events to the ring buffer. -- cgit v1.2.3 From 01b1d88b09824bea1a75b0bac04dcf50d9893875 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jul 2019 23:19:38 +0200 Subject: rcu: Use CONFIG_PREEMPTION CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. 
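The conversion is purely mechanical. As a condensed illustration drawn from the hunks below, both the preprocessor guards and the build-time IS_ENABLED() checks change spelling only:

	#ifdef CONFIG_PREEMPTION		/* was: #ifdef CONFIG_PREEMPT */
	void rcu_all_qs(void);
	#endif

	if (IS_ENABLED(CONFIG_PREEMPTION))	/* was: IS_ENABLED(CONFIG_PREEMPT) */
		return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;

Kconfig dependencies change the same way, e.g. "select TASKS_RCU if PREEMPT" becomes "select TASKS_RCU if PREEMPTION".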
Switch the conditionals in RCU to use CONFIG_PREEMPTION. That's the first step towards RCU on RT. The further tweaks are work in progress. This neither touches the selftest bits which need a closer look by Paul. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20190726212124.210156346@linutronix.de Signed-off-by: Ingo Molnar --- arch/Kconfig | 2 +- include/linux/rcupdate.h | 2 +- include/linux/rcutree.h | 2 +- include/linux/torture.h | 2 +- kernel/rcu/Kconfig | 8 ++++---- kernel/rcu/tree.c | 6 +++--- kernel/rcu/tree_stall.h | 6 +++--- kernel/trace/Kconfig | 2 +- 8 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel/trace') diff --git a/arch/Kconfig b/arch/Kconfig index a7b57dd42c26..c7efbc018f4f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -103,7 +103,7 @@ config STATIC_KEYS_SELFTEST config OPTPROBES def_bool y depends on KPROBES && HAVE_OPTPROBES - select TASKS_RCU if PREEMPT + select TASKS_RCU if PREEMPTION config KPROBES_ON_FTRACE def_bool y diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 8f7167478c1d..c4f76a310443 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -578,7 +578,7 @@ do { \ * * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU), * it is illegal to block while in an RCU read-side critical section. - * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPT + * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION * kernel builds, RCU read-side critical sections may be preempted, * but explicit blocking is illegal. Finally, in preemptible RCU * implementations in real-time (with -rt patchset) kernel builds, RCU diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 735601ac27d3..18b1ed9864b0 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -53,7 +53,7 @@ void rcu_scheduler_starting(void); extern int rcu_scheduler_active __read_mostly; void rcu_end_inkernel_boot(void); bool rcu_is_watching(void); -#ifndef CONFIG_PREEMPT +#ifndef CONFIG_PREEMPTION void rcu_all_qs(void); #endif diff --git a/include/linux/torture.h b/include/linux/torture.h index a620118385bb..6241f59e2d6f 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -86,7 +86,7 @@ void _torture_stop_kthread(char *m, struct task_struct **tp); #define torture_stop_kthread(n, tp) \ _torture_stop_kthread("Stopping " #n " task", &(tp)) -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION #define torture_preempt_schedule() preempt_schedule() #else #define torture_preempt_schedule() diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 480edf328b51..7644eda17d62 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -7,7 +7,7 @@ menu "RCU Subsystem" config TREE_RCU bool - default y if !PREEMPT && SMP + default y if !PREEMPTION && SMP help This option selects the RCU implementation that is designed for very large SMP system with hundreds or @@ -16,7 +16,7 @@ config TREE_RCU config PREEMPT_RCU bool - default y if PREEMPT + default y if PREEMPTION help This option selects the RCU implementation that is designed for very large SMP systems with hundreds or @@ -28,7 +28,7 @@ config PREEMPT_RCU config TINY_RCU bool - default y if !PREEMPT && !SMP + default y if !PREEMPTION && !SMP help This option selects the RCU implementation that is designed for UP systems from which real-time response @@ -70,7 +70,7 @@ config 
TREE_SRCU This option selects the full-fledged version of SRCU. config TASKS_RCU - def_bool PREEMPT + def_bool PREEMPTION select SRCU help This option enables a task-based RCU implementation that uses diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a14e5fbbea46..5962636502bc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1881,7 +1881,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) struct rcu_node *rnp_p; raw_lockdep_assert_held_rcu_node(rnp); - if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)) || + if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) || WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) || rnp->qsmask != 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2205,7 +2205,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { - if (!IS_ENABLED(CONFIG_PREEMPT) || + if (!IS_ENABLED(CONFIG_PREEMPTION) || rcu_preempt_blocked_readers_cgp(rnp)) { /* * No point in scanning bits because they @@ -2622,7 +2622,7 @@ static int rcu_blocking_is_gp(void) { int ret; - if (IS_ENABLED(CONFIG_PREEMPT)) + if (IS_ENABLED(CONFIG_PREEMPTION)) return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE; might_sleep(); /* Check for RCU read-side critical section. */ preempt_disable(); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 065183391f75..9b92bf18b737 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -163,7 +163,7 @@ static void rcu_iw_handler(struct irq_work *iwp) // // Printing RCU CPU stall warnings -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* * Dump detailed information for all tasks blocking the current RCU @@ -215,7 +215,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) return ndetected; } -#else /* #ifdef CONFIG_PREEMPT */ +#else /* #ifdef CONFIG_PREEMPTION */ /* * Because preemptible RCU does not exist, we never have to check for @@ -233,7 +233,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) { return 0; } -#endif /* #else #ifdef CONFIG_PREEMPT */ +#endif /* #else #ifdef CONFIG_PREEMPTION */ /* * Dump stacks of all tasks running on stalled CPUs. First try using diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 98da8998c25c..d2c1fe0b451d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -146,7 +146,7 @@ config FUNCTION_TRACER select GENERIC_TRACER select CONTEXT_SWITCH_TRACER select GLOB - select TASKS_RCU if PREEMPT + select TASKS_RCU if PREEMPTION help Enable the kernel to trace every kernel function. This is done by using a compiler feature to insert a small, 5-byte No-Operation -- cgit v1.2.3 From 30c937043b2db09ae3408f5534824f9ececdb581 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jul 2019 23:19:40 +0200 Subject: tracing: Use CONFIG_PREEMPTION CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Switch the conditionals in the tracer over to CONFIG_PREEMPTION. This is the first step to make the tracer work on RT. The other small tweaks are submitted separately. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20190726212124.409766323@linutronix.de Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 4 ++-- kernel/trace/ftrace.c | 2 +- kernel/trace/ring_buffer_benchmark.c | 2 +- kernel/trace/trace_events.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d2c1fe0b451d..6a64d7772870 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -179,7 +179,7 @@ config TRACE_PREEMPT_TOGGLE config PREEMPTIRQ_EVENTS bool "Enable trace events for preempt and irq disable/enable" select TRACE_IRQFLAGS - select TRACE_PREEMPT_TOGGLE if PREEMPT + select TRACE_PREEMPT_TOGGLE if PREEMPTION select GENERIC_TRACER default n help @@ -214,7 +214,7 @@ config PREEMPT_TRACER bool "Preemption-off Latency Tracer" default n depends on !ARCH_USES_GETTIMEOFFSET - depends on PREEMPT + depends on PREEMPTION select GENERIC_TRACER select TRACER_MAX_TRACE select RING_BUFFER_ALLOW_SWAP diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eca34503f178..a800e867c1a3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2814,7 +2814,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) * synchornize_rcu_tasks() will wait for those tasks to * execute and either schedule voluntarily or enter user space. */ - if (IS_ENABLED(CONFIG_PREEMPT)) + if (IS_ENABLED(CONFIG_PREEMPTION)) synchronize_rcu_tasks(); free_ops: diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 0564f6db0561..09b0b49f346e 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -267,7 +267,7 @@ static void ring_buffer_producer(void) if (consumer && !(cnt % wakeup_interval)) wake_up_process(consumer); -#ifndef CONFIG_PREEMPT +#ifndef CONFIG_PREEMPTION /* * If we are a non preempt kernel, the 10 second run will * stop everything while it runs. Instead, we will call diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c7506bc81b75..5a189fb8ec23 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -255,12 +255,12 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, local_save_flags(fbuffer->flags); fbuffer->pc = preempt_count(); /* - * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables + * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables * preemption (adding one to the preempt_count). Since we are * interested in the preempt_count at the time the tracepoint was * hit, we need to subtract one to offset the increment. */ - if (IS_ENABLED(CONFIG_PREEMPT)) + if (IS_ENABLED(CONFIG_PREEMPTION)) fbuffer->pc--; fbuffer->trace_file = trace_file; -- cgit v1.2.3 From 0a5b99f57873e233ad42ef71e23c629f6ea1fcfe Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 11 Jul 2019 16:45:41 -0400 Subject: treewide: Rename rcu_dereference_raw_notrace() to _check() The rcu_dereference_raw_notrace() API name is confusing. It is equivalent to rcu_dereference_raw() except that it also does sparse pointer checking. There are only a few users of rcu_dereference_raw_notrace(). This patches renames all of them to be rcu_dereference_raw_check() with the "_check()" indicating sparse checking. Signed-off-by: Joel Fernandes (Google) [ paulmck: Fix checkpatch warnings about parentheses. ] Signed-off-by: Paul E. 
McKenney --- Documentation/RCU/Design/Requirements/Requirements.html | 2 +- arch/powerpc/include/asm/kvm_book3s_64.h | 2 +- include/linux/rculist.h | 6 +++--- include/linux/rcupdate.h | 2 +- kernel/trace/ftrace_internal.h | 8 ++++---- kernel/trace/trace.c | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel/trace') diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 5a9238a2883c..bdbc84f1b949 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -2512,7 +2512,7 @@ disabled across the entire RCU read-side critical section.

It is possible to use tracing on RCU code, but tracing itself uses RCU. -For this reason, rcu_dereference_raw_notrace() +For this reason, rcu_dereference_raw_check() is provided for use by tracing, which avoids the destructive recursion that could otherwise ensue. This API is also used by virtualization in some architectures, diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index bb7c8cc77f1a..04b2b927bb5a 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -535,7 +535,7 @@ static inline void note_hpte_modification(struct kvm *kvm, */ static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm) { - return rcu_dereference_raw_notrace(kvm->memslots[0]); + return rcu_dereference_raw_check(kvm->memslots[0]); } extern void kvmppc_mmu_debugfs_init(struct kvm *kvm); diff --git a/include/linux/rculist.h b/include/linux/rculist.h index e91ec9ddcd30..932296144131 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -622,7 +622,7 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, * as long as the traversal is guarded by rcu_read_lock(). */ #define hlist_for_each_entry_rcu(pos, head, member) \ - for (pos = hlist_entry_safe (rcu_dereference_raw(hlist_first_rcu(head)),\ + for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ @@ -642,10 +642,10 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, * not do any RCU debugging or tracing. */ #define hlist_for_each_entry_rcu_notrace(pos, head, member) \ - for (pos = hlist_entry_safe (rcu_dereference_raw_notrace(hlist_first_rcu(head)),\ + for (pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ - pos = hlist_entry_safe(rcu_dereference_raw_notrace(hlist_next_rcu(\ + pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) /** diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 8f7167478c1d..bfcafbc1e301 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -476,7 +476,7 @@ do { \ * The no-tracing version of rcu_dereference_raw() must not call * rcu_read_lock_held(). */ -#define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu) +#define rcu_dereference_raw_check(p) __rcu_dereference_check((p), 1, __rcu) /** * rcu_dereference_protected() - fetch RCU pointer when updates prevented diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h index 0515a2096f90..0456e0a3dab1 100644 --- a/kernel/trace/ftrace_internal.h +++ b/kernel/trace/ftrace_internal.h @@ -6,22 +6,22 @@ /* * Traverse the ftrace_global_list, invoking all entries. The reason that we - * can use rcu_dereference_raw_notrace() is that elements removed from this list + * can use rcu_dereference_raw_check() is that elements removed from this list * are simply leaked, so there is no need to interact with a grace-period - * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle + * mechanism. The rcu_dereference_raw_check() calls are needed to handle * concurrent insertions into the ftrace_global_list. * * Silly Alpha and silly pointer-speculation compiler optimizations! 
*/ #define do_for_each_ftrace_op(op, list) \ - op = rcu_dereference_raw_notrace(list); \ + op = rcu_dereference_raw_check(list); \ do /* * Optimized for just a single item in the list (as that is the normal case). */ #define while_for_each_ftrace_op(op) \ - while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \ + while (likely(op = rcu_dereference_raw_check((op)->next)) && \ unlikely((op) != &ftrace_list_end)) extern struct ftrace_ops __rcu *ftrace_ops_list; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 525a97fbbc60..642474b26ba7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2642,10 +2642,10 @@ static void ftrace_exports(struct ring_buffer_event *event) preempt_disable_notrace(); - export = rcu_dereference_raw_notrace(ftrace_exports_list); + export = rcu_dereference_raw_check(ftrace_exports_list); while (export) { trace_process_export(export, event); - export = rcu_dereference_raw_notrace(export->next); + export = rcu_dereference_raw_check(export->next); } preempt_enable_notrace(); -- cgit v1.2.3 From a94549dd87f5ea4ca50fee493df08a2dc6256b53 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:17:58 -0700 Subject: lockdown: Lock down tracing and perf kprobes when in confidentiality mode Disallow the creation of perf and ftrace kprobes when the kernel is locked down in confidentiality mode by preventing their registration. This prevents kprobes from being used to access kernel memory to steal crypto data, but continues to allow the use of kprobes from signed modules. Reported-by: Alexei Starovoitov Signed-off-by: David Howells Signed-off-by: Matthew Garrett Acked-by: Masami Hiramatsu Reviewed-by: Kees Cook Cc: Naveen N. Rao Cc: Anil S Keshavamurthy Cc: davem@davemloft.net Cc: Masami Hiramatsu Signed-off-by: James Morris --- include/linux/security.h | 1 + kernel/trace/trace_kprobe.c | 5 +++++ security/lockdown/lockdown.c | 1 + 3 files changed, 7 insertions(+) (limited to 'kernel/trace') diff --git a/include/linux/security.h b/include/linux/security.h index 669e8de5299d..0b2529dbf0f4 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -117,6 +117,7 @@ enum lockdown_reason { LOCKDOWN_MMIOTRACE, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_KCORE, + LOCKDOWN_KPROBES, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7d736248a070..fcb28b0702b2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "trace_dynevent.h" #include "trace_kprobe_selftest.h" @@ -415,6 +416,10 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) { int i, ret; + ret = security_locked_down(LOCKDOWN_KPROBES); + if (ret) + return ret; + if (trace_probe_is_registered(&tk->tp)) return -EINVAL; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 403b30357f75..27b2cf51e443 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -32,6 +32,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_MMIOTRACE] = "unsafe mmio", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_KCORE] = "/proc/kcore access", + [LOCKDOWN_KPROBES] = "use of kprobes", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; -- cgit v1.2.3 From 9d1f8be5cf42b497a3bddf1d523f2bb142e9318c Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 Aug 2019 17:17:59 -0700 Subject: bpf: Restrict bpf when kernel lockdown is in confidentiality mode bpf_read() and 
bpf_read_str() could potentially be abused to (eg) allow private keys in kernel memory to be leaked. Disable them if the kernel has been locked down in confidentiality mode. Suggested-by: Alexei Starovoitov Signed-off-by: Matthew Garrett Reviewed-by: Kees Cook cc: netdev@vger.kernel.org cc: Chun-Yi Lee cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: James Morris --- include/linux/security.h | 1 + kernel/trace/bpf_trace.c | 10 ++++++++++ security/lockdown/lockdown.c | 1 + 3 files changed, 12 insertions(+) (limited to 'kernel/trace') diff --git a/include/linux/security.h b/include/linux/security.h index 0b2529dbf0f4..e604f4c67f03 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -118,6 +118,7 @@ enum lockdown_reason { LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_KCORE, LOCKDOWN_KPROBES, + LOCKDOWN_BPF_READ, LOCKDOWN_CONFIDENTIALITY_MAX, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1c9a4745e596..33a954c367f3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -139,8 +139,13 @@ BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { int ret; + ret = security_locked_down(LOCKDOWN_BPF_READ); + if (ret < 0) + goto out; + ret = probe_kernel_read(dst, unsafe_ptr, size); if (unlikely(ret < 0)) +out: memset(dst, 0, size); return ret; @@ -566,6 +571,10 @@ BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size, { int ret; + ret = security_locked_down(LOCKDOWN_BPF_READ); + if (ret < 0) + goto out; + /* * The strncpy_from_unsafe() call will likely not fill the entire * buffer, but that's okay in this circumstance as we're probing @@ -577,6 +586,7 @@ BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size, */ ret = strncpy_from_unsafe(dst, unsafe_ptr, size); if (unlikely(ret < 0)) +out: memset(dst, 0, size); return ret; diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c index 27b2cf51e443..2397772c56bd 100644 --- a/security/lockdown/lockdown.c +++ b/security/lockdown/lockdown.c @@ -33,6 +33,7 @@ static char *lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_KCORE] = "/proc/kcore access", [LOCKDOWN_KPROBES] = "use of kprobes", + [LOCKDOWN_BPF_READ] = "use of bpf to read kernel RAM", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; -- cgit v1.2.3 From 5cbd22c179012114099fe3996999b54fd2a092c5 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Wed, 21 Aug 2019 00:08:57 +0100 Subject: bpf: clarify description for CONFIG_BPF_EVENTS PERF_EVENT_IOC_SET_BPF supports uprobes since v4.3, and tracepoints since v4.7 via commit 04a22fae4cbc ("tracing, perf: Implement BPF programs attached to uprobes"), and commit 98b5c2c65c29 ("perf, bpf: allow bpf programs attach to tracepoints") respectively. Signed-off-by: Peter Wu Signed-off-by: Alexei Starovoitov --- kernel/trace/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 98da8998c25c..b09d7b1ffffd 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -520,7 +520,8 @@ config BPF_EVENTS bool default y help - This allows the user to attach BPF programs to kprobe events. + This allows the user to attach BPF programs to kprobe, uprobe, and + tracepoint events. config DYNAMIC_EVENTS def_bool n -- cgit v1.2.3 From 7bd46644ea0f6021dc396a39a8bfd3a58f6f1f9f Mon Sep 17 00:00:00 2001 From: "Naveen N. 
Rao" Date: Thu, 4 Jul 2019 20:04:41 +0530 Subject: ftrace: Fix NULL pointer dereference in t_probe_next() LTP testsuite on powerpc results in the below crash: Unable to handle kernel paging request for data at address 0x00000000 Faulting instruction address: 0xc00000000029d800 Oops: Kernel access of bad area, sig: 11 [#1] LE SMP NR_CPUS=2048 NUMA PowerNV ... CPU: 68 PID: 96584 Comm: cat Kdump: loaded Tainted: G W NIP: c00000000029d800 LR: c00000000029dac4 CTR: c0000000001e6ad0 REGS: c0002017fae8ba10 TRAP: 0300 Tainted: G W MSR: 9000000000009033 CR: 28022422 XER: 20040000 CFAR: c00000000029d90c DAR: 0000000000000000 DSISR: 40000000 IRQMASK: 0 ... NIP [c00000000029d800] t_probe_next+0x60/0x180 LR [c00000000029dac4] t_mod_start+0x1a4/0x1f0 Call Trace: [c0002017fae8bc90] [c000000000cdbc40] _cond_resched+0x10/0xb0 (unreliable) [c0002017fae8bce0] [c0000000002a15b0] t_start+0xf0/0x1c0 [c0002017fae8bd30] [c0000000004ec2b4] seq_read+0x184/0x640 [c0002017fae8bdd0] [c0000000004a57bc] sys_read+0x10c/0x300 [c0002017fae8be30] [c00000000000b388] system_call+0x5c/0x70 The test (ftrace_set_ftrace_filter.sh) is part of ftrace stress tests and the crash happens when the test does 'cat $TRACING_PATH/set_ftrace_filter'. The address points to the second line below, in t_probe_next(), where filter_hash is dereferenced: hash = iter->probe->ops.func_hash->filter_hash; size = 1 << hash->size_bits; This happens due to a race with register_ftrace_function_probe(). A new ftrace_func_probe is created and added into the func_probes list in trace_array under ftrace_lock. However, before initializing the filter, we drop ftrace_lock, and re-acquire it after acquiring regex_lock. If another process is trying to read set_ftrace_filter, it will be able to acquire ftrace_lock during this window and it will end up seeing a NULL filter_hash. Fix this by just checking for a NULL filter_hash in t_probe_next(). If the filter_hash is NULL, then this probe is just being added and we can simply return from here. Link: http://lkml.kernel.org/r/05e021f757625cbbb006fad41380323dbe4e3b43.1562249521.git.naveen.n.rao@linux.vnet.ibm.com Cc: stable@vger.kernel.org Fixes: 7b60f3d876156 ("ftrace: Dynamically create the probe ftrace_ops for the trace_array") Signed-off-by: Naveen N. Rao Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eca34503f178..80beed2cf0da 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3095,6 +3095,10 @@ t_probe_next(struct seq_file *m, loff_t *pos) hnd = &iter->probe_entry->hlist; hash = iter->probe->ops.func_hash->filter_hash; + + if (!hash) + return NULL; + size = 1 << hash->size_bits; retry: -- cgit v1.2.3 From 372e0d01da71c84dcecf7028598a33813b0d5256 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 30 Aug 2019 16:30:01 -0400 Subject: ftrace: Check for empty hash and comment the race with registering probes The race between adding a function probe and reading the probes that exist is very subtle. It needs a comment. Also, the issue can also happen if the probe has has the EMPTY_HASH as its func_hash. 
Cc: stable@vger.kernel.org Fixes: 7b60f3d876156 ("ftrace: Dynamically create the probe ftrace_ops for the trace_array") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 80beed2cf0da..6200a6fe10e3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3096,7 +3096,11 @@ t_probe_next(struct seq_file *m, loff_t *pos) hash = iter->probe->ops.func_hash->filter_hash; - if (!hash) + /* + * A probe being registered may temporarily have an empty hash + * and it's at the end of the func_probes list. + */ + if (!hash || hash == EMPTY_HASH) return NULL; size = 1 << hash->size_bits; @@ -4324,6 +4328,10 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, mutex_unlock(&ftrace_lock); + /* + * Note, there's a small window here that the func_hash->filter_hash + * may be NULL or empty. Need to be carefule when reading the loop. + */ mutex_lock(&probe->ops.func_hash->regex_lock); orig_hash = &probe->ops.func_hash->filter_hash; -- cgit v1.2.3 From 5b0022dd32b7c2e15edf1827ba80aa1407edf9ff Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 4 Jul 2019 20:04:42 +0530 Subject: ftrace: Check for successful allocation of hash In register_ftrace_function_probe(), we are not checking the return value of alloc_and_copy_ftrace_hash(). The subsequent call to ftrace_match_records() may end up dereferencing the same. Add a check to ensure this doesn't happen. Link: http://lkml.kernel.org/r/26e92574f25ad23e7cafa3cf5f7a819de1832cbe.1562249521.git.naveen.n.rao@linux.vnet.ibm.com Cc: stable@vger.kernel.org Fixes: 1ec3a81a0cf42 ("ftrace: Have each function probe use its own ftrace_ops") Signed-off-by: Naveen N. Rao Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6200a6fe10e3..f9821a3374e9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4338,6 +4338,11 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, old_hash = *orig_hash; hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); + if (!hash) { + ret = -ENOMEM; + goto out; + } + ret = ftrace_match_records(hash, glob, strlen(glob)); /* Nothing found? */ -- cgit v1.2.3 From 595a438c78dbdc43d6c9db4f437267f0bd1548bf Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Thu, 4 Jul 2019 20:21:10 +0300 Subject: tracing: Make exported ftrace_set_clr_event non-static The function ftrace_set_clr_event is declared static and marked EXPORT_SYMBOL_GPL(), which is at best an odd combination. Because the function was decided to be a part of API, this commit removes the static attribute and adds the declaration to the header. 
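A hedged usage sketch follows; the wrapper function and the event string are hypothetical, and only the ftrace_set_clr_event() signature comes from the trace_events.h change below:

	#include <linux/trace_events.h>

	/* Hypothetical in-kernel caller enabling one event on an instance. */
	static int enable_sched_switch(struct trace_array *tr)
	{
		char buf[] = "sched:sched_switch";	/* hypothetical event name */

		return ftrace_set_clr_event(tr, buf, 1);	/* 1 = enable */
	}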
Link: http://lkml.kernel.org/r/20190704172110.27041-1-efremov@linux.com Fixes: f45d1225adb04 ("tracing: Kernel access to Ftrace instances") Reviewed-by: Joe Jin Signed-off-by: Denis Efremov Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 1 + kernel/trace/trace_events.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 5150436783e8..30a8cdcfd4a4 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -548,6 +548,7 @@ extern int trace_event_get_offsets(struct trace_event_call *call); #define is_signed_type(type) (((type)(-1)) < (type)1) +int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); int trace_set_clr_event(const char *system, const char *event, int set); /* diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c7506bc81b75..648930823b57 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -787,7 +787,7 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, return ret; } -static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) +int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) { char *event = NULL, *sub = NULL, *match; int ret; -- cgit v1.2.3 From 19a58ce1dc72264b9d50ff6d86cc36b3c439fb64 Mon Sep 17 00:00:00 2001 From: Xinpeng Liu Date: Thu, 8 Aug 2019 07:29:23 +0800 Subject: tracing/probe: Fix null pointer dereference BUG: KASAN: null-ptr-deref in trace_probe_cleanup+0x8d/0xd0 Read of size 8 at addr 0000000000000000 by task syz-executor.0/9746 trace_probe_cleanup+0x8d/0xd0 free_trace_kprobe.part.14+0x15/0x50 alloc_trace_kprobe+0x23e/0x250 Link: http://lkml.kernel.org/r/1565220563-980-1-git-send-email-danielliu861@gmail.com Fixes: e3dc9f898ef9c ("tracing/probe: Add trace_event_call accesses APIs") Signed-off-by: Xinpeng Liu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index dbef0d135075..fb6bfbc5bf86 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -895,7 +895,8 @@ void trace_probe_cleanup(struct trace_probe *tp) for (i = 0; i < tp->nr_args; i++) traceprobe_free_probe_arg(&tp->args[i]); - kfree(call->class->system); + if (call->class) + kfree(call->class->system); kfree(call->name); kfree(call->print_fmt); } -- cgit v1.2.3 From c68c9ec1c52e5bcd221eb09bc5344ad4f407b204 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 27 Aug 2019 22:25:47 -0700 Subject: tracing: Correct kdoc formats Fix the following kdoc warnings: kernel/trace/trace.c:1579: warning: Function parameter or member 'tr' not described in 'update_max_tr_single' kernel/trace/trace.c:1579: warning: Function parameter or member 'tsk' not described in 'update_max_tr_single' kernel/trace/trace.c:1579: warning: Function parameter or member 'cpu' not described in 'update_max_tr_single' kernel/trace/trace.c:1776: warning: Function parameter or member 'type' not described in 'register_tracer' kernel/trace/trace.c:2239: warning: Function parameter or member 'task' not described in 'tracing_record_taskinfo' kernel/trace/trace.c:2239: warning: Function parameter or member 'flags' not described in 'tracing_record_taskinfo' kernel/trace/trace.c:2269: warning: Function parameter or member 'prev' not described in 
'tracing_record_taskinfo_sched_switch' kernel/trace/trace.c:2269: warning: Function parameter or member 'next' not described in 'tracing_record_taskinfo_sched_switch' kernel/trace/trace.c:2269: warning: Function parameter or member 'flags' not described in 'tracing_record_taskinfo_sched_switch' kernel/trace/trace.c:3078: warning: Function parameter or member 'ip' not described in 'trace_vbprintk' kernel/trace/trace.c:3078: warning: Function parameter or member 'fmt' not described in 'trace_vbprintk' kernel/trace/trace.c:3078: warning: Function parameter or member 'args' not described in 'trace_vbprintk' Link: http://lkml.kernel.org/r/20190828052549.2472-2-jakub.kicinski@netronome.com Signed-off-by: Jakub Kicinski Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 525a97fbbc60..563e80f9006a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1567,9 +1567,9 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, /** * update_max_tr_single - only copy one trace over, and reset the rest - * @tr - tracer - * @tsk - task with the latency - * @cpu - the cpu of the buffer to copy. + * @tr: tracer + * @tsk: task with the latency + * @cpu: the cpu of the buffer to copy. * * Flip the trace of a single CPU buffer between the @tr and the max_tr. */ @@ -1767,7 +1767,7 @@ static void __init apply_trace_boot_options(void); /** * register_tracer - register a tracer with the ftrace system. - * @type - the plugin for the tracer + * @type: the plugin for the tracer * * Register a new plugin tracer. */ @@ -2230,9 +2230,9 @@ static bool tracing_record_taskinfo_skip(int flags) /** * tracing_record_taskinfo - record the task info of a task * - * @task - task to record - * @flags - TRACE_RECORD_CMDLINE for recording comm - * - TRACE_RECORD_TGID for recording tgid + * @task: task to record + * @flags: TRACE_RECORD_CMDLINE for recording comm + * TRACE_RECORD_TGID for recording tgid */ void tracing_record_taskinfo(struct task_struct *task, int flags) { @@ -2258,10 +2258,10 @@ void tracing_record_taskinfo(struct task_struct *task, int flags) /** * tracing_record_taskinfo_sched_switch - record task info for sched_switch * - * @prev - previous task during sched_switch - * @next - next task during sched_switch - * @flags - TRACE_RECORD_CMDLINE for recording comm - * TRACE_RECORD_TGID for recording tgid + * @prev: previous task during sched_switch + * @next: next task during sched_switch + * @flags: TRACE_RECORD_CMDLINE for recording comm + * TRACE_RECORD_TGID for recording tgid */ void tracing_record_taskinfo_sched_switch(struct task_struct *prev, struct task_struct *next, int flags) @@ -3072,7 +3072,9 @@ static void trace_printk_start_stop_comm(int enabled) /** * trace_vbprintk - write binary msg to tracing buffer - * + * @ip: The address of the caller + * @fmt: The string format to write to the buffer + * @args: Arguments for @fmt */ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { -- cgit v1.2.3 From 60d53e2c3b75e79c83970fe73db79123d9462c7c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 20 Jun 2019 00:07:20 +0900 Subject: tracing/probe: Split trace_event related data from trace_probe Split the trace_event related data from trace_probe data structure and introduce trace_probe_event data structure for its folder. 
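In outline, the resulting layout, condensed from the trace_probe.h hunk in this patch with unrelated fields elided:

	/* Event call and class holder, shared by one or more probes. */
	struct trace_probe_event {
		unsigned int		flags;		/* For TP_FLAG_* */
		struct trace_event_class class;
		struct trace_event_call	call;
		struct list_head	files;
		struct list_head	probes;
	};

	struct trace_probe {
		struct list_head	list;	/* node on event->probes */
		struct trace_probe_event *event;
		ssize_t			size;	/* trace entry size */
		unsigned int		nr_args;
		struct probe_arg	args[];
	};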
This trace_probe_event data structure can have multiple trace_probe. Link: http://lkml.kernel.org/r/156095683995.28024.7552150340561557873.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 157 ++++++++++++++++++++++++++++++----------- kernel/trace/trace_probe.c | 54 ++++++++++----- kernel/trace/trace_probe.h | 48 ++++++++++--- kernel/trace/trace_uprobe.c | 165 +++++++++++++++++++++++++++++++++----------- 4 files changed, 311 insertions(+), 113 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9d483ad9bb6c..eac6344a2e7c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -180,20 +180,33 @@ unsigned long trace_kprobe_address(struct trace_kprobe *tk) return addr; } +static nokprobe_inline struct trace_kprobe * +trace_kprobe_primary_from_call(struct trace_event_call *call) +{ + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return NULL; + + return container_of(tp, struct trace_kprobe, tp); +} + bool trace_kprobe_on_func_entry(struct trace_event_call *call) { - struct trace_kprobe *tk = (struct trace_kprobe *)call->data; + struct trace_kprobe *tk = trace_kprobe_primary_from_call(call); - return kprobe_on_func_entry(tk->rp.kp.addr, + return tk ? kprobe_on_func_entry(tk->rp.kp.addr, tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name, - tk->rp.kp.addr ? 0 : tk->rp.kp.offset); + tk->rp.kp.addr ? 0 : tk->rp.kp.offset) : false; } bool trace_kprobe_error_injectable(struct trace_event_call *call) { - struct trace_kprobe *tk = (struct trace_kprobe *)call->data; + struct trace_kprobe *tk = trace_kprobe_primary_from_call(call); - return within_error_injection_list(trace_kprobe_address(tk)); + return tk ? within_error_injection_list(trace_kprobe_address(tk)) : + false; } static int register_kprobe_event(struct trace_kprobe *tk); @@ -291,32 +304,75 @@ static inline int __enable_trace_kprobe(struct trace_kprobe *tk) return ret; } +static void __disable_trace_kprobe(struct trace_probe *tp) +{ + struct trace_probe *pos; + struct trace_kprobe *tk; + + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + tk = container_of(pos, struct trace_kprobe, tp); + if (!trace_kprobe_is_registered(tk)) + continue; + if (trace_kprobe_is_return(tk)) + disable_kretprobe(&tk->rp); + else + disable_kprobe(&tk->rp.kp); + } +} + /* * Enable trace_probe * if the file is NULL, enable "perf" handler, or enable "trace" handler. 
*/ -static int -enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) +static int enable_trace_kprobe(struct trace_event_call *call, + struct trace_event_file *file) { - bool enabled = trace_probe_is_enabled(&tk->tp); + struct trace_probe *pos, *tp; + struct trace_kprobe *tk; + bool enabled; int ret = 0; + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + enabled = trace_probe_is_enabled(tp); + + /* This also changes "enabled" state */ if (file) { - ret = trace_probe_add_file(&tk->tp, file); + ret = trace_probe_add_file(tp, file); if (ret) return ret; } else - trace_probe_set_flag(&tk->tp, TP_FLAG_PROFILE); + trace_probe_set_flag(tp, TP_FLAG_PROFILE); if (enabled) return 0; - ret = __enable_trace_kprobe(tk); - if (ret) { + enabled = false; + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + tk = container_of(pos, struct trace_kprobe, tp); + if (trace_kprobe_has_gone(tk)) + continue; + ret = __enable_trace_kprobe(tk); + if (ret) { + if (enabled) { + __disable_trace_kprobe(tp); + enabled = false; + } + break; + } + enabled = true; + } + + if (!enabled) { + /* No probe is enabled. Roll back */ if (file) - trace_probe_remove_file(&tk->tp, file); + trace_probe_remove_file(tp, file); else - trace_probe_clear_flag(&tk->tp, TP_FLAG_PROFILE); + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + if (!ret) + /* Since all probes are gone, this is not available */ + ret = -EADDRNOTAVAIL; } return ret; @@ -326,11 +382,14 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) * Disable trace_probe * if the file is NULL, disable "perf" handler, or disable "trace" handler. */ -static int -disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) +static int disable_trace_kprobe(struct trace_event_call *call, + struct trace_event_file *file) { - struct trace_probe *tp = &tk->tp; - int ret = 0; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; if (file) { if (!trace_probe_get_file_link(tp, file)) @@ -341,12 +400,8 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) } else trace_probe_clear_flag(tp, TP_FLAG_PROFILE); - if (!trace_probe_is_enabled(tp) && trace_kprobe_is_registered(tk)) { - if (trace_kprobe_is_return(tk)) - disable_kretprobe(&tk->rp); - else - disable_kprobe(&tk->rp.kp); - } + if (!trace_probe_is_enabled(tp)) + __disable_trace_kprobe(tp); out: if (file) @@ -358,7 +413,7 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) */ trace_probe_remove_file(tp, file); - return ret; + return 0; } #if defined(CONFIG_KPROBES_ON_FTRACE) && \ @@ -1089,7 +1144,10 @@ print_kprobe_event(struct trace_iterator *iter, int flags, struct trace_probe *tp; field = (struct kprobe_trace_entry_head *)iter->ent; - tp = container_of(event, struct trace_probe, call.event); + tp = trace_probe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (WARN_ON_ONCE(!tp)) + goto out; trace_seq_printf(s, "%s: (", trace_probe_name(tp)); @@ -1116,7 +1174,10 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, struct trace_probe *tp; field = (struct kretprobe_trace_entry_head *)iter->ent; - tp = container_of(event, struct trace_probe, call.event); + tp = trace_probe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (WARN_ON_ONCE(!tp)) + goto out; trace_seq_printf(s, "%s: (", trace_probe_name(tp)); @@ -1145,23 +1206,31 @@ static int 
kprobe_event_define_fields(struct trace_event_call *event_call) { int ret; struct kprobe_trace_entry_head field; - struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(event_call); + if (WARN_ON_ONCE(!tp)) + return -ENOENT; DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); - return traceprobe_define_arg_fields(event_call, sizeof(field), &tk->tp); + return traceprobe_define_arg_fields(event_call, sizeof(field), tp); } static int kretprobe_event_define_fields(struct trace_event_call *event_call) { int ret; struct kretprobe_trace_entry_head field; - struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(event_call); + if (WARN_ON_ONCE(!tp)) + return -ENOENT; DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); - return traceprobe_define_arg_fields(event_call, sizeof(field), &tk->tp); + return traceprobe_define_arg_fields(event_call, sizeof(field), tp); } #ifdef CONFIG_PERF_EVENTS @@ -1289,20 +1358,19 @@ int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, static int kprobe_register(struct trace_event_call *event, enum trace_reg type, void *data) { - struct trace_kprobe *tk = (struct trace_kprobe *)event->data; struct trace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: - return enable_trace_kprobe(tk, file); + return enable_trace_kprobe(event, file); case TRACE_REG_UNREGISTER: - return disable_trace_kprobe(tk, file); + return disable_trace_kprobe(event, file); #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return enable_trace_kprobe(tk, NULL); + return enable_trace_kprobe(event, NULL); case TRACE_REG_PERF_UNREGISTER: - return disable_trace_kprobe(tk, NULL); + return disable_trace_kprobe(event, NULL); case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: case TRACE_REG_PERF_ADD: @@ -1369,7 +1437,6 @@ static inline void init_trace_event_call(struct trace_kprobe *tk) call->flags = TRACE_EVENT_FL_KPROBE; call->class->reg = kprobe_register; - call->data = tk; } static int register_kprobe_event(struct trace_kprobe *tk) @@ -1432,7 +1499,9 @@ void destroy_local_trace_kprobe(struct trace_event_call *event_call) { struct trace_kprobe *tk; - tk = container_of(event_call, struct trace_kprobe, tp.call); + tk = trace_kprobe_primary_from_call(event_call); + if (unlikely(!tk)) + return; if (trace_probe_is_enabled(&tk->tp)) { WARN_ON(1); @@ -1577,7 +1646,8 @@ static __init int kprobe_trace_self_tests_init(void) pr_warn("error on getting probe file.\n"); warn++; } else - enable_trace_kprobe(tk, file); + enable_trace_kprobe( + trace_probe_event_call(&tk->tp), file); } } @@ -1598,7 +1668,8 @@ static __init int kprobe_trace_self_tests_init(void) pr_warn("error on getting probe file.\n"); warn++; } else - enable_trace_kprobe(tk, file); + enable_trace_kprobe( + trace_probe_event_call(&tk->tp), file); } } @@ -1631,7 +1702,8 @@ static __init int kprobe_trace_self_tests_init(void) pr_warn("error on getting probe file.\n"); warn++; } else - disable_trace_kprobe(tk, file); + disable_trace_kprobe( + trace_probe_event_call(&tk->tp), file); } tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM); @@ -1649,7 +1721,8 @@ static __init int kprobe_trace_self_tests_init(void) pr_warn("error on getting probe file.\n"); warn++; } else - disable_trace_kprobe(tk, file); + disable_trace_kprobe( + trace_probe_event_call(&tk->tp), file); } ret = 
trace_run_command("-:testprobe", create_or_delete_trace_kprobe); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index fb6bfbc5bf86..28733bd6b607 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -889,41 +889,59 @@ int traceprobe_define_arg_fields(struct trace_event_call *event_call, void trace_probe_cleanup(struct trace_probe *tp) { - struct trace_event_call *call = trace_probe_event_call(tp); int i; for (i = 0; i < tp->nr_args; i++) traceprobe_free_probe_arg(&tp->args[i]); - if (call->class) - kfree(call->class->system); - kfree(call->name); - kfree(call->print_fmt); + if (tp->event) { + struct trace_event_call *call = trace_probe_event_call(tp); + + kfree(tp->event->class.system); + kfree(call->name); + kfree(call->print_fmt); + kfree(tp->event); + tp->event = NULL; + } } int trace_probe_init(struct trace_probe *tp, const char *event, const char *group) { - struct trace_event_call *call = trace_probe_event_call(tp); + struct trace_event_call *call; + int ret = 0; if (!event || !group) return -EINVAL; - call->class = &tp->class; - call->name = kstrdup(event, GFP_KERNEL); - if (!call->name) + tp->event = kzalloc(sizeof(struct trace_probe_event), GFP_KERNEL); + if (!tp->event) return -ENOMEM; - tp->class.system = kstrdup(group, GFP_KERNEL); - if (!tp->class.system) { - kfree(call->name); - call->name = NULL; - return -ENOMEM; + call = trace_probe_event_call(tp); + call->class = &tp->event->class; + call->name = kstrdup(event, GFP_KERNEL); + if (!call->name) { + ret = -ENOMEM; + goto error; + } + + tp->event->class.system = kstrdup(group, GFP_KERNEL); + if (!tp->event->class.system) { + ret = -ENOMEM; + goto error; } - INIT_LIST_HEAD(&tp->files); - INIT_LIST_HEAD(&tp->class.fields); + INIT_LIST_HEAD(&tp->event->files); + INIT_LIST_HEAD(&tp->event->class.fields); + INIT_LIST_HEAD(&tp->event->probes); + INIT_LIST_HEAD(&tp->list); + list_add(&tp->event->probes, &tp->list); return 0; + +error: + trace_probe_cleanup(tp); + return ret; } int trace_probe_register_event_call(struct trace_probe *tp) @@ -952,7 +970,7 @@ int trace_probe_add_file(struct trace_probe *tp, struct trace_event_file *file) link->file = file; INIT_LIST_HEAD(&link->list); - list_add_tail_rcu(&link->list, &tp->files); + list_add_tail_rcu(&link->list, &tp->event->files); trace_probe_set_flag(tp, TP_FLAG_TRACE); return 0; } @@ -983,7 +1001,7 @@ int trace_probe_remove_file(struct trace_probe *tp, synchronize_rcu(); kfree(link); - if (list_empty(&tp->files)) + if (list_empty(&tp->event->files)) trace_probe_clear_flag(tp, TP_FLAG_TRACE); return 0; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index d1714820efe1..0b84abb884c2 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -222,11 +222,18 @@ struct probe_arg { const struct fetch_type *type; /* Type of this argument */ }; -struct trace_probe { +/* Event call and class holder */ +struct trace_probe_event { unsigned int flags; /* For TP_FLAG_* */ struct trace_event_class class; struct trace_event_call call; struct list_head files; + struct list_head probes; +}; + +struct trace_probe { + struct list_head list; + struct trace_probe_event *event; ssize_t size; /* trace entry size */ unsigned int nr_args; struct probe_arg args[]; @@ -240,19 +247,19 @@ struct event_file_link { static inline bool trace_probe_test_flag(struct trace_probe *tp, unsigned int flag) { - return !!(tp->flags & flag); + return !!(tp->event->flags & flag); } static inline void trace_probe_set_flag(struct trace_probe *tp, 
unsigned int flag) { - tp->flags |= flag; + tp->event->flags |= flag; } static inline void trace_probe_clear_flag(struct trace_probe *tp, unsigned int flag) { - tp->flags &= ~flag; + tp->event->flags &= ~flag; } static inline bool trace_probe_is_enabled(struct trace_probe *tp) @@ -262,29 +269,48 @@ static inline bool trace_probe_is_enabled(struct trace_probe *tp) static inline const char *trace_probe_name(struct trace_probe *tp) { - return trace_event_name(&tp->call); + return trace_event_name(&tp->event->call); } static inline const char *trace_probe_group_name(struct trace_probe *tp) { - return tp->call.class->system; + return tp->event->call.class->system; } static inline struct trace_event_call * trace_probe_event_call(struct trace_probe *tp) { - return &tp->call; + return &tp->event->call; +} + +static inline struct trace_probe_event * +trace_probe_event_from_call(struct trace_event_call *event_call) +{ + return container_of(event_call, struct trace_probe_event, call); +} + +static inline struct trace_probe * +trace_probe_primary_from_call(struct trace_event_call *call) +{ + struct trace_probe_event *tpe = trace_probe_event_from_call(call); + + return list_first_entry(&tpe->probes, struct trace_probe, list); +} + +static inline struct list_head *trace_probe_probe_list(struct trace_probe *tp) +{ + return &tp->event->probes; } static inline int trace_probe_unregister_event_call(struct trace_probe *tp) { /* tp->event is unregistered in trace_remove_event_call() */ - return trace_remove_event_call(&tp->call); + return trace_remove_event_call(&tp->event->call); } static inline bool trace_probe_has_single_file(struct trace_probe *tp) { - return !!list_is_singular(&tp->files); + return !!list_is_singular(&tp->event->files); } int trace_probe_init(struct trace_probe *tp, const char *event, @@ -298,9 +324,9 @@ struct event_file_link *trace_probe_get_file_link(struct trace_probe *tp, struct trace_event_file *file); #define trace_probe_for_each_link(pos, tp) \ - list_for_each_entry(pos, &(tp)->files, list) + list_for_each_entry(pos, &(tp)->event->files, list) #define trace_probe_for_each_link_rcu(pos, tp) \ - list_for_each_entry_rcu(pos, &(tp)->files, list) + list_for_each_entry_rcu(pos, &(tp)->event->files, list) /* Check the name is good for event/group/fields */ static inline bool is_good_name(const char *name) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 1ceedb9146b1..ac799abb7da9 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -293,6 +293,18 @@ static bool trace_uprobe_match(const char *system, const char *event, (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0); } +static nokprobe_inline struct trace_uprobe * +trace_uprobe_primary_from_call(struct trace_event_call *call) +{ + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return NULL; + + return container_of(tp, struct trace_uprobe, tp); +} + /* * Allocate new trace_uprobe and initialize it (including uprobes). 
*/ @@ -897,7 +909,10 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e u8 *data; entry = (struct uprobe_trace_entry_head *)iter->ent; - tu = container_of(event, struct trace_uprobe, tp.call.event); + tu = trace_uprobe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (unlikely(!tu)) + goto out; if (is_ret_probe(tu)) { trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", @@ -924,27 +939,71 @@ typedef bool (*filter_func_t)(struct uprobe_consumer *self, enum uprobe_filter_ctx ctx, struct mm_struct *mm); -static int -probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, - filter_func_t filter) +static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter) { - bool enabled = trace_probe_is_enabled(&tu->tp); int ret; + tu->consumer.filter = filter; + tu->inode = d_real_inode(tu->path.dentry); + + if (tu->ref_ctr_offset) + ret = uprobe_register_refctr(tu->inode, tu->offset, + tu->ref_ctr_offset, &tu->consumer); + else + ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + + if (ret) + tu->inode = NULL; + + return ret; +} + +static void __probe_event_disable(struct trace_probe *tp) +{ + struct trace_probe *pos; + struct trace_uprobe *tu; + + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + tu = container_of(pos, struct trace_uprobe, tp); + if (!tu->inode) + continue; + + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + + uprobe_unregister(tu->inode, tu->offset, &tu->consumer); + tu->inode = NULL; + } +} + +static int probe_event_enable(struct trace_event_call *call, + struct trace_event_file *file, filter_func_t filter) +{ + struct trace_probe *pos, *tp; + struct trace_uprobe *tu; + bool enabled; + int ret; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + enabled = trace_probe_is_enabled(tp); + + /* This may also change "enabled" state */ if (file) { - if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) + if (trace_probe_test_flag(tp, TP_FLAG_PROFILE)) return -EINTR; - ret = trace_probe_add_file(&tu->tp, file); + ret = trace_probe_add_file(tp, file); if (ret < 0) return ret; } else { - if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) + if (trace_probe_test_flag(tp, TP_FLAG_TRACE)) return -EINTR; - trace_probe_set_flag(&tu->tp, TP_FLAG_PROFILE); + trace_probe_set_flag(tp, TP_FLAG_PROFILE); } + tu = container_of(tp, struct trace_uprobe, tp); WARN_ON(!uprobe_filter_is_empty(&tu->filter)); if (enabled) @@ -954,18 +1013,15 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, if (ret) goto err_flags; - tu->consumer.filter = filter; - tu->inode = d_real_inode(tu->path.dentry); - if (tu->ref_ctr_offset) { - ret = uprobe_register_refctr(tu->inode, tu->offset, - tu->ref_ctr_offset, &tu->consumer); - } else { - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + tu = container_of(pos, struct trace_uprobe, tp); + ret = trace_uprobe_enable(tu, filter); + if (ret) { + __probe_event_disable(tp); + goto err_buffer; + } } - if (ret) - goto err_buffer; - return 0; err_buffer: @@ -973,33 +1029,35 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, err_flags: if (file) - trace_probe_remove_file(&tu->tp, file); + trace_probe_remove_file(tp, file); else - trace_probe_clear_flag(&tu->tp, TP_FLAG_PROFILE); + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); return ret; } -static void -probe_event_disable(struct trace_uprobe *tu, struct 
trace_event_file *file) +static void probe_event_disable(struct trace_event_call *call, + struct trace_event_file *file) { - if (!trace_probe_is_enabled(&tu->tp)) + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return; + + if (!trace_probe_is_enabled(tp)) return; if (file) { - if (trace_probe_remove_file(&tu->tp, file) < 0) + if (trace_probe_remove_file(tp, file) < 0) return; - if (trace_probe_is_enabled(&tu->tp)) + if (trace_probe_is_enabled(tp)) return; } else - trace_probe_clear_flag(&tu->tp, TP_FLAG_PROFILE); - - WARN_ON(!uprobe_filter_is_empty(&tu->filter)); - - uprobe_unregister(tu->inode, tu->offset, &tu->consumer); - tu->inode = NULL; + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + __probe_event_disable(tp); uprobe_buffer_disable(); } @@ -1007,7 +1065,11 @@ static int uprobe_event_define_fields(struct trace_event_call *event_call) { int ret, size; struct uprobe_trace_entry_head field; - struct trace_uprobe *tu = event_call->data; + struct trace_uprobe *tu; + + tu = trace_uprobe_primary_from_call(event_call); + if (unlikely(!tu)) + return -ENODEV; if (is_ret_probe(tu)) { DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0); @@ -1100,6 +1162,27 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) return err; } +static int uprobe_perf_multi_call(struct trace_event_call *call, + struct perf_event *event, + int (*op)(struct trace_uprobe *tu, struct perf_event *event)) +{ + struct trace_probe *pos, *tp; + struct trace_uprobe *tu; + int ret = 0; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + tu = container_of(pos, struct trace_uprobe, tp); + ret = op(tu, event); + if (ret) + break; + } + + return ret; +} static bool uprobe_perf_filter(struct uprobe_consumer *uc, enum uprobe_filter_ctx ctx, struct mm_struct *mm) { @@ -1213,30 +1296,29 @@ static int trace_uprobe_register(struct trace_event_call *event, enum trace_reg type, void *data) { - struct trace_uprobe *tu = event->data; struct trace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: - return probe_event_enable(tu, file, NULL); + return probe_event_enable(event, file, NULL); case TRACE_REG_UNREGISTER: - probe_event_disable(tu, file); + probe_event_disable(event, file); return 0; #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return probe_event_enable(tu, NULL, uprobe_perf_filter); + return probe_event_enable(event, NULL, uprobe_perf_filter); case TRACE_REG_PERF_UNREGISTER: - probe_event_disable(tu, NULL); + probe_event_disable(event, NULL); return 0; case TRACE_REG_PERF_OPEN: - return uprobe_perf_open(tu, data); + return uprobe_perf_multi_call(event, data, uprobe_perf_open); case TRACE_REG_PERF_CLOSE: - return uprobe_perf_close(tu, data); + return uprobe_perf_multi_call(event, data, uprobe_perf_close); #endif default: @@ -1330,7 +1412,6 @@ static inline void init_trace_event_call(struct trace_uprobe *tu) call->flags = TRACE_EVENT_FL_UPROBE | TRACE_EVENT_FL_CAP_ANY; call->class->reg = trace_uprobe_register; - call->data = tu; } static int register_uprobe_event(struct trace_uprobe *tu) @@ -1399,7 +1480,7 @@ void destroy_local_trace_uprobe(struct trace_event_call *event_call) { struct trace_uprobe *tu; - tu = container_of(event_call, struct trace_uprobe, tp.call); + tu = trace_uprobe_primary_from_call(event_call); free_trace_uprobe(tu); } -- cgit v1.2.3 From cb8e7a8d55e052fdcfd1a567305a9a180fb61c57 Mon Sep 17 00:00:00 
2001 From: Masami Hiramatsu Date: Thu, 20 Jun 2019 00:07:29 +0900 Subject: tracing/dynevent: Delete all matched events When a user gives an event name to delete, delete all matched events instead of only the first one. This means that if there are several events which have the same name but different group (subsystem) names, all of them are removed if the user passes only the event name, e.g. # cat kprobe_events p:group1/testevent _do_fork p:group2/testevent fork_idle # echo -:testevent >> kprobe_events # cat kprobe_events # Link: http://lkml.kernel.org/r/156095684958.28024.16597826267117453638.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_dynevent.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index fa100ed3b4de..1cc55c50c491 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -61,10 +61,12 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type) for_each_dyn_event_safe(pos, n) { if (type && type != pos->ops) continue; - if (pos->ops->match(system, event, pos)) { - ret = pos->ops->free(pos); + if (!pos->ops->match(system, event, pos)) + continue; + + ret = pos->ops->free(pos); + if (ret) break; - } } mutex_unlock(&event_mutex); -- cgit v1.2.3 From 30199137c899d7e416a2adc58bf09bec217ce9ca Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 20 Jun 2019 00:07:39 +0900 Subject: tracing/dynevent: Pass extra arguments to match operation Pass extra arguments to the match operation for checking an exact match. If the event doesn't support exact matching, the extra arguments are ignored. Link: http://lkml.kernel.org/r/156095685930.28024.10405547027475590975.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_dynevent.c | 4 +++- kernel/trace/trace_dynevent.h | 7 ++++--- kernel/trace/trace_events_hist.c | 4 ++-- kernel/trace/trace_kprobe.c | 4 ++-- kernel/trace/trace_uprobe.c | 4 ++-- 5 files changed, 13 insertions(+), 10 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 1cc55c50c491..a41fed46c285 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -47,6 +47,7 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type) return -EINVAL; event++; } + argc--; argv++; p = strchr(event, '/'); if (p) { @@ -61,7 +62,8 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type) for_each_dyn_event_safe(pos, n) { if (type && type != pos->ops) continue; - if (!pos->ops->match(system, event, pos)) + if (!pos->ops->match(system, event, + argc, (const char **)argv, pos)) continue; ret = pos->ops->free(pos); diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h index 8c334064e4d6..46898138d2df 100644 --- a/kernel/trace/trace_dynevent.h +++ b/kernel/trace/trace_dynevent.h @@ -31,8 +31,9 @@ struct dyn_event; * @is_busy: Check whether given event is busy so that it can not be deleted. * Return true if it is busy, otherwise false. * @free: Delete the given event. Return 0 if success, otherwise error. - * @match: Check whether given event and system name match this event. - * Return true if it matches, otherwise false. + * @match: Check whether given event and system name match this event. The argc + * and argv are used for exact matching. Return true if it matches, otherwise + * false.
* * Except for @create, these methods are called under holding event_mutex. */ @@ -43,7 +44,7 @@ struct dyn_event_operations { bool (*is_busy)(struct dyn_event *ev); int (*free)(struct dyn_event *ev); bool (*match)(const char *system, const char *event, - struct dyn_event *ev); + int argc, const char **argv, struct dyn_event *ev); }; /* Register new dyn_event type -- must be called at first */ diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index ca6b0dff60c5..65e7d071ed28 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -374,7 +374,7 @@ static int synth_event_show(struct seq_file *m, struct dyn_event *ev); static int synth_event_release(struct dyn_event *ev); static bool synth_event_is_busy(struct dyn_event *ev); static bool synth_event_match(const char *system, const char *event, - struct dyn_event *ev); + int argc, const char **argv, struct dyn_event *ev); static struct dyn_event_operations synth_event_ops = { .create = synth_event_create, @@ -422,7 +422,7 @@ static bool synth_event_is_busy(struct dyn_event *ev) } static bool synth_event_match(const char *system, const char *event, - struct dyn_event *ev) + int argc, const char **argv, struct dyn_event *ev) { struct synth_event *sev = to_synth_event(ev); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index eac6344a2e7c..e8f72431b866 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -39,7 +39,7 @@ static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev); static int trace_kprobe_release(struct dyn_event *ev); static bool trace_kprobe_is_busy(struct dyn_event *ev); static bool trace_kprobe_match(const char *system, const char *event, - struct dyn_event *ev); + int argc, const char **argv, struct dyn_event *ev); static struct dyn_event_operations trace_kprobe_ops = { .create = trace_kprobe_create, @@ -138,7 +138,7 @@ static bool trace_kprobe_is_busy(struct dyn_event *ev) } static bool trace_kprobe_match(const char *system, const char *event, - struct dyn_event *ev) + int argc, const char **argv, struct dyn_event *ev) { struct trace_kprobe *tk = to_trace_kprobe(ev); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index ac799abb7da9..2862e6829e48 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -44,7 +44,7 @@ static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev); static int trace_uprobe_release(struct dyn_event *ev); static bool trace_uprobe_is_busy(struct dyn_event *ev); static bool trace_uprobe_match(const char *system, const char *event, - struct dyn_event *ev); + int argc, const char **argv, struct dyn_event *ev); static struct dyn_event_operations trace_uprobe_ops = { .create = trace_uprobe_create, @@ -285,7 +285,7 @@ static bool trace_uprobe_is_busy(struct dyn_event *ev) } static bool trace_uprobe_match(const char *system, const char *event, - struct dyn_event *ev) + int argc, const char **argv, struct dyn_event *ev) { struct trace_uprobe *tu = to_trace_uprobe(ev); -- cgit v1.2.3 From ca89bc071d5e4e981dcc52e0ca90f4500d332e42 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 20 Jun 2019 00:07:49 +0900 Subject: tracing/kprobe: Add multi-probe per event support Add multi-probe per event support to kprobe events. A user can define several different probes on one trace event if those probes have the same "event signature", e.g.
# echo p:testevent _do_fork > kprobe_events # echo p:testevent fork_idle >> kprobe_events # cat kprobe_events p:kprobes/testevent _do_fork p:kprobes/testevent fork_idle The event signature is defined by the kprobe type (retprobe or not), the number of args, argument names, and argument types. Note that this only supports the appending method; the delete operation will delete all probes on the event. Link: http://lkml.kernel.org/r/156095686913.28024.9357292202316540742.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 +-- kernel/trace/trace_kprobe.c | 52 ++++++++++++++++++++++++++++++++++----- kernel/trace/trace_probe.c | 59 ++++++++++++++++++++++++++++++++++++++------- kernel/trace/trace_probe.h | 14 ++++++++++- 4 files changed, 111 insertions(+), 18 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 563e80f9006a..a8505d84b76e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4815,11 +4815,11 @@ static const char readme_msg[] = #endif #endif /* CONFIG_STACK_TRACER */ #ifdef CONFIG_DYNAMIC_EVENTS - " dynamic_events\t\t- Add/remove/show the generic dynamic events\n" + " dynamic_events\t\t- Create/append/remove/show the generic dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif #ifdef CONFIG_KPROBE_EVENTS - " kprobe_events\t\t- Add/remove/show the kernel dynamic events\n" + " kprobe_events\t\t- Create/append/remove/show the kernel dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif #ifdef CONFIG_UPROBE_EVENTS diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index e8f72431b866..f43098bf62dd 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -492,6 +492,10 @@ static void __unregister_trace_kprobe(struct trace_kprobe *tk) /* Unregister a trace_probe and probe_event */ static int unregister_trace_kprobe(struct trace_kprobe *tk) { + /* If other probes are on the event, just unregister kprobe */ + if (trace_probe_has_sibling(&tk->tp)) + goto unreg; + /* Enabled event can not be unregistered */ if (trace_probe_is_enabled(&tk->tp)) return -EBUSY; @@ -500,12 +504,38 @@ static int unregister_trace_kprobe(struct trace_kprobe *tk) if (unregister_kprobe_event(tk)) return -EBUSY; +unreg: __unregister_trace_kprobe(tk); dyn_event_remove(&tk->devent); + trace_probe_unlink(&tk->tp); return 0; } +static int append_trace_kprobe(struct trace_kprobe *tk, struct trace_kprobe *to) +{ + int ret; + + /* Append to existing event */ + ret = trace_probe_append(&tk->tp, &to->tp); + if (ret) + return ret; + + /* Register k*probe */ + ret = __register_trace_kprobe(tk); + if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) { + pr_warn("This probe might be able to register after target module is loaded.
Continue.\n"); + ret = 0; + } + + if (ret) + trace_probe_unlink(&tk->tp); + else + dyn_event_add(&tk->devent); + + return ret; +} + /* Register a trace_probe and probe_event */ static int register_trace_kprobe(struct trace_kprobe *tk) { @@ -514,14 +544,24 @@ static int register_trace_kprobe(struct trace_kprobe *tk) mutex_lock(&event_mutex); - /* Delete old (same name) event if exist */ old_tk = find_trace_kprobe(trace_probe_name(&tk->tp), trace_probe_group_name(&tk->tp)); if (old_tk) { - ret = unregister_trace_kprobe(old_tk); - if (ret < 0) - goto end; - free_trace_kprobe(old_tk); + if (trace_kprobe_is_return(tk) != trace_kprobe_is_return(old_tk)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, DIFF_PROBE_TYPE); + ret = -EEXIST; + } else { + ret = trace_probe_compare_arg_type(&tk->tp, &old_tk->tp); + if (ret) { + /* Note that argument starts index = 2 */ + trace_probe_log_set_index(ret + 1); + trace_probe_log_err(0, DIFF_ARG_TYPE); + ret = -EEXIST; + } else + ret = append_trace_kprobe(tk, old_tk); + } + goto end; } /* Register new event */ @@ -755,7 +795,7 @@ static int trace_kprobe_create(int argc, const char *argv[]) trace_probe_log_err(0, BAD_INSN_BNDRY); else if (ret == -ENOENT) trace_probe_log_err(0, BAD_PROBE_ADDR); - else if (ret != -ENOMEM) + else if (ret != -ENOMEM && ret != -EEXIST) trace_probe_log_err(0, FAIL_REG_PROBE); goto error; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 28733bd6b607..651a1449acde 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -886,6 +886,35 @@ int traceprobe_define_arg_fields(struct trace_event_call *event_call, return 0; } +static void trace_probe_event_free(struct trace_probe_event *tpe) +{ + kfree(tpe->class.system); + kfree(tpe->call.name); + kfree(tpe->call.print_fmt); + kfree(tpe); +} + +int trace_probe_append(struct trace_probe *tp, struct trace_probe *to) +{ + if (trace_probe_has_sibling(tp)) + return -EBUSY; + + list_del_init(&tp->list); + trace_probe_event_free(tp->event); + + tp->event = to->event; + list_add_tail(&tp->list, trace_probe_probe_list(to)); + + return 0; +} + +void trace_probe_unlink(struct trace_probe *tp) +{ + list_del_init(&tp->list); + if (list_empty(trace_probe_probe_list(tp))) + trace_probe_event_free(tp->event); + tp->event = NULL; +} void trace_probe_cleanup(struct trace_probe *tp) { @@ -894,15 +923,8 @@ void trace_probe_cleanup(struct trace_probe *tp) for (i = 0; i < tp->nr_args; i++) traceprobe_free_probe_arg(&tp->args[i]); - if (tp->event) { - struct trace_event_call *call = trace_probe_event_call(tp); - - kfree(tp->event->class.system); - kfree(call->name); - kfree(call->print_fmt); - kfree(tp->event); - tp->event = NULL; - } + if (tp->event) + trace_probe_unlink(tp); } int trace_probe_init(struct trace_probe *tp, const char *event, @@ -1006,3 +1028,22 @@ int trace_probe_remove_file(struct trace_probe *tp, return 0; } + +/* + * Return the smallest index of different type argument (start from 1). + * If all argument types and name are same, return 0. 
+ */ +int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b) +{ + int i; + + for (i = 0; i < a->nr_args; i++) { + if ((b->nr_args <= i) || + ((a->args[i].type != b->args[i].type) || + (a->args[i].count != b->args[i].count) || + strcmp(a->args[i].name, b->args[i].name))) + return i + 1; + } + + return 0; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 0b84abb884c2..39926e8a344b 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -302,6 +302,13 @@ static inline struct list_head *trace_probe_probe_list(struct trace_probe *tp) return &tp->event->probes; } +static inline bool trace_probe_has_sibling(struct trace_probe *tp) +{ + struct list_head *list = trace_probe_probe_list(tp); + + return !list_empty(list) && !list_is_singular(list); +} + static inline int trace_probe_unregister_event_call(struct trace_probe *tp) { /* tp->event is unregistered in trace_remove_event_call() */ @@ -316,12 +323,15 @@ static inline bool trace_probe_has_single_file(struct trace_probe *tp) int trace_probe_init(struct trace_probe *tp, const char *event, const char *group); void trace_probe_cleanup(struct trace_probe *tp); +int trace_probe_append(struct trace_probe *tp, struct trace_probe *to); +void trace_probe_unlink(struct trace_probe *tp); int trace_probe_register_event_call(struct trace_probe *tp); int trace_probe_add_file(struct trace_probe *tp, struct trace_event_file *file); int trace_probe_remove_file(struct trace_probe *tp, struct trace_event_file *file); struct event_file_link *trace_probe_get_file_link(struct trace_probe *tp, struct trace_event_file *file); +int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b); #define trace_probe_for_each_link(pos, tp) \ list_for_each_entry(pos, &(tp)->event->files, list) @@ -419,7 +429,9 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(ARG_TOO_LONG, "Argument expression is too long"), \ C(NO_ARG_BODY, "No argument expression"), \ C(BAD_INSN_BNDRY, "Probe point is not an instruction boundary"),\ - C(FAIL_REG_PROBE, "Failed to register probe event"), + C(FAIL_REG_PROBE, "Failed to register probe event"),\ + C(DIFF_PROBE_TYPE, "Probe type is different from existing probe"),\ + C(DIFF_ARG_TYPE, "Argument type or name is different from existing probe"), #undef C #define C(a, b) TP_ERR_##a -- cgit v1.2.3 From 41af3cf587f476f9a879b08219324c8b456e6a4c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 20 Jun 2019 00:07:58 +0900 Subject: tracing/uprobe: Add multi-probe per uprobe event support Allow a user to define several probes on one uprobe event. Note that this only supports the appending method, so deleting the event will delete all probes on the event.
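For orientation, every probe appended to a multi-probe event hangs off the shared trace_probe_event, and the series keeps visiting the siblings with the same list walk. A minimal sketch of that pattern, built from the list helpers added earlier in the series (the wrapper function and its name are hypothetical, not part of the patch):

	static void for_each_sibling_uprobe(struct trace_probe *tp)
	{
		struct trace_probe *pos;
		struct trace_uprobe *tu;

		/* tp->event->probes links all probes sharing this event */
		list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
			tu = container_of(pos, struct trace_uprobe, tp);
			/* act on each sibling uprobe here */
			(void)tu;
		}
	}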
Link: http://lkml.kernel.org/r/156095687876.28024.13840331032234992863.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- kernel/trace/trace_uprobe.c | 60 +++++++++++++++++++++++++++++++-------------- 2 files changed, 43 insertions(+), 19 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a8505d84b76e..c7797a81a37e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4823,7 +4823,7 @@ static const char readme_msg[] = "\t\t\t Write into this file to define/undefine new trace events.\n" #endif #ifdef CONFIG_UPROBE_EVENTS - " uprobe_events\t\t- Add/remove/show the userspace dynamic events\n" + " uprobe_events\t\t- Create/append/remove/show the userspace dynamic events\n" "\t\t\t Write into this file to define/undefine new trace events.\n" #endif #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2862e6829e48..d84e09abb8de 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -364,15 +364,32 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu) { int ret; + if (trace_probe_has_sibling(&tu->tp)) + goto unreg; + ret = unregister_uprobe_event(tu); if (ret) return ret; +unreg: dyn_event_remove(&tu->devent); + trace_probe_unlink(&tu->tp); free_trace_uprobe(tu); return 0; } +static int append_trace_uprobe(struct trace_uprobe *tu, struct trace_uprobe *to) +{ + int ret; + + /* Append to existing event */ + ret = trace_probe_append(&tu->tp, &to->tp); + if (!ret) + dyn_event_add(&tu->devent); + + return ret; +} + /* * Uprobe with multiple reference counter is not allowed. i.e. * If inode and offset matches, reference counter offset *must* @@ -382,25 +399,21 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu) * as the new one does not conflict with any other existing * ones. */ -static struct trace_uprobe *find_old_trace_uprobe(struct trace_uprobe *new) +static int validate_ref_ctr_offset(struct trace_uprobe *new) { struct dyn_event *pos; - struct trace_uprobe *tmp, *old = NULL; + struct trace_uprobe *tmp; struct inode *new_inode = d_real_inode(new->path.dentry); - old = find_probe_event(trace_probe_name(&new->tp), - trace_probe_group_name(&new->tp)); - for_each_trace_uprobe(tmp, pos) { - if ((old ? 
old != tmp : true) && - new_inode == d_real_inode(tmp->path.dentry) && + if (new_inode == d_real_inode(tmp->path.dentry) && new->offset == tmp->offset && new->ref_ctr_offset != tmp->ref_ctr_offset) { pr_warn("Reference counter offset mismatch."); - return ERR_PTR(-EINVAL); + return -EINVAL; } } - return old; + return 0; } /* Register a trace_uprobe and probe_event */ @@ -411,18 +424,29 @@ static int register_trace_uprobe(struct trace_uprobe *tu) mutex_lock(&event_mutex); - /* register as an event */ - old_tu = find_old_trace_uprobe(tu); - if (IS_ERR(old_tu)) { - ret = PTR_ERR(old_tu); + ret = validate_ref_ctr_offset(tu); + if (ret) goto end; - } + /* register as an event */ + old_tu = find_probe_event(trace_probe_name(&tu->tp), + trace_probe_group_name(&tu->tp)); if (old_tu) { - /* delete old event */ - ret = unregister_trace_uprobe(old_tu); - if (ret) - goto end; + if (is_ret_probe(tu) != is_ret_probe(old_tu)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, DIFF_PROBE_TYPE); + ret = -EEXIST; + } else { + ret = trace_probe_compare_arg_type(&tu->tp, &old_tu->tp); + if (ret) { + /* Note that argument starts index = 2 */ + trace_probe_log_set_index(ret + 1); + trace_probe_log_err(0, DIFF_ARG_TYPE); + ret = -EEXIST; + } else + ret = append_trace_uprobe(tu, old_tu); + } + goto end; } ret = register_uprobe_event(tu); -- cgit v1.2.3 From eb5bf81330a722d0079d28eed13d3a9355d938bf Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 20 Jun 2019 00:08:08 +0900 Subject: tracing/kprobe: Add per-probe delete from event Allow a user to delete a probe from an event. This is done by head match. For example, if we have 2 probes on an event: $ cat kprobe_events p:kprobes/testprobe _do_fork r1=%ax r2=%dx p:kprobes/testprobe idle_fork r1=%ax r2=%cx Then you can remove one of them by passing the head of the definition, which identifies the probe.
$ echo "-:kprobes/testprobe idle_fork" >> kprobe_events Link: http://lkml.kernel.org/r/156095688848.28024.15798690082378432435.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 25 ++++++++++++++++++++++++- kernel/trace/trace_probe.c | 18 ++++++++++++++++++ kernel/trace/trace_probe.h | 2 ++ 3 files changed, 44 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f43098bf62dd..18c4175b6585 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -137,13 +137,36 @@ static bool trace_kprobe_is_busy(struct dyn_event *ev) return trace_probe_is_enabled(&tk->tp); } +static bool trace_kprobe_match_command_head(struct trace_kprobe *tk, + int argc, const char **argv) +{ + char buf[MAX_ARGSTR_LEN + 1]; + + if (!argc) + return true; + + if (!tk->symbol) + snprintf(buf, sizeof(buf), "0x%p", tk->rp.kp.addr); + else if (tk->rp.kp.offset) + snprintf(buf, sizeof(buf), "%s+%u", + trace_kprobe_symbol(tk), tk->rp.kp.offset); + else + snprintf(buf, sizeof(buf), "%s", trace_kprobe_symbol(tk)); + if (strcmp(buf, argv[0])) + return false; + argc--; argv++; + + return trace_probe_match_command_args(&tk->tp, argc, argv); +} + static bool trace_kprobe_match(const char *system, const char *event, int argc, const char **argv, struct dyn_event *ev) { struct trace_kprobe *tk = to_trace_kprobe(ev); return strcmp(trace_probe_name(&tk->tp), event) == 0 && - (!system || strcmp(trace_probe_group_name(&tk->tp), system) == 0); + (!system || strcmp(trace_probe_group_name(&tk->tp), system) == 0) && + trace_kprobe_match_command_head(tk, argc, argv); } static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 651a1449acde..f8c3c65c035d 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1047,3 +1047,21 @@ int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b) return 0; } + +bool trace_probe_match_command_args(struct trace_probe *tp, + int argc, const char **argv) +{ + char buf[MAX_ARGSTR_LEN + 1]; + int i; + + if (tp->nr_args < argc) + return false; + + for (i = 0; i < argc; i++) { + snprintf(buf, sizeof(buf), "%s=%s", + tp->args[i].name, tp->args[i].comm); + if (strcmp(buf, argv[i])) + return false; + } + return true; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 39926e8a344b..2dcc4e317787 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -332,6 +332,8 @@ int trace_probe_remove_file(struct trace_probe *tp, struct event_file_link *trace_probe_get_file_link(struct trace_probe *tp, struct trace_event_file *file); int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b); +bool trace_probe_match_command_args(struct trace_probe *tp, + int argc, const char **argv); #define trace_probe_for_each_link(pos, tp) \ list_for_each_entry(pos, &(tp)->event->files, list) -- cgit v1.2.3 From ab10d69eb714961d1eca4129e4f8cda5e0618f66 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 20 Jun 2019 00:08:18 +0900 Subject: tracing/uprobe: Add per-probe delete from event Add per-probe delete method from one event passing the head of definition. In other words, the events which match the head N parameters are deleted. 
Link: http://lkml.kernel.org/r/156095689811.28024.221706761151739433.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d84e09abb8de..84925b5b6db5 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -284,13 +284,42 @@ static bool trace_uprobe_is_busy(struct dyn_event *ev) return trace_probe_is_enabled(&tu->tp); } +static bool trace_uprobe_match_command_head(struct trace_uprobe *tu, + int argc, const char **argv) +{ + char buf[MAX_ARGSTR_LEN + 1]; + int len; + + if (!argc) + return true; + + len = strlen(tu->filename); + if (strncmp(tu->filename, argv[0], len) || argv[0][len] != ':') + return false; + + if (tu->ref_ctr_offset == 0) + snprintf(buf, sizeof(buf), "0x%0*lx", + (int)(sizeof(void *) * 2), tu->offset); + else + snprintf(buf, sizeof(buf), "0x%0*lx(0x%lx)", + (int)(sizeof(void *) * 2), tu->offset, + tu->ref_ctr_offset); + if (strcmp(buf, &argv[0][len + 1])) + return false; + + argc--; argv++; + + return trace_probe_match_command_args(&tu->tp, argc, argv); +} + static bool trace_uprobe_match(const char *system, const char *event, int argc, const char **argv, struct dyn_event *ev) { struct trace_uprobe *tu = to_trace_uprobe(ev); return strcmp(trace_probe_name(&tu->tp), event) == 0 && - (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0); + (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0) && + trace_uprobe_match_command_head(tu, argc, argv); } static nokprobe_inline struct trace_uprobe * -- cgit v1.2.3 From 6218bf9f4d2942e88d97b60abc8c2ca0532e41a8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 20 Jun 2019 00:08:27 +0900 Subject: tracing/probe: Add immediate parameter support Add immediate value parameter (\1234) support to probe events. This allows you to specify an immediate (or dummy) parameter instead of fetching from memory or a register. This feature looks odd, but imagine the case where you put a probe on code to trace some data: if the code is compiled into 2 instructions and 1 instruction has the value but the other has nothing because it was optimized out, then you cannot fold those into one event, even though ftrace supports multiple probes on one event. With this feature, you can set a dummy value like foo=\deadbeef instead of something like foo=%di. Link: http://lkml.kernel.org/r/156095690733.28024.13258186548822649469.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/kprobetrace.rst | 1 + Documentation/trace/uprobetracer.rst | 1 + kernel/trace/trace.c | 2 +- kernel/trace/trace_probe.c | 18 ++++++++++++++++++ kernel/trace/trace_probe.h | 1 + 5 files changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst index fbb314bfa112..55993055902c 100644 --- a/Documentation/trace/kprobetrace.rst +++ b/Documentation/trace/kprobetrace.rst @@ -52,6 +52,7 @@ Synopsis of kprobe_events $retval : Fetch return value.(\*2) $comm : Fetch current task comm. +|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*3)(\*4) + \IMM : Store an immediate value to the argument. NAME=FETCHARG : Set NAME as the argument name of FETCHARG. FETCHARG:TYPE : Set TYPE as the type of FETCHARG.
Currently, basic types (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types diff --git a/Documentation/trace/uprobetracer.rst b/Documentation/trace/uprobetracer.rst index 6e75a6c5a2c8..98cde99939d7 100644 --- a/Documentation/trace/uprobetracer.rst +++ b/Documentation/trace/uprobetracer.rst @@ -45,6 +45,7 @@ Synopsis of uprobe_tracer $retval : Fetch return value.(\*1) $comm : Fetch current task comm. +|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*2)(\*3) + \IMM : Store an immediate value to the argument. NAME=FETCHARG : Set NAME as the argument name of FETCHARG. FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c7797a81a37e..fb4003c10151 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4848,7 +4848,7 @@ static const char readme_msg[] = #else "\t $stack<index>, $stack, $retval, $comm,\n" #endif - "\t +|-[u]<offset>(<fetcharg>)\n" + "\t +|-[u]<offset>(<fetcharg>), \\imm-value\n" "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n" "\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n" "\t <type>\\[<array-size>\\]\n" diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index f8c3c65c035d..fb90baec3cd8 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -316,6 +316,17 @@ inval_var: return -EINVAL; } +static int str_to_immediate(char *str, unsigned long *imm) +{ + if (isdigit(str[0])) + return kstrtoul(str, 0, imm); + else if (str[0] == '-') + return kstrtol(str, 0, (long *)imm); + else if (str[0] == '+') + return kstrtol(str + 1, 0, (long *)imm); + return -EINVAL; +} + /* Recursive argument parser */ static int parse_probe_arg(char *arg, const struct fetch_type *type, @@ -444,6 +455,13 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->offset = offset; } break; + case '\\': /* Immediate value */ + ret = str_to_immediate(arg + 1, &code->immediate); + if (ret) + trace_probe_log_err(offs + 1, BAD_IMM); + else + code->op = FETCH_OP_IMM; + break; } if (!ret && code->op == FETCH_OP_NOP) { /* Parsed, but do not find fetch method */ diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 2dcc4e317787..cc113b82a4ce 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -408,6 +408,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(BAD_VAR, "Invalid $-variable specified"), \ C(BAD_REG_NAME, "Invalid register name"), \ C(BAD_MEM_ADDR, "Invalid memory address"), \ + C(BAD_IMM, "Invalid immediate value"), \ C(FILE_ON_KPROBE, "File offset is not available with kprobe"), \ C(BAD_FILE_OFFS, "Invalid file offset value"), \ C(SYM_ON_UPROBE, "Symbol is not available with uprobe"), \ -- cgit v1.2.3 From a42e3c4de9642d5de524a0a48a7ce96872662dca Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 20 Jun 2019 00:08:37 +0900 Subject: tracing/probe: Add immediate string parameter support Add immediate string parameter (\"string") support to probe events. This allows you to specify an immediate (or dummy) parameter instead of fetching a string from memory. This feature looks odd, but imagine that you put a probe on code to trace some string data: if the code is compiled into 2 instructions and 1 instruction has a string in memory but the other has no string because it was optimized out, then you cannot fold those into one event, even though ftrace supports multiple probes on one event.
With this feature, you can set a dummy string like foo=\"(optimized)":string instead of something like foo=+0(+0(%bp)):string. Link: http://lkml.kernel.org/r/156095691687.28024.13372712423865047991.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- kernel/trace/trace_kprobe.c | 3 +++ kernel/trace/trace_probe.c | 56 +++++++++++++++++++++++++++++++++------------ kernel/trace/trace_probe.h | 2 ++ kernel/trace/trace_uprobe.c | 3 +++ 5 files changed, 51 insertions(+), 15 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fb4003c10151..3916b72de715 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4848,7 +4848,7 @@ static const char readme_msg[] = #else "\t $stack<index>, $stack, $retval, $comm,\n" #endif - "\t +|-[u]<offset>(<fetcharg>), \\imm-value\n" + "\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n" "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n" "\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n" "\t <type>\\[<array-size>\\]\n" diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 18c4175b6585..7579c53bb053 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1083,6 +1083,9 @@ retry: case FETCH_OP_COMM: val = (unsigned long)current->comm; break; + case FETCH_OP_DATA: + val = (unsigned long)code->data; + break; #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API case FETCH_OP_ARG: val = regs_get_kernel_argument(regs, code->param); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index fb90baec3cd8..1e67fef06e53 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -327,6 +327,18 @@ static int str_to_immediate(char *str, unsigned long *imm) return -EINVAL; } +static int __parse_imm_string(char *str, char **pbuf, int offs) +{ + size_t len = strlen(str); + + if (str[len - 1] != '"') { + trace_probe_log_err(offs + len, IMMSTR_NO_CLOSE); + return -EINVAL; + } + *pbuf = kstrndup(str, len - 1, GFP_KERNEL); + return 0; +} + /* Recursive argument parser */ static int parse_probe_arg(char *arg, const struct fetch_type *type, @@ -441,7 +453,8 @@ parse_probe_arg(char *arg, const struct fetch_type *type, ret = parse_probe_arg(arg, t2, &code, end, flags, offs); if (ret) break; - if (code->op == FETCH_OP_COMM) { + if (code->op == FETCH_OP_COMM || + code->op == FETCH_OP_DATA) { trace_probe_log_err(offs, COMM_CANT_DEREF); return -EINVAL; } @@ -456,11 +469,19 @@ parse_probe_arg(char *arg, const struct fetch_type *type, } break; case '\\': /* Immediate value */ - ret = str_to_immediate(arg + 1, &code->immediate); - if (ret) - trace_probe_log_err(offs + 1, BAD_IMM); - else - code->op = FETCH_OP_IMM; + if (arg[1] == '"') { /* Immediate string */ + ret = __parse_imm_string(arg + 2, &tmp, offs + 2); + if (ret) + break; + code->op = FETCH_OP_DATA; + code->data = tmp; + } else { + ret = str_to_immediate(arg + 1, &code->immediate); + if (ret) + trace_probe_log_err(offs + 1, BAD_IMM); + else + code->op = FETCH_OP_IMM; + } break; } if (!ret && code->op == FETCH_OP_NOP) { /* Parsed, but do not find fetch method */ trace_probe_log_err(offs, BAD_FETCH_ARG); ret = -EINVAL; } return ret; } @@ -560,8 +581,11 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, } } - /* Since $comm can not be dereferenced, we can find $comm by strcmp */ - if (strcmp(arg, "$comm") == 0) { + /* + * Since $comm and immediate string can not be dereferenced, + * we can find those by strcmp. + */ + if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) { /* The type of $comm must be "string", and not an array.
*/ if (parg->count || (t && strcmp(t, "string"))) return -EINVAL; @@ -598,7 +622,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, if (!strcmp(parg->type->name, "string") || !strcmp(parg->type->name, "ustring")) { if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF && - code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM) { + code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM && + code->op != FETCH_OP_DATA) { trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_STRING); ret = -EINVAL; @@ -607,9 +632,10 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM) || parg->count) { /* - * IMM and COMM is pointing actual address, those must - * be kept, and if parg->count != 0, this is an array - * of string pointers instead of string address itself. + * IMM, DATA and COMM is pointing actual address, those + * must be kept, and if parg->count != 0, this is an + * array of string pointers instead of string address + * itself. */ code++; if (code->op != FETCH_OP_NOP) { @@ -683,7 +709,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, fail: if (ret) { for (code = tmp; code < tmp + FETCH_INSN_MAX; code++) - if (code->op == FETCH_NOP_SYMBOL) + if (code->op == FETCH_NOP_SYMBOL || + code->op == FETCH_OP_DATA) kfree(code->data); } kfree(tmp); @@ -754,7 +781,8 @@ void traceprobe_free_probe_arg(struct probe_arg *arg) struct fetch_insn *code = arg->code; while (code && code->op != FETCH_OP_END) { - if (code->op == FETCH_NOP_SYMBOL) + if (code->op == FETCH_NOP_SYMBOL || + code->op == FETCH_OP_DATA) kfree(code->data); code++; } diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index cc113b82a4ce..f805cc4cbe7c 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -89,6 +89,7 @@ enum fetch_op { FETCH_OP_COMM, /* Current comm */ FETCH_OP_ARG, /* Function argument : .param */ FETCH_OP_FOFFS, /* File offset: .immediate */ + FETCH_OP_DATA, /* Allocated data: .data */ // Stage 2 (dereference) op FETCH_OP_DEREF, /* Dereference: .offset */ FETCH_OP_UDEREF, /* User-space Dereference: .offset */ @@ -409,6 +410,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(BAD_REG_NAME, "Invalid register name"), \ C(BAD_MEM_ADDR, "Invalid memory address"), \ C(BAD_IMM, "Invalid immediate value"), \ + C(IMMSTR_NO_CLOSE, "String is not closed with '\"'"), \ C(FILE_ON_KPROBE, "File offset is not available with kprobe"), \ C(BAD_FILE_OFFS, "Invalid file offset value"), \ C(SYM_ON_UPROBE, "Symbol is not available with uprobe"), \ diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 84925b5b6db5..cbf4da4bf367 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -248,6 +248,9 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, case FETCH_OP_COMM: val = FETCH_TOKEN_COMM; break; + case FETCH_OP_DATA: + val = (unsigned long)code->data; + break; case FETCH_OP_FOFFS: val = translate_user_vaddr(code->immediate); break; -- cgit v1.2.3 From f7edb451fa51e44e62177347ea7850aa0e901ea5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 7 Aug 2019 11:28:59 -0400 Subject: tracing/arm64: Have max stack tracer handle the case of return address after data Most archs (well at least x86) store the function call return address on the stack before storing the local variables for the function. 
The max stack tracer depends on this in its algorithm to display the stack size of each function it finds in the back trace. Some archs (arm64) may store the return address (from its link register) just before calling a nested function. There's no reason to save the link register on leaf functions, as it won't be updated. This breaks the algorithm of the max stack tracer. Add a new define ARCH_FTRACE_SHIFT_STACK_TRACER that an architecture may set if it stores the return address (link register) after it stores the function's local variables, and have the stack trace shift the values of the mapped stack size to the appropriate functions. Link: 20190802094103.163576-1-jiping.ma2@windriver.com Reported-by: Jiping Ma Acked-by: Will Deacon Signed-off-by: Steven Rostedt (VMware) --- arch/arm64/include/asm/ftrace.h | 13 +++++++++++++ kernel/trace/trace_stack.c | 14 ++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'kernel/trace') diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h index 5ab5200b2bdc..d48667b04c41 100644 --- a/arch/arm64/include/asm/ftrace.h +++ b/arch/arm64/include/asm/ftrace.h @@ -14,6 +14,19 @@ #define MCOUNT_ADDR ((unsigned long)_mcount) #define MCOUNT_INSN_SIZE AARCH64_INSN_SIZE +/* + * Currently, gcc tends to save the link register after the local variables + * on the stack. This causes the max stack tracer to report the function + * frame sizes for the wrong functions. By defining + * ARCH_FTRACE_SHIFT_STACK_TRACER, it will tell the stack tracer to expect + * to find the return address on the stack after the local variables have + * been set up. + * + * Note, this may change in the future, and we will need to deal with that + * if it were to happen. + */ +#define ARCH_FTRACE_SHIFT_STACK_TRACER 1 + #ifndef __ASSEMBLY__ #include diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 5d16f73898db..642a850af81a 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -158,6 +158,20 @@ static void check_stack(unsigned long ip, unsigned long *stack) i++; } +#ifdef ARCH_FTRACE_SHIFT_STACK_TRACER + /* + * Some archs will store the link register before calling + * nested functions. This means the saved return address + * comes after the local storage, and we need to shift + * for that. + */ + if (x > 1) { + memmove(&stack_trace_index[0], &stack_trace_index[1], + sizeof(stack_trace_index[0]) * (x - 1)); + x--; + } +#endif + stack_trace_nr_entries = x; if (task_stack_end_corrupted(current)) { -- cgit v1.2.3 From 58fe7a87db51ea00596187765dabfc2c4ea2b436 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 7 Aug 2019 12:27:30 -0400 Subject: tracing: Document the stack trace algorithm in the comments As the max stack tracer algorithm is not that easy to understand from the code, add comments that explain the algorithm and mention how ARCH_FTRACE_SHIFT_STACK_TRACER affects it. Link: http://lkml.kernel.org/r/20190806123455.487ac02b@gandalf.local.home Suggested-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_stack.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 642a850af81a..ec9a34a97129 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -53,6 +53,104 @@ static void print_max_stack(void) } } +/* + * The stack tracer looks for a maximum stack at each call from a function.
It + * registers a callback from ftrace, and in that callback it examines the stack + * size. It determines the stack size from the variable passed in, which is the + * address of a local variable in the stack_trace_call() callback function. + * The stack size is calculated by the address of the local variable to the top + * of the current stack. If that size is smaller than the currently saved max + * stack size, nothing more is done. + * + * If the size of the stack is greater than the maximum recorded size, then the + * following algorithm takes place. + * + * For architectures (like x86) that store the function's return address before + * saving the function's local variables, the stack will look something like + * this: + * + * [ top of stack ] + * 0: sys call entry frame + * 10: return addr to entry code + * 11: start of sys_foo frame + * 20: return addr to sys_foo + * 21: start of kernel_func_bar frame + * 30: return addr to kernel_func_bar + * 31: [ do trace stack here ] + * + * The save_stack_trace() is called returning all the functions it finds in the + * current stack. Which would be (from the bottom of the stack to the top): + * + * return addr to kernel_func_bar + * return addr to sys_foo + * return addr to entry code + * + * Now to figure out how much each of these functions' local variable size is, + * a search of the stack is made to find these values. When a match is made, it + * is added to the stack_dump_trace[] array. The offset into the stack is saved + * in the stack_trace_index[] array. The above example would show: + * + * stack_dump_trace[] | stack_trace_index[] + * ------------------ + ------------------- + * return addr to kernel_func_bar | 30 + * return addr to sys_foo | 20 + * return addr to entry | 10 + * + * The print_max_stack() function above, uses these values to print the size of + * each function's portion of the stack. + * + * for (i = 0; i < nr_entries; i++) { + * size = i == nr_entries - 1 ? stack_trace_index[i] : + * stack_trace_index[i] - stack_trace_index[i+1] + * print "%d %d %d %s\n", i, stack_trace_index[i], size, stack_dump_trace[i]); + * } + * + * The above shows + * + * depth size location + * ----- ---- -------- + * 0 30 10 kernel_func_bar + * 1 20 10 sys_foo + * 2 10 10 entry code + * + * Now for architectures that might save the return address after the functions + * local variables (saving the link register before calling nested functions), + * this will cause the stack to look a little different: + * + * [ top of stack ] + * 0: sys call entry frame + * 10: start of sys_foo_frame + * 19: return addr to entry code << lr saved before calling kernel_func_bar + * 20: start of kernel_func_bar frame + * 29: return addr to sys_foo_frame << lr saved before calling next function + * 30: [ do trace stack here ] + * + * Although the functions returned by save_stack_trace() may be the same, the + * placement in the stack will be different. Using the same algorithm as above + * would yield: + * + * stack_dump_trace[] | stack_trace_index[] + * ------------------ + ------------------- + * return addr to kernel_func_bar | 30 + * return addr to sys_foo | 29 + * return addr to entry | 19 + * + * Where the mapping is off by one: + * + * kernel_func_bar stack frame size is 29 - 19 not 30 - 29! + * + * To fix this, if the architecture sets ARCH_RET_ADDR_AFTER_LOCAL_VARS the + * values in stack_trace_index[] are shifted by one to and the number of + * stack trace entries is decremented by one. 
+ * + * stack_dump_trace[] | stack_trace_index[] + * ------------------ + ------------------- + * return addr to kernel_func_bar | 29 + * return addr to sys_foo | 19 + * + * Although the entry function is not displayed, the first function (sys_foo) + * will still include the stack size of it. + */ static void check_stack(unsigned long ip, unsigned long *stack) { unsigned long this_size, flags; unsigned long *p, *top, *start; -- cgit v1.2.3 From a47b53e95accfd2814efe39dfca06dbd45cd857a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 13 Aug 2019 12:14:35 -0400 Subject: tracing: Rename tracing_reset() to tracing_reset_cpu() The name tracing_reset() was a misnomer, as it really only reset a single CPU buffer. Rename it to tracing_reset_cpu() and also make it static and remove the prototype from trace.h, as it is only used in a single function. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 6 +++--- kernel/trace/trace.h | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3916b72de715..e917aa783675 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1854,7 +1854,7 @@ int __init register_tracer(struct tracer *type) return ret; } -void tracing_reset(struct trace_buffer *buf, int cpu) +static void tracing_reset_cpu(struct trace_buffer *buf, int cpu) { struct ring_buffer *buffer = buf->buffer; @@ -4251,7 +4251,7 @@ static int tracing_open(struct inode *inode, struct file *file) if (cpu == RING_BUFFER_ALL_CPUS) tracing_reset_online_cpus(trace_buf); else - tracing_reset(trace_buf, cpu); + tracing_reset_cpu(trace_buf, cpu); } if (file->f_mode & FMODE_READ) { @@ -6742,7 +6742,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, if (iter->cpu_file == RING_BUFFER_ALL_CPUS) tracing_reset_online_cpus(&tr->max_buffer); else - tracing_reset_cpu(&tr->max_buffer, iter->cpu_file); } break; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 005f08629b8b..26b0a08f3c7d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -677,7 +677,6 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu) int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); -void tracing_reset(struct trace_buffer *buf, int cpu); void tracing_reset_online_cpus(struct trace_buffer *buf); void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); -- cgit v1.2.3 From ac68154626ab7fe4ce5f424937c34f42a3e20c5b Mon Sep 17 00:00:00 2001 From: Zhengjun Xing Date: Fri, 12 Jul 2019 09:53:08 +0800 Subject: tracing: Add "gfp_t" support in synthetic_events Add "gfp_t" support to synthetic_events so that "gfp_t" type parameters in some functions can be traced. The gfp flags are printed as hex in addition to the human-readable flag string.
Example output: whoopsie-630 [000] ...1 78.969452: testevent: bar=b20 (GFP_ATOMIC|__GFP_ZERO) rcuc/0-11 [000] ...1 81.097555: testevent: bar=a20 (GFP_ATOMIC) rcuc/0-11 [000] ...1 81.583123: testevent: bar=a20 (GFP_ATOMIC) Link: http://lkml.kernel.org/r/20190712015308.9908-1-zhengjun.xing@linux.intel.com Signed-off-by: Zhengjun Xing [ Added printing of flag names ] Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 65e7d071ed28..3a6e42aa08e6 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -13,6 +13,10 @@ #include #include +/* for gfp flag names */ +#include <linux/trace_events.h> +#include <trace/events/mmflags.h> + #include "tracing_map.h" #include "trace.h" #include "trace_dynevent.h" @@ -752,6 +756,8 @@ static int synth_field_size(char *type) size = sizeof(unsigned long); else if (strcmp(type, "pid_t") == 0) size = sizeof(pid_t); + else if (strcmp(type, "gfp_t") == 0) + size = sizeof(gfp_t); else if (synth_field_is_string(type)) size = synth_field_string_size(type); @@ -792,6 +798,8 @@ static const char *synth_field_fmt(char *type) fmt = "%lu"; else if (strcmp(type, "pid_t") == 0) fmt = "%d"; + else if (strcmp(type, "gfp_t") == 0) + fmt = "%x"; else if (synth_field_is_string(type)) fmt = "%s"; @@ -834,9 +842,20 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, i == se->n_fields - 1 ? "" : " "); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { + struct trace_print_flags __flags[] = { + __def_gfpflag_names, {-1, NULL} }; + trace_seq_printf(s, print_fmt, se->fields[i]->name, entry->fields[n_u64], i == se->n_fields - 1 ? "" : " "); + + if (strcmp(se->fields[i]->type, "gfp_t") == 0) { + trace_seq_puts(s, " ("); + trace_print_flags_seq(s, "|", + entry->fields[n_u64], + __flags); + trace_seq_putc(s, ')'); + } n_u64++; } } -- cgit v1.2.3 From 08468754c16e731d31538a8b1b0b433be2410a89 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 10 Sep 2019 22:33:36 +0800 Subject: ftrace: Simplify ftrace hash lookup code in clear_func_from_hash() Function ftrace_lookup_ip() already checks for an empty hash table, so we don't need the extra check outside. Link: http://lkml.kernel.org/r/20190910143336.13472-1-changbin.du@gmail.com Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f9821a3374e9..c4cc048eb594 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6036,11 +6036,7 @@ clear_func_from_hash(struct ftrace_init_func *func, struct ftrace_hash *hash) { struct ftrace_func_entry *entry; - if (ftrace_hash_empty(hash)) - return; - - entry = __ftrace_lookup_ip(hash, func->ip); - + entry = ftrace_lookup_ip(hash, func->ip); /* * Do not allow this rec to match again. * Yeah, it may waste some memory, but will be removed -- cgit v1.2.3 From 119cdbdb95a66203c0bca09474427c297186f7a3 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 6 Aug 2019 18:15:43 +0300 Subject: tracing: Be more clever when dumping hex in __print_hex() Hex dump as many as 16 bytes at once in trace_print_hex_seq() instead of the byte-by-byte approach.
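For reference, "%*ph" is the kernel's printf extension for hex dumping a small buffer (space-separated bytes, capped at 64 bytes per conversion), and "%*phN" does the same without separators, which is why the new loop walks the buffer in 16-byte chunks. A minimal illustration (the buffer contents are hypothetical):

	u8 buf[4] = { 0xde, 0xad, 0xbe, 0xef };

	pr_info("%*ph\n", (int)sizeof(buf), buf);	/* prints "de ad be ef" */
	pr_info("%*phN\n", (int)sizeof(buf), buf);	/* prints "deadbeef" */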
Link: http://lkml.kernel.org/r/20190806151543.86061-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index cab4a5398f1d..d54ce252b05a 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -219,10 +219,10 @@ trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len, { int i; const char *ret = trace_seq_buffer_ptr(p); + const char *fmt = concatenate ? "%*phN" : "%*ph"; - for (i = 0; i < buf_len; i++) - trace_seq_printf(p, "%s%2.2x", concatenate || i == 0 ? "" : " ", - buf[i]); + for (i = 0; i < buf_len; i += 16) + trace_seq_printf(p, fmt, min(buf_len - i, 16), &buf[i]); trace_seq_putc(p, 0); return ret; -- cgit v1.2.3 From 17f8607a1658a8e70415eef67909f990d13017b5 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 1 Sep 2019 17:02:01 -0500 Subject: tracing: Make sure variable reference alias has correct var_ref_idx Original changelog from Steve Rostedt (except last sentence which explains the problem, and the Fixes: tag): I performed a three way histogram with the following commands: echo 'irq_lat u64 lat pid_t pid' > synthetic_events echo 'wake_lat u64 lat u64 irqlat pid_t pid' >> synthetic_events echo 'hist:keys=common_pid:irqts=common_timestamp.usecs if function == 0xffffffff81200580' > events/timer/hrtimer_start/trigger echo 'hist:keys=common_pid:lat=common_timestamp.usecs-$irqts:onmatch(timer.hrtimer_start).irq_lat($lat,pid) if common_flags & 1' > events/sched/sched_waking/trigger echo 'hist:keys=pid:wakets=common_timestamp.usecs,irqlat=lat' > events/synthetic/irq_lat/trigger echo 'hist:keys=next_pid:lat=common_timestamp.usecs-$wakets,irqlat=$irqlat:onmatch(synthetic.irq_lat).wake_lat($lat,$irqlat,next_pid)' > events/sched/sched_switch/trigger echo 1 > events/synthetic/wake_lat/enable Basically I wanted to see: hrtimer_start (calling function tick_sched_timer) Note: # grep tick_sched_timer /proc/kallsyms ffffffff81200580 t tick_sched_timer And save the time of that, and then record sched_waking if it is called in interrupt context and with the same pid as the hrtimer_start, it will record the latency between that and the waking event. I then look at when the task that is woken is scheduled in, and record the latency between the wakeup and the task running. At the end, the wake_lat synthetic event will show the wakeup to scheduled latency, as well as the irq latency in from hritmer_start to the wakeup. The problem is that I found this: -0 [007] d... 190.485261: wake_lat: lat=27 irqlat=190485230 pid=698 -0 [005] d... 190.485283: wake_lat: lat=40 irqlat=190485239 pid=10 -0 [002] d... 190.488327: wake_lat: lat=56 irqlat=190488266 pid=335 -0 [005] d... 190.489330: wake_lat: lat=64 irqlat=190489262 pid=10 -0 [003] d... 190.490312: wake_lat: lat=43 irqlat=190490265 pid=77 -0 [005] d... 190.493322: wake_lat: lat=54 irqlat=190493262 pid=10 -0 [005] d... 190.497305: wake_lat: lat=35 irqlat=190497267 pid=10 -0 [005] d... 190.501319: wake_lat: lat=50 irqlat=190501264 pid=10 The irqlat seemed quite large! Investigating this further, if I had enabled the irq_lat synthetic event, I noticed this: -0 [002] d.s. 249.429308: irq_lat: lat=164968 pid=335 -0 [002] d... 
249.429369: wake_lat: lat=55 irqlat=249429308 pid=335 Notice that the timestamp of the irq_lat "249.429308" is awfully similar to the reported irqlat variable. In fact, all instances were like this. It appeared that: irqlat=$irqlat wasn't assigning the old $irqlat to the new irqlat variable, but instead was assigning the $irqts to it. The issue is that assigning the old $irqlat to the new irqlat variable creates a variable reference alias, but the alias creation code forgets to make sure the alias uses the same var_ref_idx to access the reference. Link: http://lkml.kernel.org/r/1567375321.5282.12.camel@kernel.org Cc: Linux Trace Devel Cc: linux-rt-users Cc: stable@vger.kernel.org Fixes: 7e8b88a30b085 ("tracing: Add hist trigger support for variable reference aliases") Reported-by: Steven Rostedt (VMware) Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 3a6e42aa08e6..9468bd8d44a2 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2804,6 +2804,8 @@ static struct hist_field *create_alias(struct hist_trigger_data *hist_data, return NULL; } + alias->var_ref_idx = var_ref->var_ref_idx; + return alias; } -- cgit v1.2.3 From d59fae6fea39efe65bb3d3310aaa2a54b5f55c0d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 17 Sep 2019 14:11:37 +0900 Subject: tracing/kprobe: Fix NULL pointer access in trace_probe_unlink() Fix NULL pointer access in trace_probe_unlink() by initializing trace_probe.list correctly in trace_probe_init(). In the error case of trace_probe_init(), it can call trace_probe_unlink() before initializing the trace_probe.list member. This causes a NULL pointer dereference at list_del_init() in trace_probe_unlink().
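To see why the initialization order matters: a structure fresh from kzalloc() has list.next == list.prev == NULL, and list_del_init() unconditionally dereferences both pointers. A minimal sketch of the hazard, using an illustrative structure rather than the real trace_probe layout:

	#include <linux/list.h>
	#include <linux/slab.h>

	struct foo {
		struct list_head list;
	};

	struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

	if (f) {
		/* list_del_init(&f->list) here would dereference NULL ->prev/->next. */
		INIT_LIST_HEAD(&f->list);
		list_del_init(&f->list);	/* now safe: unlinking an empty head is a no-op */
	}

Hence the fix below hoists all the INIT_LIST_HEAD() calls to the top of trace_probe_init(), before any error path that can reach trace_probe_unlink().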
Syzbot reported : kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 8633 Comm: syz-executor797 Not tainted 5.3.0-rc8-next-20190915 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__list_del_entry_valid+0x85/0xf5 lib/list_debug.c:51 Code: 0f 84 e1 00 00 00 48 b8 22 01 00 00 00 00 ad de 49 39 c4 0f 84 e2 00 00 00 48 b8 00 00 00 00 00 fc ff df 4c 89 e2 48 c1 ea 03 <80> 3c 02 00 75 53 49 8b 14 24 4c 39 f2 0f 85 99 00 00 00 49 8d 7d RSP: 0018:ffff888090a7f9d8 EFLAGS: 00010246 RAX: dffffc0000000000 RBX: ffff88809b6f90c0 RCX: ffffffff817c0ca9 RDX: 0000000000000000 RSI: ffffffff817c0a73 RDI: ffff88809b6f90c8 RBP: ffff888090a7f9f0 R08: ffff88809a04e600 R09: ffffed1015d26aed R10: ffffed1015d26aec R11: ffff8880ae935763 R12: 0000000000000000 R13: 0000000000000000 R14: ffff88809b6f90c0 R15: ffff88809b6f90d0 FS: 0000555556f99880(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000006cc090 CR3: 00000000962b2000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __list_del_entry include/linux/list.h:131 [inline] list_del_init include/linux/list.h:190 [inline] trace_probe_unlink+0x1f/0x200 kernel/trace/trace_probe.c:959 trace_probe_cleanup+0xd3/0x110 kernel/trace/trace_probe.c:973 trace_probe_init+0x3f2/0x510 kernel/trace/trace_probe.c:1011 alloc_trace_uprobe+0x5e/0x250 kernel/trace/trace_uprobe.c:353 create_local_trace_uprobe+0x109/0x4a0 kernel/trace/trace_uprobe.c:1508 perf_uprobe_init+0x131/0x210 kernel/trace/trace_event_perf.c:314 perf_uprobe_event_init+0x106/0x1a0 kernel/events/core.c:8898 perf_try_init_event+0x135/0x590 kernel/events/core.c:10184 perf_init_event kernel/events/core.c:10228 [inline] perf_event_alloc.part.0+0x1b89/0x33d0 kernel/events/core.c:10505 perf_event_alloc kernel/events/core.c:10887 [inline] __do_sys_perf_event_open+0xa2d/0x2d00 kernel/events/core.c:10989 __se_sys_perf_event_open kernel/events/core.c:10871 [inline] __x64_sys_perf_event_open+0xbe/0x150 kernel/events/core.c:10871 do_syscall_64+0xfa/0x760 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Link: http://lkml.kernel.org/r/156869709721.22406.5153754822203046939.stgit@devnote2 Reported-by: syzbot+2f807f4d3a2a4e87f18f@syzkaller.appspotmail.com Fixes: ca89bc071d5e ("tracing/kprobe: Add multi-probe per event support") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 1e67fef06e53..baf58a3612c0 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -986,6 +986,12 @@ int trace_probe_init(struct trace_probe *tp, const char *event, if (!tp->event) return -ENOMEM; + INIT_LIST_HEAD(&tp->event->files); + INIT_LIST_HEAD(&tp->event->class.fields); + INIT_LIST_HEAD(&tp->event->probes); + INIT_LIST_HEAD(&tp->list); + list_add(&tp->event->probes, &tp->list); + call = trace_probe_event_call(tp); call->class = &tp->event->class; call->name = kstrdup(event, GFP_KERNEL); @@ -999,11 +1005,6 @@ int trace_probe_init(struct trace_probe *tp, const char *event, ret = -ENOMEM; goto error; } - INIT_LIST_HEAD(&tp->event->files); - 
INIT_LIST_HEAD(&tp->event->class.fields); - INIT_LIST_HEAD(&tp->event->probes); - INIT_LIST_HEAD(&tp->list); - list_add(&tp->event->probes, &tp->list); return 0; -- cgit v1.2.3 From a3db31ff6ce31f5a544a66b61613a098029031cc Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 5 Sep 2019 23:50:28 +0530 Subject: ftrace: Look up the address of return_to_handler() using helpers This ensures that we use the right address on architectures that use function descriptors. Signed-off-by: Naveen N. Rao Acked-by: Steven Rostedt (VMware) Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/8f6f14d192a994008ac370ce14036bbe67224c7d.1567707399.git.naveen.n.rao@linux.vnet.ibm.com --- kernel/trace/fgraph.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 8dfd5021b933..7950a0356042 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -276,7 +276,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, int index = task->curr_ret_stack; int i; - if (ret != (unsigned long)return_to_handler) + if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler)) return ret; if (index < 0) @@ -294,7 +294,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, { int task_idx; - if (ret != (unsigned long)return_to_handler) + if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler)) return ret; task_idx = task->curr_ret_stack; -- cgit v1.2.3 From 44d00dc7ceab1732ebd5f5aae601b24dacdf10c5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 18 Sep 2019 17:55:37 +0900 Subject: tracing/probe: Fix to allow user to enable events on unloaded modules Fix to allow user to enable probe events on unloaded modules. This operation was allowed before commit 60d53e2c3b75 ("tracing/probe: Split trace_event related data from trace_probe"), because if users need to probe module init functions, they have to enable those probe events before loading the module. Link: http://lkml.kernel.org/r/156879693733.31056.9331322616994665167.stgit@devnote2 Cc: stable@vger.kernel.org Fixes: 60d53e2c3b75 ("tracing/probe: Split trace_event related data from trace_probe") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7579c53bb053..0ba3239c0270 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -371,31 +371,24 @@ static int enable_trace_kprobe(struct trace_event_call *call, if (enabled) return 0; - enabled = false; list_for_each_entry(pos, trace_probe_probe_list(tp), list) { tk = container_of(pos, struct trace_kprobe, tp); if (trace_kprobe_has_gone(tk)) continue; ret = __enable_trace_kprobe(tk); - if (ret) { - if (enabled) { - __disable_trace_kprobe(tp); - enabled = false; - } + if (ret) break; - } enabled = true; } - if (!enabled) { - /* No probe is enabled. Roll back */ + if (ret) { + /* Failed to enable one of them.
Roll back all */ + if (enabled) + __disable_trace_kprobe(tp); if (file) trace_probe_remove_file(tp, file); else trace_probe_clear_flag(tp, TP_FLAG_PROFILE); - if (!ret) - /* Since all probes are gone, this is not available */ - ret = -EADDRNOTAVAIL; } return ret; -- cgit v1.2.3 From fe60b0ce8e7335269722ec080173a9411a9d58a5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 18 Sep 2019 17:55:46 +0900 Subject: tracing/probe: Reject exactly same probe event Reject probe events that are exactly the same as existing probes. Multiprobe allows a user to define multiple probes on the same event. If a user appends a probe with exactly the same definition (same probe address and same arguments) to an existing event, the event will record the same probe information twice. That can confuse users, so reject it. Link: http://lkml.kernel.org/r/156879694602.31056.5533024778165036763.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 52 ++++++++++++++++++++++++++++++++++++++------- kernel/trace/trace_probe.h | 3 ++- kernel/trace/trace_uprobe.c | 52 ++++++++++++++++++++++++++++++++++++++------- 3 files changed, 90 insertions(+), 17 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 0ba3239c0270..a6697e28ddda 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -528,10 +528,53 @@ unreg: return 0; } +static bool trace_kprobe_has_same_kprobe(struct trace_kprobe *orig, + struct trace_kprobe *comp) +{ + struct trace_probe_event *tpe = orig->tp.event; + struct trace_probe *pos; + int i; + + list_for_each_entry(pos, &tpe->probes, list) { + orig = container_of(pos, struct trace_kprobe, tp); + if (strcmp(trace_kprobe_symbol(orig), + trace_kprobe_symbol(comp)) || + trace_kprobe_offset(orig) != trace_kprobe_offset(comp)) + continue; + + /* + * trace_probe_compare_arg_type() ensured that nr_args and + * each argument name and type are same. Let's compare comm.
+ */ + for (i = 0; i < orig->tp.nr_args; i++) { + if (strcmp(orig->tp.args[i].comm, + comp->tp.args[i].comm)) + continue; + } + + return true; + } + + return false; +} + static int append_trace_kprobe(struct trace_kprobe *tk, struct trace_kprobe *to) { int ret; + ret = trace_probe_compare_arg_type(&tk->tp, &to->tp); + if (ret) { + /* Note that argument starts index = 2 */ + trace_probe_log_set_index(ret + 1); + trace_probe_log_err(0, DIFF_ARG_TYPE); + return -EEXIST; + } + if (trace_kprobe_has_same_kprobe(to, tk)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, SAME_PROBE); + return -EEXIST; + } + /* Append to existing event */ ret = trace_probe_append(&tk->tp, &to->tp); if (ret) @@ -568,14 +611,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) trace_probe_log_err(0, DIFF_PROBE_TYPE); ret = -EEXIST; } else { - ret = trace_probe_compare_arg_type(&tk->tp, &old_tk->tp); - if (ret) { - /* Note that argument starts index = 2 */ - trace_probe_log_set_index(ret + 1); - trace_probe_log_err(0, DIFF_ARG_TYPE); - ret = -EEXIST; - } else - ret = append_trace_kprobe(tk, old_tk); + ret = append_trace_kprobe(tk, old_tk); } goto end; } diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index f805cc4cbe7c..4ee703728aec 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -436,7 +436,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(BAD_INSN_BNDRY, "Probe point is not an instruction boundary"),\ C(FAIL_REG_PROBE, "Failed to register probe event"),\ C(DIFF_PROBE_TYPE, "Probe type is different from existing probe"),\ - C(DIFF_ARG_TYPE, "Argument type or name is different from existing probe"), + C(DIFF_ARG_TYPE, "Argument type or name is different from existing probe"),\ + C(SAME_PROBE, "There is already the exact same probe event"), #undef C #define C(a, b) TP_ERR_##a diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index cbf4da4bf367..34dd6d0016a3 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -410,10 +410,53 @@ unreg: return 0; } +static bool trace_uprobe_has_same_uprobe(struct trace_uprobe *orig, + struct trace_uprobe *comp) +{ + struct trace_probe_event *tpe = orig->tp.event; + struct trace_probe *pos; + struct inode *comp_inode = d_real_inode(comp->path.dentry); + int i; + + list_for_each_entry(pos, &tpe->probes, list) { + orig = container_of(pos, struct trace_uprobe, tp); + if (comp_inode != d_real_inode(orig->path.dentry) || + comp->offset != orig->offset) + continue; + + /* + * trace_probe_compare_arg_type() ensured that nr_args and + * each argument name and type are same. Let's compare comm. 
+ */ + for (i = 0; i < orig->tp.nr_args; i++) { + if (strcmp(orig->tp.args[i].comm, + comp->tp.args[i].comm)) + continue; + } + + return true; + } + + return false; +} + static int append_trace_uprobe(struct trace_uprobe *tu, struct trace_uprobe *to) { int ret; + ret = trace_probe_compare_arg_type(&tu->tp, &to->tp); + if (ret) { + /* Note that argument starts index = 2 */ + trace_probe_log_set_index(ret + 1); + trace_probe_log_err(0, DIFF_ARG_TYPE); + return -EEXIST; + } + if (trace_uprobe_has_same_uprobe(to, tu)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, SAME_PROBE); + return -EEXIST; + } + /* Append to existing event */ ret = trace_probe_append(&tu->tp, &to->tp); if (!ret) @@ -469,14 +512,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu) trace_probe_log_err(0, DIFF_PROBE_TYPE); ret = -EEXIST; } else { - ret = trace_probe_compare_arg_type(&tu->tp, &old_tu->tp); - if (ret) { - /* Note that argument starts index = 2 */ - trace_probe_log_set_index(ret + 1); - trace_probe_log_err(0, DIFF_ARG_TYPE); - ret = -EEXIST; - } else - ret = append_trace_uprobe(tu, old_tu); + ret = append_trace_uprobe(tu, old_tu); } goto end; } -- cgit v1.2.3 From f8d7ab2bded897607bff6324d5c6ea6b4aecca0c Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 24 Sep 2019 17:19:06 +0530 Subject: tracing/probe: Fix same probe event argument matching Commit fe60b0ce8e73 ("tracing/probe: Reject exactly same probe event") tries to reject an event which matches an already existing probe. However, due to the use of continue in the argument comparison loop, it rejects adding a probe even when the arguments don't match. Fix this by rejecting a probe only if all the arguments match. Link: http://lkml.kernel.org/r/20190924114906.14038-1-srikar@linux.vnet.ibm.com Fixes: fe60b0ce8e73 ("tracing/probe: Reject exactly same probe event") Acked-by: Masami Hiramatsu Signed-off-by: Srikar Dronamraju Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 5 +++-- kernel/trace/trace_uprobe.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index a6697e28ddda..402dc3ce88d3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -549,10 +549,11 @@ static bool trace_kprobe_has_same_kprobe(struct trace_kprobe *orig, for (i = 0; i < orig->tp.nr_args; i++) { if (strcmp(orig->tp.args[i].comm, comp->tp.args[i].comm)) - continue; + break; } - return true; + if (i == orig->tp.nr_args) + return true; } return false; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 34dd6d0016a3..dd884341f5c5 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -431,10 +431,11 @@ static bool trace_uprobe_has_same_uprobe(struct trace_uprobe *orig, for (i = 0; i < orig->tp.nr_args; i++) { if (strcmp(orig->tp.args[i].comm, comp->tp.args[i].comm)) - continue; + break; } - return true; + if (i == orig->tp.nr_args) + return true; } return false; -- cgit v1.2.3 From 768fb61fcc13b2acaca758275d54c09a65e2968b Mon Sep 17 00:00:00 2001 From: Allan Zhang Date: Wed, 25 Sep 2019 16:43:12 -0700 Subject: bpf: Fix bpf_event_output re-entry issue A BPF_PROG_TYPE_SOCK_OPS program can reenter bpf_event_output because it can be called from both atomic and non-atomic contexts, and we don't have bpf_prog_active to prevent that from happening. This patch enables 3 levels of nesting to support the normal, irq and nmi contexts.
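The mechanism in the diff below is the standard per-CPU nesting-counter pattern: keep one scratch slot per context level and index it by how deeply this CPU has re-entered. A stripped-down sketch of the idea, with all names hypothetical and the real work elided:

	#include <linux/kernel.h>
	#include <linux/percpu.h>
	#include <linux/bug.h>

	struct scratch { char data[64]; };			/* stand-in per-level state */
	struct nested_scratch { struct scratch slot[3]; };	/* task, irq, nmi */

	static DEFINE_PER_CPU(int, my_nest_level);
	static DEFINE_PER_CPU(struct nested_scratch, my_scratch);

	static int do_work(struct scratch *s) { return 0; }	/* stand-in for real work */

	static int my_reentrant_helper(void)
	{
		int nest_level = this_cpu_inc_return(my_nest_level);
		struct scratch *s;
		int ret = -EBUSY;

		/* A fourth level would mean a re-entry path we didn't budget for. */
		if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(my_scratch.slot)))
			goto out;

		s = this_cpu_ptr(&my_scratch.slot[nest_level - 1]);
		ret = do_work(s);	/* each nesting level gets private state */
	out:
		this_cpu_dec(my_nest_level);
		return ret;
	}

If an irq or nmi interrupts the helper midway, the interrupting context observes a higher nest_level and uses its own slot instead of scribbling over the state of the context it interrupted.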
We can easily reproduce the issue by running netperf crr mode with 100 flows and 10 threads from netperf client side. Here is the whole stack dump: [ 515.228898] WARNING: CPU: 20 PID: 14686 at kernel/trace/bpf_trace.c:549 bpf_event_output+0x1f9/0x220 [ 515.228903] CPU: 20 PID: 14686 Comm: tcp_crr Tainted: G W 4.15.0-smp-fixpanic #44 [ 515.228904] Hardware name: Intel TBG,ICH10/Ikaria_QC_1b, BIOS 1.22.0 06/04/2018 [ 515.228905] RIP: 0010:bpf_event_output+0x1f9/0x220 [ 515.228906] RSP: 0018:ffff9a57ffc03938 EFLAGS: 00010246 [ 515.228907] RAX: 0000000000000012 RBX: 0000000000000001 RCX: 0000000000000000 [ 515.228907] RDX: 0000000000000000 RSI: 0000000000000096 RDI: ffffffff836b0f80 [ 515.228908] RBP: ffff9a57ffc039c8 R08: 0000000000000004 R09: 0000000000000012 [ 515.228908] R10: ffff9a57ffc1de40 R11: 0000000000000000 R12: 0000000000000002 [ 515.228909] R13: ffff9a57e13bae00 R14: 00000000ffffffff R15: ffff9a57ffc1e2c0 [ 515.228910] FS: 00007f5a3e6ec700(0000) GS:ffff9a57ffc00000(0000) knlGS:0000000000000000 [ 515.228910] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 515.228911] CR2: 0000537082664fff CR3: 000000061fed6002 CR4: 00000000000226f0 [ 515.228911] Call Trace: [ 515.228913] [ 515.228919] [] bpf_sockopt_event_output+0x3b/0x50 [ 515.228923] [] ? bpf_ktime_get_ns+0xe/0x10 [ 515.228927] [] ? __cgroup_bpf_run_filter_sock_ops+0x85/0x100 [ 515.228930] [] ? tcp_init_transfer+0x125/0x150 [ 515.228933] [] ? tcp_finish_connect+0x89/0x110 [ 515.228936] [] ? tcp_rcv_state_process+0x704/0x1010 [ 515.228939] [] ? sk_filter_trim_cap+0x53/0x2a0 [ 515.228942] [] ? tcp_v6_inbound_md5_hash+0x6f/0x1d0 [ 515.228945] [] ? tcp_v6_do_rcv+0x1c0/0x460 [ 515.228947] [] ? tcp_v6_rcv+0x9f8/0xb30 [ 515.228951] [] ? ip6_route_input+0x190/0x220 [ 515.228955] [] ? ip6_protocol_deliver_rcu+0x6d/0x450 [ 515.228958] [] ? ip6_rcv_finish+0xb6/0x170 [ 515.228961] [] ? ip6_protocol_deliver_rcu+0x450/0x450 [ 515.228963] [] ? ipv6_rcv+0x61/0xe0 [ 515.228966] [] ? ipv6_list_rcv+0x330/0x330 [ 515.228969] [] ? __netif_receive_skb_one_core+0x5b/0xa0 [ 515.228972] [] ? __netif_receive_skb+0x21/0x70 [ 515.228975] [] ? process_backlog+0xb2/0x150 [ 515.228978] [] ? net_rx_action+0x16f/0x410 [ 515.228982] [] ? __do_softirq+0xdd/0x305 [ 515.228986] [] ? irq_exit+0x9c/0xb0 [ 515.228989] [] ? smp_call_function_single_interrupt+0x65/0x120 [ 515.228991] [] ? call_function_single_interrupt+0x81/0x90 [ 515.228992] [ 515.228996] [] ? io_serial_in+0x20/0x20 [ 515.229000] [] ? console_unlock+0x230/0x490 [ 515.229003] [] ? vprintk_emit+0x26a/0x2a0 [ 515.229006] [] ? vprintk_default+0x1f/0x30 [ 515.229008] [] ? vprintk_func+0x35/0x70 [ 515.229011] [] ? printk+0x50/0x66 [ 515.229013] [] ? bpf_event_output+0xb7/0x220 [ 515.229016] [] ? bpf_sockopt_event_output+0x3b/0x50 [ 515.229019] [] ? bpf_ktime_get_ns+0xe/0x10 [ 515.229023] [] ? release_sock+0x97/0xb0 [ 515.229026] [] ? tcp_recvmsg+0x31a/0xda0 [ 515.229029] [] ? __cgroup_bpf_run_filter_sock_ops+0x85/0x100 [ 515.229032] [] ? tcp_set_state+0x191/0x1b0 [ 515.229035] [] ? tcp_disconnect+0x2e/0x600 [ 515.229038] [] ? tcp_close+0x3eb/0x460 [ 515.229040] [] ? inet_release+0x42/0x70 [ 515.229043] [] ? inet6_release+0x39/0x50 [ 515.229046] [] ? __sock_release+0x4d/0xd0 [ 515.229049] [] ? sock_close+0x15/0x20 [ 515.229052] [] ? __fput+0xe7/0x1f0 [ 515.229055] [] ? ____fput+0xe/0x10 [ 515.229058] [] ? task_work_run+0x82/0xb0 [ 515.229061] [] ? exit_to_usermode_loop+0x7e/0x11f [ 515.229064] [] ? do_syscall_64+0x111/0x130 [ 515.229067] [] ? 
entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Fixes: a5a3a828cd00 ("bpf: add perf event notificaton support for sock_ops") Signed-off-by: Allan Zhang Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Reviewed-by: Eric Dumazet Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20190925234312.94063-2-allanzhang@google.com --- kernel/trace/bpf_trace.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ca1255d14576..3e38a010003c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -500,14 +500,17 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; -static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); -static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd); +static DEFINE_PER_CPU(int, bpf_event_output_nest_level); +struct bpf_nested_pt_regs { + struct pt_regs regs[3]; +}; +static DEFINE_PER_CPU(struct bpf_nested_pt_regs, bpf_pt_regs); +static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { - struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd); - struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); + int nest_level = this_cpu_inc_return(bpf_event_output_nest_level); struct perf_raw_frag frag = { .copy = ctx_copy, .size = ctx_size, @@ -522,12 +525,25 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, .data = meta, }, }; + struct perf_sample_data *sd; + struct pt_regs *regs; + u64 ret; + + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) { + ret = -EBUSY; + goto out; + } + sd = this_cpu_ptr(&bpf_misc_sds.sds[nest_level - 1]); + regs = this_cpu_ptr(&bpf_pt_regs.regs[nest_level - 1]); perf_fetch_caller_regs(regs); perf_sample_data_init(sd, 0, 0); sd->raw = &raw; - return __bpf_perf_event_output(regs, map, flags, sd); + ret = __bpf_perf_event_output(regs, map, flags, sd); +out: + this_cpu_dec(bpf_event_output_nest_level); + return ret; } BPF_CALL_0(bpf_get_current_task) -- cgit v1.2.3 From d2aea95a1a4d195d939d16303700921be318a2b9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 28 Sep 2019 05:53:29 -0400 Subject: tracing/probe: Fix to check the difference of nr_args before adding probe Steven reported that a test triggered: ================================================================== BUG: KASAN: slab-out-of-bounds in trace_kprobe_create+0xa9e/0xe40 Read of size 8 at addr ffff8880c4f25a48 by task ftracetest/4798 CPU: 2 PID: 4798 Comm: ftracetest Not tainted 5.3.0-rc6-test+ #30 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016 Call Trace: dump_stack+0x7c/0xc0 ? trace_kprobe_create+0xa9e/0xe40 print_address_description+0x6c/0x332 ? trace_kprobe_create+0xa9e/0xe40 ? trace_kprobe_create+0xa9e/0xe40 __kasan_report.cold.6+0x1a/0x3b ? trace_kprobe_create+0xa9e/0xe40 kasan_report+0xe/0x12 trace_kprobe_create+0xa9e/0xe40 ? print_kprobe_event+0x280/0x280 ? match_held_lock+0x1b/0x240 ? find_held_lock+0xac/0xd0 ? fs_reclaim_release.part.112+0x5/0x20 ? lock_downgrade+0x350/0x350 ? kasan_unpoison_shadow+0x30/0x40 ? __kasan_kmalloc.constprop.6+0xc1/0xd0 ? trace_kprobe_create+0xe40/0xe40 ? trace_kprobe_create+0xe40/0xe40 create_or_delete_trace_kprobe+0x2e/0x60 trace_run_command+0xc3/0xe0 ? trace_panic_handler+0x20/0x20 ? 
kasan_unpoison_shadow+0x30/0x40 trace_parse_run_command+0xdc/0x163 vfs_write+0xe1/0x240 ksys_write+0xba/0x150 ? __ia32_sys_read+0x50/0x50 ? tracer_hardirqs_on+0x61/0x180 ? trace_hardirqs_off_caller+0x43/0x110 ? mark_held_locks+0x29/0xa0 ? do_syscall_64+0x14/0x260 do_syscall_64+0x68/0x260 Fix this by checking the difference in nr_args before adding a probe to existing probes. The fix may also set the error log index to a value bigger than the number of command parameters; in that case, the error position is set to just after the last parameter. Link: http://lkml.kernel.org/r/156966474783.3478.13217501608215769150.stgit@devnote2 Fixes: ca89bc071d5e ("tracing/kprobe: Add multi-probe per event support") Reported-by: Steven Rostedt (VMware) Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index baf58a3612c0..905b10af5d5c 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -178,6 +178,16 @@ void __trace_probe_log_err(int offset, int err_type) if (!command) return; + if (trace_probe_log.index >= trace_probe_log.argc) { + /* + * Set the error position next to the last arg + space. + * Note that len includes the terminal null and the cursor + * appears at pos + 1. + */ + pos = len; + offset = 0; + } + /* And make a command string from argv array */ p = command; for (i = 0; i < trace_probe_log.argc; i++) { @@ -1084,6 +1094,12 @@ int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b) { int i; + /* In case of more arguments */ + if (a->nr_args < b->nr_args) + return a->nr_args + 1; + if (a->nr_args > b->nr_args) + return b->nr_args + 1; + for (i = 0; i < a->nr_args; i++) { if ((b->nr_args <= i) || ((a->args[i].type != b->args[i].type) || -- cgit v1.2.3 From 968e5170939662341242812b9c82ef51cf140a33 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 26 Sep 2019 09:22:59 -0700 Subject: tracing: Fix clang -Wint-in-bool-context warnings in IF_ASSIGN macro After r372664 in clang, the IF_ASSIGN macro causes a couple hundred warnings along the lines of: kernel/trace/trace_output.c:1331:2: warning: converting the enum constant to a boolean [-Wint-in-bool-context] kernel/trace/trace.h:409:3: note: expanded from macro 'trace_assign_type' IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, ^ kernel/trace/trace.h:371:14: note: expanded from macro 'IF_ASSIGN' WARN_ON(id && (entry)->type != id); \ ^ 264 warnings generated. This warning can catch issues with constructs like: if (state == A || B) where the developer really meant: if (state == A || state == B) This is currently the only occurrence of the warning in the kernel tree across defconfig, allyesconfig, allmodconfig for arm32, arm64, and x86_64. Add the implicit '!= 0' to the WARN_ON statement to fix the warnings and find potential issues in the future.
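To make the macro case concrete: trace_assign_type() expands IF_ASSIGN() with enum trace_type constants (e.g. TRACE_GRAPH_RET) as the id argument, so the old WARN_ON() placed a bare enum constant in a boolean context. A reduced sketch of what clang flags, with illustrative enum values:

	enum trace_type { TRACE_NONE, TRACE_FN, TRACE_GRAPH_RET };	/* values illustrative */

	/* Before: the bare constant TRACE_GRAPH_RET is converted to bool. */
	WARN_ON(TRACE_GRAPH_RET && entry->type != TRACE_GRAPH_RET);

	/* After: identical semantics, but the integral comparison is explicit. */
	WARN_ON(TRACE_GRAPH_RET != 0 && entry->type != TRACE_GRAPH_RET);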
Link: https://github.com/llvm/llvm-project/commit/28b38c277a2941e9e891b2db30652cfd962f070b Link: https://github.com/ClangBuiltLinux/linux/issues/686 Link: http://lkml.kernel.org/r/20190926162258.466321-1-natechancellor@gmail.com Reviewed-by: Nick Desaulniers Signed-off-by: Nathan Chancellor Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 26b0a08f3c7d..f801d154ff6a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -365,11 +365,11 @@ static inline struct trace_array *top_trace_array(void) __builtin_types_compatible_p(typeof(var), type *) #undef IF_ASSIGN -#define IF_ASSIGN(var, entry, etype, id) \ - if (FTRACE_CMP_TYPE(var, etype)) { \ - var = (typeof(var))(entry); \ - WARN_ON(id && (entry)->type != id); \ - break; \ +#define IF_ASSIGN(var, entry, etype, id) \ + if (FTRACE_CMP_TYPE(var, etype)) { \ + var = (typeof(var))(entry); \ + WARN_ON(id != 0 && (entry)->type != id); \ + break; \ } /* Will cause compile errors if type is not found. */ -- cgit v1.2.3 From 96c5c6e6a5b6db592acae039fed54b5c8844cd35 Mon Sep 17 00:00:00 2001 From: Navid Emamdoost Date: Fri, 20 Sep 2019 17:57:59 -0500 Subject: tracing: Have error path in predicate_parse() free its allocated memory In predicate_parse(), there is an error path that does not go to out_free; instead it returns directly, which leads to a memory leak. Link: http://lkml.kernel.org/r/20190920225800.3870-1-navid.emamdoost@gmail.com Signed-off-by: Navid Emamdoost Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index c773b8fb270c..c9a74f82b14a 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -452,8 +452,10 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, switch (*next) { case '(': /* #2 */ - if (top - op_stack > nr_parens) - return ERR_PTR(-EINVAL); + if (top - op_stack > nr_parens) { + ret = -EINVAL; + goto out_free; + } *(++top) = invert; continue; case '!': /* #3 */ -- cgit v1.2.3
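The fix follows the kernel's usual single-exit cleanup idiom: once a function owns an allocation, every failure path funnels through one label that frees it rather than returning directly. A minimal sketch of the idiom with hypothetical names (not the actual predicate_parse() logic):

	#include <linux/slab.h>
	#include <linux/err.h>

	static void *parse_thing(const char *str)
	{
		int ret;
		char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);

		if (!buf)
			return ERR_PTR(-ENOMEM);	/* nothing to clean up yet */

		if (!valid_input(str)) {		/* hypothetical check */
			ret = -EINVAL;
			goto out_free;			/* never bare-return once buf is live */
		}

		return buf;				/* success: caller owns buf */

	out_free:
		kfree(buf);
		return ERR_PTR(ret);
	}

The leak fixed above was exactly one branch that returned ERR_PTR(-EINVAL) directly instead of jumping to out_free, abandoning the buffers predicate_parse() had already allocated.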