From c54a1a06daa78613519b4d24495b0d175b8af63f Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sun, 18 Aug 2024 21:50:28 +0900 Subject: tracing: Fix function timing profiler to initialize hashtable Since the new fgraph requires to initialize fgraph_ops.ops.func_hash before calling register_ftrace_graph(), initialize it with default (tracing all functions) parameter. Cc: stable@vger.kernel.org Fixes: 5fccc7552ccb ("ftrace: Add subops logic to allow one ops to manage many") Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4c28dd177ca6..d2dd71d04b8a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -883,6 +883,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace, } static struct fgraph_ops fprofiler_ops = { + .ops = { + .flags = FTRACE_OPS_FL_INITIALIZED, + INIT_OPS_HASH(fprofiler_ops.ops) + }, .entryfunc = &profile_graph_entry, .retfunc = &profile_graph_return, }; -- cgit v1.2.3 From a312a0f7834e605e7c41570f0e9525d0fc4a70a4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 14 Sep 2024 17:48:06 -0400 Subject: fgraph: Use fgraph data to store subtime for profiler Instead of having the "subtime" for the function profiler in the infrastructure ftrace_ret_stack structure, have it use the fgraph data reserve and retrieve functions. This will keep the limited shadow stack from wasting 8 bytes for something that is seldom used. Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Jiri Olsa Link: https://lore.kernel.org/20240914214826.780323141@goodmis.org Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 4 +--- kernel/trace/fgraph.c | 64 ++++++++++++++++++++++++++++++++++++++------------ kernel/trace/ftrace.c | 23 +++++++++--------- 3 files changed, 62 insertions(+), 29 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 42106b3de396..aabd348cad4a 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1081,6 +1081,7 @@ struct fgraph_ops { void *fgraph_reserve_data(int idx, int size_bytes); void *fgraph_retrieve_data(int idx, int *size_bytes); +void *fgraph_retrieve_parent_data(int idx, int *size_bytes, int depth); /* * Stack of return addresses for functions @@ -1091,9 +1092,6 @@ struct ftrace_ret_stack { unsigned long ret; unsigned long func; unsigned long long calltime; -#ifdef CONFIG_FUNCTION_PROFILER - unsigned long long subtime; -#endif #ifdef HAVE_FUNCTION_GRAPH_FP_TEST unsigned long fp; #endif diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index d7d4fb403f6f..095ceb752b28 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -390,21 +390,7 @@ void *fgraph_reserve_data(int idx, int size_bytes) */ void *fgraph_retrieve_data(int idx, int *size_bytes) { - int offset = current->curr_ret_stack - 1; - unsigned long val; - - val = get_fgraph_entry(current, offset); - while (__get_type(val) == FGRAPH_TYPE_DATA) { - if (__get_data_index(val) == idx) - goto found; - offset -= __get_data_size(val) + 1; - val = get_fgraph_entry(current, offset); - } - return NULL; -found: - if (size_bytes) - *size_bytes = __get_data_size(val) * sizeof(long); - return get_data_type_data(current, offset); + return fgraph_retrieve_parent_data(idx, size_bytes, 0); } /** @@ -460,6 +446,54 @@ get_ret_stack(struct task_struct *t, int offset, int *frame_offset) return RET_STACK(t, offset); } +/** + * fgraph_retrieve_parent_data - get data from a parent function + * @idx: The index into the fgraph_array (fgraph_ops::idx) + * @size_bytes: A pointer to retrieved data size + * @depth: The depth to find the parent (0 is the current function) + * + * This is similar to fgraph_retrieve_data() but can be used to retrieve + * data from a parent caller function. + * + * Return: a pointer to the specified parent data or NULL if not found + */ +void *fgraph_retrieve_parent_data(int idx, int *size_bytes, int depth) +{ + struct ftrace_ret_stack *ret_stack = NULL; + int offset = current->curr_ret_stack; + unsigned long val; + + if (offset <= 0) + return NULL; + + for (;;) { + int next_offset; + + ret_stack = get_ret_stack(current, offset, &next_offset); + if (!ret_stack || --depth < 0) + break; + offset = next_offset; + } + + if (!ret_stack) + return NULL; + + offset--; + + val = get_fgraph_entry(current, offset); + while (__get_type(val) == FGRAPH_TYPE_DATA) { + if (__get_data_index(val) == idx) + goto found; + offset -= __get_data_size(val) + 1; + val = get_fgraph_entry(current, offset); + } + return NULL; +found: + if (size_bytes) + *size_bytes = __get_data_size(val) * sizeof(long); + return get_data_type_data(current, offset); +} + /* Both enabled by default (can be cleared by function_graph tracer flags */ static bool fgraph_sleep_time = true; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d2dd71d04b8a..bac1f2ee1983 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -823,7 +823,7 @@ void ftrace_graph_graph_time_control(bool enable) static int profile_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops) { - struct ftrace_ret_stack *ret_stack; + unsigned long long *subtime; function_profile_call(trace->func, 0, NULL, NULL); @@ -831,9 +831,9 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace, if (!current->ret_stack) return 0; - ret_stack = ftrace_graph_get_ret_stack(current, 0); - if (ret_stack) - ret_stack->subtime = 0; + subtime = fgraph_reserve_data(gops->idx, sizeof(*subtime)); + if (subtime) + *subtime = 0; return 1; } @@ -841,11 +841,12 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace, static void profile_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops) { - struct ftrace_ret_stack *ret_stack; struct ftrace_profile_stat *stat; unsigned long long calltime; + unsigned long long *subtime; struct ftrace_profile *rec; unsigned long flags; + int size; local_irq_save(flags); stat = this_cpu_ptr(&ftrace_profile_stats); @@ -861,13 +862,13 @@ static void profile_graph_return(struct ftrace_graph_ret *trace, if (!fgraph_graph_time) { /* Append this call time to the parent time to subtract */ - ret_stack = ftrace_graph_get_ret_stack(current, 1); - if (ret_stack) - ret_stack->subtime += calltime; + subtime = fgraph_retrieve_parent_data(gops->idx, &size, 1); + if (subtime) + *subtime += calltime; - ret_stack = ftrace_graph_get_ret_stack(current, 0); - if (ret_stack && ret_stack->subtime < calltime) - calltime -= ret_stack->subtime; + subtime = fgraph_retrieve_data(gops->idx, &size); + if (subtime && *subtime && *subtime < calltime) + calltime -= *subtime; else calltime = 0; } -- cgit v1.2.3 From 3c9880f3ab52b52b5b4e1850a70e80dd7329cb4c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 14 Sep 2024 17:48:07 -0400 Subject: ftrace: Use a running sleeptime instead of saving on shadow stack The fgraph "sleep-time" option tells the function graph tracer and the profiler whether to include the time a function "sleeps" (is scheduled off the CPU) in its duration for the function. By default it is true, which means the duration of a function is calculated by the timestamp of when the function was entered to the timestamp of when it exits. If the "sleep-time" option is disabled, it needs to remove the time that the task was not running on the CPU during the function. Currently it is done in a sched_switch tracepoint probe where it moves the "calltime" (time of entry of the function) forward by the sleep time calculated. It updates all the calltime in the shadow stack. This is time consuming for those users of the function graph tracer that does not care about the sleep time. Instead, add a "ftrace_sleeptime" to the task_struct that gets the sleep time added each time the task wakes up. Then have the function entry save the current "ftrace_sleeptime" and on function exit, move the calltime forward by the difference of the current "ftrace_sleeptime" from the saved sleeptime. This removes one dependency of "calltime" needed to be on the shadow stack. It also simplifies the code that removes the sleep time of functions. TODO: Only enable the sched_switch tracepoint when this is needed. Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Jiri Olsa Link: https://lore.kernel.org/20240914214826.938908568@goodmis.org Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/sched.h | 1 + kernel/trace/fgraph.c | 16 ++------------- kernel/trace/ftrace.c | 39 ++++++++++++++++++++++++++---------- kernel/trace/trace.h | 1 + kernel/trace/trace_functions_graph.c | 28 ++++++++++++++++++++++++++ 5 files changed, 60 insertions(+), 25 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/sched.h b/include/linux/sched.h index e6ee4258169a..c08f3bdb11a5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1441,6 +1441,7 @@ struct task_struct { /* Timestamp for last schedule: */ unsigned long long ftrace_timestamp; + unsigned long long ftrace_sleeptime; /* * Number of functions that haven't been traced diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 095ceb752b28..b2e95bf82211 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -495,7 +495,7 @@ found: } /* Both enabled by default (can be cleared by function_graph tracer flags */ -static bool fgraph_sleep_time = true; +bool fgraph_sleep_time = true; #ifdef CONFIG_DYNAMIC_FTRACE /* @@ -1046,9 +1046,7 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt, struct task_struct *next, unsigned int prev_state) { - struct ftrace_ret_stack *ret_stack; unsigned long long timestamp; - int offset; /* * Does the user want to count the time a function was asleep. @@ -1065,17 +1063,7 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt, if (!next->ftrace_timestamp) return; - /* - * Update all the counters in next to make up for the - * time next was sleeping. - */ - timestamp -= next->ftrace_timestamp; - - for (offset = next->curr_ret_stack; offset > 0; ) { - ret_stack = get_ret_stack(next, offset, &offset); - if (ret_stack) - ret_stack->calltime += timestamp; - } + next->ftrace_sleeptime += timestamp - next->ftrace_timestamp; } static DEFINE_PER_CPU(unsigned long *, idle_ret_stack); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bac1f2ee1983..90b3975d5315 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -820,10 +820,15 @@ void ftrace_graph_graph_time_control(bool enable) fgraph_graph_time = enable; } +struct profile_fgraph_data { + unsigned long long subtime; + unsigned long long sleeptime; +}; + static int profile_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops) { - unsigned long long *subtime; + struct profile_fgraph_data *profile_data; function_profile_call(trace->func, 0, NULL, NULL); @@ -831,9 +836,12 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace, if (!current->ret_stack) return 0; - subtime = fgraph_reserve_data(gops->idx, sizeof(*subtime)); - if (subtime) - *subtime = 0; + profile_data = fgraph_reserve_data(gops->idx, sizeof(*profile_data)); + if (!profile_data) + return 0; + + profile_data->subtime = 0; + profile_data->sleeptime = current->ftrace_sleeptime; return 1; } @@ -841,9 +849,10 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace, static void profile_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops) { + struct profile_fgraph_data *profile_data; + struct profile_fgraph_data *parent_data; struct ftrace_profile_stat *stat; unsigned long long calltime; - unsigned long long *subtime; struct ftrace_profile *rec; unsigned long flags; int size; @@ -859,16 +868,24 @@ static void profile_graph_return(struct ftrace_graph_ret *trace, calltime = trace->rettime - trace->calltime; + if (!fgraph_sleep_time) { + profile_data = fgraph_retrieve_data(gops->idx, &size); + if (profile_data && current->ftrace_sleeptime) + calltime -= current->ftrace_sleeptime - profile_data->sleeptime; + } + if (!fgraph_graph_time) { /* Append this call time to the parent time to subtract */ - subtime = fgraph_retrieve_parent_data(gops->idx, &size, 1); - if (subtime) - *subtime += calltime; + parent_data = fgraph_retrieve_parent_data(gops->idx, &size, 1); + if (parent_data) + parent_data->subtime += calltime; + + if (!profile_data) + profile_data = fgraph_retrieve_data(gops->idx, &size); - subtime = fgraph_retrieve_data(gops->idx, &size); - if (subtime && *subtime && *subtime < calltime) - calltime -= *subtime; + if (profile_data && profile_data->subtime && profile_data->subtime < calltime) + calltime -= profile_data->subtime; else calltime = 0; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c866991b9c78..2f8017f8d34d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1048,6 +1048,7 @@ static inline void ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftra #endif /* CONFIG_DYNAMIC_FTRACE */ extern unsigned int fgraph_max_depth; +extern bool fgraph_sleep_time; static inline bool ftrace_graph_ignore_func(struct fgraph_ops *gops, struct ftrace_graph_ent *trace) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a569daaac4c4..bbd898f5a73c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -133,6 +133,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace, unsigned long *task_var = fgraph_get_task_var(gops); struct trace_array *tr = gops->private; struct trace_array_cpu *data; + unsigned long *sleeptime; unsigned long flags; unsigned int trace_ctx; long disabled; @@ -167,6 +168,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace, if (ftrace_graph_ignore_irqs()) return 0; + /* save the current sleep time if we are to ignore it */ + if (!fgraph_sleep_time) { + sleeptime = fgraph_reserve_data(gops->idx, sizeof(*sleeptime)); + if (sleeptime) + *sleeptime = current->ftrace_sleeptime; + } + /* * Stop here if tracing_threshold is set. We only write function return * events to the ring buffer. @@ -238,6 +246,22 @@ void __trace_graph_return(struct trace_array *tr, trace_buffer_unlock_commit_nostack(buffer, event); } +static void handle_nosleeptime(struct ftrace_graph_ret *trace, + struct fgraph_ops *gops) +{ + unsigned long long *sleeptime; + int size; + + if (fgraph_sleep_time) + return; + + sleeptime = fgraph_retrieve_data(gops->idx, &size); + if (!sleeptime) + return; + + trace->calltime += current->ftrace_sleeptime - *sleeptime; +} + void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops) { @@ -256,6 +280,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace, return; } + handle_nosleeptime(trace, gops); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); @@ -278,6 +304,8 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, return; } + handle_nosleeptime(trace, gops); + if (tracing_thresh && (trace->rettime - trace->calltime < tracing_thresh)) return; -- cgit v1.2.3 From f1f36e22bee967db5e812a65e24389e54c46f3c2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 14 Sep 2024 17:48:08 -0400 Subject: ftrace: Have calltime be saved in the fgraph storage The calltime field in the shadow stack frame is only used by the function graph tracer and profiler. But now that there's other users of the function graph infrastructure, this adds overhead and wastes space on the shadow stack. Move the calltime to the fgraph data storage, where the function graph and profiler entry functions will save it in its own graph storage and retrieve it in its exit functions. Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Jiri Olsa Link: https://lore.kernel.org/20240914214827.096968730@goodmis.org Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 1 - kernel/trace/fgraph.c | 5 --- kernel/trace/ftrace.c | 19 ++++++------ kernel/trace/trace_functions_graph.c | 60 ++++++++++++++++++++++++------------ 4 files changed, 51 insertions(+), 34 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index aabd348cad4a..e684addf6508 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1091,7 +1091,6 @@ void *fgraph_retrieve_parent_data(int idx, int *size_bytes, int depth); struct ftrace_ret_stack { unsigned long ret; unsigned long func; - unsigned long long calltime; #ifdef HAVE_FUNCTION_GRAPH_FP_TEST unsigned long fp; #endif diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index b2e95bf82211..58a28ec35dab 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -558,7 +558,6 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int fgraph_idx) { struct ftrace_ret_stack *ret_stack; - unsigned long long calltime; unsigned long val; int offset; @@ -588,8 +587,6 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, return -EBUSY; } - calltime = trace_clock_local(); - offset = READ_ONCE(current->curr_ret_stack); ret_stack = RET_STACK(current, offset); offset += FGRAPH_FRAME_OFFSET; @@ -623,7 +620,6 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, ret_stack->ret = ret; ret_stack->func = func; - ret_stack->calltime = calltime; #ifdef HAVE_FUNCTION_GRAPH_FP_TEST ret_stack->fp = frame_pointer; #endif @@ -757,7 +753,6 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, *offset += FGRAPH_FRAME_OFFSET; *ret = ret_stack->ret; trace->func = ret_stack->func; - trace->calltime = ret_stack->calltime; trace->overrun = atomic_read(¤t->trace_overrun); trace->depth = current->curr_ret_depth; /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 90b3975d5315..cae388122ca8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -821,6 +821,7 @@ void ftrace_graph_graph_time_control(bool enable) } struct profile_fgraph_data { + unsigned long long calltime; unsigned long long subtime; unsigned long long sleeptime; }; @@ -842,6 +843,7 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace, profile_data->subtime = 0; profile_data->sleeptime = current->ftrace_sleeptime; + profile_data->calltime = trace_clock_local(); return 1; } @@ -850,9 +852,9 @@ static void profile_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops) { struct profile_fgraph_data *profile_data; - struct profile_fgraph_data *parent_data; struct ftrace_profile_stat *stat; unsigned long long calltime; + unsigned long long rettime = trace_clock_local(); struct ftrace_profile *rec; unsigned long flags; int size; @@ -862,29 +864,28 @@ static void profile_graph_return(struct ftrace_graph_ret *trace, if (!stat->hash || !ftrace_profile_enabled) goto out; + profile_data = fgraph_retrieve_data(gops->idx, &size); + /* If the calltime was zero'd ignore it */ - if (!trace->calltime) + if (!profile_data || !profile_data->calltime) goto out; - calltime = trace->rettime - trace->calltime; + calltime = rettime - profile_data->calltime; if (!fgraph_sleep_time) { - profile_data = fgraph_retrieve_data(gops->idx, &size); - if (profile_data && current->ftrace_sleeptime) + if (current->ftrace_sleeptime) calltime -= current->ftrace_sleeptime - profile_data->sleeptime; } if (!fgraph_graph_time) { + struct profile_fgraph_data *parent_data; /* Append this call time to the parent time to subtract */ parent_data = fgraph_retrieve_parent_data(gops->idx, &size, 1); if (parent_data) parent_data->subtime += calltime; - if (!profile_data) - profile_data = fgraph_retrieve_data(gops->idx, &size); - - if (profile_data && profile_data->subtime && profile_data->subtime < calltime) + if (profile_data->subtime && profile_data->subtime < calltime) calltime -= profile_data->subtime; else calltime = 0; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index bbd898f5a73c..5c1b150fbba3 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -127,13 +127,18 @@ static inline int ftrace_graph_ignore_irqs(void) return in_hardirq(); } +struct fgraph_times { + unsigned long long calltime; + unsigned long long sleeptime; /* may be optional! */ +}; + int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops) { unsigned long *task_var = fgraph_get_task_var(gops); struct trace_array *tr = gops->private; struct trace_array_cpu *data; - unsigned long *sleeptime; + struct fgraph_times *ftimes; unsigned long flags; unsigned int trace_ctx; long disabled; @@ -168,12 +173,18 @@ int trace_graph_entry(struct ftrace_graph_ent *trace, if (ftrace_graph_ignore_irqs()) return 0; - /* save the current sleep time if we are to ignore it */ - if (!fgraph_sleep_time) { - sleeptime = fgraph_reserve_data(gops->idx, sizeof(*sleeptime)); - if (sleeptime) - *sleeptime = current->ftrace_sleeptime; + if (fgraph_sleep_time) { + /* Only need to record the calltime */ + ftimes = fgraph_reserve_data(gops->idx, sizeof(ftimes->calltime)); + } else { + ftimes = fgraph_reserve_data(gops->idx, sizeof(*ftimes)); + if (ftimes) + ftimes->sleeptime = current->ftrace_sleeptime; } + if (!ftimes) + return 0; + + ftimes->calltime = trace_clock_local(); /* * Stop here if tracing_threshold is set. We only write function return @@ -247,19 +258,13 @@ void __trace_graph_return(struct trace_array *tr, } static void handle_nosleeptime(struct ftrace_graph_ret *trace, - struct fgraph_ops *gops) + struct fgraph_times *ftimes, + int size) { - unsigned long long *sleeptime; - int size; - - if (fgraph_sleep_time) - return; - - sleeptime = fgraph_retrieve_data(gops->idx, &size); - if (!sleeptime) + if (fgraph_sleep_time || size < sizeof(*ftimes)) return; - trace->calltime += current->ftrace_sleeptime - *sleeptime; + ftimes->calltime += current->ftrace_sleeptime - ftimes->sleeptime; } void trace_graph_return(struct ftrace_graph_ret *trace, @@ -268,9 +273,11 @@ void trace_graph_return(struct ftrace_graph_ret *trace, unsigned long *task_var = fgraph_get_task_var(gops); struct trace_array *tr = gops->private; struct trace_array_cpu *data; + struct fgraph_times *ftimes; unsigned long flags; unsigned int trace_ctx; long disabled; + int size; int cpu; ftrace_graph_addr_finish(gops, trace); @@ -280,7 +287,13 @@ void trace_graph_return(struct ftrace_graph_ret *trace, return; } - handle_nosleeptime(trace, gops); + ftimes = fgraph_retrieve_data(gops->idx, &size); + if (!ftimes) + return; + + handle_nosleeptime(trace, ftimes, size); + + trace->calltime = ftimes->calltime; local_irq_save(flags); cpu = raw_smp_processor_id(); @@ -297,6 +310,9 @@ void trace_graph_return(struct ftrace_graph_ret *trace, static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops) { + struct fgraph_times *ftimes; + int size; + ftrace_graph_addr_finish(gops, trace); if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) { @@ -304,10 +320,16 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, return; } - handle_nosleeptime(trace, gops); + ftimes = fgraph_retrieve_data(gops->idx, &size); + if (!ftimes) + return; + + handle_nosleeptime(trace, ftimes, size); + + trace->calltime = ftimes->calltime; if (tracing_thresh && - (trace->rettime - trace->calltime < tracing_thresh)) + (trace->rettime - ftimes->calltime < tracing_thresh)) return; else trace_graph_return(trace, gops); -- cgit v1.2.3 From ac1987f8f525379a0677f7f23c7a7ef2596a338d Mon Sep 17 00:00:00 2001 From: Andrew Kreimer Date: Wed, 11 Sep 2024 14:43:38 +0300 Subject: rv: Fix a typo Fix a typo in comments. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20240911114349.20449-1-algonell@gmail.com Reported-by: Matthew Wilcox Signed-off-by: Andrew Kreimer Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/rv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index dc819aec43e8..279c70e1bd74 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -41,7 +41,7 @@ * per-task monitor, and so on), and the helper functions that glue the * monitor to the system via trace. Generally, a monitor includes some form * of trace output as a reaction for event parsing and exceptions, - * as depicted bellow: + * as depicted below: * * Linux +----- RV Monitor ----------------------------------+ Formal * Realm | | Realm -- cgit v1.2.3 From 21e92806d39c68af2accd1fb238c2daecfcf9fbd Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Sat, 14 Sep 2024 20:29:12 -0700 Subject: function_graph: Support recording and printing the function return address When using function_graph tracer to analyze the flow of kernel function execution, it is often necessary to quickly locate the exact line of code where the call occurs. While this may be easy at times, it can be more time-consuming when some functions are inlined or the flow is too long. This feature aims to simplify the process by recording the return address of traced funcions and printing it when outputing trace logs. To enhance human readability, the prefix 'ret=' is used for the kernel return value, while '<-' serves as the prefix for the return address in trace logs to make it look more like the function tracer. A new trace option named 'funcgraph-retaddr' has been introduced, and the existing option 'sym-addr' can be used to control the format of the return address. See below logs with both funcgraph-retval and funcgraph-retaddr enabled. 0) | load_elf_binary() { /* <-bprm_execve+0x249/0x600 */ 0) | load_elf_phdrs() { /* <-load_elf_binary+0x84/0x1730 */ 0) | __kmalloc_noprof() { /* <-load_elf_phdrs+0x4a/0xb0 */ 0) 3.657 us | __cond_resched(); /* <-__kmalloc_noprof+0x28c/0x390 ret=0x0 */ 0) + 24.335 us | } /* __kmalloc_noprof ret=0xffff8882007f3000 */ 0) | kernel_read() { /* <-load_elf_phdrs+0x6c/0xb0 */ 0) | rw_verify_area() { /* <-kernel_read+0x2b/0x50 */ 0) | security_file_permission() { /* <-kernel_read+0x2b/0x50 */ 0) | selinux_file_permission() { /* <-security_file_permission+0x26/0x40 */ 0) | __inode_security_revalidate() { /* <-selinux_file_permission+0x6d/0x140 */ 0) 2.034 us | __cond_resched(); /* <-__inode_security_revalidate+0x5f/0x80 ret=0x0 */ 0) 6.602 us | } /* __inode_security_revalidate ret=0x0 */ 0) 2.214 us | avc_policy_seqno(); /* <-selinux_file_permission+0x107/0x140 ret=0x0 */ 0) + 16.670 us | } /* selinux_file_permission ret=0x0 */ 0) + 20.809 us | } /* security_file_permission ret=0x0 */ 0) + 25.217 us | } /* rw_verify_area ret=0x0 */ 0) | __kernel_read() { /* <-load_elf_phdrs+0x6c/0xb0 */ 0) | ext4_file_read_iter() { /* <-__kernel_read+0x160/0x2e0 */ Then, we can use the faddr2line to locate the source code, for example: $ ./scripts/faddr2line ./vmlinux load_elf_phdrs+0x6c/0xb0 load_elf_phdrs+0x6c/0xb0: elf_read at fs/binfmt_elf.c:471 (inlined by) load_elf_phdrs at fs/binfmt_elf.c:531 Link: https://lore.kernel.org/20240915032912.1118397-1-dolinux.peng@gmail.com Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202409150605.HgUmU8ea-lkp@intel.com/ Signed-off-by: Donglin Peng [ Rebased to handle text_delta offsets ] Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 27 ++- kernel/trace/Kconfig | 10 + kernel/trace/fgraph.c | 22 ++- kernel/trace/ftrace.c | 3 +- kernel/trace/trace.h | 11 +- kernel/trace/trace_entries.h | 29 ++- kernel/trace/trace_functions_graph.c | 216 ++++++++++++++++----- kernel/trace/trace_irqsoff.c | 3 +- kernel/trace/trace_sched_wakeup.c | 3 +- kernel/trace/trace_selftest.c | 9 +- .../ftrace/test.d/ftrace/fgraph-retval.tc | 2 +- 11 files changed, 274 insertions(+), 61 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e684addf6508..2ac3b3b53cd0 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1040,6 +1040,17 @@ struct ftrace_graph_ent { int depth; } __packed; +/* + * Structure that defines an entry function trace with retaddr. + * It's already packed but the attribute "packed" is needed + * to remove extra padding at the end. + */ +struct fgraph_retaddr_ent { + unsigned long func; /* Current function */ + int depth; + unsigned long retaddr; /* Return address */ +} __packed; + /* * Structure that defines a return function trace. * It's already packed but the attribute "packed" is needed @@ -1057,19 +1068,29 @@ struct ftrace_graph_ret { unsigned long long rettime; } __packed; +struct fgraph_extras; struct fgraph_ops; /* Type of the callback handlers for tracing function graph*/ typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *, struct fgraph_ops *); /* return */ typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *, - struct fgraph_ops *); /* entry */ + struct fgraph_ops *, + struct fgraph_extras *); /* entry */ -extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, struct fgraph_ops *gops); +extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct fgraph_extras *extras); bool ftrace_pids_enabled(struct ftrace_ops *ops); #ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* Used to convey some extra datas when creating a graph entry */ +struct fgraph_extras { + u32 flags; + unsigned long retaddr; +}; + struct fgraph_ops { trace_func_graph_ent_t entryfunc; trace_func_graph_ret_t retfunc; @@ -1115,6 +1136,8 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, unsigned long ret, unsigned long *retp); unsigned long *fgraph_get_task_var(struct fgraph_ops *gops); +u32 graph_tracer_flags_get(u32 flags); + /* * Sometimes we don't want to trace a function with the function * graph tracer but we want them to keep traced by the usual function diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 721c3b221048..74c2b1d43bb9 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -242,6 +242,16 @@ config FUNCTION_GRAPH_RETVAL enable it via the trace option funcgraph-retval. See Documentation/trace/ftrace.rst +config FUNCTION_GRAPH_RETADDR + bool "Kernel Function Graph Return Address" + depends on FUNCTION_GRAPH_TRACER + default n + help + Support recording and printing the function return address when + using function graph tracer. It can be helpful to locate code line that + the function is called. This feature is off by default, and you can + enable it via the trace option funcgraph-retaddr. + config DYNAMIC_FTRACE bool "enable/disable function tracing dynamically" depends on FUNCTION_TRACER diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 58a28ec35dab..875aefe60a13 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -290,7 +290,8 @@ static inline unsigned long make_data_type_val(int idx, int size, int offset) } /* ftrace_graph_entry set to this to tell some archs to run function graph */ -static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops) +static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops, + struct fgraph_extras *extras) { return 0; } @@ -518,7 +519,8 @@ int __weak ftrace_disable_ftrace_graph_caller(void) #endif int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct fgraph_extras *extras) { return 0; } @@ -646,13 +648,20 @@ int function_graph_enter(unsigned long ret, unsigned long func, unsigned long frame_pointer, unsigned long *retp) { struct ftrace_graph_ent trace; + struct fgraph_extras extras; unsigned long bitmap = 0; int offset; int i; + int idx = 0; trace.func = func; trace.depth = ++current->curr_ret_depth; + extras.flags = graph_tracer_flags_get(TRACE_GRAPH_PRINT_RETADDR); + if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) + && extras.flags & TRACE_GRAPH_PRINT_RETADDR) + extras.retaddr = ftrace_graph_ret_addr(current, &idx, ret, retp); + offset = ftrace_push_return_trace(ret, func, frame_pointer, retp, 0); if (offset < 0) goto out; @@ -661,7 +670,7 @@ int function_graph_enter(unsigned long ret, unsigned long func, if (static_branch_likely(&fgraph_do_direct)) { int save_curr_ret_stack = current->curr_ret_stack; - if (static_call(fgraph_func)(&trace, fgraph_direct_gops)) + if (static_call(fgraph_func)(&trace, fgraph_direct_gops, &extras)) bitmap |= BIT(fgraph_direct_gops->idx); else /* Clear out any saved storage */ @@ -679,7 +688,7 @@ int function_graph_enter(unsigned long ret, unsigned long func, save_curr_ret_stack = current->curr_ret_stack; if (ftrace_ops_test(&gops->ops, func, NULL) && - gops->entryfunc(&trace, gops)) + gops->entryfunc(&trace, gops, &extras)) bitmap |= BIT(i); else /* Clear out any saved storage */ @@ -1136,7 +1145,8 @@ void ftrace_graph_exit_task(struct task_struct *t) #ifdef CONFIG_DYNAMIC_FTRACE static int fgraph_pid_func(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct fgraph_extras *extras) { struct trace_array *tr = gops->ops.private; int pid; @@ -1150,7 +1160,7 @@ static int fgraph_pid_func(struct ftrace_graph_ent *trace, return 0; } - return gops->saved_func(trace, gops); + return gops->saved_func(trace, gops, NULL); } void fgraph_update_pid_func(void) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cae388122ca8..5d87dac83b80 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -827,7 +827,8 @@ struct profile_fgraph_data { }; static int profile_graph_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct fgraph_extras *extras) { struct profile_fgraph_data *profile_data; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2f8017f8d34d..13f08f257c0b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -46,6 +46,7 @@ enum trace_type { TRACE_BRANCH, TRACE_GRAPH_RET, TRACE_GRAPH_ENT, + TRACE_GRAPH_RETADDR_ENT, TRACE_USER_STACK, TRACE_BLK, TRACE_BPUTS, @@ -512,6 +513,8 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ TRACE_GRAPH_ENT); \ + IF_ASSIGN(var, ent, struct fgraph_retaddr_ent_entry,\ + TRACE_GRAPH_RETADDR_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ IF_ASSIGN(var, ent, struct func_repeats_entry, \ @@ -692,7 +695,8 @@ void trace_default_header(struct seq_file *m); void print_trace_header(struct seq_file *m, struct trace_iterator *iter); void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops); -int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops); +int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, + struct fgraph_extras *extras); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); @@ -879,6 +883,7 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) #define TRACE_GRAPH_GRAPH_TIME 0x400 #define TRACE_GRAPH_PRINT_RETVAL 0x800 #define TRACE_GRAPH_PRINT_RETVAL_HEX 0x1000 +#define TRACE_GRAPH_PRINT_RETADDR 0x2000 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) @@ -900,6 +905,10 @@ extern void graph_trace_close(struct trace_iterator *iter); extern int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned int trace_ctx); +extern int __trace_graph_retaddr_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned int trace_ctx, + unsigned long retaddr); extern void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned int trace_ctx); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index c47422b20908..82fd174ebbe0 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -85,9 +85,35 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth) ); -/* Function return entry */ +#ifdef CONFIG_FUNCTION_GRAPH_RETADDR + +/* Function call entry with a return address */ +FTRACE_ENTRY_PACKED(fgraph_retaddr_entry, fgraph_retaddr_ent_entry, + + TRACE_GRAPH_RETADDR_ENT, + + F_STRUCT( + __field_struct( struct fgraph_retaddr_ent, graph_ent ) + __field_packed( unsigned long, graph_ent, func ) + __field_packed( int, graph_ent, depth ) + __field_packed( unsigned long, graph_ent, retaddr ) + ), + + F_printk("--> %ps (%d) <- %ps", (void *)__entry->func, __entry->depth, + (void *)__entry->retaddr) +); + +#else + +#ifndef fgraph_retaddr_ent_entry +#define fgraph_retaddr_ent_entry ftrace_graph_ent_entry +#endif + +#endif + #ifdef CONFIG_FUNCTION_GRAPH_RETVAL +/* Function return entry */ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, TRACE_GRAPH_RET, @@ -110,6 +136,7 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, #else +/* Function return entry */ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, TRACE_GRAPH_RET, diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 5c1b150fbba3..3dd63ae2afe8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -31,7 +31,10 @@ struct fgraph_data { struct fgraph_cpu_data __percpu *cpu_data; /* Place to preserve last processed entry. */ - struct ftrace_graph_ent_entry ent; + union { + struct ftrace_graph_ent_entry ent; + struct fgraph_retaddr_ent_entry rent; + } ent; struct ftrace_graph_ret_entry ret; int failed; int cpu; @@ -63,6 +66,10 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-retval, TRACE_GRAPH_PRINT_RETVAL) }, /* Display function return value in hexadecimal format ? */ { TRACER_OPT(funcgraph-retval-hex, TRACE_GRAPH_PRINT_RETVAL_HEX) }, +#endif +#ifdef CONFIG_FUNCTION_GRAPH_RETADDR + /* Display function return address ? */ + { TRACER_OPT(funcgraph-retaddr, TRACE_GRAPH_PRINT_RETADDR) }, #endif /* Include sleep time (scheduled out) between entry and return */ { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) }, @@ -83,6 +90,11 @@ static struct tracer_flags tracer_flags = { .opts = trace_opts }; +u32 graph_tracer_flags_get(u32 flags) +{ + return tracer_flags.val & flags; +} + /* * DURATION column is being also used to display IRQ signs, * following values are used by print_graph_irq and others @@ -119,6 +131,40 @@ int __trace_graph_entry(struct trace_array *tr, return 1; } +#ifdef CONFIG_FUNCTION_GRAPH_RETADDR +int __trace_graph_retaddr_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned int trace_ctx, + unsigned long retaddr) +{ + struct trace_event_call *call = &event_fgraph_retaddr_entry; + struct ring_buffer_event *event; + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct fgraph_retaddr_ent_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RETADDR_ENT, + sizeof(*entry), trace_ctx); + if (!event) + return 0; + entry = ring_buffer_event_data(event); + entry->graph_ent.func = trace->func; + entry->graph_ent.depth = trace->depth; + entry->graph_ent.retaddr = retaddr; + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit_nostack(buffer, event); + + return 1; +} +#else +int __trace_graph_retaddr_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned int trace_ctx, + unsigned long retaddr) +{ + return 1; +} +#endif + static inline int ftrace_graph_ignore_irqs(void) { if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT)) @@ -133,7 +179,8 @@ struct fgraph_times { }; int trace_graph_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct fgraph_extras *extras) { unsigned long *task_var = fgraph_get_task_var(gops); struct trace_array *tr = gops->private; @@ -199,7 +246,12 @@ int trace_graph_entry(struct ftrace_graph_ent *trace, disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { trace_ctx = tracing_gen_ctx_flags(flags); - ret = __trace_graph_entry(tr, trace, trace_ctx); + if (unlikely(IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && extras + && (extras->flags & TRACE_GRAPH_PRINT_RETADDR))) + ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, + extras->retaddr); + else + ret = __trace_graph_entry(tr, trace, trace_ctx); } else { ret = 0; } @@ -507,7 +559,7 @@ get_return_for_leaf(struct trace_iterator *iter, * then we just reuse the data from before. */ if (data && data->failed) { - curr = &data->ent; + curr = &data->ent.ent; next = &data->ret; } else { @@ -537,7 +589,10 @@ get_return_for_leaf(struct trace_iterator *iter, * Save current and next entries for later reference * if the output fails. */ - data->ent = *curr; + if (unlikely(curr->ent.type == TRACE_GRAPH_RETADDR_ENT)) + data->ent.rent = *(struct fgraph_retaddr_ent_entry *)curr; + else + data->ent.ent = *curr; /* * If the next event is not a return type, then * we only care about what type it is. Otherwise we can @@ -701,52 +756,96 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, } #ifdef CONFIG_FUNCTION_GRAPH_RETVAL - #define __TRACE_GRAPH_PRINT_RETVAL TRACE_GRAPH_PRINT_RETVAL +#else +#define __TRACE_GRAPH_PRINT_RETVAL 0 +#endif -static void print_graph_retval(struct trace_seq *s, unsigned long retval, - bool leaf, void *func, bool hex_format) +#ifdef CONFIG_FUNCTION_GRAPH_RETADDR +#define __TRACE_GRAPH_PRINT_RETADDR TRACE_GRAPH_PRINT_RETADDR +static void print_graph_retaddr(struct trace_seq *s, struct fgraph_retaddr_ent_entry *entry, + u32 trace_flags, bool comment) +{ + if (comment) + trace_seq_puts(s, " /*"); + + trace_seq_puts(s, " <-"); + seq_print_ip_sym(s, entry->graph_ent.retaddr, trace_flags | TRACE_ITER_SYM_OFFSET); + + if (comment) + trace_seq_puts(s, " */"); +} +#else +#define __TRACE_GRAPH_PRINT_RETADDR 0 +#define print_graph_retaddr(_seq, _entry, _tflags, _comment) do { } while (0) +#endif + +#if defined(CONFIG_FUNCTION_GRAPH_RETVAL) || defined(CONFIG_FUNCTION_GRAPH_RETADDR) + +static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entry *entry, + struct ftrace_graph_ret *graph_ret, void *func, + u32 opt_flags, u32 trace_flags) { unsigned long err_code = 0; + unsigned long retval = 0; + bool print_retaddr = false; + bool print_retval = false; + bool hex_format = !!(opt_flags & TRACE_GRAPH_PRINT_RETVAL_HEX); - if (retval == 0 || hex_format) - goto done; +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + retval = graph_ret->retval; + print_retval = !!(opt_flags & TRACE_GRAPH_PRINT_RETVAL); +#endif - /* Check if the return value matches the negative format */ - if (IS_ENABLED(CONFIG_64BIT) && (retval & BIT(31)) && - (((u64)retval) >> 32) == 0) { - /* sign extension */ - err_code = (unsigned long)(s32)retval; - } else { - err_code = retval; +#ifdef CONFIG_FUNCTION_GRAPH_RETADDR + print_retaddr = !!(opt_flags & TRACE_GRAPH_PRINT_RETADDR); +#endif + + if (print_retval && retval && !hex_format) { + /* Check if the return value matches the negative format */ + if (IS_ENABLED(CONFIG_64BIT) && (retval & BIT(31)) && + (((u64)retval) >> 32) == 0) { + err_code = sign_extend64(retval, 31); + } else { + err_code = retval; + } + + if (!IS_ERR_VALUE(err_code)) + err_code = 0; } - if (!IS_ERR_VALUE(err_code)) - err_code = 0; + if (entry) { + if (entry->ent.type != TRACE_GRAPH_RETADDR_ENT) + print_retaddr = false; -done: - if (leaf) { - if (hex_format || (err_code == 0)) - trace_seq_printf(s, "%ps(); /* = 0x%lx */\n", - func, retval); + trace_seq_printf(s, "%ps();", func); + if (print_retval || print_retaddr) + trace_seq_puts(s, " /*"); else - trace_seq_printf(s, "%ps(); /* = %ld */\n", - func, err_code); + trace_seq_putc(s, '\n'); } else { + print_retaddr = false; + trace_seq_printf(s, "} /* %ps", func); + } + + if (print_retaddr) + print_graph_retaddr(s, (struct fgraph_retaddr_ent_entry *)entry, + trace_flags, false); + + if (print_retval) { if (hex_format || (err_code == 0)) - trace_seq_printf(s, "} /* %ps = 0x%lx */\n", - func, retval); + trace_seq_printf(s, " ret=0x%lx", retval); else - trace_seq_printf(s, "} /* %ps = %ld */\n", - func, err_code); + trace_seq_printf(s, " ret=%ld", err_code); } + + if (!entry || print_retval || print_retaddr) + trace_seq_puts(s, " */\n"); } #else -#define __TRACE_GRAPH_PRINT_RETVAL 0 - -#define print_graph_retval(_seq, _retval, _leaf, _func, _format) do {} while (0) +#define print_graph_retval(_seq, _ent, _ret, _func, _opt_flags, _trace_flags) do {} while (0) #endif @@ -798,14 +897,15 @@ print_graph_entry_leaf(struct trace_iterator *iter, trace_seq_putc(s, ' '); /* - * Write out the function return value if the option function-retval is - * enabled. + * Write out the function return value or return address */ - if (flags & __TRACE_GRAPH_PRINT_RETVAL) - print_graph_retval(s, graph_ret->retval, true, (void *)func, - !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX)); - else + if (flags & (__TRACE_GRAPH_PRINT_RETVAL | __TRACE_GRAPH_PRINT_RETADDR)) { + print_graph_retval(s, entry, graph_ret, + (void *)graph_ret->func + iter->tr->text_delta, + flags, tr->trace_flags); + } else { trace_seq_printf(s, "%ps();\n", (void *)func); + } print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET, cpu, iter->ent->pid, flags); @@ -846,7 +946,12 @@ print_graph_entry_nested(struct trace_iterator *iter, func = call->func + iter->tr->text_delta; - trace_seq_printf(s, "%ps() {\n", (void *)func); + trace_seq_printf(s, "%ps() {", (void *)func); + if (flags & __TRACE_GRAPH_PRINT_RETADDR && + entry->ent.type == TRACE_GRAPH_RETADDR_ENT) + print_graph_retaddr(s, (struct fgraph_retaddr_ent_entry *)entry, + tr->trace_flags, true); + trace_seq_putc(s, '\n'); if (trace_seq_has_overflowed(s)) return TRACE_TYPE_PARTIAL_LINE; @@ -1093,11 +1198,10 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, /* * Always write out the function name and its return value if the - * function-retval option is enabled. + * funcgraph-retval option is enabled. */ if (flags & __TRACE_GRAPH_PRINT_RETVAL) { - print_graph_retval(s, trace->retval, false, (void *)func, - !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX)); + print_graph_retval(s, NULL, trace, (void *)func, flags, tr->trace_flags); } else { /* * If the return function does not have a matching entry, @@ -1212,7 +1316,7 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) * to print out the missing entry which would never go out. */ if (data && data->failed) { - field = &data->ent; + field = &data->ent.ent; iter->cpu = data->cpu; ret = print_graph_entry(field, s, iter, flags); if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) { @@ -1236,6 +1340,16 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) saved = *field; return print_graph_entry(&saved, s, iter, flags); } +#ifdef CONFIG_FUNCTION_GRAPH_RETADDR + case TRACE_GRAPH_RETADDR_ENT: { + struct fgraph_retaddr_ent_entry saved; + struct fgraph_retaddr_ent_entry *rfield; + + trace_assign_type(rfield, entry); + saved = *rfield; + return print_graph_entry((struct ftrace_graph_ent_entry *)&saved, s, iter, flags); + } +#endif case TRACE_GRAPH_RET: { struct ftrace_graph_ret_entry *field; trace_assign_type(field, entry); @@ -1430,6 +1544,13 @@ static struct trace_event graph_trace_entry_event = { .funcs = &graph_functions, }; +#ifdef CONFIG_FUNCTION_GRAPH_RETADDR +static struct trace_event graph_trace_retaddr_entry_event = { + .type = TRACE_GRAPH_RETADDR_ENT, + .funcs = &graph_functions, +}; +#endif + static struct trace_event graph_trace_ret_event = { .type = TRACE_GRAPH_RET, .funcs = &graph_functions @@ -1516,6 +1637,13 @@ static __init int init_graph_trace(void) return 1; } +#ifdef CONFIG_FUNCTION_GRAPH_RETADDR + if (!register_trace_event(&graph_trace_retaddr_entry_event)) { + pr_warn("Warning: could not register graph trace retaddr events\n"); + return 1; + } +#endif + if (!register_trace_event(&graph_trace_ret_event)) { pr_warn("Warning: could not register graph trace events\n"); return 1; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index fce064e20570..eb3aa36cf10f 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -176,7 +176,8 @@ static int irqsoff_display_graph(struct trace_array *tr, int set) } static int irqsoff_graph_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct fgraph_extras *extras) { struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index ae2ace5e515a..155de2551507 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -113,7 +113,8 @@ static int wakeup_display_graph(struct trace_array *tr, int set) } static int wakeup_graph_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct fgraph_extras *extras) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index c4ad7cd7e778..fbb99f8c8062 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_PRINT: case TRACE_BRANCH: case TRACE_GRAPH_ENT: + case TRACE_GRAPH_RETADDR_ENT: case TRACE_GRAPH_RET: return 1; } @@ -773,7 +774,8 @@ struct fgraph_fixture { }; static __init int store_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct fgraph_extras *extras) { struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops); const char *type = fixture->store_type_name; @@ -1024,7 +1026,8 @@ static unsigned int graph_hang_thresh; /* Wrap the real function entry probe to avoid possible hanging */ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct fgraph_extras *extras) { /* This is harmlessly racy, we want to approximately detect a hang */ if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { @@ -1038,7 +1041,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace, return 0; } - return trace_graph_entry(trace, gops); + return trace_graph_entry(trace, gops, NULL); } static struct fgraph_ops fgraph_ops __initdata = { diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-retval.tc b/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-retval.tc index e34c0bdef3ed..e8e46378b88d 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-retval.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-retval.tc @@ -29,7 +29,7 @@ set -e : "Test printing the error code in signed decimal format" echo 0 > options/funcgraph-retval-hex -count=`cat trace | grep 'proc_reg_write' | grep '= -5' | wc -l` +count=`cat trace | grep 'proc_reg_write' | grep '=-5' | wc -l` if [ $count -eq 0 ]; then fail "Return value can not be printed in signed decimal format" fi -- cgit v1.2.3 From 474ec3e849686a02d00c5bd7a80c3042505b66bb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 16 Sep 2024 19:58:19 +0200 Subject: function_graph: Remove unnecessary initialization in ftrace_graph_ret_addr() After the commit 29c1c24a2707 ("function_graph: Fix up ftrace_graph_ret_addr()") ftrace_graph_ret_addr() doesn't need to initialize "int i" at the start. Cc: Masami Hiramatsu Cc: Puranjay Mohan Link: https://lore.kernel.org/20240916175818.GA28944@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 875aefe60a13..27e523f01ed2 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -930,7 +930,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, { struct ftrace_ret_stack *ret_stack; unsigned long return_handler = (unsigned long)dereference_kernel_function_descriptor(return_to_handler); - int i = task->curr_ret_stack; + int i; if (ret != return_handler) return ret; -- cgit v1.2.3 From e32540b1e4b37fd720b59f8a504d7592fc3483bf Mon Sep 17 00:00:00 2001 From: Li Chen Date: Mon, 19 Aug 2024 14:01:53 +0800 Subject: ftrace: Use this_cpu_ptr() instead of per_cpu_ptr(smp_processor_id()) Use this_cpu_ptr() instead of open coding the equivalent in various ftrace functions. Cc: Mathieu Desnoyers Cc: Masami Hiramatsu Link: https://lore.kernel.org/87y14t6ofi.wl-me@linux.beauty Signed-off-by: Li Chen Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 3b0cea37e029..65fed0bbc5c2 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -184,7 +184,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, struct trace_array_cpu *data; unsigned int trace_ctx; int bit; - int cpu; if (unlikely(!tr->function_enabled)) return; @@ -195,8 +194,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, trace_ctx = tracing_gen_ctx(); - cpu = smp_processor_id(); - data = per_cpu_ptr(tr->array_buffer.data, cpu); + data = this_cpu_ptr(tr->array_buffer.data); if (!atomic_read(&data->disabled)) trace_function(tr, ip, parent_ip, trace_ctx); @@ -300,7 +298,6 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, unsigned int trace_ctx; unsigned long flags; int bit; - int cpu; if (unlikely(!tr->function_enabled)) return; @@ -309,8 +306,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; - cpu = smp_processor_id(); - data = per_cpu_ptr(tr->array_buffer.data, cpu); + data = this_cpu_ptr(tr->array_buffer.data); if (atomic_read(&data->disabled)) goto out; @@ -321,7 +317,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, * TODO: think about a solution that is better than just hoping to be * lucky. */ - last_info = per_cpu_ptr(tr->last_func_repeats, cpu); + last_info = this_cpu_ptr(tr->last_func_repeats); if (is_repeat_check(tr, last_info, ip, parent_ip)) goto out; -- cgit v1.2.3 From 2aa746ec0240dcbe70aef10f40fb1518f6dfb137 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Mon, 26 Aug 2024 10:40:49 -0700 Subject: tracing/branch-profiler: Replace deprecated strncpy with strscpy strncpy() is deprecated for use on NUL-terminated destination strings [1] and as such we should prefer more robust and less ambiguous string interfaces. Both of these fields want to be NUL-terminated as per their use in printk: F_printk("%u:%s:%s (%u)%s", __entry->line, __entry->func, __entry->file, __entry->correct, __entry->constant ? " CONSTANT" : "") Use strscpy() as it NUL-terminates the destination buffer, so it doesn't have to be done manually. Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings [1] Link: https://manpages.debian.org/testing/linux-manual-4.8/strscpy.9.en.html Link: https://github.com/KSPP/linux/issues/90 Cc: linux-hardening@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Kees Cook Link: https://lore.kernel.org/20240826-strncpy-kernel-trace-trace_branch-c-v1-1-b2c14f2e9e84@google.com Signed-off-by: Justin Stitt Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_branch.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index e47fdb4c92fb..aa63548873c3 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -74,10 +74,8 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) p--; p++; - strncpy(entry->func, f->data.func, TRACE_FUNC_SIZE); - strncpy(entry->file, p, TRACE_FILE_SIZE); - entry->func[TRACE_FUNC_SIZE] = 0; - entry->file[TRACE_FILE_SIZE] = 0; + strscpy(entry->func, f->data.func); + strscpy(entry->file, p); entry->constant = f->constant; entry->line = f->data.line; entry->correct = val == expect; -- cgit v1.2.3 From 49e4154f4b16345da5e219b23ed9737a6e735bc1 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Wed, 11 Sep 2024 09:00:26 +0800 Subject: tracing: Remove TRACE_EVENT_FL_FILTERED logic After commit dcb0b5575d24 ("tracing: Remove TRACE_EVENT_FL_USE_CALL_FILTER logic"), no one's going to set the TRACE_EVENT_FL_FILTERED or change the call->filter, so remove related logic. Link: https://lore.kernel.org/20240911010026.2302849-1-zhengyejian@huaweicloud.com Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 4 ---- kernel/trace/trace.c | 44 ++++++++---------------------------- kernel/trace/trace.h | 4 ---- kernel/trace/trace_branch.c | 4 +--- kernel/trace/trace_events.c | 2 -- kernel/trace/trace_functions_graph.c | 8 ++----- kernel/trace/trace_hwlat.c | 4 +--- kernel/trace/trace_mmiotrace.c | 8 ++----- kernel/trace/trace_osnoise.c | 12 +++------- kernel/trace/trace_sched_wakeup.c | 8 ++----- 10 files changed, 20 insertions(+), 78 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 42bedcddd511..f8f2e52653df 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -326,7 +326,6 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, void trace_event_buffer_commit(struct trace_event_buffer *fbuffer); enum { - TRACE_EVENT_FL_FILTERED_BIT, TRACE_EVENT_FL_CAP_ANY_BIT, TRACE_EVENT_FL_NO_SET_FILTER_BIT, TRACE_EVENT_FL_IGNORE_ENABLE_BIT, @@ -341,7 +340,6 @@ enum { /* * Event flags: - * FILTERED - The event has a filter attached * CAP_ANY - Any user can enable for perf * NO_SET_FILTER - Set when filter has error and is to be ignored * IGNORE_ENABLE - For trace internal events, do not enable with debugfs file @@ -356,7 +354,6 @@ enum { * to a tracepoint yet, then it is cleared when it is. */ enum { - TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT), @@ -381,7 +378,6 @@ struct trace_event_call { }; struct trace_event event; char *print_fmt; - struct event_filter *filter; /* * Static events can disappear with modules, * where as dynamic ones need their own ref count. diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1c69ca1f1088..bdb776e6ceb9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -593,19 +593,6 @@ int tracing_check_open_get_tr(struct trace_array *tr) return 0; } -int call_filter_check_discard(struct trace_event_call *call, void *rec, - struct trace_buffer *buffer, - struct ring_buffer_event *event) -{ - if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && - !filter_match_preds(call->filter, rec)) { - __trace_event_discard_commit(buffer, event); - return 1; - } - - return 0; -} - /** * trace_find_filtered_pid - check if a pid exists in a filtered_pid list * @filtered_pids: The list of pids to check @@ -2889,7 +2876,6 @@ void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned int trace_ctx) { - struct trace_event_call *call = &event_function; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; @@ -2902,11 +2888,9 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long entry->ip = ip; entry->parent_ip = parent_ip; - if (!call_filter_check_discard(call, entry, buffer, event)) { - if (static_branch_unlikely(&trace_function_exports_enabled)) - ftrace_exports(event, TRACE_EXPORT_FUNCTION); - __buffer_unlock_commit(buffer, event); - } + if (static_branch_unlikely(&trace_function_exports_enabled)) + ftrace_exports(event, TRACE_EXPORT_FUNCTION); + __buffer_unlock_commit(buffer, event); } #ifdef CONFIG_STACKTRACE @@ -2932,7 +2916,6 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs) { - struct trace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; unsigned int size, nr_entries; struct ftrace_stack *fstack; @@ -2986,8 +2969,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, memcpy(&entry->caller, fstack->calls, flex_array_size(entry, caller, nr_entries)); - if (!call_filter_check_discard(call, entry, buffer, event)) - __buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); out: /* Again, don't let gcc optimize things here */ @@ -3060,7 +3042,6 @@ static void ftrace_trace_userstack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx) { - struct trace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; @@ -3094,8 +3075,7 @@ ftrace_trace_userstack(struct trace_array *tr, memset(&entry->caller, 0, sizeof(entry->caller)); stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES); - if (!call_filter_check_discard(call, entry, buffer, event)) - __buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); out_drop_count: __this_cpu_dec(user_stack_count); @@ -3264,7 +3244,6 @@ static void trace_printk_start_stop_comm(int enabled) */ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { - struct trace_event_call *call = &event_bprint; struct ring_buffer_event *event; struct trace_buffer *buffer; struct trace_array *tr = READ_ONCE(printk_trace); @@ -3308,10 +3287,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) entry->fmt = fmt; memcpy(entry->buf, tbuffer, sizeof(u32) * len); - if (!call_filter_check_discard(call, entry, buffer, event)) { - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); - } + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); out: ring_buffer_nest_end(buffer); @@ -3331,7 +3308,6 @@ static int __trace_array_vprintk(struct trace_buffer *buffer, unsigned long ip, const char *fmt, va_list args) { - struct trace_event_call *call = &event_print; struct ring_buffer_event *event; int len = 0, size; struct print_entry *entry; @@ -3366,10 +3342,8 @@ __trace_array_vprintk(struct trace_buffer *buffer, entry->ip = ip; memcpy(&entry->buf, tbuffer, len + 1); - if (!call_filter_check_discard(call, entry, buffer, event)) { - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); - } + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); out: ring_buffer_nest_end(buffer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c866991b9c78..638f452eec10 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1429,10 +1429,6 @@ struct trace_subsystem_dir { int nr_events; }; -extern int call_filter_check_discard(struct trace_event_call *call, void *rec, - struct trace_buffer *buffer, - struct ring_buffer_event *event); - void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct trace_buffer *buffer, struct ring_buffer_event *event, diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index aa63548873c3..6d08a5523ce0 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -30,7 +30,6 @@ static struct trace_array *branch_tracer; static void probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) { - struct trace_event_call *call = &event_branch; struct trace_array *tr = branch_tracer; struct trace_buffer *buffer; struct trace_array_cpu *data; @@ -80,8 +79,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) entry->line = f->data.line; entry->correct = val == expect; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); out: current->trace_recursion &= ~TRACE_BRANCH_BIT; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7266ec2a4eea..77e68efbd43e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3149,8 +3149,6 @@ static void __trace_remove_event_call(struct trace_event_call *call) { event_remove(call); trace_destroy_fields(call); - free_event_filter(call->filter); - call->filter = NULL; } static int probe_remove_event_call(struct trace_event_call *call) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a569daaac4c4..ab57ec78ca04 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -102,7 +102,6 @@ int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned int trace_ctx) { - struct trace_event_call *call = &event_funcgraph_entry; struct ring_buffer_event *event; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ftrace_graph_ent_entry *entry; @@ -113,8 +112,7 @@ int __trace_graph_entry(struct trace_array *tr, return 0; entry = ring_buffer_event_data(event); entry->graph_ent = *trace; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); return 1; } @@ -223,7 +221,6 @@ void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned int trace_ctx) { - struct trace_event_call *call = &event_funcgraph_exit; struct ring_buffer_event *event; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ftrace_graph_ret_entry *entry; @@ -234,8 +231,7 @@ void __trace_graph_return(struct trace_array *tr, return; entry = ring_buffer_event_data(event); entry->ret = *trace; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } void trace_graph_return(struct ftrace_graph_ret *trace, diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 3bd6071441ad..b65353ec2837 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -130,7 +130,6 @@ static bool hwlat_busy; static void trace_hwlat_sample(struct hwlat_sample *sample) { struct trace_array *tr = hwlat_trace; - struct trace_event_call *call = &event_hwlat; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct hwlat_entry *entry; @@ -148,8 +147,7 @@ static void trace_hwlat_sample(struct hwlat_sample *sample) entry->nmi_count = sample->nmi_count; entry->count = sample->count; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } /* Macros to encapsulate the time capturing infrastructure */ diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 64e77b513697..ba5858866b2f 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -294,7 +294,6 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_rw *rw) { - struct trace_event_call *call = &event_mmiotrace_rw; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; @@ -310,8 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->rw = *rw; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); } void mmio_trace_rw(struct mmiotrace_rw *rw) @@ -325,7 +323,6 @@ static void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_map *map) { - struct trace_event_call *call = &event_mmiotrace_map; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; @@ -341,8 +338,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->map = *map; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); } void mmio_trace_mapping(struct mmiotrace_map *map) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index a50ed23bee77..b9f96c77527d 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -499,7 +499,6 @@ static void print_osnoise_headers(struct seq_file *s) static void __trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer) { - struct trace_event_call *call = &event_osnoise; struct ring_buffer_event *event; struct osnoise_entry *entry; @@ -517,8 +516,7 @@ __trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffe entry->softirq_count = sample->softirq_count; entry->thread_count = sample->thread_count; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } /* @@ -578,7 +576,6 @@ static void print_timerlat_headers(struct seq_file *s) static void __trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer) { - struct trace_event_call *call = &event_osnoise; struct ring_buffer_event *event; struct timerlat_entry *entry; @@ -591,8 +588,7 @@ __trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buf entry->context = sample->context; entry->timer_latency = sample->timer_latency; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } /* @@ -654,7 +650,6 @@ static void timerlat_save_stack(int skip) static void __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, unsigned int size) { - struct trace_event_call *call = &event_osnoise; struct ring_buffer_event *event; struct stack_entry *entry; @@ -668,8 +663,7 @@ __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, u memcpy(&entry->caller, fstack->calls, size); entry->size = fstack->nr_entries; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } /* diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index ae2ace5e515a..d6c7f18daa15 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -378,7 +378,6 @@ tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *next, unsigned int trace_ctx) { - struct trace_event_call *call = &event_context_switch; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -396,8 +395,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_state = task_state_index(next); entry->next_cpu = task_cpu(next); - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); } static void @@ -406,7 +404,6 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *curr, unsigned int trace_ctx) { - struct trace_event_call *call = &event_wakeup; struct ring_buffer_event *event; struct ctx_switch_entry *entry; struct trace_buffer *buffer = tr->array_buffer.buffer; @@ -424,8 +421,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_state = task_state_index(wakee); entry->next_cpu = task_cpu(wakee); - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); } static void notrace -- cgit v1.2.3 From 4a8840af5f53f2902eba91130fae650879f18e7a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 8 Oct 2024 12:17:19 -0700 Subject: tracepoints: Use new static branch API The old static key API is deprecated. Switch to the new one. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Alice Ryhl Link: https://lore.kernel.org/7a08dae3c5eddb14b13864923c1b58ac1f4af83c.1728414936.git.jpoimboe@kernel.org Signed-off-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint-defs.h | 4 ++-- include/linux/tracepoint.h | 8 ++++---- kernel/trace/trace_events_hist.c | 2 +- kernel/trace/trace_events_user.c | 4 ++-- kernel/tracepoint.c | 4 ++-- 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index 4dc4955f0fbf..60a6e8314d4c 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -31,7 +31,7 @@ struct tracepoint_func { struct tracepoint { const char *name; /* Tracepoint name */ - struct static_key key; + struct static_key_false key; struct static_call_key *static_call_key; void *static_call_tramp; void *iterator; @@ -83,7 +83,7 @@ struct bpf_raw_event_map { #ifdef CONFIG_TRACEPOINTS # define tracepoint_enabled(tp) \ - static_key_false(&(__tracepoint_##tp).key) + static_branch_unlikely(&(__tracepoint_##tp).key) #else # define tracepoint_enabled(tracepoint) false #endif diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 93a9f3070b48..2a29334bbc02 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -248,7 +248,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DECLARE_TRACE_RCU(name, proto, args, cond) \ static inline void trace_##name##_rcuidle(proto) \ { \ - if (static_key_false(&__tracepoint_##name.key)) \ + if (static_branch_unlikely(&__tracepoint_##name.key)) \ __DO_TRACE(name, \ TP_ARGS(args), \ TP_CONDITION(cond), 1); \ @@ -274,7 +274,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) extern struct tracepoint __tracepoint_##name; \ static inline void trace_##name(proto) \ { \ - if (static_key_false(&__tracepoint_##name.key)) \ + if (static_branch_unlikely(&__tracepoint_##name.key)) \ __DO_TRACE(name, \ TP_ARGS(args), \ TP_CONDITION(cond), 0); \ @@ -311,7 +311,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) static inline bool \ trace_##name##_enabled(void) \ { \ - return static_key_false(&__tracepoint_##name.key); \ + return static_branch_unlikely(&__tracepoint_##name.key);\ } /* @@ -328,7 +328,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) struct tracepoint __tracepoint_##_name __used \ __section("__tracepoints") = { \ .name = __tpstrtab_##_name, \ - .key = STATIC_KEY_INIT_FALSE, \ + .key = STATIC_KEY_FALSE_INIT, \ .static_call_key = &STATIC_CALL_KEY(tp_func_##_name), \ .static_call_tramp = STATIC_CALL_TRAMP_ADDR(tp_func_##_name), \ .iterator = &__traceiter_##_name, \ diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5f9119eb7c67..cc2924ad32a3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -822,7 +822,7 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals, { struct tracepoint *tp = event->tp; - if (unlikely(atomic_read(&tp->key.enabled) > 0)) { + if (unlikely(static_key_enabled(&tp->key))) { struct tracepoint_func *probe_func_ptr; synth_probe_func_t probe_func; void *__data; diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 42b0d998d103..17bcad8f79de 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1676,7 +1676,7 @@ static void update_enable_bit_for(struct user_event *user) struct tracepoint *tp = &user->tracepoint; char status = 0; - if (atomic_read(&tp->key.enabled) > 0) { + if (static_key_enabled(&tp->key)) { struct tracepoint_func *probe_func_ptr; user_event_func_t probe_func; @@ -2280,7 +2280,7 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) * It's possible key.enabled disables after this check, however * we don't mind if a few events are included in this condition. */ - if (likely(atomic_read(&tp->key.enabled) > 0)) { + if (likely(static_key_enabled(&tp->key))) { struct tracepoint_func *probe_func_ptr; user_event_func_t probe_func; struct iov_iter copy; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 8879da16ef4d..1e3de77ea6b3 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -358,7 +358,7 @@ static int tracepoint_add_func(struct tracepoint *tp, tracepoint_update_call(tp, tp_funcs); /* Both iterator and static call handle NULL tp->funcs */ rcu_assign_pointer(tp->funcs, tp_funcs); - static_key_enable(&tp->key); + static_branch_enable(&tp->key); break; case TP_FUNC_2: /* 1->2 */ /* Set iterator static call */ @@ -414,7 +414,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, if (tp->unregfunc && static_key_enabled(&tp->key)) tp->unregfunc(); - static_key_disable(&tp->key); + static_branch_disable(&tp->key); /* Set iterator static call */ tracepoint_update_call(tp, tp_funcs); /* Both iterator and static call handle NULL tp->funcs */ -- cgit v1.2.3 From 48bcda6848232667f13b4e97588de488c83c37d4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Oct 2024 18:16:29 -0400 Subject: tracing: Remove definition of trace_*_rcuidle() The trace_*_rcuidle() variant of a tracepoint was to handle places where a tracepoint was located but RCU was not "watching". All those locations have been removed, and RCU should be watching where all tracepoints are located. We can now remove the trace_*_rcuidle() variant. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mark Rutland Cc: Joel Fernandes Link: https://lore.kernel.org/20241003181629.36209057@gandalf.local.home Acked-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 50 ++------------------------------------- include/trace/events/preemptirq.h | 8 ------- kernel/trace/trace_preemptirq.c | 26 +++++--------------- scripts/tags.sh | 2 -- 4 files changed, 8 insertions(+), 78 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 2a29334bbc02..7e4af7b3633c 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -196,67 +196,25 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DO_TRACE_CALL(name, args) __traceiter_##name(NULL, args) #endif /* CONFIG_HAVE_STATIC_CALL */ -/* - * ARCH_WANTS_NO_INSTR archs are expected to have sanitized entry and idle - * code that disallow any/all tracing/instrumentation when RCU isn't watching. - */ -#ifdef CONFIG_ARCH_WANTS_NO_INSTR -#define RCUIDLE_COND(rcuidle) (rcuidle) -#else -/* srcu can't be used from NMI */ -#define RCUIDLE_COND(rcuidle) (rcuidle && in_nmi()) -#endif - /* * it_func[0] is never NULL because there is at least one element in the array * when the array itself is non NULL. */ -#define __DO_TRACE(name, args, cond, rcuidle) \ +#define __DO_TRACE(name, args, cond) \ do { \ int __maybe_unused __idx = 0; \ \ if (!(cond)) \ return; \ \ - if (WARN_ONCE(RCUIDLE_COND(rcuidle), \ - "Bad RCU usage for tracepoint")) \ - return; \ - \ /* keep srcu and sched-rcu usage consistent */ \ preempt_disable_notrace(); \ \ - /* \ - * For rcuidle callers, use srcu since sched-rcu \ - * doesn't work from the idle path. \ - */ \ - if (rcuidle) { \ - __idx = srcu_read_lock_notrace(&tracepoint_srcu);\ - ct_irq_enter_irqson(); \ - } \ - \ __DO_TRACE_CALL(name, TP_ARGS(args)); \ \ - if (rcuidle) { \ - ct_irq_exit_irqson(); \ - srcu_read_unlock_notrace(&tracepoint_srcu, __idx);\ - } \ - \ preempt_enable_notrace(); \ } while (0) -#ifndef MODULE -#define __DECLARE_TRACE_RCU(name, proto, args, cond) \ - static inline void trace_##name##_rcuidle(proto) \ - { \ - if (static_branch_unlikely(&__tracepoint_##name.key)) \ - __DO_TRACE(name, \ - TP_ARGS(args), \ - TP_CONDITION(cond), 1); \ - } -#else -#define __DECLARE_TRACE_RCU(name, proto, args, cond) -#endif - /* * Make sure the alignment of the structure in the __tracepoints section will * not add unwanted padding between the beginning of the section and the @@ -277,14 +235,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) if (static_branch_unlikely(&__tracepoint_##name.key)) \ __DO_TRACE(name, \ TP_ARGS(args), \ - TP_CONDITION(cond), 0); \ + TP_CONDITION(cond)); \ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ WARN_ONCE(!rcu_is_watching(), \ "RCU not watching for tracepoint"); \ } \ } \ - __DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args), \ - PARAMS(cond)) \ static inline int \ register_trace_##name(void (*probe)(data_proto), void *data) \ { \ @@ -375,8 +331,6 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ static inline void trace_##name(proto) \ { } \ - static inline void trace_##name##_rcuidle(proto) \ - { } \ static inline int \ register_trace_##name(void (*probe)(data_proto), \ void *data) \ diff --git a/include/trace/events/preemptirq.h b/include/trace/events/preemptirq.h index 3f249e150c0c..f99562d2b496 100644 --- a/include/trace/events/preemptirq.h +++ b/include/trace/events/preemptirq.h @@ -43,8 +43,6 @@ DEFINE_EVENT(preemptirq_template, irq_enable, #else #define trace_irq_enable(...) #define trace_irq_disable(...) -#define trace_irq_enable_rcuidle(...) -#define trace_irq_disable_rcuidle(...) #endif #ifdef CONFIG_TRACE_PREEMPT_TOGGLE @@ -58,8 +56,6 @@ DEFINE_EVENT(preemptirq_template, preempt_enable, #else #define trace_preempt_enable(...) #define trace_preempt_disable(...) -#define trace_preempt_enable_rcuidle(...) -#define trace_preempt_disable_rcuidle(...) #endif #endif /* _TRACE_PREEMPTIRQ_H */ @@ -69,10 +65,6 @@ DEFINE_EVENT(preemptirq_template, preempt_enable, #else /* !CONFIG_PREEMPTIRQ_TRACEPOINTS */ #define trace_irq_enable(...) #define trace_irq_disable(...) -#define trace_irq_enable_rcuidle(...) -#define trace_irq_disable_rcuidle(...) #define trace_preempt_enable(...) #define trace_preempt_disable(...) -#define trace_preempt_enable_rcuidle(...) -#define trace_preempt_disable_rcuidle(...) #endif diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index e37446f7916e..5c03633316a6 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -15,20 +15,6 @@ #define CREATE_TRACE_POINTS #include -/* - * Use regular trace points on architectures that implement noinstr - * tooling: these calls will only happen with RCU enabled, which can - * use a regular tracepoint. - * - * On older architectures, use the rcuidle tracing methods (which - * aren't NMI-safe - so exclude NMI contexts): - */ -#ifdef CONFIG_ARCH_WANTS_NO_INSTR -#define trace(point) trace_##point -#else -#define trace(point) if (!in_nmi()) trace_##point##_rcuidle -#endif - #ifdef CONFIG_TRACE_IRQFLAGS /* Per-cpu variable to prevent redundant calls when IRQs already off */ static DEFINE_PER_CPU(int, tracing_irq_cpu); @@ -42,7 +28,7 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu); void trace_hardirqs_on_prepare(void) { if (this_cpu_read(tracing_irq_cpu)) { - trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1); + trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1); tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); this_cpu_write(tracing_irq_cpu, 0); } @@ -53,7 +39,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_prepare); void trace_hardirqs_on(void) { if (this_cpu_read(tracing_irq_cpu)) { - trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1); + trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1); tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); this_cpu_write(tracing_irq_cpu, 0); } @@ -75,7 +61,7 @@ void trace_hardirqs_off_finish(void) if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); - trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1); + trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1); } } @@ -89,7 +75,7 @@ void trace_hardirqs_off(void) if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); - trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1); + trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1); } } EXPORT_SYMBOL(trace_hardirqs_off); @@ -100,13 +86,13 @@ NOKPROBE_SYMBOL(trace_hardirqs_off); void trace_preempt_on(unsigned long a0, unsigned long a1) { - trace(preempt_enable)(a0, a1); + trace_preempt_enable(a0, a1); tracer_preempt_on(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { - trace(preempt_disable)(a0, a1); + trace_preempt_disable(a0, a1); tracer_preempt_off(a0, a1); } #endif diff --git a/scripts/tags.sh b/scripts/tags.sh index 191e0461d6d5..0d01c1cafb70 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -152,9 +152,7 @@ regex_c=( '/^BPF_CALL_[0-9]([[:space:]]*\([[:alnum:]_]*\).*/\1/' '/^COMPAT_SYSCALL_DEFINE[0-9]([[:space:]]*\([[:alnum:]_]*\).*/compat_sys_\1/' '/^TRACE_EVENT([[:space:]]*\([[:alnum:]_]*\).*/trace_\1/' - '/^TRACE_EVENT([[:space:]]*\([[:alnum:]_]*\).*/trace_\1_rcuidle/' '/^DEFINE_EVENT([^,)]*,[[:space:]]*\([[:alnum:]_]*\).*/trace_\1/' - '/^DEFINE_EVENT([^,)]*,[[:space:]]*\([[:alnum:]_]*\).*/trace_\1_rcuidle/' '/^DEFINE_INSN_CACHE_OPS([[:space:]]*\([[:alnum:]_]*\).*/get_\1_slot/' '/^DEFINE_INSN_CACHE_OPS([[:space:]]*\([[:alnum:]_]*\).*/free_\1_slot/' '/^PAGEFLAG([[:space:]]*\([[:alnum:]_]*\).*/Page\1/' -- cgit v1.2.3 From 13d750c2c03e9861e15268574ed2c239cca9c9d5 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 8 Oct 2024 21:07:12 -0400 Subject: tracing/ftrace: disable preemption in syscall probe In preparation for allowing system call enter/exit instrumentation to handle page faults, make sure that ftrace can handle this change by explicitly disabling preemption within the ftrace system call tracepoint probes to respect the current expectations within ftrace ring buffer code. This change does not yet allow ftrace to take page faults per se within its probe, but allows its existing probes to adapt to the upcoming change. Cc: Michael Jeanson Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Link: https://lore.kernel.org/20241009010718.2050182-3-mathieu.desnoyers@efficios.com Acked-by: Masami Hiramatsu (Google) Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/trace/trace_events.h | 39 ++++++++++++++++++++++++++++++++------- kernel/trace/trace_syscalls.c | 12 ++++++++++++ 2 files changed, 44 insertions(+), 7 deletions(-) (limited to 'kernel/trace') diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 8bcbb9ee44de..63071aa5923d 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -263,6 +263,9 @@ static struct trace_event_fields trace_event_fields_##call[] = { \ tstruct \ {} }; +#undef DECLARE_EVENT_SYSCALL_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS + #undef DEFINE_EVENT_PRINT #define DEFINE_EVENT_PRINT(template, name, proto, args, print) @@ -396,11 +399,11 @@ static inline notrace int trace_event_get_offsets_##call( \ #include "stages/stage6_event_callback.h" -#undef DECLARE_EVENT_CLASS -#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ - \ + +#undef __DECLARE_EVENT_CLASS +#define __DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ static notrace void \ -trace_event_raw_event_##call(void *__data, proto) \ +do_trace_event_raw_event_##call(void *__data, proto) \ { \ struct trace_event_file *trace_file = __data; \ struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\ @@ -425,15 +428,35 @@ trace_event_raw_event_##call(void *__data, proto) \ \ trace_event_buffer_commit(&fbuffer); \ } + +#undef DECLARE_EVENT_CLASS +#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ +__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \ + PARAMS(assign), PARAMS(print)) \ +static notrace void \ +trace_event_raw_event_##call(void *__data, proto) \ +{ \ + do_trace_event_raw_event_##call(__data, args); \ +} + +#undef DECLARE_EVENT_SYSCALL_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS(call, proto, args, tstruct, assign, print) \ +__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \ + PARAMS(assign), PARAMS(print)) \ +static notrace void \ +trace_event_raw_event_##call(void *__data, proto) \ +{ \ + preempt_disable_notrace(); \ + do_trace_event_raw_event_##call(__data, args); \ + preempt_enable_notrace(); \ +} + /* * The ftrace_test_probe is compiled out, it is only here as a build time check * to make sure that if the tracepoint handling changes, the ftrace probe will * fail to compile unless it too is updated. */ -#undef DECLARE_EVENT_SYSCALL_CLASS -#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS - #undef DEFINE_EVENT #define DEFINE_EVENT(template, call, proto, args) \ static inline void ftrace_test_probe_##call(void) \ @@ -443,6 +466,8 @@ static inline void ftrace_test_probe_##call(void) \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#undef __DECLARE_EVENT_CLASS + #include "stages/stage7_class_define.h" #undef DECLARE_EVENT_CLASS diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 785733245ead..f9b21bac9d45 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -299,6 +299,12 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) int syscall_nr; int size; + /* + * Syscall probe called with preemption enabled, but the ring + * buffer and per-cpu data require preemption to be disabled. + */ + guard(preempt_notrace)(); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; @@ -338,6 +344,12 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) struct trace_event_buffer fbuffer; int syscall_nr; + /* + * Syscall probe called with preemption enabled, but the ring + * buffer and per-cpu data require preemption to be disabled. + */ + guard(preempt_notrace)(); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; -- cgit v1.2.3 From 65e7462a16cea593025ca3b34c5d74e69b027ee0 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 8 Oct 2024 21:07:13 -0400 Subject: tracing/perf: disable preemption in syscall probe In preparation for allowing system call enter/exit instrumentation to handle page faults, make sure that perf can handle this change by explicitly disabling preemption within the perf system call tracepoint probes to respect the current expectations within perf ring buffer code. This change does not yet allow perf to take page faults per se within its probe, but allows its existing probes to adapt to the upcoming change. Cc: Michael Jeanson Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Link: https://lore.kernel.org/20241009010718.2050182-4-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/trace/perf.h | 42 ++++++++++++++++++++++++++++++++++++++---- kernel/trace/trace_syscalls.c | 12 ++++++++++++ 2 files changed, 50 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/include/trace/perf.h b/include/trace/perf.h index ded997af481e..15cde7eac8b4 100644 --- a/include/trace/perf.h +++ b/include/trace/perf.h @@ -12,10 +12,10 @@ #undef __perf_task #define __perf_task(t) (__task = (t)) -#undef DECLARE_EVENT_CLASS -#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ +#undef __DECLARE_EVENT_CLASS +#define __DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ static notrace void \ -perf_trace_##call(void *__data, proto) \ +do_perf_trace_##call(void *__data, proto) \ { \ struct trace_event_call *event_call = __data; \ struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\ @@ -55,8 +55,39 @@ perf_trace_##call(void *__data, proto) \ head, __task); \ } +/* + * Define unused __count and __task variables to use @args to pass + * arguments to do_perf_trace_##call. This is needed because the + * macros __perf_count and __perf_task introduce the side-effect to + * store copies into those local variables. + */ +#undef DECLARE_EVENT_CLASS +#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ +__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \ + PARAMS(assign), PARAMS(print)) \ +static notrace void \ +perf_trace_##call(void *__data, proto) \ +{ \ + u64 __count __attribute__((unused)); \ + struct task_struct *__task __attribute__((unused)); \ + \ + do_perf_trace_##call(__data, args); \ +} + #undef DECLARE_EVENT_SYSCALL_CLASS -#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS +#define DECLARE_EVENT_SYSCALL_CLASS(call, proto, args, tstruct, assign, print) \ +__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \ + PARAMS(assign), PARAMS(print)) \ +static notrace void \ +perf_trace_##call(void *__data, proto) \ +{ \ + u64 __count __attribute__((unused)); \ + struct task_struct *__task __attribute__((unused)); \ + \ + preempt_disable_notrace(); \ + do_perf_trace_##call(__data, args); \ + preempt_enable_notrace(); \ +} /* * This part is compiled out, it is only here as a build time check @@ -76,4 +107,7 @@ static inline void perf_test_probe_##call(void) \ DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +#undef __DECLARE_EVENT_CLASS + #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index f9b21bac9d45..b1cc19806f3d 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -596,6 +596,12 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int rctx; int size; + /* + * Syscall probe called with preemption enabled, but the ring + * buffer and per-cpu data require preemption to be disabled. + */ + guard(preempt_notrace)(); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; @@ -698,6 +704,12 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) int rctx; int size; + /* + * Syscall probe called with preemption enabled, but the ring + * buffer and per-cpu data require preemption to be disabled. + */ + guard(preempt_notrace)(); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; -- cgit v1.2.3 From a3204c740a59bebb3b37a294d83f4e353303a52c Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 8 Oct 2024 21:07:16 -0400 Subject: tracing/ftrace: Add might_fault check to syscall probes Add a might_fault() check to validate that the ftrace sys_enter/sys_exit probe callbacks are indeed called from a context where page faults can be handled. Cc: Michael Jeanson Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Link: https://lore.kernel.org/20241009010718.2050182-7-mathieu.desnoyers@efficios.com Acked-by: Masami Hiramatsu (Google) Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/trace/trace_events.h | 1 + kernel/trace/trace_syscalls.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'kernel/trace') diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 63071aa5923d..4f22136fd465 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -446,6 +446,7 @@ __DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \ static notrace void \ trace_event_raw_event_##call(void *__data, proto) \ { \ + might_fault(); \ preempt_disable_notrace(); \ do_trace_event_raw_event_##call(__data, args); \ preempt_enable_notrace(); \ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index b1cc19806f3d..6d6bbd56ed92 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -303,6 +303,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) * Syscall probe called with preemption enabled, but the ring * buffer and per-cpu data require preemption to be disabled. */ + might_fault(); guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); @@ -348,6 +349,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) * Syscall probe called with preemption enabled, but the ring * buffer and per-cpu data require preemption to be disabled. */ + might_fault(); guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); -- cgit v1.2.3 From cdb537ac417938408ee819992f432c410f2d01a2 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 8 Oct 2024 21:07:17 -0400 Subject: tracing/perf: Add might_fault check to syscall probes Add a might_fault() check to validate that the perf sys_enter/sys_exit probe callbacks are indeed called from a context where page faults can be handled. Cc: Michael Jeanson Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Link: https://lore.kernel.org/20241009010718.2050182-8-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/trace/perf.h | 1 + kernel/trace/trace_syscalls.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'kernel/trace') diff --git a/include/trace/perf.h b/include/trace/perf.h index 15cde7eac8b4..a1754b73a8f5 100644 --- a/include/trace/perf.h +++ b/include/trace/perf.h @@ -84,6 +84,7 @@ perf_trace_##call(void *__data, proto) \ u64 __count __attribute__((unused)); \ struct task_struct *__task __attribute__((unused)); \ \ + might_fault(); \ preempt_disable_notrace(); \ do_perf_trace_##call(__data, args); \ preempt_enable_notrace(); \ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 6d6bbd56ed92..46aab0ab9350 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -602,6 +602,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) * Syscall probe called with preemption enabled, but the ring * buffer and per-cpu data require preemption to be disabled. */ + might_fault(); guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); @@ -710,6 +711,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) * Syscall probe called with preemption enabled, but the ring * buffer and per-cpu data require preemption to be disabled. */ + might_fault(); guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); -- cgit v1.2.3 From afe5960dc208fe069ddaaeb0994d857b24ac19d1 Mon Sep 17 00:00:00 2001 From: Levi Yun Date: Fri, 13 Sep 2024 03:13:47 +0100 Subject: trace/trace_event_perf: remove duplicate samples on the first tracepoint event When a tracepoint event is created with attr.freq = 1, 'hwc->period_left' is not initialized correctly. As a result, in the perf_swevent_overflow() function, when the first time the event occurs, it calculates the event overflow and the perf_swevent_set_period() returns 3, this leads to the event are recorded for three duplicate times. Step to reproduce: 1. Enable the tracepoint event & starting tracing $ echo 1 > /sys/kernel/tracing/events/module/module_free $ echo 1 > /sys/kernel/tracing/tracing_on 2. Record with perf $ perf record -a --strict-freq -F 1 -e "module:module_free" 3. Trigger module_free event. $ modprobe -i sunrpc $ modprobe -r sunrpc Result: - Trace pipe result: $ cat trace_pipe modprobe-174509 [003] ..... 6504.868896: module_free: sunrpc - perf sample: modprobe 174509 [003] 6504.868980: module:module_free: sunrpc modprobe 174509 [003] 6504.868980: module:module_free: sunrpc modprobe 174509 [003] 6504.868980: module:module_free: sunrpc By setting period_left via perf_swevent_set_period() as other sw_event did, This problem could be solved. After patch: - Trace pipe result: $ cat trace_pipe modprobe 1153096 [068] 613468.867774: module:module_free: xfs - perf sample modprobe 1153096 [068] 613468.867794: module:module_free: xfs Link: https://lore.kernel.org/20240913021347.595330-1-yeoreum.yun@arm.com Fixes: bd2b5b12849a ("perf_counter: More aggressive frequency adjustment") Signed-off-by: Levi Yun Acked-by: Namhyung Kim Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_event_perf.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 05e791241812..3ff9caa4a71b 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -352,10 +352,16 @@ void perf_uprobe_destroy(struct perf_event *p_event) int perf_trace_add(struct perf_event *p_event, int flags) { struct trace_event_call *tp_event = p_event->tp_event; + struct hw_perf_event *hwc = &p_event->hw; if (!(flags & PERF_EF_START)) p_event->hw.state = PERF_HES_STOPPED; + if (is_sampling_event(p_event)) { + hwc->last_period = hwc->sample_period; + perf_swevent_set_period(p_event); + } + /* * If TRACE_REG_PERF_ADD returns false; no custom action was performed * and we need to take the default action of enqueueing our event on -- cgit v1.2.3 From eb887c4567d1b0e7684c026fe7df44afa96589e6 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 7 Oct 2024 10:56:28 +0200 Subject: tracing: Use atomic64_inc_return() in trace_clock_counter() Use atomic64_inc_return(&ref) instead of atomic64_add_return(1, &ref) to use optimized implementation and ease register pressure around the primitive for targets that implement optimized variant. Cc: Steven Rostedt Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241007085651.48544-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 4702efb00ff2..4cb2ebc439be 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -154,5 +154,5 @@ static atomic64_t trace_counter; */ u64 notrace trace_clock_counter(void) { - return atomic64_add_return(1, &trace_counter); + return atomic64_inc_return(&trace_counter); } -- cgit v1.2.3 From 0a6c61bc9c636e9a32d9f5a4d6d3b031d08763ab Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 10 Oct 2024 23:59:09 +0900 Subject: fgraph: Simplify return address printing in function graph tracer Simplify return address printing in the function graph tracer by removing fgraph_extras. Since this feature is only used by the function graph tracer and the feature flags can directly accessible from the function graph tracer, fgraph_extras can be removed from the fgraph callback. Cc: Donglin Peng Link: https://lore.kernel.org/172857234900.270774.15378354017601069781.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 16 +++---------- kernel/trace/fgraph.c | 45 +++++++++++++++++++++++------------- kernel/trace/ftrace.c | 3 +-- kernel/trace/trace.h | 3 +-- kernel/trace/trace_functions_graph.c | 18 +++++++-------- kernel/trace/trace_irqsoff.c | 3 +-- kernel/trace/trace_sched_wakeup.c | 3 +-- kernel/trace/trace_selftest.c | 8 +++---- 8 files changed, 48 insertions(+), 51 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 2ac3b3b53cd0..4c7dd5e58c9f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1068,29 +1068,20 @@ struct ftrace_graph_ret { unsigned long long rettime; } __packed; -struct fgraph_extras; struct fgraph_ops; /* Type of the callback handlers for tracing function graph*/ typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *, struct fgraph_ops *); /* return */ typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *, - struct fgraph_ops *, - struct fgraph_extras *); /* entry */ + struct fgraph_ops *); /* entry */ extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops, - struct fgraph_extras *extras); + struct fgraph_ops *gops); bool ftrace_pids_enabled(struct ftrace_ops *ops); #ifdef CONFIG_FUNCTION_GRAPH_TRACER -/* Used to convey some extra datas when creating a graph entry */ -struct fgraph_extras { - u32 flags; - unsigned long retaddr; -}; - struct fgraph_ops { trace_func_graph_ent_t entryfunc; trace_func_graph_ret_t retfunc; @@ -1131,13 +1122,12 @@ function_graph_enter(unsigned long ret, unsigned long func, struct ftrace_ret_stack * ftrace_graph_get_ret_stack(struct task_struct *task, int skip); +unsigned long ftrace_graph_top_ret_addr(struct task_struct *task); unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, unsigned long ret, unsigned long *retp); unsigned long *fgraph_get_task_var(struct fgraph_ops *gops); -u32 graph_tracer_flags_get(u32 flags); - /* * Sometimes we don't want to trace a function with the function * graph tracer but we want them to keep traced by the usual function diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 27e523f01ed2..ee829d65f301 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -290,8 +290,7 @@ static inline unsigned long make_data_type_val(int idx, int size, int offset) } /* ftrace_graph_entry set to this to tell some archs to run function graph */ -static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops, - struct fgraph_extras *extras) +static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops) { return 0; } @@ -519,8 +518,7 @@ int __weak ftrace_disable_ftrace_graph_caller(void) #endif int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops, - struct fgraph_extras *extras) + struct fgraph_ops *gops) { return 0; } @@ -648,20 +646,13 @@ int function_graph_enter(unsigned long ret, unsigned long func, unsigned long frame_pointer, unsigned long *retp) { struct ftrace_graph_ent trace; - struct fgraph_extras extras; unsigned long bitmap = 0; int offset; int i; - int idx = 0; trace.func = func; trace.depth = ++current->curr_ret_depth; - extras.flags = graph_tracer_flags_get(TRACE_GRAPH_PRINT_RETADDR); - if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) - && extras.flags & TRACE_GRAPH_PRINT_RETADDR) - extras.retaddr = ftrace_graph_ret_addr(current, &idx, ret, retp); - offset = ftrace_push_return_trace(ret, func, frame_pointer, retp, 0); if (offset < 0) goto out; @@ -670,7 +661,7 @@ int function_graph_enter(unsigned long ret, unsigned long func, if (static_branch_likely(&fgraph_do_direct)) { int save_curr_ret_stack = current->curr_ret_stack; - if (static_call(fgraph_func)(&trace, fgraph_direct_gops, &extras)) + if (static_call(fgraph_func)(&trace, fgraph_direct_gops)) bitmap |= BIT(fgraph_direct_gops->idx); else /* Clear out any saved storage */ @@ -688,7 +679,7 @@ int function_graph_enter(unsigned long ret, unsigned long func, save_curr_ret_stack = current->curr_ret_stack; if (ftrace_ops_test(&gops->ops, func, NULL) && - gops->entryfunc(&trace, gops, &extras)) + gops->entryfunc(&trace, gops)) bitmap |= BIT(i); else /* Clear out any saved storage */ @@ -905,6 +896,29 @@ ftrace_graph_get_ret_stack(struct task_struct *task, int idx) return ret_stack; } +/** + * ftrace_graph_top_ret_addr - return the top return address in the shadow stack + * @task: The task to read the shadow stack from. + * + * Return the first return address on the shadow stack of the @task, which is + * not the fgraph's return_to_handler. + */ +unsigned long ftrace_graph_top_ret_addr(struct task_struct *task) +{ + unsigned long return_handler = (unsigned long)dereference_kernel_function_descriptor(return_to_handler); + struct ftrace_ret_stack *ret_stack = NULL; + int offset = task->curr_ret_stack; + + if (offset < 0) + return 0; + + do { + ret_stack = get_ret_stack(task, offset, &offset); + } while (ret_stack && ret_stack->ret == return_handler); + + return ret_stack ? ret_stack->ret : 0; +} + /** * ftrace_graph_ret_addr - return the original value of the return address * @task: The task the unwinder is being executed on @@ -1145,8 +1159,7 @@ void ftrace_graph_exit_task(struct task_struct *t) #ifdef CONFIG_DYNAMIC_FTRACE static int fgraph_pid_func(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops, - struct fgraph_extras *extras) + struct fgraph_ops *gops) { struct trace_array *tr = gops->ops.private; int pid; @@ -1160,7 +1173,7 @@ static int fgraph_pid_func(struct ftrace_graph_ent *trace, return 0; } - return gops->saved_func(trace, gops, NULL); + return gops->saved_func(trace, gops); } void fgraph_update_pid_func(void) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5d87dac83b80..cae388122ca8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -827,8 +827,7 @@ struct profile_fgraph_data { }; static int profile_graph_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops, - struct fgraph_extras *extras) + struct fgraph_ops *gops) { struct profile_fgraph_data *profile_data; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 13f08f257c0b..6adf48ef4312 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -695,8 +695,7 @@ void trace_default_header(struct seq_file *m); void print_trace_header(struct seq_file *m, struct trace_iterator *iter); void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops); -int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, - struct fgraph_extras *extras); +int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 3dd63ae2afe8..20d0c579d3b5 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -90,9 +90,9 @@ static struct tracer_flags tracer_flags = { .opts = trace_opts }; -u32 graph_tracer_flags_get(u32 flags) +static bool tracer_flags_is_set(u32 flags) { - return tracer_flags.val & flags; + return (tracer_flags.val & flags) == flags; } /* @@ -179,8 +179,7 @@ struct fgraph_times { }; int trace_graph_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops, - struct fgraph_extras *extras) + struct fgraph_ops *gops) { unsigned long *task_var = fgraph_get_task_var(gops); struct trace_array *tr = gops->private; @@ -246,11 +245,12 @@ int trace_graph_entry(struct ftrace_graph_ent *trace, disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { trace_ctx = tracing_gen_ctx_flags(flags); - if (unlikely(IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && extras - && (extras->flags & TRACE_GRAPH_PRINT_RETADDR))) - ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, - extras->retaddr); - else + if (unlikely(IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && + tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR))) { + unsigned long retaddr = ftrace_graph_top_ret_addr(current); + + ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr); + } else ret = __trace_graph_entry(tr, trace, trace_ctx); } else { ret = 0; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index eb3aa36cf10f..fce064e20570 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -176,8 +176,7 @@ static int irqsoff_display_graph(struct trace_array *tr, int set) } static int irqsoff_graph_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops, - struct fgraph_extras *extras) + struct fgraph_ops *gops) { struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 155de2551507..ae2ace5e515a 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -113,8 +113,7 @@ static int wakeup_display_graph(struct trace_array *tr, int set) } static int wakeup_graph_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops, - struct fgraph_extras *extras) + struct fgraph_ops *gops) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index fbb99f8c8062..d3a14ae47e26 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -774,8 +774,7 @@ struct fgraph_fixture { }; static __init int store_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops, - struct fgraph_extras *extras) + struct fgraph_ops *gops) { struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops); const char *type = fixture->store_type_name; @@ -1026,8 +1025,7 @@ static unsigned int graph_hang_thresh; /* Wrap the real function entry probe to avoid possible hanging */ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops, - struct fgraph_extras *extras) + struct fgraph_ops *gops) { /* This is harmlessly racy, we want to approximately detect a hang */ if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { @@ -1041,7 +1039,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace, return 0; } - return trace_graph_entry(trace, gops, NULL); + return trace_graph_entry(trace, gops); } static struct fgraph_ops fgraph_ops __initdata = { -- cgit v1.2.3 From c73eb02a4781aee53ee4122132967356361e4f1a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Oct 2024 19:40:20 -0400 Subject: fgragh: No need to invoke the function call_filter_check_discard() The function call_filter_check_discard() has been removed in the commit 49e4154f4b16 ("tracing: Remove TRACE_EVENT_FL_FILTERED logic"), from another topic branch. But when merged together with commit 21e92806d39c6 ("function_graph: Support recording and printing the function return address") which added another call to call_filter_check_discard(), it caused the build to fail. Since the function call_filter_check_discard() is useless, it can simply be removed regardless of being merged with commit 49e4154f4b16 or not. Link: https://lore.kernel.org/all/20241010134649.43ed357c@canb.auug.org.au/ Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Donglin Peng Link: https://lore.kernel.org/20241010194020.46192b21@gandalf.local.home Reported-by: Stephen Rothwell Fixes: 21e92806d39c6 ("function_graph: Support recording and printing the function return address") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions_graph.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 20d0c579d3b5..03c5a0d300a5 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -137,7 +137,6 @@ int __trace_graph_retaddr_entry(struct trace_array *tr, unsigned int trace_ctx, unsigned long retaddr) { - struct trace_event_call *call = &event_fgraph_retaddr_entry; struct ring_buffer_event *event; struct trace_buffer *buffer = tr->array_buffer.buffer; struct fgraph_retaddr_ent_entry *entry; @@ -150,8 +149,7 @@ int __trace_graph_retaddr_entry(struct trace_array *tr, entry->graph_ent.func = trace->func; entry->graph_ent.depth = trace->depth; entry->graph_ent.retaddr = retaddr; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); return 1; } -- cgit v1.2.3 From 7888af4166d4ab07ba51234be6ba332b7807e901 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 8 Oct 2024 19:05:28 -0400 Subject: ftrace: Make ftrace_regs abstract from direct use ftrace_regs was created to hold registers that store information to save function parameters, return value and stack. Since it is a subset of pt_regs, it should only be used by its accessor functions. But because pt_regs can easily be taken from ftrace_regs (on most archs), it is tempting to use it directly. But when running on other architectures, it may fail to build or worse, build but crash the kernel! Instead, make struct ftrace_regs an empty structure and have the architectures define __arch_ftrace_regs and all the accessor functions will typecast to it to get to the actual fields. This will help avoid usage of ftrace_regs directly. Link: https://lore.kernel.org/all/20241007171027.629bdafd@gandalf.local.home/ Cc: "linux-arch@vger.kernel.org" Cc: "x86@kernel.org" Cc: Mathieu Desnoyers Cc: Mark Rutland Cc: Will Deacon Cc: Huacai Chen Cc: WANG Xuerui Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Cc: Naveen N Rao Cc: Madhavan Srinivasan Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Link: https://lore.kernel.org/20241008230628.958778821@goodmis.org Acked-by: Catalin Marinas Signed-off-by: Steven Rostedt (Google) Acked-by: Masami Hiramatsu (Google) Acked-by: Heiko Carstens # s390 Signed-off-by: Steven Rostedt (Google) --- arch/arm64/include/asm/ftrace.h | 20 +++++++++++--------- arch/arm64/kernel/asm-offsets.c | 22 +++++++++++----------- arch/arm64/kernel/ftrace.c | 10 +++++----- arch/loongarch/include/asm/ftrace.h | 22 ++++++++++++---------- arch/loongarch/kernel/ftrace_dyn.c | 2 +- arch/powerpc/include/asm/ftrace.h | 21 ++++++++++++--------- arch/powerpc/kernel/trace/ftrace.c | 4 ++-- arch/powerpc/kernel/trace/ftrace_64_pg.c | 2 +- arch/riscv/include/asm/ftrace.h | 21 ++++++++++++--------- arch/riscv/kernel/asm-offsets.c | 28 ++++++++++++++-------------- arch/riscv/kernel/ftrace.c | 2 +- arch/s390/include/asm/ftrace.h | 23 +++++++++++++---------- arch/s390/kernel/asm-offsets.c | 4 ++-- arch/s390/kernel/ftrace.c | 2 +- arch/s390/lib/test_unwind.c | 4 ++-- arch/x86/include/asm/ftrace.h | 25 ++++++++++++++----------- arch/x86/kernel/ftrace.c | 2 +- include/linux/ftrace.h | 21 ++++++++++++++++++--- kernel/trace/ftrace.c | 2 +- 19 files changed, 134 insertions(+), 103 deletions(-) (limited to 'kernel/trace') diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h index dc9cf0bd2a4c..bbb69c7751b9 100644 --- a/arch/arm64/include/asm/ftrace.h +++ b/arch/arm64/include/asm/ftrace.h @@ -56,6 +56,8 @@ unsigned long ftrace_call_adjust(unsigned long addr); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS struct dyn_ftrace; struct ftrace_ops; +struct ftrace_regs; +#define arch_ftrace_regs(fregs) ((struct __arch_ftrace_regs *)(fregs)) #define arch_ftrace_get_regs(regs) NULL @@ -63,7 +65,7 @@ struct ftrace_ops; * Note: sizeof(struct ftrace_regs) must be a multiple of 16 to ensure correct * stack alignment */ -struct ftrace_regs { +struct __arch_ftrace_regs { /* x0 - x8 */ unsigned long regs[9]; @@ -83,47 +85,47 @@ struct ftrace_regs { static __always_inline unsigned long ftrace_regs_get_instruction_pointer(const struct ftrace_regs *fregs) { - return fregs->pc; + return arch_ftrace_regs(fregs)->pc; } static __always_inline void ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs, unsigned long pc) { - fregs->pc = pc; + arch_ftrace_regs(fregs)->pc = pc; } static __always_inline unsigned long ftrace_regs_get_stack_pointer(const struct ftrace_regs *fregs) { - return fregs->sp; + return arch_ftrace_regs(fregs)->sp; } static __always_inline unsigned long ftrace_regs_get_argument(struct ftrace_regs *fregs, unsigned int n) { if (n < 8) - return fregs->regs[n]; + return arch_ftrace_regs(fregs)->regs[n]; return 0; } static __always_inline unsigned long ftrace_regs_get_return_value(const struct ftrace_regs *fregs) { - return fregs->regs[0]; + return arch_ftrace_regs(fregs)->regs[0]; } static __always_inline void ftrace_regs_set_return_value(struct ftrace_regs *fregs, unsigned long ret) { - fregs->regs[0] = ret; + arch_ftrace_regs(fregs)->regs[0] = ret; } static __always_inline void ftrace_override_function_with_return(struct ftrace_regs *fregs) { - fregs->pc = fregs->lr; + arch_ftrace_regs(fregs)->pc = arch_ftrace_regs(fregs)->lr; } int ftrace_regs_query_register_offset(const char *name); @@ -143,7 +145,7 @@ static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, * The ftrace trampoline will return to this address instead of the * instrumented function. */ - fregs->direct_tramp = addr; + arch_ftrace_regs(fregs)->direct_tramp = addr; } #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 27de1dddb0ab..a5de57f68219 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -84,19 +84,19 @@ int main(void) DEFINE(PT_REGS_SIZE, sizeof(struct pt_regs)); BLANK(); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS - DEFINE(FREGS_X0, offsetof(struct ftrace_regs, regs[0])); - DEFINE(FREGS_X2, offsetof(struct ftrace_regs, regs[2])); - DEFINE(FREGS_X4, offsetof(struct ftrace_regs, regs[4])); - DEFINE(FREGS_X6, offsetof(struct ftrace_regs, regs[6])); - DEFINE(FREGS_X8, offsetof(struct ftrace_regs, regs[8])); - DEFINE(FREGS_FP, offsetof(struct ftrace_regs, fp)); - DEFINE(FREGS_LR, offsetof(struct ftrace_regs, lr)); - DEFINE(FREGS_SP, offsetof(struct ftrace_regs, sp)); - DEFINE(FREGS_PC, offsetof(struct ftrace_regs, pc)); + DEFINE(FREGS_X0, offsetof(struct __arch_ftrace_regs, regs[0])); + DEFINE(FREGS_X2, offsetof(struct __arch_ftrace_regs, regs[2])); + DEFINE(FREGS_X4, offsetof(struct __arch_ftrace_regs, regs[4])); + DEFINE(FREGS_X6, offsetof(struct __arch_ftrace_regs, regs[6])); + DEFINE(FREGS_X8, offsetof(struct __arch_ftrace_regs, regs[8])); + DEFINE(FREGS_FP, offsetof(struct __arch_ftrace_regs, fp)); + DEFINE(FREGS_LR, offsetof(struct __arch_ftrace_regs, lr)); + DEFINE(FREGS_SP, offsetof(struct __arch_ftrace_regs, sp)); + DEFINE(FREGS_PC, offsetof(struct __arch_ftrace_regs, pc)); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS - DEFINE(FREGS_DIRECT_TRAMP, offsetof(struct ftrace_regs, direct_tramp)); + DEFINE(FREGS_DIRECT_TRAMP, offsetof(struct __arch_ftrace_regs, direct_tramp)); #endif - DEFINE(FREGS_SIZE, sizeof(struct ftrace_regs)); + DEFINE(FREGS_SIZE, sizeof(struct __arch_ftrace_regs)); BLANK(); #endif #ifdef CONFIG_COMPAT diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index a650f5e11fc5..b2d947175cbe 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -23,10 +23,10 @@ struct fregs_offset { int offset; }; -#define FREGS_OFFSET(n, field) \ -{ \ - .name = n, \ - .offset = offsetof(struct ftrace_regs, field), \ +#define FREGS_OFFSET(n, field) \ +{ \ + .name = n, \ + .offset = offsetof(struct __arch_ftrace_regs, field), \ } static const struct fregs_offset fregs_offsets[] = { @@ -481,7 +481,7 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { - prepare_ftrace_return(ip, &fregs->lr, fregs->fp); + prepare_ftrace_return(ip, &arch_ftrace_regs(fregs)->lr, arch_ftrace_regs(fregs)->fp); } #else /* diff --git a/arch/loongarch/include/asm/ftrace.h b/arch/loongarch/include/asm/ftrace.h index c0a682808e07..0e15d36ce251 100644 --- a/arch/loongarch/include/asm/ftrace.h +++ b/arch/loongarch/include/asm/ftrace.h @@ -43,38 +43,40 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent); #ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS struct ftrace_ops; +struct ftrace_regs; +#define arch_ftrace_regs(fregs) ((struct __arch_ftrace_regs *)(fregs)) -struct ftrace_regs { +struct __arch_ftrace_regs { struct pt_regs regs; }; static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs) { - return &fregs->regs; + return &arch_ftrace_regs(fregs)->regs; } static __always_inline unsigned long ftrace_regs_get_instruction_pointer(struct ftrace_regs *fregs) { - return instruction_pointer(&fregs->regs); + return instruction_pointer(&arch_ftrace_regs(fregs)->regs); } static __always_inline void ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs, unsigned long ip) { - instruction_pointer_set(&fregs->regs, ip); + instruction_pointer_set(&arch_ftrace_regs(fregs)->regs, ip); } #define ftrace_regs_get_argument(fregs, n) \ - regs_get_kernel_argument(&(fregs)->regs, n) + regs_get_kernel_argument(&arch_ftrace_regs(fregs)->regs, n) #define ftrace_regs_get_stack_pointer(fregs) \ - kernel_stack_pointer(&(fregs)->regs) + kernel_stack_pointer(&arch_ftrace_regs(fregs)->regs) #define ftrace_regs_return_value(fregs) \ - regs_return_value(&(fregs)->regs) + regs_return_value(&arch_ftrace_regs(fregs)->regs) #define ftrace_regs_set_return_value(fregs, ret) \ - regs_set_return_value(&(fregs)->regs, ret) + regs_set_return_value(&arch_ftrace_regs(fregs)->regs, ret) #define ftrace_override_function_with_return(fregs) \ - override_function_with_return(&(fregs)->regs) + override_function_with_return(&arch_ftrace_regs(fregs)->regs) #define ftrace_regs_query_register_offset(name) \ regs_query_register_offset(name) @@ -90,7 +92,7 @@ __arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr) } #define arch_ftrace_set_direct_caller(fregs, addr) \ - __arch_ftrace_set_direct_caller(&(fregs)->regs, addr) + __arch_ftrace_set_direct_caller(&arch_ftrace_regs(fregs)->regs, addr) #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ #endif diff --git a/arch/loongarch/kernel/ftrace_dyn.c b/arch/loongarch/kernel/ftrace_dyn.c index bff058317062..18056229e22e 100644 --- a/arch/loongarch/kernel/ftrace_dyn.c +++ b/arch/loongarch/kernel/ftrace_dyn.c @@ -241,7 +241,7 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent) void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { - struct pt_regs *regs = &fregs->regs; + struct pt_regs *regs = &arch_ftrace_regs(fregs)->regs; unsigned long *parent = (unsigned long *)®s->regs[1]; prepare_ftrace_return(ip, (unsigned long *)parent); diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 559560286e6d..e299fd47d201 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -32,39 +32,42 @@ struct dyn_arch_ftrace { int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec); #define ftrace_init_nop ftrace_init_nop -struct ftrace_regs { +struct ftrace_regs; +#define arch_ftrace_regs(fregs) ((struct __arch_ftrace_regs *)(fregs)) + +struct __arch_ftrace_regs { struct pt_regs regs; }; static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs) { /* We clear regs.msr in ftrace_call */ - return fregs->regs.msr ? &fregs->regs : NULL; + return arch_ftrace_regs(fregs)->regs.msr ? &arch_ftrace_regs(fregs)->regs : NULL; } static __always_inline void ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs, unsigned long ip) { - regs_set_return_ip(&fregs->regs, ip); + regs_set_return_ip(&arch_ftrace_regs(fregs)->regs, ip); } static __always_inline unsigned long ftrace_regs_get_instruction_pointer(struct ftrace_regs *fregs) { - return instruction_pointer(&fregs->regs); + return instruction_pointer(&arch_ftrace_regs(fregs)->regs); } #define ftrace_regs_get_argument(fregs, n) \ - regs_get_kernel_argument(&(fregs)->regs, n) + regs_get_kernel_argument(&arch_ftrace_regs(fregs)->regs, n) #define ftrace_regs_get_stack_pointer(fregs) \ - kernel_stack_pointer(&(fregs)->regs) + kernel_stack_pointer(&arch_ftrace_regs(fregs)->regs) #define ftrace_regs_return_value(fregs) \ - regs_return_value(&(fregs)->regs) + regs_return_value(&arch_ftrace_regs(fregs)->regs) #define ftrace_regs_set_return_value(fregs, ret) \ - regs_set_return_value(&(fregs)->regs, ret) + regs_set_return_value(&arch_ftrace_regs(fregs)->regs, ret) #define ftrace_override_function_with_return(fregs) \ - override_function_with_return(&(fregs)->regs) + override_function_with_return(&arch_ftrace_regs(fregs)->regs) #define ftrace_regs_query_register_offset(name) \ regs_query_register_offset(name) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index d8d6b4fd9a14..df41f4a7c738 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -421,7 +421,7 @@ int __init ftrace_dyn_arch_init(void) void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { - unsigned long sp = fregs->regs.gpr[1]; + unsigned long sp = arch_ftrace_regs(fregs)->regs.gpr[1]; int bit; if (unlikely(ftrace_graph_is_dead())) @@ -439,6 +439,6 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, ftrace_test_recursion_unlock(bit); out: - fregs->regs.link = parent_ip; + arch_ftrace_regs(fregs)->regs.link = parent_ip; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.c b/arch/powerpc/kernel/trace/ftrace_64_pg.c index 12fab1803bcf..d3c5552e4984 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_pg.c +++ b/arch/powerpc/kernel/trace/ftrace_64_pg.c @@ -829,7 +829,7 @@ out: void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { - fregs->regs.link = __prepare_ftrace_return(parent_ip, ip, fregs->regs.gpr[1]); + arch_ftrace_regs(fregs)->regs.link = __prepare_ftrace_return(parent_ip, ip, arch_ftrace_regs(fregs)->regs.gpr[1]); } #else unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip, diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h index 2cddd79ff21b..c6bcdff105b5 100644 --- a/arch/riscv/include/asm/ftrace.h +++ b/arch/riscv/include/asm/ftrace.h @@ -126,7 +126,10 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS #define arch_ftrace_get_regs(regs) NULL struct ftrace_ops; -struct ftrace_regs { +struct ftrace_regs; +#define arch_ftrace_regs(fregs) ((struct __arch_ftrace_regs *)(fregs)) + +struct __arch_ftrace_regs { unsigned long epc; unsigned long ra; unsigned long sp; @@ -150,42 +153,42 @@ struct ftrace_regs { static __always_inline unsigned long ftrace_regs_get_instruction_pointer(const struct ftrace_regs *fregs) { - return fregs->epc; + return arch_ftrace_regs(fregs)->epc; } static __always_inline void ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs, unsigned long pc) { - fregs->epc = pc; + arch_ftrace_regs(fregs)->epc = pc; } static __always_inline unsigned long ftrace_regs_get_stack_pointer(const struct ftrace_regs *fregs) { - return fregs->sp; + return arch_ftrace_regs(fregs)->sp; } static __always_inline unsigned long ftrace_regs_get_argument(struct ftrace_regs *fregs, unsigned int n) { if (n < 8) - return fregs->args[n]; + return arch_ftrace_regs(fregs)->args[n]; return 0; } static __always_inline unsigned long ftrace_regs_get_return_value(const struct ftrace_regs *fregs) { - return fregs->a0; + return arch_ftrace_regs(fregs)->a0; } static __always_inline void ftrace_regs_set_return_value(struct ftrace_regs *fregs, unsigned long ret) { - fregs->a0 = ret; + arch_ftrace_regs(fregs)->a0 = ret; } static __always_inline void ftrace_override_function_with_return(struct ftrace_regs *fregs) { - fregs->epc = fregs->ra; + arch_ftrace_regs(fregs)->epc = arch_ftrace_regs(fregs)->ra; } int ftrace_regs_query_register_offset(const char *name); @@ -196,7 +199,7 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsigned long addr) { - fregs->t1 = addr; + arch_ftrace_regs(fregs)->t1 = addr; } #endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */ diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c index e94180ba432f..f6f5a277ba9d 100644 --- a/arch/riscv/kernel/asm-offsets.c +++ b/arch/riscv/kernel/asm-offsets.c @@ -498,19 +498,19 @@ void asm_offsets(void) OFFSET(STACKFRAME_RA, stackframe, ra); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS - DEFINE(FREGS_SIZE_ON_STACK, ALIGN(sizeof(struct ftrace_regs), STACK_ALIGN)); - DEFINE(FREGS_EPC, offsetof(struct ftrace_regs, epc)); - DEFINE(FREGS_RA, offsetof(struct ftrace_regs, ra)); - DEFINE(FREGS_SP, offsetof(struct ftrace_regs, sp)); - DEFINE(FREGS_S0, offsetof(struct ftrace_regs, s0)); - DEFINE(FREGS_T1, offsetof(struct ftrace_regs, t1)); - DEFINE(FREGS_A0, offsetof(struct ftrace_regs, a0)); - DEFINE(FREGS_A1, offsetof(struct ftrace_regs, a1)); - DEFINE(FREGS_A2, offsetof(struct ftrace_regs, a2)); - DEFINE(FREGS_A3, offsetof(struct ftrace_regs, a3)); - DEFINE(FREGS_A4, offsetof(struct ftrace_regs, a4)); - DEFINE(FREGS_A5, offsetof(struct ftrace_regs, a5)); - DEFINE(FREGS_A6, offsetof(struct ftrace_regs, a6)); - DEFINE(FREGS_A7, offsetof(struct ftrace_regs, a7)); + DEFINE(FREGS_SIZE_ON_STACK, ALIGN(sizeof(struct __arch_ftrace_regs), STACK_ALIGN)); + DEFINE(FREGS_EPC, offsetof(struct __arch_ftrace_regs, epc)); + DEFINE(FREGS_RA, offsetof(struct __arch_ftrace_regs, ra)); + DEFINE(FREGS_SP, offsetof(struct __arch_ftrace_regs, sp)); + DEFINE(FREGS_S0, offsetof(struct __arch_ftrace_regs, s0)); + DEFINE(FREGS_T1, offsetof(struct __arch_ftrace_regs, t1)); + DEFINE(FREGS_A0, offsetof(struct __arch_ftrace_regs, a0)); + DEFINE(FREGS_A1, offsetof(struct __arch_ftrace_regs, a1)); + DEFINE(FREGS_A2, offsetof(struct __arch_ftrace_regs, a2)); + DEFINE(FREGS_A3, offsetof(struct __arch_ftrace_regs, a3)); + DEFINE(FREGS_A4, offsetof(struct __arch_ftrace_regs, a4)); + DEFINE(FREGS_A5, offsetof(struct __arch_ftrace_regs, a5)); + DEFINE(FREGS_A6, offsetof(struct __arch_ftrace_regs, a6)); + DEFINE(FREGS_A7, offsetof(struct __arch_ftrace_regs, a7)); #endif } diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index 4b95c574fd04..5081ad886841 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -214,7 +214,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { - prepare_ftrace_return(&fregs->ra, ip, fregs->s0); + prepare_ftrace_return(&arch_ftrace_regs(fregs)->ra, ip, arch_ftrace_regs(fregs)->s0); } #else /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */ extern void ftrace_graph_call(void); diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index 406746666eb7..1498d0a9c762 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -51,13 +51,16 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) return addr; } -struct ftrace_regs { +struct ftrace_regs; +#define arch_ftrace_regs(fregs) ((struct __arch_ftrace_regs *)(fregs)) + +struct __arch_ftrace_regs { struct pt_regs regs; }; static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs) { - struct pt_regs *regs = &fregs->regs; + struct pt_regs *regs = &arch_ftrace_regs(fregs)->regs; if (test_pt_regs_flag(regs, PIF_FTRACE_FULL_REGS)) return regs; @@ -84,26 +87,26 @@ static __always_inline unsigned long fgraph_ret_regs_frame_pointer(struct fgraph static __always_inline unsigned long ftrace_regs_get_instruction_pointer(const struct ftrace_regs *fregs) { - return fregs->regs.psw.addr; + return arch_ftrace_regs(fregs)->regs.psw.addr; } static __always_inline void ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs, unsigned long ip) { - fregs->regs.psw.addr = ip; + arch_ftrace_regs(fregs)->regs.psw.addr = ip; } #define ftrace_regs_get_argument(fregs, n) \ - regs_get_kernel_argument(&(fregs)->regs, n) + regs_get_kernel_argument(&arch_ftrace_regs(fregs)->regs, n) #define ftrace_regs_get_stack_pointer(fregs) \ - kernel_stack_pointer(&(fregs)->regs) + kernel_stack_pointer(&arch_ftrace_regs(fregs)->regs) #define ftrace_regs_return_value(fregs) \ - regs_return_value(&(fregs)->regs) + regs_return_value(&arch_ftrace_regs(fregs)->regs) #define ftrace_regs_set_return_value(fregs, ret) \ - regs_set_return_value(&(fregs)->regs, ret) + regs_set_return_value(&arch_ftrace_regs(fregs)->regs, ret) #define ftrace_override_function_with_return(fregs) \ - override_function_with_return(&(fregs)->regs) + override_function_with_return(&arch_ftrace_regs(fregs)->regs) #define ftrace_regs_query_register_offset(name) \ regs_query_register_offset(name) @@ -117,7 +120,7 @@ ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs, */ static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsigned long addr) { - struct pt_regs *regs = &fregs->regs; + struct pt_regs *regs = &arch_ftrace_regs(fregs)->regs; regs->orig_gpr2 = addr; } #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 5529248d84fb..db9659980175 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -184,8 +184,8 @@ int main(void) OFFSET(__FGRAPH_RET_FP, fgraph_ret_regs, fp); DEFINE(__FGRAPH_RET_SIZE, sizeof(struct fgraph_ret_regs)); #endif - OFFSET(__FTRACE_REGS_PT_REGS, ftrace_regs, regs); - DEFINE(__FTRACE_REGS_SIZE, sizeof(struct ftrace_regs)); + OFFSET(__FTRACE_REGS_PT_REGS, __arch_ftrace_regs, regs); + DEFINE(__FTRACE_REGS_SIZE, sizeof(struct __arch_ftrace_regs)); OFFSET(__PCPU_FLAGS, pcpu, flags); return 0; diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 0b6e62d1d8b8..51439a71e392 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -318,7 +318,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; - kmsan_unpoison_memory(fregs, sizeof(*fregs)); + kmsan_unpoison_memory(fregs, ftrace_regs_size()); regs = ftrace_get_regs(fregs); p = get_kprobe((kprobe_opcode_t *)ip); if (!regs || unlikely(!p) || kprobe_disabled(p)) diff --git a/arch/s390/lib/test_unwind.c b/arch/s390/lib/test_unwind.c index 8b7f981e6f34..6e42100875e7 100644 --- a/arch/s390/lib/test_unwind.c +++ b/arch/s390/lib/test_unwind.c @@ -270,9 +270,9 @@ static void notrace __used test_unwind_ftrace_handler(unsigned long ip, struct ftrace_ops *fops, struct ftrace_regs *fregs) { - struct unwindme *u = (struct unwindme *)fregs->regs.gprs[2]; + struct unwindme *u = (struct unwindme *)arch_ftrace_regs(fregs)->regs.gprs[2]; - u->ret = test_unwind(NULL, (u->flags & UWM_REGS) ? &fregs->regs : NULL, + u->ret = test_unwind(NULL, (u->flags & UWM_REGS) ? &arch_ftrace_regs(fregs)->regs : NULL, (u->flags & UWM_SP) ? u->sp : 0); } diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 0152a81d9b4a..87943f7a299b 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -33,7 +33,10 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) } #ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS -struct ftrace_regs { +struct ftrace_regs; +#define arch_ftrace_regs(fregs) ((struct __arch_ftrace_regs *)(fregs)) + +struct __arch_ftrace_regs { struct pt_regs regs; }; @@ -41,27 +44,27 @@ static __always_inline struct pt_regs * arch_ftrace_get_regs(struct ftrace_regs *fregs) { /* Only when FL_SAVE_REGS is set, cs will be non zero */ - if (!fregs->regs.cs) + if (!arch_ftrace_regs(fregs)->regs.cs) return NULL; - return &fregs->regs; + return &arch_ftrace_regs(fregs)->regs; } #define ftrace_regs_set_instruction_pointer(fregs, _ip) \ - do { (fregs)->regs.ip = (_ip); } while (0) + do { arch_ftrace_regs(fregs)->regs.ip = (_ip); } while (0) #define ftrace_regs_get_instruction_pointer(fregs) \ - ((fregs)->regs.ip) + arch_ftrace_regs(fregs)->regs.ip) #define ftrace_regs_get_argument(fregs, n) \ - regs_get_kernel_argument(&(fregs)->regs, n) + regs_get_kernel_argument(&arch_ftrace_regs(fregs)->regs, n) #define ftrace_regs_get_stack_pointer(fregs) \ - kernel_stack_pointer(&(fregs)->regs) + kernel_stack_pointer(&arch_ftrace_regs(fregs)->regs) #define ftrace_regs_return_value(fregs) \ - regs_return_value(&(fregs)->regs) + regs_return_value(&arch_ftrace_regs(fregs)->regs) #define ftrace_regs_set_return_value(fregs, ret) \ - regs_set_return_value(&(fregs)->regs, ret) + regs_set_return_value(&arch_ftrace_regs(fregs)->regs, ret) #define ftrace_override_function_with_return(fregs) \ - override_function_with_return(&(fregs)->regs) + override_function_with_return(&arch_ftrace_regs(fregs)->regs) #define ftrace_regs_query_register_offset(name) \ regs_query_register_offset(name) @@ -88,7 +91,7 @@ __arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr) regs->orig_ax = addr; } #define arch_ftrace_set_direct_caller(fregs, addr) \ - __arch_ftrace_set_direct_caller(&(fregs)->regs, addr) + __arch_ftrace_set_direct_caller(&arch_ftrace_regs(fregs)->regs, addr) #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 8da0e66ca22d..adb09f78edb2 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -647,7 +647,7 @@ void prepare_ftrace_return(unsigned long ip, unsigned long *parent, void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { - struct pt_regs *regs = &fregs->regs; + struct pt_regs *regs = &arch_ftrace_regs(fregs)->regs; unsigned long *stack = (unsigned long *)kernel_stack_pointer(regs); prepare_ftrace_return(ip, (unsigned long *)stack, 0); diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4c7dd5e58c9f..66f10291a0b2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -115,8 +115,6 @@ static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *val extern int ftrace_enabled; -#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS - /** * ftrace_regs - ftrace partial/optimal register set * @@ -142,11 +140,28 @@ extern int ftrace_enabled; * * NOTE: user *must not* access regs directly, only do it via APIs, because * the member can be changed according to the architecture. + * This is why the structure is empty here, so that nothing accesses + * the ftrace_regs directly. */ struct ftrace_regs { + /* Nothing to see here, use the accessor functions! */ +}; + +#define ftrace_regs_size() sizeof(struct __arch_ftrace_regs) + +#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS + +struct __arch_ftrace_regs { struct pt_regs regs; }; -#define arch_ftrace_get_regs(fregs) (&(fregs)->regs) + +struct ftrace_regs; +#define arch_ftrace_regs(fregs) ((struct __arch_ftrace_regs *)(fregs)) + +static inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs) +{ + return &arch_ftrace_regs(fregs)->regs; +} /* * ftrace_regs_set_instruction_pointer() is to be defined by the architecture diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cae388122ca8..e9fd4fb2769e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7943,7 +7943,7 @@ out: void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { - kmsan_unpoison_memory(fregs, sizeof(*fregs)); + kmsan_unpoison_memory(fregs, ftrace_regs_size()); __ftrace_ops_list_func(ip, parent_ip, NULL, fregs); } #else -- cgit v1.2.3 From b237e1f7d2273fdcffac20100b72c002bdd770dd Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Tue, 15 Oct 2024 13:27:46 +0200 Subject: ring-buffer: Limit time with disabled interrupts in rb_check_pages() The function rb_check_pages() validates the integrity of a specified per-CPU tracing ring buffer. It does so by traversing the underlying linked list and checking its next and prev links. To guarantee that the list isn't modified during the check, a caller typically needs to take cpu_buffer->reader_lock. This prevents the check from running concurrently, for example, with a potential reader which can make the list temporarily inconsistent when swapping its old reader page into the buffer. A problem with this approach is that the time when interrupts are disabled is non-deterministic, dependent on the ring buffer size. This particularly affects PREEMPT_RT because the reader_lock is a raw spinlock which doesn't become sleepable on PREEMPT_RT kernels. Modify the check so it still attempts to traverse the entire list, but gives up the reader_lock between checking individual pages. Introduce for this purpose a new variable ring_buffer_per_cpu.cnt which is bumped any time the list is modified. The value is used by rb_check_pages() to detect such a change and restart the check. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241015112810.27203-1-petr.pavlu@suse.com Signed-off-by: Petr Pavlu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 98 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 72 insertions(+), 26 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3ea4f7bb1837..adde95400ab4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -482,6 +482,8 @@ struct ring_buffer_per_cpu { unsigned long nr_pages; unsigned int current_context; struct list_head *pages; + /* pages generation counter, incremented when the list changes */ + unsigned long cnt; struct buffer_page *head_page; /* read from head */ struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ @@ -1475,40 +1477,87 @@ static void rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK); } +static bool rb_check_links(struct ring_buffer_per_cpu *cpu_buffer, + struct list_head *list) +{ + if (RB_WARN_ON(cpu_buffer, + rb_list_head(rb_list_head(list->next)->prev) != list)) + return false; + + if (RB_WARN_ON(cpu_buffer, + rb_list_head(rb_list_head(list->prev)->next) != list)) + return false; + + return true; +} + /** * rb_check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test * * As a safety measure we check to make sure the data pages have not * been corrupted. - * - * Callers of this function need to guarantee that the list of pages doesn't get - * modified during the check. In particular, if it's possible that the function - * is invoked with concurrent readers which can swap in a new reader page then - * the caller should take cpu_buffer->reader_lock. */ static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) { - struct list_head *head = rb_list_head(cpu_buffer->pages); - struct list_head *tmp; + struct list_head *head, *tmp; + unsigned long buffer_cnt; + unsigned long flags; + int nr_loops = 0; - if (RB_WARN_ON(cpu_buffer, - rb_list_head(rb_list_head(head->next)->prev) != head)) + /* + * Walk the linked list underpinning the ring buffer and validate all + * its next and prev links. + * + * The check acquires the reader_lock to avoid concurrent processing + * with code that could be modifying the list. However, the lock cannot + * be held for the entire duration of the walk, as this would make the + * time when interrupts are disabled non-deterministic, dependent on the + * ring buffer size. Therefore, the code releases and re-acquires the + * lock after checking each page. The ring_buffer_per_cpu.cnt variable + * is then used to detect if the list was modified while the lock was + * not held, in which case the check needs to be restarted. + * + * The code attempts to perform the check at most three times before + * giving up. This is acceptable because this is only a self-validation + * to detect problems early on. In practice, the list modification + * operations are fairly spaced, and so this check typically succeeds at + * most on the second try. + */ +again: + if (++nr_loops > 3) return; - if (RB_WARN_ON(cpu_buffer, - rb_list_head(rb_list_head(head->prev)->next) != head)) - return; + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + head = rb_list_head(cpu_buffer->pages); + if (!rb_check_links(cpu_buffer, head)) + goto out_locked; + buffer_cnt = cpu_buffer->cnt; + tmp = head; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - for (tmp = rb_list_head(head->next); tmp != head; tmp = rb_list_head(tmp->next)) { - if (RB_WARN_ON(cpu_buffer, - rb_list_head(rb_list_head(tmp->next)->prev) != tmp)) - return; + while (true) { + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - if (RB_WARN_ON(cpu_buffer, - rb_list_head(rb_list_head(tmp->prev)->next) != tmp)) - return; + if (buffer_cnt != cpu_buffer->cnt) { + /* The list was updated, try again. */ + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + goto again; + } + + tmp = rb_list_head(tmp->next); + if (tmp == head) + /* The iteration circled back, all is done. */ + goto out_locked; + + if (!rb_check_links(cpu_buffer, tmp)) + goto out_locked; + + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } + +out_locked: + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } /* @@ -2535,6 +2584,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) /* make sure pages points to a valid page in the ring buffer */ cpu_buffer->pages = next_page; + cpu_buffer->cnt++; /* update head page */ if (head_bit) @@ -2641,6 +2691,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) * pointer to point to end of list */ head_page->prev = last_page; + cpu_buffer->cnt++; success = true; break; } @@ -2876,12 +2927,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, */ synchronize_rcu(); for_each_buffer_cpu(buffer, cpu) { - unsigned long flags; - cpu_buffer = buffer->buffers[cpu]; - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_check_pages(cpu_buffer); - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } atomic_dec(&buffer->record_disabled); } @@ -5299,6 +5346,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list; rb_inc_page(&cpu_buffer->head_page); + cpu_buffer->cnt++; local_inc(&cpu_buffer->pages_read); /* Finally update the reader page to the new head */ @@ -5838,12 +5886,9 @@ void ring_buffer_read_finish(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; - unsigned long flags; /* Use this opportunity to check the integrity of the ring buffer. */ - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_check_pages(cpu_buffer); - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); atomic_dec(&cpu_buffer->resize_disabled); kfree(iter->event); @@ -6760,6 +6805,7 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) /* Install the new pages, remove the head from the list */ cpu_buffer->pages = cpu_buffer->new_pages.next; list_del_init(&cpu_buffer->new_pages); + cpu_buffer->cnt++; cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); -- cgit v1.2.3 From 0b60a7fb60b7d7012bfc468e1a7f11374337a70d Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Mon, 30 Sep 2024 13:21:07 +0200 Subject: ring-buffer: Reorganize kerneldoc parameter names Reorganize kerneldoc parameter names to match the parameter order in the function header. Problems identified using Coccinelle. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20240930112121.95324-22-Julia.Lawall@inria.fr Signed-off-by: Julia Lawall Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index adde95400ab4..db3bf6a1b536 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2436,9 +2436,9 @@ EXPORT_SYMBOL_GPL(__ring_buffer_alloc); * __ring_buffer_alloc_range - allocate a new ring_buffer from existing memory * @size: the size in bytes per cpu that is needed. * @flags: attributes to set for the ring buffer. + * @order: sub-buffer order * @start: start of allocated range * @range_size: size of allocated range - * @order: sub-buffer order * @key: ring buffer reader_lock_key. * * Currently the only flag that is available is the RB_FL_OVERWRITE -- cgit v1.2.3 From 514da6924e8647eb89aa879e73a19eaed8bcf669 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 18 Oct 2024 13:07:10 +0200 Subject: ring-buffer: Use str_low_high() helper in ring_buffer_producer() Remove hard-coded strings by using the helper function str_low_high(). Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241018110709.111707-2-thorsten.blum@linux.dev Signed-off-by: Thorsten Blum Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer_benchmark.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 008187ebd7fe..cdc3aea12c93 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -307,14 +307,14 @@ static void ring_buffer_producer(void) if (!disable_reader) { if (consumer_fifo) trace_printk("Running Consumer at SCHED_FIFO %s\n", - consumer_fifo == 1 ? "low" : "high"); + str_low_high(consumer_fifo == 1)); else trace_printk("Running Consumer at nice: %d\n", consumer_nice); } if (producer_fifo) trace_printk("Running Producer at SCHED_FIFO %s\n", - producer_fifo == 1 ? "low" : "high"); + str_low_high(producer_fifo == 1)); else trace_printk("Running Producer at nice: %d\n", producer_nice); -- cgit v1.2.3 From 6280cf718db0c557b5fe44e2d2e8ad8e832696a7 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 16 Oct 2024 08:41:35 +0000 Subject: bpf: Implement bpf_send_signal_task() kfunc Implement bpf_send_signal_task kfunc that is similar to bpf_send_signal_thread and bpf_send_signal helpers but can be used to send signals to other threads and processes. It also supports sending a cookie with the signal similar to sigqueue(). If the receiving process establishes a handler for the signal using the SA_SIGINFO flag to sigaction(), then it can obtain this cookie via the si_value field of the siginfo_t structure passed as the second argument to the handler. Signed-off-by: Puranjay Mohan Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241016084136.10305-2-puranjay@kernel.org --- kernel/bpf/helpers.c | 1 + kernel/trace/bpf_trace.c | 53 ++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 46 insertions(+), 8 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 073e6f04f4d7..5c3fdb29c1b1 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3055,6 +3055,7 @@ BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_throw) +BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS) BTF_KFUNCS_END(generic_btf_ids) static const struct btf_kfunc_id_set generic_kfunc_set = { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a582cd25ca87..e7370a321126 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -802,6 +802,8 @@ struct send_signal_irq_work { struct task_struct *task; u32 sig; enum pid_type type; + bool has_siginfo; + struct kernel_siginfo info; }; static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work); @@ -809,27 +811,46 @@ static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work); static void do_bpf_send_signal(struct irq_work *entry) { struct send_signal_irq_work *work; + struct kernel_siginfo *siginfo; work = container_of(entry, struct send_signal_irq_work, irq_work); - group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type); + siginfo = work->has_siginfo ? &work->info : SEND_SIG_PRIV; + + group_send_sig_info(work->sig, siginfo, work->task, work->type); put_task_struct(work->task); } -static int bpf_send_signal_common(u32 sig, enum pid_type type) +static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struct *task, u64 value) { struct send_signal_irq_work *work = NULL; + struct kernel_siginfo info; + struct kernel_siginfo *siginfo; + + if (!task) { + task = current; + siginfo = SEND_SIG_PRIV; + } else { + clear_siginfo(&info); + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_KERNEL; + info.si_pid = 0; + info.si_uid = 0; + info.si_value.sival_ptr = (void *)(unsigned long)value; + siginfo = &info; + } /* Similar to bpf_probe_write_user, task needs to be * in a sound condition and kernel memory access be * permitted in order to send signal to the current * task. */ - if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING))) + if (unlikely(task->flags & (PF_KTHREAD | PF_EXITING))) return -EPERM; if (unlikely(!nmi_uaccess_okay())) return -EPERM; /* Task should not be pid=1 to avoid kernel panic. */ - if (unlikely(is_global_init(current))) + if (unlikely(is_global_init(task))) return -EPERM; if (irqs_disabled()) { @@ -847,19 +868,22 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type) * to the irq_work. The current task may change when queued * irq works get executed. */ - work->task = get_task_struct(current); + work->task = get_task_struct(task); + work->has_siginfo = siginfo == &info; + if (work->has_siginfo) + copy_siginfo(&work->info, &info); work->sig = sig; work->type = type; irq_work_queue(&work->irq_work); return 0; } - return group_send_sig_info(sig, SEND_SIG_PRIV, current, type); + return group_send_sig_info(sig, siginfo, task, type); } BPF_CALL_1(bpf_send_signal, u32, sig) { - return bpf_send_signal_common(sig, PIDTYPE_TGID); + return bpf_send_signal_common(sig, PIDTYPE_TGID, NULL, 0); } static const struct bpf_func_proto bpf_send_signal_proto = { @@ -871,7 +895,7 @@ static const struct bpf_func_proto bpf_send_signal_proto = { BPF_CALL_1(bpf_send_signal_thread, u32, sig) { - return bpf_send_signal_common(sig, PIDTYPE_PID); + return bpf_send_signal_common(sig, PIDTYPE_PID, NULL, 0); } static const struct bpf_func_proto bpf_send_signal_thread_proto = { @@ -3484,3 +3508,16 @@ static int __init bpf_kprobe_multi_kfuncs_init(void) } late_initcall(bpf_kprobe_multi_kfuncs_init); + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid_type type, + u64 value) +{ + if (type != PIDTYPE_PID && type != PIDTYPE_TGID) + return -EINVAL; + + return bpf_send_signal_common(sig, type, task, value); +} + +__bpf_kfunc_end_defs(); -- cgit v1.2.3 From da09a9e0c3eab164af950be44ee6bdea8527c3e5 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 18 Oct 2024 22:22:51 +0200 Subject: uprobe: Add data pointer to consumer handlers Adding data pointer to both entry and exit consumer handlers and all its users. The functionality itself is coming in following change. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Oleg Nesterov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20241018202252.693462-2-jolsa@kernel.org --- include/linux/uprobes.h | 4 ++-- kernel/events/uprobes.c | 4 ++-- kernel/trace/bpf_trace.c | 6 ++++-- kernel/trace/trace_uprobe.c | 12 ++++++++---- tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c | 2 +- 5 files changed, 17 insertions(+), 11 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 2b294bf1881f..bb265a632b91 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -37,10 +37,10 @@ struct uprobe_consumer { * for the current process. If filter() is omitted or returns true, * UPROBE_HANDLER_REMOVE is effectively ignored. */ - int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs); + int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs, __u64 *data); int (*ret_handler)(struct uprobe_consumer *self, unsigned long func, - struct pt_regs *regs); + struct pt_regs *regs, __u64 *data); bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm); struct list_head cons_node; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2a0059464383..6b44c386a5df 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2090,7 +2090,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) int rc = 0; if (uc->handler) { - rc = uc->handler(uc, regs); + rc = uc->handler(uc, regs, NULL); WARN(rc & ~UPROBE_HANDLER_MASK, "bad rc=0x%x from %ps()\n", rc, uc->handler); } @@ -2128,7 +2128,7 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) rcu_read_lock_trace(); list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { if (uc->ret_handler) - uc->ret_handler(uc, ri->func, regs); + uc->ret_handler(uc, ri->func, regs, NULL); } rcu_read_unlock_trace(); } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a582cd25ca87..fdab7ecd8dfa 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3244,7 +3244,8 @@ uprobe_multi_link_filter(struct uprobe_consumer *con, struct mm_struct *mm) } static int -uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs) +uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs, + __u64 *data) { struct bpf_uprobe *uprobe; @@ -3253,7 +3254,8 @@ uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs) } static int -uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs) +uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs, + __u64 *data) { struct bpf_uprobe *uprobe; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c40531d2cbad..5895eabe3581 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -89,9 +89,11 @@ static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev) static int register_uprobe_event(struct trace_uprobe *tu); static int unregister_uprobe_event(struct trace_uprobe *tu); -static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); +static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs, + __u64 *data); static int uretprobe_dispatcher(struct uprobe_consumer *con, - unsigned long func, struct pt_regs *regs); + unsigned long func, struct pt_regs *regs, + __u64 *data); #ifdef CONFIG_STACK_GROWSUP static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n) @@ -1517,7 +1519,8 @@ trace_uprobe_register(struct trace_event_call *event, enum trace_reg type, } } -static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) +static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs, + __u64 *data) { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; @@ -1548,7 +1551,8 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) } static int uretprobe_dispatcher(struct uprobe_consumer *con, - unsigned long func, struct pt_regs *regs) + unsigned long func, struct pt_regs *regs, + __u64 *data) { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 8835761d9a12..12005e3dc3e4 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -461,7 +461,7 @@ static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = { static int uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func, - struct pt_regs *regs) + struct pt_regs *regs, __u64 *data) { regs->ax = 0x12345678deadbeef; -- cgit v1.2.3 From 6ea8b69da6bf1b8a0e4683f37a006a2e2f4fc943 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 19 Oct 2024 15:29:51 -0400 Subject: fgraph: Separate size of ret_stack from PAGE_SIZE The ret_stack (shadow stack used by function graph infrastructure) is currently defined as PAGE_SIZE. But some architectures which have 64K PAGE_SIZE, this is way overkill. Also there's an effort to allow the PAGE_SIZE to be defined at boot up. Hard code it for now to 4096. In the future, this size may change and even be dependent on specific architectures. Link: https://lore.kernel.org/all/e5067bb8-0fcd-4739-9bca-0e872037d5a1@arm.com/ Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241019152951.053f9646@rorschach.local.home Suggested-by: Ryan Roberts Reviewed-by: Ryan Roberts Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 13fcc25d15a0..4ce87982966a 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -153,7 +153,7 @@ enum { * SHADOW_STACK_OFFSET: The size in long words of the shadow stack * SHADOW_STACK_MAX_OFFSET: The max offset of the stack for a new frame to be added */ -#define SHADOW_STACK_SIZE (PAGE_SIZE) +#define SHADOW_STACK_SIZE (4096) #define SHADOW_STACK_OFFSET (SHADOW_STACK_SIZE / sizeof(long)) /* Leave on a buffer at the end */ #define SHADOW_STACK_MAX_OFFSET \ -- cgit v1.2.3 From 434098485bfc8f5336d5164adea0a9d102a0a09a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 26 Oct 2024 06:32:10 -0400 Subject: fgraph: Give ret_stack its own kmem cache The ret_stack (shadow stack used by function graph infrastructure) is created for every task on the system when function graph is enabled. Give it its own kmem_cache. This will make it easier to see how much memory is being used specifically for function graph shadow stacks. In the future, this size may change and may not be a power of two. Having its own cache can also keep it from fragmenting memory. Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Ryan Roberts Link: https://lore.kernel.org/20241026063210.7d4910a7@rorschach.local.home Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 4ce87982966a..001abf376c0c 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -172,6 +172,8 @@ enum { DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph); int ftrace_graph_active; +static struct kmem_cache *fgraph_stack_cachep; + static struct fgraph_ops *fgraph_array[FGRAPH_ARRAY_SIZE]; static unsigned long fgraph_array_bitmask; @@ -1022,8 +1024,11 @@ static int alloc_retstack_tasklist(unsigned long **ret_stack_list) int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE; struct task_struct *g, *t; + if (WARN_ON_ONCE(!fgraph_stack_cachep)) + return -ENOMEM; + for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) { - ret_stack_list[i] = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL); + ret_stack_list[i] = kmem_cache_alloc(fgraph_stack_cachep, GFP_KERNEL); if (!ret_stack_list[i]) { start = 0; end = i; @@ -1054,7 +1059,7 @@ unlock: rcu_read_unlock(); free: for (i = start; i < end; i++) - kfree(ret_stack_list[i]); + kmem_cache_free(fgraph_stack_cachep, ret_stack_list[i]); return ret; } @@ -1117,9 +1122,12 @@ void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) if (ftrace_graph_active) { unsigned long *ret_stack; + if (WARN_ON_ONCE(!fgraph_stack_cachep)) + return; + ret_stack = per_cpu(idle_ret_stack, cpu); if (!ret_stack) { - ret_stack = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL); + ret_stack = kmem_cache_alloc(fgraph_stack_cachep, GFP_KERNEL); if (!ret_stack) return; per_cpu(idle_ret_stack, cpu) = ret_stack; @@ -1139,7 +1147,10 @@ void ftrace_graph_init_task(struct task_struct *t) if (ftrace_graph_active) { unsigned long *ret_stack; - ret_stack = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL); + if (WARN_ON_ONCE(!fgraph_stack_cachep)) + return; + + ret_stack = kmem_cache_alloc(fgraph_stack_cachep, GFP_KERNEL); if (!ret_stack) return; graph_init_task(t, ret_stack); @@ -1154,7 +1165,11 @@ void ftrace_graph_exit_task(struct task_struct *t) /* NULL must become visible to IRQs before we free it: */ barrier(); - kfree(ret_stack); + if (ret_stack) { + if (WARN_ON_ONCE(!fgraph_stack_cachep)) + return; + kmem_cache_free(fgraph_stack_cachep, ret_stack); + } } #ifdef CONFIG_DYNAMIC_FTRACE @@ -1294,6 +1309,14 @@ int register_ftrace_graph(struct fgraph_ops *gops) guard(mutex)(&ftrace_lock); + if (!fgraph_stack_cachep) { + fgraph_stack_cachep = kmem_cache_create("fgraph_stack", + SHADOW_STACK_SIZE, + SHADOW_STACK_SIZE, 0, NULL); + if (!fgraph_stack_cachep) + return -ENOMEM; + } + if (!fgraph_initialized) { ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "fgraph:online", fgraph_cpu_init, NULL); -- cgit v1.2.3 From 6348a3fa723a4fa2e5651b0b93fbcddd5293e92b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 28 Oct 2024 03:12:29 -0400 Subject: fgraph: Use guard(mutex)(&ftrace_lock) for unregister_ftrace_graph() The ftrace_lock is held throughout unregister_ftrace_graph(), use a guard to simplify the error paths. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Thomas Gleixner Cc: Peter Zijlstra Link: https://lore.kernel.org/20241028071307.770550792@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 001abf376c0c..0bf78517b5d4 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -1381,17 +1381,17 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) { int command = 0; - mutex_lock(&ftrace_lock); + guard(mutex)(&ftrace_lock); if (unlikely(!ftrace_graph_active)) - goto out; + return; if (unlikely(gops->idx < 0 || gops->idx >= FGRAPH_ARRAY_SIZE || fgraph_array[gops->idx] != gops)) - goto out; + return; if (fgraph_lru_release_index(gops->idx) < 0) - goto out; + return; fgraph_array[gops->idx] = &fgraph_stub; @@ -1413,7 +1413,5 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); } - out: gops->saved_func = NULL; - mutex_unlock(&ftrace_lock); } -- cgit v1.2.3 From 06294cf04500317c50b5b4c4c29440123cd15d48 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 28 Oct 2024 03:12:30 -0400 Subject: ftrace: Use guard for match_records() The ftrace_lock is held for most of match_records() until the end of the function. Use guard to make error paths simpler. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Thomas Gleixner Cc: Peter Zijlstra Link: https://lore.kernel.org/20241028071307.927146604@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e9fd4fb2769e..44adc34643c9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4829,15 +4829,13 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) mod_g.len = strlen(mod_g.search); } - mutex_lock(&ftrace_lock); + guard(mutex)(&ftrace_lock); if (unlikely(ftrace_disabled)) - goto out_unlock; + return 0; - if (func_g.type == MATCH_INDEX) { - found = add_rec_by_index(hash, &func_g, clear_filter); - goto out_unlock; - } + if (func_g.type == MATCH_INDEX) + return add_rec_by_index(hash, &func_g, clear_filter); do_for_each_ftrace_rec(pg, rec) { @@ -4846,16 +4844,12 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) { ret = enter_record(hash, rec, clear_filter); - if (ret < 0) { - found = ret; - goto out_unlock; - } + if (ret < 0) + return ret; found = 1; } cond_resched(); } while_for_each_ftrace_rec(); - out_unlock: - mutex_unlock(&ftrace_lock); return found; } -- cgit v1.2.3 From 1432afb50de460e56ad90938a7bb42b6128790e6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 28 Oct 2024 03:12:31 -0400 Subject: ftrace: Use guard to lock ftrace_lock in cache_mod() The ftrace_lock is held throughout cache_mod(), use guard to simplify the error paths. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Thomas Gleixner Cc: Peter Zijlstra Link: https://lore.kernel.org/20241028071308.088458856@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 44adc34643c9..64997416415e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4947,14 +4947,14 @@ static int cache_mod(struct trace_array *tr, { struct ftrace_mod_load *ftrace_mod, *n; struct list_head *head = enable ? &tr->mod_trace : &tr->mod_notrace; - int ret; - mutex_lock(&ftrace_lock); + guard(mutex)(&ftrace_lock); /* We do not cache inverse filters */ if (func[0] == '!') { + int ret = -EINVAL; + func++; - ret = -EINVAL; /* Look to remove this hash */ list_for_each_entry_safe(ftrace_mod, n, head, list) { @@ -4970,20 +4970,15 @@ static int cache_mod(struct trace_array *tr, continue; } } - goto out; + return ret; } - ret = -EINVAL; /* We only care about modules that have not been loaded yet */ if (module_exists(module)) - goto out; + return -EINVAL; /* Save this string off, and execute it when the module is loaded */ - ret = ftrace_add_mod(tr, func, module, enable); - out: - mutex_unlock(&ftrace_lock); - - return ret; + return ftrace_add_mod(tr, func, module, enable); } static int -- cgit v1.2.3 From 9687bbf219549fc93f2364c78aa91fd9ffc9eca0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 28 Oct 2024 03:12:32 -0400 Subject: ftrace: Use guard to take the ftrace_lock in release_probe() The ftrace_lock is held throughout the entire release_probe() function. Use guard to simplify any exit paths. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Thomas Gleixner Cc: Peter Zijlstra Link: https://lore.kernel.org/20241028071308.250787901@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 64997416415e..c0fabd7da5b2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5288,7 +5288,7 @@ static void release_probe(struct ftrace_func_probe *probe) { struct ftrace_probe_ops *probe_ops; - mutex_lock(&ftrace_lock); + guard(mutex)(&ftrace_lock); WARN_ON(probe->ref <= 0); @@ -5306,7 +5306,6 @@ static void release_probe(struct ftrace_func_probe *probe) list_del(&probe->list); kfree(probe); } - mutex_unlock(&ftrace_lock); } static void acquire_probe_locked(struct ftrace_func_probe *probe) -- cgit v1.2.3 From 8b0cb3a4c5e85dda8957ba6a4c8c081a9aec6e80 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 28 Oct 2024 03:12:33 -0400 Subject: ftrace: Use guard to take ftrace_lock in ftrace_graph_set_hash() The ftrace_lock is taken for most of the ftrace_graph_set_hash() function throughout the end. Use guard to take the ftrace_lock to simplify the exit paths. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Thomas Gleixner Cc: Peter Zijlstra Link: https://lore.kernel.org/20241028071308.406073025@goodmis.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c0fabd7da5b2..b4ef469f4fd2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6816,12 +6816,10 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer) func_g.len = strlen(func_g.search); - mutex_lock(&ftrace_lock); + guard(mutex)(&ftrace_lock); - if (unlikely(ftrace_disabled)) { - mutex_unlock(&ftrace_lock); + if (unlikely(ftrace_disabled)) return -ENODEV; - } do_for_each_ftrace_rec(pg, rec) { @@ -6837,7 +6835,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer) if (entry) continue; if (add_hash_entry(hash, rec->ip) == NULL) - goto out; + return 0; } else { if (entry) { free_hash_entry(hash, entry); @@ -6846,13 +6844,8 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer) } } } while_for_each_ftrace_rec(); -out: - mutex_unlock(&ftrace_lock); - if (fail) - return -EINVAL; - - return 0; + return fail ? -EINVAL : 0; } static ssize_t -- cgit v1.2.3 From 36a367b8912a3aac023d9e35c815f7b1e609f4a3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 17 Oct 2024 11:31:05 -0400 Subject: ftrace: Show timings of how long nop patching took Since the beginning of ftrace, the code that did the patching had its timings saved on how long it took to complete. But this information was never exposed. It was used for debugging and exposing it was always something that was on the TODO list. Now it's time to expose it. There's even a file that is where it should go! Also include how long patching modules took as a separate value. # cat /sys/kernel/tracing/dyn_ftrace_total_info 57680 pages:231 groups: 9 ftrace boot update time = 14024666 (ns) ftrace module total update time = 126070 (ns) Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Mike Rapoport Cc: Peter Zijlstra Link: https://lore.kernel.org/20241017113105.1edfa943@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 11 ++++++++--- kernel/trace/trace.c | 15 +++++++++++---- kernel/trace/trace.h | 2 ++ 3 files changed, 21 insertions(+), 7 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b4ef469f4fd2..71e53eaba8bc 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3686,7 +3686,8 @@ static int ftrace_hash_move_and_update_subops(struct ftrace_ops *subops, } -static u64 ftrace_update_time; +u64 ftrace_update_time; +u64 ftrace_total_mod_time; unsigned long ftrace_update_tot_cnt; unsigned long ftrace_number_of_pages; unsigned long ftrace_number_of_groups; @@ -3706,7 +3707,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) bool init_nop = ftrace_need_init_nop(); struct ftrace_page *pg; struct dyn_ftrace *p; - u64 start, stop; + u64 start, stop, update_time; unsigned long update_cnt = 0; unsigned long rec_flags = 0; int i; @@ -3750,7 +3751,11 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) } stop = ftrace_now(raw_smp_processor_id()); - ftrace_update_time = stop - start; + update_time = stop - start; + if (mod) + ftrace_total_mod_time += update_time; + else + ftrace_update_time = update_time; ftrace_update_tot_cnt += update_cnt; return 0; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c01375adc471..405dcf498159 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8548,15 +8548,22 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf, char *buf; int r; - /* 256 should be plenty to hold the amount needed */ - buf = kmalloc(256, GFP_KERNEL); + /* 512 should be plenty to hold the amount needed */ +#define DYN_INFO_BUF_SIZE 512 + + buf = kmalloc(DYN_INFO_BUF_SIZE, GFP_KERNEL); if (!buf) return -ENOMEM; - r = scnprintf(buf, 256, "%ld pages:%ld groups: %ld\n", + r = scnprintf(buf, DYN_INFO_BUF_SIZE, + "%ld pages:%ld groups: %ld\n" + "ftrace boot update time = %llu (ns)\n" + "ftrace module total update time = %llu (ns)\n", ftrace_update_tot_cnt, ftrace_number_of_pages, - ftrace_number_of_groups); + ftrace_number_of_groups, + ftrace_update_time, + ftrace_total_mod_time); ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); kfree(buf); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6adf48ef4312..3307dad4d917 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -775,6 +775,8 @@ extern void trace_event_follow_fork(struct trace_array *tr, bool enable); extern unsigned long ftrace_update_tot_cnt; extern unsigned long ftrace_number_of_pages; extern unsigned long ftrace_number_of_groups; +extern u64 ftrace_update_time; +extern u64 ftrace_total_mod_time; void ftrace_init_trace_array(struct trace_array *tr); #else static inline void ftrace_init_trace_array(struct trace_array *tr) { } -- cgit v1.2.3 From 2c33155ef678033b8a3105b824cdef930f05b47d Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 21 Oct 2024 15:18:31 +0100 Subject: tracing: Make percpu stack trace buffer invariant to PAGE_SIZE Previously the size of "struct ftrace_stacks" depended upon PAGE_SIZE. For the common 4K page size, on a 64-bit system, sizeof(struct ftrace_stacks) was 32K. But for a 64K page size, sizeof(struct ftrace_stacks) was 512K. But ftrace stack usage requirements should be invariant to page size. So let's redefine FTRACE_KSTACK_ENTRIES so that "struct ftrace_stacks" is always sized at 32K for 64-bit and 16K for 32-bit. As a side effect, it removes the PAGE_SIZE compile-time constant assumption from this code, which is required to reach the goal of boot-time page size selection. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241021141832.3668264-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bdb776e6ceb9..f1d613d924e9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2898,7 +2898,7 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long /* Allow 4 levels of nesting: normal, softirq, irq, NMI */ #define FTRACE_KSTACK_NESTING 4 -#define FTRACE_KSTACK_ENTRIES (PAGE_SIZE / FTRACE_KSTACK_NESTING) +#define FTRACE_KSTACK_ENTRIES (SZ_4K / FTRACE_KSTACK_NESTING) struct ftrace_stack { unsigned long calls[FTRACE_KSTACK_ENTRIES]; -- cgit v1.2.3 From 77a1326f64c3245ae9d2f9297abec5c8a0f11f58 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Mon, 14 Oct 2024 14:13:14 -0700 Subject: tracing: Replace multiple deprecated strncpy with memcpy strncpy() is deprecated for use on NUL-terminated destination strings [1] and as such we should prefer more robust and less ambiguous string interfaces. String copy operations involving manual pointer offset and length calculations followed by explicit NUL-byte assignments are best changed to either strscpy or memcpy. strscpy is not a drop-in replacement as @len would need a one subtracted from it to avoid truncating the source string. To not sabotage readability of the current code, use memcpy (retaining the manual NUL assignment) as this unambiguously describes the desired behavior. Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings [1] Link: https://github.com/KSPP/linux/issues/90 [2] Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: linux-hardening@vger.kernel.org Link: https://lore.kernel.org/20241014-strncpy-kernel-trace-trace_events_filter-c-v2-1-d821e81e371e@google.com Reviewed-by: Kees Cook Signed-off-by: Justin Stitt Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_filter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0c611b281a5b..78051de581e7 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1616,7 +1616,7 @@ static int parse_pred(const char *str, void *data, goto err_free; } - strncpy(num_buf, str + s, len); + memcpy(num_buf, str + s, len); num_buf[len] = 0; ret = kstrtoul(num_buf, 0, &ip); @@ -1694,7 +1694,7 @@ static int parse_pred(const char *str, void *data, if (!pred->regex) goto err_mem; pred->regex->len = len; - strncpy(pred->regex->pattern, str + s, len); + memcpy(pred->regex->pattern, str + s, len); pred->regex->pattern[len] = 0; } else if (!strncmp(str + i, "CPUS", 4)) { @@ -1859,7 +1859,7 @@ static int parse_pred(const char *str, void *data, if (!pred->regex) goto err_mem; pred->regex->len = len; - strncpy(pred->regex->pattern, str + s, len); + memcpy(pred->regex->pattern, str + s, len); pred->regex->pattern[len] = 0; filter_build_regex(pred); @@ -1919,7 +1919,7 @@ static int parse_pred(const char *str, void *data, goto err_free; } - strncpy(num_buf, str + s, len); + memcpy(num_buf, str + s, len); num_buf[len] = 0; /* Make sure it is a value */ -- cgit v1.2.3 From fa17cb4b3b42618aeed1e0bce80cc55106561718 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Wed, 30 Oct 2024 10:17:49 -0700 Subject: tracing: Document tracefs gid mount option Commit ee7f3666995d ("tracefs: Have new files inherit the ownership of their parent") and commit 48b27b6b5191 ("tracefs: Set all files to the same group ownership as the mount option") introduced a new gid mount option that allows specifying a group to apply to all entries in tracefs. Document this in the tracing readme. Cc: Eric Sandeen Cc: Mathieu Desnoyers Cc: Shuah Khan Cc: Ali Zahraee Cc: Christian Brauner Cc: David Howells Cc: Masami Hiramatsu Link: https://lore.kernel.org/20241030171928.4168869-3-kaleshsingh@google.com Signed-off-by: Kalesh Singh Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a8f52b6527ca..2b64b3ec67d9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5501,6 +5501,10 @@ static const struct file_operations tracing_iter_fops = { static const char readme_msg[] = "tracing mini-HOWTO:\n\n" + "By default tracefs removes all OTH file permission bits.\n" + "When mounting tracefs an optional group id can be specified\n" + "which adds the group to every directory and file in tracefs:\n\n" + "\t e.g. mount -t tracefs [-o [gid=]] nodev /sys/kernel/tracing\n\n" "# echo 0 > tracing_on : quick way to disable tracing\n" "# echo 1 > tracing_on : quick way to re-enable tracing\n\n" " Important files:\n" -- cgit v1.2.3 From e9f0a363473585a0f51b1b61a9bb15a63808d6ea Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 22 Oct 2024 13:01:12 +0200 Subject: tracing: Remove TRACE_FLAG_IRQS_NOSUPPORT It was possible to enable tracing with no IRQ tracing support. The tracing infrastructure would then record TRACE_FLAG_IRQS_NOSUPPORT as the only tracing flag and show an 'X' in the output. The last user of this feature was PPC32 which managed to implement it during PowerPC merge in 2009. Since then, it was unused and the PPC32 dependency was finally removed in commit 0ea5ee035133a ("tracing: Remove PPC32 wart from config TRACING_SUPPORT"). Since the PowerPC merge the code behind !CONFIG_TRACE_IRQFLAGS_SUPPORT with TRACING enabled can no longer be selected used and the 'X' is not displayed or recorded. Remove the CONFIG_TRACE_IRQFLAGS_SUPPORT from the tracing code. Remove TRACE_FLAG_IRQS_NOSUPPORT. Cc: Peter Zijlstra Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241022110112.XJI8I9T2@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/ftrace.rst | 3 --- include/linux/trace_events.h | 13 ------------- kernel/trace/trace_output.c | 1 - 3 files changed, 17 deletions(-) (limited to 'kernel/trace') diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index 4073ca48af4a..74d5bd801b1a 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -1031,9 +1031,6 @@ explains which is which. CPU#: The CPU which the process was running on. irqs-off: 'd' interrupts are disabled. '.' otherwise. - .. caution:: If the architecture does not support a way to - read the irq flags variable, an 'X' will always - be printed here. need-resched: - 'N' both TIF_NEED_RESCHED and PREEMPT_NEED_RESCHED is set, diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index f8f2e52653df..016b29a56c87 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -184,7 +184,6 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); enum trace_flag_type { TRACE_FLAG_IRQS_OFF = 0x01, - TRACE_FLAG_IRQS_NOSUPPORT = 0x02, TRACE_FLAG_NEED_RESCHED = 0x04, TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, @@ -193,7 +192,6 @@ enum trace_flag_type { TRACE_FLAG_BH_OFF = 0x80, }; -#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) { unsigned int irq_status = irqs_disabled_flags(irqflags) ? @@ -207,17 +205,6 @@ static inline unsigned int tracing_gen_ctx(void) local_save_flags(irqflags); return tracing_gen_ctx_flags(irqflags); } -#else - -static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) -{ - return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); -} -static inline unsigned int tracing_gen_ctx(void) -{ - return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); -} -#endif static inline unsigned int tracing_gen_ctx_dec(void) { diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 868f2f912f28..2ee6613dce6d 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -460,7 +460,6 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) (entry->flags & TRACE_FLAG_IRQS_OFF && bh_off) ? 'D' : (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : bh_off ? 'b' : - (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.'; switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | -- cgit v1.2.3 From 242b32d8073ed16868ff0f9381732e9782dea63b Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Thu, 31 Oct 2024 20:01:39 +0800 Subject: tracing: Replace strncpy() with strscpy() when copying comm Replace the depreciated[1] strncpy() calls with strscpy() when copying comm. Link: https://github.com/KSPP/linux/issues/90 [1] Cc: Cc: Link: https://lore.kernel.org/20241031120139.1343025-1-ruanjinjie@huawei.com Signed-off-by: Jinjie Ruan Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 +- kernel/trace/trace_events_hist.c | 4 ++-- kernel/trace/trace_sched_switch.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f1d613d924e9..a587fd7d7447 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1921,7 +1921,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; - strncpy(max_data->comm, tsk->comm, TASK_COMM_LEN); + strscpy(max_data->comm, tsk->comm); max_data->pid = tsk->pid; /* * If tsk == current, then use current_uid(), as that does not use diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index cc2924ad32a3..c288b92fc4df 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1599,7 +1599,7 @@ static inline void save_comm(char *comm, struct task_struct *task) return; } - strncpy(comm, task->comm, TASK_COMM_LEN); + strscpy(comm, task->comm, TASK_COMM_LEN); } static void hist_elt_data_free(struct hist_elt_data *elt_data) @@ -3405,7 +3405,7 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) elt_data = context->elt->private_data; track_elt_data = track_data->elt.private_data; if (elt_data->comm) - strncpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN); + strscpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN); track_data->updated = true; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 8a407adb0e1c..573b5d8e8a28 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -187,7 +187,7 @@ static inline char *get_saved_cmdlines(int idx) static inline void set_cmdline(int idx, const char *cmdline) { - strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); + strscpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); } static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) -- cgit v1.2.3 From 0c10cc2435115c36dcb611f8e1ed99ba6de6f17f Mon Sep 17 00:00:00 2001 From: Yuran Pereira Date: Mon, 28 Oct 2024 19:21:00 +0000 Subject: trace: kdb: Replace simple_strtoul with kstrtoul in kdb_ftdump The function simple_strtoul performs no error checking in scenarios where the input value overflows the intended output variable. This results in this function successfully returning, even when the output does not match the input string (aka the function returns successfully even when the result is wrong). Or as it was mentioned [1], "...simple_strtol(), simple_strtoll(), simple_strtoul(), and simple_strtoull() functions explicitly ignore overflows, which may lead to unexpected results in callers." Hence, the use of those functions is discouraged. This patch replaces all uses of the simple_strtoul with the safer alternatives kstrtoint and kstrtol. [1] https://www.kernel.org/doc/html/latest/process/deprecated.html#simple-strtol-simple-strtoll-simple-strtoul-simple-strtoull Signed-off-by: Yuran Pereira Reviewed-by: Douglas Anderson Acked-by: Masami Hiramatsu (Google) Acked-by: Steven Rostedt (Google) [nir: style fixes] Signed-off-by: Nir Lichtman Link: https://lore.kernel.org/r/20241028192100.GB918454@lichtman.org Signed-off-by: Daniel Thompson --- kernel/trace/trace_kdb.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 59857a1ee44c..1e72d20b3c2f 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -96,22 +96,19 @@ static int kdb_ftdump(int argc, const char **argv) { int skip_entries = 0; long cpu_file; - char *cp; + int err; int cnt; int cpu; if (argc > 2) return KDB_ARGCOUNT; - if (argc) { - skip_entries = simple_strtol(argv[1], &cp, 0); - if (*cp) - skip_entries = 0; - } + if (argc && kstrtoint(argv[1], 0, &skip_entries)) + return KDB_BADINT; if (argc == 2) { - cpu_file = simple_strtol(argv[2], &cp, 0); - if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 || + err = kstrtol(argv[2], 0, &cpu_file); + if (err || cpu_file >= NR_CPUS || cpu_file < 0 || !cpu_online(cpu_file)) return KDB_BADINT; } else { -- cgit v1.2.3 From f505005bc7426f4309880da94cfbfc37efa225bd Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Nov 2024 14:45:33 +0100 Subject: bpf: Force uprobe bpf program to always return 0 As suggested by Andrii make uprobe multi bpf programs to always return 0, so they can't force uprobe removal. Keeping the int return type for uprobe_prog_run, because it will be used in following session changes. Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link") Suggested-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241108134544.480660-3-jolsa@kernel.org --- kernel/trace/bpf_trace.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 88fd628850ca..db9e2792b42b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3229,7 +3229,6 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe, struct bpf_prog *prog = link->link.prog; bool sleepable = prog->sleepable; struct bpf_run_ctx *old_run_ctx; - int err = 0; if (link->task && !same_thread_group(current, link->task)) return 0; @@ -3242,7 +3241,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe, migrate_disable(); old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); - err = bpf_prog_run(link->link.prog, regs); + bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); migrate_enable(); @@ -3251,7 +3250,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe, rcu_read_unlock_trace(); else rcu_read_unlock(); - return err; + return 0; } static bool -- cgit v1.2.3 From d920179b3d4842a0e27cae54fdddbe5ef3977e73 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Nov 2024 14:45:34 +0100 Subject: bpf: Add support for uprobe multi session attach Adding support to attach BPF program for entry and return probe of the same function. This is common use case which at the moment requires to create two uprobe multi links. Adding new BPF_TRACE_UPROBE_SESSION attach type that instructs kernel to attach single link program to both entry and exit probe. It's possible to control execution of the BPF program on return probe simply by returning zero or non zero from the entry BPF program execution to execute or not the BPF program on return probe respectively. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241108134544.480660-4-jolsa@kernel.org --- include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 9 +++++++-- kernel/bpf/verifier.c | 1 + kernel/trace/bpf_trace.c | 36 +++++++++++++++++++++++++++--------- tools/include/uapi/linux/bpf.h | 1 + tools/lib/bpf/libbpf.c | 1 + 6 files changed, 38 insertions(+), 11 deletions(-) (limited to 'kernel/trace') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f28b6527e815..4162afc6b5d0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1116,6 +1116,7 @@ enum bpf_attach_type { BPF_NETKIT_PRIMARY, BPF_NETKIT_PEER, BPF_TRACE_KPROBE_SESSION, + BPF_TRACE_UPROBE_SESSION, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8254b2973157..58190ca724a2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4103,10 +4103,14 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI && attach_type != BPF_TRACE_UPROBE_MULTI) return -EINVAL; + if (prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION && + attach_type != BPF_TRACE_UPROBE_SESSION) + return -EINVAL; if (attach_type != BPF_PERF_EVENT && attach_type != BPF_TRACE_KPROBE_MULTI && attach_type != BPF_TRACE_KPROBE_SESSION && - attach_type != BPF_TRACE_UPROBE_MULTI) + attach_type != BPF_TRACE_UPROBE_MULTI && + attach_type != BPF_TRACE_UPROBE_SESSION) return -EINVAL; return 0; case BPF_PROG_TYPE_SCHED_CLS: @@ -5359,7 +5363,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI || attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION) ret = bpf_kprobe_multi_link_attach(attr, prog); - else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI) + else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI || + attr->link_create.attach_type == BPF_TRACE_UPROBE_SESSION) ret = bpf_uprobe_multi_link_attach(attr, prog); break; default: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7d8ed377b35d..132fc172961f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16027,6 +16027,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char case BPF_PROG_TYPE_KPROBE: switch (env->prog->expected_attach_type) { case BPF_TRACE_KPROBE_SESSION: + case BPF_TRACE_UPROBE_SESSION: range = retval_range(0, 1); break; default: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index db9e2792b42b..9c04b1364de2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1581,6 +1581,17 @@ static inline bool is_kprobe_session(const struct bpf_prog *prog) return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; } +static inline bool is_uprobe_multi(const struct bpf_prog *prog) +{ + return prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI || + prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION; +} + +static inline bool is_uprobe_session(const struct bpf_prog *prog) +{ + return prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION; +} + static const struct bpf_func_proto * kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1598,13 +1609,13 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_func_ip: if (is_kprobe_multi(prog)) return &bpf_get_func_ip_proto_kprobe_multi; - if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI) + if (is_uprobe_multi(prog)) return &bpf_get_func_ip_proto_uprobe_multi; return &bpf_get_func_ip_proto_kprobe; case BPF_FUNC_get_attach_cookie: if (is_kprobe_multi(prog)) return &bpf_get_attach_cookie_proto_kmulti; - if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI) + if (is_uprobe_multi(prog)) return &bpf_get_attach_cookie_proto_umulti; return &bpf_get_attach_cookie_proto_trace; default: @@ -3096,6 +3107,7 @@ struct bpf_uprobe { u64 cookie; struct uprobe *uprobe; struct uprobe_consumer consumer; + bool session; }; struct bpf_uprobe_multi_link { @@ -3267,9 +3279,13 @@ uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs, __u64 *data) { struct bpf_uprobe *uprobe; + int ret; uprobe = container_of(con, struct bpf_uprobe, consumer); - return uprobe_prog_run(uprobe, instruction_pointer(regs), regs); + ret = uprobe_prog_run(uprobe, instruction_pointer(regs), regs); + if (uprobe->session) + return ret ? UPROBE_HANDLER_IGNORE : 0; + return 0; } static int @@ -3279,7 +3295,8 @@ uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, s struct bpf_uprobe *uprobe; uprobe = container_of(con, struct bpf_uprobe, consumer); - return uprobe_prog_run(uprobe, func, regs); + uprobe_prog_run(uprobe, func, regs); + return 0; } static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) @@ -3318,7 +3335,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; - if (prog->expected_attach_type != BPF_TRACE_UPROBE_MULTI) + if (!is_uprobe_multi(prog)) return -EINVAL; flags = attr->link_create.uprobe_multi.flags; @@ -3394,11 +3411,12 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr uprobes[i].link = link; - if (flags & BPF_F_UPROBE_MULTI_RETURN) - uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler; - else + if (!(flags & BPF_F_UPROBE_MULTI_RETURN)) uprobes[i].consumer.handler = uprobe_multi_link_handler; - + if (flags & BPF_F_UPROBE_MULTI_RETURN || is_uprobe_session(prog)) + uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler; + if (is_uprobe_session(prog)) + uprobes[i].session = true; if (pid) uprobes[i].consumer.filter = uprobe_multi_link_filter; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f28b6527e815..4162afc6b5d0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1116,6 +1116,7 @@ enum bpf_attach_type { BPF_NETKIT_PRIMARY, BPF_NETKIT_PEER, BPF_TRACE_KPROBE_SESSION, + BPF_TRACE_UPROBE_SESSION, __MAX_BPF_ATTACH_TYPE }; diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 711173acbcef..faac1c79840d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -133,6 +133,7 @@ static const char * const attach_type_name[] = { [BPF_NETKIT_PRIMARY] = "netkit_primary", [BPF_NETKIT_PEER] = "netkit_peer", [BPF_TRACE_KPROBE_SESSION] = "trace_kprobe_session", + [BPF_TRACE_UPROBE_SESSION] = "trace_uprobe_session", }; static const char * const link_type_name[] = { -- cgit v1.2.3 From 99b403d2060d3e2604958a0ec3a7f37b18256d6b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Nov 2024 14:45:35 +0100 Subject: bpf: Add support for uprobe multi session context Placing bpf_session_run_ctx layer in between bpf_run_ctx and bpf_uprobe_multi_run_ctx, so the session data can be retrieved from uprobe_multi link. Plus granting session kfuncs access to uprobe session programs. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241108134544.480660-5-jolsa@kernel.org --- kernel/trace/bpf_trace.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9c04b1364de2..949a3870946c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3120,7 +3120,7 @@ struct bpf_uprobe_multi_link { }; struct bpf_uprobe_multi_run_ctx { - struct bpf_run_ctx run_ctx; + struct bpf_session_run_ctx session_ctx; unsigned long entry_ip; struct bpf_uprobe *uprobe; }; @@ -3231,16 +3231,22 @@ static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { static int uprobe_prog_run(struct bpf_uprobe *uprobe, unsigned long entry_ip, - struct pt_regs *regs) + struct pt_regs *regs, + bool is_return, void *data) { struct bpf_uprobe_multi_link *link = uprobe->link; struct bpf_uprobe_multi_run_ctx run_ctx = { + .session_ctx = { + .is_return = is_return, + .data = data, + }, .entry_ip = entry_ip, .uprobe = uprobe, }; struct bpf_prog *prog = link->link.prog; bool sleepable = prog->sleepable; struct bpf_run_ctx *old_run_ctx; + int err; if (link->task && !same_thread_group(current, link->task)) return 0; @@ -3252,8 +3258,8 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe, migrate_disable(); - old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); - bpf_prog_run(link->link.prog, regs); + old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx); + err = bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); migrate_enable(); @@ -3262,7 +3268,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe, rcu_read_unlock_trace(); else rcu_read_unlock(); - return 0; + return err; } static bool @@ -3282,7 +3288,7 @@ uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs, int ret; uprobe = container_of(con, struct bpf_uprobe, consumer); - ret = uprobe_prog_run(uprobe, instruction_pointer(regs), regs); + ret = uprobe_prog_run(uprobe, instruction_pointer(regs), regs, false, data); if (uprobe->session) return ret ? UPROBE_HANDLER_IGNORE : 0; return 0; @@ -3295,7 +3301,7 @@ uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, s struct bpf_uprobe *uprobe; uprobe = container_of(con, struct bpf_uprobe, consumer); - uprobe_prog_run(uprobe, func, regs); + uprobe_prog_run(uprobe, func, regs, true, data); return 0; } @@ -3303,7 +3309,8 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) { struct bpf_uprobe_multi_run_ctx *run_ctx; - run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx); + run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, + session_ctx.run_ctx); return run_ctx->entry_ip; } @@ -3311,7 +3318,8 @@ static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx) { struct bpf_uprobe_multi_run_ctx *run_ctx; - run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx); + run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, + session_ctx.run_ctx); return run_ctx->uprobe->cookie; } @@ -3505,7 +3513,7 @@ static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id) if (!btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id)) return 0; - if (!is_kprobe_session(prog)) + if (!is_kprobe_session(prog) && !is_uprobe_session(prog)) return -EACCES; return 0; -- cgit v1.2.3 From 6371b4bc179aad17d84ee301b409a5a8ce675657 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 7 Nov 2024 12:05:30 +0000 Subject: tracing: Remove redundant check on field->field in histograms The check on field->field being true is handled as the first check on the cascaded if statement, so the later checks on field->field are redundant because this clause has already been handled. Since this later check is redundant, just remove it. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241107120530.18728-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index c288b92fc4df..9c058aa8baf3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1354,10 +1354,7 @@ static const char *hist_field_name(struct hist_field *field, } else if (field->flags & HIST_FIELD_FL_TIMESTAMP) field_name = "common_timestamp"; else if (field->flags & HIST_FIELD_FL_STACKTRACE) { - if (field->field) - field_name = field->field->name; - else - field_name = "common_stacktrace"; + field_name = "common_stacktrace"; } else if (field->flags & HIST_FIELD_FL_HITCOUNT) field_name = "hitcount"; -- cgit v1.2.3 From 580bb355bcae7e9a6606ce9644af09b2a793f1bb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 13 Nov 2024 23:08:39 -0500 Subject: Revert: "ring-buffer: Do not have boot mapped buffers hook to CPU hotplug" A crash happened when testing cpu hotplug with respect to the memory mapped ring buffers. It was assumed that the hot plug code was adding a per CPU buffer that was already created that caused the crash. The real problem was due to ref counting and was fixed by commit 2cf9733891a4 ("ring-buffer: Fix refcount setting of boot mapped buffers"). When a per CPU buffer is created, it will not be created again even with CPU hotplug, so the fix to not use CPU hotplug was a red herring. In fact, it caused only the boot CPU buffer to be created, leaving the other CPU per CPU buffers disabled. Revert that change as it was not the culprit of the fix it was intended to be. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241113230839.6c03640f@gandalf.local.home Fixes: 912da2c384d5 ("ring-buffer: Do not have boot mapped buffers hook to CPU hotplug") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3ea4f7bb1837..5807116bcd0b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2337,12 +2337,9 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, if (!buffer->buffers[cpu]) goto fail_free_buffers; - /* If already mapped, do not hook to CPU hotplug */ - if (!start) { - ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); - if (ret < 0) - goto fail_free_buffers; - } + ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); + if (ret < 0) + goto fail_free_buffers; mutex_init(&buffer->mutex); -- cgit v1.2.3 From 09663753bb7c50b33f8e5fa562c20ce275b88237 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Nov 2024 11:28:25 -0500 Subject: tracing/ring-buffer: Clear all memory mapped CPU ring buffers on first recording The events of a memory mapped ring buffer from the previous boot should not be mixed in with events from the current boot. There's meta data that is used to handle KASLR so that function names can be shown properly. Also, since the timestamps of the previous boot have no meaning to the timestamps of the current boot, having them intermingled in a buffer can also cause confusion because there could possibly be events in the future. When a trace is activated the meta data is reset so that the pointers of are now processed for the new address space. The trace buffers are reset when tracing starts for the first time. The problem here is that the reset only happens on online CPUs. If a CPU is offline, it does not get reset. To demonstrate the issue, a previous boot had tracing enabled in the boot mapped ring buffer on reboot. On the following boot, tracing has not been started yet so the function trace from the previous boot is still visible. # trace-cmd show -B boot_mapped -c 3 | tail -0 [003] d.h2. 156.462395: __rcu_read_lock <-cpu_emergency_disable_virtualization -0 [003] d.h2. 156.462396: vmx_emergency_disable_virtualization_cpu <-cpu_emergency_disable_virtualization -0 [003] d.h2. 156.462396: __rcu_read_unlock <-__sysvec_reboot -0 [003] d.h2. 156.462397: stop_this_cpu <-__sysvec_reboot -0 [003] d.h2. 156.462397: set_cpu_online <-stop_this_cpu -0 [003] d.h2. 156.462397: disable_local_APIC <-stop_this_cpu -0 [003] d.h2. 156.462398: clear_local_APIC <-disable_local_APIC -0 [003] d.h2. 156.462574: mcheck_cpu_clear <-stop_this_cpu -0 [003] d.h2. 156.462575: mce_intel_feature_clear <-stop_this_cpu -0 [003] d.h2. 156.462575: lmce_supported <-mce_intel_feature_clear Now, if CPU 3 is taken offline, and tracing is started on the memory mapped ring buffer, the events from the previous boot in the CPU 3 ring buffer is not reset. Now those events are using the meta data from the current boot and produces just hex values. # echo 0 > /sys/devices/system/cpu/cpu3/online # trace-cmd start -B boot_mapped -p function # trace-cmd show -B boot_mapped -c 3 | tail -0 [003] d.h2. 156.462395: 0xffffffff9a1e3194 <-0xffffffff9a0f655e -0 [003] d.h2. 156.462396: 0xffffffff9a0a1d24 <-0xffffffff9a0f656f -0 [003] d.h2. 156.462396: 0xffffffff9a1e6bc4 <-0xffffffff9a0f7323 -0 [003] d.h2. 156.462397: 0xffffffff9a0d12b4 <-0xffffffff9a0f732a -0 [003] d.h2. 156.462397: 0xffffffff9a1458d4 <-0xffffffff9a0d12e2 -0 [003] d.h2. 156.462397: 0xffffffff9a0faed4 <-0xffffffff9a0d12e7 -0 [003] d.h2. 156.462398: 0xffffffff9a0faaf4 <-0xffffffff9a0faef2 -0 [003] d.h2. 156.462574: 0xffffffff9a0e3444 <-0xffffffff9a0d12ef -0 [003] d.h2. 156.462575: 0xffffffff9a0e4964 <-0xffffffff9a0d12ef -0 [003] d.h2. 156.462575: 0xffffffff9a0e3fb0 <-0xffffffff9a0e496f Reset all CPUs when starting a boot mapped ring buffer for the first time, and not just the online CPUs. Fixes: 7a1d1e4b9639f ("tracing/ring-buffer: Add last_boot_info file to boot instance") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a8f52b6527ca..619e9aa62201 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2386,6 +2386,25 @@ void tracing_reset_online_cpus(struct array_buffer *buf) ring_buffer_record_enable(buffer); } +static void tracing_reset_all_cpus(struct array_buffer *buf) +{ + struct trace_buffer *buffer = buf->buffer; + + if (!buffer) + return; + + ring_buffer_record_disable(buffer); + + /* Make sure all commits have finished */ + synchronize_rcu(); + + buf->time_start = buffer_ftrace_now(buf, buf->cpu); + + ring_buffer_reset(buffer); + + ring_buffer_record_enable(buffer); +} + /* Must have trace_types_lock held */ void tracing_reset_all_online_cpus_unlocked(void) { @@ -6141,8 +6160,13 @@ static void update_last_data(struct trace_array *tr) if (!tr->text_delta && !tr->data_delta) return; - /* Clear old data */ - tracing_reset_online_cpus(&tr->array_buffer); + /* + * Need to clear all CPU buffers as there cannot be events + * from the previous boot mixed with events with this boot + * as that will cause a confusing trace. Need to clear all + * CPU buffers, even for those that may currently be offline. + */ + tracing_reset_all_cpus(&tr->array_buffer); /* Using current data now */ tr->text_delta = 0; -- cgit v1.2.3 From 537affea1672a841cd5b87b208c193a3a542c7cf Mon Sep 17 00:00:00 2001 From: liujing Date: Thu, 7 Nov 2024 17:53:27 +0800 Subject: ring-buffer: Correct a grammatical error in a comment The word "trace" begins with a consonant sound, so "a" should be used instead of "an". Link: https://lore.kernel.org/20241107095327.6390-1-liujing@cmss.chinamobile.com Signed-off-by: liujing Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index db3bf6a1b536..d6100a7da483 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4060,7 +4060,7 @@ static const char *show_irq_str(int bits) return type[bits]; } -/* Assume this is an trace event */ +/* Assume this is a trace event */ static const char *show_flags(struct ring_buffer_event *event) { struct trace_entry *entry; -- cgit v1.2.3 From 60b1f578b5789730d81460d1836dec7fa60510bf Mon Sep 17 00:00:00 2001 From: Jeff Xie Date: Tue, 8 Oct 2024 11:31:59 +0800 Subject: ftrace: Get the true parent ip for function tracer When using both function tracer and function graph simultaneously, it is found that function tracer sometimes captures a fake parent ip (return_to_handler) instead of the true parent ip. This issue is easy to reproduce. Below are my reproduction steps: jeff-labs:~/bin # ./trace-net.sh jeff-labs:~/bin # cat /sys/kernel/debug/tracing/instances/foo/trace | grep return_to_handler trace-net.sh-405 [001] ...2. 31.859501: avc_has_perm+0x4/0x190 <-return_to_handler+0x0/0x40 trace-net.sh-405 [001] ...2. 31.859503: simple_setattr+0x4/0x70 <-return_to_handler+0x0/0x40 trace-net.sh-405 [001] ...2. 31.859503: truncate_pagecache+0x4/0x60 <-return_to_handler+0x0/0x40 trace-net.sh-405 [001] ...2. 31.859505: unmap_mapping_range+0x4/0x140 <-return_to_handler+0x0/0x40 trace-net.sh-405 [001] ...3. 31.859508: _raw_spin_unlock+0x4/0x30 <-return_to_handler+0x0/0x40 [...] The following is my simple trace script: jeff-labs:~/bin # cat ./trace-net.sh TRACE_PATH="/sys/kernel/tracing" set_events() { echo 1 > $1/events/net/enable echo 1 > $1/events/tcp/enable echo 1 > $1/events/sock/enable echo 1 > $1/events/napi/enable echo 1 > $1/events/fib/enable echo 1 > $1/events/neigh/enable } set_events ${TRACE_PATH} echo 1 > ${TRACE_PATH}/options/sym-offset echo 1 > ${TRACE_PATH}/options/funcgraph-tail echo 1 > ${TRACE_PATH}/options/funcgraph-proc echo 1 > ${TRACE_PATH}/options/funcgraph-abstime echo 'tcp_orphan*' > ${TRACE_PATH}/set_ftrace_notrace echo function_graph > ${TRACE_PATH}/current_tracer INSTANCE_FOO=${TRACE_PATH}/instances/foo if [ ! -e $INSTANCE_FOO ]; then mkdir ${INSTANCE_FOO} fi set_events ${INSTANCE_FOO} echo 1 > ${INSTANCE_FOO}/options/sym-offset echo 'tcp_orphan*' > ${INSTANCE_FOO}/set_ftrace_notrace echo function > ${INSTANCE_FOO}/current_tracer echo 1 > ${TRACE_PATH}/tracing_on echo 1 > ${INSTANCE_FOO}/tracing_on echo > ${TRACE_PATH}/trace echo > ${INSTANCE_FOO}/trace Link: https://lore.kernel.org/20241008033159.22459-1-jeff.xie@linux.dev Acked-by: Masami Hiramatsu (Google) Signed-off-by: Jeff Xie Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 65fed0bbc5c2..74c353164ca1 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -176,6 +176,27 @@ static void function_trace_start(struct trace_array *tr) tracing_reset_online_cpus(&tr->array_buffer); } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static __always_inline unsigned long +function_get_true_parent_ip(unsigned long parent_ip, struct ftrace_regs *fregs) +{ + unsigned long true_parent_ip; + int idx = 0; + + true_parent_ip = parent_ip; + if (unlikely(parent_ip == (unsigned long)&return_to_handler) && fregs) + true_parent_ip = ftrace_graph_ret_addr(current, &idx, parent_ip, + (unsigned long *)ftrace_regs_get_stack_pointer(fregs)); + return true_parent_ip; +} +#else +static __always_inline unsigned long +function_get_true_parent_ip(unsigned long parent_ip, struct ftrace_regs *fregs) +{ + return parent_ip; +} +#endif + static void function_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) @@ -192,6 +213,8 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; + parent_ip = function_get_true_parent_ip(parent_ip, fregs); + trace_ctx = tracing_gen_ctx(); data = this_cpu_ptr(tr->array_buffer.data); @@ -239,6 +262,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, * recursive protection is performed. */ local_irq_save(flags); + parent_ip = function_get_true_parent_ip(parent_ip, fregs); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); @@ -306,6 +330,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; + parent_ip = function_get_true_parent_ip(parent_ip, fregs); data = this_cpu_ptr(tr->array_buffer.data); if (atomic_read(&data->disabled)) goto out; @@ -352,6 +377,7 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, * recursive protection is performed. */ local_irq_save(flags); + parent_ip = function_get_true_parent_ip(parent_ip, fregs); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); -- cgit v1.2.3 From 6ce5a6f0a07d37cc377df08a8d8a9c283420f323 Mon Sep 17 00:00:00 2001 From: Tatsuya S Date: Mon, 21 Oct 2024 07:14:53 +0000 Subject: tracing: Fix function name for trampoline The issue that unrelated function name is shown on stack trace like following even though it should be trampoline code address is caused by the creation of trampoline code in the area where .init.text section of module was freed after module is loaded. bash-1344 [002] ..... 43.644608: => (MODULE INIT FUNCTION) => vfs_write => ksys_write => do_syscall_64 => entry_SYSCALL_64_after_hwframe To resolve this, when function address of stack trace entry is in trampoline, output without looking up symbol name. Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241021071454.34610-2-tatsuya.s2862@gmail.com Signed-off-by: Tatsuya S Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 33 +++++++++++++++++++++++++-------- kernel/trace/trace.h | 7 +++++++ kernel/trace/trace_output.c | 4 ++++ 3 files changed, 36 insertions(+), 8 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a587fd7d7447..566d99f9f086 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -975,7 +975,8 @@ static inline void trace_access_lock_init(void) #endif #ifdef CONFIG_STACKTRACE -static void __ftrace_trace_stack(struct trace_buffer *buffer, +static void __ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs); static inline void ftrace_trace_stack(struct trace_array *tr, @@ -984,7 +985,8 @@ static inline void ftrace_trace_stack(struct trace_array *tr, int skip, struct pt_regs *regs); #else -static inline void __ftrace_trace_stack(struct trace_buffer *buffer, +static inline void __ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs) { @@ -2912,7 +2914,8 @@ struct ftrace_stacks { static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks); static DEFINE_PER_CPU(int, ftrace_stack_reserve); -static void __ftrace_trace_stack(struct trace_buffer *buffer, +static void __ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs) { @@ -2958,6 +2961,20 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, nr_entries = stack_trace_save(fstack->calls, size, skip); } +#ifdef CONFIG_DYNAMIC_FTRACE + /* Mark entry of stack trace as trampoline code */ + if (tr->ops && tr->ops->trampoline) { + unsigned long tramp_start = tr->ops->trampoline; + unsigned long tramp_end = tramp_start + tr->ops->trampoline_size; + unsigned long *calls = fstack->calls; + + for (int i = 0; i < nr_entries; i++) { + if (calls[i] >= tramp_start && calls[i] < tramp_end) + calls[i] = FTRACE_TRAMPOLINE_MARKER; + } + } +#endif + event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, struct_size(entry, caller, nr_entries), trace_ctx); @@ -2987,7 +3004,7 @@ static inline void ftrace_trace_stack(struct trace_array *tr, if (!(tr->trace_flags & TRACE_ITER_STACKTRACE)) return; - __ftrace_trace_stack(buffer, trace_ctx, skip, regs); + __ftrace_trace_stack(tr, buffer, trace_ctx, skip, regs); } void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, @@ -2996,7 +3013,7 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, struct trace_buffer *buffer = tr->array_buffer.buffer; if (rcu_is_watching()) { - __ftrace_trace_stack(buffer, trace_ctx, skip, NULL); + __ftrace_trace_stack(tr, buffer, trace_ctx, skip, NULL); return; } @@ -3013,7 +3030,7 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, return; ct_irq_enter_irqson(); - __ftrace_trace_stack(buffer, trace_ctx, skip, NULL); + __ftrace_trace_stack(tr, buffer, trace_ctx, skip, NULL); ct_irq_exit_irqson(); } @@ -3030,8 +3047,8 @@ void trace_dump_stack(int skip) /* Skip 1 to skip this function. */ skip++; #endif - __ftrace_trace_stack(printk_trace->array_buffer.buffer, - tracing_gen_ctx(), skip, NULL); + __ftrace_trace_stack(printk_trace, printk_trace->array_buffer.buffer, + tracing_gen_ctx(), skip, NULL); } EXPORT_SYMBOL_GPL(trace_dump_stack); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 638f452eec10..6e66b666c3e9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -2172,4 +2172,11 @@ static inline int rv_init_interface(void) } #endif +/* + * This is used only to distinguish + * function address from trampoline code. + * So this value has no meaning. + */ +#define FTRACE_TRAMPOLINE_MARKER ((unsigned long) INT_MAX) + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 2ee6613dce6d..e08aee34ef63 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1245,6 +1245,10 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, break; trace_seq_puts(s, " => "); + if ((*p) == FTRACE_TRAMPOLINE_MARKER) { + trace_seq_puts(s, "[FTRACE TRAMPOLINE]\n"); + continue; + } seq_print_ip_sym(s, (*p) + delta, flags); trace_seq_putc(s, '\n'); } -- cgit v1.2.3 From 45af52e7d3b8560f21d139b3759735eead8b1653 Mon Sep 17 00:00:00 2001 From: guoweikang Date: Wed, 20 Nov 2024 13:27:49 +0800 Subject: ftrace: Fix regression with module command in stack_trace_filter When executing the following command: # echo "write*:mod:ext3" > /sys/kernel/tracing/stack_trace_filter The current mod command causes a null pointer dereference. While commit 0f17976568b3f ("ftrace: Fix regression with module command in stack_trace_filter") has addressed part of the issue, it left a corner case unhandled, which still results in a kernel crash. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241120052750.275463-1-guoweikang.kernel@gmail.com Fixes: 04ec7bb642b77 ("tracing: Have the trace_array hold the list of registered func probes"); Signed-off-by: guoweikang Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4c28dd177ca6..5ff0822342ac 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5076,6 +5076,9 @@ ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash, char *func; int ret; + if (!tr) + return -ENODEV; + /* match_records() modifies func, and we need the original */ func = kstrdup(func_orig, GFP_KERNEL); if (!func) -- cgit v1.2.3 From 0172afefbfbdd8987787c926b40b68400bd1c3d1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 22 Nov 2024 21:28:49 +0100 Subject: tracing: Record task flag NEED_RESCHED_LAZY. The scheduler added NEED_RESCHED_LAZY scheduling. Record this state as part of trace flags and expose it in the need_resched field. Record and expose NEED_RESCHED_LAZY. [bigeasy: Commit description, documentation bits.] Cc: Peter Zijlstra Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241122202849.7DfYpJR0@linutronix.de Reviewed-by: Ankur Arora Reviewed-by: Steven Rostedt (Google) Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/ftrace.rst | 4 ++++ include/linux/trace_events.h | 1 + kernel/trace/trace.c | 2 ++ kernel/trace/trace_output.c | 14 +++++++++++++- 4 files changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index 74d5bd801b1a..272464bb7c60 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -1033,9 +1033,13 @@ explains which is which. irqs-off: 'd' interrupts are disabled. '.' otherwise. need-resched: + - 'B' all, TIF_NEED_RESCHED, PREEMPT_NEED_RESCHED and TIF_RESCHED_LAZY is set, - 'N' both TIF_NEED_RESCHED and PREEMPT_NEED_RESCHED is set, - 'n' only TIF_NEED_RESCHED is set, - 'p' only PREEMPT_NEED_RESCHED is set, + - 'L' both PREEMPT_NEED_RESCHED and TIF_RESCHED_LAZY is set, + - 'b' both TIF_NEED_RESCHED and TIF_RESCHED_LAZY is set, + - 'l' only TIF_RESCHED_LAZY is set - '.' otherwise. hardirq/softirq: diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 016b29a56c87..2a5df5b62cfc 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -184,6 +184,7 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); enum trace_flag_type { TRACE_FLAG_IRQS_OFF = 0x01, + TRACE_FLAG_NEED_RESCHED_LAZY = 0x02, TRACE_FLAG_NEED_RESCHED = 0x04, TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3ef047ed9705..be62f0ea1814 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2552,6 +2552,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) trace_flags |= TRACE_FLAG_NEED_RESCHED; if (test_preempt_need_resched()) trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; + if (IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY) && tif_test_bit(TIF_NEED_RESCHED_LAZY)) + trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY; return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) | (min_t(unsigned int, migration_disable_value(), 0xf)) << 4; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index e08aee34ef63..da748b7cbc4d 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -462,17 +462,29 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) bh_off ? 'b' : '.'; - switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | + switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED)) { + case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED: + need_resched = 'B'; + break; case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED: need_resched = 'N'; break; + case TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED: + need_resched = 'L'; + break; + case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY: + need_resched = 'b'; + break; case TRACE_FLAG_NEED_RESCHED: need_resched = 'n'; break; case TRACE_FLAG_PREEMPT_RESCHED: need_resched = 'p'; break; + case TRACE_FLAG_NEED_RESCHED_LAZY: + need_resched = 'l'; + break; default: need_resched = '.'; break; -- cgit v1.2.3