From afd2627f727b89496d79a6b934a025fc916d4ded Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Dec 2024 21:41:22 -0500 Subject: tracing: Check "%s" dereference via the field and not the TP_printk format The TP_printk() portion of a trace event is executed at the time an event is read from the trace. This can happen seconds, minutes, hours, days, months, years possibly later since the event was recorded. If the print format contains a dereference to a string via "%s", and that string was allocated, there's a chance that string could be freed before it is read by the trace file. To protect against such bugs, there are two functions that verify the event. The first one is test_event_printk(), which is called when the event is created. It reads the TP_printk() format as well as its arguments to make sure nothing may be dereferencing a pointer that was not copied into the ring buffer along with the event. If it is, it will trigger a WARN_ON(). For strings that use "%s", it is not so easy. The string may not reside in the ring buffer but may still be valid. Strings that are static and part of the kernel proper which will not be freed for the life of the running system, are safe to dereference. But to know if it is a pointer to a static string or to something on the heap can not be determined until the event is triggered. This brings us to the second function that tests for the bad dereferencing of strings, trace_check_vprintf(). It would walk through the printf format looking for "%s", and when it finds it, it would validate that the pointer is safe to read. If not, it would produce a WARN_ON() as well and write into the ring buffer "[UNSAFE-MEMORY]". The problem with this is how it used va_list to have vsnprintf() handle all the cases that it didn't need to check. 
Instead of re-implementing vsnprintf(), it would make a copy of the format up to the %s part, and call vsnprintf() with the current va_list ap variable, where the ap would then be ready to point at the string in question. For architectures that passed va_list by reference this was possible. For architectures that passed it by copy it was not. A test_can_verify() function was used to differentiate between the two, and if it wasn't possible, it would disable it. Even for architectures where this was feasible, it was a stretch to rely on such a method that is undocumented, and could cause issues later on with new optimizations of the compiler. Instead, the first function test_event_printk() was updated to look at "%s" as well. If the "%s" argument is a pointer outside the event in the ring buffer, it would find the field type of the event that is the problem and mark the structure with a new flag called "needs_test". The event itself will be marked by TRACE_EVENT_FL_TEST_STR to let it be known that this event has a field that needs to be verified before the event can be printed using the printf format. When the event fields are created from the field type structure, the fields would copy the field type's "needs_test" value. Finally, before being printed, a new function ignore_event() is called which will check if the event has the TEST_STR flag set (if not, it returns false). If the flag is set, it then iterates through the event's fields looking for the ones that have the "needs_test" flag set. Then it uses the offset field from the field structure to find the pointer in the ring buffer event. It runs the tests to make sure that pointer is safe to print and if not, it triggers the WARN_ON() and also adds to the trace output that the event in question has an unsafe memory access. The ignore_event() makes the trace_check_vprintf() obsolete so it is removed. 
Link: https://lore.kernel.org/all/CAHk-=wh3uOnqnZPpR0PeLZZtyWbZLboZ7cHLCKRWsocvs9Y7hQ@mail.gmail.com/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Al Viro Cc: Linus Torvalds Link: https://lore.kernel.org/20241217024720.848621576@goodmis.org Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers") Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux/trace_events.h') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 2a5df5b62cfc..91b8ffbdfa8c 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -273,7 +273,8 @@ struct trace_event_fields { const char *name; const int size; const int align; - const int is_signed; + const unsigned int is_signed:1; + unsigned int needs_test:1; const int filter_type; const int len; }; @@ -324,6 +325,7 @@ enum { TRACE_EVENT_FL_EPROBE_BIT, TRACE_EVENT_FL_FPROBE_BIT, TRACE_EVENT_FL_CUSTOM_BIT, + TRACE_EVENT_FL_TEST_STR_BIT, }; /* @@ -340,6 +342,7 @@ enum { * CUSTOM - Event is a custom event (to be attached to an exsiting tracepoint) * This is set when the custom event has not been attached * to a tracepoint yet, then it is cleared when it is. 
+ * TEST_STR - The event has a "%s" that points to a string outside the event */ enum { TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), @@ -352,6 +355,7 @@ enum { TRACE_EVENT_FL_EPROBE = (1 << TRACE_EVENT_FL_EPROBE_BIT), TRACE_EVENT_FL_FPROBE = (1 << TRACE_EVENT_FL_FPROBE_BIT), TRACE_EVENT_FL_CUSTOM = (1 << TRACE_EVENT_FL_CUSTOM_BIT), + TRACE_EVENT_FL_TEST_STR = (1 << TRACE_EVENT_FL_TEST_STR_BIT), }; #define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE) -- cgit v1.2.3 From 452f4b31e3f70a52b97890888eeb9eaa9a87139a Mon Sep 17 00:00:00 2001 From: Christian Göttsche Date: Mon, 25 Nov 2024 11:50:25 +0100 Subject: tracing: Constify string literal data member in struct trace_event_call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The name member of the struct trace_event_call is assigned with generated string literals; declare them pointer to read-only. Reported by clang: security/landlock/syscalls.c:179:1: warning: initializing 'char *' with an expression of type 'const char[34]' discards qualifiers [-Wincompatible-pointer-types-discards-qualifiers] 179 | SYSCALL_DEFINE3(landlock_create_ruleset, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 180 | const struct landlock_ruleset_attr __user *const, attr, | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 181 | const size_t, size, const __u32, flags) | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:226:36: note: expanded from macro 'SYSCALL_DEFINE3' 226 | #define SYSCALL_DEFINE3(name, ...) 
SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:234:2: note: expanded from macro 'SYSCALL_DEFINEx' 234 | SYSCALL_METADATA(sname, x, __VA_ARGS__) \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:184:2: note: expanded from macro 'SYSCALL_METADATA' 184 | SYSCALL_TRACE_ENTER_EVENT(sname); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:151:30: note: expanded from macro 'SYSCALL_TRACE_ENTER_EVENT' 151 | .name = "sys_enter"#sname, \ | ^~~~~~~~~~~~~~~~~ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mickaël Salaün Cc: Günther Noack Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Bill Wendling Cc: Justin Stitt Link: https://lore.kernel.org/20241125105028.42807-1-cgoettsche@seltendoof.de Fixes: b77e38aa240c3 ("tracing: add event trace infrastructure") Signed-off-by: Christian Göttsche Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/trace_events.h') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 91b8ffbdfa8c..58ad4ead33fc 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -364,7 +364,7 @@ struct trace_event_call { struct list_head list; struct trace_event_class *class; union { - char *name; + const char *name; /* Set TRACE_EVENT_FL_TRACEPOINT flag when using "tp" */ struct tracepoint *tp; }; -- cgit v1.2.3 From 1bd13edbbed6e7e396f1aab92b224a4775218e68 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Fri, 27 Dec 2024 13:07:57 +0900 Subject: tracing/hist: Add poll(POLLIN) support on hist file Add poll syscall support on the `hist` file. The waiter will be woken up when the histogram is updated with POLLIN. Currently, there is no way to wait for a specific event in userspace. So user needs to peek the `trace` periodically, or wait on `trace_pipe`. 
But it is not a good idea to peek at the `trace` for an event that randomly happens. And `trace_pipe` is not coming back until a page is filled with events. This allows a user to wait for a specific event on the `hist` file. User can set a histogram trigger on the event which they want to monitor and poll() on its `hist` file. Since this poll() returns POLLIN, the next poll() will return soon unless a read() happens on that hist file. NOTE: To read the hist file again, you must set the file offset to 0, but just for monitoring the event, you may not need to read the histogram. Cc: Shuah Khan Cc: Mathieu Desnoyers Link: https://lore.kernel.org/173527247756.464571.14236296701625509931.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 14 ++++++++ kernel/trace/trace_events.c | 14 ++++++++ kernel/trace/trace_events_hist.c | 70 ++++++++++++++++++++++++++++++++++++++-- 3 files changed, 95 insertions(+), 3 deletions(-) (limited to 'include/linux/trace_events.h') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 91b8ffbdfa8c..02cde1174487 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -673,6 +673,20 @@ struct trace_event_file { atomic_t tm_ref; /* trigger-mode reference counter */ }; +#ifdef CONFIG_HIST_TRIGGERS +extern struct irq_work hist_poll_work; +extern wait_queue_head_t hist_poll_wq; + +static inline void hist_poll_wakeup(void) +{ + if (wq_has_sleeper(&hist_poll_wq)) + irq_work_queue(&hist_poll_work); +} + +#define hist_poll_wait(file, wait) \ + poll_wait(file, &hist_poll_wq, wait) +#endif + #define __TRACE_EVENT_FLAGS(name, value) \ static int __init trace_init_flags_##name(void) \ { \ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 047d2775184b..2b9222e7bd5a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3094,6 +3094,20 @@ static bool 
event_in_systems(struct trace_event_call *call, return !*p || isspace(*p) || *p == ','; } +#ifdef CONFIG_HIST_TRIGGERS +/* + * Wake up waiter on the hist_poll_wq from irq_work because the hist trigger + * may happen in any context. + */ +static void hist_poll_event_irq_work(struct irq_work *work) +{ + wake_up_all(&hist_poll_wq); +} + +DEFINE_IRQ_WORK(hist_poll_work, hist_poll_event_irq_work); +DECLARE_WAIT_QUEUE_HEAD(hist_poll_wq); +#endif + static struct trace_event_file * trace_create_new_event(struct trace_event_call *call, struct trace_array *tr) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 879b58892b9d..af4be28f01e0 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5311,6 +5311,8 @@ static void event_hist_trigger(struct event_trigger_data *data, if (resolve_var_refs(hist_data, key, var_ref_vals, true)) hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals); + + hist_poll_wakeup(); } static void hist_trigger_stacktrace_print(struct seq_file *m, @@ -5590,15 +5592,36 @@ static void hist_trigger_show(struct seq_file *m, n_entries, (u64)atomic64_read(&hist_data->map->drops)); } +struct hist_file_data { + struct file *file; + u64 last_read; +}; + +static u64 get_hist_hit_count(struct trace_event_file *event_file) +{ + struct hist_trigger_data *hist_data; + struct event_trigger_data *data; + u64 ret = 0; + + list_for_each_entry(data, &event_file->triggers, list) { + if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) { + hist_data = data->private_data; + ret += atomic64_read(&hist_data->map->hits); + } + } + return ret; +} + static int hist_show(struct seq_file *m, void *v) { + struct hist_file_data *hist_file = m->private; struct event_trigger_data *data; struct trace_event_file *event_file; int n = 0; guard(mutex)(&event_mutex); - event_file = event_file_file(m->private); + event_file = event_file_file(hist_file->file); if (unlikely(!event_file)) return -ENODEV; @@ 
-5606,27 +5629,68 @@ static int hist_show(struct seq_file *m, void *v) if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) hist_trigger_show(m, data, n++); } + hist_file->last_read = get_hist_hit_count(event_file); + return 0; } +static __poll_t event_hist_poll(struct file *file, struct poll_table_struct *wait) +{ + struct trace_event_file *event_file; + struct seq_file *m = file->private_data; + struct hist_file_data *hist_file = m->private; + + guard(mutex)(&event_mutex); + + event_file = event_file_data(file); + if (!event_file) + return EPOLLERR; + + hist_poll_wait(file, wait); + + if (hist_file->last_read != get_hist_hit_count(event_file)) + return EPOLLIN | EPOLLRDNORM; + + return 0; +} + +static int event_hist_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + struct hist_file_data *hist_file = m->private; + + kfree(hist_file); + return tracing_single_release_file_tr(inode, file); +} + static int event_hist_open(struct inode *inode, struct file *file) { + struct hist_file_data *hist_file; int ret; ret = tracing_open_file_tr(inode, file); if (ret) return ret; + hist_file = kzalloc(sizeof(*hist_file), GFP_KERNEL); + if (!hist_file) + return -ENOMEM; + hist_file->file = file; + /* Clear private_data to avoid warning in single_open() */ file->private_data = NULL; - return single_open(file, hist_show, file); + ret = single_open(file, hist_show, hist_file); + if (ret) + kfree(hist_file); + return ret; } const struct file_operations event_hist_fops = { .open = event_hist_open, .read = seq_read, .llseek = seq_lseek, - .release = tracing_single_release_file_tr, + .release = event_hist_release, + .poll = event_hist_poll, }; #ifdef CONFIG_HIST_TRIGGERS_DEBUG -- cgit v1.2.3