From 5790b1fb3d672d9a1fe3881a7181dfdbe741568f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 4 Oct 2023 16:50:07 -0400 Subject: eventfs: Remove eventfs_file and just use eventfs_inode Instead of having a descriptor for every file represented in the eventfs directory, only have the directory itself represented. Change the API to send in a list of entries that represent all the files in the directory (but not other directories). The entry list contains a name and a callback function that will be used to create the files when they are accessed. struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent, const struct eventfs_entry *entries, int size, void *data); is used for the top level eventfs directory, and returns an eventfs_inode that will be used by: struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent, const struct eventfs_entry *entries, int size, void *data); where both of the above take an array of struct eventfs_entry entries for every file that is in the directory. The entries are defined by: typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data, const struct file_operations **fops); struct eventfs_entry { const char *name; eventfs_callback callback; }; Where the name is the name of the file and the callback gets called when the file is being created. The callback passes in the name (in case the same callback is used for multiple files), a pointer to the mode, data and fops. The data will be pointing to the data that was passed in eventfs_create_dir() or eventfs_create_events_dir() but may be overridden to point to something else, as it will be used to point to the inode->i_private that is created. The information passed back from the callback is used to create the dentry/inode. If the callback fills the data and the file should be created, it must return a positive number. On zero or negative, the file is ignored. This logic may also be used as a prototype to convert entire pseudo file systems into just-in-time allocation. The "show_events_dentry" file has been updated to show the directories, and any files they have. With just the eventfs_file allocations: Before after deltas for meminfo (in kB): MemFree: -14360 MemAvailable: -14260 Buffers: 40 Cached: 24 Active: 44 Inactive: 48 Inactive(anon): 28 Active(file): 44 Inactive(file): 20 Dirty: -4 AnonPages: 28 Mapped: 4 KReclaimable: 132 Slab: 1604 SReclaimable: 132 SUnreclaim: 1472 Committed_AS: 12 Before after deltas for slabinfo: : [ * = ] ext4_inode_cache 27 [* 1184 = 31968 ] extent_status 102 [* 40 = 4080 ] tracefs_inode_cache 144 [* 656 = 94464 ] buffer_head 39 [* 104 = 4056 ] shmem_inode_cache 49 [* 800 = 39200 ] filp -53 [* 256 = -13568 ] dentry 251 [* 192 = 48192 ] lsm_file_cache 277 [* 32 = 8864 ] vm_area_struct -14 [* 184 = -2576 ] trace_event_file 1748 [* 88 = 153824 ] kmalloc-1k 35 [* 1024 = 35840 ] kmalloc-256 49 [* 256 = 12544 ] kmalloc-192 -28 [* 192 = -5376 ] kmalloc-128 -30 [* 128 = -3840 ] kmalloc-96 10581 [* 96 = 1015776 ] kmalloc-64 3056 [* 64 = 195584 ] kmalloc-32 1291 [* 32 = 41312 ] kmalloc-16 2310 [* 16 = 36960 ] kmalloc-8 9216 [* 8 = 73728 ] Free memory dropped by 14,360 kB Available memory dropped by 14,260 kB Total slab additions in size: 1,771,032 bytes With this change: Before after deltas for meminfo (in kB): MemFree: -12084 MemAvailable: -11976 Buffers: 32 Cached: 32 Active: 72 Inactive: 168 Inactive(anon): 176 Active(file): 72 Inactive(file): -8 Dirty: 24 AnonPages: 196 Mapped: 8 KReclaimable: 148 Slab: 836 SReclaimable: 148 SUnreclaim: 688 Committed_AS: 324 Before after deltas for slabinfo: : [ * = ] tracefs_inode_cache 144 [* 656 = 94464 ] shmem_inode_cache -23 [* 800 = -18400 ] filp -92 [* 256 = -23552 ] dentry 179 [* 192 = 34368 ] lsm_file_cache -3 [* 32 = -96 ] vm_area_struct -13 [* 184 = -2392 ] trace_event_file 1748 [* 88 = 153824 ] kmalloc-1k -49 [* 1024 = -50176 ] kmalloc-256 -27 [* 256 = -6912 ] kmalloc-128 1864 [* 128 = 238592 ] kmalloc-64 4685 [* 64 = 299840 ] kmalloc-32 -72 [* 32 = -2304 ] kmalloc-16 256 [* 16 = 4096 ] total = 721352 Free memory dropped by 12,084 kB Available memory dropped by 11,976 kB Total slab additions in size: 721,352 bytes That's over 2 MB in savings per instance for free and available memory, and over 1 MB in savings per instance of slab memory. Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Cc: Ajay Kaher Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 2 +- include/linux/tracefs.h | 29 ++++++++++++++--------------- 2 files changed, 15 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 21ae37e49319..12207dc6722d 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -649,7 +649,7 @@ struct trace_event_file { struct list_head list; struct trace_event_call *event_call; struct event_filter __rcu *filter; - struct eventfs_file *ef; + struct eventfs_inode *ei; struct trace_array *tr; struct trace_subsystem_dir *system; struct list_head triggers; diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h index 009072792fa3..0c39704455d9 100644 --- a/include/linux/tracefs.h +++ b/include/linux/tracefs.h @@ -23,26 +23,25 @@ struct file_operations; struct eventfs_file; -struct dentry *eventfs_create_events_dir(const char *name, - struct dentry *parent); +typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data, + const struct file_operations **fops); -struct eventfs_file *eventfs_add_subsystem_dir(const char *name, - struct dentry *parent); +struct eventfs_entry { + const char *name; + eventfs_callback callback; +}; -struct eventfs_file *eventfs_add_dir(const char *name, - struct eventfs_file *ef_parent); +struct eventfs_inode; -int eventfs_add_file(const char *name, umode_t mode, - struct eventfs_file *ef_parent, void *data, - const struct file_operations *fops); +struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent, + const struct eventfs_entry *entries, + int size, void *data); -int eventfs_add_events_file(const char *name, umode_t mode, - struct dentry *parent, void *data, - const struct file_operations *fops); +struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent, + const struct eventfs_entry *entries, + int size, void *data); -void eventfs_remove(struct eventfs_file *ef); - -void eventfs_remove_events_dir(struct dentry *dentry); +void eventfs_remove_dir(struct eventfs_inode *ei); struct dentry *tracefs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, -- cgit v1.2.3 From 2819f23ac12ce93ff79ca7a54597df9a4a1f6331 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 5 Oct 2023 09:13:48 -0400 Subject: eventfs: Use eventfs_remove_events_dir() The update to removing the eventfs_file changed the way the events top level directory was handled. Instead of returning a dentry, it now returns the eventfs_inode. In this changed, the removing of the events top level directory is not much different than removing any of the other directories. Because of this, the removal just called eventfs_remove_dir() instead of eventfs_remove_events_dir(). Although eventfs_remove_dir() does the clean up, it misses out on the dget() of the ei->dentry done in eventfs_create_events_dir(). It makes more sense to match eventfs_create_events_dir() with a specific function eventfs_remove_events_dir() and this specific function can then perform the dput() to the dentry that had the dget() when it was created. Fixes: 5790b1fb3d67 ("eventfs: Remove eventfs_file and just use eventfs_inode") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202310051743.y9EobbUr-lkp@intel.com/ Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 19 +++++++------------ include/linux/tracefs.h | 1 + kernel/trace/trace_events.c | 2 +- 3 files changed, 9 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index eab18b157ef5..1ccd100bc565 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -901,22 +901,17 @@ void eventfs_remove_dir(struct eventfs_inode *ei) } /** - * eventfs_remove_events_dir - remove eventfs dir or file from list - * @dentry: events's dentry to be removed. + * eventfs_remove_events_dir - remove the top level eventfs directory + * @ei: the event_inode returned by eventfs_create_events_dir(). * - * This function remove events main directory + * This function removes the events main directory */ -void eventfs_remove_events_dir(struct dentry *dentry) +void eventfs_remove_events_dir(struct eventfs_inode *ei) { - struct tracefs_inode *ti; - - if (!dentry || !dentry->d_inode) - return; + struct dentry *dentry = ei->dentry; - ti = get_tracefs(dentry->d_inode); - if (!ti || !(ti->flags & TRACEFS_EVENT_INODE)) - return; + eventfs_remove_dir(ei); - d_invalidate(dentry); + /* Matches the dget() from eventfs_create_events_dir() */ dput(dentry); } diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h index 0c39704455d9..13359b1a35d1 100644 --- a/include/linux/tracefs.h +++ b/include/linux/tracefs.h @@ -41,6 +41,7 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode const struct eventfs_entry *entries, int size, void *data); +void eventfs_remove_events_dir(struct eventfs_inode *ei); void eventfs_remove_dir(struct eventfs_inode *ei); struct dentry *tracefs_create_file(const char *name, umode_t mode, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a3b9d9423824..0e3a1c70e410 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3872,7 +3872,7 @@ int event_trace_del_tracer(struct trace_array *tr) down_write(&trace_event_sem); __trace_remove_event_dirs(tr); - eventfs_remove_dir(tr->event_dir); + eventfs_remove_events_dir(tr->event_dir); up_write(&trace_event_sem); tr->event_dir = NULL; -- cgit v1.2.3 From d0ed46b60396cfa7e0056f55e1ce0b43c7db57b6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 20 Oct 2023 04:35:45 +0100 Subject: tracing: Move readpos from seq_buf to trace_seq To make seq_buf more lightweight as a string buf, move the readpos member from seq_buf to its container, trace_seq. That puts the responsibility of maintaining the readpos entirely in the tracing code. If some future users want to package up the readpos with a seq_buf, we can define a new struct then. Link: https://lore.kernel.org/linux-trace-kernel/20231020033545.2587554-2-willy@infradead.org Cc: Kees Cook Cc: Justin Stitt Cc: Kent Overstreet Cc: Petr Mladek Cc: Andy Shevchenko Cc: Rasmus Villemoes Cc: Sergey Senozhatsky Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Greg Kroah-Hartman Signed-off-by: Steven Rostedt (Google) --- include/linux/seq_buf.h | 5 +---- include/linux/trace_seq.h | 2 ++ kernel/trace/trace.c | 10 +++++----- kernel/trace/trace_seq.c | 6 +++++- lib/seq_buf.c | 22 ++++++++++------------ 5 files changed, 23 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index 515d7fcb9634..a0fb013cebdf 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -14,19 +14,16 @@ * @buffer: pointer to the buffer * @size: size of the buffer * @len: the amount of data inside the buffer - * @readpos: The next position to read in the buffer. */ struct seq_buf { char *buffer; size_t size; size_t len; - loff_t readpos; }; static inline void seq_buf_clear(struct seq_buf *s) { s->len = 0; - s->readpos = 0; } static inline void @@ -143,7 +140,7 @@ extern __printf(2, 0) int seq_buf_vprintf(struct seq_buf *s, const char *fmt, va_list args); extern int seq_buf_print_seq(struct seq_file *m, struct seq_buf *s); extern int seq_buf_to_user(struct seq_buf *s, char __user *ubuf, - int cnt); + size_t start, int cnt); extern int seq_buf_puts(struct seq_buf *s, const char *str); extern int seq_buf_putc(struct seq_buf *s, unsigned char c); extern int seq_buf_putmem(struct seq_buf *s, const void *mem, unsigned int len); diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 6be92bf559fe..3691e0e76a1a 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -14,6 +14,7 @@ struct trace_seq { char buffer[PAGE_SIZE]; struct seq_buf seq; + size_t readpos; int full; }; @@ -22,6 +23,7 @@ trace_seq_init(struct trace_seq *s) { seq_buf_init(&s->seq, s->buffer, PAGE_SIZE); s->full = 0; + s->readpos = 0; } /** diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4383be8fa1b0..d629065c2383 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1731,15 +1731,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) { int len; - if (trace_seq_used(s) <= s->seq.readpos) + if (trace_seq_used(s) <= s->readpos) return -EBUSY; - len = trace_seq_used(s) - s->seq.readpos; + len = trace_seq_used(s) - s->readpos; if (cnt > len) cnt = len; - memcpy(buf, s->buffer + s->seq.readpos, cnt); + memcpy(buf, s->buffer + s->readpos, cnt); - s->seq.readpos += cnt; + s->readpos += cnt; return cnt; } @@ -7008,7 +7008,7 @@ waitagain: /* Now copy what we have to the user */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); - if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq)) + if (iter->seq.readpos >= trace_seq_used(&iter->seq)) trace_seq_init(&iter->seq); /* diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index bac06ee3b98b..7be97229ddf8 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -370,8 +370,12 @@ EXPORT_SYMBOL_GPL(trace_seq_path); */ int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) { + int ret; __trace_seq_init(s); - return seq_buf_to_user(&s->seq, ubuf, cnt); + ret = seq_buf_to_user(&s->seq, ubuf, s->readpos, cnt); + if (ret > 0) + s->readpos += ret; + return ret; } EXPORT_SYMBOL_GPL(trace_seq_to_user); diff --git a/lib/seq_buf.c b/lib/seq_buf.c index 45c450f423fa..b7477aefff53 100644 --- a/lib/seq_buf.c +++ b/lib/seq_buf.c @@ -324,23 +324,24 @@ int seq_buf_path(struct seq_buf *s, const struct path *path, const char *esc) * seq_buf_to_user - copy the sequence buffer to user space * @s: seq_buf descriptor * @ubuf: The userspace memory location to copy to + * @start: The first byte in the buffer to copy * @cnt: The amount to copy * * Copies the sequence buffer into the userspace memory pointed to - * by @ubuf. It starts from the last read position (@s->readpos) - * and writes up to @cnt characters or till it reaches the end of - * the content in the buffer (@s->len), which ever comes first. + * by @ubuf. It starts from @start and writes up to @cnt characters + * or until it reaches the end of the content in the buffer (@s->len), + * whichever comes first. * * On success, it returns a positive number of the number of bytes * it copied. * * On failure it returns -EBUSY if all of the content in the * sequence has been already read, which includes nothing in the - * sequence (@s->len == @s->readpos). + * sequence (@s->len == @start). * * Returns -EFAULT if the copy to userspace fails. */ -int seq_buf_to_user(struct seq_buf *s, char __user *ubuf, int cnt) +int seq_buf_to_user(struct seq_buf *s, char __user *ubuf, size_t start, int cnt) { int len; int ret; @@ -350,20 +351,17 @@ int seq_buf_to_user(struct seq_buf *s, char __user *ubuf, int cnt) len = seq_buf_used(s); - if (len <= s->readpos) + if (len <= start) return -EBUSY; - len -= s->readpos; + len -= start; if (cnt > len) cnt = len; - ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); + ret = copy_to_user(ubuf, s->buffer + start, cnt); if (ret == cnt) return -EFAULT; - cnt -= ret; - - s->readpos += cnt; - return cnt; + return cnt - ret; } /** -- cgit v1.2.3 From 845e31e1101fc8533be52aff42d8f1ff48636024 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Fri, 20 Oct 2023 14:38:49 -0600 Subject: seq_buf: fix a misleading comment The comment for seq_buf_has_overflowed() says that an overflow condition is marked by len == size, but that's not what the code is testing. Make the comment match reality. Link: https://lkml.kernel.org/r/87pm19kp0m.fsf@meer.lwn.net Fixes: 8cd709ae7658a ("tracing: Have seq_buf use full buffer") Signed-off-by: Jonathan Corbet Signed-off-by: Steven Rostedt (Google) --- include/linux/seq_buf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index a0fb013cebdf..8483e4b2d0d2 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -36,7 +36,7 @@ seq_buf_init(struct seq_buf *s, char *buf, unsigned int size) /* * seq_buf have a buffer that might overflow. When this happens - * the len and size are set to be equal. + * len is set to be greater than size. */ static inline bool seq_buf_has_overflowed(struct seq_buf *s) -- cgit v1.2.3 From dcc4e5728eeaeda84878ca0018758cff1abfca21 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 27 Oct 2023 08:56:38 -0700 Subject: seq_buf: Introduce DECLARE_SEQ_BUF and seq_buf_str() Solve two ergonomic issues with struct seq_buf; 1) Too much boilerplate is required to initialize: struct seq_buf s; char buf[32]; seq_buf_init(s, buf, sizeof(buf)); Instead, we can build this directly on the stack. Provide DECLARE_SEQ_BUF() macro to do this: DECLARE_SEQ_BUF(s, 32); 2) %NUL termination is fragile and requires 2 steps to get a valid C String (and is a layering violation exposing the "internals" of seq_buf): seq_buf_terminate(s); do_something(s->buffer); Instead, we can just return s->buffer directly after terminating it in the refactored seq_buf_terminate(), now known as seq_buf_str(): do_something(seq_buf_str(s)); Link: https://lore.kernel.org/linux-trace-kernel/20231027155634.make.260-kees@kernel.org Link: https://lore.kernel.org/linux-trace-kernel/20231026194033.it.702-kees@kernel.org/ Cc: Yosry Ahmed Cc: "Matthew Wilcox (Oracle)" Cc: Christoph Hellwig Cc: Justin Stitt Cc: Kent Overstreet Cc: Petr Mladek Cc: Andy Shevchenko Cc: Rasmus Villemoes Cc: Sergey Senozhatsky Cc: Masami Hiramatsu Cc: Greg Kroah-Hartman Cc: Arnd Bergmann Cc: Jonathan Corbet Cc: Yun Zhou Cc: Jacob Keller Cc: Zhen Lei Signed-off-by: Kees Cook Signed-off-by: Steven Rostedt (Google) --- include/linux/seq_buf.h | 21 +++++++++++++++++---- kernel/trace/trace.c | 11 +---------- lib/seq_buf.c | 4 +--- 3 files changed, 19 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index 8483e4b2d0d2..5fb1f12c33f9 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -21,9 +21,18 @@ struct seq_buf { size_t len; }; +#define DECLARE_SEQ_BUF(NAME, SIZE) \ + char __ ## NAME ## _buffer[SIZE] = ""; \ + struct seq_buf NAME = { \ + .buffer = &__ ## NAME ## _buffer, \ + .size = SIZE, \ + } + static inline void seq_buf_clear(struct seq_buf *s) { s->len = 0; + if (s->size) + s->buffer[0] = '\0'; } static inline void @@ -69,8 +78,8 @@ static inline unsigned int seq_buf_used(struct seq_buf *s) } /** - * seq_buf_terminate - Make sure buffer is nul terminated - * @s: the seq_buf descriptor to terminate. + * seq_buf_str - get %NUL-terminated C string from seq_buf + * @s: the seq_buf handle * * This makes sure that the buffer in @s is nul terminated and * safe to read as a string. @@ -81,16 +90,20 @@ static inline unsigned int seq_buf_used(struct seq_buf *s) * * After this function is called, s->buffer is safe to use * in string operations. + * + * Returns @s->buf after making sure it is terminated. */ -static inline void seq_buf_terminate(struct seq_buf *s) +static inline const char *seq_buf_str(struct seq_buf *s) { if (WARN_ON(s->size == 0)) - return; + return ""; if (seq_buf_buffer_left(s)) s->buffer[s->len] = 0; else s->buffer[s->size - 1] = 0; + + return s->buffer; } /** diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d629065c2383..2539cfc20a97 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3828,15 +3828,6 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str, return false; } -static const char *show_buffer(struct trace_seq *s) -{ - struct seq_buf *seq = &s->seq; - - seq_buf_terminate(seq); - - return seq->buffer; -} - static DEFINE_STATIC_KEY_FALSE(trace_no_verify); static int test_can_verify_check(const char *fmt, ...) @@ -3976,7 +3967,7 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, */ if (WARN_ONCE(!trace_safe_str(iter, str, star, len), "fmt: '%s' current_buffer: '%s'", - fmt, show_buffer(&iter->seq))) { + fmt, seq_buf_str(&iter->seq.seq))) { int ret; /* Try to safely read the string */ diff --git a/lib/seq_buf.c b/lib/seq_buf.c index b7477aefff53..23518f77ea9c 100644 --- a/lib/seq_buf.c +++ b/lib/seq_buf.c @@ -109,9 +109,7 @@ void seq_buf_do_printk(struct seq_buf *s, const char *lvl) if (s->size == 0 || s->len == 0) return; - seq_buf_terminate(s); - - start = s->buffer; + start = seq_buf_str(s); while ((lf = strchr(start, '\n'))) { int len = lf - start + 1; -- cgit v1.2.3 From bb32500fb9b78215e4ef6ee8b4345c5f5d7eafb4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 31 Oct 2023 12:24:53 -0400 Subject: tracing: Have trace_event_file have ref counters The following can crash the kernel: # cd /sys/kernel/tracing # echo 'p:sched schedule' > kprobe_events # exec 5>>events/kprobes/sched/enable # > kprobe_events # exec 5>&- The above commands: 1. Change directory to the tracefs directory 2. Create a kprobe event (doesn't matter what one) 3. Open bash file descriptor 5 on the enable file of the kprobe event 4. Delete the kprobe event (removes the files too) 5. Close the bash file descriptor 5 The above causes a crash! BUG: kernel NULL pointer dereference, address: 0000000000000028 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 6 PID: 877 Comm: bash Not tainted 6.5.0-rc4-test-00008-g2c6b6b1029d4-dirty #186 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 RIP: 0010:tracing_release_file_tr+0xc/0x50 What happens here is that the kprobe event creates a trace_event_file "file" descriptor that represents the file in tracefs to the event. It maintains state of the event (is it enabled for the given instance?). Opening the "enable" file gets a reference to the event "file" descriptor via the open file descriptor. When the kprobe event is deleted, the file is also deleted from the tracefs system which also frees the event "file" descriptor. But as the tracefs file is still opened by user space, it will not be totally removed until the final dput() is called on it. But this is not true with the event "file" descriptor that is already freed. If the user does a write to or simply closes the file descriptor it will reference the event "file" descriptor that was just freed, causing a use-after-free bug. To solve this, add a ref count to the event "file" descriptor as well as a new flag called "FREED". The "file" will not be freed until the last reference is released. But the FREE flag will be set when the event is removed to prevent any more modifications to that event from happening, even if there's still a reference to the event "file" descriptor. Link: https://lore.kernel.org/linux-trace-kernel/20231031000031.1e705592@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20231031122453.7a48b923@gandalf.local.home Cc: stable@vger.kernel.org Cc: Mark Rutland Fixes: f5ca233e2e66d ("tracing: Increase trace array ref count on enable and filter files") Reported-by: Beau Belgrave Tested-by: Beau Belgrave Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 4 ++++ kernel/trace/trace.c | 15 +++++++++++++++ kernel/trace/trace.h | 3 +++ kernel/trace/trace_events.c | 31 +++++++++++++++++++++++++++---- kernel/trace/trace_events_filter.c | 3 +++ 5 files changed, 52 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 12207dc6722d..696f8dc4aa53 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -492,6 +492,7 @@ enum { EVENT_FILE_FL_TRIGGER_COND_BIT, EVENT_FILE_FL_PID_FILTER_BIT, EVENT_FILE_FL_WAS_ENABLED_BIT, + EVENT_FILE_FL_FREED_BIT, }; extern struct trace_event_file *trace_get_event_file(const char *instance, @@ -630,6 +631,7 @@ extern int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...); * TRIGGER_COND - When set, one or more triggers has an associated filter * PID_FILTER - When set, the event is filtered based on pid * WAS_ENABLED - Set when enabled to know to clear trace on module removal + * FREED - File descriptor is freed, all fields should be considered invalid */ enum { EVENT_FILE_FL_ENABLED = (1 << EVENT_FILE_FL_ENABLED_BIT), @@ -643,6 +645,7 @@ enum { EVENT_FILE_FL_TRIGGER_COND = (1 << EVENT_FILE_FL_TRIGGER_COND_BIT), EVENT_FILE_FL_PID_FILTER = (1 << EVENT_FILE_FL_PID_FILTER_BIT), EVENT_FILE_FL_WAS_ENABLED = (1 << EVENT_FILE_FL_WAS_ENABLED_BIT), + EVENT_FILE_FL_FREED = (1 << EVENT_FILE_FL_FREED_BIT), }; struct trace_event_file { @@ -671,6 +674,7 @@ struct trace_event_file { * caching and such. Which is mostly OK ;-) */ unsigned long flags; + atomic_t ref; /* ref count for opened files */ atomic_t sm_ref; /* soft-mode reference counter */ atomic_t tm_ref; /* trigger-mode reference counter */ }; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2539cfc20a97..9aebf904ff97 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4978,6 +4978,20 @@ int tracing_open_file_tr(struct inode *inode, struct file *filp) if (ret) return ret; + mutex_lock(&event_mutex); + + /* Fail if the file is marked for removal */ + if (file->flags & EVENT_FILE_FL_FREED) { + trace_array_put(file->tr); + ret = -ENODEV; + } else { + event_file_get(file); + } + + mutex_unlock(&event_mutex); + if (ret) + return ret; + filp->private_data = inode->i_private; return 0; @@ -4988,6 +5002,7 @@ int tracing_release_file_tr(struct inode *inode, struct file *filp) struct trace_event_file *file = inode->i_private; trace_array_put(file->tr); + event_file_put(file); return 0; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0e1405abf4f7..b7f4ea25a194 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1669,6 +1669,9 @@ extern void event_trigger_unregister(struct event_command *cmd_ops, char *glob, struct event_trigger_data *trigger_data); +extern void event_file_get(struct trace_event_file *file); +extern void event_file_put(struct trace_event_file *file); + /** * struct event_trigger_ops - callbacks for trace event triggers * diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f9e3e24d8796..f29e815ca5b2 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -990,13 +990,35 @@ static void remove_subsystem(struct trace_subsystem_dir *dir) } } +void event_file_get(struct trace_event_file *file) +{ + atomic_inc(&file->ref); +} + +void event_file_put(struct trace_event_file *file) +{ + if (WARN_ON_ONCE(!atomic_read(&file->ref))) { + if (file->flags & EVENT_FILE_FL_FREED) + kmem_cache_free(file_cachep, file); + return; + } + + if (atomic_dec_and_test(&file->ref)) { + /* Count should only go to zero when it is freed */ + if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED))) + return; + kmem_cache_free(file_cachep, file); + } +} + static void remove_event_file_dir(struct trace_event_file *file) { eventfs_remove_dir(file->ei); list_del(&file->list); remove_subsystem(file->system); free_event_filter(file->filter); - kmem_cache_free(file_cachep, file); + file->flags |= EVENT_FILE_FL_FREED; + event_file_put(file); } /* @@ -1369,7 +1391,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, flags = file->flags; mutex_unlock(&event_mutex); - if (!file) + if (!file || flags & EVENT_FILE_FL_FREED) return -ENODEV; if (flags & EVENT_FILE_FL_ENABLED && @@ -1403,7 +1425,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, ret = -ENODEV; mutex_lock(&event_mutex); file = event_file_data(filp); - if (likely(file)) { + if (likely(file && !(file->flags & EVENT_FILE_FL_FREED))) { ret = tracing_update_buffers(file->tr); if (ret < 0) { mutex_unlock(&event_mutex); @@ -1683,7 +1705,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); file = event_file_data(filp); - if (file) + if (file && !(file->flags & EVENT_FILE_FL_FREED)) print_event_filter(file, s); mutex_unlock(&event_mutex); @@ -2902,6 +2924,7 @@ trace_create_new_event(struct trace_event_call *call, atomic_set(&file->tm_ref, 0); INIT_LIST_HEAD(&file->triggers); list_add(&file->list, &tr->events); + event_file_get(file); return file; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 33264e510d16..0c611b281a5b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -2349,6 +2349,9 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) struct event_filter *filter = NULL; int err; + if (file->flags & EVENT_FILE_FL_FREED) + return -ENODEV; + if (!strcmp(strstrip(filter_string), "0")) { filter_disable(file); filter = event_filter(file); -- cgit v1.2.3 From 44365329f8219fc379097c2c9a75ff53f123764f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 1 Nov 2023 13:25:46 -0400 Subject: eventfs: Hold eventfs_mutex when calling callback functions The callback function that is used to create inodes and dentries is not protected by anything and the data that is passed to it could become stale. After eventfs_remove_dir() is called by the tracing system, it is free to remove the events that are associated to that directory. Unfortunately, that means the callbacks must not be called after that. CPU0 CPU1 ---- ---- eventfs_root_lookup() { eventfs_remove_dir() { mutex_lock(&event_mutex); ei->is_freed = set; mutex_unlock(&event_mutex); } kfree(event_call); for (...) { entry = &ei->entries[i]; r = entry->callback() { call = data; // call == event_call above if (call->flags ...) [ USE AFTER FREE BUG ] The safest way to protect this is to wrap the callback with: mutex_lock(&eventfs_mutex); if (!ei->is_freed) r = entry->callback(); else r = -1; mutex_unlock(&eventfs_mutex); This will make sure that the callback will not be called after it is freed. But now it needs to be known that the callback is called while holding internal eventfs locks, and that it must not call back into the eventfs / tracefs system. There's no reason it should anyway, but document that as well. Link: https://lore.kernel.org/all/CA+G9fYu9GOEbD=rR5eMR-=HJ8H6rMsbzDC2ZY5=Y50WpWAE7_Q@mail.gmail.com/ Link: https://lkml.kernel.org/r/20231101172649.906696613@goodmis.org Cc: Ajay Kaher Cc: Mark Rutland Cc: Andrew Morton Fixes: 5790b1fb3d672 ("eventfs: Remove eventfs_file and just use eventfs_inode") Reported-by: Linux Kernel Functional Testing Reported-by: Naresh Kamboju Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 22 ++++++++++++++++++++-- include/linux/tracefs.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 93d08e552483..8ac9abf7a3d5 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -615,7 +615,13 @@ static struct dentry *eventfs_root_lookup(struct inode *dir, entry = &ei->entries[i]; if (strcmp(name, entry->name) == 0) { void *cdata = data; - r = entry->callback(name, &mode, &cdata, &fops); + mutex_lock(&eventfs_mutex); + /* If ei->is_freed, then the event itself may be too */ + if (!ei->is_freed) + r = entry->callback(name, &mode, &cdata, &fops); + else + r = -1; + mutex_unlock(&eventfs_mutex); if (r <= 0) continue; ret = simple_lookup(dir, dentry, flags); @@ -749,7 +755,13 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file) void *cdata = data; entry = &ei->entries[i]; name = entry->name; - r = entry->callback(name, &mode, &cdata, &fops); + mutex_lock(&eventfs_mutex); + /* If ei->is_freed, then the event itself may be too */ + if (!ei->is_freed) + r = entry->callback(name, &mode, &cdata, &fops); + else + r = -1; + mutex_unlock(&eventfs_mutex); if (r <= 0) continue; d = create_file_dentry(ei, i, parent, name, mode, cdata, fops, false); @@ -819,6 +831,10 @@ static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx) * data = A pointer to @data, and the callback may replace it, which will * cause the file created to pass the new data to the open() call. * fops = the fops to use for the created file. + * + * NB. @callback is called while holding internal locks of the eventfs + * system. The callback must not call any code that might also call into + * the tracefs or eventfs system or it will risk creating a deadlock. */ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent, const struct eventfs_entry *entries, @@ -878,6 +894,8 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode * @data: The default data to pass to the files (an entry may override it). * * This function creates the top of the trace event directory. + * + * See eventfs_create_dir() for use of @entries. */ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent, const struct eventfs_entry *entries, diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h index 13359b1a35d1..7a5fe17b6bf9 100644 --- a/include/linux/tracefs.h +++ b/include/linux/tracefs.h @@ -23,9 +23,52 @@ struct file_operations; struct eventfs_file; +/** + * eventfs_callback - A callback function to create dynamic files in eventfs + * @name: The name of the file that is to be created + * @mode: return the file mode for the file (RW access, etc) + * @data: data to pass to the created file ops + * @fops: the file operations of the created file + * + * The evetnfs files are dynamically created. The struct eventfs_entry array + * is passed to eventfs_create_dir() or eventfs_create_events_dir() that will + * be used to create the files within those directories. When a lookup + * or access to a file within the directory is made, the struct eventfs_entry + * array is used to find a callback() with the matching name that is being + * referenced (for lookups, the entire array is iterated and each callback + * will be called). + * + * The callback will be called with @name for the name of the file to create. + * The callback can return less than 1 to indicate that no file should be + * created. + * + * If a file is to be created, then @mode should be populated with the file + * mode (permissions) for which the file is created for. This would be + * used to set the created inode i_mode field. + * + * The @data should be set to the data passed to the other file operations + * (read, write, etc). Note, @data will also point to the data passed in + * to eventfs_create_dir() or eventfs_create_events_dir(), but the callback + * can replace the data if it chooses to. Otherwise, the original data + * will be used for the file operation functions. + * + * The @fops should be set to the file operations that will be used to create + * the inode. + * + * NB. This callback is called while holding internal locks of the eventfs + * system. The callback must not call any code that might also call into + * the tracefs or eventfs system or it will risk creating a deadlock. + */ typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data, const struct file_operations **fops); +/** + * struct eventfs_entry - dynamically created eventfs file call back handler + * @name: Then name of the dynamic file in an eventfs directory + * @callback: The callback to get the fops of the file when it is created + * + * See evenfs_callback() typedef for how to set up @callback. + */ struct eventfs_entry { const char *name; eventfs_callback callback; -- cgit v1.2.3