From ac44dcc788b950606793e8f9690c30925f59df02 Mon Sep 17 00:00:00 2001 From: Shuran Liu Date: Sat, 6 Dec 2025 22:12:09 +0800 Subject: bpf: Fix verifier assumptions of bpf_d_path's output buffer Commit 37cce22dbd51 ("bpf: verifier: Refactor helper access type tracking") started distinguishing read vs write accesses performed by helpers. The second argument of bpf_d_path() is a pointer to a buffer that the helper fills with the resulting path. However, its prototype currently uses ARG_PTR_TO_MEM without MEM_WRITE. Before 37cce22dbd51, helper accesses were conservatively treated as potential writes, so this mismatch did not cause issues. Since that commit, the verifier may incorrectly assume that the buffer contents are unchanged across the helper call and base its optimizations on this wrong assumption. This can lead to misbehaviour in BPF programs that read back the buffer, such as prefix comparisons on the returned path. Fix this by marking the second argument of bpf_d_path() as ARG_PTR_TO_MEM | MEM_WRITE so that the verifier correctly models the write to the caller-provided buffer. Fixes: 37cce22dbd51 ("bpf: verifier: Refactor helper access type tracking") Co-developed-by: Zesen Liu Signed-off-by: Zesen Liu Co-developed-by: Peili Gao Signed-off-by: Peili Gao Co-developed-by: Haoran Ni Signed-off-by: Haoran Ni Signed-off-by: Shuran Liu Reviewed-by: Matt Bobrowski Link: https://lore.kernel.org/r/20251206141210.3148-2-electronlsr@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d57727abaade..fe28d86f7c35 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -965,7 +965,7 @@ static const struct bpf_func_proto bpf_d_path_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_d_path_btf_ids[0], - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .allowed = bpf_d_path_allowed, }; -- cgit v1.2.3 From ef7f38df890f5dcd2ae62f8dbde191d72f3bebae Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 16 Dec 2025 18:24:40 -0500 Subject: tracing: Do not register unsupported perf events Synthetic events currently do not have a function to register perf events. This leads to calling the tracepoint register functions with a NULL function pointer which triggers: ------------[ cut here ]------------ WARNING: kernel/tracepoint.c:175 at tracepoint_add_func+0x357/0x370, CPU#2: perf/2272 Modules linked in: kvm_intel kvm irqbypass CPU: 2 UID: 0 PID: 2272 Comm: perf Not tainted 6.18.0-ftest-11964-ge022764176fc-dirty #323 PREEMPTLAZY Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.17.0-debian-1.17.0-1 04/01/2014 RIP: 0010:tracepoint_add_func+0x357/0x370 Code: 28 9c e8 4c 0b f5 ff eb 0f 4c 89 f7 48 c7 c6 80 4d 28 9c e8 ab 89 f4 ff 31 c0 5b 41 5c 41 5d 41 5e 41 5f 5d c3 cc cc cc cc cc <0f> 0b 49 c7 c6 ea ff ff ff e9 ee fe ff ff 0f 0b e9 f9 fe ff ff 0f RSP: 0018:ffffabc0c44d3c40 EFLAGS: 00010246 RAX: 0000000000000001 RBX: ffff9380aa9e4060 RCX: 0000000000000000 RDX: 000000000000000a RSI: ffffffff9e1d4a98 RDI: ffff937fcf5fd6c8 RBP: 0000000000000001 R08: 0000000000000007 R09: ffff937fcf5fc780 R10: 0000000000000003 R11: ffffffff9c193910 R12: 000000000000000a R13: ffffffff9e1e5888 R14: 0000000000000000 R15: ffffabc0c44d3c78 FS: 00007f6202f5f340(0000) GS:ffff93819f00f000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055d3162281a8 CR3: 0000000106a56003 CR4: 0000000000172ef0 Call Trace: tracepoint_probe_register+0x5d/0x90 synth_event_reg+0x3c/0x60 perf_trace_event_init+0x204/0x340 perf_trace_init+0x85/0xd0 perf_tp_event_init+0x2e/0x50 perf_try_init_event+0x6f/0x230 ? perf_event_alloc+0x4bb/0xdc0 perf_event_alloc+0x65a/0xdc0 __se_sys_perf_event_open+0x290/0x9f0 do_syscall_64+0x93/0x7b0 ? entry_SYSCALL_64_after_hwframe+0x76/0x7e ? trace_hardirqs_off+0x53/0xc0 entry_SYSCALL_64_after_hwframe+0x76/0x7e Instead, have the code return -ENODEV, which doesn't warn and has perf error out with: # perf record -e synthetic:futex_wait Error: The sys_perf_event_open() syscall returned with 19 (No such device) for event (synthetic:futex_wait). "dmesg | grep -i perf" may provide additional information. Ideally perf should support synthetic events, but for now just fix the warning. The support can come later. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Link: https://patch.msgid.link/20251216182440.147e4453@gandalf.local.home Fixes: 4b147936fa509 ("tracing: Add support for 'synthetic' events") Reported-by: Ian Rogers Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index b16a5a158040..76067529db61 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -700,6 +700,8 @@ int trace_event_reg(struct trace_event_call *call, #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: + if (!call->class->perf_probe) + return -ENODEV; return tracepoint_probe_register(call->tp, call->class->perf_probe, call); -- cgit v1.2.3 From 74bf97e9a8b6443ba2119dc884940e9364c91bde Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 16 Dec 2025 09:49:50 -0800 Subject: tracing: Fix UBSAN warning in __remove_instance() xfs/558 triggers the following UBSAN warning: ------------[ cut here ]------------ UBSAN: shift-out-of-bounds in kernel/trace/trace.c:10510:10 shift exponent 32 is too large for 32-bit type 'int' CPU: 1 UID: 0 PID: 888674 Comm: rmdir Not tainted 6.19.0-rc1-xfsx #rc1 PREEMPT(lazy) dbf607ef4c142c563f76d706e71af9731d7b9c90 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-4.module+el8.8.0+21164+ed375313 04/01/2014 Call Trace: dump_stack_lvl+0x4a/0x70 ubsan_epilogue+0x5/0x2b __ubsan_handle_shift_out_of_bounds.cold+0x5e/0x113 __remove_instance.part.0.constprop.0.cold+0x18/0x26f instance_rmdir+0xf3/0x110 tracefs_syscall_rmdir+0x4d/0x90 vfs_rmdir+0x139/0x230 do_rmdir+0x143/0x230 __x64_sys_rmdir+0x1d/0x20 do_syscall_64+0x44/0x230 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x7f7ae8e51f17 Code: f0 ff ff 73 01 c3 48 8b 0d de 2e 0e 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 b8 54 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 01 c3 48 8b 15 b1 2e 0e 00 f7 d8 64 89 02 b8 RSP: 002b:00007ffd90743f08 EFLAGS: 00000246 ORIG_RAX: 0000000000000054 RAX: ffffffffffffffda RBX: 00007ffd907440f8 RCX: 00007f7ae8e51f17 RDX: 00007f7ae8f3c5c0 RSI: 00007ffd90744a21 RDI: 00007ffd90744a21 RBP: 0000000000000002 R08: 0000000000000000 R09: 0000000000000000 R10: 00007f7ae8f35ac0 R11: 0000000000000246 R12: 00007ffd90744a21 R13: 0000000000000001 R14: 00007f7ae8f8b000 R15: 000055e5283e6a98 ---[ end trace ]--- whilst tearing down an ftrace instance. TRACE_FLAGS_MAX_SIZE is now 64bit, so the mask comparison expression must be typecast to a u64 value to avoid an overflow. AFAICT, ZEROED_TRACE_FLAGS is already cast to ULL so this is ok. Link: https://patch.msgid.link/20251216174950.GA7705@frogsfrogsfrogs Fixes: bbec8e28cac592 ("tracing: Allow tracer to add more than 32 options") Signed-off-by: "Darrick J. Wong" Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e575956ef9b5..6f2148df14d9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -10507,7 +10507,7 @@ static int __remove_instance(struct trace_array *tr) /* Disable all the flags that were enabled coming in */ for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) { - if ((1 << i) & ZEROED_TRACE_FLAGS) + if ((1ULL << i) & ZEROED_TRACE_FLAGS) set_tracer_flag(tr, 1ULL << i, 0); } -- cgit v1.2.3 From 39263f986da55c5b7bc328c757fe378a6a41799d Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 17 Dec 2025 11:00:53 +0800 Subject: ftrace: Fix address for jmp mode in t_show() The address from ftrace_find_rec_direct() is printed directly in t_show(). This can mislead symbol offsets if it has the "jmp" bit in the last bit. Fix this by printing the address that returned by ftrace_jmp_get(). Link: https://patch.msgid.link/20251217030053.80343-1-dongml2@chinatelecom.cn Fixes: 25e4e3565d45 ("ftrace: Introduce FTRACE_OPS_FL_JMP") Signed-off-by: Menglong Dong Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3ec2033c0774..ef2d5dca6f70 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4518,8 +4518,11 @@ static int t_show(struct seq_file *m, void *v) unsigned long direct; direct = ftrace_find_rec_direct(rec->ip); - if (direct) - seq_printf(m, "\n\tdirect-->%pS", (void *)direct); + if (direct) { + seq_printf(m, "\n\tdirect%s-->%pS", + ftrace_is_jmp(direct) ? "(jmp)" : "", + (void *)ftrace_jmp_get(direct)); + } } } -- cgit v1.2.3 From 7cc3fe8e754eb1b7d9876c8ae2ee77dd2fb47b6d Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Fri, 26 Dec 2025 12:05:31 +0100 Subject: tracing: Drop unneeded assignment to soft_mode soft_mode is not read in the enable case, so drop the assignment. Drop also the comment text that refers to the assignment and realign the comment. Cc: "Paul E . McKenney" Cc: Gabriele Paoloni Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://patch.msgid.link/20251226110531.4129794-1-Julia.Lawall@inria.fr Signed-off-by: Julia Lawall Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 76067529db61..137b4d9bb116 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -826,16 +826,15 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, * When soft_disable is set and enable is set, we want to * register the tracepoint for the event, but leave the event * as is. That means, if the event was already enabled, we do - * nothing (but set soft_mode). If the event is disabled, we - * set SOFT_DISABLED before enabling the event tracepoint, so - * it still seems to be disabled. + * nothing. If the event is disabled, we set SOFT_DISABLED + * before enabling the event tracepoint, so it still seems + * to be disabled. */ if (!soft_disable) clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); else { if (atomic_inc_return(&file->sm_ref) > 1) break; - soft_mode = true; /* Enable use of trace_buffered_event */ trace_buffered_event_enable(); } -- cgit v1.2.3 From 6435ffd6c7fcba330dfa91c58dc30aed2df3d0bf Mon Sep 17 00:00:00 2001 From: Wupeng Ma Date: Sun, 28 Dec 2025 14:50:07 +0800 Subject: ring-buffer: Avoid softlockup in ring_buffer_resize() during memory free When user resize all trace ring buffer through file 'buffer_size_kb', then in ring_buffer_resize(), kernel allocates buffer pages for each cpu in a loop. If the kernel preemption model is PREEMPT_NONE and there are many cpus and there are many buffer pages to be freed, it may not give up cpu for a long time and finally cause a softlockup. To avoid it, call cond_resched() after each cpu buffer free as Commit f6bd2c92488c ("ring-buffer: Avoid softlockup in ring_buffer_resize()") does. Detailed call trace as follow: rcu: INFO: rcu_sched self-detected stall on CPU rcu: 24-....: (14837 ticks this GP) idle=521c/1/0x4000000000000000 softirq=230597/230597 fqs=5329 rcu: (t=15004 jiffies g=26003221 q=211022 ncpus=96) CPU: 24 UID: 0 PID: 11253 Comm: bash Kdump: loaded Tainted: G EL 6.18.2+ #278 NONE pc : arch_local_irq_restore+0x8/0x20 arch_local_irq_restore+0x8/0x20 (P) free_frozen_page_commit+0x28c/0x3b0 __free_frozen_pages+0x1c0/0x678 ___free_pages+0xc0/0xe0 free_pages+0x3c/0x50 ring_buffer_resize.part.0+0x6a8/0x880 ring_buffer_resize+0x3c/0x58 __tracing_resize_ring_buffer.part.0+0x34/0xd8 tracing_resize_ring_buffer+0x8c/0xd0 tracing_entries_write+0x74/0xd8 vfs_write+0xcc/0x288 ksys_write+0x74/0x118 __arm64_sys_write+0x24/0x38 Cc: Link: https://patch.msgid.link/20251228065008.2396573-1-mawupeng1@huawei.com Signed-off-by: Wupeng Ma Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 41c9f5d079be..630221b00838 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3137,6 +3137,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, list) { list_del_init(&bpage->list); free_buffer_page(bpage); + + cond_resched(); } } out_err_unlock: -- cgit v1.2.3 From 5f1ef0dfcb5b7f4a91a9b0e0ba533efd9f7e2cdb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 5 Jan 2026 20:31:41 -0500 Subject: tracing: Add recursion protection in kernel stack trace recording A bug was reported about an infinite recursion caused by tracing the rcu events with the kernel stack trace trigger enabled. The stack trace code called back into RCU which then called the stack trace again. Expand the ftrace recursion protection to add a set of bits to protect events from recursion. Each bit represents the context that the event is in (normal, softirq, interrupt and NMI). Have the stack trace code use the interrupt context to protect against recursion. Note, the bug showed an issue in both the RCU code as well as the tracing stacktrace code. This only handles the tracing stack trace side of the bug. The RCU fix will be handled separately. Link: https://lore.kernel.org/all/20260102122807.7025fc87@gandalf.local.home/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Joel Fernandes Cc: "Paul E. McKenney" Cc: Boqun Feng Link: https://patch.msgid.link/20260105203141.515cd49f@gandalf.local.home Reported-by: Yao Kai Tested-by: Yao Kai Fixes: 5f5fa7ea89dc ("rcu: Don't use negative nesting depth in __rcu_read_unlock()") Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_recursion.h | 9 +++++++++ kernel/trace/trace.c | 6 ++++++ 2 files changed, 15 insertions(+) (limited to 'kernel/trace') diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index ae04054a1be3..e6ca052b2a85 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -34,6 +34,13 @@ enum { TRACE_INTERNAL_SIRQ_BIT, TRACE_INTERNAL_TRANSITION_BIT, + /* Internal event use recursion bits */ + TRACE_INTERNAL_EVENT_BIT, + TRACE_INTERNAL_EVENT_NMI_BIT, + TRACE_INTERNAL_EVENT_IRQ_BIT, + TRACE_INTERNAL_EVENT_SIRQ_BIT, + TRACE_INTERNAL_EVENT_TRANSITION_BIT, + TRACE_BRANCH_BIT, /* * Abuse of the trace_recursion. @@ -58,6 +65,8 @@ enum { #define TRACE_LIST_START TRACE_INTERNAL_BIT +#define TRACE_EVENT_START TRACE_INTERNAL_EVENT_BIT + #define TRACE_CONTEXT_MASK ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6f2148df14d9..aef9058537d5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3012,6 +3012,11 @@ static void __ftrace_trace_stack(struct trace_array *tr, struct ftrace_stack *fstack; struct stack_entry *entry; int stackidx; + int bit; + + bit = trace_test_and_set_recursion(_THIS_IP_, _RET_IP_, TRACE_EVENT_START); + if (bit < 0) + return; /* * Add one, for this function and the call to save_stack_trace() @@ -3080,6 +3085,7 @@ static void __ftrace_trace_stack(struct trace_array *tr, /* Again, don't let gcc optimize things here */ barrier(); __this_cpu_dec(ftrace_stack_reserve); + trace_clear_recursion(bit); } static inline void ftrace_trace_stack(struct trace_array *tr, -- cgit v1.2.3 From 1e2ed4bfd50ace3c4272cfab7e9aa90956fb7ae0 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Tue, 6 Jan 2026 23:10:54 +0000 Subject: trace: ftrace_dump_on_oops[] is not exported, make it static The ftrace_dump_on_oops string is not used outside of trace.c so make it static to avoid the export warning from sparse: kernel/trace/trace.c:141:6: warning: symbol 'ftrace_dump_on_oops' was not declared. Should it be static? Fixes: dd293df6395a2 ("tracing: Move trace sysctls into trace.c") Link: https://patch.msgid.link/20260106231054.84270-1-ben.dooks@codethink.co.uk Signed-off-by: Ben Dooks Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index aef9058537d5..baec63134ab6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -138,7 +138,7 @@ cpumask_var_t __read_mostly tracing_buffer_mask; * by commas. */ /* Set to string format zero to disable by default */ -char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0"; +static char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0"; /* When set, tracing will stop when a WARN*() is hit */ static int __disable_trace_on_warning; -- cgit v1.2.3