summaryrefslogtreecommitdiff
path: root/include/linux
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-11-22 13:27:01 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2024-11-22 13:27:01 -0800
commit06afb0f36106ecb839c5e2509905e68c1e2677de (patch)
treee54b4528c648fc59ab89726b3786bf6626600c06 /include/linux
parent4b01712311c6e209137c4fa3e7d7920ec509456a (diff)
parent45af52e7d3b8560f21d139b3759735eead8b1653 (diff)
Merge tag 'trace-v6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace
Pull tracing updates from Steven Rostedt: - Addition of faultable tracepoints There's a tracepoint attached to both a system call entry and exit. This location is known to allow page faults. The tracepoints are called under an rcu_read_lock() which does not allow faults that can sleep. This limits the ability of tracepoint handlers to page fault in user space system call parameters. Now these tracepoints have been made "faultable", allowing the callbacks to fault in user space parameters and record them. Note, only the infrastructure has been implemented. The consumers (perf, ftrace, BPF) now need to have their code modified to allow faults. - Fix up of BPF code for the tracepoint faultable logic - Update tracepoints to use the new static branch API - Remove trace_*_rcuidle() variants and the SRCU protection they used - Remove unused TRACE_EVENT_FL_FILTERED logic - Replace strncpy() with strscpy() and memcpy() - Use replace per_cpu_ptr(smp_processor_id()) with this_cpu_ptr() - Fix perf events to not duplicate samples when tracing is enabled - Replace atomic64_add_return(1, counter) with atomic64_inc_return(counter) - Make stack trace buffer 4K instead of PAGE_SIZE - Remove TRACE_FLAG_IRQS_NOSUPPORT flag as it was never used - Get the true return address for function tracer when function graph tracer is also running. When function_graph trace is running along with function tracer, the parent function of the function tracer sometimes is "return_to_handler", which is the function graph trampoline to record the exit of the function. Use existing logic that calls into the fgraph infrastructure to find the real return address. - Remove (un)regfunc pointers out of tracepoint structure - Added last minute bug fix for setting pending modules in stack function filter. echo "write*:mod:ext3" > /sys/kernel/tracing/stack_trace_filter Would cause a kernel NULL dereference. - Minor clean ups * tag 'trace-v6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace: (31 commits) ftrace: Fix regression with module command in stack_trace_filter tracing: Fix function name for trampoline ftrace: Get the true parent ip for function tracer tracing: Remove redundant check on field->field in histograms bpf: ensure RCU Tasks Trace GP for sleepable raw tracepoint BPF links bpf: decouple BPF link/attach hook and BPF program sleepable semantics bpf: put bpf_link's program when link is safe to be deallocated tracing: Replace strncpy() with strscpy() when copying comm tracing: Add might_fault() check in __DECLARE_TRACE_SYSCALL tracing: Fix syscall tracepoint use-after-free tracing: Introduce tracepoint_is_faultable() tracing: Introduce tracepoint extended structure tracing: Remove TRACE_FLAG_IRQS_NOSUPPORT tracing: Replace multiple deprecated strncpy with memcpy tracing: Make percpu stack trace buffer invariant to PAGE_SIZE tracing: Use atomic64_inc_return() in trace_clock_counter() trace/trace_event_perf: remove duplicate samples on the first tracepoint event tracing/bpf: Add might_fault check to syscall probes tracing/perf: Add might_fault check to syscall probes tracing/ftrace: Add might_fault check to syscall probes ...
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/bpf.h20
-rw-r--r--include/linux/trace_events.h17
-rw-r--r--include/linux/tracepoint-defs.h14
-rw-r--r--include/linux/tracepoint.h169
4 files changed, 124 insertions, 96 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3ace0d6227e3..eaee2a819f4c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1645,6 +1645,11 @@ struct bpf_link {
enum bpf_link_type type;
const struct bpf_link_ops *ops;
struct bpf_prog *prog;
+ /* whether BPF link itself has "sleepable" semantics, which can differ
+ * from underlying BPF program having a "sleepable" semantics, as BPF
+ * link's semantics is determined by target attach hook
+ */
+ bool sleepable;
/* rcu is used before freeing, work can be used to schedule that
* RCU-based freeing before that, so they never overlap
*/
@@ -1661,8 +1666,10 @@ struct bpf_link_ops {
*/
void (*dealloc)(struct bpf_link *link);
/* deallocate link resources callback, called after RCU grace period;
- * if underlying BPF program is sleepable we go through tasks trace
- * RCU GP and then "classic" RCU GP
+ * if either the underlying BPF program is sleepable or BPF link's
+ * target hook is sleepable, we'll go through tasks trace RCU GP and
+ * then "classic" RCU GP; this need for chaining tasks trace and
+ * classic RCU GPs is designated by setting bpf_link->sleepable flag
*/
void (*dealloc_deferred)(struct bpf_link *link);
int (*detach)(struct bpf_link *link);
@@ -2409,6 +2416,9 @@ int bpf_prog_new_fd(struct bpf_prog *prog);
void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
const struct bpf_link_ops *ops, struct bpf_prog *prog);
+void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type,
+ const struct bpf_link_ops *ops, struct bpf_prog *prog,
+ bool sleepable);
int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer);
int bpf_link_settle(struct bpf_link_primer *primer);
void bpf_link_cleanup(struct bpf_link_primer *primer);
@@ -2764,6 +2774,12 @@ static inline void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
{
}
+static inline void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type,
+ const struct bpf_link_ops *ops, struct bpf_prog *prog,
+ bool sleepable)
+{
+}
+
static inline int bpf_link_prime(struct bpf_link *link,
struct bpf_link_primer *primer)
{
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 42bedcddd511..016b29a56c87 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -184,7 +184,6 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status);
enum trace_flag_type {
TRACE_FLAG_IRQS_OFF = 0x01,
- TRACE_FLAG_IRQS_NOSUPPORT = 0x02,
TRACE_FLAG_NEED_RESCHED = 0x04,
TRACE_FLAG_HARDIRQ = 0x08,
TRACE_FLAG_SOFTIRQ = 0x10,
@@ -193,7 +192,6 @@ enum trace_flag_type {
TRACE_FLAG_BH_OFF = 0x80,
};
-#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags)
{
unsigned int irq_status = irqs_disabled_flags(irqflags) ?
@@ -207,17 +205,6 @@ static inline unsigned int tracing_gen_ctx(void)
local_save_flags(irqflags);
return tracing_gen_ctx_flags(irqflags);
}
-#else
-
-static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags)
-{
- return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT);
-}
-static inline unsigned int tracing_gen_ctx(void)
-{
- return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT);
-}
-#endif
static inline unsigned int tracing_gen_ctx_dec(void)
{
@@ -326,7 +313,6 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
void trace_event_buffer_commit(struct trace_event_buffer *fbuffer);
enum {
- TRACE_EVENT_FL_FILTERED_BIT,
TRACE_EVENT_FL_CAP_ANY_BIT,
TRACE_EVENT_FL_NO_SET_FILTER_BIT,
TRACE_EVENT_FL_IGNORE_ENABLE_BIT,
@@ -341,7 +327,6 @@ enum {
/*
* Event flags:
- * FILTERED - The event has a filter attached
* CAP_ANY - Any user can enable for perf
* NO_SET_FILTER - Set when filter has error and is to be ignored
* IGNORE_ENABLE - For trace internal events, do not enable with debugfs file
@@ -356,7 +341,6 @@ enum {
* to a tracepoint yet, then it is cleared when it is.
*/
enum {
- TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT),
TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT),
TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT),
TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT),
@@ -381,7 +365,6 @@ struct trace_event_call {
};
struct trace_event event;
char *print_fmt;
- struct event_filter *filter;
/*
* Static events can disappear with modules,
* where as dynamic ones need their own ref count.
diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h
index 4dc4955f0fbf..aebf0571c736 100644
--- a/include/linux/tracepoint-defs.h
+++ b/include/linux/tracepoint-defs.h
@@ -29,16 +29,22 @@ struct tracepoint_func {
int prio;
};
+struct tracepoint_ext {
+ int (*regfunc)(void);
+ void (*unregfunc)(void);
+ /* Flags. */
+ unsigned int faultable:1;
+};
+
struct tracepoint {
const char *name; /* Tracepoint name */
- struct static_key key;
+ struct static_key_false key;
struct static_call_key *static_call_key;
void *static_call_tramp;
void *iterator;
void *probestub;
- int (*regfunc)(void);
- void (*unregfunc)(void);
struct tracepoint_func __rcu *funcs;
+ struct tracepoint_ext *ext;
};
#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
@@ -83,7 +89,7 @@ struct bpf_raw_event_map {
#ifdef CONFIG_TRACEPOINTS
# define tracepoint_enabled(tp) \
- static_key_false(&(__tracepoint_##tp).key)
+ static_branch_unlikely(&(__tracepoint_##tp).key)
#else
# define tracepoint_enabled(tracepoint) false
#endif
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 93a9f3070b48..425123e921ac 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -17,6 +17,7 @@
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/rcupdate.h>
+#include <linux/rcupdate_trace.h>
#include <linux/tracepoint-defs.h>
#include <linux/static_call.h>
@@ -32,8 +33,6 @@ struct trace_eval_map {
#define TRACEPOINT_DEFAULT_PRIO 10
-extern struct srcu_struct tracepoint_srcu;
-
extern int
tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data);
extern int
@@ -105,16 +104,30 @@ void for_each_tracepoint_in_module(struct module *mod,
* tracepoint_synchronize_unregister must be called between the last tracepoint
* probe unregistration and the end of module exit to make sure there is no
* caller executing a probe when it is freed.
+ *
+ * An alternative is to use the following for batch reclaim associated
+ * with a given tracepoint:
+ *
+ * - tracepoint_is_faultable() == false: call_rcu()
+ * - tracepoint_is_faultable() == true: call_rcu_tasks_trace()
*/
#ifdef CONFIG_TRACEPOINTS
static inline void tracepoint_synchronize_unregister(void)
{
- synchronize_srcu(&tracepoint_srcu);
+ synchronize_rcu_tasks_trace();
synchronize_rcu();
}
+static inline bool tracepoint_is_faultable(struct tracepoint *tp)
+{
+ return tp->ext && tp->ext->faultable;
+}
#else
static inline void tracepoint_synchronize_unregister(void)
{ }
+static inline bool tracepoint_is_faultable(struct tracepoint *tp)
+{
+ return false;
+}
#endif
#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
@@ -197,66 +210,35 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
#endif /* CONFIG_HAVE_STATIC_CALL */
/*
- * ARCH_WANTS_NO_INSTR archs are expected to have sanitized entry and idle
- * code that disallow any/all tracing/instrumentation when RCU isn't watching.
- */
-#ifdef CONFIG_ARCH_WANTS_NO_INSTR
-#define RCUIDLE_COND(rcuidle) (rcuidle)
-#else
-/* srcu can't be used from NMI */
-#define RCUIDLE_COND(rcuidle) (rcuidle && in_nmi())
-#endif
-
-/*
* it_func[0] is never NULL because there is at least one element in the array
* when the array itself is non NULL.
+ *
+ * With @syscall=0, the tracepoint callback array dereference is
+ * protected by disabling preemption.
+ * With @syscall=1, the tracepoint callback array dereference is
+ * protected by Tasks Trace RCU, which allows probes to handle page
+ * faults.
*/
-#define __DO_TRACE(name, args, cond, rcuidle) \
+#define __DO_TRACE(name, args, cond, syscall) \
do { \
int __maybe_unused __idx = 0; \
\
if (!(cond)) \
return; \
\
- if (WARN_ONCE(RCUIDLE_COND(rcuidle), \
- "Bad RCU usage for tracepoint")) \
- return; \
- \
- /* keep srcu and sched-rcu usage consistent */ \
- preempt_disable_notrace(); \
- \
- /* \
- * For rcuidle callers, use srcu since sched-rcu \
- * doesn't work from the idle path. \
- */ \
- if (rcuidle) { \
- __idx = srcu_read_lock_notrace(&tracepoint_srcu);\
- ct_irq_enter_irqson(); \
- } \
+ if (syscall) \
+ rcu_read_lock_trace(); \
+ else \
+ preempt_disable_notrace(); \
\
__DO_TRACE_CALL(name, TP_ARGS(args)); \
\
- if (rcuidle) { \
- ct_irq_exit_irqson(); \
- srcu_read_unlock_notrace(&tracepoint_srcu, __idx);\
- } \
- \
- preempt_enable_notrace(); \
+ if (syscall) \
+ rcu_read_unlock_trace(); \
+ else \
+ preempt_enable_notrace(); \
} while (0)
-#ifndef MODULE
-#define __DECLARE_TRACE_RCU(name, proto, args, cond) \
- static inline void trace_##name##_rcuidle(proto) \
- { \
- if (static_key_false(&__tracepoint_##name.key)) \
- __DO_TRACE(name, \
- TP_ARGS(args), \
- TP_CONDITION(cond), 1); \
- }
-#else
-#define __DECLARE_TRACE_RCU(name, proto, args, cond)
-#endif
-
/*
* Make sure the alignment of the structure in the __tracepoints section will
* not add unwanted padding between the beginning of the section and the
@@ -268,23 +250,10 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
* site if it is not watching, as it will need to be active when the
* tracepoint is enabled.
*/
-#define __DECLARE_TRACE(name, proto, args, cond, data_proto) \
+#define __DECLARE_TRACE_COMMON(name, proto, args, cond, data_proto) \
extern int __traceiter_##name(data_proto); \
DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name); \
extern struct tracepoint __tracepoint_##name; \
- static inline void trace_##name(proto) \
- { \
- if (static_key_false(&__tracepoint_##name.key)) \
- __DO_TRACE(name, \
- TP_ARGS(args), \
- TP_CONDITION(cond), 0); \
- if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \
- WARN_ONCE(!rcu_is_watching(), \
- "RCU not watching for tracepoint"); \
- } \
- } \
- __DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args), \
- PARAMS(cond)) \
static inline int \
register_trace_##name(void (*probe)(data_proto), void *data) \
{ \
@@ -311,7 +280,36 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
static inline bool \
trace_##name##_enabled(void) \
{ \
- return static_key_false(&__tracepoint_##name.key); \
+ return static_branch_unlikely(&__tracepoint_##name.key);\
+ }
+
+#define __DECLARE_TRACE(name, proto, args, cond, data_proto) \
+ __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \
+ static inline void trace_##name(proto) \
+ { \
+ if (static_branch_unlikely(&__tracepoint_##name.key)) \
+ __DO_TRACE(name, \
+ TP_ARGS(args), \
+ TP_CONDITION(cond), 0); \
+ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \
+ WARN_ONCE(!rcu_is_watching(), \
+ "RCU not watching for tracepoint"); \
+ } \
+ }
+
+#define __DECLARE_TRACE_SYSCALL(name, proto, args, cond, data_proto) \
+ __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \
+ static inline void trace_##name(proto) \
+ { \
+ might_fault(); \
+ if (static_branch_unlikely(&__tracepoint_##name.key)) \
+ __DO_TRACE(name, \
+ TP_ARGS(args), \
+ TP_CONDITION(cond), 1); \
+ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \
+ WARN_ONCE(!rcu_is_watching(), \
+ "RCU not watching for tracepoint"); \
+ } \
}
/*
@@ -319,7 +317,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
* structures, so we create an array of pointers that will be used for iteration
* on the tracepoints.
*/
-#define DEFINE_TRACE_FN(_name, _reg, _unreg, proto, args) \
+#define __DEFINE_TRACE_EXT(_name, _ext, proto, args) \
static const char __tpstrtab_##_name[] \
__section("__tracepoints_strings") = #_name; \
extern struct static_call_key STATIC_CALL_KEY(tp_func_##_name); \
@@ -328,14 +326,14 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
struct tracepoint __tracepoint_##_name __used \
__section("__tracepoints") = { \
.name = __tpstrtab_##_name, \
- .key = STATIC_KEY_INIT_FALSE, \
+ .key = STATIC_KEY_FALSE_INIT, \
.static_call_key = &STATIC_CALL_KEY(tp_func_##_name), \
.static_call_tramp = STATIC_CALL_TRAMP_ADDR(tp_func_##_name), \
.iterator = &__traceiter_##_name, \
.probestub = &__probestub_##_name, \
- .regfunc = _reg, \
- .unregfunc = _unreg, \
- .funcs = NULL }; \
+ .funcs = NULL, \
+ .ext = _ext, \
+ }; \
__TRACEPOINT_ENTRY(_name); \
int __traceiter_##_name(void *__data, proto) \
{ \
@@ -358,8 +356,24 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
} \
DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name);
-#define DEFINE_TRACE(name, proto, args) \
- DEFINE_TRACE_FN(name, NULL, NULL, PARAMS(proto), PARAMS(args));
+#define DEFINE_TRACE_FN(_name, _reg, _unreg, _proto, _args) \
+ static struct tracepoint_ext __tracepoint_ext_##_name = { \
+ .regfunc = _reg, \
+ .unregfunc = _unreg, \
+ .faultable = false, \
+ }; \
+ __DEFINE_TRACE_EXT(_name, &__tracepoint_ext_##_name, PARAMS(_proto), PARAMS(_args));
+
+#define DEFINE_TRACE_SYSCALL(_name, _reg, _unreg, _proto, _args) \
+ static struct tracepoint_ext __tracepoint_ext_##_name = { \
+ .regfunc = _reg, \
+ .unregfunc = _unreg, \
+ .faultable = true, \
+ }; \
+ __DEFINE_TRACE_EXT(_name, &__tracepoint_ext_##_name, PARAMS(_proto), PARAMS(_args));
+
+#define DEFINE_TRACE(_name, _proto, _args) \
+ __DEFINE_TRACE_EXT(_name, NULL, PARAMS(_proto), PARAMS(_args));
#define EXPORT_TRACEPOINT_SYMBOL_GPL(name) \
EXPORT_SYMBOL_GPL(__tracepoint_##name); \
@@ -375,8 +389,6 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
#define __DECLARE_TRACE(name, proto, args, cond, data_proto) \
static inline void trace_##name(proto) \
{ } \
- static inline void trace_##name##_rcuidle(proto) \
- { } \
static inline int \
register_trace_##name(void (*probe)(data_proto), \
void *data) \
@@ -398,7 +410,10 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
return false; \
}
+#define __DECLARE_TRACE_SYSCALL __DECLARE_TRACE
+
#define DEFINE_TRACE_FN(name, reg, unreg, proto, args)
+#define DEFINE_TRACE_SYSCALL(name, reg, unreg, proto, args)
#define DEFINE_TRACE(name, proto, args)
#define EXPORT_TRACEPOINT_SYMBOL_GPL(name)
#define EXPORT_TRACEPOINT_SYMBOL(name)
@@ -459,6 +474,11 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
cpu_online(raw_smp_processor_id()) && (PARAMS(cond)), \
PARAMS(void *__data, proto))
+#define DECLARE_TRACE_SYSCALL(name, proto, args) \
+ __DECLARE_TRACE_SYSCALL(name, PARAMS(proto), PARAMS(args), \
+ cpu_online(raw_smp_processor_id()), \
+ PARAMS(void *__data, proto))
+
#define TRACE_EVENT_FLAGS(event, flag)
#define TRACE_EVENT_PERF_PERM(event, expr...)
@@ -596,6 +616,9 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
struct, assign, print) \
DECLARE_TRACE_CONDITION(name, PARAMS(proto), \
PARAMS(args), PARAMS(cond))
+#define TRACE_EVENT_SYSCALL(name, proto, args, struct, assign, \
+ print, reg, unreg) \
+ DECLARE_TRACE_SYSCALL(name, PARAMS(proto), PARAMS(args))
#define TRACE_EVENT_FLAGS(event, flag)