From 9e15db66136a14cde3f35691f1d839d950118826 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Tue, 15 Oct 2019 20:25:00 -0700
Subject: bpf: Implement accurate raw_tp context access via BTF

libbpf analyzes the bpf C program, searches in-kernel BTF for the given type
name, and stores it into expected_attach_type. The kernel verifier expects
this btf_id to point to something like:

    typedef void (*btf_trace_kfree_skb)(void *, struct sk_buff *skb, void *loc);

which represents the signature of raw_tracepoint "kfree_skb".

Then btf_ctx_access() matches the ctx+0 access in the bpf program with the
'skb' argument and the ctx+8 access with the 'loc' argument of the "kfree_skb"
tracepoint. In the first case it passes the btf_id of 'struct sk_buff *' back
to the verifier core, and 'void *' in the second case.

The verifier then tracks PTR_TO_BTF_ID like any other pointer type. Just as
PTR_TO_SOCKET points to 'struct bpf_sock' and PTR_TO_TCP_SOCK points to
'struct bpf_tcp_sock', PTR_TO_BTF_ID points to in-kernel structs. If 1234 is
the btf_id of 'struct sk_buff' in vmlinux's BTF, then PTR_TO_BTF_ID#1234
points to one of the in-kernel skbs.

When PTR_TO_BTF_ID#1234 is dereferenced (like r2 = *(u64 *)(r1 + 32)),
btf_struct_access() checks which field of 'struct sk_buff' is at offset 32,
checks that the size of the access matches the type definition of the field,
and continues to track the dereferenced type. If that field was a pointer to
'struct net_device', r2's type will be PTR_TO_BTF_ID#456, where 456 is the
btf_id of 'struct net_device' in vmlinux's BTF.

Such verifier analysis prevents "cheating" in a BPF C program. The program
cannot cast an arbitrary pointer to 'struct sk_buff *' and access it. The C
compiler would allow the type cast, of course, but the verifier will notice
the type mismatch based on the BPF assembly and the in-kernel BTF.

Signed-off-by: Alexei Starovoitov
Signed-off-by: Daniel Borkmann
Acked-by: Andrii Nakryiko
Acked-by: Martin KaFai Lau
Link: https://lore.kernel.org/bpf/20191016032505.2089704-7-ast@kernel.org
---
 kernel/trace/bpf_trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/trace')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 44bd08f2443b..6221e8c6ecc3 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1074,7 +1074,7 @@ static bool raw_tp_prog_is_valid_access(int off, int size,
 		return false;
 	if (off % size != 0)
 		return false;
-	return true;
+	return btf_ctx_access(off, size, type, prog, info);
 }
 
 const struct bpf_verifier_ops raw_tracepoint_verifier_ops = {
--
cgit v1.2.3


From a7658e1a4164ce2b9eb4a11aadbba38586e93bd6 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Tue, 15 Oct 2019 20:25:04 -0700
Subject: bpf: Check types of arguments passed into helpers

Introduce a new helper that reuses the existing skb perf_event output
implementation, but can be called from raw_tracepoint programs that receive
'struct sk_buff *' as a tracepoint argument or can walk other kernel data
structures to an skb pointer.

In order to do that, teach the verifier to resolve the true C types of bpf
helpers into in-kernel BTF ids. The type of a kernel pointer passed by a raw
tracepoint into a bpf program is tracked by the verifier all the way until
it's passed into a helper function. For example:

The kfree_skb() kernel function calls trace_kfree_skb(skb, loc); the bpf
program receives that skb pointer and may eventually pass it into the
bpf_skb_output() bpf helper, which in-kernel is implemented via the
bpf_skb_event_output() kernel function. Its first argument in the kernel is
'struct sk_buff *'.
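As an illustrative sketch only (not part of this patch): assuming a
BPF_MAP_TYPE_PERF_EVENT_ARRAY map named perf_buf_map, a matching minimal
'struct sk_buff' definition visible to the program, and a libbpf that
resolves the attach btf_id from the section name, a program exercising this
chain could look roughly like:

    struct trace_kfree_skb {
    	struct sk_buff *skb;	/* seen by the verifier as PTR_TO_BTF_ID */
    	void *location;
    };

    struct bpf_map_def SEC("maps") perf_buf_map = {
    	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
    	.key_size = sizeof(int),
    	.value_size = sizeof(int),
    };

    SEC("tp_btf/kfree_skb")
    int trace_kfree_skb(struct trace_kfree_skb *ctx)
    {
    	struct sk_buff *skb = ctx->skb;	/* ctx+0 access, typed via BTF */
    	__u32 len = skb->len;		/* dereference checked against vmlinux BTF */

    	/* the typed skb pointer flows into the helper's BTF-typed argument */
    	bpf_skb_output(skb, &perf_buf_map, BPF_F_CURRENT_CPU,
    		       &len, sizeof(len));
    	return 0;
    }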
The verifier makes sure that types match all the way. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-11-ast@kernel.org --- include/linux/bpf.h | 18 +++++++---- include/uapi/linux/bpf.h | 27 ++++++++++++++++- kernel/bpf/btf.c | 68 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 44 +++++++++++++++++---------- kernel/trace/bpf_trace.c | 4 +++ net/core/filter.c | 15 +++++++++- tools/include/uapi/linux/bpf.h | 27 ++++++++++++++++- 7 files changed, 180 insertions(+), 23 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a7330d75bb94..2c2c29b49845 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -213,6 +213,7 @@ enum bpf_arg_type { ARG_PTR_TO_INT, /* pointer to int */ ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ + ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ }; /* type of values returned from helper functions */ @@ -235,11 +236,17 @@ struct bpf_func_proto { bool gpl_only; bool pkt_access; enum bpf_return_type ret_type; - enum bpf_arg_type arg1_type; - enum bpf_arg_type arg2_type; - enum bpf_arg_type arg3_type; - enum bpf_arg_type arg4_type; - enum bpf_arg_type arg5_type; + union { + struct { + enum bpf_arg_type arg1_type; + enum bpf_arg_type arg2_type; + enum bpf_arg_type arg3_type; + enum bpf_arg_type arg4_type; + enum bpf_arg_type arg5_type; + }; + enum bpf_arg_type arg_type[5]; + }; + u32 *btf_id; /* BTF ids of arguments */ }; /* bpf_context is intentionally undefined structure. Pointer to bpf_context is @@ -765,6 +772,7 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); +u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *, int); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3bb2cd1de341..4af8b0819a32 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2751,6 +2751,30 @@ union bpf_attr { * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies * * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 + * + * int bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct sk_buff. + * + * This helper is similar to **bpf_perf_event_output**\ () but + * restricted to raw_tracepoint bpf programs. + * Return + * 0 on success, or a negative error in case of failure. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2863,7 +2887,8 @@ union bpf_attr { FN(sk_storage_get), \ FN(sk_storage_delete), \ FN(send_signal), \ - FN(tcp_gen_syncookie), + FN(tcp_gen_syncookie), \ + FN(skb_output), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 271d27cd427f..f7557af39756 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3626,6 +3626,74 @@ again: return -EINVAL; } +u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, int arg) +{ + char fnname[KSYM_SYMBOL_LEN + 4] = "btf_"; + const struct btf_param *args; + const struct btf_type *t; + const char *tname, *sym; + u32 btf_id, i; + + if (IS_ERR(btf_vmlinux)) { + bpf_log(log, "btf_vmlinux is malformed\n"); + return -EINVAL; + } + + sym = kallsyms_lookup((long)fn, NULL, NULL, NULL, fnname + 4); + if (!sym) { + bpf_log(log, "kernel doesn't have kallsyms\n"); + return -EFAULT; + } + + for (i = 1; i <= btf_vmlinux->nr_types; i++) { + t = btf_type_by_id(btf_vmlinux, i); + if (BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) + continue; + tname = __btf_name_by_offset(btf_vmlinux, t->name_off); + if (!strcmp(tname, fnname)) + break; + } + if (i > btf_vmlinux->nr_types) { + bpf_log(log, "helper %s type is not found\n", fnname); + return -ENOENT; + } + + t = btf_type_by_id(btf_vmlinux, t->type); + if (!btf_type_is_ptr(t)) + return -EFAULT; + t = btf_type_by_id(btf_vmlinux, t->type); + if (!btf_type_is_func_proto(t)) + return -EFAULT; + + args = (const struct btf_param *)(t + 1); + if (arg >= btf_type_vlen(t)) { + bpf_log(log, "bpf helper %s doesn't have %d-th argument\n", + fnname, arg); + return -EINVAL; + } + + t = btf_type_by_id(btf_vmlinux, args[arg].type); + if (!btf_type_is_ptr(t) || !t->type) { + /* anything but the pointer to struct is a helper config bug */ + bpf_log(log, "ARG_PTR_TO_BTF is misconfigured\n"); + return -EFAULT; + } + btf_id = t->type; + t = btf_type_by_id(btf_vmlinux, t->type); + /* skip modifiers */ + while (btf_type_is_modifier(t)) { + btf_id = t->type; + t = btf_type_by_id(btf_vmlinux, t->type); + } + if (!btf_type_is_struct(t)) { + bpf_log(log, "ARG_PTR_TO_BTF is not a struct\n"); + return -EFAULT; + } + bpf_log(log, "helper %s arg%d has btf_id %d struct %s\n", fnname + 4, + arg, btf_id, __btf_name_by_offset(btf_vmlinux, t->name_off)); + return btf_id; +} + void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fba9ef6a831b..556e82f8869b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -205,6 +205,7 @@ struct bpf_call_arg_meta { u64 msize_umax_value; int ref_obj_id; int func_id; + u32 btf_id; }; struct btf *btf_vmlinux; @@ -3439,6 +3440,22 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_SOCKET; if (type != expected_type) goto err_type; + } else if (arg_type == ARG_PTR_TO_BTF_ID) { + expected_type = PTR_TO_BTF_ID; + if (type != expected_type) + goto err_type; + if (reg->btf_id != meta->btf_id) { + verbose(env, "Helper has type %s got %s in R%d\n", + kernel_type_name(meta->btf_id), + kernel_type_name(reg->btf_id), regno); + + return -EACCES; + } + if (!tnum_is_const(reg->var_off) || reg->var_off.value || reg->off) { + verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", + regno); + return -EACCES; + } } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { if (meta->func_id == 
BPF_FUNC_spin_lock) { if (process_spin_lock(env, regno, true)) @@ -3586,6 +3603,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_PERF_EVENT_ARRAY: if (func_id != BPF_FUNC_perf_event_read && func_id != BPF_FUNC_perf_event_output && + func_id != BPF_FUNC_skb_output && func_id != BPF_FUNC_perf_event_read_value) goto error; break; @@ -3673,6 +3691,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_perf_event_read: case BPF_FUNC_perf_event_output: case BPF_FUNC_perf_event_read_value: + case BPF_FUNC_skb_output: if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) goto error; break; @@ -4127,21 +4146,16 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn meta.func_id = func_id; /* check args */ - err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta); - if (err) - return err; - err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); - if (err) - return err; - err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); - if (err) - return err; - err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &meta); - if (err) - return err; - err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &meta); - if (err) - return err; + for (i = 0; i < 5; i++) { + if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID) { + if (!fn->btf_id[i]) + fn->btf_id[i] = btf_resolve_helper_id(&env->log, fn->func, i); + meta.btf_id = fn->btf_id[i]; + } + err = check_func_arg(env, BPF_REG_1 + i, fn->arg_type[i], &meta); + if (err) + return err; + } err = record_func_map(env, &meta, func_id, insn_idx); if (err) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6221e8c6ecc3..52f7e9d8c29b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -995,6 +995,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +extern const struct bpf_func_proto bpf_skb_output_proto; + BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags) { @@ -1053,6 +1055,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto_raw_tp; + case BPF_FUNC_skb_output: + return &bpf_skb_output_proto; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_raw_tp; case BPF_FUNC_get_stack: diff --git a/net/core/filter.c b/net/core/filter.c index 46196e212413..728ba6203c1f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3798,7 +3798,7 @@ BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; - if (unlikely(skb_size > skb->len)) + if (unlikely(!skb || skb_size > skb->len)) return -EFAULT; return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, @@ -3816,6 +3816,19 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +static u32 bpf_skb_output_btf_ids[5]; +const struct bpf_func_proto bpf_skb_output_proto = { + .func = bpf_skb_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_skb_output_btf_ids, +}; + static unsigned short bpf_tunnel_key_af(u64 flags) { return flags & BPF_F_TUNINFO_IPV6 ? 
AF_INET6 : AF_INET;

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 3bb2cd1de341..4af8b0819a32 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2751,6 +2751,30 @@ union bpf_attr {
 *		**-EOPNOTSUPP** kernel configuration does not enable SYN cookies
 *
 *		**-EPROTONOSUPPORT** IP packet version is not 4 or 6
+ *
+ * int bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ *	Description
+ *		Write raw *data* blob into a special BPF perf event held by
+ *		*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
+ *		event must have the following attributes: **PERF_SAMPLE_RAW**
+ *		as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
+ *		**PERF_COUNT_SW_BPF_OUTPUT** as **config**.
+ *
+ *		The *flags* are used to indicate the index in *map* for which
+ *		the value must be put, masked with **BPF_F_INDEX_MASK**.
+ *		Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
+ *		to indicate that the index of the current CPU core should be
+ *		used.
+ *
+ *		The value to write, of *size*, is passed through eBPF stack and
+ *		pointed by *data*.
+ *
+ *		*ctx* is a pointer to in-kernel struct sk_buff.
+ *
+ *		This helper is similar to **bpf_perf_event_output**\ () but
+ *		restricted to raw_tracepoint bpf programs.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
 */
 #define __BPF_FUNC_MAPPER(FN)		\
	FN(unspec),			\
@@ -2863,7 +2887,8 @@ union bpf_attr {
	FN(sk_storage_get),		\
	FN(sk_storage_delete),		\
	FN(send_signal),		\
-	FN(tcp_gen_syncookie),
+	FN(tcp_gen_syncookie),		\
+	FN(skb_output),

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
--
cgit v1.2.3


From da97e18458fb42d7c00fac5fd1c56a3896ec666e Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)"
Date: Mon, 14 Oct 2019 13:03:08 -0400
Subject: perf_event: Add support for LSM and SELinux checks

In current mainline, the degree of access to the perf_event_open(2) system
call depends on the perf_event_paranoid sysctl. This has a number of
limitations:

1. The sysctl is only a single value. Many types of accesses are controlled
   based on that single value, making the control very limited and coarse
   grained.

2. The sysctl is global, so if the sysctl is changed, then that means all
   processes get access to perf_event_open(2), opening the door to security
   issues.

This patch adds LSM and SELinux access checking which will be used in
Android to control access to perf_event_open(2) for the purposes of
attaching BPF programs to tracepoints, perf profiling and other operations
from userspace. These operations are intended for production systems.

5 new LSM hooks are added:

1. perf_event_open: This controls access during the perf_event_open(2)
   syscall itself. The hook is called from all the places that the
   perf_event_paranoid sysctl is checked, to keep it consistent with the
   sysctl. The hook gets passed a 'type' argument which controls CPU,
   kernel and tracepoint accesses (in this context, CPU, kernel and
   tracepoint have the same semantics as the perf_event_paranoid sysctl).
   Additionally, I added an 'open' type which is similar to the
   "perf_event_paranoid sysctl == 3" patch carried in Android and several
   other distros, but which was rejected in mainline [1] in 2016.

2. perf_event_alloc: This allocates a new security object for the event
   which stores the current SID within the event. It will be useful when
   the perf event's FD is passed through IPC to another process which may
   try to read the FD.
   Appropriate security checks will limit access.

3. perf_event_free: Called when the event is closed.

4. perf_event_read: Called from the read(2) and mmap(2) syscalls for the
   event.

5. perf_event_write: Called from the ioctl(2) syscalls for the event.

[1] https://lwn.net/Articles/696240/

Since Peter had suggested LSM hooks in 2016 [1], I am adding his
Suggested-by tag below.

To use this patch, we set the perf_event_paranoid sysctl to -1 and then
apply SELinux checking as appropriate (default deny everything, and then
add policy rules to give access to domains that need it). In the future we
can remove the perf_event_paranoid sysctl altogether.

Suggested-by: Peter Zijlstra
Co-developed-by: Peter Zijlstra
Signed-off-by: Joel Fernandes (Google)
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: James Morris
Cc: Arnaldo Carvalho de Melo
Cc: rostedt@goodmis.org
Cc: Yonghong Song
Cc: Kees Cook
Cc: Ingo Molnar
Cc: Alexei Starovoitov
Cc: jeffv@google.com
Cc: Jiri Olsa
Cc: Daniel Borkmann
Cc: primiano@google.com
Cc: Song Liu
Cc: rsavitski@google.com
Cc: Namhyung Kim
Cc: Matthew Garrett
Link: https://lkml.kernel.org/r/20191014170308.70668-1-joel@joelfernandes.org
---
 arch/powerpc/perf/core-book3s.c     | 18 +++++-----
 arch/x86/events/intel/bts.c         |  8 +++--
 arch/x86/events/intel/core.c        |  5 +--
 arch/x86/events/intel/p4.c          |  5 +--
 include/linux/lsm_hooks.h           | 15 ++++++++
 include/linux/perf_event.h          | 36 ++++++++++++++++---
 include/linux/security.h            | 38 +++++++++++++++++++-
 kernel/events/core.c                | 57 ++++++++++++++++++++++++------
 kernel/trace/trace_event_perf.c     | 15 +++++---
 security/security.c                 | 27 +++++++++++++++
 security/selinux/hooks.c            | 69 +++++++++++++++++++++++++++++++++++++
 security/selinux/include/classmap.h |  2 ++
 security/selinux/include/objsec.h   |  6 +++-
 13 files changed, 261 insertions(+), 40 deletions(-)

(limited to 'kernel/trace')

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index ca92e01d0bd1..48604625ab31 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -96,7 +96,7 @@ static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
 {
 	return 0;
 }
-static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { }
+static inline void perf_get_data_addr(struct perf_event *event, struct pt_regs *regs, u64 *addrp) { }
 static inline u32 perf_get_misc_flags(struct pt_regs *regs)
 {
 	return 0;
@@ -127,7 +127,7 @@ static unsigned long ebb_switch_in(bool ebb, struct cpu_hw_events *cpuhw)
 static inline void power_pmu_bhrb_enable(struct perf_event *event) {}
 static inline void power_pmu_bhrb_disable(struct perf_event *event) {}
 static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) {}
-static inline void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) {}
+static inline void power_pmu_bhrb_read(struct perf_event *event, struct cpu_hw_events *cpuhw) {}
 static void pmao_restore_workaround(bool ebb) { }
 #endif /* CONFIG_PPC32 */
@@ -179,7 +179,7 @@ static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
 * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC, the
 * [POWER7P_]MMCRA_SDAR_VALID bit in MMCRA, or the SDAR_VALID bit in SIER.
*/ -static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) +static inline void perf_get_data_addr(struct perf_event *event, struct pt_regs *regs, u64 *addrp) { unsigned long mmcra = regs->dsisr; bool sdar_valid; @@ -204,8 +204,7 @@ static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) if (!(mmcra & MMCRA_SAMPLE_ENABLE) || sdar_valid) *addrp = mfspr(SPRN_SDAR); - if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN) && - is_kernel_addr(mfspr(SPRN_SDAR))) + if (is_kernel_addr(mfspr(SPRN_SDAR)) && perf_allow_kernel(&event->attr) != 0) *addrp = 0; } @@ -444,7 +443,7 @@ static __u64 power_pmu_bhrb_to(u64 addr) } /* Processing BHRB entries */ -static void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) +static void power_pmu_bhrb_read(struct perf_event *event, struct cpu_hw_events *cpuhw) { u64 val; u64 addr; @@ -472,8 +471,7 @@ static void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) * exporting it to userspace (avoid exposure of regions * where we could have speculative execution) */ - if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN) && - is_kernel_addr(addr)) + if (is_kernel_addr(addr) && perf_allow_kernel(&event->attr) != 0) continue; /* Branches are read most recent first (ie. mfbhrb 0 is @@ -2087,12 +2085,12 @@ static void record_and_restart(struct perf_event *event, unsigned long val, if (event->attr.sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) - perf_get_data_addr(regs, &data.addr); + perf_get_data_addr(event, regs, &data.addr); if (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) { struct cpu_hw_events *cpuhw; cpuhw = this_cpu_ptr(&cpu_hw_events); - power_pmu_bhrb_read(cpuhw); + power_pmu_bhrb_read(event, cpuhw); data.br_stack = &cpuhw->bhrb_stack; } diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 5ee3fed881d3..38de4a7f6752 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -549,9 +549,11 @@ static int bts_event_init(struct perf_event *event) * Note that the default paranoia setting permits unprivileged * users to profile the kernel. 
*/ - if (event->attr.exclude_kernel && perf_paranoid_kernel() && - !capable(CAP_SYS_ADMIN)) - return -EACCES; + if (event->attr.exclude_kernel) { + ret = perf_allow_kernel(&event->attr); + if (ret) + return ret; + } if (x86_add_exclusive(x86_lbr_exclusive_bts)) return -EBUSY; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index fcef678c3423..bbf6588d47ee 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3315,8 +3315,9 @@ static int intel_pmu_hw_config(struct perf_event *event) if (x86_pmu.version < 3) return -EINVAL; - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return -EACCES; + ret = perf_allow_cpu(&event->attr); + if (ret) + return ret; event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY; diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c index dee579efb2b2..a4cc66005ce8 100644 --- a/arch/x86/events/intel/p4.c +++ b/arch/x86/events/intel/p4.c @@ -776,8 +776,9 @@ static int p4_validate_raw_event(struct perf_event *event) * the user needs special permissions to be able to use it */ if (p4_ht_active() && p4_event_bind_map[v].shared) { - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return -EACCES; + v = perf_allow_cpu(&event->attr); + if (v) + return v; } /* ESCR EventMask bits may be invalid */ diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index a3763247547c..20d8cf194fb7 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1818,6 +1818,14 @@ union security_list_options { void (*bpf_prog_free_security)(struct bpf_prog_aux *aux); #endif /* CONFIG_BPF_SYSCALL */ int (*locked_down)(enum lockdown_reason what); +#ifdef CONFIG_PERF_EVENTS + int (*perf_event_open)(struct perf_event_attr *attr, int type); + int (*perf_event_alloc)(struct perf_event *event); + void (*perf_event_free)(struct perf_event *event); + int (*perf_event_read)(struct perf_event *event); + int (*perf_event_write)(struct perf_event *event); + +#endif }; struct security_hook_heads { @@ -2060,6 +2068,13 @@ struct security_hook_heads { struct hlist_head bpf_prog_free_security; #endif /* CONFIG_BPF_SYSCALL */ struct hlist_head locked_down; +#ifdef CONFIG_PERF_EVENTS + struct hlist_head perf_event_open; + struct hlist_head perf_event_alloc; + struct hlist_head perf_event_free; + struct hlist_head perf_event_read; + struct hlist_head perf_event_write; +#endif } __randomize_layout; /* diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 61448c19a132..587ae4d002f5 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -56,6 +56,7 @@ struct perf_guest_info_callbacks { #include #include #include +#include #include struct perf_callchain_entry { @@ -721,6 +722,9 @@ struct perf_event { struct perf_cgroup *cgrp; /* cgroup event is attach to */ #endif +#ifdef CONFIG_SECURITY + void *security; +#endif struct list_head sb_list; #endif /* CONFIG_PERF_EVENTS */ }; @@ -1241,19 +1245,41 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, int perf_event_max_stack_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -static inline bool perf_paranoid_tracepoint_raw(void) +/* Access to perf_event_open(2) syscall. */ +#define PERF_SECURITY_OPEN 0 + +/* Finer grained perf_event_open(2) access control. 
*/ +#define PERF_SECURITY_CPU 1 +#define PERF_SECURITY_KERNEL 2 +#define PERF_SECURITY_TRACEPOINT 3 + +static inline int perf_is_paranoid(void) { return sysctl_perf_event_paranoid > -1; } -static inline bool perf_paranoid_cpu(void) +static inline int perf_allow_kernel(struct perf_event_attr *attr) { - return sysctl_perf_event_paranoid > 0; + if (sysctl_perf_event_paranoid > 1 && !capable(CAP_SYS_ADMIN)) + return -EACCES; + + return security_perf_event_open(attr, PERF_SECURITY_KERNEL); } -static inline bool perf_paranoid_kernel(void) +static inline int perf_allow_cpu(struct perf_event_attr *attr) { - return sysctl_perf_event_paranoid > 1; + if (sysctl_perf_event_paranoid > 0 && !capable(CAP_SYS_ADMIN)) + return -EACCES; + + return security_perf_event_open(attr, PERF_SECURITY_CPU); +} + +static inline int perf_allow_tracepoint(struct perf_event_attr *attr) +{ + if (sysctl_perf_event_paranoid > -1 && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT); } extern void perf_event_init(void); diff --git a/include/linux/security.h b/include/linux/security.h index a8d59d612d27..4df79ffdc3a0 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1894,5 +1894,41 @@ static inline void security_bpf_prog_free(struct bpf_prog_aux *aux) #endif /* CONFIG_SECURITY */ #endif /* CONFIG_BPF_SYSCALL */ -#endif /* ! __LINUX_SECURITY_H */ +#ifdef CONFIG_PERF_EVENTS +struct perf_event_attr; + +#ifdef CONFIG_SECURITY +extern int security_perf_event_open(struct perf_event_attr *attr, int type); +extern int security_perf_event_alloc(struct perf_event *event); +extern void security_perf_event_free(struct perf_event *event); +extern int security_perf_event_read(struct perf_event *event); +extern int security_perf_event_write(struct perf_event *event); +#else +static inline int security_perf_event_open(struct perf_event_attr *attr, + int type) +{ + return 0; +} + +static inline int security_perf_event_alloc(struct perf_event *event) +{ + return 0; +} + +static inline void security_perf_event_free(struct perf_event *event) +{ +} + +static inline int security_perf_event_read(struct perf_event *event) +{ + return 0; +} +static inline int security_perf_event_write(struct perf_event *event) +{ + return 0; +} +#endif /* CONFIG_SECURITY */ +#endif /* CONFIG_PERF_EVENTS */ + +#endif /* ! __LINUX_SECURITY_H */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 9ec0b0bfddbd..f9a5d4356562 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4229,8 +4229,9 @@ find_get_context(struct pmu *pmu, struct task_struct *task, if (!task) { /* Must be root to operate on a CPU event: */ - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EACCES); + err = perf_allow_cpu(&event->attr); + if (err) + return ERR_PTR(err); cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); ctx = &cpuctx->ctx; @@ -4539,6 +4540,8 @@ static void _free_event(struct perf_event *event) unaccount_event(event); + security_perf_event_free(event); + if (event->rb) { /* * Can happen when we close an event with re-directed output. 
@@ -4992,6 +4995,10 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) struct perf_event_context *ctx; int ret; + ret = security_perf_event_read(event); + if (ret) + return ret; + ctx = perf_event_ctx_lock(event); ret = __perf_read(event, buf, count); perf_event_ctx_unlock(event, ctx); @@ -5256,6 +5263,11 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct perf_event_context *ctx; long ret; + /* Treat ioctl like writes as it is likely a mutating operation. */ + ret = security_perf_event_write(event); + if (ret) + return ret; + ctx = perf_event_ctx_lock(event); ret = _perf_ioctl(event, cmd, arg); perf_event_ctx_unlock(event, ctx); @@ -5719,6 +5731,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; + ret = security_perf_event_read(event); + if (ret) + return ret; + vma_size = vma->vm_end - vma->vm_start; if (vma->vm_pgoff == 0) { @@ -5844,7 +5860,7 @@ accounting: lock_limit >>= PAGE_SHIFT; locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; - if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && + if ((locked > lock_limit) && perf_is_paranoid() && !capable(CAP_IPC_LOCK)) { ret = -EPERM; goto unlock; @@ -10578,11 +10594,20 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } } + err = security_perf_event_alloc(event); + if (err) + goto err_callchain_buffer; + /* symmetric to unaccount_event() in _free_event() */ account_event(event); return event; +err_callchain_buffer: + if (!event->parent) { + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + put_callchain_buffers(); + } err_addr_filters: kfree(event->addr_filter_ranges); @@ -10671,9 +10696,11 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, attr->branch_sample_type = mask; } /* privileged levels capture (kernel, hv): check permissions */ - if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) - && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; + if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) { + ret = perf_allow_kernel(attr); + if (ret) + return ret; + } } if (attr->sample_type & PERF_SAMPLE_REGS_USER) { @@ -10886,13 +10913,19 @@ SYSCALL_DEFINE5(perf_event_open, if (flags & ~PERF_FLAG_ALL) return -EINVAL; + /* Do we allow access to perf_event_open(2) ? 
*/ + err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); + if (err) + return err; + err = perf_copy_attr(attr_uptr, &attr); if (err) return err; if (!attr.exclude_kernel) { - if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; + err = perf_allow_kernel(&attr); + if (err) + return err; } if (attr.namespaces) { @@ -10909,9 +10942,11 @@ SYSCALL_DEFINE5(perf_event_open, } /* Only privileged users can get physical addresses */ - if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) && - perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; + if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) { + err = perf_allow_kernel(&attr); + if (err) + return err; + } err = security_locked_down(LOCKDOWN_PERF); if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR)) diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 0892e38ed6fb..0917fee6ee7c 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -8,6 +8,7 @@ #include #include +#include #include "trace.h" #include "trace_probe.h" @@ -26,8 +27,10 @@ static int total_ref_count; static int perf_trace_event_perm(struct trace_event_call *tp_event, struct perf_event *p_event) { + int ret; + if (tp_event->perf_perm) { - int ret = tp_event->perf_perm(tp_event, p_event); + ret = tp_event->perf_perm(tp_event, p_event); if (ret) return ret; } @@ -46,8 +49,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event, /* The ftrace function trace is allowed only for root. */ if (ftrace_event_is_function(tp_event)) { - if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) - return -EPERM; + ret = perf_allow_tracepoint(&p_event->attr); + if (ret) + return ret; if (!is_sampling_event(p_event)) return 0; @@ -82,8 +86,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event, * ...otherwise raw tracepoint data can be a severe data leak, * only allow root to have these. 
*/ - if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) - return -EPERM; + ret = perf_allow_tracepoint(&p_event->attr); + if (ret) + return ret; return 0; } diff --git a/security/security.c b/security/security.c index 1bc000f834e2..cd2d18d2d279 100644 --- a/security/security.c +++ b/security/security.c @@ -2404,3 +2404,30 @@ int security_locked_down(enum lockdown_reason what) return call_int_hook(locked_down, 0, what); } EXPORT_SYMBOL(security_locked_down); + +#ifdef CONFIG_PERF_EVENTS +int security_perf_event_open(struct perf_event_attr *attr, int type) +{ + return call_int_hook(perf_event_open, 0, attr, type); +} + +int security_perf_event_alloc(struct perf_event *event) +{ + return call_int_hook(perf_event_alloc, 0, event); +} + +void security_perf_event_free(struct perf_event *event) +{ + call_void_hook(perf_event_free, event); +} + +int security_perf_event_read(struct perf_event *event) +{ + return call_int_hook(perf_event_read, 0, event); +} + +int security_perf_event_write(struct perf_event *event) +{ + return call_int_hook(perf_event_write, 0, event); +} +#endif /* CONFIG_PERF_EVENTS */ diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 9625b99e677f..28eb05490d59 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6795,6 +6795,67 @@ struct lsm_blob_sizes selinux_blob_sizes __lsm_ro_after_init = { .lbs_msg_msg = sizeof(struct msg_security_struct), }; +#ifdef CONFIG_PERF_EVENTS +static int selinux_perf_event_open(struct perf_event_attr *attr, int type) +{ + u32 requested, sid = current_sid(); + + if (type == PERF_SECURITY_OPEN) + requested = PERF_EVENT__OPEN; + else if (type == PERF_SECURITY_CPU) + requested = PERF_EVENT__CPU; + else if (type == PERF_SECURITY_KERNEL) + requested = PERF_EVENT__KERNEL; + else if (type == PERF_SECURITY_TRACEPOINT) + requested = PERF_EVENT__TRACEPOINT; + else + return -EINVAL; + + return avc_has_perm(&selinux_state, sid, sid, SECCLASS_PERF_EVENT, + requested, NULL); +} + +static int selinux_perf_event_alloc(struct perf_event *event) +{ + struct perf_event_security_struct *perfsec; + + perfsec = kzalloc(sizeof(*perfsec), GFP_KERNEL); + if (!perfsec) + return -ENOMEM; + + perfsec->sid = current_sid(); + event->security = perfsec; + + return 0; +} + +static void selinux_perf_event_free(struct perf_event *event) +{ + struct perf_event_security_struct *perfsec = event->security; + + event->security = NULL; + kfree(perfsec); +} + +static int selinux_perf_event_read(struct perf_event *event) +{ + struct perf_event_security_struct *perfsec = event->security; + u32 sid = current_sid(); + + return avc_has_perm(&selinux_state, sid, perfsec->sid, + SECCLASS_PERF_EVENT, PERF_EVENT__READ, NULL); +} + +static int selinux_perf_event_write(struct perf_event *event) +{ + struct perf_event_security_struct *perfsec = event->security; + u32 sid = current_sid(); + + return avc_has_perm(&selinux_state, sid, perfsec->sid, + SECCLASS_PERF_EVENT, PERF_EVENT__WRITE, NULL); +} +#endif + static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(binder_set_context_mgr, selinux_binder_set_context_mgr), LSM_HOOK_INIT(binder_transaction, selinux_binder_transaction), @@ -7030,6 +7091,14 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free), LSM_HOOK_INIT(bpf_prog_free_security, selinux_bpf_prog_free), #endif + +#ifdef CONFIG_PERF_EVENTS + LSM_HOOK_INIT(perf_event_open, selinux_perf_event_open), + LSM_HOOK_INIT(perf_event_alloc, 
selinux_perf_event_alloc), + LSM_HOOK_INIT(perf_event_free, selinux_perf_event_free), + LSM_HOOK_INIT(perf_event_read, selinux_perf_event_read), + LSM_HOOK_INIT(perf_event_write, selinux_perf_event_write), +#endif }; static __init int selinux_init(void) diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 32e9b03be3dd..7db24855e12d 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -244,6 +244,8 @@ struct security_class_mapping secclass_map[] = { {"map_create", "map_read", "map_write", "prog_load", "prog_run"} }, { "xdp_socket", { COMMON_SOCK_PERMS, NULL } }, + { "perf_event", + {"open", "cpu", "kernel", "tracepoint", "read", "write"} }, { NULL } }; diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index 586b7abd0aa7..a4a86cbcfb0a 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -141,7 +141,11 @@ struct pkey_security_struct { }; struct bpf_security_struct { - u32 sid; /*SID of bpf obj creater*/ + u32 sid; /* SID of bpf obj creator */ +}; + +struct perf_event_security_struct { + u32 sid; /* SID of perf_event obj creator */ }; extern struct lsm_blob_sizes selinux_blob_sizes; -- cgit v1.2.3 From 3da2e1fd46a726531a1e6d621bda62821a5e559a Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 18 Oct 2019 11:18:43 +0800 Subject: trace: Use pr_warn instead of pr_warning As said in commit f2c2cbcc35d4 ("powerpc: Use pr_warn instead of pr_warning"), removing pr_warning so all logging messages use a consistent _warn style. Let's do it. Link: http://lkml.kernel.org/r/20191018031850.48498-26-wangkefeng.wang@huawei.com To: linux-kernel@vger.kernel.org Cc: Ingo Molnar Signed-off-by: Kefeng Wang Acked-by: Steven Rostedt (VMware) Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/trace/trace_benchmark.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 80e0b2aca703..2e9a4746ea85 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -178,14 +178,14 @@ static int benchmark_event_kthread(void *arg) int trace_benchmark_reg(void) { if (!ok_to_run) { - pr_warning("trace benchmark cannot be started via kernel command line\n"); + pr_warn("trace benchmark cannot be started via kernel command line\n"); return -EBUSY; } bm_event_thread = kthread_run(benchmark_event_kthread, NULL, "event_benchmark"); if (IS_ERR(bm_event_thread)) { - pr_warning("trace benchmark failed to create kernel thread\n"); + pr_warn("trace benchmark failed to create kernel thread\n"); return PTR_ERR(bm_event_thread); } -- cgit v1.2.3 From 1f5343c0ae9673543055e9794362766e1f0ed163 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 18 Oct 2019 17:03:44 +0800 Subject: bpf: Fix build error without CONFIG_NET If CONFIG_NET is n, building fails: kernel/trace/bpf_trace.o: In function `raw_tp_prog_func_proto': bpf_trace.c:(.text+0x1a34): undefined reference to `bpf_skb_output_proto' Wrap it into a #ifdef to fix this. 
Fixes: a7658e1a4164 ("bpf: Check types of arguments passed into helpers")
Reported-by: Hulk Robot
Signed-off-by: YueHaibing
Signed-off-by: Daniel Borkmann
Acked-by: Yonghong Song
Link: https://lore.kernel.org/bpf/20191018090344.26936-1-yuehaibing@huawei.com
---
 kernel/trace/bpf_trace.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/trace')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 52f7e9d8c29b..c3240898cc44 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1055,8 +1055,10 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	switch (func_id) {
 	case BPF_FUNC_perf_event_output:
 		return &bpf_perf_event_output_proto_raw_tp;
+#ifdef CONFIG_NET
 	case BPF_FUNC_skb_output:
 		return &bpf_skb_output_proto;
+#endif
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_raw_tp;
 	case BPF_FUNC_get_stack:
--
cgit v1.2.3


From 3820729160440158a014add69cc0d371061a96b2 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Thu, 24 Oct 2019 17:18:11 -0700
Subject: bpf: Prepare btf_ctx_access for non raw_tp use case

This patch makes a few changes to btf_ctx_access() to prepare it for the non
raw_tp use case, where the attach_btf_id is not necessarily a
BTF_KIND_TYPEDEF.

It moves the "btf_trace_" prefix check and typedef-follow logic to a new
function "check_attach_btf_id()" which is called only once during
bpf_check(). btf_ctx_access() only operates on a BTF_KIND_FUNC_PROTO type
now. That should also be more efficient, since it is done only once instead
of every time check_ctx_access() is called.

"check_attach_btf_id()" needs to find the func_proto type from the
attach_btf_id. It needs to store the result into the newly added
prog->aux->attach_func_proto. A func_proto btf type has no name, so a proper
name should be stored into "attach_func_name" as well.

v2:
- Move the "btf_trace_" check to an earlier verifier phase (Alexei)

Signed-off-by: Martin KaFai Lau
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/20191025001811.1718491-1-kafai@fb.com
---
 include/linux/bpf.h      |  5 ++++
 include/linux/btf.h      | 31 ++++++++++++++++++++
 kernel/bpf/btf.c         | 73 +++++++++---------------------------------
 kernel/bpf/syscall.c     |  4 +--
 kernel/bpf/verifier.c    | 52 +++++++++++++++++++++++++++++++-
 kernel/trace/bpf_trace.c |  2 ++
 6 files changed, 103 insertions(+), 64 deletions(-)

(limited to 'kernel/trace')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2c2c29b49845..171be30fe0ae 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -392,6 +392,11 @@ struct bpf_prog_aux {
 	u32 attach_btf_id; /* in-kernel BTF type id to attach to */
 	bool verifier_zext; /* Zero extensions has been inserted by verifier. */
 	bool offload_requested;
+	bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
+	/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
+	const struct btf_type *attach_func_proto;
+	/* function name for valid attach_btf_id */
+	const char *attach_func_name;
 	struct bpf_prog **func;
 	void *jit_data; /* JIT specific data.
arch dependent */ struct latch_tree_node ksym_tnode; diff --git a/include/linux/btf.h b/include/linux/btf.h index 55d43bc856be..9dee00859c5f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -5,6 +5,7 @@ #define _LINUX_BTF_H 1 #include +#include struct btf; struct btf_member; @@ -53,6 +54,36 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); bool btf_type_is_void(const struct btf_type *t); +static inline bool btf_type_is_ptr(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_PTR; +} + +static inline bool btf_type_is_int(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_INT; +} + +static inline bool btf_type_is_enum(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM; +} + +static inline bool btf_type_is_typedef(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF; +} + +static inline bool btf_type_is_func(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC; +} + +static inline bool btf_type_is_func_proto(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO; +} + #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f7557af39756..128d89601d73 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -336,16 +336,6 @@ static bool btf_type_is_fwd(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_FWD; } -static bool btf_type_is_func(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC; -} - -static bool btf_type_is_func_proto(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO; -} - static bool btf_type_nosize(const struct btf_type *t) { return btf_type_is_void(t) || btf_type_is_fwd(t) || @@ -377,16 +367,6 @@ static bool btf_type_is_array(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; } -static bool btf_type_is_ptr(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_PTR; -} - -static bool btf_type_is_int(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_INT; -} - static bool btf_type_is_var(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; @@ -3442,54 +3422,27 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { + const struct btf_type *t = prog->aux->attach_func_proto; + const char *tname = prog->aux->attach_func_name; struct bpf_verifier_log *log = info->log; - u32 btf_id = prog->aux->attach_btf_id; const struct btf_param *args; - const struct btf_type *t; - const char prefix[] = "btf_trace_"; - const char *tname; u32 nr_args, arg; - if (!btf_id) - return true; - - if (IS_ERR(btf_vmlinux)) { - bpf_log(log, "btf_vmlinux is malformed\n"); - return false; - } - - t = btf_type_by_id(btf_vmlinux, btf_id); - if (!t || BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) { - bpf_log(log, "btf_id is invalid\n"); - return false; - } - - tname = __btf_name_by_offset(btf_vmlinux, t->name_off); - if (strncmp(prefix, tname, sizeof(prefix) - 1)) { - bpf_log(log, "btf_id points to wrong type name %s\n", tname); - return false; - } - tname += sizeof(prefix) - 1; - - t = btf_type_by_id(btf_vmlinux, t->type); - if (!btf_type_is_ptr(t)) - 
return false; - t = btf_type_by_id(btf_vmlinux, t->type); - if (!btf_type_is_func_proto(t)) - return false; - if (off % 8) { - bpf_log(log, "raw_tp '%s' offset %d is not multiple of 8\n", + bpf_log(log, "func '%s' offset %d is not multiple of 8\n", tname, off); return false; } arg = off / 8; args = (const struct btf_param *)(t + 1); - /* skip first 'void *__data' argument in btf_trace_##name typedef */ - args++; - nr_args = btf_type_vlen(t) - 1; + nr_args = btf_type_vlen(t); + if (prog->aux->attach_btf_trace) { + /* skip first 'void *__data' argument in btf_trace_##name typedef */ + args++; + nr_args--; + } if (arg >= nr_args) { - bpf_log(log, "raw_tp '%s' doesn't have %d-th argument\n", + bpf_log(log, "func '%s' doesn't have %d-th argument\n", tname, arg); return false; } @@ -3503,7 +3456,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; if (!btf_type_is_ptr(t)) { bpf_log(log, - "raw_tp '%s' arg%d '%s' has type %s. Only pointer access is allowed\n", + "func '%s' arg%d '%s' has type %s. Only pointer access is allowed\n", tname, arg, __btf_name_by_offset(btf_vmlinux, t->name_off), btf_kind_str[BTF_INFO_KIND(t->info)]); @@ -3526,11 +3479,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, t = btf_type_by_id(btf_vmlinux, t->type); if (!btf_type_is_struct(t)) { bpf_log(log, - "raw_tp '%s' arg%d type %s is not a struct\n", + "func '%s' arg%d type %s is not a struct\n", tname, arg, btf_kind_str[BTF_INFO_KIND(t->info)]); return false; } - bpf_log(log, "raw_tp '%s' arg%d has btf_id %d type %s '%s'\n", + bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n", tname, arg, info->btf_id, btf_kind_str[BTF_INFO_KIND(t->info)], __btf_name_by_offset(btf_vmlinux, t->name_off)); return true; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 16ea3c0db4f6..ff5225759553 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1848,9 +1848,7 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) goto out_put_prog; } /* raw_tp name is taken from type name instead */ - tp_name = kernel_type_name(prog->aux->attach_btf_id); - /* skip the prefix */ - tp_name += sizeof("btf_trace_") - 1; + tp_name = prog->aux->attach_func_name; } else { if (strncpy_from_user(buf, u64_to_user_ptr(attr->raw_tracepoint.name), diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 556e82f8869b..c59778c0fc4d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9372,6 +9372,52 @@ static void print_verification_stats(struct bpf_verifier_env *env) env->peak_states, env->longest_mark_read_walk); } +static int check_attach_btf_id(struct bpf_verifier_env *env) +{ + struct bpf_prog *prog = env->prog; + u32 btf_id = prog->aux->attach_btf_id; + const struct btf_type *t; + const char *tname; + + if (prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT && btf_id) { + const char prefix[] = "btf_trace_"; + + t = btf_type_by_id(btf_vmlinux, btf_id); + if (!t) { + verbose(env, "attach_btf_id %u is invalid\n", btf_id); + return -EINVAL; + } + if (!btf_type_is_typedef(t)) { + verbose(env, "attach_btf_id %u is not a typedef\n", + btf_id); + return -EINVAL; + } + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + if (!tname || strncmp(prefix, tname, sizeof(prefix) - 1)) { + verbose(env, "attach_btf_id %u points to wrong type name %s\n", + btf_id, tname); + return -EINVAL; + } + tname += sizeof(prefix) - 1; + t = btf_type_by_id(btf_vmlinux, t->type); + if (!btf_type_is_ptr(t)) + /* should never happen in valid vmlinux build */ + return -EINVAL; + t = 
btf_type_by_id(btf_vmlinux, t->type);
+		if (!btf_type_is_func_proto(t))
+			/* should never happen in valid vmlinux build */
+			return -EINVAL;
+
+		/* remember two read only pointers that are valid for
+		 * the life time of the kernel
+		 */
+		prog->aux->attach_func_name = tname;
+		prog->aux->attach_func_proto = t;
+		prog->aux->attach_btf_trace = true;
+	}
+	return 0;
+}
+
 int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 	      union bpf_attr __user *uattr)
 {
@@ -9435,9 +9481,13 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 		/* Either gcc or pahole or kernel are broken. */
 		verbose(env, "in-kernel BTF is malformed\n");
 		ret = PTR_ERR(btf_vmlinux);
-		goto err_unlock;
+		goto skip_full_check;
 	}
 
+	ret = check_attach_btf_id(env);
+	if (ret)
+		goto skip_full_check;
+
 	env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
 		env->strict_alignment = true;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index c3240898cc44..571c25d60710 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1080,6 +1080,8 @@ static bool raw_tp_prog_is_valid_access(int off, int size,
 		return false;
 	if (off % size != 0)
 		return false;
+	if (!prog->aux->attach_btf_id)
+		return true;
 	return btf_ctx_access(off, size, type, prog, info);
 }
--
cgit v1.2.3


From f1b9509c2fb0ef4db8d22dac9aef8e856a5d81f6 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Wed, 30 Oct 2019 15:32:11 -0700
Subject: bpf: Replace prog_raw_tp+btf_id with prog_tracing

The bpf program type raw_tp together with 'expected_attach_type' was the
most appropriate api to indicate BTF-enabled raw_tp programs. But during
development it became apparent that 'expected_attach_type' cannot be used
and a new 'attach_btf_id' field had to be introduced, which means that the
information is duplicated in two fields where one of them is ignored.

Clean it up by introducing a new program type where both
'expected_attach_type' and 'attach_btf_id' fields have specific meaning. In
the future 'expected_attach_type' will be extended with other attach points
that have similar semantics to raw_tp.

This patch replaces BTF-enabled BPF_PROG_TYPE_RAW_TRACEPOINT with

  prog_type = BPF_PROG_TYPE_TRACING
  expected_attach_type = BPF_TRACE_RAW_TP
  attach_btf_id = btf_id of the raw tracepoint inside the kernel

Future patches will add

  expected_attach_type = BPF_TRACE_FENTRY or BPF_TRACE_FEXIT

where programs have the same input context and the same helpers, but
different attach points.

Signed-off-by: Alexei Starovoitov
Signed-off-by: Daniel Borkmann
Acked-by: Andrii Nakryiko
Acked-by: Martin KaFai Lau
Link: https://lore.kernel.org/bpf/20191030223212.953010-2-ast@kernel.org
---
 include/linux/bpf.h       |  5 +++++
 include/linux/bpf_types.h |  1 +
 include/uapi/linux/bpf.h  |  2 ++
 kernel/bpf/syscall.c      |  6 +++---
 kernel/bpf/verifier.c     | 34 ++++++++++++++++++++++----------
 kernel/trace/bpf_trace.c  | 44 ++++++++++++++++++++++++++++++++++--------
 6 files changed, 71 insertions(+), 21 deletions(-)

(limited to 'kernel/trace')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 171be30fe0ae..80158cff44bd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -373,6 +373,11 @@ enum bpf_cgroup_storage_type {
 
 #define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX
 
+/* The longest tracepoint has 12 args.
+ * See include/trace/bpf_probe.h + */ +#define MAX_BPF_FUNC_ARGS 12 + struct bpf_prog_stats { u64 cnt; u64 nsecs; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 36a9c2325176..de14872b01ba 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -26,6 +26,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable) +BPF_PROG_TYPE(BPF_PROG_TYPE_TRACING, tracing) #endif #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4af8b0819a32..a6bf19dabaab 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -173,6 +173,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_CGROUP_SYSCTL, BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, BPF_PROG_TYPE_CGROUP_SOCKOPT, + BPF_PROG_TYPE_TRACING, }; enum bpf_attach_type { @@ -199,6 +200,7 @@ enum bpf_attach_type { BPF_CGROUP_UDP6_RECVMSG, BPF_CGROUP_GETSOCKOPT, BPF_CGROUP_SETSOCKOPT, + BPF_TRACE_RAW_TP, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ff5225759553..985d01ced196 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1571,7 +1571,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, u32 btf_id) { switch (prog_type) { - case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_TRACING: if (btf_id > BTF_MAX_TYPE) return -EINVAL; break; @@ -1833,13 +1833,13 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) return PTR_ERR(prog); if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && + prog->type != BPF_PROG_TYPE_TRACING && prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { err = -EINVAL; goto out_put_prog; } - if (prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT && - prog->aux->attach_btf_id) { + if (prog->type == BPF_PROG_TYPE_TRACING) { if (attr->raw_tracepoint.name) { /* raw_tp name should not be specified in raw_tp * programs that were verified via in-kernel BTF info diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6b0de04f8b91..2f2374967b36 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9381,24 +9381,36 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; u32 btf_id = prog->aux->attach_btf_id; + const char prefix[] = "btf_trace_"; const struct btf_type *t; const char *tname; - if (prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT && btf_id) { - const char prefix[] = "btf_trace_"; + if (prog->type != BPF_PROG_TYPE_TRACING) + return 0; - t = btf_type_by_id(btf_vmlinux, btf_id); - if (!t) { - verbose(env, "attach_btf_id %u is invalid\n", btf_id); - return -EINVAL; - } + if (!btf_id) { + verbose(env, "Tracing programs must provide btf_id\n"); + return -EINVAL; + } + t = btf_type_by_id(btf_vmlinux, btf_id); + if (!t) { + verbose(env, "attach_btf_id %u is invalid\n", btf_id); + return -EINVAL; + } + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + if (!tname) { + verbose(env, "attach_btf_id %u doesn't have a name\n", btf_id); + return -EINVAL; + } + + switch (prog->expected_attach_type) { + case BPF_TRACE_RAW_TP: if (!btf_type_is_typedef(t)) { verbose(env, "attach_btf_id %u is not a typedef\n", btf_id); return -EINVAL; } - tname = btf_name_by_offset(btf_vmlinux, t->name_off); - if (!tname || strncmp(prefix, tname, sizeof(prefix) - 1)) { + if (strncmp(prefix, tname, sizeof(prefix) - 1)) { 
verbose(env, "attach_btf_id %u points to wrong type name %s\n", btf_id, tname); return -EINVAL; @@ -9419,8 +9431,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_name = tname; prog->aux->attach_func_proto = t; prog->aux->attach_btf_trace = true; + return 0; + default: + return -EINVAL; } - return 0; } int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 571c25d60710..f50bf19f7a05 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1055,10 +1055,6 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto_raw_tp; -#ifdef CONFIG_NET - case BPF_FUNC_skb_output: - return &bpf_skb_output_proto; -#endif case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_raw_tp; case BPF_FUNC_get_stack: @@ -1068,20 +1064,44 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { +#ifdef CONFIG_NET + case BPF_FUNC_skb_output: + return &bpf_skb_output_proto; +#endif + default: + return raw_tp_prog_func_proto(func_id, prog); + } +} + static bool raw_tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { - /* largest tracepoint in the kernel has 12 args */ - if (off < 0 || off >= sizeof(__u64) * 12) + if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + return true; +} + +static bool tracing_prog_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) return false; if (type != BPF_READ) return false; if (off % size != 0) return false; - if (!prog->aux->attach_btf_id) - return true; return btf_ctx_access(off, size, type, prog, info); } @@ -1093,6 +1113,14 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { const struct bpf_prog_ops raw_tracepoint_prog_ops = { }; +const struct bpf_verifier_ops tracing_verifier_ops = { + .get_func_proto = tracing_prog_func_proto, + .is_valid_access = tracing_prog_is_valid_access, +}; + +const struct bpf_prog_ops tracing_prog_ops = { +}; + static bool raw_tp_writable_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, -- cgit v1.2.3 From eb1b66887472eaa7342305b7890ae510dd9d1a79 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Nov 2019 00:17:58 +0100 Subject: bpf: Make use of probe_user_write in probe write helper Convert the bpf_probe_write_user() helper to probe_user_write() such that writes are not attempted under KERNEL_DS anymore which is buggy as kernel and user space pointers can have overlapping addresses. Also, given we have the access_ok() check inside probe_user_write(), the helper doesn't need to do it twice. 
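[Illustration, not part of the patch: a minimal sketch of a program that exercises this helper, assuming libbpf's bpf_helpers.h/bpf_tracing.h and a kprobe on do_sys_open; the section name and replacement string are hypothetical. After the conversion above, a destination that is not a user-space address fails with a negative error instead of being written through under KERNEL_DS.]

#include <linux/ptrace.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("kprobe/do_sys_open")
int rewrite_user_path(struct pt_regs *ctx)
{
	/* second argument of do_sys_open(): user pointer to the pathname */
	char *upath = (char *)PT_REGS_PARM2(ctx);
	const char repl[] = "/tmp/redirected";	/* hypothetical payload */

	/* with this patch the write goes through probe_user_write(), so it
	 * can only ever land in the user address space */
	bpf_probe_write_user(upath, repl, sizeof(repl));
	return 0;
}

char _license[] SEC("license") = "GPL";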
Fixes: 96ae52279594 ("bpf: Add bpf_probe_write_user BPF helper to be called in tracers") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/841c461781874c07a0ee404a454c3bc0459eed30.1572649915.git.daniel@iogearbox.net --- kernel/trace/bpf_trace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f50bf19f7a05..2d87fcdcb19b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -163,7 +163,7 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, +BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src, u32, size) { /* @@ -186,10 +186,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, return -EPERM; if (unlikely(!nmi_uaccess_okay())) return -EPERM; - if (!access_ok(unsafe_ptr, size)) - return -EPERM; - return probe_kernel_write(unsafe_ptr, src, size); + return probe_user_write(unsafe_ptr, src, size); } static const struct bpf_func_proto bpf_probe_write_user_proto = { -- cgit v1.2.3 From 6ae08ae3dea2cfa03dd3665a3c8475c2d429ef47 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Nov 2019 00:17:59 +0100 Subject: bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers The current bpf_probe_read() and bpf_probe_read_str() helpers are broken in that they assume they can be used for probing memory access for kernel space addresses /as well as/ user space addresses. However, plain use of probe_kernel_read() for both cases will attempt to always access kernel space address space given access is performed under KERNEL_DS and some archs in-fact have overlapping address spaces where a kernel pointer and user pointer would have the /same/ address value and therefore accessing application memory via bpf_probe_read{,_str}() would read garbage values. Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess: Add non-pagefault user-space read functions"). Unfortunately, the only way to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}() and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}() helpers are kept as-is to retain their current behavior. The two *_user() variants attempt the access always under USER_DS set, the two *_kernel() variants will -EFAULT when accessing user memory if the underlying architecture has non-overlapping address ranges, also avoiding throwing the kernel warning via 00c42373d397 ("x86-64: add warning for non-canonical user access address dereferences"). 
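[Illustration, not part of the patch: a hedged sketch of how the split looks from the program side, assuming BTF-derived kernel types via a vmlinux.h header plus libbpf helper/tracing headers; the probed function and buffer sizes are illustrative. Each pointer's address space is now stated explicitly rather than guessed.]

#include "vmlinux.h"		/* assumed: BTF-generated kernel types */
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("kprobe/do_sys_open")
int trace_open(struct pt_regs *ctx)
{
	const char *ufilename = (const char *)PT_REGS_PARM2(ctx);
	struct task_struct *task = (void *)bpf_get_current_task();
	char path[64];
	char comm[16];

	/* user pointer: must go through the *_user() variant */
	bpf_probe_read_user_str(path, sizeof(path), ufilename);
	/* kernel pointer: the *_kernel() variant, which -EFAULTs on user
	 * addresses where the arch has non-overlapping address spaces */
	bpf_probe_read_kernel(&comm, sizeof(comm), &task->comm);
	return 0;
}

char _license[] SEC("license") = "GPL";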
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper") Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net --- include/uapi/linux/bpf.h | 122 ++++++++++++++++++--------- kernel/trace/bpf_trace.c | 181 ++++++++++++++++++++++++++++++----------- tools/include/uapi/linux/bpf.h | 122 ++++++++++++++++++--------- 3 files changed, 299 insertions(+), 126 deletions(-) (limited to 'kernel/trace') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a6bf19dabaab..df6809a76404 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -563,10 +563,13 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read(void *dst, u32 size, const void *src) + * int bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) * Description * For tracing programs, safely attempt to read *size* bytes from - * address *src* and store the data in *dst*. + * kernel space address *unsafe_ptr* and store the data in *dst*. + * + * Generally, use bpf_probe_read_user() or bpf_probe_read_kernel() + * instead. * Return * 0 on success, or a negative error in case of failure. * @@ -1428,45 +1431,14 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr) + * int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) * Description - * Copy a NUL terminated string from an unsafe address - * *unsafe_ptr* to *dst*. The *size* should include the - * terminating NUL byte. In case the string length is smaller than - * *size*, the target is not padded with further NUL bytes. If the - * string length is larger than *size*, just *size*-1 bytes are - * copied and the last byte is set to NUL. - * - * On success, the length of the copied string is returned. This - * makes this helper useful in tracing programs for reading - * strings, and more importantly to get its length at runtime. See - * the following snippet: - * - * :: - * - * SEC("kprobe/sys_open") - * void bpf_sys_open(struct pt_regs *ctx) - * { - * char buf[PATHLEN]; // PATHLEN is defined to 256 - * int res = bpf_probe_read_str(buf, sizeof(buf), - * ctx->di); - * - * // Consume buf, for example push it to - * // userspace via bpf_perf_event_output(); we - * // can use res (the string length) as event - * // size, after checking its boundaries. - * } - * - * In comparison, using **bpf_probe_read()** helper here instead - * to read the string would require to estimate the length at - * compile time, and would often result in copying more memory - * than necessary. + * Copy a NUL terminated string from an unsafe kernel address + * *unsafe_ptr* to *dst*. See bpf_probe_read_kernel_str() for + * more details. * - * Another useful use case is when parsing individual process - * arguments or individual environment variables navigating - * *current*\ **->mm->arg_start** and *current*\ - * **->mm->env_start**: using this helper and the return value, - * one can quickly iterate at the right offset of the memory area. + * Generally, use bpf_probe_read_user_str() or bpf_probe_read_kernel_str() + * instead. * Return * On success, the strictly positive length of the string, * including the trailing NUL character. 
On error, a negative @@ -2777,6 +2749,72 @@ union bpf_attr { * restricted to raw_tracepoint bpf programs. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Safely attempt to read *size* bytes from user space address + * *unsafe_ptr* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Safely attempt to read *size* bytes from kernel space address + * *unsafe_ptr* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe user address + * *unsafe_ptr* to *dst*. The *size* should include the + * terminating NUL byte. In case the string length is smaller than + * *size*, the target is not padded with further NUL bytes. If the + * string length is larger than *size*, just *size*-1 bytes are + * copied and the last byte is set to NUL. + * + * On success, the length of the copied string is returned. This + * makes this helper useful in tracing programs for reading + * strings, and more importantly to get its length at runtime. See + * the following snippet: + * + * :: + * + * SEC("kprobe/sys_open") + * void bpf_sys_open(struct pt_regs *ctx) + * { + * char buf[PATHLEN]; // PATHLEN is defined to 256 + * int res = bpf_probe_read_user_str(buf, sizeof(buf), + * ctx->di); + * + * // Consume buf, for example push it to + * // userspace via bpf_perf_event_output(); we + * // can use res (the string length) as event + * // size, after checking its boundaries. + * } + * + * In comparison, using **bpf_probe_read_user()** helper here + * instead to read the string would require to estimate the length + * at compile time, and would often result in copying more memory + * than necessary. + * + * Another useful use case is when parsing individual process + * arguments or individual environment variables navigating + * *current*\ **->mm->arg_start** and *current*\ + * **->mm->env_start**: using this helper and the return value, + * one can quickly iterate at the right offset of the memory area. + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + * + * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* + * to *dst*. Same semantics as with bpf_probe_read_user_str() apply. + * Return + * On success, the strictly positive length of the string, including + * the trailing NUL character. On error, a negative value. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2890,7 +2928,11 @@ union bpf_attr { FN(sk_storage_delete), \ FN(send_signal), \ FN(tcp_gen_syncookie), \ - FN(skb_output), + FN(skb_output), \ + FN(probe_read_user), \ + FN(probe_read_kernel), \ + FN(probe_read_user_str), \ + FN(probe_read_kernel_str), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 2d87fcdcb19b..ffc91d4935ac 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -138,24 +138,140 @@ static const struct bpf_func_proto bpf_override_return_proto = { }; #endif -BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) +BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size, + const void __user *, unsafe_ptr) { - int ret; + int ret = probe_user_read(dst, unsafe_ptr, size); - ret = security_locked_down(LOCKDOWN_BPF_READ); - if (ret < 0) - goto out; + if (unlikely(ret < 0)) + memset(dst, 0, size); + + return ret; +} + +static const struct bpf_func_proto bpf_probe_read_user_proto = { + .func = bpf_probe_read_user, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size, + const void __user *, unsafe_ptr) +{ + int ret = strncpy_from_unsafe_user(dst, unsafe_ptr, size); + + if (unlikely(ret < 0)) + memset(dst, 0, size); + + return ret; +} + +static const struct bpf_func_proto bpf_probe_read_user_str_proto = { + .func = bpf_probe_read_user_str, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; - ret = probe_kernel_read(dst, unsafe_ptr, size); +static __always_inline int +bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr, + const bool compat) +{ + int ret = security_locked_down(LOCKDOWN_BPF_READ); + + if (unlikely(ret < 0)) + goto out; + ret = compat ? 
probe_kernel_read(dst, unsafe_ptr, size) : + probe_kernel_read_strict(dst, unsafe_ptr, size); if (unlikely(ret < 0)) out: memset(dst, 0, size); + return ret; +} + +BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, false); +} + +static const struct bpf_func_proto bpf_probe_read_kernel_proto = { + .func = bpf_probe_read_kernel, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, true); +} +static const struct bpf_func_proto bpf_probe_read_compat_proto = { + .func = bpf_probe_read_compat, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +static __always_inline int +bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr, + const bool compat) +{ + int ret = security_locked_down(LOCKDOWN_BPF_READ); + + if (unlikely(ret < 0)) + goto out; + /* + * The strncpy_from_unsafe_*() call will likely not fill the entire + * buffer, but that's okay in this circumstance as we're probing + * arbitrary memory anyway similar to bpf_probe_read_*() and might + * as well probe the stack. Thus, memory is explicitly cleared + * only in error case, so that improper users ignoring return + * code altogether don't copy garbage; otherwise length of string + * is returned that can be used for bpf_perf_event_output() et al. + */ + ret = compat ? strncpy_from_unsafe(dst, unsafe_ptr, size) : + strncpy_from_unsafe_strict(dst, unsafe_ptr, size); + if (unlikely(ret < 0)) +out: + memset(dst, 0, size); return ret; } -static const struct bpf_func_proto bpf_probe_read_proto = { - .func = bpf_probe_read, +BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, false); +} + +static const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { + .func = bpf_probe_read_kernel_str, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_probe_read_compat_str, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, true); +} + +static const struct bpf_func_proto bpf_probe_read_compat_str_proto = { + .func = bpf_probe_read_compat_str, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, @@ -583,41 +699,6 @@ static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size, - const void *, unsafe_ptr) -{ - int ret; - - ret = security_locked_down(LOCKDOWN_BPF_READ); - if (ret < 0) - goto out; - - /* - * The strncpy_from_unsafe() call will likely not fill the entire - * buffer, but that's okay in this circumstance as we're probing - * arbitrary memory anyway similar to bpf_probe_read() and might - * as well probe the stack. Thus, memory is explicitly cleared - * only in error case, so that improper users ignoring return - * code altogether don't copy garbage; otherwise length of string - * is returned that can be used for bpf_perf_event_output() et al. 
- */ - ret = strncpy_from_unsafe(dst, unsafe_ptr, size); - if (unlikely(ret < 0)) -out: - memset(dst, 0, size); - - return ret; -} - -static const struct bpf_func_proto bpf_probe_read_str_proto = { - .func = bpf_probe_read_str, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_UNINIT_MEM, - .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_ANYTHING, -}; - struct send_signal_irq_work { struct irq_work irq_work; struct task_struct *task; @@ -697,8 +778,6 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_pop_elem_proto; case BPF_FUNC_map_peek_elem: return &bpf_map_peek_elem_proto; - case BPF_FUNC_probe_read: - return &bpf_probe_read_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; case BPF_FUNC_tail_call: @@ -725,8 +804,18 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_current_task_under_cgroup_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; + case BPF_FUNC_probe_read_user: + return &bpf_probe_read_user_proto; + case BPF_FUNC_probe_read_kernel: + return &bpf_probe_read_kernel_proto; + case BPF_FUNC_probe_read: + return &bpf_probe_read_compat_proto; + case BPF_FUNC_probe_read_user_str: + return &bpf_probe_read_user_str_proto; + case BPF_FUNC_probe_read_kernel_str: + return &bpf_probe_read_kernel_str_proto; case BPF_FUNC_probe_read_str: - return &bpf_probe_read_str_proto; + return &bpf_probe_read_compat_str_proto; #ifdef CONFIG_CGROUPS case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a6bf19dabaab..df6809a76404 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -563,10 +563,13 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read(void *dst, u32 size, const void *src) + * int bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) * Description * For tracing programs, safely attempt to read *size* bytes from - * address *src* and store the data in *dst*. + * kernel space address *unsafe_ptr* and store the data in *dst*. + * + * Generally, use bpf_probe_read_user() or bpf_probe_read_kernel() + * instead. * Return * 0 on success, or a negative error in case of failure. * @@ -1428,45 +1431,14 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr) + * int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) * Description - * Copy a NUL terminated string from an unsafe address - * *unsafe_ptr* to *dst*. The *size* should include the - * terminating NUL byte. In case the string length is smaller than - * *size*, the target is not padded with further NUL bytes. If the - * string length is larger than *size*, just *size*-1 bytes are - * copied and the last byte is set to NUL. - * - * On success, the length of the copied string is returned. This - * makes this helper useful in tracing programs for reading - * strings, and more importantly to get its length at runtime. 
See - * the following snippet: - * - * :: - * - * SEC("kprobe/sys_open") - * void bpf_sys_open(struct pt_regs *ctx) - * { - * char buf[PATHLEN]; // PATHLEN is defined to 256 - * int res = bpf_probe_read_str(buf, sizeof(buf), - * ctx->di); - * - * // Consume buf, for example push it to - * // userspace via bpf_perf_event_output(); we - * // can use res (the string length) as event - * // size, after checking its boundaries. - * } - * - * In comparison, using **bpf_probe_read()** helper here instead - * to read the string would require to estimate the length at - * compile time, and would often result in copying more memory - * than necessary. + * Copy a NUL terminated string from an unsafe kernel address + * *unsafe_ptr* to *dst*. See bpf_probe_read_kernel_str() for + * more details. * - * Another useful use case is when parsing individual process - * arguments or individual environment variables navigating - * *current*\ **->mm->arg_start** and *current*\ - * **->mm->env_start**: using this helper and the return value, - * one can quickly iterate at the right offset of the memory area. + * Generally, use bpf_probe_read_user_str() or bpf_probe_read_kernel_str() + * instead. * Return * On success, the strictly positive length of the string, * including the trailing NUL character. On error, a negative @@ -2777,6 +2749,72 @@ union bpf_attr { * restricted to raw_tracepoint bpf programs. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Safely attempt to read *size* bytes from user space address + * *unsafe_ptr* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Safely attempt to read *size* bytes from kernel space address + * *unsafe_ptr* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe user address + * *unsafe_ptr* to *dst*. The *size* should include the + * terminating NUL byte. In case the string length is smaller than + * *size*, the target is not padded with further NUL bytes. If the + * string length is larger than *size*, just *size*-1 bytes are + * copied and the last byte is set to NUL. + * + * On success, the length of the copied string is returned. This + * makes this helper useful in tracing programs for reading + * strings, and more importantly to get its length at runtime. See + * the following snippet: + * + * :: + * + * SEC("kprobe/sys_open") + * void bpf_sys_open(struct pt_regs *ctx) + * { + * char buf[PATHLEN]; // PATHLEN is defined to 256 + * int res = bpf_probe_read_user_str(buf, sizeof(buf), + * ctx->di); + * + * // Consume buf, for example push it to + * // userspace via bpf_perf_event_output(); we + * // can use res (the string length) as event + * // size, after checking its boundaries. + * } + * + * In comparison, using **bpf_probe_read_user()** helper here + * instead to read the string would require to estimate the length + * at compile time, and would often result in copying more memory + * than necessary. 
+ * + * Another useful use case is when parsing individual process + * arguments or individual environment variables navigating + * *current*\ **->mm->arg_start** and *current*\ + * **->mm->env_start**: using this helper and the return value, + * one can quickly iterate at the right offset of the memory area. + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + * + * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* + * to *dst*. Same semantics as with bpf_probe_read_user_str() apply. + * Return + * On success, the strictly positive length of the string, including + * the trailing NUL character. On error, a negative value. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2890,7 +2928,11 @@ union bpf_attr { FN(sk_storage_delete), \ FN(send_signal), \ FN(tcp_gen_syncookie), \ - FN(skb_output), + FN(skb_output), \ + FN(probe_read_user), \ + FN(probe_read_kernel), \ + FN(probe_read_user_str), \ + FN(probe_read_kernel_str), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 7162431dcf72032835d369c8d7b51311df407938 Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Wed, 16 Oct 2019 13:33:13 +0200 Subject: ftrace: Introduce PERMANENT ftrace_ops flag Livepatch uses ftrace for redirection to new patched functions. It means that if ftrace is disabled, all live patched functions are disabled as well. Toggling global 'ftrace_enabled' sysctl thus affect it directly. It is not a problem per se, because only administrator can set sysctl values, but it still may be surprising. Introduce PERMANENT ftrace_ops flag to amend this. If the FTRACE_OPS_FL_PERMANENT is set on any ftrace ops, the tracing cannot be disabled by disabling ftrace_enabled. Equally, a callback with the flag set cannot be registered if ftrace_enabled is disabled. Link: http://lkml.kernel.org/r/20191016113316.13415-2-mbenes@suse.cz Reviewed-by: Petr Mladek Reviewed-by: Kamalesh Babulal Signed-off-by: Miroslav Benes Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/ftrace-uses.rst | 8 ++++++++ Documentation/trace/ftrace.rst | 4 +++- include/linux/ftrace.h | 3 +++ kernel/livepatch/patch.c | 3 ++- kernel/trace/ftrace.c | 23 +++++++++++++++++++++-- 5 files changed, 37 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/Documentation/trace/ftrace-uses.rst b/Documentation/trace/ftrace-uses.rst index 1fbc69894eed..740bd0224d35 100644 --- a/Documentation/trace/ftrace-uses.rst +++ b/Documentation/trace/ftrace-uses.rst @@ -170,6 +170,14 @@ FTRACE_OPS_FL_RCU a callback may be executed and RCU synchronization will not protect it. +FTRACE_OPS_FL_PERMANENT + If this is set on any ftrace ops, then the tracing cannot disabled by + writing 0 to the proc sysctl ftrace_enabled. Equally, a callback with + the flag set cannot be registered if ftrace_enabled is 0. + + Livepatch uses it not to lose the function redirection, so the system + stays protected. + Filtering which functions to trace ================================== diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index e3060eedb22d..d2b5657ed33e 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -2976,7 +2976,9 @@ Note, the proc sysctl ftrace_enable is a big on/off switch for the function tracer. 
By default it is enabled (when function tracing is enabled in the kernel). If it is disabled, all function tracing is disabled. This includes not only the function tracers for ftrace, but -also for any other uses (perf, kprobes, stack tracing, profiling, etc). +also for any other uses (perf, kprobes, stack tracing, profiling, etc). It +cannot be disabled if there is a callback with FTRACE_OPS_FL_PERMANENT set +registered. Please disable this with care. diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 8a8cb3c401b2..8385cafe4f9f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -142,6 +142,8 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); * PID - Is affected by set_ftrace_pid (allows filtering on those pids) * RCU - Set when the ops can only be called when RCU is watching. * TRACE_ARRAY - The ops->private points to a trace_array descriptor. + * PERMANENT - Set when the ops is permanent and should not be affected by + * ftrace_enabled. */ enum { FTRACE_OPS_FL_ENABLED = 1 << 0, @@ -160,6 +162,7 @@ enum { FTRACE_OPS_FL_PID = 1 << 13, FTRACE_OPS_FL_RCU = 1 << 14, FTRACE_OPS_FL_TRACE_ARRAY = 1 << 15, + FTRACE_OPS_FL_PERMANENT = 1 << 16, }; #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index bd43537702bd..b552cf2d85f8 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -196,7 +196,8 @@ static int klp_patch_func(struct klp_func *func) ops->fops.func = klp_ftrace_handler; ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_DYNAMIC | - FTRACE_OPS_FL_IPMODIFY; + FTRACE_OPS_FL_IPMODIFY | + FTRACE_OPS_FL_PERMANENT; list_add(&ops->node, &klp_ops); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f296d89be757..89e9128652ef 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -326,6 +326,8 @@ int __register_ftrace_function(struct ftrace_ops *ops) if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED) ops->flags |= FTRACE_OPS_FL_SAVE_REGS; #endif + if (!ftrace_enabled && (ops->flags & FTRACE_OPS_FL_PERMANENT)) + return -EBUSY; if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; @@ -6754,6 +6756,18 @@ int unregister_ftrace_function(struct ftrace_ops *ops) } EXPORT_SYMBOL_GPL(unregister_ftrace_function); +static bool is_permanent_ops_registered(void) +{ + struct ftrace_ops *op; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op->flags & FTRACE_OPS_FL_PERMANENT) + return true; + } while_for_each_ftrace_op(op); + + return false; +} + int ftrace_enable_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -6771,8 +6785,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) goto out; - last_ftrace_enabled = !!ftrace_enabled; - if (ftrace_enabled) { /* we are starting ftrace again */ @@ -6783,12 +6795,19 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, ftrace_startup_sysctl(); } else { + if (is_permanent_ops_registered()) { + ftrace_enabled = true; + ret = -EBUSY; + goto out; + } + /* stopping ftrace calls (just send to ftrace_stub) */ ftrace_trace_function = ftrace_stub; ftrace_shutdown_sysctl(); } + last_ftrace_enabled = !!ftrace_enabled; out: mutex_unlock(&ftrace_lock); return ret; -- cgit v1.2.3 From fbf6c73c5b264c25484fa9f449b5546569fe11f0 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 16 Oct 2019 17:51:10 +0100 Subject: ftrace: add ftrace_init_nop() Architectures may need to perform special initialization of 
ftrace callsites, and today they do so by special-casing ftrace_make_nop() when the expected branch address is MCOUNT_ADDR. In some cases (e.g. for patchable-function-entry), we don't have an mcount-like symbol and don't want a synthetic MCOUNT_ADDR, but we may need to perform some initialization of callsites. To make it possible to separate initialization from runtime modification, and to handle cases without an mcount-like symbol, this patch adds an optional ftrace_init_nop() function that architectures can implement, which does not pass a branch address. Where an architecture does not provide ftrace_init_nop(), we will fall back to the existing behaviour of calling ftrace_make_nop() with MCOUNT_ADDR. At the same time, ftrace_code_disable() is renamed to ftrace_nop_initialize() to make it clearer that it is intended to initialize a callsite into a disabled state, and is not for disabling a callsite that has been runtime enabled. The kerneldoc description of rec arguments is updated to cover non-mcount callsites. Signed-off-by: Mark Rutland Reviewed-by: Amit Daniel Kachhap Reviewed-by: Ard Biesheuvel Reviewed-by: Miroslav Benes Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Torsten Duwe Tested-by: Amit Daniel Kachhap Tested-by: Sven Schnelle Tested-by: Torsten Duwe Cc: Ingo Molnar --- include/linux/ftrace.h | 35 ++++++++++++++++++++++++++++++++--- kernel/trace/ftrace.c | 6 +++--- 2 files changed, 35 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 8a8cb3c401b2..9867d90d635e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -499,7 +499,7 @@ static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; }
 /**
 * ftrace_make_nop - convert code into nop
 * @mod: module structure if called by module load initialization
- * @rec: the mcount call site record
+ * @rec: the call site record (e.g. mcount/fentry)
 * @addr: the address that the call site should be calling
 *
 * This is a very sensitive operation and great care needs
@@ -520,9 +520,38 @@ static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; }
 extern int ftrace_make_nop(struct module *mod,
 struct dyn_ftrace *rec, unsigned long addr);
+
+/**
+ * ftrace_init_nop - initialize a nop call site
+ * @mod: module structure if called by module load initialization
+ * @rec: the call site record (e.g. mcount/fentry)
+ *
+ * This is a very sensitive operation and great care needs
+ * to be taken by the arch. The operation should carefully
+ * read the location, check to see if what is read is indeed
+ * what we expect it to be, and then on success of the compare,
+ * it should write to the location.
+ *
+ * The code segment at @rec->ip should contain the contents created by
+ * the compiler
+ *
+ * Return must be:
+ * 0 on success
+ * -EFAULT on error reading the location
+ * -EINVAL on a failed compare of the contents
+ * -EPERM on error writing to the location
+ * Any other value will be considered a failure.
+ */
+#ifndef ftrace_init_nop
+static inline int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec)
+{
+ return ftrace_make_nop(mod, rec, MCOUNT_ADDR);
+}
+#endif
+
 /**
 * ftrace_make_call - convert a nop call site into a call to addr
- * @rec: the call site record (e.g.
mcount/fentry) * @addr: the address that the call site should call * * This is a very sensitive operation and great care needs @@ -545,7 +574,7 @@ extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS /** * ftrace_modify_call - convert from one addr to another (no nop) - * @rec: the mcount call site record + * @rec: the call site record (e.g. mcount/fentry) * @old_addr: the address expected to be currently called to * @addr: the address to change to * diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f296d89be757..5259d4dea675 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2494,14 +2494,14 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) } static int -ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) +ftrace_nop_initialize(struct module *mod, struct dyn_ftrace *rec) { int ret; if (unlikely(ftrace_disabled)) return 0; - ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); + ret = ftrace_init_nop(mod, rec); if (ret) { ftrace_bug_type = FTRACE_BUG_INIT; ftrace_bug(ret, rec); @@ -2943,7 +2943,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) * to the NOP instructions. */ if (!__is_defined(CC_USING_NOP_MCOUNT) && - !ftrace_code_disable(mod, p)) + !ftrace_nop_initialize(mod, p)) break; update_cnt++; -- cgit v1.2.3 From 714641c3670cdc75371a7ff5bdfd5e9a170c7ffd Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 12:25:46 -0500 Subject: ftrace: Separate out the copying of a ftrace_hash from __ftrace_hash_move() Most of the functionality of __ftrace_hash_move() can be reused, but not all of it. That is, __ftrace_hash_move() is used to simply make a new hash from an existing one, using the same size as the original. Creating a dup_hash(), where we can specify a new size will be useful when we want to create a hash with a default size, or simply copy the old one. Signed-off-by: Steven Rostedt (VMWare) --- kernel/trace/ftrace.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 89e9128652ef..76e5de8c7822 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1372,23 +1372,15 @@ ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, struct ftrace_hash *new_hash); -static struct ftrace_hash * -__ftrace_hash_move(struct ftrace_hash *src) +static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size) { struct ftrace_func_entry *entry; - struct hlist_node *tn; - struct hlist_head *hhd; struct ftrace_hash *new_hash; - int size = src->count; + struct hlist_head *hhd; + struct hlist_node *tn; int bits = 0; int i; - /* - * If the new source is empty, just return the empty_hash. - */ - if (ftrace_hash_empty(src)) - return EMPTY_HASH; - /* * Make the hash size about 1/2 the # found */ @@ -1413,10 +1405,23 @@ __ftrace_hash_move(struct ftrace_hash *src) __add_hash_entry(new_hash, entry); } } - return new_hash; } +static struct ftrace_hash * +__ftrace_hash_move(struct ftrace_hash *src) +{ + int size = src->count; + + /* + * If the new source is empty, just return the empty_hash. 
+ */ + if (ftrace_hash_empty(src)) + return EMPTY_HASH; + + return dup_hash(src, size); +} + static int ftrace_hash_move(struct ftrace_ops *ops, int enable, struct ftrace_hash **dst, struct ftrace_hash *src) -- cgit v1.2.3 From 7e16f581a81759bafea04d049134b32d1a881226 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 12:26:46 -0500 Subject: ftrace: Separate out functionality from ftrace_location_range() Create a new function called lookup_rec() from the functionality of ftrace_location_range(). The difference between lookup_rec() is that it returns the record that it finds, where as ftrace_location_range() returns only if it found a match or not. The lookup_rec() is static, and can be used for new functionality where ftrace needs to find a record of a specific address. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 76e5de8c7822..b0e7f03919de 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1541,6 +1541,26 @@ static int ftrace_cmp_recs(const void *a, const void *b) return 0; } +static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec = NULL; + struct dyn_ftrace key; + + key.ip = start; + key.flags = end; /* overload flags, as it is unsigned long */ + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + if (end < pg->records[0].ip || + start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) + continue; + rec = bsearch(&key, pg->records, pg->index, + sizeof(struct dyn_ftrace), + ftrace_cmp_recs); + } + return rec; +} + /** * ftrace_location_range - return the first address of a traced location * if it touches the given ip range @@ -1555,23 +1575,11 @@ static int ftrace_cmp_recs(const void *a, const void *b) */ unsigned long ftrace_location_range(unsigned long start, unsigned long end) { - struct ftrace_page *pg; struct dyn_ftrace *rec; - struct dyn_ftrace key; - key.ip = start; - key.flags = end; /* overload flags, as it is unsigned long */ - - for (pg = ftrace_pages_start; pg; pg = pg->next) { - if (end < pg->records[0].ip || - start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) - continue; - rec = bsearch(&key, pg->records, pg->index, - sizeof(struct dyn_ftrace), - ftrace_cmp_recs); - if (rec) - return rec->ip; - } + rec = lookup_rec(start, end); + if (rec) + return rec->ip; return 0; } -- cgit v1.2.3 From 153bedbac2ebd475e1c7c2d2fa0c042f5525927d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Nov 2019 17:08:55 +0100 Subject: irq_work: Convert flags to atomic_t We need to convert flags to atomic_t in order to later fix an ordering issue on atomic_cmpxchg() failure. This will allow us to use atomic_fetch_or(). Also clarify the nature of those flags. [ mingo: Converted two more usage site the original patch missed. ] Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Paul E . 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191108160858.31665-2-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/irq_work.h | 10 +++++++--- kernel/bpf/stackmap.c | 2 +- kernel/irq_work.c | 18 +++++++++--------- kernel/printk/printk.c | 2 +- kernel/trace/bpf_trace.c | 2 +- 5 files changed, 19 insertions(+), 15 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index b11fcdfd0770..02da997ad12c 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -22,7 +22,7 @@ #define IRQ_WORK_CLAIMED (IRQ_WORK_PENDING | IRQ_WORK_BUSY) struct irq_work { - unsigned long flags; + atomic_t flags; struct llist_node llnode; void (*func)(struct irq_work *); }; @@ -30,11 +30,15 @@ struct irq_work { static inline void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *)) { - work->flags = 0; + atomic_set(&work->flags, 0); work->func = func; } -#define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { .func = (_f), } +#define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { \ + .flags = ATOMIC_INIT(0), \ + .func = (_f) \ +} + bool irq_work_queue(struct irq_work *work); bool irq_work_queue_on(struct irq_work *work, int cpu); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 052580c33d26..4d31284095e2 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -289,7 +289,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, if (in_nmi()) { work = this_cpu_ptr(&up_read_work); - if (work->irq_work.flags & IRQ_WORK_BUSY) + if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) /* cannot queue more up_read, fallback */ irq_work_busy = true; } diff --git a/kernel/irq_work.c b/kernel/irq_work.c index d42acaf81886..df0dbf4d859b 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -29,16 +29,16 @@ static DEFINE_PER_CPU(struct llist_head, lazy_list); */ static bool irq_work_claim(struct irq_work *work) { - unsigned long flags, oflags, nflags; + int flags, oflags, nflags; /* * Start with our best wish as a premise but only trust any * flag value after cmpxchg() result. */ - flags = work->flags & ~IRQ_WORK_PENDING; + flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING; for (;;) { nflags = flags | IRQ_WORK_CLAIMED; - oflags = cmpxchg(&work->flags, flags, nflags); + oflags = atomic_cmpxchg(&work->flags, flags, nflags); if (oflags == flags) break; if (oflags & IRQ_WORK_PENDING) @@ -61,7 +61,7 @@ void __weak arch_irq_work_raise(void) static void __irq_work_queue_local(struct irq_work *work) { /* If the work is "lazy", handle it from next tick if any */ - if (work->flags & IRQ_WORK_LAZY) { + if (atomic_read(&work->flags) & IRQ_WORK_LAZY) { if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && tick_nohz_tick_stopped()) arch_irq_work_raise(); @@ -143,7 +143,7 @@ static void irq_work_run_list(struct llist_head *list) { struct irq_work *work, *tmp; struct llist_node *llnode; - unsigned long flags; + int flags; BUG_ON(!irqs_disabled()); @@ -159,15 +159,15 @@ static void irq_work_run_list(struct llist_head *list) * to claim that work don't rely on us to handle their data * while we are in the middle of the func. */ - flags = work->flags & ~IRQ_WORK_PENDING; - xchg(&work->flags, flags); + flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING; + atomic_xchg(&work->flags, flags); work->func(work); /* * Clear the BUSY bit and return to the free state if * no-one else claimed it meanwhile. 
*/
- (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
+ (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
 }
 }
@@ -199,7 +199,7 @@ void irq_work_sync(struct irq_work *work)
 {
 lockdep_assert_irqs_enabled();
- while (work->flags & IRQ_WORK_BUSY)
+ while (atomic_read(&work->flags) & IRQ_WORK_BUSY)
 cpu_relax();
 }
 EXPORT_SYMBOL_GPL(irq_work_sync);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ca65327a6de8..865727373a3b 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2961,7 +2961,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
 static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
 .func = wake_up_klogd_work_func,
- .flags = IRQ_WORK_LAZY,
+ .flags = ATOMIC_INIT(IRQ_WORK_LAZY),
 };
 void wake_up_klogd(void)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 44bd08f2443b..ff467a4e2639 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -660,7 +660,7 @@ BPF_CALL_1(bpf_send_signal, u32, sig)
 return -EINVAL;
 work = this_cpu_ptr(&send_signal_work);
- if (work->irq_work.flags & IRQ_WORK_BUSY)
+ if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY)
 return -EBUSY;
 /* Add the current task, which is the target of sending signal,
-- cgit v1.2.3 From 67c0496e87d193b8356d2af49ab95e8a1b954b3c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:30 -0800 Subject: kernfs: convert kernfs_node->id from union kernfs_node_id to u64 kernfs_node->id is currently a union kernfs_node_id which represents either a 32bit (ino, gen) pair or u64 value. I can't see much value in the usage of the union - all that's needed is a 64bit ID which the current code is already limited to. Using a union makes the code unnecessarily complicated and prevents using 64bit ino without adding practical benefits. This patch drops union kernfs_node_id and makes kernfs_node->id a u64. ino is stored in the lower 32bits and gen in the upper. Accessors - kernfs[_id]_ino() and kernfs[_id]_gen() - are added to retrieve the ino and gen. This makes ID handling less cumbersome and will allow using 64bit inos on supported archs. This patch doesn't make any functional changes.
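[Illustration, not part of the patch: a standalone sketch of the packing scheme, with the ino in the low 32 bits and the generation in the high 32 bits, mirroring the kn->id assignment and the kernfs_id_ino()/kernfs_id_gen() accessors introduced above; the sample values are arbitrary.]

#include <assert.h>
#include <stdint.h>

/* mirrors the in-kernel assignment: kn->id = (u64)gen << 32 | ret; */
static uint64_t pack_kernfs_id(uint32_t ino, uint32_t gen)
{
	return (uint64_t)gen << 32 | ino;
}

int main(void)
{
	uint64_t id = pack_kernfs_id(4242 /* ino */, 7 /* gen */);

	assert((uint32_t)id == 4242);		/* kernfs_id_ino() */
	assert((uint32_t)(id >> 32) == 7);	/* kernfs_id_gen() */
	return 0;
}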
Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman Cc: Namhyung Kim Cc: Jens Axboe Cc: Alexei Starovoitov --- fs/kernfs/dir.c | 10 +++--- fs/kernfs/file.c | 4 +-- fs/kernfs/inode.c | 4 +-- fs/kernfs/mount.c | 7 ++--- include/linux/cgroup.h | 17 +++++----- include/linux/kernfs.h | 45 ++++++++++++++++----------- include/trace/events/writeback.h | 4 +-- kernel/bpf/helpers.c | 2 +- kernel/bpf/local_storage.c | 2 +- kernel/cgroup/cgroup.c | 3 +- kernel/trace/blktrace.c | 67 +++++++++++++++++++--------------------- net/core/filter.c | 4 +-- 12 files changed, 85 insertions(+), 84 deletions(-) (limited to 'kernel/trace') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index beabb585a7d8..c67afb591e5b 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -532,7 +532,7 @@ void kernfs_put(struct kernfs_node *kn) kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } spin_lock(&kernfs_idr_lock); - idr_remove(&root->ino_idr, kn->id.ino); + idr_remove(&root->ino_idr, kernfs_ino(kn)); spin_unlock(&kernfs_idr_lock); kmem_cache_free(kernfs_node_cache, kn); @@ -639,8 +639,8 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, idr_preload_end(); if (ret < 0) goto err_out2; - kn->id.ino = ret; - kn->id.generation = gen; + + kn->id = (u64)gen << 32 | ret; atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); @@ -671,7 +671,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, return kn; err_out3: - idr_remove(&root->ino_idr, kn->id.ino); + idr_remove(&root->ino_idr, kernfs_ino(kn)); err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: @@ -1656,7 +1656,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) const char *name = pos->name; unsigned int type = dt_type(pos); int len = strlen(name); - ino_t ino = pos->id.ino; + ino_t ino = kernfs_ino(pos); ctx->pos = pos->hash; file->private_data = pos; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e8c792b49616..34366db3620d 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -892,7 +892,7 @@ repeat: * have the matching @file available. Look up the inodes * and generate the events manually. 
*/ - inode = ilookup(info->sb, kn->id.ino); + inode = ilookup(info->sb, kernfs_ino(kn)); if (!inode) continue; @@ -901,7 +901,7 @@ repeat: if (parent) { struct inode *p_inode; - p_inode = ilookup(info->sb, parent->id.ino); + p_inode = ilookup(info->sb, kernfs_ino(parent)); if (p_inode) { fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD, inode, FSNOTIFY_EVENT_INODE, &name, 0); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index f3eaa8869f42..eac277c63d42 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -201,7 +201,7 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) inode->i_private = kn; inode->i_mapping->a_ops = &kernfs_aops; inode->i_op = &kernfs_iops; - inode->i_generation = kn->id.generation; + inode->i_generation = kernfs_gen(kn); set_default_inode_attr(inode, kn->mode); kernfs_refresh_inode(kn, inode); @@ -247,7 +247,7 @@ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { struct inode *inode; - inode = iget_locked(sb, kn->id.ino); + inode = iget_locked(sb, kernfs_ino(kn)); if (inode && (inode->i_state & I_NEW)) kernfs_init_inode(kn, inode); diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 067b7c380056..f05d5d6f926d 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -57,15 +57,14 @@ const struct super_operations kernfs_sops = { * Similar to kernfs_fh_get_inode, this one gets kernfs node from inode * number and generation */ -struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, - const union kernfs_node_id *id) +struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, u64 id) { struct kernfs_node *kn; - kn = kernfs_find_and_get_node_by_ino(root, id->ino); + kn = kernfs_find_and_get_node_by_ino(root, kernfs_id_ino(id)); if (!kn) return NULL; - if (kn->id.generation != id->generation) { + if (kernfs_gen(kn) != kernfs_id_gen(id)) { kernfs_put(kn); return NULL; } diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f6b048902d6c..815fff49d555 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -616,7 +616,7 @@ static inline bool cgroup_is_populated(struct cgroup *cgrp) /* returns ino associated with a cgroup */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { - return cgrp->kn->id.ino; + return kernfs_ino(cgrp->kn); } /* cft/css accessors for cftype->write() operation */ @@ -687,13 +687,12 @@ static inline void cgroup_kthread_ready(void) current->no_cgroup_migration = 0; } -static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) +static inline u64 cgroup_get_kernfs_id(struct cgroup *cgrp) { - return &cgrp->kn->id; + return cgrp->kn->id; } -void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, - char *buf, size_t buflen); +void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; @@ -718,9 +717,9 @@ static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_init_kthreadd(void) {} static inline void cgroup_kthread_ready(void) {} -static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) +static inline union u64 cgroup_get_kernfs_id(struct cgroup *cgrp) { - return NULL; + return 0; } static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) @@ -739,8 +738,8 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, return true; } -static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, - char *buf, size_t buflen) {} 
+static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) +{} #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUPS diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index f797ccc650e7..b2fc5c8ef6d9 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -104,21 +104,6 @@ struct kernfs_elem_attr { struct kernfs_node *notify_next; /* for kernfs_notify() */ }; -/* represent a kernfs node */ -union kernfs_node_id { - struct { - /* - * blktrace will export this struct as a simplified 'struct - * fid' (which is a big data struction), so userspace can use - * it to find kernfs node. The layout must match the first two - * fields of 'struct fid' exactly. - */ - u32 ino; - u32 generation; - }; - u64 id; -}; - /* * kernfs_node - the building block of kernfs hierarchy. Each and every * kernfs node is represented by single kernfs_node. Most fields are @@ -155,7 +140,12 @@ struct kernfs_node { void *priv; - union kernfs_node_id id; + /* + * 64bit unique ID. Lower 32bits carry the inode number and lower + * generation. + */ + u64 id; + unsigned short flags; umode_t mode; struct kernfs_iattrs *iattr; @@ -292,6 +282,26 @@ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) return kn->flags & KERNFS_TYPE_MASK; } +static inline ino_t kernfs_id_ino(u64 id) +{ + return (u32)id; +} + +static inline u32 kernfs_id_gen(u64 id) +{ + return id >> 32; +} + +static inline ino_t kernfs_ino(struct kernfs_node *kn) +{ + return kernfs_id_ino(kn->id); +} + +static inline ino_t kernfs_gen(struct kernfs_node *kn) +{ + return kernfs_id_gen(kn->id); +} + /** * kernfs_enable_ns - enable namespace under a directory * @kn: directory of interest, should be empty @@ -383,8 +393,7 @@ void kernfs_kill_sb(struct super_block *sb); void kernfs_init(void); -struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, - const union kernfs_node_id *id); +struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, u64 id); #else /* CONFIG_KERNFS */ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 95e50677476b..b4f0ffe1817e 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -152,7 +152,7 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { - return wb->memcg_css->cgroup->kn->id.ino; + return cgroup_ino(wb->memcg_css->cgroup); } static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) @@ -260,7 +260,7 @@ TRACE_EVENT(track_foreign_dirty, __entry->ino = inode ? 
inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); - __entry->page_cgroup_ino = page->mem_cgroup->css.cgroup->kn->id.ino; + __entry->page_cgroup_ino = cgroup_ino(page->mem_cgroup->css.cgroup); ), TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5e28718928ca..912e761cd17a 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -317,7 +317,7 @@ BPF_CALL_0(bpf_get_current_cgroup_id) { struct cgroup *cgrp = task_dfl_cgroup(current); - return cgrp->kn->id.id; + return cgrp->kn->id; } const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index addd6fdceec8..5d867f6d7204 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -569,7 +569,7 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, return; storage->key.attach_type = type; - storage->key.cgroup_inode_id = cgroup->kn->id.id; + storage->key.cgroup_inode_id = cgroup->kn->id; map = storage->map; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index cf32c0c7a45d..c6bd1a5a1977 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5786,8 +5786,7 @@ static int __init cgroup_wq_init(void) } core_initcall(cgroup_wq_init); -void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, - char *buf, size_t buflen) +void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) { struct kernfs_node *kn; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2d6e93ab0478..a986d2e74ca2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -64,8 +64,7 @@ static void blk_unregister_tracepoints(void); * Send out a notify message. */ static void trace_note(struct blk_trace *bt, pid_t pid, int action, - const void *data, size_t len, - union kernfs_node_id *cgid) + const void *data, size_t len, u64 cgid) { struct blk_io_trace *t; struct ring_buffer_event *event = NULL; @@ -73,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, int pc = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; - ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; + ssize_t cgid_len = cgid ? sizeof(cgid) : 0; if (blk_tracer) { buffer = blk_tr->trace_buffer.buffer; @@ -100,8 +99,8 @@ record_it: t->pid = pid; t->cpu = cpu; t->pdu_len = len + cgid_len; - if (cgid) - memcpy((void *)t + sizeof(*t), cgid, cgid_len); + if (cgid_len) + memcpy((void *)t + sizeof(*t), &cgid, cgid_len); memcpy((void *) t + sizeof(*t) + cgid_len, data, len); if (blk_tracer) @@ -122,7 +121,7 @@ static void trace_note_tsk(struct task_struct *tsk) spin_lock_irqsave(&running_trace_lock, flags); list_for_each_entry(bt, &running_trace_list, running_list) { trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, - sizeof(tsk->comm), NULL); + sizeof(tsk->comm), 0); } spin_unlock_irqrestore(&running_trace_lock, flags); } @@ -139,7 +138,7 @@ static void trace_note_time(struct blk_trace *bt) words[1] = now.tv_nsec; local_irq_save(flags); - trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), NULL); + trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), 0); local_irq_restore(flags); } @@ -172,9 +171,9 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, blkcg = NULL; #ifdef CONFIG_BLK_CGROUP trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, - blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : NULL); + blkcg ? 
cgroup_get_kernfs_id(blkcg->css.cgroup) : 0); #else - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL); + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, 0); #endif local_irq_restore(flags); } @@ -212,7 +211,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, int op, int op_flags, u32 what, int error, int pdu_len, - void *pdu_data, union kernfs_node_id *cgid) + void *pdu_data, u64 cgid) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; @@ -223,7 +222,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, pid_t pid; int cpu, pc = 0; bool blk_tracer = blk_tracer_enabled; - ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; + ssize_t cgid_len = cgid ? sizeof(cgid) : 0; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; @@ -294,7 +293,7 @@ record_it: t->pdu_len = pdu_len + cgid_len; if (cgid_len) - memcpy((void *)t + sizeof(*t), cgid, cgid_len); + memcpy((void *)t + sizeof(*t), &cgid, cgid_len); if (pdu_len) memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); @@ -751,31 +750,29 @@ void blk_trace_shutdown(struct request_queue *q) } #ifdef CONFIG_BLK_CGROUP -static union kernfs_node_id * -blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { struct blk_trace *bt = q->blk_trace; if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) - return NULL; + return 0; if (!bio->bi_blkg) - return NULL; + return 0; return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup); } #else -static union kernfs_node_id * -blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { - return NULL; + return 0; } #endif -static union kernfs_node_id * +static u64 blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) { if (!rq->bio) - return NULL; + return 0; /* Use the first bio */ return blk_trace_bio_get_cgid(q, rq->bio); } @@ -797,8 +794,7 @@ blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) * **/ static void blk_add_trace_rq(struct request *rq, int error, - unsigned int nr_bytes, u32 what, - union kernfs_node_id *cgid) + unsigned int nr_bytes, u32 what, u64 cgid) { struct blk_trace *bt = rq->q->blk_trace; @@ -913,7 +909,7 @@ static void blk_add_trace_getrq(void *ignore, if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, - NULL, NULL); + NULL, 0); } } @@ -929,7 +925,7 @@ static void blk_add_trace_sleeprq(void *ignore, if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, - 0, 0, NULL, NULL); + 0, 0, NULL, 0); } } @@ -938,7 +934,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL); + __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, @@ -955,7 +951,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, else what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL); + __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); } } @@ -1172,19 +1168,17 @@ const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) { - return (void *)(te_blk_io_trace(ent) + 1) + -
(has_cg ? sizeof(union kernfs_node_id) : 0); + return (void *)(te_blk_io_trace(ent) + 1) + (has_cg ? sizeof(u64) : 0); } -static inline const void *cgid_start(const struct trace_entry *ent) +static inline u64 t_cgid(const struct trace_entry *ent) { - return (void *)(te_blk_io_trace(ent) + 1); + return *(u64 *)(te_blk_io_trace(ent) + 1); } static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg) { - return te_blk_io_trace(ent)->pdu_len - - (has_cg ? sizeof(union kernfs_node_id) : 0); + return te_blk_io_trace(ent)->pdu_len - (has_cg ? sizeof(u64) : 0); } static inline u32 t_action(const struct trace_entry *ent) @@ -1257,7 +1251,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, fill_rwbs(rwbs, t); if (has_cg) { - const union kernfs_node_id *id = cgid_start(iter->ent); + u64 id = t_cgid(iter->ent); if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) { char blkcg_name_buf[NAME_MAX + 1] = "<...>"; @@ -1269,9 +1263,10 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, blkcg_name_buf, act, rwbs); } else trace_seq_printf(&iter->seq, - "%3d,%-3d %x,%-x %2s %3s ", + "%3d,%-3d %lx,%-x %2s %3s ", MAJOR(t->device), MINOR(t->device), - id->ino, id->generation, act, rwbs); + kernfs_id_ino(id), kernfs_id_gen(id), + act, rwbs); } else trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", MAJOR(t->device), MINOR(t->device), act, rwbs); diff --git a/net/core/filter.c b/net/core/filter.c index ed6563622ce3..b360a7beb6fc 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4089,7 +4089,7 @@ BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return cgrp->kn->id.id; + return cgrp->kn->id; } static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { @@ -4114,7 +4114,7 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, if (!ancestor) return 0; - return ancestor->kn->id.id; + return ancestor->kn->id; } static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { -- cgit v1.2.3 From 40430452fd5da1509177ac597b394614cd3a121f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:30 -0800 Subject: kernfs: use 64bit inos if ino_t is 64bit Each kernfs_node is identified with a 64bit ID. The low 32bit is exposed as ino and the high gen. While this already allows using inos as keys by looking up with wildcard generation number of 0, it's adding unnecessary complications for 64bit ino archs which can directly use kernfs_node IDs as inos to uniquely identify each cgroup instance. This patch exposes IDs directly as inos on 64bit ino archs. The conversion is mostly straight-forward. * 32bit ino archs behave the same as before. 64bit ino archs now use the whole 64bit ID as ino and the generation number is fixed at 1. * 64bit inos still use the same idr allocator which guarantees that the lower 32bits identify the current live instance uniquely and the high 32bits are incremented whenever the low bits wrap. As the upper 32bits are no longer used as gen and we don't want to start ino allocation with 33rd bit set, the initial value for highbits allocation is changed to 0 on 64bit ino archs. * blktrace exposes two 32bit numbers - (INO,GEN) pair - to identify the issuing cgroup. Userland builds FILEID_INO32_GEN fids from these numbers to look up the cgroups. To remain compatible with the behavior, always output (LOW32,HIGH32) which will be constructed back to the original 64bit ID by __kernfs_fh_to_dentry().
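To make the layout concrete, here is an illustrative sketch of the decode/encode round trip (not part of the patch; the helper names are made up for the example):

    /* 32bit ino archs: low 32 bits are the ino, high 32 bits the gen.
     * 64bit ino archs: the whole 64bit ID is the ino, gen is fixed at 1. */
    static inline u32 id_low32(u64 id)  { return (u32)id; }
    static inline u32 id_high32(u64 id) { return (u32)(id >> 32); }

    /* blktrace emits "LOW32,HIGH32"; userland rebuilds the ID the same
     * way __kernfs_fh_to_dentry() does from a FILEID_INO32_GEN fid: */
    static inline u64 rebuild_id(u32 low, u32 high)
    {
            return ((u64)high << 32) | low;
    }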
Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman Cc: Namhyung Kim --- fs/kernfs/dir.c | 42 +++++++++++++++++++++++++++++------------- fs/kernfs/mount.c | 7 ++++--- include/linux/kernfs.h | 20 ++++++++++++++------ kernel/trace/blktrace.c | 21 +++++++++++++++++---- 4 files changed, 64 insertions(+), 26 deletions(-) (limited to 'kernel/trace') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 5dcf19d4adbc..b2d9f79c4a7c 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -532,7 +532,7 @@ void kernfs_put(struct kernfs_node *kn) kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } spin_lock(&kernfs_idr_lock); - idr_remove(&root->ino_idr, kernfs_ino(kn)); + idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); spin_unlock(&kernfs_idr_lock); kmem_cache_free(kernfs_node_cache, kn); @@ -617,7 +617,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, unsigned flags) { struct kernfs_node *kn; - u32 gen; + u32 id_highbits; int ret; name = kstrdup_const(name, GFP_KERNEL); @@ -631,16 +631,16 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, idr_preload(GFP_KERNEL); spin_lock(&kernfs_idr_lock); ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); - if (ret >= 0 && ret < root->last_ino) - root->next_generation++; - gen = root->next_generation; - root->last_ino = ret; + if (ret >= 0 && ret < root->last_id_lowbits) + root->id_highbits++; + id_highbits = root->id_highbits; + root->last_id_lowbits = ret; spin_unlock(&kernfs_idr_lock); idr_preload_end(); if (ret < 0) goto err_out2; - kn->id = (u64)gen << 32 | ret; + kn->id = (u64)id_highbits << 32 | ret; atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); @@ -671,7 +671,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, return kn; err_out3: - idr_remove(&root->ino_idr, kernfs_ino(kn)); + idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: @@ -715,13 +715,19 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, spin_lock(&kernfs_idr_lock); - kn = idr_find(&root->ino_idr, ino); + kn = idr_find(&root->ino_idr, (u32)ino); if (!kn) goto err_unlock; - /* 0 matches all generations */ - if (unlikely(gen && kernfs_gen(kn) != gen)) - goto err_unlock; + if (sizeof(ino_t) >= sizeof(u64)) { + /* we looked up with the low 32bits, compare the whole */ + if (kernfs_ino(kn) != ino) + goto err_unlock; + } else { + /* 0 matches all generations */ + if (unlikely(gen && kernfs_gen(kn) != gen)) + goto err_unlock; + } /* * ACTIVATED is protected with kernfs_mutex but it was clear when @@ -949,7 +955,17 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, idr_init(&root->ino_idr); INIT_LIST_HEAD(&root->supers); - root->next_generation = 1; + + /* + * On 64bit ino setups, id is ino. On 32bit, low 32bits are ino. + * High bits generation. The starting value for both ino and + * generation is 1. Initialize upper 32bit allocation + * accordingly.
+ */ + if (sizeof(ino_t) >= sizeof(u64)) + root->id_highbits = 0; + else + root->id_highbits = 1; kn = __kernfs_new_node(root, NULL, "", S_IFDIR | S_IRUGO | S_IXUGO, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 37a1e5df117a..4d31503abaee 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -87,9 +87,10 @@ static struct dentry *__kernfs_fh_to_dentry(struct super_block *sb, case FILEID_INO32_GEN: case FILEID_INO32_GEN_PARENT: /* - * blk_log_action() exposes (ino,gen) pair without type and - * userland can call us with generic fid constructed from - * them. Combine it back to ID. See blk_log_action(). + * blk_log_action() exposes "LOW32,HIGH32" pair without + * type and userland can call us with generic fid + * constructed from them. Combine it back to ID. See + * blk_log_action(). */ id = ((u64)fid->i32.gen << 32) | fid->i32.ino; break; diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 38267cc9420c..dded2e5a9f42 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -141,8 +141,8 @@ struct kernfs_node { void *priv; /* - * 64bit unique ID. Lower 32bits carry the inode number and lower - * generation. + * 64bit unique ID. On 64bit ino setups, id is the ino. On 32bit, + * the low 32bits are ino and upper generation. */ u64 id; @@ -177,8 +177,8 @@ struct kernfs_root { /* private fields, do not use outside kernfs proper */ struct idr ino_idr; - u32 last_ino; - u32 next_generation; + u32 last_id_lowbits; + u32 id_highbits; struct kernfs_syscall_ops *syscall_ops; /* list of kernfs_super_info of this root, protected by kernfs_mutex */ @@ -284,12 +284,20 @@ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) static inline ino_t kernfs_id_ino(u64 id) { - return (u32)id; + /* id is ino if ino_t is 64bit; otherwise, low 32bits */ + if (sizeof(ino_t) >= sizeof(u64)) + return id; + else + return (u32)id; } static inline u32 kernfs_id_gen(u64 id) { - return id >> 32; + /* gen is fixed at 1 if ino_t is 64bit; otherwise, high 32bits */ + if (sizeof(ino_t) >= sizeof(u64)) + return 1; + else + return id >> 32; } static inline ino_t kernfs_ino(struct kernfs_node *kn) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index a986d2e74ca2..a7dac5b63f3f 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1261,12 +1261,25 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ", MAJOR(t->device), MINOR(t->device), blkcg_name_buf, act, rwbs); - } else + } else { + /* + * The cgid portion used to be "INO,GEN". Userland + * builds a FILEID_INO32_GEN fid out of them and + * opens the cgroup using open_by_handle_at(2). + * While 32bit ino setups are still the same, 64bit + * ones now use the 64bit ino as the whole ID and + * no longer use generation. + * + * Regardless of the content, always output + * "LOW32,HIGH32" so that FILEID_INO32_GEN fid can + * be mapped back to @id on both 64 and 32bit ino + * setups. See __kernfs_fh_to_dentry().
+ */ trace_seq_printf(&iter->seq, - "%3d,%-3d %lx,%-x %2s %3s ", + "%3d,%-3d %llx,%-llx %2s %3s ", MAJOR(t->device), MINOR(t->device), - kernfs_id_ino(id), kernfs_id_gen(id), - act, rwbs); + id & U32_MAX, id >> 32, act, rwbs); + } } else trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", MAJOR(t->device), MINOR(t->device), act, rwbs); -- cgit v1.2.3 From 743210386c0354a2f8ef3d697353c7d8477fa81d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:30 -0800 Subject: cgroup: use cgrp->kn->id as the cgroup ID cgroup ID is currently allocated using a dedicated per-hierarchy idr and used internally and exposed through tracepoints and bpf. This is confusing because there are tracepoints and other interfaces which use the cgroupfs ino as IDs. The preceding changes made kn->id exposed as ino as 64bit ino on supported archs or ino+gen (low 32bits as ino, high gen). There's no reason for cgroup to use different IDs. The kernfs IDs are unique and userland can easily discover them and map them back to paths using standard file operations. This patch replaces cgroup IDs with kernfs IDs. * cgroup_id() is added and all cgroup ID users are converted to use it. * kernfs_node creation is moved to earlier during cgroup init so that cgroup_id() is available during init. * While at it, s/cgroup/cgrp/ in psi helpers for consistency. * Fallback ID value is changed to 1 to be consistent with root cgroup ID. Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman Cc: Namhyung Kim --- include/linux/cgroup-defs.h | 17 ++-------- include/linux/cgroup.h | 17 ++++------ include/trace/events/cgroup.h | 6 ++-- kernel/bpf/helpers.c | 2 +- kernel/bpf/local_storage.c | 2 +- kernel/cgroup/cgroup.c | 76 ++++++++++++++----------------------------- kernel/trace/blktrace.c | 4 +-- net/core/filter.c | 4 +-- 8 files changed, 43 insertions(+), 85 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4904b1ebd1ff..63097cb243cb 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -354,16 +354,6 @@ struct cgroup { unsigned long flags; /* "unsigned long" so bitops work */ - /* - * idr allocated in-hierarchy ID. - * - * ID 0 is not used, the ID of the root cgroup is always 1, and a - * new cgroup will be assigned with a smallest available ID. - * - * Allocating/Removing ID must be protected by cgroup_mutex. - */ - int id; - /* * The depth this cgroup is at. The root is at depth zero and each * step down the hierarchy increments the level. This along with @@ -488,7 +478,7 @@ struct cgroup { struct cgroup_freezer_state freezer; /* ids of the ancestors at each level including self */ - int ancestor_ids[]; + u64 ancestor_ids[]; }; /* @@ -509,7 +499,7 @@ struct cgroup_root { struct cgroup cgrp; /* for cgrp->ancestor_ids[0] */ - int cgrp_ancestor_id_storage; + u64 cgrp_ancestor_id_storage; /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; @@ -520,9 +510,6 @@ struct cgroup_root { /* Hierarchy-specific flags */ unsigned int flags; - /* IDs for cgroups in this hierarchy */ - struct idr cgroup_idr; - /* The path to use for release notifications. */ char release_agent_path[PATH_MAX]; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 815fff49d555..d7ddebd0cdec 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -304,6 +304,11 @@ void css_task_iter_end(struct css_task_iter *it); * Inline functions. 
*/ +static inline u64 cgroup_id(struct cgroup *cgrp) +{ + return cgrp->kn->id; +} + /** * css_get - obtain a reference on the specified css * @css: target css @@ -565,7 +570,7 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp, { if (cgrp->root != ancestor->root || cgrp->level < ancestor->level) return false; - return cgrp->ancestor_ids[ancestor->level] == ancestor->id; + return cgrp->ancestor_ids[ancestor->level] == cgroup_id(ancestor); } /** @@ -687,17 +692,13 @@ static inline void cgroup_kthread_ready(void) current->no_cgroup_migration = 0; } -static inline u64 cgroup_get_kernfs_id(struct cgroup *cgrp) -{ - return cgrp->kn->id; -} - void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; struct cgroup; +static inline u64 cgroup_id(struct cgroup *cgrp) { return 1; } static inline void css_get(struct cgroup_subsys_state *css) {} static inline void css_put(struct cgroup_subsys_state *css) {} static inline int cgroup_attach_task_all(struct task_struct *from, @@ -717,10 +718,6 @@ static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_init_kthreadd(void) {} static inline void cgroup_kthread_ready(void) {} -static inline u64 cgroup_get_kernfs_id(struct cgroup *cgrp) -{ - return 0; -} static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) { diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h index a566cc521476..7f42a3de59e6 100644 --- a/include/trace/events/cgroup.h +++ b/include/trace/events/cgroup.h @@ -66,7 +66,7 @@ DECLARE_EVENT_CLASS(cgroup, TP_fast_assign( __entry->root = cgrp->root->hierarchy_id; - __entry->id = cgrp->id; + __entry->id = cgroup_id(cgrp); __entry->level = cgrp->level; __assign_str(path, path); ), @@ -135,7 +135,7 @@ DECLARE_EVENT_CLASS(cgroup_migrate, TP_fast_assign( __entry->dst_root = dst_cgrp->root->hierarchy_id; - __entry->dst_id = dst_cgrp->id; + __entry->dst_id = cgroup_id(dst_cgrp); __entry->dst_level = dst_cgrp->level; __assign_str(dst_path, path); __entry->pid = task->pid; @@ -179,7 +179,7 @@ DECLARE_EVENT_CLASS(cgroup_event, TP_fast_assign( __entry->root = cgrp->root->hierarchy_id; - __entry->id = cgrp->id; + __entry->id = cgroup_id(cgrp); __entry->level = cgrp->level; __assign_str(path, path); __entry->val = val; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 912e761cd17a..cada974c9f4e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -317,7 +317,7 @@ BPF_CALL_0(bpf_get_current_cgroup_id) { struct cgroup *cgrp = task_dfl_cgroup(current); - return cgrp->kn->id; + return cgroup_id(cgrp); } const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 5d867f6d7204..2ba750725cb2 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -569,7 +569,7 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, return; storage->key.attach_type = type; - storage->key.cgroup_inode_id = cgroup->kn->id; + storage->key.cgroup_inode_id = cgroup_id(cgroup); map = storage->map; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b5dcbee5aa6c..c12dcf7dc432 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1308,10 +1308,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) void cgroup_free_root(struct cgroup_root *root) { - if (root) { - idr_destroy(&root->cgroup_idr); - kfree(root); - } + kfree(root); } static
void cgroup_destroy_root(struct cgroup_root *root) @@ -1917,7 +1914,6 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); - idr_init(&root->cgroup_idr); root->flags = ctx->flags; if (ctx->release_agent) @@ -1938,12 +1934,6 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) lockdep_assert_held(&cgroup_mutex); - ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); - if (ret < 0) - goto out; - root_cgrp->id = ret; - root_cgrp->ancestor_ids[0] = ret; - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) @@ -1976,6 +1966,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) goto exit_root_id; } root_cgrp->kn = root->kf_root->kn; + WARN_ON_ONCE(cgroup_id(root_cgrp) != 1); + root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp); ret = css_populate_dir(&root_cgrp->self); if (ret) @@ -3552,22 +3544,22 @@ static int cpu_stat_show(struct seq_file *seq, void *v) #ifdef CONFIG_PSI static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { - struct cgroup *cgroup = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { - struct cgroup *cgroup = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { - struct cgroup *cgroup = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_CPU); } @@ -4987,9 +4979,6 @@ static void css_release_work_fn(struct work_struct *work) tcgrp->nr_dying_descendants--; spin_unlock_irq(&css_set_lock); - cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - cgrp->id = -1; - /* * There are two control paths which try to determine * cgroup from dentry without going through kernfs - @@ -5154,10 +5143,12 @@ err_free_css: * it isn't associated with its kernfs_node and doesn't have the control * mask applied. */ -static struct cgroup *cgroup_create(struct cgroup *parent) +static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, + umode_t mode) { struct cgroup_root *root = parent->root; struct cgroup *cgrp, *tcgrp; + struct kernfs_node *kn; int level = parent->level + 1; int ret; @@ -5177,15 +5168,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent) goto out_cancel_ref; } - /* - * Temporarily set the pointer to NULL, so idr_find() won't return - * a half-baked cgroup. 
- */ - cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); - if (cgrp->id < 0) { - ret = -ENOMEM; + /* create the directory */ + kn = kernfs_create_dir(parent->kn, name, mode, cgrp); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); goto out_stat_exit; } + cgrp->kn = kn; init_cgroup_housekeeping(cgrp); @@ -5195,7 +5184,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) ret = psi_cgroup_alloc(cgrp); if (ret) - goto out_idr_free; + goto out_kernfs_remove; ret = cgroup_bpf_inherit(cgrp); if (ret) @@ -5219,7 +5208,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { - cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; + cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp); if (tcgrp != cgrp) { tcgrp->nr_descendants++; @@ -5248,12 +5237,6 @@ static struct cgroup *cgroup_create(struct cgroup *parent) atomic_inc(&root->nr_cgrps); cgroup_get_live(parent); - /* - * @cgrp is now fully operational. If something fails after this - * point, it'll be released via the normal destruction path. - */ - cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - /* * On the default hierarchy, a child doesn't automatically inherit * subtree_control from the parent. Each is configured manually. @@ -5267,8 +5250,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent) out_psi_free: psi_cgroup_free(cgrp); -out_idr_free: - cgroup_idr_remove(&root->cgroup_idr, cgrp->id); +out_kernfs_remove: + kernfs_remove(cgrp->kn); out_stat_exit: if (cgroup_on_dfl(parent)) cgroup_rstat_exit(cgrp); @@ -5305,7 +5288,6 @@ fail: int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent, *cgrp; - struct kernfs_node *kn; int ret; /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ @@ -5321,27 +5303,19 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) goto out_unlock; } - cgrp = cgroup_create(parent); + cgrp = cgroup_create(parent, name, mode); if (IS_ERR(cgrp)) { ret = PTR_ERR(cgrp); goto out_unlock; } - /* create the directory */ - kn = kernfs_create_dir(parent->kn, name, mode, cgrp); - if (IS_ERR(kn)) { - ret = PTR_ERR(kn); - goto out_destroy; - } - cgrp->kn = kn; - /* * This extra ref will be put in cgroup_free_fn() and guarantees * that @cgrp->kn is always accessible. */ - kernfs_get(kn); + kernfs_get(cgrp->kn); - ret = cgroup_kn_set_ugid(kn); + ret = cgroup_kn_set_ugid(cgrp->kn); if (ret) goto out_destroy; @@ -5356,7 +5330,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) TRACE_CGROUP_PATH(mkdir, cgrp); /* let's create and online css's */ - kernfs_activate(kn); + kernfs_activate(cgrp->kn); ret = 0; goto out_unlock; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index a7dac5b63f3f..475e29498bca 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -171,7 +171,7 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, blkcg = NULL; #ifdef CONFIG_BLK_CGROUP trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, - blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : 0); + blkcg ?
cgroup_id(blkcg->css.cgroup) : 1); #else trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, 0); #endif @@ -759,7 +759,7 @@ static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) if (!bio->bi_blkg) return 0; - return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup); + return cgroup_id(bio_blkcg(bio)->css.cgroup); } #else static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) diff --git a/net/core/filter.c b/net/core/filter.c index b360a7beb6fc..caef7c74cad5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4089,7 +4089,7 @@ BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return cgrp->kn->id; + return cgroup_id(cgrp); } static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { @@ -4114,7 +4114,7 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, if (!ancestor) return 0; - return ancestor->kn->id; + return cgroup_id(ancestor); } static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { -- cgit v1.2.3 From 763e34e74bb7d5c316015e2e39fcc8520bfd071c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 13:07:06 -0500 Subject: ftrace: Add register_ftrace_direct() Add the start of the functionality to allow other trampolines to use the ftrace mcount/fentry/nop location. This adds two new functions: register_ftrace_direct() and unregister_ftrace_direct() Both take two parameters: the first is the instruction address of where the mcount/fentry/nop exists, and the second is the trampoline to have that location called. This will handle cases where ftrace is already used on that same location, and will make it still work, where the registered direct-call trampoline will get called after all the registered ftrace callers are handled. Currently, it will not allow for IP_MODIFY functions to be called at the same locations, which include some kprobes and live kernel patching. At this point, no architecture supports this. This is only the start of implementing the framework. Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 36 ++++++- kernel/trace/Kconfig | 8 ++ kernel/trace/ftrace.c | 269 ++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 306 insertions(+), 7 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 8385cafe4f9f..efe3e521aff4 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -144,6 +144,8 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); * TRACE_ARRAY - The ops->private points to a trace_array descriptor. * PERMANENT - Set when the ops is permanent and should not be affected by * ftrace_enabled.
+ * DIRECT - Used by the direct ftrace_ops helper for direct functions + * (internal ftrace only, should not be used by others) */ enum { FTRACE_OPS_FL_ENABLED = 1 << 0, @@ -163,6 +165,7 @@ enum { FTRACE_OPS_FL_RCU = 1 << 14, FTRACE_OPS_FL_TRACE_ARRAY = 1 << 15, FTRACE_OPS_FL_PERMANENT = 1 << 16, + FTRACE_OPS_FL_DIRECT = 1 << 17, }; #ifdef CONFIG_DYNAMIC_FTRACE @@ -242,6 +245,32 @@ static inline void ftrace_free_init_mem(void) { } static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { } #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +int register_ftrace_direct(unsigned long ip, unsigned long addr); +int unregister_ftrace_direct(unsigned long ip, unsigned long addr); +#else +static inline int register_ftrace_direct(unsigned long ip, unsigned long addr) +{ + return -ENODEV; +} +static inline int unregister_ftrace_direct(unsigned long ip, unsigned long addr) +{ + return -ENODEV; +} +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + +#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/* + * This must be implemented by the architecture. + * It is the way the ftrace direct_ops helper, when called + * via ftrace (because there's other callbacks besides the + * direct call), can inform the architecture's trampoline that this + * routine has a direct caller, and what the caller is. + */ +static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, + unsigned long addr) { } +#endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + #ifdef CONFIG_STACK_TRACER extern int stack_tracer_enabled; @@ -333,6 +362,7 @@ bool is_ftrace_trampoline(unsigned long addr); * REGS_EN - the function is set up to save regs. * IPMODIFY - the record allows for the IP address to be changed. * DISABLED - the record is not ready to be touched yet + * DIRECT - there is a direct function to call * * When a new ftrace_ops is registered and wants a function to save * pt_regs, the rec->flag REGS is set. 
When the function has been @@ -348,10 +378,12 @@ enum { FTRACE_FL_TRAMP_EN = (1UL << 27), FTRACE_FL_IPMODIFY = (1UL << 26), FTRACE_FL_DISABLED = (1UL << 25), + FTRACE_FL_DIRECT = (1UL << 24), + FTRACE_FL_DIRECT_EN = (1UL << 23), }; -#define FTRACE_REF_MAX_SHIFT 25 -#define FTRACE_FL_BITS 7 +#define FTRACE_REF_MAX_SHIFT 23 +#define FTRACE_FL_BITS 9 #define FTRACE_FL_MASKED_BITS ((1UL << FTRACE_FL_BITS) - 1) #define FTRACE_FL_MASK (FTRACE_FL_MASKED_BITS << FTRACE_REF_MAX_SHIFT) #define FTRACE_REF_MAX ((1UL << FTRACE_REF_MAX_SHIFT) - 1) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e08527f50d2a..624a05e99b0b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -33,6 +33,9 @@ config HAVE_DYNAMIC_FTRACE config HAVE_DYNAMIC_FTRACE_WITH_REGS bool +config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + bool + config HAVE_FTRACE_MCOUNT_RECORD bool help @@ -557,6 +560,11 @@ config DYNAMIC_FTRACE_WITH_REGS depends on DYNAMIC_FTRACE depends on HAVE_DYNAMIC_FTRACE_WITH_REGS +config DYNAMIC_FTRACE_WITH_DIRECT_CALLS + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + config FUNCTION_PROFILER bool "Kernel function profiler" depends on FUNCTION_TRACER diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b0e7f03919de..329a3f3789a1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1023,6 +1023,7 @@ static bool update_all_ops; struct ftrace_func_entry { struct hlist_node hlist; unsigned long ip; + unsigned long direct; /* for direct lookup only */ }; struct ftrace_func_probe { @@ -1730,6 +1731,9 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX)) return false; + if (ops->flags & FTRACE_OPS_FL_DIRECT) + rec->flags |= FTRACE_FL_DIRECT; + /* * If there's only a single callback registered to a * function, and the ops has a trampoline registered @@ -1757,6 +1761,15 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, return false; rec->flags--; + /* + * Only the internal direct_ops should have the + * DIRECT flag set. Thus, if it is removing a + * function, then that function should no longer + * be direct. + */ + if (ops->flags & FTRACE_OPS_FL_DIRECT) + rec->flags &= ~FTRACE_FL_DIRECT; + /* * If the rec had REGS enabled and the ops that is * being removed had REGS set, then see if there is @@ -2092,15 +2105,34 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) * If enabling and the REGS flag does not match the REGS_EN, or * the TRAMP flag doesn't match the TRAMP_EN, then do not ignore * this record. Set flags to fail the compare against ENABLED. + * Same for direct calls. */ if (flag) { - if (!(rec->flags & FTRACE_FL_REGS) != + if (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN)) flag |= FTRACE_FL_REGS; - if (!(rec->flags & FTRACE_FL_TRAMP) != + if (!(rec->flags & FTRACE_FL_TRAMP) != !(rec->flags & FTRACE_FL_TRAMP_EN)) flag |= FTRACE_FL_TRAMP; + + /* + * Direct calls are special, as count matters. + * We must test the record for direct, if the + * DIRECT and DIRECT_EN do not match, but only + * if the count is 1. That's because, if the + * count is something other than one, we do not + * want the direct enabled (it will be done via the + * direct helper). But if DIRECT_EN is set, and + * the count is not one, we need to clear it. 
+ */ + if (ftrace_rec_count(rec) == 1) { + if (!(rec->flags & FTRACE_FL_DIRECT) != + !(rec->flags & FTRACE_FL_DIRECT_EN)) + flag |= FTRACE_FL_DIRECT; + } else if (rec->flags & FTRACE_FL_DIRECT_EN) { + flag |= FTRACE_FL_DIRECT; + } } /* If the state of this record hasn't changed, then do nothing */ @@ -2125,6 +2157,25 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) else rec->flags &= ~FTRACE_FL_TRAMP_EN; } + if (flag & FTRACE_FL_DIRECT) { + /* + * If there's only one user (direct_ops helper) + * then we can call the direct function + * directly (no ftrace trampoline). + */ + if (ftrace_rec_count(rec) == 1) { + if (rec->flags & FTRACE_FL_DIRECT) + rec->flags |= FTRACE_FL_DIRECT_EN; + else + rec->flags &= ~FTRACE_FL_DIRECT_EN; + } else { + /* + * Can only call directly if there's + * only one callback to the function. + */ + rec->flags &= ~FTRACE_FL_DIRECT_EN; + } + } } /* @@ -2154,7 +2205,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) * and REGS states. The _EN flags must be disabled though. */ rec->flags &= ~(FTRACE_FL_ENABLED | FTRACE_FL_TRAMP_EN | - FTRACE_FL_REGS_EN); + FTRACE_FL_REGS_EN | FTRACE_FL_DIRECT_EN); } ftrace_bug_type = FTRACE_BUG_NOP; @@ -2309,6 +2360,51 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) return NULL; } +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/* Protected by rcu_tasks for reading, and direct_mutex for writing */ +static struct ftrace_hash *direct_functions = EMPTY_HASH; +static DEFINE_MUTEX(direct_mutex); + +/* + * Search the direct_functions hash to see if the given instruction pointer + * has a direct caller attached to it. + */ +static unsigned long find_rec_direct(unsigned long ip) +{ + struct ftrace_func_entry *entry; + + entry = __ftrace_lookup_ip(direct_functions, ip); + if (!entry) + return 0; + + return entry->direct; +} + +static void call_direct_funcs(unsigned long ip, unsigned long pip, + struct ftrace_ops *ops, struct pt_regs *regs) +{ + unsigned long addr; + + addr = find_rec_direct(ip); + if (!addr) + return; + + arch_ftrace_set_direct_caller(regs, addr); +} + +struct ftrace_ops direct_ops = { + .func = call_direct_funcs, + .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE + | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS + | FTRACE_OPS_FL_PERMANENT, +}; +#else +static inline unsigned long find_rec_direct(unsigned long ip) +{ + return 0; +} +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + /** * ftrace_get_addr_new - Get the call address to set to * @rec: The ftrace record descriptor @@ -2322,6 +2418,15 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) { struct ftrace_ops *ops; + unsigned long addr; + + if ((rec->flags & FTRACE_FL_DIRECT) && + (ftrace_rec_count(rec) == 1)) { + addr = find_rec_direct(rec->ip); + if (addr) + return addr; + WARN_ON_ONCE(1); + } /* Trampolines take precedence over regs */ if (rec->flags & FTRACE_FL_TRAMP) { @@ -2354,6 +2459,15 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) { struct ftrace_ops *ops; + unsigned long addr; + + /* Direct calls take precedence over trampolines */ + if (rec->flags & FTRACE_FL_DIRECT_EN) { + addr = find_rec_direct(rec->ip); + if (addr) + return addr; + WARN_ON_ONCE(1); + } /* Trampolines take precedence over regs */ if (rec->flags & FTRACE_FL_TRAMP_EN) { @@ -3465,10 +3579,11 @@ static int t_show(struct seq_file *m, void *v) if (iter->flags & 
FTRACE_ITER_ENABLED) { struct ftrace_ops *ops; - seq_printf(m, " (%ld)%s%s", + seq_printf(m, " (%ld)%s%s%s", ftrace_rec_count(rec), rec->flags & FTRACE_FL_REGS ? " R" : " ", - rec->flags & FTRACE_FL_IPMODIFY ? " I" : " "); + rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ", + rec->flags & FTRACE_FL_DIRECT ? " D" : " "); if (rec->flags & FTRACE_FL_TRAMP_EN) { ops = ftrace_find_tramp_ops_any(rec); if (ops) { @@ -3484,6 +3599,13 @@ static int t_show(struct seq_file *m, void *v) } else { add_trampoline_func(m, NULL, rec); } + if (rec->flags & FTRACE_FL_DIRECT) { + unsigned long direct; + + direct = find_rec_direct(rec->ip); + if (direct) + seq_printf(m, "\n\tdirect-->%pS", (void *)direct); + } } seq_putc(m, '\n'); @@ -4815,6 +4937,143 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable); } +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/** + * register_ftrace_direct - Call a custom trampoline directly + * @ip: The address of the nop at the beginning of a function + * @addr: The address of the trampoline to call at @ip + * + * This is used to connect a direct call from the nop location (@ip) + * at the start of ftrace traced functions. The location that it calls + * (@addr) must be able to handle a direct call, and save the parameters + * of the function being traced, and restore them (or inject new ones + * if needed), before returning. + * + * Returns: + * 0 on success + * -EBUSY - Another direct function is already attached (there can be only one) + * -ENODEV - @ip does not point to a ftrace nop location (or not supported) + * -ENOMEM - There was an allocation failure. + */ +int register_ftrace_direct(unsigned long ip, unsigned long addr) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *free_hash = NULL; + struct dyn_ftrace *rec; + int ret = -EBUSY; + + mutex_lock(&direct_mutex); + + /* See if there's a direct function at @ip already */ + if (find_rec_direct(ip)) + goto out_unlock; + + ret = -ENODEV; + rec = lookup_rec(ip, ip); + if (!rec) + goto out_unlock; + + /* + * Check if the rec says it has a direct call but we didn't + * find one earlier? + */ + if (WARN_ON(rec->flags & FTRACE_FL_DIRECT)) + goto out_unlock; + + /* Make sure the ip points to the exact record */ + ip = rec->ip; + + ret = -ENOMEM; + if (ftrace_hash_empty(direct_functions) || + direct_functions->count > 2 * (1 << direct_functions->size_bits)) { + struct ftrace_hash *new_hash; + int size = ftrace_hash_empty(direct_functions) ? 
0 : + direct_functions->count + 1; + + if (size < 32) + size = 32; + + new_hash = dup_hash(direct_functions, size); + if (!new_hash) + goto out_unlock; + + free_hash = direct_functions; + direct_functions = new_hash; + } + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + goto out_unlock; + + entry->ip = ip; + entry->direct = addr; + __add_hash_entry(direct_functions, entry); + + ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0); + if (ret) + remove_hash_entry(direct_functions, entry); + + if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) { + ret = register_ftrace_function(&direct_ops); + if (ret) + ftrace_set_filter_ip(&direct_ops, ip, 1, 0); + } + + if (ret) + kfree(entry); + out_unlock: + mutex_unlock(&direct_mutex); + + if (free_hash) { + synchronize_rcu_tasks(); + free_ftrace_hash(free_hash); + } + + return ret; +} +EXPORT_SYMBOL_GPL(register_ftrace_direct); + +int unregister_ftrace_direct(unsigned long ip, unsigned long addr) +{ + struct ftrace_func_entry *entry; + struct dyn_ftrace *rec; + int ret = -ENODEV; + + mutex_lock(&direct_mutex); + + entry = __ftrace_lookup_ip(direct_functions, ip); + if (!entry) { + /* OK if it is off by a little */ + rec = lookup_rec(ip, ip); + if (!rec || rec->ip == ip) + goto out_unlock; + + entry = __ftrace_lookup_ip(direct_functions, rec->ip); + if (!entry) { + WARN_ON(rec->flags & FTRACE_FL_DIRECT); + goto out_unlock; + } + + WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); + } + + if (direct_functions->count == 1) + unregister_ftrace_function(&direct_ops); + + ret = ftrace_set_filter_ip(&direct_ops, ip, 1, 0); + + WARN_ON(ret); + + remove_hash_entry(direct_functions, entry); + + out_unlock: + mutex_unlock(&direct_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(unregister_ftrace_direct); +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + /** * ftrace_set_filter_ip - set a function to filter on in ftrace by address * @ops - the ops to set the filter with -- cgit v1.2.3 From 013bf0da0474816f57739daa006c8564ad7396a3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 13:11:27 -0500 Subject: ftrace: Add ftrace_find_direct_func() As function_graph tracer modifies the return address to insert a trampoline to trace the return of a function, it must be aware of a direct caller, as when it gets called, the function's return address may not be on the stack where it expects. It may have to see if that return address points to a direct caller and adjust it if it is.
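For context, a direct trampoline is attached with the interface added by the previous patch. A minimal module-side usage sketch follows (illustrative only; my_tramp stands for an arch-specific assembly trampoline that saves and restores the traced function's registers, which no architecture provides yet at this point in the series):

    #include <linux/ftrace.h>
    #include <linux/module.h>
    #include <linux/sched.h>

    extern void my_tramp(void);	/* assumed: implemented in assembly */

    static int __init direct_demo_init(void)
    {
            /* attach my_tramp to the fentry/mcount nop of wake_up_process() */
            return register_ftrace_direct((unsigned long)wake_up_process,
                                          (unsigned long)my_tramp);
    }

    static void __exit direct_demo_exit(void)
    {
            unregister_ftrace_direct((unsigned long)wake_up_process,
                                     (unsigned long)my_tramp);
    }

    module_init(direct_demo_init);
    module_exit(direct_demo_exit);
    MODULE_LICENSE("GPL");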
Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 6 ++++ kernel/trace/ftrace.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 84 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index efe3e521aff4..8b37b8105398 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -51,6 +51,7 @@ static inline void early_trace_init(void) { } struct module; struct ftrace_hash; +struct ftrace_direct_func; #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_MODULES) && \ defined(CONFIG_DYNAMIC_FTRACE) @@ -248,6 +249,7 @@ static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS int register_ftrace_direct(unsigned long ip, unsigned long addr); int unregister_ftrace_direct(unsigned long ip, unsigned long addr); +struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr); #else static inline int register_ftrace_direct(unsigned long ip, unsigned long addr) { @@ -257,6 +259,10 @@ static inline int unregister_ftrace_direct(unsigned long ip, unsigned long addr) { return -ENODEV; } +static inline struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr) +{ + return NULL; +} #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 329a3f3789a1..c4446eabacbe 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4938,6 +4938,46 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, } #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + +struct ftrace_direct_func { + struct list_head next; + unsigned long addr; + int count; +}; + +static LIST_HEAD(ftrace_direct_funcs); + +/** + * ftrace_find_direct_func - test an address if it is a registered direct caller + * @addr: The address of a registered direct caller + * + * This searches to see if a ftrace direct caller has been registered + * at a specific address, and if so, it returns a descriptor for it. + * + * This can be used by architecture code to see if an address is + * a direct caller (trampoline) attached to a fentry/mcount location. + * This is useful for the function_graph tracer, as it may need to + * do adjustments if it traced a location that also has a direct + * trampoline attached to it. 
+ */ +struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr) +{ + struct ftrace_direct_func *entry; + bool found = false; + + /* May be called by fgraph trampoline (protected by rcu tasks) */ + list_for_each_entry_rcu(entry, &ftrace_direct_funcs, next) { + if (entry->addr == addr) { + found = true; + break; + } + } + if (found) + return entry; + + return NULL; +} + /** * register_ftrace_direct - Call a custom trampoline directly * @ip: The address of the nop at the beginning of a function @@ -4957,6 +4997,7 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, */ int register_ftrace_direct(unsigned long ip, unsigned long addr) { + struct ftrace_direct_func *direct; struct ftrace_func_entry *entry; struct ftrace_hash *free_hash = NULL; struct dyn_ftrace *rec; @@ -5005,6 +5046,18 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) if (!entry) goto out_unlock; + direct = ftrace_find_direct_func(addr); + if (!direct) { + direct = kmalloc(sizeof(*direct), GFP_KERNEL); + if (!direct) { + kfree(entry); + goto out_unlock; + } + direct->addr = addr; + direct->count = 0; + list_add_rcu(&direct->next, &ftrace_direct_funcs); + } + entry->ip = ip; entry->direct = addr; __add_hash_entry(direct_functions, entry); @@ -5019,8 +5072,20 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) ftrace_set_filter_ip(&direct_ops, ip, 1, 0); } - if (ret) + if (ret) { kfree(entry); + if (!direct->count) { + list_del_rcu(&direct->next); + synchronize_rcu_tasks(); + kfree(direct); + if (free_hash) + free_ftrace_hash(free_hash); + free_hash = NULL; + } + } else { + direct->count++; + } out_unlock: mutex_unlock(&direct_mutex); @@ -5036,6 +5101,7 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct); int unregister_ftrace_direct(unsigned long ip, unsigned long addr) { struct ftrace_func_entry *entry; + struct ftrace_direct_func *direct; struct dyn_ftrace *rec; int ret = -ENODEV; @@ -5066,6 +5132,17 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) remove_hash_entry(direct_functions, entry); + direct = ftrace_find_direct_func(addr); + if (!WARN_ON(!direct)) { + /* This is the good path (see the ! before WARN) */ + direct->count--; + WARN_ON(direct->count < 0); + if (!direct->count) { + list_del_rcu(&direct->next); + synchronize_rcu_tasks(); + kfree(direct); + } + } out_unlock: mutex_unlock(&direct_mutex); -- cgit v1.2.3 From a3ad1a7e39689005cb04a4f2adb82f9d55b4724f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 13:12:57 -0500 Subject: ftrace/x86: Add a counter to test function_graph with direct As testing for direct calls from the function graph tracer adds a little overhead (which is a lot when tracing every function), add a counter that can be used to test if function_graph tracer needs to test for a direct caller or not. It would have been nicer if we could use a static branch, but the static branch logic fails when used within the function graph tracer trampoline.
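Condensed, the gating pattern the x86 change below applies in prepare_ftrace_return() is (sketch, using that function's self_addr/parent variables):

    /* Skip the direct-caller list walk entirely unless at least one
     * direct trampoline is registered anywhere in the system. */
    if (ftrace_direct_func_count &&
        ftrace_find_direct_func(self_addr + MCOUNT_INSN_SIZE)) {
            /* a direct trampoline made this call; the real return
             * address sits one word further up the stack */
            self_addr = *parent;
            parent++;
    }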
Signed-off-by: Steven Rostedt (VMware) --- arch/x86/kernel/ftrace.c | 8 +++++--- include/linux/ftrace.h | 2 ++ kernel/trace/ftrace.c | 4 ++++ 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index fef283f6341d..060a361d9d11 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -1049,9 +1049,11 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, * return address is actually off by one word, and we * need to adjust for that. */ - if (ftrace_find_direct_func(self_addr + MCOUNT_INSN_SIZE)) { - self_addr = *parent; - parent++; + if (ftrace_direct_func_count) { + if (ftrace_find_direct_func(self_addr + MCOUNT_INSN_SIZE)) { + self_addr = *parent; + parent++; + } } /* diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 2bc7bd6b8387..55647e185141 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -247,10 +247,12 @@ static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +extern int ftrace_direct_func_count; int register_ftrace_direct(unsigned long ip, unsigned long addr); int unregister_ftrace_direct(unsigned long ip, unsigned long addr); struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr); #else +# define ftrace_direct_func_count 0 static inline int register_ftrace_direct(unsigned long ip, unsigned long addr) { return -ENODEV; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c4446eabacbe..f9456346ec66 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2364,6 +2364,7 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) /* Protected by rcu_tasks for reading, and direct_mutex for writing */ static struct ftrace_hash *direct_functions = EMPTY_HASH; static DEFINE_MUTEX(direct_mutex); +int ftrace_direct_func_count; /* * Search the direct_functions hash to see if the given instruction pointer @@ -5056,6 +5057,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) direct->addr = addr; direct->count = 0; list_add_rcu(&direct->next, &ftrace_direct_funcs); + ftrace_direct_func_count++; } entry->ip = ip; @@ -5081,6 +5083,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) if (free_hash) free_ftrace_hash(free_hash); free_hash = NULL; + ftrace_direct_func_count--; } } else { direct->count++; @@ -5141,6 +5144,7 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) list_del_rcu(&direct->next); synchronize_rcu_tasks(); kfree(direct); + ftrace_direct_func_count--; } } out_unlock: -- cgit v1.2.3 From da537f0aef1372c5204356a7df06be8769467b7b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 1 Oct 2019 14:38:07 -0400 Subject: ftrace: Add information on number of page groups allocated Looking for ways to shrink the size of the dyn_ftrace structure, knowing the information about how many pages and the number of groups of those pages, is useful in working out the best ways to save on memory. This adds one info print on how many groups of pages were used to allocate the ftrace dyn_ftrace structures, and also shows the number of pages and groups in the dyn_ftrace_total_info (which is used for debugging).
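As a worked example of the bookkeeping (illustrative figures only: 4K pages, and a struct dyn_ftrace assumed to be 16 bytes here; ENTRY_SIZE depends on arch and config):

    int order = 2;                              /* one allocation group */
    ftrace_number_of_pages += 1 << order;       /* adds 4 pages         */
    ftrace_number_of_groups++;                  /* adds 1 group         */
    /* such a group holds (PAGE_SIZE << order) / ENTRY_SIZE records,
     * i.e. (4096 << 2) / 16 = 1024 dyn_ftrace entries */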
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 14 ++++++++++++++ kernel/trace/trace.c | 21 +++++++++++++++------ kernel/trace/trace.h | 2 ++ 3 files changed, 31 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f9456346ec66..d2d488c43a6a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2991,6 +2991,8 @@ static void ftrace_shutdown_sysctl(void) static u64 ftrace_update_time; unsigned long ftrace_update_tot_cnt; +unsigned long ftrace_number_of_pages; +unsigned long ftrace_number_of_groups; static inline int ops_traces_mod(struct ftrace_ops *ops) { @@ -3115,6 +3117,9 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count) goto again; } + ftrace_number_of_pages += 1 << order; + ftrace_number_of_groups++; + cnt = (PAGE_SIZE << order) / ENTRY_SIZE; pg->size = cnt; @@ -3170,6 +3175,8 @@ ftrace_allocate_pages(unsigned long num_to_init) start_pg = pg->next; kfree(pg); pg = start_pg; + ftrace_number_of_pages -= 1 << order; + ftrace_number_of_groups--; } pr_info("ftrace: FAILED to allocate memory for functions\n"); return NULL; @@ -6173,6 +6180,8 @@ void ftrace_release_mod(struct module *mod) free_pages((unsigned long)pg->records, order); tmp_page = pg->next; kfree(pg); + ftrace_number_of_pages -= 1 << order; + ftrace_number_of_groups--; } } @@ -6514,6 +6523,8 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) *last_pg = pg->next; order = get_count_order(pg->size / ENTRIES_PER_PAGE); free_pages((unsigned long)pg->records, order); + ftrace_number_of_pages -= 1 << order; + ftrace_number_of_groups--; kfree(pg); pg = container_of(last_pg, struct ftrace_page, next); if (!(*last_pg)) @@ -6569,6 +6580,9 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); + pr_info("ftrace: allocated %ld pages with %ld groups\n", + ftrace_number_of_pages, ftrace_number_of_groups); + set_ftrace_early_filters(); return; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6a0ee9178365..5ea8c7c0f2d7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7583,14 +7583,23 @@ static ssize_t tracing_read_dyn_info(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - unsigned long *p = filp->private_data; - char buf[64]; /* Not too big for a shallow stack */ + ssize_t ret; + char *buf; int r; - r = scnprintf(buf, 63, "%ld", *p); - buf[r++] = '\n'; + /* 256 should be plenty to hold the amount needed */ + buf = kmalloc(256, GFP_KERNEL); + if (!buf) + return -ENOMEM; - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + r = scnprintf(buf, 256, "%ld pages:%ld groups: %ld\n", + ftrace_update_tot_cnt, + ftrace_number_of_pages, + ftrace_number_of_groups); + + ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + kfree(buf); + return ret; } static const struct file_operations tracing_dyn_info_fops = { @@ -8782,7 +8791,7 @@ static __init int tracer_init_tracefs(void) #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, - &ftrace_update_tot_cnt, &tracing_dyn_info_fops); + NULL, &tracing_dyn_info_fops); #endif create_trace_instances(d_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d685c61085c0..8b590f10bc72 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -804,6 +804,8 @@ extern void trace_event_follow_fork(struct trace_array *tr, bool enable); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; +extern unsigned long 
ftrace_number_of_pages; +extern unsigned long ftrace_number_of_groups; void ftrace_init_trace_array(struct trace_array *tr); #else static inline void ftrace_init_trace_array(struct trace_array *tr) { } -- cgit v1.2.3 From 91edde2e6ae1dd5e33812f076f3fe4cb7ccbfdd0 Mon Sep 17 00:00:00 2001 From: "Viktor Rosendahl (BMW)" Date: Wed, 9 Oct 2019 00:08:21 +0200 Subject: ftrace: Implement fs notification for tracing_max_latency This patch implements the feature that the tracing_max_latency file, e.g. /sys/kernel/debug/tracing/tracing_max_latency will receive notifications through the fsnotify framework when a new latency is available. One particularly interesting use of this facility is when enabling threshold tracing, through /sys/kernel/debug/tracing/tracing_thresh, together with the preempt/irqsoff tracers. This makes it possible to implement a user space program that can, with equal probability, obtain traces of latencies that occur immediately after each other in spite of the fact that the preempt/irqsoff tracers operate in overwrite mode. This facility works with the hwlat, preempt/irqsoff, and wakeup tracers. The tracers may call the latency_fsnotify() from places such as __schedule() or do_idle(); this makes it impossible to call queue_work() directly without risking a deadlock. The same would happen with a softirq, kernel thread or tasklet. For this reason we use the irq_work mechanism to call queue_work(). This patch creates a new workqueue. The reason for doing this is that I wanted to use the WQ_UNBOUND and WQ_HIGHPRI flags. My thinking was that WQ_UNBOUND might help with the latency in some important cases. If we use: queue_work(system_highpri_wq, &tr->fsnotify_work); then the work will (almost) always execute on the same CPU but if we are unlucky that CPU could be too busy while there could be another CPU in the system that would be able to process the work soon enough. queue_work_on() could be used to queue the work on another CPU but it seems difficult to select the right CPU. 
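Such a user space program can consume the notifications with plain inotify, since fsnotify's FS_MODIFY events are delivered to inotify watchers as IN_MODIFY. A minimal sketch, assuming debugfs is mounted at the usual location:

    #include <stdio.h>
    #include <sys/inotify.h>
    #include <unistd.h>

    int main(void)
    {
            char buf[4096];
            int fd = inotify_init();

            if (fd < 0)
                    return 1;
            if (inotify_add_watch(fd,
                        "/sys/kernel/debug/tracing/tracing_max_latency",
                        IN_MODIFY) < 0)
                    return 1;
            /* each event means a new max latency was recorded; read the
             * file and/or save the trace here, then wait for the next one */
            while (read(fd, buf, sizeof(buf)) > 0)
                    printf("tracing_max_latency updated\n");
            return 0;
    }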
Link: http://lkml.kernel.org/r/20191008220824.7911-2-viktor.rosendahl@gmail.com Reviewed-by: Joel Fernandes (Google) Signed-off-by: Viktor Rosendahl (BMW) [ Added max() to have one compare for max latency ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 75 ++++++++++++++++++++++++++++++++++++++++++++-- kernel/trace/trace.h | 18 +++++++++++ kernel/trace/trace_hwlat.c | 11 ++++--- 3 files changed, 98 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5ea8c7c0f2d7..f093a433cb42 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -45,6 +45,9 @@ #include #include #include +#include +#include +#include #include "trace.h" #include "trace_output.h" @@ -1497,6 +1500,74 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) } unsigned long __read_mostly tracing_thresh; +static const struct file_operations tracing_max_lat_fops; + +#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ + defined(CONFIG_FSNOTIFY) + +static struct workqueue_struct *fsnotify_wq; + +static void latency_fsnotify_workfn(struct work_struct *work) +{ + struct trace_array *tr = container_of(work, struct trace_array, + fsnotify_work); + fsnotify(tr->d_max_latency->d_inode, FS_MODIFY, + tr->d_max_latency->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0); +} + +static void latency_fsnotify_workfn_irq(struct irq_work *iwork) +{ + struct trace_array *tr = container_of(iwork, struct trace_array, + fsnotify_irqwork); + queue_work(fsnotify_wq, &tr->fsnotify_work); +} + +static void trace_create_maxlat_file(struct trace_array *tr, + struct dentry *d_tracer) +{ + INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn); + init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); + tr->d_max_latency = trace_create_file("tracing_max_latency", 0644, + d_tracer, &tr->max_latency, + &tracing_max_lat_fops); +} + +__init static int latency_fsnotify_init(void) +{ + fsnotify_wq = alloc_workqueue("tr_max_lat_wq", + WQ_UNBOUND | WQ_HIGHPRI, 0); + if (!fsnotify_wq) { + pr_err("Unable to allocate tr_max_lat_wq\n"); + return -ENOMEM; + } + return 0; +} + +late_initcall_sync(latency_fsnotify_init); + +void latency_fsnotify(struct trace_array *tr) +{ + if (!fsnotify_wq) + return; + /* + * We cannot call queue_work(&tr->fsnotify_work) from here because it's + * possible that we are called from __schedule() or do_idle(), which + * could cause a deadlock. 
+ */ + irq_work_queue(&tr->fsnotify_irqwork); +} + +/* + * (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ + * defined(CONFIG_FSNOTIFY) + */ +#else + +#define trace_create_maxlat_file(tr, d_tracer) \ + trace_create_file("tracing_max_latency", 0644, d_tracer, \ + &tr->max_latency, &tracing_max_lat_fops) + +#endif #ifdef CONFIG_TRACER_MAX_TRACE /* @@ -1536,6 +1607,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) /* record this tasks comm */ tracing_record_cmdline(tsk); + latency_fsnotify(tr); } /** @@ -8594,8 +8666,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) create_trace_options_dir(tr); #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) - trace_create_file("tracing_max_latency", 0644, d_tracer, - &tr->max_latency, &tracing_max_lat_fops); + trace_create_maxlat_file(tr, d_tracer); #endif if (ftrace_create_function_files(tr, d_tracer)) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8b590f10bc72..718eb998c13e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -16,6 +16,8 @@ #include #include #include +#include +#include #ifdef CONFIG_FTRACE_SYSCALLS #include /* For NR_SYSCALLS */ @@ -264,6 +266,11 @@ struct trace_array { #endif #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) unsigned long max_latency; +#ifdef CONFIG_FSNOTIFY + struct dentry *d_max_latency; + struct work_struct fsnotify_work; + struct irq_work fsnotify_irqwork; +#endif #endif struct trace_pid_list __rcu *filtered_pids; /* @@ -786,6 +793,17 @@ void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); #endif /* CONFIG_TRACER_MAX_TRACE */ +#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ + defined(CONFIG_FSNOTIFY) + +void latency_fsnotify(struct trace_array *tr); + +#else + +static void latency_fsnotify(struct trace_array *tr) { } + +#endif + #ifdef CONFIG_STACKTRACE void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc); diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 862f4b0139fc..63526670605a 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -237,6 +237,7 @@ static int get_sample(void) /* If we exceed the threshold value, we have found a hardware latency */ if (sample > thresh || outer_sample > thresh) { struct hwlat_sample s; + u64 latency; ret = 1; @@ -253,11 +254,13 @@ static int get_sample(void) s.nmi_count = nmi_count; trace_hwlat_sample(&s); + latency = max(sample, outer_sample); + /* Keep a running maximum ever recorded hardware latency */ - if (sample > tr->max_latency) - tr->max_latency = sample; - if (outer_sample > tr->max_latency) - tr->max_latency = outer_sample; + if (latency > tr->max_latency) { + tr->max_latency = latency; + latency_fsnotify(tr); + } } out: -- cgit v1.2.3 From 793937236d1ee032d2ee5ccc27bdd280a04e766e Mon Sep 17 00:00:00 2001 From: "Viktor Rosendahl (BMW)" Date: Wed, 9 Oct 2019 00:08:22 +0200 Subject: preemptirq_delay_test: Add the burst feature and a sysfs trigger This burst feature enables the user to generate a burst of preempt/irqsoff latencies. This makes it possible to test whether we are able to detect latencies that systematically occur very close to each other. The maximum burst size is 10. We also create 10 identical test functions, so that we get 10 different backtraces; this is useful when we want to test whether we can detect all the latencies in a burst. 
Otherwise, there would be no easy way of differentiating between which latency in a burst was captured by the tracer. In addition, there is a sysfs trigger, so that it's not necessary to reload the module to repeat the test. The trigger will appear as /sys/kernel/preemptirq_delay_test/trigger in sysfs. Link: http://lkml.kernel.org/r/20191008220824.7911-3-viktor.rosendahl@gmail.com Reviewed-by: Joel Fernandes (Google) Signed-off-by: Viktor Rosendahl (BMW) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 6 +- kernel/trace/preemptirq_delay_test.c | 144 ++++++++++++++++++++++++++++++----- 2 files changed, 128 insertions(+), 22 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 624a05e99b0b..d25314bc7a1c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -760,9 +760,9 @@ config PREEMPTIRQ_DELAY_TEST configurable delay. The module busy waits for the duration of the critical section. - For example, the following invocation forces a one-time irq-disabled - critical section for 500us: - modprobe preemptirq_delay_test test_mode=irq delay=500000 + For example, the following invocation generates a burst of three + irq-disabled critical sections for 500us: + modprobe preemptirq_delay_test test_mode=irq delay=500 burst_size=3 If unsure, say N diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index d8765c952fab..31c0fad4cb9e 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -10,18 +10,25 @@ #include #include #include +#include #include #include #include #include +#include static ulong delay = 100; -static char test_mode[10] = "irq"; +static char test_mode[12] = "irq"; +static uint burst_size = 1; -module_param_named(delay, delay, ulong, S_IRUGO); -module_param_string(test_mode, test_mode, 10, S_IRUGO); -MODULE_PARM_DESC(delay, "Period in microseconds (100 uS default)"); -MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt or irq (default irq)"); +module_param_named(delay, delay, ulong, 0444); +module_param_string(test_mode, test_mode, 12, 0444); +module_param_named(burst_size, burst_size, uint, 0444); +MODULE_PARM_DESC(delay, "Period in microseconds (100 us default)"); +MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt, irq, or alternate (default irq)"); +MODULE_PARM_DESC(burst_size, "The size of a burst (default 1)"); + +#define MIN(x, y) ((x) < (y) ? 
(x) : (y)) static void busy_wait(ulong time) { @@ -34,37 +41,136 @@ static void busy_wait(ulong time) } while ((end - start) < (time * 1000)); } -static int preemptirq_delay_run(void *data) +static __always_inline void irqoff_test(void) { unsigned long flags; + local_irq_save(flags); + busy_wait(delay); + local_irq_restore(flags); +} - if (!strcmp(test_mode, "irq")) { - local_irq_save(flags); - busy_wait(delay); - local_irq_restore(flags); - } else if (!strcmp(test_mode, "preempt")) { - preempt_disable(); - busy_wait(delay); - preempt_enable(); +static __always_inline void preemptoff_test(void) +{ + preempt_disable(); + busy_wait(delay); + preempt_enable(); +} + +static void execute_preemptirqtest(int idx) +{ + if (!strcmp(test_mode, "irq")) + irqoff_test(); + else if (!strcmp(test_mode, "preempt")) + preemptoff_test(); + else if (!strcmp(test_mode, "alternate")) { + if (idx % 2 == 0) + irqoff_test(); + else + preemptoff_test(); } +} + +#define DECLARE_TESTFN(POSTFIX) \ + static void preemptirqtest_##POSTFIX(int idx) \ + { \ + execute_preemptirqtest(idx); \ + } \ +/* + * We create 10 different functions, so that we can get 10 different + * backtraces. + */ +DECLARE_TESTFN(0) +DECLARE_TESTFN(1) +DECLARE_TESTFN(2) +DECLARE_TESTFN(3) +DECLARE_TESTFN(4) +DECLARE_TESTFN(5) +DECLARE_TESTFN(6) +DECLARE_TESTFN(7) +DECLARE_TESTFN(8) +DECLARE_TESTFN(9) + +static void (*testfuncs[])(int) = { + preemptirqtest_0, + preemptirqtest_1, + preemptirqtest_2, + preemptirqtest_3, + preemptirqtest_4, + preemptirqtest_5, + preemptirqtest_6, + preemptirqtest_7, + preemptirqtest_8, + preemptirqtest_9, +}; + +#define NR_TEST_FUNCS ARRAY_SIZE(testfuncs) + +static int preemptirq_delay_run(void *data) +{ + int i; + int s = MIN(burst_size, NR_TEST_FUNCS); + + for (i = 0; i < s; i++) + (testfuncs[i])(i); return 0; } -static int __init preemptirq_delay_init(void) +static struct task_struct *preemptirq_start_test(void) { char task_name[50]; - struct task_struct *test_task; snprintf(task_name, sizeof(task_name), "%s_test", test_mode); + return kthread_run(preemptirq_delay_run, NULL, task_name); +} + + +static ssize_t trigger_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + preemptirq_start_test(); + return count; +} + +static struct kobj_attribute trigger_attribute = + __ATTR(trigger, 0200, NULL, trigger_store); + +static struct attribute *attrs[] = { + &trigger_attribute.attr, + NULL, +}; + +static struct attribute_group attr_group = { + .attrs = attrs, +}; + +static struct kobject *preemptirq_delay_kobj; + +static int __init preemptirq_delay_init(void) +{ + struct task_struct *test_task; + int retval; + + test_task = preemptirq_start_test(); + retval = PTR_ERR_OR_ZERO(test_task); + if (retval != 0) + return retval; + + preemptirq_delay_kobj = kobject_create_and_add("preemptirq_delay_test", + kernel_kobj); + if (!preemptirq_delay_kobj) + return -ENOMEM; + + retval = sysfs_create_group(preemptirq_delay_kobj, &attr_group); + if (retval) + kobject_put(preemptirq_delay_kobj); - test_task = kthread_run(preemptirq_delay_run, NULL, task_name); - return PTR_ERR_OR_ZERO(test_task); + return retval; } static void __exit preemptirq_delay_exit(void) { - return; + kobject_put(preemptirq_delay_kobj); } module_init(preemptirq_delay_init) -- cgit v1.2.3 From 9c34fc4b7e903117cc27712b9e6c8690debb7e95 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 15 Oct 2019 21:18:20 +0200 Subject: tracing: Use CONFIG_PREEMPTION CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by 
CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Add additional header output for PREEMPT_RT. Link: http://lkml.kernel.org/r/20191015191821.11479-34-bigeasy@linutronix.de Cc: Ingo Molnar Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/ftrace-uses.rst | 2 +- kernel/trace/trace.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/Documentation/trace/ftrace-uses.rst b/Documentation/trace/ftrace-uses.rst index 740bd0224d35..2a05e770618a 100644 --- a/Documentation/trace/ftrace-uses.rst +++ b/Documentation/trace/ftrace-uses.rst @@ -146,7 +146,7 @@ FTRACE_OPS_FL_RECURSION_SAFE itself or any nested functions that those functions call. If this flag is set, it is possible that the callback will also - be called with preemption enabled (when CONFIG_PREEMPT is set), + be called with preemption enabled (when CONFIG_PREEMPTION is set), but this is not guaranteed. FTRACE_OPS_FL_IPMODIFY diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f093a433cb42..db7d06a26861 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3726,6 +3726,8 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) "desktop", #elif defined(CONFIG_PREEMPT) "preempt", +#elif defined(CONFIG_PREEMPT_RT) + "preempt_rt", #else "unknown", #endif -- cgit v1.2.3 From 6dff4d7dd3e0158688683a17dd792861aa9d61e2 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Tue, 15 Oct 2019 13:10:12 +0100 Subject: tracing: Make internal ftrace events static The event_class_ftrace_##call and event_##call do not seem to be used outside of trace_export.c so make them both static to avoid a number of sparse warnings: kernel/trace/trace_entries.h:59:1: warning: symbol 'event_class_ftrace_function' was not declared. Should it be static? kernel/trace/trace_entries.h:59:1: warning: symbol '__event_function' was not declared. Should it be static? kernel/trace/trace_entries.h:77:1: warning: symbol 'event_class_ftrace_funcgraph_entry' was not declared. Should it be static? kernel/trace/trace_entries.h:77:1: warning: symbol '__event_funcgraph_entry' was not declared. Should it be static? kernel/trace/trace_entries.h:93:1: warning: symbol 'event_class_ftrace_funcgraph_exit' was not declared. Should it be static? kernel/trace/trace_entries.h:93:1: warning: symbol '__event_funcgraph_exit' was not declared. Should it be static? kernel/trace/trace_entries.h:129:1: warning: symbol 'event_class_ftrace_context_switch' was not declared. Should it be static? kernel/trace/trace_entries.h:129:1: warning: symbol '__event_context_switch' was not declared. Should it be static? kernel/trace/trace_entries.h:149:1: warning: symbol 'event_class_ftrace_wakeup' was not declared. Should it be static? kernel/trace/trace_entries.h:149:1: warning: symbol '__event_wakeup' was not declared. Should it be static? kernel/trace/trace_entries.h:171:1: warning: symbol 'event_class_ftrace_kernel_stack' was not declared. Should it be static? kernel/trace/trace_entries.h:171:1: warning: symbol '__event_kernel_stack' was not declared. Should it be static? kernel/trace/trace_entries.h:191:1: warning: symbol 'event_class_ftrace_user_stack' was not declared. Should it be static? kernel/trace/trace_entries.h:191:1: warning: symbol '__event_user_stack' was not declared. Should it be static? kernel/trace/trace_entries.h:214:1: warning: symbol 'event_class_ftrace_bprint' was not declared. Should it be static? 
kernel/trace/trace_entries.h:214:1: warning: symbol '__event_bprint' was not declared. Should it be static? kernel/trace/trace_entries.h:230:1: warning: symbol 'event_class_ftrace_print' was not declared. Should it be static? kernel/trace/trace_entries.h:230:1: warning: symbol '__event_print' was not declared. Should it be static? kernel/trace/trace_entries.h:247:1: warning: symbol 'event_class_ftrace_raw_data' was not declared. Should it be static? kernel/trace/trace_entries.h:247:1: warning: symbol '__event_raw_data' was not declared. Should it be static? kernel/trace/trace_entries.h:262:1: warning: symbol 'event_class_ftrace_bputs' was not declared. Should it be static? kernel/trace/trace_entries.h:262:1: warning: symbol '__event_bputs' was not declared. Should it be static? kernel/trace/trace_entries.h:277:1: warning: symbol 'event_class_ftrace_mmiotrace_rw' was not declared. Should it be static? kernel/trace/trace_entries.h:277:1: warning: symbol '__event_mmiotrace_rw' was not declared. Should it be static? kernel/trace/trace_entries.h:298:1: warning: symbol 'event_class_ftrace_mmiotrace_map' was not declared. Should it be static? kernel/trace/trace_entries.h:298:1: warning: symbol '__event_mmiotrace_map' was not declared. Should it be static? kernel/trace/trace_entries.h:322:1: warning: symbol 'event_class_ftrace_branch' was not declared. Should it be static? kernel/trace/trace_entries.h:322:1: warning: symbol '__event_branch' was not declared. Should it be static? kernel/trace/trace_entries.h:343:1: warning: symbol 'event_class_ftrace_hwlat' was not declared. Should it be static? kernel/trace/trace_entries.h:343:1: warning: symbol '__event_hwlat' was not declared. Should it be static? Link: http://lkml.kernel.org/r/20191015121012.18824-1-ben.dooks@codethink.co.uk Signed-off-by: Ben Dooks Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_export.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 45630a76ed3a..2e6d2e9741cc 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -171,7 +171,7 @@ ftrace_define_fields_##name(struct trace_event_call *event_call) \ #define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ regfn) \ \ -struct trace_event_class __refdata event_class_ftrace_##call = { \ +static struct trace_event_class __refdata event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .define_fields = ftrace_define_fields_##call, \ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ @@ -187,7 +187,7 @@ struct trace_event_call __used event_##call = { \ .print_fmt = print, \ .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ }; \ -struct trace_event_call __used \ +static struct trace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; #undef FTRACE_ENTRY -- cgit v1.2.3 From 2d6425af61166e026e7476db64f70f1266127b1d Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 14 Aug 2019 10:55:23 -0700 Subject: tracing: Declare newly exported APIs in include/linux/trace.h Declare the newly introduced and exported APIs in the header file - include/linux/trace.h. Moving previous declarations from kernel/trace/trace.h to include/linux/trace.h. 
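
As a rough sketch of what the move enables: an out-of-tree module can now reach the exported instance API through include/linux/trace.h alone. The instance name below is made up, and using IS_ERR_OR_NULL() to cover trace_array_create()'s failure modes is an assumption for illustration.

#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/trace.h>

static struct trace_array *demo_tr;

static int __init demo_init(void)
{
	trace_printk_init_buffers();

	/* Shows up as a directory under the tracefs instances/ directory. */
	demo_tr = trace_array_create("demo");
	if (IS_ERR_OR_NULL(demo_tr))
		return -ENOMEM;

	trace_array_printk(demo_tr, _THIS_IP_, "hello from demo\n");
	return 0;
}

static void __exit demo_exit(void)
{
	trace_array_destroy(demo_tr);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");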
Link: http://lkml.kernel.org/r/1565805327-579-2-git-send-email-divya.indi@oracle.com Signed-off-by: Divya Indi Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace.h | 7 +++++++ kernel/trace/trace.h | 4 +--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/trace.h b/include/linux/trace.h index b95ffb2188ab..24fcf07812ae 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -24,6 +24,13 @@ struct trace_export { int register_ftrace_export(struct trace_export *export); int unregister_ftrace_export(struct trace_export *export); +struct trace_array; + +void trace_printk_init_buffers(void); +int trace_array_printk(struct trace_array *tr, unsigned long ip, + const char *fmt, ...); +struct trace_array *trace_array_create(const char *name); +int trace_array_destroy(struct trace_array *tr); #endif /* CONFIG_TRACING */ #endif /* _LINUX_TRACE_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 718eb998c13e..90cba68c8b50 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -873,8 +874,6 @@ trace_vprintk(unsigned long ip, const char *fmt, va_list args); extern int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args); -int trace_array_printk(struct trace_array *tr, - unsigned long ip, const char *fmt, ...); int trace_array_printk_buf(struct ring_buffer *buffer, unsigned long ip, const char *fmt, ...); void trace_printk_seq(struct trace_seq *s); @@ -1890,7 +1889,6 @@ extern const char *__start___tracepoint_str[]; extern const char *__stop___tracepoint_str[]; void trace_printk_control(bool enabled); -void trace_printk_init_buffers(void); void trace_printk_start_comm(void); int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); -- cgit v1.2.3 From e585e6469d6f476b82aa148dc44aaf7ae269a4e2 Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 14 Aug 2019 10:55:24 -0700 Subject: tracing: Verify if trace array exists before destroying it. A trace array can be destroyed from userspace or kernel. Verify if the trace array exists before proceeding to destroy/remove it. Link: http://lkml.kernel.org/r/1565805327-579-3-git-send-email-divya.indi@oracle.com Reviewed-by: Aruna Ramakrishna Signed-off-by: Divya Indi [ Removed unneeded braces ] Signed-off-by: Steven Rostedt (VMware) --- kernel/module.c | 6 +++++- kernel/trace/trace.c | 15 ++++++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/module.c b/kernel/module.c index ff2d7359a418..6e2fd40a6ed9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3728,7 +3728,6 @@ static int complete_formation(struct module *mod, struct load_info *info) module_enable_ro(mod, false); module_enable_nx(mod); - module_enable_x(mod); /* Mark state as coming so strong_try_module_get() ignores us, * but kallsyms etc. can see us. 
*/ @@ -3751,6 +3750,11 @@ static int prepare_coming_module(struct module *mod) if (err) return err; + /* Make module executable after ftrace is enabled */ + mutex_lock(&module_mutex); + module_enable_x(mod); + mutex_unlock(&module_mutex); + blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); return 0; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index db7d06a26861..fa4f742fc449 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8556,17 +8556,26 @@ static int __remove_instance(struct trace_array *tr) return 0; } -int trace_array_destroy(struct trace_array *tr) +int trace_array_destroy(struct trace_array *this_tr) { + struct trace_array *tr; int ret; - if (!tr) + if (!this_tr) return -EINVAL; mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); - ret = __remove_instance(tr); + ret = -ENODEV; + + /* Making sure trace array exists before destroying it. */ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr == this_tr) { + ret = __remove_instance(tr); + break; + } + } mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); -- cgit v1.2.3 From 953ae45a0c25e09428d4a03d7654f97ab8a36647 Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 14 Aug 2019 10:55:25 -0700 Subject: tracing: Adding NULL checks for trace_array descriptor pointer As part of commit f45d1225adb0 ("tracing: Kernel access to Ftrace instances") we exported certain functions. Here, we are adding some additional NULL checks to ensure safe usage by users of these APIs. Link: http://lkml.kernel.org/r/1565805327-579-4-git-send-email-divya.indi@oracle.com Signed-off-by: Divya Indi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 3 +++ kernel/trace/trace_events.c | 2 ++ 2 files changed, 5 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fa4f742fc449..79fe4d6ecbd8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3297,6 +3297,9 @@ int trace_array_printk(struct trace_array *tr, if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; + if (!tr) + return -ENOENT; + va_start(ap, fmt); ret = trace_array_vprintk(tr, ip, fmt, ap); va_end(ap); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fba87d10f0c1..2a3ac2365445 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -793,6 +793,8 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) char *event = NULL, *sub = NULL, *match; int ret; + if (!tr) + return -ENOENT; /* * The buf format can be : * *: means any event by that name. -- cgit v1.2.3 From b83b43ffc6e4b514ca034a0fbdee01322e2f7022 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 15 Oct 2019 09:00:55 -0400 Subject: fgraph: Fix function type mismatches of ftrace_graph_return using ftrace_stub The C compiler is allowing more checks to make sure that function pointers are assigned to the correct prototype function. Unfortunately, the function graph tracer uses a special name with its assigned ftrace_graph_return function pointer that maps to a stub function used by the function tracer (ftrace_stub). The ftrace_graph_return variable is compared to the ftrace_stub in some archs to know if the function graph tracer is enabled or not. This means we can not just simply create a new function stub that compares it without modifying all the archs. 
Instead, have the linker script create a function_graph_stub that maps to ftrace_stub, and this way we can define the prototype for it to match the prototype of ftrace_graph_return, and make the compiler checks all happy! Link: http://lkml.kernel.org/r/20191015090055.789a0aed@gandalf.local.home Cc: linux-sh@vger.kernel.org Cc: Yoshinori Sato Cc: Rich Felker Reported-by: Sami Tolvanen Signed-off-by: Steven Rostedt (VMware) --- arch/sh/boot/compressed/misc.c | 5 +++++ include/asm-generic/vmlinux.lds.h | 17 ++++++++++++++--- kernel/trace/fgraph.c | 11 ++++++++--- 3 files changed, 27 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/arch/sh/boot/compressed/misc.c b/arch/sh/boot/compressed/misc.c index c15cac9251b9..e69ec12cbbe6 100644 --- a/arch/sh/boot/compressed/misc.c +++ b/arch/sh/boot/compressed/misc.c @@ -111,6 +111,11 @@ void __stack_chk_fail(void) error("stack-protector: Kernel stack is corrupted\n"); } +/* Needed because vmlinux.lds.h references this */ +void ftrace_stub(void) +{ +} + #ifdef CONFIG_SUPERH64 #define stackalign 8 #else diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index dae64600ccbf..0f358be551cd 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -111,18 +111,29 @@ #ifdef CONFIG_FTRACE_MCOUNT_RECORD #ifdef CC_USING_PATCHABLE_FUNCTION_ENTRY +/* + * Need to also make ftrace_graph_stub point to ftrace_stub + * so that the same stub location may have different protocols + * and not mess up with C verifiers. + */ #define MCOUNT_REC() . = ALIGN(8); \ __start_mcount_loc = .; \ KEEP(*(__patchable_function_entries)) \ - __stop_mcount_loc = .; + __stop_mcount_loc = .; \ + ftrace_graph_stub = ftrace_stub; #else #define MCOUNT_REC() . = ALIGN(8); \ __start_mcount_loc = .; \ KEEP(*(__mcount_loc)) \ - __stop_mcount_loc = .; + __stop_mcount_loc = .; \ + ftrace_graph_stub = ftrace_stub; #endif #else -#define MCOUNT_REC() +# ifdef CONFIG_FUNCTION_TRACER +# define MCOUNT_REC() ftrace_graph_stub = ftrace_stub; +# else +# define MCOUNT_REC() +# endif #endif #ifdef CONFIG_TRACE_BRANCH_PROFILING diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 7950a0356042..fa3ce10d0405 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -332,9 +332,14 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) return 0; } +/* + * Simply points to ftrace_stub, but with the proper protocol. + * Defined by the linker script in linux/vmlinux.lds.h + */ +extern void ftrace_graph_stub(struct ftrace_graph_ret *); + /* The callbacks that hook a function */ -trace_func_graph_ret_t ftrace_graph_return = - (trace_func_graph_ret_t)ftrace_stub; +trace_func_graph_ret_t ftrace_graph_return = ftrace_graph_stub; trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; @@ -614,7 +619,7 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) goto out; ftrace_graph_active--; - ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; + ftrace_graph_return = ftrace_graph_stub; ftrace_graph_entry = ftrace_graph_entry_stub; __ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); -- cgit v1.2.3 From 80042c8f06bf5a7b87a63deaa3deb56f2cd52645 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 7 Oct 2019 16:56:56 +0300 Subject: tracing: Use generic type for comparator function Comparator function type, cmp_func_t, is defined in the types.h, use it in the code. 
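
For reference, cmp_func_t is the generic comparator type from linux/types.h, int (*)(const void *, const void *), the same shape that sort() takes. A minimal sketch of a conforming comparator follows; the struct and field names are invented.

#include <linux/sort.h>
#include <linux/types.h>

struct sample {
	u64 time;
};

/* Matches cmp_func_t: const void * arguments, <0 / 0 / >0 result. */
static int sample_cmp(const void *p1, const void *p2)
{
	const struct sample *a = p1;
	const struct sample *b = p2;

	if (a->time < b->time)
		return -1;
	if (a->time > b->time)
		return 1;
	return 0;
}

/* e.g.: sort(samples, nr_samples, sizeof(*samples), sample_cmp, NULL); */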
Link: http://lkml.kernel.org/r/20191007135656.37734-3-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 12 ++++++------ kernel/trace/trace_branch.c | 8 ++++---- kernel/trace/trace_stat.c | 6 ++---- kernel/trace/trace_stat.h | 2 +- 4 files changed, 13 insertions(+), 15 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d2d488c43a6a..82ef8d60a42b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -465,10 +465,10 @@ static void *function_stat_start(struct tracer_stat *trace) #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* function graph compares on total time */ -static int function_stat_cmp(void *p1, void *p2) +static int function_stat_cmp(const void *p1, const void *p2) { - struct ftrace_profile *a = p1; - struct ftrace_profile *b = p2; + const struct ftrace_profile *a = p1; + const struct ftrace_profile *b = p2; if (a->time < b->time) return -1; @@ -479,10 +479,10 @@ static int function_stat_cmp(void *p1, void *p2) } #else /* not function graph compares against hits */ -static int function_stat_cmp(void *p1, void *p2) +static int function_stat_cmp(const void *p1, const void *p2) { - struct ftrace_profile *a = p1; - struct ftrace_profile *b = p2; + const struct ftrace_profile *a = p1; + const struct ftrace_profile *b = p2; if (a->counter < b->counter) return -1; diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 3ea65cdff30d..88e158d27965 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -244,7 +244,7 @@ static int annotated_branch_stat_headers(struct seq_file *m) return 0; } -static inline long get_incorrect_percent(struct ftrace_branch_data *p) +static inline long get_incorrect_percent(const struct ftrace_branch_data *p) { long percent; @@ -332,10 +332,10 @@ annotated_branch_stat_next(void *v, int idx) return p; } -static int annotated_branch_stat_cmp(void *p1, void *p2) +static int annotated_branch_stat_cmp(const void *p1, const void *p2) { - struct ftrace_branch_data *a = p1; - struct ftrace_branch_data *b = p2; + const struct ftrace_branch_data *a = p1; + const struct ftrace_branch_data *b = p2; long percent_a, percent_b; diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 9ab0a1a7ad5e..874f1274cf99 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -72,9 +72,7 @@ static void destroy_session(struct stat_session *session) kfree(session); } -typedef int (*cmp_stat_t)(void *, void *); - -static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp) +static int insert_stat(struct rb_root *root, void *stat, cmp_func_t cmp) { struct rb_node **new = &(root->rb_node), *parent = NULL; struct stat_node *data; @@ -112,7 +110,7 @@ static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp) * This one will force an insertion as right-most node * in the rbtree. 
*/ -static int dummy_cmp(void *p1, void *p2) +static int dummy_cmp(const void *p1, const void *p2) { return -1; } diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h index 8786d17caf49..31d7dc5bf1db 100644 --- a/kernel/trace/trace_stat.h +++ b/kernel/trace/trace_stat.h @@ -16,7 +16,7 @@ struct tracer_stat { void *(*stat_start)(struct tracer_stat *trace); void *(*stat_next)(void *prev, int idx); /* Compare two entries for stats sorting */ - int (*stat_cmp)(void *p1, void *p2); + cmp_func_t stat_cmp; /* Print a stat entry */ int (*stat_show)(struct seq_file *s, void *p); /* Release an entry */ -- cgit v1.2.3 From 0c3c86bdc691c794a6154f8515b7fa82c82dfc4d Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat (VMware)" Date: Thu, 10 Oct 2019 11:51:17 -0700 Subject: tracing/hwlat: Fix a few trivial nits Update the source file name in the comments, and fix a grammatical error. Link: http://lkml.kernel.org/r/157073346821.17189.8946944856026592247.stgit@srivatsa-ubuntu Signed-off-by: Srivatsa S. Bhat (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 63526670605a..6638d63f0921 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * trace_hwlatdetect.c - A simple Hardware Latency detector. + * trace_hwlat.c - A simple Hardware Latency detector. * * Use this tracer to detect large system latencies induced by the behavior of * certain underlying system hardware or firmware, independent of Linux itself. @@ -279,7 +279,7 @@ static void move_to_next_cpu(void) return; /* * If for some reason the user modifies the CPU affinity - * of this thread, than stop migrating for the duration + * of this thread, then stop migrating for the duration * of the current test. */ if (!cpumask_equal(current_mask, current->cpus_ptr)) -- cgit v1.2.3 From 6ee40511cb838f9ced002dff7131bca87e3ccbdd Mon Sep 17 00:00:00 2001 From: Yuming Han Date: Thu, 24 Oct 2019 11:34:30 +0800 Subject: tracing: use kvcalloc for tgid_map array allocation Allocating memory for the tgid_map array can fail, because it requires an order-6 page. Details: c3 sh: page allocation failure: order:6, mode:0x140c0c0(GFP_KERNEL), nodemask=(null) c3 sh cpuset=/ mems_allowed=0 c3 CPU: 3 PID: 5632 Comm: sh Tainted: G W O 4.14.133+ #10 c3 Hardware name: Generic DT based system c3 Backtrace: c3 [] (dump_backtrace) from [](show_stack+0x18/0x1c) c3 [] (show_stack) from [](dump_stack+0x84/0xa4) c3 [] (dump_stack) from [](warn_alloc+0xc4/0x19c) c3 [] (warn_alloc) from [](__alloc_pages_nodemask+0xd18/0xf28) c3 [] (__alloc_pages_nodemask) from [](kmalloc_order+0x20/0x38) c3 [] (kmalloc_order) from [](kmalloc_order_trace+0x24/0x108) c3 [] (kmalloc_order_trace) from [](set_tracer_flag+0xb0/0x158) c3 [] (set_tracer_flag) from [](trace_options_core_write+0x7c/0xcc) c3 [] (trace_options_core_write) from [](__vfs_write+0x40/0x14c) c3 [] (__vfs_write) from [](vfs_write+0xc4/0x198) c3 [] (vfs_write) from [](SyS_write+0x6c/0xd0) c3 [] (SyS_write) from [](ret_fast_syscall+0x0/0x54) Switch to kvcalloc to avoid such unexpected allocation failures.
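
The pattern behind the fix, sketched below under the assumption that the element type is int as in trace.c: kvcalloc() tries kmalloc() first and transparently falls back to vmalloc() when a large physically contiguous allocation (order-6 here) cannot be satisfied; the buffer must then be released with kvfree(), which handles both backings.

#include <linux/errno.h>
#include <linux/mm.h>		/* kvcalloc(), kvfree() */
#include <linux/threads.h>	/* PID_MAX_DEFAULT */

static int *tgid_map;

static int tgid_map_alloc(void)
{
	/* May be vmalloc-backed if contiguous pages are scarce. */
	tgid_map = kvcalloc(PID_MAX_DEFAULT + 1, sizeof(*tgid_map),
			    GFP_KERNEL);
	return tgid_map ? 0 : -ENOMEM;
}

static void tgid_map_free(void)
{
	kvfree(tgid_map);	/* correct for both kmalloc and vmalloc backing */
	tgid_map = NULL;
}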
Link: http://lkml.kernel.org/r/1571888070-24425-1-git-send-email-chunyan.zhang@unisoc.com Signed-off-by: Yuming Han Signed-off-by: Chunyan Zhang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 79fe4d6ecbd8..42659ce6ac0c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4686,7 +4686,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_TGID) { if (!tgid_map) - tgid_map = kcalloc(PID_MAX_DEFAULT + 1, + tgid_map = kvcalloc(PID_MAX_DEFAULT + 1, sizeof(*tgid_map), GFP_KERNEL); if (!tgid_map) { -- cgit v1.2.3 From c7411a1a126f649be71526a36d4afac9e5aefa13 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 29 Oct 2019 17:31:44 +0900 Subject: tracing/kprobe: Check whether the non-suffixed symbol is notrace Check whether the non-suffixed symbol is notrace, since suffixed symbols are generated by the compiler for optimization. For such suffixed symbols, the notrace check might not work, because some of them contain only part of the original function's code. (e.g. cold (unlikely) code is separated from the original function as FUNCTION.cold.XX) For example, without this fix, # echo p device_add.cold.67 > /sys/kernel/debug/tracing/kprobe_events sh: write error: Invalid argument # cat /sys/kernel/debug/tracing/error_log [ 135.491035] trace_kprobe: error: Failed to register probe event Command: p device_add.cold.67 ^ # dmesg | tail -n 1 [ 135.488599] trace_kprobe: Could not probe notrace function device_add.cold.67 With this fix, # echo p device_add.cold.66 > /sys/kernel/debug/tracing/kprobe_events # cat /sys/kernel/debug/kprobes/list ffffffff81599de9 k device_add.cold.66+0x0 [DISABLED] The kprobe blacklist already does a similar thing; see within_kprobe_blacklist().
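
The normalization at the heart of the fix is plain string truncation: compiler-generated clones (FUNCTION.cold.XX, .constprop.N, .part.N, ...) append dot-suffixes to the parent symbol name, so cutting at the first '.' recovers the base function to test. A standalone sketch of that step:

#include <linux/string.h>

/* Turn "device_add.cold.67" into "device_add" in place. */
static void strip_symbol_suffix(char *symname)
{
	char *p = strchr(symname, '.');

	if (p)
		*p = '\0';
}

The stripped name is then resolved back to an address and re-checked, as the hunk below does with kprobe_lookup_name().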
Link: http://lkml.kernel.org/r/157233790394.6706.18243942030937189679.stgit@devnote2 Fixes: 45408c4f9250 ("tracing: kprobes: Prohibit probing on notrace function") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1552a95c743b..7f890262c8a3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -435,11 +435,10 @@ static int disable_trace_kprobe(struct trace_event_call *call, #if defined(CONFIG_KPROBES_ON_FTRACE) && \ !defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE) -static bool within_notrace_func(struct trace_kprobe *tk) +static bool __within_notrace_func(unsigned long addr) { - unsigned long offset, size, addr; + unsigned long offset, size; - addr = trace_kprobe_address(tk); if (!addr || !kallsyms_lookup_size_offset(addr, &size, &offset)) return false; @@ -452,6 +451,28 @@ static bool within_notrace_func(struct trace_kprobe *tk) */ return !ftrace_location_range(addr, addr + size - 1); } + +static bool within_notrace_func(struct trace_kprobe *tk) +{ + unsigned long addr = trace_kprobe_address(tk); + char symname[KSYM_NAME_LEN], *p; + + if (!__within_notrace_func(addr)) + return false; + + /* Check if the address is on a suffixed symbol */ + if (!lookup_symbol_name(addr, symname)) { + p = strchr(symname, '.'); + if (!p) + return true; + *p = '\0'; + addr = (unsigned long)kprobe_lookup_name(symname, 0); + if (addr) + return __within_notrace_func(addr); + } + + return true; +} #else #define within_notrace_func(tk) (false) #endif -- cgit v1.2.3 From ef56e047b2bd4dabb801fd073dfcab5f40de5f78 Mon Sep 17 00:00:00 2001 From: Piotr Maziarz Date: Thu, 7 Nov 2019 13:45:38 +0100 Subject: tracing: Use seq_buf_hex_dump() to dump buffers Without this, buffers can be printed with the __print_array macro, which has no formatting options and can be hard to read. The alternative is to mimic the formatting capability with multiple calls of the trace event, one call per row, which hurts performance and gives a different timestamp in each row.
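
A sketch of what this enables at the event level, using a made-up event inside the usual trace-header boilerplate; __print_hex_dump() takes the same prefix/rowsize/groupsize arguments as print_hex_dump():

TRACE_EVENT(demo_blob,
	TP_PROTO(const void *buf, unsigned int len),
	TP_ARGS(buf, len),
	TP_STRUCT__entry(
		__field(unsigned int, len)
		__dynamic_array(u8, data, len)
	),
	TP_fast_assign(
		__entry->len = len;
		memcpy(__get_dynamic_array(data), buf, len);
	),
	TP_printk("len=%u%s", __entry->len,
		  __print_hex_dump("", DUMP_PREFIX_OFFSET, 16, 1,
				   __get_dynamic_array(data),
				   __entry->len, true))
);

(The helper emits a leading newline, so the hex dump starts on its own line in the trace output.)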
Link: http://lkml.kernel.org/r/1573130738-29390-2-git-send-email-piotrx.maziarz@linux.intel.com Signed-off-by: Piotr Maziarz Signed-off-by: Cezary Rojewski Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 5 +++++ include/linux/trace_seq.h | 4 ++++ include/trace/trace_events.h | 6 ++++++ kernel/trace/trace_output.c | 15 +++++++++++++++ kernel/trace/trace_seq.c | 30 ++++++++++++++++++++++++++++++ 5 files changed, 60 insertions(+) (limited to 'kernel/trace') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 30a8cdcfd4a4..60a41b7069dd 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -45,6 +45,11 @@ const char *trace_print_array_seq(struct trace_seq *p, const void *buf, int count, size_t el_size); +const char * +trace_print_hex_dump_seq(struct trace_seq *p, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii); + struct trace_iterator; struct trace_event; diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 6609b39a7232..6c30508fca19 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -92,6 +92,10 @@ extern int trace_seq_path(struct trace_seq *s, const struct path *path); extern void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, int nmaskbits); +extern int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii); + #else /* CONFIG_TRACING */ static inline void trace_seq_printf(struct trace_seq *s, const char *fmt, ...) { diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 4ecdfe2e3580..7089760d4c7a 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -340,6 +340,12 @@ TRACE_MAKE_SYSTEM_STR(); trace_print_array_seq(p, array, count, el_size); \ }) +#undef __print_hex_dump +#define __print_hex_dump(prefix_str, prefix_type, \ + rowsize, groupsize, buf, len, ascii) \ + trace_print_hex_dump_seq(p, prefix_str, prefix_type, \ + rowsize, groupsize, buf, len, ascii) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ static notrace enum print_line_t \ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index d54ce252b05a..d9b4b7c22db4 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -274,6 +274,21 @@ trace_print_array_seq(struct trace_seq *p, const void *buf, int count, } EXPORT_SYMBOL(trace_print_array_seq); +const char * +trace_print_hex_dump_seq(struct trace_seq *p, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_putc(p, '\n'); + trace_seq_hex_dump(p, prefix_str, prefix_type, + rowsize, groupsize, buf, len, ascii); + trace_seq_putc(p, 0); + return ret; +} +EXPORT_SYMBOL(trace_print_hex_dump_seq); + int trace_raw_output_prep(struct trace_iterator *iter, struct trace_event *trace_event) { diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 6b1c562ffdaf..344e4c1aa09c 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -376,3 +376,33 @@ int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) return seq_buf_to_user(&s->seq, ubuf, cnt); } EXPORT_SYMBOL_GPL(trace_seq_to_user); + +int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str, + int prefix_type, int rowsize, int 
groupsize, + const void *buf, size_t len, bool ascii) +{ + unsigned int save_len = s->seq.len; + + if (s->full) + return 0; + + __trace_seq_init(s); + + if (TRACE_SEQ_BUF_LEFT(s) < 1) { + s->full = 1; + return 0; + } + + seq_buf_hex_dump(&(s->seq), prefix_str, + prefix_type, rowsize, groupsize, + buf, len, ascii); + + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + return 0; + } + + return 1; +} +EXPORT_SYMBOL(trace_seq_hex_dump); -- cgit v1.2.3 From 9b4712044d059e7842aaeeafd7c7a7ee88c589db Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 12 Nov 2019 18:42:19 +0100 Subject: tracing: Remove stray tab in TRACE_EVAL_MAP_FILE's help text There was a stray tab in the help text of the aforementioned config option which showed like this: The "print fmt" of the trace events will show the enum/sizeof names instead of their values. This can cause problems for user space tools ... in menuconfig. Remove it and end a sentence with a fullstop. No functional changes. Link: http://lkml.kernel.org/r/20191112174219.10933-1-bp@alien8.de Signed-off-by: Borislav Petkov Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d25314bc7a1c..b872716bb2a0 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -771,7 +771,7 @@ config TRACE_EVAL_MAP_FILE depends on TRACING help The "print fmt" of the trace events will show the enum/sizeof names - instead of their values. This can cause problems for user space tools + instead of their values. This can cause problems for user space tools that use this string to parse the raw data as user space does not know how to convert the string to its value. @@ -792,7 +792,7 @@ config TRACE_EVAL_MAP_FILE they are needed for the "eval_map" file. Enabling this option will increase the memory footprint of the running kernel. - If unsure, say N + If unsure, say N. config GCOV_PROFILE_FTRACE bool "Enable GCOV profiling on ftrace subsystem" -- cgit v1.2.3 From 36b3615dc3b625c8b587f34e413a600f7ac16403 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 14 Nov 2019 22:43:58 -0500 Subject: tracing: Add missing "inline" in stub function of latency_fsnotify() The latency_fsnotify() stub when the function is not defined, was missing the "inline". Link: https://lore.kernel.org/r/20191115140213.74c5efe7@canb.auug.org.au Reported-by: Stephen Rothwell Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 90cba68c8b50..2df8aed6a8f0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -801,7 +801,7 @@ void latency_fsnotify(struct trace_array *tr); #else -static void latency_fsnotify(struct trace_array *tr) { } +static inline void latency_fsnotify(struct trace_array *tr) { } #endif -- cgit v1.2.3 From 0567d6809182df53da03636fad36c507c5cf07a5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 14 Nov 2019 14:39:35 -0500 Subject: ftrace: Add modify_ftrace_direct() Add a new function modify_ftrace_direct() that will allow a user to update an existing direct caller to a new trampoline, without missing hits due to unregistering one and then adding another. 
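
A sketch of the intended calling sequence on an architecture with DYNAMIC_FTRACE_WITH_DIRECT_CALLS; my_tramp1/my_tramp2 are hypothetical trampolines (in practice written in assembly) and ip is the address of a function's ftrace call site:

#include <linux/ftrace.h>

extern void my_tramp1(void);	/* assembly trampolines, not shown */
extern void my_tramp2(void);

static int attach_then_switch(unsigned long ip)
{
	int ret;

	/* Attach a direct call at the site. */
	ret = register_ftrace_direct(ip, (unsigned long)my_tramp1);
	if (ret)
		return ret;

	/* Swap trampolines in place; no window where hits are missed. */
	ret = modify_ftrace_direct(ip, (unsigned long)my_tramp1,
				   (unsigned long)my_tramp2);
	if (ret)
		unregister_ftrace_direct(ip, (unsigned long)my_tramp1);

	return ret;
}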
Link: https://lore.kernel.org/r/20191109022907.6zzo6orhxpt5n2sv@ast-mbp.dhcp.thefacebook.com Suggested-by: Alexei Starovoitov Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 6 ++++ kernel/trace/ftrace.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) (limited to 'kernel/trace') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 55647e185141..73eb2e93593f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -250,6 +250,7 @@ static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { extern int ftrace_direct_func_count; int register_ftrace_direct(unsigned long ip, unsigned long addr); int unregister_ftrace_direct(unsigned long ip, unsigned long addr); +int modify_ftrace_direct(unsigned long ip, unsigned long old_addr, unsigned long new_addr); struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr); #else # define ftrace_direct_func_count 0 @@ -261,6 +262,11 @@ static inline int unregister_ftrace_direct(unsigned long ip, unsigned long addr) { return -ENODEV; } +static inline int modify_ftrace_direct(unsigned long ip, + unsigned long old_addr, unsigned long new_addr) +{ + return -ENODEV; +} static inline struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr) { return NULL; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 82ef8d60a42b..834f3556ea1e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5160,6 +5160,84 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) return ret; } EXPORT_SYMBOL_GPL(unregister_ftrace_direct); + +static struct ftrace_ops stub_ops = { + .func = ftrace_stub, +}; + +/** + * modify_ftrace_direct - Modify an existing direct call to call something else + * @ip: The instruction pointer to modify + * @old_addr: The address that the current @ip calls directly + * @new_addr: The address that the @ip should call + * + * This modifies a ftrace direct caller at an instruction pointer without + * having to disable it first. The direct call will switch over to the + * @new_addr without missing anything. + * + * Returns: zero on success. Non zero on error, which includes: + * -ENODEV : the @ip given has no direct caller attached + * -EINVAL : the @old_addr does not match the current direct caller + */ +int modify_ftrace_direct(unsigned long ip, + unsigned long old_addr, unsigned long new_addr) +{ + struct ftrace_func_entry *entry; + struct dyn_ftrace *rec; + int ret = -ENODEV; + + mutex_lock(&direct_mutex); + entry = __ftrace_lookup_ip(direct_functions, ip); + if (!entry) { + /* OK if it is off by a little */ + rec = lookup_rec(ip, ip); + if (!rec || rec->ip == ip) + goto out_unlock; + + entry = __ftrace_lookup_ip(direct_functions, rec->ip); + if (!entry) + goto out_unlock; + + ip = rec->ip; + WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); + } + + ret = -EINVAL; + if (entry->direct != old_addr) + goto out_unlock; + + /* + * By setting a stub function at the same address, we force + * the code to call the iterator and the direct_ops helper. + * This means that @ip does not call the direct call, and + * we can simply modify it. + */ + ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0); + if (ret) + goto out_unlock; + + ret = register_ftrace_function(&stub_ops); + if (ret) { + ftrace_set_filter_ip(&stub_ops, ip, 1, 0); + goto out_unlock; + } + + entry->direct = new_addr; + + /* + * By removing the stub, we put back the direct call, calling + * the @new_addr. 
+ */ + unregister_ftrace_function(&stub_ops); + ftrace_set_filter_ip(&stub_ops, ip, 1, 0); + + ret = 0; + + out_unlock: + mutex_unlock(&direct_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(modify_ftrace_direct); #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** -- cgit v1.2.3 From 58a74a2925a5b4125dd4f4e728490b9642534c81 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 15 Nov 2019 11:17:30 +0200 Subject: tracing: Increase SYNTH_FIELDS_MAX for synthetic_events Increase the maximum allowed count of synthetic event fields from 16 to 32 in order to allow for larger-than-usual events. Link: http://lkml.kernel.org/r/20191115091730.9192-1-dedekind1@gmail.com Reviewed-by: Tom Zanussi Signed-off-by: Artem Bityutskiy Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 7482a1466ebf..f49d1a36d3ae 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -23,7 +23,7 @@ #include "trace_dynevent.h" #define SYNTH_SYSTEM "synthetic" -#define SYNTH_FIELDS_MAX 16 +#define SYNTH_FIELDS_MAX 32 #define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ -- cgit v1.2.3 From 1c7f9b673dc0a15753274c4e7f5ebfd4468fc69f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Nov 2019 14:13:20 -0500 Subject: ftrace: Fix accounting bug with direct->count in register_ftrace_direct() The direct->count wasn't being updated properly: it was only updated when the first entry was added, but it should be updated every time. Fixes: 013bf0da04748 ("ftrace: Add ftrace_find_direct_func()") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 834f3556ea1e..32e4e5ffdd97 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5093,8 +5093,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) ftrace_direct_func_count--; } } else { - if (!direct->count) - direct->count++; + direct->count++; } out_unlock: mutex_unlock(&direct_mutex); -- cgit v1.2.3 From 406acdd32d3e7d5a6dcb7f67798e89068fbe0d77 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Nov 2019 14:19:04 -0500 Subject: ftrace: Add another check for match in register_ftrace_direct() As an instruction pointer passed into register_ftrace_direct() may be on the ftrace call site but not at the start of the call site itself, register_ftrace_direct() still needs to test whether a direct call exists on the normalized site, as only one direct call is allowed at any one time. Fixes: 763e34e74bb7d ("ftrace: Add register_ftrace_direct()") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 32e4e5ffdd97..9fe33ebaf914 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5030,7 +5030,12 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) goto out_unlock; /* Make sure the ip points to the exact record */ - ip = rec->ip; + if (ip != rec->ip) { + ip = rec->ip; + /* Need to check this ip for a direct.
*/ + if (find_rec_direct(ip)) + goto out_unlock; + } ret = -ENOMEM; if (ftrace_hash_empty(direct_functions) || -- cgit v1.2.3 From 128161f47bc3797b0d068da13e311770685d6e4f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Nov 2019 14:14:45 -0500 Subject: ftrace: Add helper find_direct_entry() to consolidate code Both unregister_ftrace_direct() and modify_ftrace_direct() needs to normalize the ip passed in to match the rec->ip, as it is acceptable to have the ip on the ftrace call site but not the start. There are also common validity checks with the record found by the ip, these should be done for both unregister_ftrace_direct() and modify_ftrace_direct(). Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 59 +++++++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 30 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9fe33ebaf914..ef79c8393f53 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5112,30 +5112,40 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) } EXPORT_SYMBOL_GPL(register_ftrace_direct); -int unregister_ftrace_direct(unsigned long ip, unsigned long addr) +static struct ftrace_func_entry *find_direct_entry(unsigned long *ip) { struct ftrace_func_entry *entry; - struct ftrace_direct_func *direct; struct dyn_ftrace *rec; - int ret = -ENODEV; - mutex_lock(&direct_mutex); + rec = lookup_rec(*ip, *ip); + if (!rec) + return NULL; - entry = __ftrace_lookup_ip(direct_functions, ip); + entry = __ftrace_lookup_ip(direct_functions, rec->ip); if (!entry) { - /* OK if it is off by a little */ - rec = lookup_rec(ip, ip); - if (!rec || rec->ip == ip) - goto out_unlock; + WARN_ON(rec->flags & FTRACE_FL_DIRECT); + return NULL; + } - entry = __ftrace_lookup_ip(direct_functions, rec->ip); - if (!entry) { - WARN_ON(rec->flags & FTRACE_FL_DIRECT); - goto out_unlock; - } + WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); - WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); - } + /* Passed in ip just needs to be on the call site */ + *ip = rec->ip; + + return entry; +} + +int unregister_ftrace_direct(unsigned long ip, unsigned long addr) +{ + struct ftrace_direct_func *direct; + struct ftrace_func_entry *entry; + int ret = -ENODEV; + + mutex_lock(&direct_mutex); + + entry = find_direct_entry(&ip); + if (!entry) + goto out_unlock; if (direct_functions->count == 1) unregister_ftrace_function(&direct_ops); @@ -5187,24 +5197,13 @@ int modify_ftrace_direct(unsigned long ip, unsigned long old_addr, unsigned long new_addr) { struct ftrace_func_entry *entry; - struct dyn_ftrace *rec; int ret = -ENODEV; mutex_lock(&direct_mutex); - entry = __ftrace_lookup_ip(direct_functions, ip); - if (!entry) { - /* OK if it is off by a little */ - rec = lookup_rec(ip, ip); - if (!rec || rec->ip == ip) - goto out_unlock; - - entry = __ftrace_lookup_ip(direct_functions, rec->ip); - if (!entry) - goto out_unlock; - ip = rec->ip; - WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); - } + entry = find_direct_entry(&ip); + if (!entry) + goto out_unlock; ret = -EINVAL; if (entry->direct != old_addr) -- cgit v1.2.3 From ea806eb3eab35528b578a061b2c4b28f0f92c465 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 17 Nov 2019 17:04:15 -0500 Subject: ftrace: Add a helper function to modify_ftrace_direct() to allow arch optimization If a direct ftrace callback is at a location that does not have any other ftrace helpers attached to it, it is possible to simply just change the text to 
call the new caller (if the architecture supports it). But this requires special architecture code. Currently, modify_ftrace_direct() uses a trick to add a stub ftrace callback to the location forcing it to call the ftrace iterator. Then it can change the direct helper to call the new function in C, and then remove the stub. Removing the stub will have the location now call the new location that the direct helper is using. The new helper function does the registering the stub trick, but is a weak function, allowing an architecture to override it to do something a bit more direct. Link: https://lore.kernel.org/r/20191115215125.mbqv7taqnx376yed@ast-mbp.dhcp.thefacebook.com Suggested-by: Alexei Starovoitov Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 21 ++++++++- kernel/trace/ftrace.c | 120 ++++++++++++++++++++++++++++++++++++------------- 2 files changed, 107 insertions(+), 34 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 73eb2e93593f..dfaa37e1943d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -246,12 +246,24 @@ static inline void ftrace_free_init_mem(void) { } static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { } #endif /* CONFIG_FUNCTION_TRACER */ +struct ftrace_func_entry { + struct hlist_node hlist; + unsigned long ip; + unsigned long direct; /* for direct lookup only */ +}; + +struct dyn_ftrace; + #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS extern int ftrace_direct_func_count; int register_ftrace_direct(unsigned long ip, unsigned long addr); int unregister_ftrace_direct(unsigned long ip, unsigned long addr); int modify_ftrace_direct(unsigned long ip, unsigned long old_addr, unsigned long new_addr); struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr); +int ftrace_modify_direct_caller(struct ftrace_func_entry *entry, + struct dyn_ftrace *rec, + unsigned long old_addr, + unsigned long new_addr); #else # define ftrace_direct_func_count 0 static inline int register_ftrace_direct(unsigned long ip, unsigned long addr) @@ -271,6 +283,13 @@ static inline struct ftrace_direct_func *ftrace_find_direct_func(unsigned long a { return NULL; } +static inline int ftrace_modify_direct_caller(struct ftrace_func_entry *entry, + struct dyn_ftrace *rec, + unsigned long old_addr, + unsigned long new_addr) +{ + return -ENODEV; +} #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS @@ -343,8 +362,6 @@ static inline void stack_tracer_enable(void) { } int ftrace_arch_code_modify_prepare(void); int ftrace_arch_code_modify_post_process(void); -struct dyn_ftrace; - enum ftrace_bug_type { FTRACE_BUG_UNKNOWN, FTRACE_BUG_INIT, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ef79c8393f53..caae523f4ef3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1020,12 +1020,6 @@ static bool update_all_ops; # error Dynamic ftrace depends on MCOUNT_RECORD #endif -struct ftrace_func_entry { - struct hlist_node hlist; - unsigned long ip; - unsigned long direct; /* for direct lookup only */ -}; - struct ftrace_func_probe { struct ftrace_probe_ops *probe_ops; struct ftrace_ops ops; @@ -5112,7 +5106,8 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) } EXPORT_SYMBOL_GPL(register_ftrace_direct); -static struct ftrace_func_entry *find_direct_entry(unsigned long *ip) +static struct ftrace_func_entry *find_direct_entry(unsigned long *ip, + struct dyn_ftrace **recp) { struct 
ftrace_func_entry *entry; struct dyn_ftrace *rec; @@ -5132,6 +5127,9 @@ static struct ftrace_func_entry *find_direct_entry(unsigned long *ip) /* Passed in ip just needs to be on the call site */ *ip = rec->ip; + if (recp) + *recp = rec; + return entry; } @@ -5143,7 +5141,7 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) mutex_lock(&direct_mutex); - entry = find_direct_entry(&ip); + entry = find_direct_entry(&ip, NULL); if (!entry) goto out_unlock; @@ -5179,6 +5177,75 @@ static struct ftrace_ops stub_ops = { .func = ftrace_stub, }; +/** + * ftrace_modify_direct_caller - modify ftrace nop directly + * @entry: The ftrace hash entry of the direct helper for @rec + * @rec: The record representing the function site to patch + * @old_addr: The location that the site at @rec->ip currently calls + * @new_addr: The location that the site at @rec->ip should call + * + * An architecture may overwrite this function to optimize the + * changing of the direct callback on an ftrace nop location. + * This is called with the ftrace_lock mutex held, and no other + * ftrace callbacks are on the associated record (@rec). Thus, + * it is safe to modify the ftrace record, where it should be + * currently calling @old_addr directly, to call @new_addr. + * + * Safety checks should be made to make sure that the code at + * @rec->ip is currently calling @old_addr. And this must + * also update entry->direct to @new_addr. + */ +int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry, + struct dyn_ftrace *rec, + unsigned long old_addr, + unsigned long new_addr) +{ + unsigned long ip = rec->ip; + int ret; + + /* + * The ftrace_lock was used to determine if the record + * had more than one registered user to it. If it did, + * we needed to prevent that from changing to do the quick + * switch. But if it did not (only a direct caller was attached) + * then this function is called. But this function can deal + * with attached callers to the rec that we care about, and + * since this function uses standard ftrace calls that take + * the ftrace_lock mutex, we need to release it. + */ + mutex_unlock(&ftrace_lock); + + /* + * By setting a stub function at the same address, we force + * the code to call the iterator and the direct_ops helper. + * This means that @ip does not call the direct call, and + * we can simply modify it. + */ + ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0); + if (ret) + goto out_lock; + + ret = register_ftrace_function(&stub_ops); + if (ret) { + ftrace_set_filter_ip(&stub_ops, ip, 1, 0); + goto out_lock; + } + + entry->direct = new_addr; + + /* + * By removing the stub, we put back the direct call, calling + * the @new_addr. + */ + unregister_ftrace_function(&stub_ops); + ftrace_set_filter_ip(&stub_ops, ip, 1, 0); + + out_lock: + mutex_lock(&ftrace_lock); + + return ret; +} + /** * modify_ftrace_direct - Modify an existing direct call to call something else * @ip: The instruction pointer to modify @@ -5197,11 +5264,13 @@ int modify_ftrace_direct(unsigned long ip, unsigned long old_addr, unsigned long new_addr) { struct ftrace_func_entry *entry; + struct dyn_ftrace *rec; int ret = -ENODEV; mutex_lock(&direct_mutex); - entry = find_direct_entry(&ip); + mutex_lock(&ftrace_lock); + entry = find_direct_entry(&ip, &rec); if (!entry) goto out_unlock; @@ -5210,33 +5279,20 @@ int modify_ftrace_direct(unsigned long ip, goto out_unlock; /* - * By setting a stub function at the same address, we force - * the code to call the iterator and the direct_ops helper. 
- * This means that @ip does not call the direct call, and - * we can simply modify it. + * If there's no other ftrace callback on the rec->ip location, + * then it can be changed directly by the architecture. + * If there is another caller, then we just need to change the + * direct caller helper to point to @new_addr. */ - ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0); - if (ret) - goto out_unlock; - - ret = register_ftrace_function(&stub_ops); - if (ret) { - ftrace_set_filter_ip(&stub_ops, ip, 1, 0); - goto out_unlock; + if (ftrace_rec_count(rec) == 1) { + ret = ftrace_modify_direct_caller(entry, rec, old_addr, new_addr); + } else { + entry->direct = new_addr; + ret = 0; } - entry->direct = new_addr; - - /* - * By removing the stub, we put back the direct call, calling - * the @new_addr. - */ - unregister_ftrace_function(&stub_ops); - ftrace_set_filter_ip(&stub_ops, ip, 1, 0); - - ret = 0; - out_unlock: + mutex_unlock(&ftrace_lock); mutex_unlock(&direct_mutex); return ret; } -- cgit v1.2.3 From 46f9469247c6f4697cbbf37e4b3961120bf07f29 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 18 Nov 2019 10:41:29 -0500 Subject: ftrace: Rename ftrace_graph_stub to ftrace_stub_graph The ftrace_graph_stub was created and points to ftrace_stub as a way to assign the function graph tracer function pointer to a stub function with a different prototype than what ftrace_stub has and not trigger the C verifier. The ftrace_graph_stub was created via the linker script vmlinux.lds.h. Unfortunately, powerpc already uses the name ftrace_graph_stub for its internal implementation of the function graph tracer, and even though powerpc would still build, the change via the linker script broke the function graph tracer on powerpc. By using the name ftrace_stub_graph, which does not exist anywhere else in the kernel, this should not be a problem. Link: https://lore.kernel.org/r/1573849732.5937.136.camel@lca.pw Fixes: b83b43ffc6e4 ("fgraph: Fix function type mismatches of ftrace_graph_return using ftrace_stub") Reported-by: Qian Cai Signed-off-by: Steven Rostedt (VMware) --- include/asm-generic/vmlinux.lds.h | 8 ++++---- kernel/trace/fgraph.c | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel/trace') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 0f358be551cd..996db32c491b 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -112,7 +112,7 @@ #ifdef CONFIG_FTRACE_MCOUNT_RECORD #ifdef CC_USING_PATCHABLE_FUNCTION_ENTRY /* - * Need to also make ftrace_graph_stub point to ftrace_stub + * Need to also make ftrace_stub_graph point to ftrace_stub * so that the same stub location may have different protocols * and not mess up with C verifiers. */ @@ -120,17 +120,17 @@ __start_mcount_loc = .; \ KEEP(*(__patchable_function_entries)) \ __stop_mcount_loc = .; \ - ftrace_graph_stub = ftrace_stub; + ftrace_stub_graph = ftrace_stub; #else #define MCOUNT_REC() .
= ALIGN(8); \ __start_mcount_loc = .; \ KEEP(*(__mcount_loc)) \ __stop_mcount_loc = .; \ - ftrace_graph_stub = ftrace_stub; + ftrace_stub_graph = ftrace_stub; #endif #else # ifdef CONFIG_FUNCTION_TRACER -# define MCOUNT_REC() ftrace_graph_stub = ftrace_stub; +# define MCOUNT_REC() ftrace_stub_graph = ftrace_stub; # else # define MCOUNT_REC() # endif diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index fa3ce10d0405..67e0c462b059 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -336,10 +336,10 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) * Simply points to ftrace_stub, but with the proper protocol. * Defined by the linker script in linux/vmlinux.lds.h */ -extern void ftrace_graph_stub(struct ftrace_graph_ret *); +extern void ftrace_stub_graph(struct ftrace_graph_ret *); /* The callbacks that hook a function */ -trace_func_graph_ret_t ftrace_graph_return = ftrace_graph_stub; +trace_func_graph_ret_t ftrace_graph_return = ftrace_stub_graph; trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; @@ -619,7 +619,7 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) goto out; ftrace_graph_active--; - ftrace_graph_return = ftrace_graph_stub; + ftrace_graph_return = ftrace_stub_graph; ftrace_graph_entry = ftrace_graph_entry_stub; __ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); -- cgit v1.2.3 From 0e4a459f56c32d3e52ae69a4b447db2f48a65f44 Mon Sep 17 00:00:00 2001 From: Kusanagi Kouichi Date: Wed, 20 Nov 2019 19:43:50 +0900 Subject: tracing: Remove unnecessary DEBUG_FS dependency Tracing replaced debugfs with tracefs. Signed-off-by: Kusanagi Kouichi Reviewed-by: Steven Rostedt (VMware) Link: https://lore.kernel.org/r/20191120104350753.EWCT.12796.ppp.dion.ne.jp@dmta0009.auone-net.jp Signed-off-by: Greg Kroah-Hartman --- kernel/trace/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e08527f50d2a..382628b9b759 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -106,7 +106,6 @@ config PREEMPTIRQ_TRACEPOINTS config TRACING bool - select DEBUG_FS select RING_BUFFER select STACKTRACE if STACKTRACE_SUPPORT select TRACEPOINTS -- cgit v1.2.3 From a82a4804b4ee3636c8988fea14d44f70f4de45f1 Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Sat, 16 Nov 2019 10:05:55 -0500 Subject: ring-buffer: Fix typos in function ring_buffer_producer Fix spelling and other typos Link: http://lkml.kernel.org/r/1573916755-32478-1-git-send-email-xianting_tian@126.com Signed-off-by: Xianting Tian Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer_benchmark.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 09b0b49f346e..32149e46551c 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -269,10 +269,10 @@ static void ring_buffer_producer(void) #ifndef CONFIG_PREEMPTION /* - * If we are a non preempt kernel, the 10 second run will + * If we are a non preempt kernel, the 10 seconds run will * stop everything while it runs. Instead, we will call * cond_resched and also add any time that was lost by a - * rescedule. + * reschedule. * * Do a cond resched at the same frequency we would wake up * the reader. 
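The compensation pattern described in the comment above can be sketched in a few lines of C. This is a hedged illustration only, not the benchmark's actual loop; produce_one_event() is a hypothetical stand-in for the ring buffer write:

/*
 * Sketch: in a non-preempt kernel the producer calls cond_resched()
 * itself, measures how long the voluntary reschedule took, and adds
 * that time back so it does not count against the timed run.
 */
static void producer_loop_sketch(ktime_t end_time)
{
	ktime_t lost;

	while (ktime_before(ktime_get(), end_time)) {
		produce_one_event();	/* hypothetical event write */

		lost = ktime_get();
		cond_resched();
		lost = ktime_sub(ktime_get(), lost);

		/* Give back the time lost to the reschedule. */
		end_time = ktime_add(end_time, lost);
	}
}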
-- cgit v1.2.3 From fc809bc5ceaa665497384064ab2d76713c774bad Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 20 Nov 2019 21:38:07 +0800 Subject: tracing: Fix Kconfig indentation Adjust indentation from spaces to tab (+optional two spaces) as in coding style with command like: $ sed -e 's/^ /\t/' -i */Kconfig Link: http://lkml.kernel.org/r/20191120133807.12741-1-krzk@kernel.org Signed-off-by: Krzysztof Kozlowski Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b872716bb2a0..f67620499faa 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -79,7 +79,7 @@ config FTRACE_NMI_ENTER config EVENT_TRACING select CONTEXT_SWITCH_TRACER - select GLOB + select GLOB bool config CONTEXT_SWITCH_TRACER @@ -311,7 +311,7 @@ config TRACER_SNAPSHOT cat snapshot config TRACER_SNAPSHOT_PER_CPU_SWAP - bool "Allow snapshot to swap per CPU" + bool "Allow snapshot to swap per CPU" depends on TRACER_SNAPSHOT select RING_BUFFER_ALLOW_SWAP help @@ -683,7 +683,7 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. config TRACEPOINT_BENCHMARK - bool "Add tracepoint that benchmarks tracepoints" + bool "Add tracepoint that benchmarks tracepoints" help This option creates the tracepoint "benchmark:benchmark_event". When the tracepoint is enabled, it kicks off a kernel thread that @@ -732,7 +732,7 @@ config RING_BUFFER_STARTUP_TEST bool "Ring buffer startup self test" depends on RING_BUFFER help - Run a simple self test on the ring buffer on boot up. Late in the + Run a simple self test on the ring buffer on boot up. Late in the kernel boot sequence, the test will start that kicks off a thread per cpu. Each thread will write various size events into the ring buffer. Another thread is created to send IPIs -- cgit v1.2.3 From 28879787147358e8ffcae397f11748de3dd26577 Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 20 Nov 2019 11:08:38 -0800 Subject: tracing: Adding new functions for kernel access to Ftrace instances Adding 2 new functions - 1) struct trace_array *trace_array_get_by_name(const char *name); Return pointer to a trace array with given name. If it does not exist, create and return pointer to the new trace array. 2) int trace_array_set_clr_event(struct trace_array *tr, const char *system ,const char *event, bool enable); Enable/Disable events to this trace array. Additionally, - To handle reference counters, export trace_array_put() - Due to introduction of the above 2 new functions, we no longer need to export - ftrace_set_clr_event & trace_array_create APIs. 
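As a rough illustration of how these APIs fit together, a kernel module might use them as in the sketch below (a hedged example based only on the descriptions above; the instance name "my_instance" and the sched:sched_switch event are arbitrary choices, and error handling is abbreviated):

#include <linux/module.h>
#include <linux/trace.h>
#include <linux/trace_events.h>

static struct trace_array *tr;

static int __init my_mod_init(void)
{
	/* Look up the instance, or create it if it does not exist yet. */
	tr = trace_array_get_by_name("my_instance");
	if (!tr)
		return -ENOMEM;

	/* Enable a single event in this instance only. */
	return trace_array_set_clr_event(tr, "sched", "sched_switch", true);
}

static void __exit my_mod_exit(void)
{
	trace_array_set_clr_event(tr, "sched", "sched_switch", false);
	/* Drop the reference taken above so the instance can be removed. */
	trace_array_put(tr);
}

module_init(my_mod_init);
module_exit(my_mod_exit);
MODULE_LICENSE("GPL");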
Link: http://lkml.kernel.org/r/1574276919-11119-2-git-send-email-divya.indi@oracle.com Signed-off-by: Divya Indi Reviewed-by: Aruna Ramakrishna Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace.h | 3 +- include/linux/trace_events.h | 3 +- kernel/trace/trace.c | 96 +++++++++++++++++++++++++++++++++++--------- kernel/trace/trace.h | 1 - kernel/trace/trace_events.c | 27 ++++++++++++- 5 files changed, 106 insertions(+), 24 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/trace.h b/include/linux/trace.h index 24fcf07812ae..7fd86d3c691f 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -29,7 +29,8 @@ struct trace_array; void trace_printk_init_buffers(void); int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...); -struct trace_array *trace_array_create(const char *name); +void trace_array_put(struct trace_array *tr); +struct trace_array *trace_array_get_by_name(const char *name); int trace_array_destroy(struct trace_array *tr); #endif /* CONFIG_TRACING */ diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 60a41b7069dd..4c6e15605766 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -555,7 +555,8 @@ extern int trace_event_get_offsets(struct trace_event_call *call); int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); int trace_set_clr_event(const char *system, const char *event, int set); - +int trace_array_set_clr_event(struct trace_array *tr, const char *system, + const char *event, bool enable); /* * The double __builtin_constant_p is because gcc will give us an error * if we try to allocate the static variable to fmt if it is not a diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 42659ce6ac0c..02a23a6e5e00 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -301,12 +301,24 @@ static void __trace_array_put(struct trace_array *this_tr) this_tr->ref--; } +/** + * trace_array_put - Decrement the reference counter for this trace array. + * + * NOTE: Use this when we no longer need the trace array returned by + * trace_array_get_by_name(). This ensures the trace array can be later + * destroyed. 
+ * + */ void trace_array_put(struct trace_array *this_tr) { + if (!this_tr) + return; + mutex_lock(&trace_types_lock); __trace_array_put(this_tr); mutex_unlock(&trace_types_lock); } +EXPORT_SYMBOL_GPL(trace_array_put); int tracing_check_open_get_tr(struct trace_array *tr) { @@ -8437,24 +8449,15 @@ static void update_tracer_options(struct trace_array *tr) mutex_unlock(&trace_types_lock); } -struct trace_array *trace_array_create(const char *name) +static struct trace_array *trace_array_create(const char *name) { struct trace_array *tr; int ret; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); - - ret = -EEXIST; - list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (tr->name && strcmp(tr->name, name) == 0) - goto out_unlock; - } - ret = -ENOMEM; tr = kzalloc(sizeof(*tr), GFP_KERNEL); if (!tr) - goto out_unlock; + return ERR_PTR(ret); tr->name = kstrdup(name, GFP_KERNEL); if (!tr->name) @@ -8499,8 +8502,8 @@ struct trace_array *trace_array_create(const char *name) list_add(&tr->list, &ftrace_trace_arrays); - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); + tr->ref++; + return tr; @@ -8510,24 +8513,77 @@ struct trace_array *trace_array_create(const char *name) kfree(tr->name); kfree(tr); - out_unlock: - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); - return ERR_PTR(ret); } -EXPORT_SYMBOL_GPL(trace_array_create); static int instance_mkdir(const char *name) { - return PTR_ERR_OR_ZERO(trace_array_create(name)); + struct trace_array *tr; + int ret; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + ret = -EEXIST; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) + goto out_unlock; + } + + tr = trace_array_create(name); + + ret = PTR_ERR_OR_ZERO(tr); + +out_unlock: + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + return ret; +} + +/** + * trace_array_get_by_name - Create/Lookup a trace array, given its name. + * @name: The name of the trace array to be looked up/created. + * + * Returns pointer to trace array with given name. + * NULL, if it cannot be created. + * + * NOTE: This function increments the reference counter associated with the + * trace array returned. This makes sure it cannot be freed while in use. + * Use trace_array_put() once the trace array is no longer needed. + * + */ +struct trace_array *trace_array_get_by_name(const char *name) +{ + struct trace_array *tr; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) + goto out_unlock; + } + + tr = trace_array_create(name); + + if (IS_ERR(tr)) + tr = NULL; +out_unlock: + if (tr) + tr->ref++; + + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + return tr; } +EXPORT_SYMBOL_GPL(trace_array_get_by_name); static int __remove_instance(struct trace_array *tr) { int i; - if (tr->ref || (tr->current_trace && tr->current_trace->ref)) + /* Reference counter for a newly created trace array = 1. 
*/ + if (tr->ref > 1 || (tr->current_trace && tr->current_trace->ref)) return -EBUSY; list_del(&tr->list); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2df8aed6a8f0..ca7fccafbcbb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -345,7 +345,6 @@ extern struct list_head ftrace_trace_arrays; extern struct mutex trace_types_lock; extern int trace_array_get(struct trace_array *tr); -extern void trace_array_put(struct trace_array *tr); extern int tracing_check_open_get_tr(struct trace_array *tr); extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2a3ac2365445..6b3a69e9aa6a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -827,7 +827,6 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) return ret; } -EXPORT_SYMBOL_GPL(ftrace_set_clr_event); /** * trace_set_clr_event - enable or disable an event @@ -852,6 +851,32 @@ int trace_set_clr_event(const char *system, const char *event, int set) } EXPORT_SYMBOL_GPL(trace_set_clr_event); +/** + * trace_array_set_clr_event - enable or disable an event for a trace array. + * @tr: concerned trace array. + * @system: system name to match (NULL for any system) + * @event: event name to match (NULL for all events, within system) + * @enable: true to enable, false to disable + * + * This is a way for other parts of the kernel to enable or disable + * event recording. + * + * Returns 0 on success, -EINVAL if the parameters do not match any + * registered events. + */ +int trace_array_set_clr_event(struct trace_array *tr, const char *system, + const char *event, bool enable) +{ + int set; + + if (!tr) + return -ENOENT; + + set = (enable == true) ? 1 : 0; + return __ftrace_set_clr_event(tr, NULL, system, event, set); +} +EXPORT_SYMBOL_GPL(trace_array_set_clr_event); + /* 128 should be much more than enough */ #define EVENT_BUF_SIZE 127 -- cgit v1.2.3 From 0e24220821b0e0e330a18bfef29ac6396545d62e Mon Sep 17 00:00:00 2001 From: Hassan Naveed Date: Fri, 15 Nov 2019 23:44:42 +0000 Subject: tracing: Use xarray for syscall trace events Currently, a lot of memory is wasted for architectures like MIPS when init_ftrace_syscalls() allocates the array for syscalls using kcalloc. This is because syscalls numbers start from 4000, 5000 or 6000 and array elements up to that point are unused. Fix this by using a data structure more suited to storing sparsely populated arrays. The XARRAY data structure, implemented using radix trees, is much more memory efficient for storing the syscalls in question. Link: http://lkml.kernel.org/r/20191115234314.21599-1-hnaveed@wavecomp.com Signed-off-by: Hassan Naveed Reviewed-by: Paul Burton Signed-off-by: Steven Rostedt (VMware) --- arch/Kconfig | 8 ++++++++ kernel/trace/trace_syscalls.c | 32 +++++++++++++++++++++++++------- 2 files changed, 33 insertions(+), 7 deletions(-) (limited to 'kernel/trace') diff --git a/arch/Kconfig b/arch/Kconfig index 5f8a5d84dbbe..69c87e8608d8 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -960,6 +960,14 @@ config RELR config ARCH_HAS_MEM_ENCRYPT bool +config HAVE_SPARSE_SYSCALL_NR + bool + help + An architecture should select this if its syscall numbering is sparse + to save space. For example, MIPS architecture has a syscall array with + entries at 4000, 5000 and 6000 locations. This option turns on syscall + related optimizations for a given architecture. 
+ source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index fa8fbff736d6..16fa218556fa 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -7,6 +7,7 @@ #include /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ #include #include +#include #include #include "trace_output.h" @@ -30,6 +31,7 @@ syscall_get_enter_fields(struct trace_event_call *call) extern struct syscall_metadata *__start_syscalls_metadata[]; extern struct syscall_metadata *__stop_syscalls_metadata[]; +static DEFINE_XARRAY(syscalls_metadata_sparse); static struct syscall_metadata **syscalls_metadata; #ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME @@ -101,6 +103,9 @@ find_syscall_meta(unsigned long syscall) static struct syscall_metadata *syscall_nr_to_meta(int nr) { + if (IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) + return xa_load(&syscalls_metadata_sparse, (unsigned long)nr); + if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) return NULL; @@ -536,12 +541,16 @@ void __init init_ftrace_syscalls(void) struct syscall_metadata *meta; unsigned long addr; int i; - - syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata), - GFP_KERNEL); - if (!syscalls_metadata) { - WARN_ON(1); - return; + void *ret; + + if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) { + syscalls_metadata = kcalloc(NR_syscalls, + sizeof(*syscalls_metadata), + GFP_KERNEL); + if (!syscalls_metadata) { + WARN_ON(1); + return; + } } for (i = 0; i < NR_syscalls; i++) { @@ -551,7 +560,16 @@ void __init init_ftrace_syscalls(void) continue; meta->syscall_nr = i; - syscalls_metadata[i] = meta; + + if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) { + syscalls_metadata[i] = meta; + } else { + ret = xa_store(&syscalls_metadata_sparse, i, meta, + GFP_KERNEL); + WARN(xa_is_err(ret), + "Syscall memory allocation failed\n"); + } + } } -- cgit v1.2.3 From 6c3edaf9fd6a3be7fb5bc6931897c24cd3848f84 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 29 Nov 2019 20:52:18 -0800 Subject: tracing: Introduce trace event injection We have been trying to use rasdaemon to monitor hardware errors like correctable memory errors. rasdaemon uses trace events to monitor various hardware errors. In order to test it, we have to inject some hardware errors, unfortunately not all of them provide error injections. MCE does provide a way to inject MCE errors, but errors like PCI error and devlink error don't, it is not easy to add error injection to each of them. Instead, it is relatively easier to just allow users to inject trace events in a generic way so that all trace events can be injected. This patch introduces trace event injection, where a new 'inject' is added to each tracepoint directory. Users could write into this file with key=value pairs to specify the value of each fields of the trace event, all unspecified fields are set to zero values by default. For example, for the net/net_dev_queue tracepoint, we can inject: INJECT=/sys/kernel/debug/tracing/events/net/net_dev_queue/inject echo "" > $INJECT echo "name='test'" > $INJECT echo "name='test' len=1024" > $INJECT cat /sys/kernel/debug/tracing/trace ... <...>-614 [000] .... 36.571483: net_dev_queue: dev= skbaddr=00000000fbf338c2 len=0 <...>-614 [001] .... 136.588252: net_dev_queue: dev=test skbaddr=00000000fbf338c2 len=0 <...>-614 [001] .N.. 
208.431878: net_dev_queue: dev=test skbaddr=00000000fbf338c2 len=1024 Triggers could be triggered as usual too: echo "stacktrace if len == 1025" > /sys/kernel/debug/tracing/events/net/net_dev_queue/trigger echo "len=1025" > $INJECT cat /sys/kernel/debug/tracing/trace ... bash-614 [000] .... 36.571483: net_dev_queue: dev= skbaddr=00000000fbf338c2 len=0 bash-614 [001] .... 136.588252: net_dev_queue: dev=test skbaddr=00000000fbf338c2 len=0 bash-614 [001] .N.. 208.431878: net_dev_queue: dev=test skbaddr=00000000fbf338c2 len=1024 bash-614 [001] .N.1 284.236349: => event_inject_write => vfs_write => ksys_write => do_syscall_64 => entry_SYSCALL_64_after_hwframe The only thing that can't be injected is string pointers as they require constant string pointers, this can't be done at run time. Link: http://lkml.kernel.org/r/20191130045218.18979-1-xiyou.wangcong@gmail.com Cc: Ingo Molnar Signed-off-by: Cong Wang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 9 + kernel/trace/Makefile | 1 + kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 6 + kernel/trace/trace_events_inject.c | 331 +++++++++++++++++++++++++++++++++++++ 5 files changed, 348 insertions(+) create mode 100644 kernel/trace/trace_events_inject.c (limited to 'kernel/trace') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f67620499faa..29a9c5058b62 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -672,6 +672,15 @@ config HIST_TRIGGERS See Documentation/trace/histogram.rst. If in doubt, say N. +config TRACE_EVENT_INJECT + bool "Trace event injection" + depends on TRACING + help + Allow user-space to inject a specific trace event into the ring + buffer. This is mainly used for testing purpose. + + If unsure, say N. + config MMIOTRACE_TEST tristate "Test module for mmiotrace" depends on MMIOTRACE && m diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c2b2148bb1d2..0e63db62225f 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o +obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ca7fccafbcbb..63bf60f79398 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1601,6 +1601,7 @@ extern struct list_head ftrace_events; extern const struct file_operations event_trigger_fops; extern const struct file_operations event_hist_fops; +extern const struct file_operations event_inject_fops; #ifdef CONFIG_HIST_TRIGGERS extern int register_trigger_hist_cmd(void); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6b3a69e9aa6a..c6de3cebc127 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2044,6 +2044,12 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) trace_create_file("format", 0444, file->dir, call, &ftrace_event_format_fops); +#ifdef CONFIG_TRACE_EVENT_INJECT + if (call->event.type && call->class->reg) + trace_create_file("inject", 0200, file->dir, file, + &event_inject_fops); +#endif + return 0; } diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c new file mode 100644 index 000000000000..d43710718ee5 --- /dev/null +++ b/kernel/trace/trace_events_inject.c @@ -0,0 +1,331 @@ 
+// SPDX-License-Identifier: GPL-2.0 +/* + * trace_events_inject - trace event injection + * + * Copyright (C) 2019 Cong Wang + */ + +#include +#include +#include +#include +#include + +#include "trace.h" + +static int +trace_inject_entry(struct trace_event_file *file, void *rec, int len) +{ + struct trace_event_buffer fbuffer; + struct ring_buffer *buffer; + int written = 0; + void *entry; + + rcu_read_lock_sched(); + buffer = file->tr->trace_buffer.buffer; + entry = trace_event_buffer_reserve(&fbuffer, file, len); + if (entry) { + memcpy(entry, rec, len); + written = len; + trace_event_buffer_commit(&fbuffer); + } + rcu_read_unlock_sched(); + + return written; +} + +static int +parse_field(char *str, struct trace_event_call *call, + struct ftrace_event_field **pf, u64 *pv) +{ + struct ftrace_event_field *field; + char *field_name; + int s, i = 0; + int len; + u64 val; + + if (!str[i]) + return 0; + /* First find the field to associate to */ + while (isspace(str[i])) + i++; + s = i; + while (isalnum(str[i]) || str[i] == '_') + i++; + len = i - s; + if (!len) + return -EINVAL; + + field_name = kmemdup_nul(str + s, len, GFP_KERNEL); + if (!field_name) + return -ENOMEM; + field = trace_find_event_field(call, field_name); + kfree(field_name); + if (!field) + return -ENOENT; + + *pf = field; + while (isspace(str[i])) + i++; + if (str[i] != '=') + return -EINVAL; + i++; + while (isspace(str[i])) + i++; + s = i; + if (isdigit(str[i]) || str[i] == '-') { + char *num, c; + int ret; + + /* Make sure the field is not a string */ + if (is_string_field(field)) + return -EINVAL; + + if (str[i] == '-') + i++; + + /* We allow 0xDEADBEEF */ + while (isalnum(str[i])) + i++; + num = str + s; + c = str[i]; + if (c != '\0' && !isspace(c)) + return -EINVAL; + str[i] = '\0'; + /* Make sure it is a value */ + if (field->is_signed) + ret = kstrtoll(num, 0, &val); + else + ret = kstrtoull(num, 0, &val); + str[i] = c; + if (ret) + return ret; + + *pv = val; + return i; + } else if (str[i] == '\'' || str[i] == '"') { + char q = str[i]; + + /* Make sure the field is OK for strings */ + if (!is_string_field(field)) + return -EINVAL; + + for (i++; str[i]; i++) { + if (str[i] == '\\' && str[i + 1]) { + i++; + continue; + } + if (str[i] == q) + break; + } + if (!str[i]) + return -EINVAL; + + /* Skip quotes */ + s++; + len = i - s; + if (len >= MAX_FILTER_STR_VAL) + return -EINVAL; + + *pv = (unsigned long)(str + s); + str[i] = 0; + /* go past the last quote */ + i++; + return i; + } + + return -EINVAL; +} + +static int trace_get_entry_size(struct trace_event_call *call) +{ + struct ftrace_event_field *field; + struct list_head *head; + int size = 0; + + head = trace_get_fields(call); + list_for_each_entry(field, head, link) { + if (field->size + field->offset > size) + size = field->size + field->offset; + } + + return size; +} + +static void *trace_alloc_entry(struct trace_event_call *call, int *size) +{ + int entry_size = trace_get_entry_size(call); + struct ftrace_event_field *field; + struct list_head *head; + void *entry = NULL; + + /* We need an extra '\0' at the end. 
*/ + entry = kzalloc(entry_size + 1, GFP_KERNEL); + if (!entry) + return NULL; + + head = trace_get_fields(call); + list_for_each_entry(field, head, link) { + if (!is_string_field(field)) + continue; + if (field->filter_type == FILTER_STATIC_STRING) + continue; + if (field->filter_type == FILTER_DYN_STRING) { + u32 *str_item; + int str_loc = entry_size & 0xffff; + + str_item = (u32 *)(entry + field->offset); + *str_item = str_loc; /* string length is 0. */ + } else { + char **paddr; + + paddr = (char **)(entry + field->offset); + *paddr = ""; + } + } + + *size = entry_size + 1; + return entry; +} + +#define INJECT_STRING "STATIC STRING CAN NOT BE INJECTED" + +/* Caller is responsible to free the *pentry. */ +static int parse_entry(char *str, struct trace_event_call *call, void **pentry) +{ + struct ftrace_event_field *field; + unsigned long irq_flags; + void *entry = NULL; + int entry_size; + u64 val; + int len; + + entry = trace_alloc_entry(call, &entry_size); + *pentry = entry; + if (!entry) + return -ENOMEM; + + local_save_flags(irq_flags); + tracing_generic_entry_update(entry, call->event.type, irq_flags, + preempt_count()); + + while ((len = parse_field(str, call, &field, &val)) > 0) { + if (is_function_field(field)) + return -EINVAL; + + if (is_string_field(field)) { + char *addr = (char *)(unsigned long) val; + + if (field->filter_type == FILTER_STATIC_STRING) { + strlcpy(entry + field->offset, addr, field->size); + } else if (field->filter_type == FILTER_DYN_STRING) { + int str_len = strlen(addr) + 1; + int str_loc = entry_size & 0xffff; + u32 *str_item; + + entry_size += str_len; + *pentry = krealloc(entry, entry_size, GFP_KERNEL); + if (!*pentry) { + kfree(entry); + return -ENOMEM; + } + entry = *pentry; + + strlcpy(entry + (entry_size - str_len), addr, str_len); + str_item = (u32 *)(entry + field->offset); + *str_item = (str_len << 16) | str_loc; + } else { + char **paddr; + + paddr = (char **)(entry + field->offset); + *paddr = INJECT_STRING; + } + } else { + switch (field->size) { + case 1: { + u8 tmp = (u8) val; + + memcpy(entry + field->offset, &tmp, 1); + break; + } + case 2: { + u16 tmp = (u16) val; + + memcpy(entry + field->offset, &tmp, 2); + break; + } + case 4: { + u32 tmp = (u32) val; + + memcpy(entry + field->offset, &tmp, 4); + break; + } + case 8: + memcpy(entry + field->offset, &val, 8); + break; + default: + return -EINVAL; + } + } + + str += len; + } + + if (len < 0) + return len; + + return entry_size; +} + +static ssize_t +event_inject_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_event_call *call; + struct trace_event_file *file; + int err = -ENODEV, size; + void *entry = NULL; + char *buf; + + if (cnt >= PAGE_SIZE) + return -EINVAL; + + buf = memdup_user_nul(ubuf, cnt); + if (IS_ERR(buf)) + return PTR_ERR(buf); + strim(buf); + + mutex_lock(&event_mutex); + file = event_file_data(filp); + if (file) { + call = file->event_call; + size = parse_entry(buf, call, &entry); + if (size < 0) + err = size; + else + err = trace_inject_entry(file, entry, size); + } + mutex_unlock(&event_mutex); + + kfree(entry); + kfree(buf); + + if (err < 0) + return err; + + *ppos += err; + return cnt; +} + +static ssize_t +event_inject_read(struct file *file, char __user *buf, size_t size, + loff_t *ppos) +{ + return -EPERM; +} + +const struct file_operations event_inject_fops = { + .open = tracing_open_generic, + .read = event_inject_read, + .write = event_inject_write, +}; -- cgit v1.2.3 From a356646a56857c2e5ad875beec734d7145ecd49a 
Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 2 Dec 2019 16:25:27 -0500 Subject: tracing: Do not create directories if lockdown is in effect If lockdown is disabling tracing on boot up, it prevents the tracing files from even being created. But when that happens, there are several places that will give a warning that the files were not created, as that is usually a sign of a bug. Add checks in strategic locations to see if tracing is disabled by lockdown; if it is, do not go further, and fail silently (but print that tracing is disabled by lockdown, without doing a WARN_ON()). Cc: Matthew Garrett Fixes: 17911ff38aa5 ("tracing: Add locked_down checks to the open calls of files created for tracefs") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 6 ++++++ kernel/trace/trace.c | 17 +++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 66358d66c933..4bf050fcfe3b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include /* for self test */ @@ -5068,6 +5069,11 @@ static __init int test_ringbuffer(void) int cpu; int ret = 0; + if (security_locked_down(LOCKDOWN_TRACEFS)) { + pr_warning("Lockdown is enabled, skipping ring buffer tests\n"); + return 0; + } + pr_info("Running ring buffer tests...\n"); buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 02a23a6e5e00..23459d53d576 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1888,6 +1888,12 @@ int __init register_tracer(struct tracer *type) return -1; } + if (security_locked_down(LOCKDOWN_TRACEFS)) { + pr_warning("Can not register tracer %s due to lockdown\n", + type->name); + return -EPERM; + } + mutex_lock(&trace_types_lock); tracing_selftest_running = true; @@ -8789,6 +8795,11 @@ struct dentry *tracing_init_dentry(void) { struct trace_array *tr = &global_trace; + if (security_locked_down(LOCKDOWN_TRACEFS)) { + pr_warning("Tracing disabled due to lockdown\n"); + return ERR_PTR(-EPERM); + } + /* The top level trace array uses NULL as parent */ if (tr->dir) return NULL; @@ -9231,6 +9242,12 @@ __init static int tracer_alloc_buffers(void) int ring_buf_size; int ret = -ENOMEM; + + if (security_locked_down(LOCKDOWN_TRACEFS)) { + pr_warning("Tracing disabled due to lockdown\n"); + return -EPERM; + } + /* * Make sure we don't accidently add more trace options * than we have bits for.
-- cgit v1.2.3 From ee19545220a8663f0ca58c3427bc08fd6a104a42 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Fri, 6 Dec 2019 09:25:03 +1100 Subject: Fix up for "printk: Drop pr_warning definition" Link: http://lkml.kernel.org/r/20191206092503.303d6a57@canb.auug.org.au Cc: Linux Next Mailing List Cc: Linux Kernel Mailing List Cc: "Steven Rostedt (VMware)" Cc: Kefeng Wang Cc: Linus Torvalds Signed-off-by: Stephen Rothwell Signed-off-by: Petr Mladek --- kernel/trace/ring_buffer.c | 2 +- kernel/trace/trace.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 4bf050fcfe3b..3f655371eaf6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5070,7 +5070,7 @@ static __init int test_ringbuffer(void) int ret = 0; if (security_locked_down(LOCKDOWN_TRACEFS)) { - pr_warning("Lockdown is enabled, skipping ring buffer tests\n"); + pr_warn("Lockdown is enabled, skipping ring buffer tests\n"); return 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 23459d53d576..6c75410f9698 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1889,7 +1889,7 @@ int __init register_tracer(struct tracer *type) } if (security_locked_down(LOCKDOWN_TRACEFS)) { - pr_warning("Can not register tracer %s due to lockdown\n", + pr_warn("Can not register tracer %s due to lockdown\n", type->name); return -EPERM; } @@ -8796,7 +8796,7 @@ struct dentry *tracing_init_dentry(void) struct trace_array *tr = &global_trace; if (security_locked_down(LOCKDOWN_TRACEFS)) { - pr_warning("Tracing disabled due to lockdown\n"); + pr_warn("Tracing disabled due to lockdown\n"); return ERR_PTR(-EPERM); } @@ -9244,7 +9244,7 @@ __init static int tracer_alloc_buffers(void) if (security_locked_down(LOCKDOWN_TRACEFS)) { - pr_warning("Tracing disabled due to lockdown\n"); + pr_warn("Tracing disabled due to lockdown\n"); return -EPERM; } -- cgit v1.2.3 From a61f810567be0ef101d8bbb65b89ffeac7d62103 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 7 Dec 2019 11:44:09 +0800 Subject: tracing: remove set but not used variable 'buffer' kernel/trace/trace_events_inject.c: In function trace_inject_entry: kernel/trace/trace_events_inject.c:20:22: warning: variable buffer set but not used [-Wunused-but-set-variable] It is never used, so remove it. Link: http://lkml.kernel.org/r/20191207034409.25668-1-yuehaibing@huawei.com Reported-by: Hulk Robot Acked-by: Cong Wang Signed-off-by: YueHaibing Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_inject.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c index d43710718ee5..d45079ee62f8 100644 --- a/kernel/trace/trace_events_inject.c +++ b/kernel/trace/trace_events_inject.c @@ -17,12 +17,10 @@ static int trace_inject_entry(struct trace_event_file *file, void *rec, int len) { struct trace_event_buffer fbuffer; - struct ring_buffer *buffer; int written = 0; void *entry; rcu_read_lock_sched(); - buffer = file->tr->trace_buffer.buffer; entry = trace_event_buffer_reserve(&fbuffer, file, len); if (entry) { memcpy(entry, rec, len); -- cgit v1.2.3 From ff205766dbbee024a4a716638868d98ffb17748a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sun, 8 Dec 2019 16:01:12 -0800 Subject: ftrace: Fix function_graph tracer interaction with BPF trampoline Depending on type of BPF programs served by BPF trampoline it can call original function. 
In such case the trampoline will skip one stack frame while returning. That will confuse function_graph tracer and will cause crashes with bad RIP. Teach graph tracer to skip functions that have BPF trampoline attached. Signed-off-by: Alexei Starovoitov Signed-off-by: Steven Rostedt (VMware) --- arch/x86/kernel/ftrace.c | 14 -------------- include/linux/ftrace.h | 5 +++++ kernel/trace/fgraph.c | 9 +++++++++ kernel/trace/ftrace.c | 19 +++++++------------ 4 files changed, 21 insertions(+), 26 deletions(-) (limited to 'kernel/trace') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 060a361d9d11..024c3053dbba 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -1042,20 +1042,6 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, if (unlikely(atomic_read(¤t->tracing_graph_pause))) return; - /* - * If the return location is actually pointing directly to - * the start of a direct trampoline (if we trace the trampoline - * it will still be offset by MCOUNT_INSN_SIZE), then the - * return address is actually off by one word, and we - * need to adjust for that. - */ - if (ftrace_direct_func_count) { - if (ftrace_find_direct_func(self_addr + MCOUNT_INSN_SIZE)) { - self_addr = *parent; - parent++; - } - } - /* * Protect against fault, even if it shouldn't * happen. This tool is too much intrusive to diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 232806d5689d..987c2dc55bde 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -264,6 +264,7 @@ int ftrace_modify_direct_caller(struct ftrace_func_entry *entry, struct dyn_ftrace *rec, unsigned long old_addr, unsigned long new_addr); +unsigned long ftrace_find_rec_direct(unsigned long ip); #else # define ftrace_direct_func_count 0 static inline int register_ftrace_direct(unsigned long ip, unsigned long addr) @@ -290,6 +291,10 @@ static inline int ftrace_modify_direct_caller(struct ftrace_func_entry *entry, { return -ENODEV; } +static inline unsigned long ftrace_find_rec_direct(unsigned long ip) +{ + return 0; +} #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 67e0c462b059..a2659735db73 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -101,6 +101,15 @@ int function_graph_enter(unsigned long ret, unsigned long func, { struct ftrace_graph_ent trace; + /* + * Skip graph tracing if the return location is served by direct trampoline, + * since call sequence and return addresses is unpredicatable anymore. + * Ex: BPF trampoline may call original function and may skip frame + * depending on type of BPF programs attached. + */ + if (ftrace_direct_func_count && + ftrace_find_rec_direct(ret - MCOUNT_INSN_SIZE)) + return -EBUSY; trace.func = func; trace.depth = ++current->curr_ret_depth; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index caae523f4ef3..57477dc683db 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2364,7 +2364,7 @@ int ftrace_direct_func_count; * Search the direct_functions hash to see if the given instruction pointer * has a direct caller attached to it. 
*/ -static unsigned long find_rec_direct(unsigned long ip) +unsigned long ftrace_find_rec_direct(unsigned long ip) { struct ftrace_func_entry *entry; @@ -2380,7 +2380,7 @@ static void call_direct_funcs(unsigned long ip, unsigned long pip, { unsigned long addr; - addr = find_rec_direct(ip); + addr = ftrace_find_rec_direct(ip); if (!addr) return; @@ -2393,11 +2393,6 @@ struct ftrace_ops direct_ops = { | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_PERMANENT, }; -#else -static inline unsigned long find_rec_direct(unsigned long ip) -{ - return 0; -} #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** @@ -2417,7 +2412,7 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) if ((rec->flags & FTRACE_FL_DIRECT) && (ftrace_rec_count(rec) == 1)) { - addr = find_rec_direct(rec->ip); + addr = ftrace_find_rec_direct(rec->ip); if (addr) return addr; WARN_ON_ONCE(1); @@ -2458,7 +2453,7 @@ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) /* Direct calls take precedence over trampolines */ if (rec->flags & FTRACE_FL_DIRECT_EN) { - addr = find_rec_direct(rec->ip); + addr = ftrace_find_rec_direct(rec->ip); if (addr) return addr; WARN_ON_ONCE(1); @@ -3604,7 +3599,7 @@ static int t_show(struct seq_file *m, void *v) if (rec->flags & FTRACE_FL_DIRECT) { unsigned long direct; - direct = find_rec_direct(rec->ip); + direct = ftrace_find_rec_direct(rec->ip); if (direct) seq_printf(m, "\n\tdirect-->%pS", (void *)direct); } @@ -5008,7 +5003,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) mutex_lock(&direct_mutex); /* See if there's a direct function at @ip already */ - if (find_rec_direct(ip)) + if (ftrace_find_rec_direct(ip)) goto out_unlock; ret = -ENODEV; @@ -5027,7 +5022,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) if (ip != rec->ip) { ip = rec->ip; /* Need to check this ip for a direct. */ - if (find_rec_direct(ip)) + if (ftrace_find_rec_direct(ip)) goto out_unlock; } -- cgit v1.2.3 From 79e65c27f09683fbb50c33acab395d0ddf5302d2 Mon Sep 17 00:00:00 2001 From: Keita Suzuki Date: Wed, 11 Dec 2019 09:12:58 +0000 Subject: tracing: Avoid memory leak in process_system_preds() When failing in the allocation of filter_item, process_system_preds() goes to fail_mem, where the allocated filter is freed. However, this leads to a memory leak of filter->filter_string and filter->prog, which are allocated before and in process_preds(). This bug has been detected by kmemleak as well. Fix this by changing kfree to __free_filter. unreferenced object 0xffff8880658007c0 (size 32): comm "bash", pid 579, jiffies 4295096372 (age 17.752s) hex dump (first 32 bytes): 63 6f 6d 6d 6f 6e 5f 70 69 64 20 20 3e 20 31 30 common_pid > 10 00 00 00 00 00 00 00 00 65 73 00 00 00 00 00 00 ........es...... backtrace: [<0000000067441602>] kstrdup+0x2d/0x60 [<00000000141cf7b7>] apply_subsystem_event_filter+0x378/0x932 [<000000009ca32334>] subsystem_filter_write+0x5a/0x90 [<0000000072da2bee>] vfs_write+0xe1/0x240 [<000000004f14f473>] ksys_write+0xb4/0x150 [<00000000a968b4a0>] do_syscall_64+0x6d/0x1e0 [<000000001a189f40>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 unreferenced object 0xffff888060c22d00 (size 64): comm "bash", pid 579, jiffies 4295096372 (age 17.752s) hex dump (first 32 bytes): 01 00 00 00 00 00 00 00 00 e8 d7 41 80 88 ff ff ...........A.... 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
backtrace: [<00000000b8c1b109>] process_preds+0x243/0x1820 [<000000003972c7f0>] apply_subsystem_event_filter+0x3be/0x932 [<000000009ca32334>] subsystem_filter_write+0x5a/0x90 [<0000000072da2bee>] vfs_write+0xe1/0x240 [<000000004f14f473>] ksys_write+0xb4/0x150 [<00000000a968b4a0>] do_syscall_64+0x6d/0x1e0 [<000000001a189f40>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 unreferenced object 0xffff888041d7e800 (size 512): comm "bash", pid 579, jiffies 4295096372 (age 17.752s) hex dump (first 32 bytes): 70 bc 85 97 ff ff ff ff 0a 00 00 00 00 00 00 00 p............... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<000000001e04af34>] process_preds+0x71a/0x1820 [<000000003972c7f0>] apply_subsystem_event_filter+0x3be/0x932 [<000000009ca32334>] subsystem_filter_write+0x5a/0x90 [<0000000072da2bee>] vfs_write+0xe1/0x240 [<000000004f14f473>] ksys_write+0xb4/0x150 [<00000000a968b4a0>] do_syscall_64+0x6d/0x1e0 [<000000001a189f40>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Link: http://lkml.kernel.org/r/20191211091258.11310-1-keitasuzuki.park@sslab.ics.keio.ac.jp Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: 404a3add43c9c ("tracing: Only add filter list when needed") Signed-off-by: Keita Suzuki Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index c9a74f82b14a..bf44f6bbd0c3 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1662,7 +1662,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); return -EINVAL; fail_mem: - kfree(filter); + __free_filter(filter); /* If any call succeeded, we still need to sync */ if (!fail) tracepoint_synchronize_unregister(); -- cgit v1.2.3 From 106f41f5a302cb1f36c7543fae6a05de12e96fa4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 11 Dec 2019 15:44:22 -0500 Subject: tracing: Have the histogram compare functions convert to u64 first The compare functions of the histogram code were specific to the size of the value being compared (byte, short, int, long long). They would reference the value from the array via the type of the compare, but the value was stored in a 64 bit number. This is fine for little endian machines, but for big endian machines, it would end up comparing zeros or all ones (depending on the sign) for anything but 64 bit numbers. To fix this, first dereference the value as a u64, then convert it to the type being compared. Link: http://lkml.kernel.org/r/20191211103557.7bed6928@gandalf.local.home Cc: stable@vger.kernel.org Fixes: 08d43a5fa063e ("tracing: Add lock-free tracing_map") Acked-by: Tom Zanussi Reported-by: Sven Schnelle Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/tracing_map.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 9a1c22310323..9e31bfc818ff 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -148,8 +148,8 @@ static int tracing_map_cmp_atomic64(void *val_a, void *val_b) #define DEFINE_TRACING_MAP_CMP_FN(type) \ static int tracing_map_cmp_##type(void *val_a, void *val_b) \ { \ - type a = *(type *)val_a; \ - type b = *(type *)val_b; \ + type a = (type)(*(u64 *)val_a); \ + type b = (type)(*(u64 *)val_b); \ \ return (a > b) ? 1 : ((a < b) ?
-1 : 0); \ } -- cgit v1.2.3 From 3a53acf1d9bea11b57c1f6205e3fe73f9d8a3688 Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Tue, 10 Dec 2019 09:15:16 +0000 Subject: tracing: Fix lock inversion in trace_event_enable_tgid_record() Task T2 Task T3 trace_options_core_write() subsystem_open() mutex_lock(trace_types_lock) mutex_lock(event_mutex) set_tracer_flag() trace_event_enable_tgid_record() mutex_lock(trace_types_lock) mutex_lock(event_mutex) This gives a circular dependency deadlock between trace_types_lock and event_mutex. To fix this invert the usage of trace_types_lock and event_mutex in trace_options_core_write(). This keeps the sequence of lock usage consistent. Link: http://lkml.kernel.org/r/0101016eef175e38-8ca71caf-a4eb-480d-a1e6-6f0bbc015495-000000@us-west-2.amazonses.com Cc: stable@vger.kernel.org Fixes: d914ba37d7145 ("tracing: Add support for recording tgid of tasks") Signed-off-by: Prateek Sood Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 8 ++++++++ kernel/trace/trace_events.c | 8 ++++---- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6c75410f9698..ddb7e7f5fe8d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4685,6 +4685,10 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) { + if ((mask == TRACE_ITER_RECORD_TGID) || + (mask == TRACE_ITER_RECORD_CMD)) + lockdep_assert_held(&event_mutex); + /* do nothing if flag is already set */ if (!!(tr->trace_flags & mask) == !!enabled) return 0; @@ -4752,6 +4756,7 @@ static int trace_set_options(struct trace_array *tr, char *option) cmp += len; + mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); ret = match_string(trace_options, -1, cmp); @@ -4762,6 +4767,7 @@ static int trace_set_options(struct trace_array *tr, char *option) ret = set_tracer_flag(tr, 1 << ret, !neg); mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); /* * If the first trailing whitespace is replaced with '\0' by strstrip, @@ -8076,9 +8082,11 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, if (val != 0 && val != 1) return -EINVAL; + mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); ret = set_tracer_flag(tr, 1 << index, val); mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); if (ret < 0) return ret; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c6de3cebc127..a5b614cc3887 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -320,7 +320,8 @@ void trace_event_enable_cmd_record(bool enable) struct trace_event_file *file; struct trace_array *tr; - mutex_lock(&event_mutex); + lockdep_assert_held(&event_mutex); + do_for_each_event_file(tr, file) { if (!(file->flags & EVENT_FILE_FL_ENABLED)) @@ -334,7 +335,6 @@ void trace_event_enable_cmd_record(bool enable) clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } } while_for_each_event_file(); - mutex_unlock(&event_mutex); } void trace_event_enable_tgid_record(bool enable) @@ -342,7 +342,8 @@ void trace_event_enable_tgid_record(bool enable) struct trace_event_file *file; struct trace_array *tr; - mutex_lock(&event_mutex); + lockdep_assert_held(&event_mutex); + do_for_each_event_file(tr, file) { if (!(file->flags & EVENT_FILE_FL_ENABLED)) continue; @@ -356,7 +357,6 @@ void trace_event_enable_tgid_record(bool enable) &file->flags); } } while_for_each_event_file(); - 
mutex_unlock(&event_mutex); } static int __ftrace_event_enable_disable(struct trace_event_file *file, -- cgit v1.2.3 From fe6e096a5bbf73a142f09c72e7aa2835026eb1a3 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Wed, 18 Dec 2019 08:44:27 +0100 Subject: tracing: Fix endianness bug in histogram trigger At least on PA-RISC and s390 synthetic histogram triggers are failing selftests because trace_event_raw_event_synth() always writes a 64 bit values, but the reader expects a field->size sized value. On little endian machines this doesn't hurt, but on big endian this makes the reader always read zero values. Link: http://lore.kernel.org/linux-trace-devel/20191218074427.96184-4-svens@linux.ibm.com Cc: stable@vger.kernel.org Fixes: 4b147936fa509 ("tracing: Add support for 'synthetic' events") Acked-by: Tom Zanussi Signed-off-by: Sven Schnelle Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f49d1a36d3ae..f62de5f43e79 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -911,7 +911,26 @@ static notrace void trace_event_raw_event_synth(void *__data, strscpy(str_field, str_val, STR_VAR_LEN_MAX); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { - entry->fields[n_u64] = var_ref_vals[var_ref_idx + i]; + struct synth_field *field = event->fields[i]; + u64 val = var_ref_vals[var_ref_idx + i]; + + switch (field->size) { + case 1: + *(u8 *)&entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&entry->fields[n_u64] = (u32)val; + break; + + default: + entry->fields[n_u64] = val; + break; + } n_u64++; } } -- cgit v1.2.3
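To make the endianness bug fixed in this last patch concrete, here is a hedged user-space illustration (plain C, not kernel code) of why storing a value as a full u64 and reading it back at the field's narrower width only works on little endian machines:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint64_t field = 42;	/* the writer always stored all 8 bytes */
	uint16_t narrow;

	/* The reader pulls only the field's declared size (here 2 bytes). */
	memcpy(&narrow, &field, sizeof(narrow));

	/*
	 * Little endian: prints 42, because the low-order bytes come first.
	 * Big endian: prints 0, because the first two bytes are the
	 * high-order zero bytes. The fix above casts to the field's width
	 * (u8/u16/u32) before storing.
	 */
	printf("%u\n", (unsigned int)narrow);
	return 0;
}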