From c195651e565ae7f41a68acb7d4aa7390ad215de1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 28 Apr 2018 22:28:08 -0700 Subject: bpf: add bpf_get_stack helper Currently, stackmap and bpf_get_stackid helper are provided for bpf program to get the stack trace. This approach has a limitation though. If two stack traces have the same hash, only one will get stored in the stackmap table, so some stack traces are missing from user perspective. This patch implements a new helper, bpf_get_stack, will send stack traces directly to bpf program. The bpf program is able to see all stack traces, and then can do in-kernel processing or send stack traces to user space through shared map or bpf_perf_event_output. Acked-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/bpf/core.c') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ba03ec39efb3..9349a5db3cf2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -1722,6 +1723,10 @@ static void bpf_prog_free_deferred(struct work_struct *work) aux = container_of(work, struct bpf_prog_aux, work); if (bpf_prog_is_dev_bound(aux)) bpf_prog_offload_destroy(aux->prog); +#ifdef CONFIG_PERF_EVENTS + if (aux->prog->has_callchain_buf) + put_callchain_buffers(); +#endif for (i = 0; i < aux->func_cnt; i++) bpf_jit_free(aux->func[i]); if (aux->func_cnt) { -- cgit v1.2.3 From 4d220ed0f8140c478ab7b0a14d96821da639b646 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sat, 28 Apr 2018 19:56:37 -0700 Subject: bpf: remove tracepoints from bpf core tracepoints to bpf core were added as a way to provide introspection to bpf programs and maps, but after some time it became clear that this approach is inadequate, so prog_id, map_id and corresponding get_next_id, get_fd_by_id, get_info_by_fd, prog_query APIs were introduced and fully adopted by bpftool and other applications. The tracepoints in bpf core started to rot and causing syzbot warnings: WARNING: CPU: 0 PID: 3008 at kernel/trace/trace_event_perf.c:274 Kernel panic - not syncing: panic_on_warn set ... perf_trace_bpf_map_keyval+0x260/0xbd0 include/trace/events/bpf.h:228 trace_bpf_map_update_elem include/trace/events/bpf.h:274 [inline] map_update_elem kernel/bpf/syscall.c:597 [inline] SYSC_bpf kernel/bpf/syscall.c:1478 [inline] Hence this patch deletes tracepoints in bpf core. Reported-by: Eric Biggers Reported-by: syzbot Signed-off-by: Alexei Starovoitov Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- MAINTAINERS | 1 - include/linux/bpf_trace.h | 1 - include/trace/events/bpf.h | 355 --------------------------------------------- kernel/bpf/core.c | 6 - kernel/bpf/inode.c | 16 +- kernel/bpf/syscall.c | 15 +- 6 files changed, 2 insertions(+), 392 deletions(-) delete mode 100644 include/trace/events/bpf.h (limited to 'kernel/bpf/core.c') diff --git a/MAINTAINERS b/MAINTAINERS index a52800867850..537fd17a211b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2727,7 +2727,6 @@ F: Documentation/networking/filter.txt F: Documentation/bpf/ F: include/linux/bpf* F: include/linux/filter.h -F: include/trace/events/bpf.h F: include/trace/events/xdp.h F: include/uapi/linux/bpf* F: include/uapi/linux/filter.h diff --git a/include/linux/bpf_trace.h b/include/linux/bpf_trace.h index e6fe98ae3794..ddf896abcfb6 100644 --- a/include/linux/bpf_trace.h +++ b/include/linux/bpf_trace.h @@ -2,7 +2,6 @@ #ifndef __LINUX_BPF_TRACE_H__ #define __LINUX_BPF_TRACE_H__ -#include #include #endif /* __LINUX_BPF_TRACE_H__ */ diff --git a/include/trace/events/bpf.h b/include/trace/events/bpf.h deleted file mode 100644 index 150185647e6b..000000000000 --- a/include/trace/events/bpf.h +++ /dev/null @@ -1,355 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM bpf - -#if !defined(_TRACE_BPF_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_BPF_H - -/* These are only used within the BPF_SYSCALL code */ -#ifdef CONFIG_BPF_SYSCALL - -#include -#include -#include -#include - -#define __PROG_TYPE_MAP(FN) \ - FN(SOCKET_FILTER) \ - FN(KPROBE) \ - FN(SCHED_CLS) \ - FN(SCHED_ACT) \ - FN(TRACEPOINT) \ - FN(XDP) \ - FN(PERF_EVENT) \ - FN(CGROUP_SKB) \ - FN(CGROUP_SOCK) \ - FN(LWT_IN) \ - FN(LWT_OUT) \ - FN(LWT_XMIT) - -#define __MAP_TYPE_MAP(FN) \ - FN(HASH) \ - FN(ARRAY) \ - FN(PROG_ARRAY) \ - FN(PERF_EVENT_ARRAY) \ - FN(PERCPU_HASH) \ - FN(PERCPU_ARRAY) \ - FN(STACK_TRACE) \ - FN(CGROUP_ARRAY) \ - FN(LRU_HASH) \ - FN(LRU_PERCPU_HASH) \ - FN(LPM_TRIE) - -#define __PROG_TYPE_TP_FN(x) \ - TRACE_DEFINE_ENUM(BPF_PROG_TYPE_##x); -#define __PROG_TYPE_SYM_FN(x) \ - { BPF_PROG_TYPE_##x, #x }, -#define __PROG_TYPE_SYM_TAB \ - __PROG_TYPE_MAP(__PROG_TYPE_SYM_FN) { -1, 0 } -__PROG_TYPE_MAP(__PROG_TYPE_TP_FN) - -#define __MAP_TYPE_TP_FN(x) \ - TRACE_DEFINE_ENUM(BPF_MAP_TYPE_##x); -#define __MAP_TYPE_SYM_FN(x) \ - { BPF_MAP_TYPE_##x, #x }, -#define __MAP_TYPE_SYM_TAB \ - __MAP_TYPE_MAP(__MAP_TYPE_SYM_FN) { -1, 0 } -__MAP_TYPE_MAP(__MAP_TYPE_TP_FN) - -DECLARE_EVENT_CLASS(bpf_prog_event, - - TP_PROTO(const struct bpf_prog *prg), - - TP_ARGS(prg), - - TP_STRUCT__entry( - __array(u8, prog_tag, 8) - __field(u32, type) - ), - - TP_fast_assign( - BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag)); - memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag)); - __entry->type = prg->type; - ), - - TP_printk("prog=%s type=%s", - __print_hex_str(__entry->prog_tag, 8), - __print_symbolic(__entry->type, __PROG_TYPE_SYM_TAB)) -); - -DEFINE_EVENT(bpf_prog_event, bpf_prog_get_type, - - TP_PROTO(const struct bpf_prog *prg), - - TP_ARGS(prg) -); - -DEFINE_EVENT(bpf_prog_event, bpf_prog_put_rcu, - - TP_PROTO(const struct bpf_prog *prg), - - TP_ARGS(prg) -); - -TRACE_EVENT(bpf_prog_load, - - TP_PROTO(const struct bpf_prog *prg, int ufd), - - TP_ARGS(prg, ufd), - - TP_STRUCT__entry( - __array(u8, prog_tag, 8) - __field(u32, type) - __field(int, ufd) - ), - - TP_fast_assign( - BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag)); - memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag)); - __entry->type = prg->type; - __entry->ufd = ufd; - ), - - TP_printk("prog=%s type=%s ufd=%d", - __print_hex_str(__entry->prog_tag, 8), - __print_symbolic(__entry->type, __PROG_TYPE_SYM_TAB), - __entry->ufd) -); - -TRACE_EVENT(bpf_map_create, - - TP_PROTO(const struct bpf_map *map, int ufd), - - TP_ARGS(map, ufd), - - TP_STRUCT__entry( - __field(u32, type) - __field(u32, size_key) - __field(u32, size_value) - __field(u32, max_entries) - __field(u32, flags) - __field(int, ufd) - ), - - TP_fast_assign( - __entry->type = map->map_type; - __entry->size_key = map->key_size; - __entry->size_value = map->value_size; - __entry->max_entries = map->max_entries; - __entry->flags = map->map_flags; - __entry->ufd = ufd; - ), - - TP_printk("map type=%s ufd=%d key=%u val=%u max=%u flags=%x", - __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), - __entry->ufd, __entry->size_key, __entry->size_value, - __entry->max_entries, __entry->flags) -); - -DECLARE_EVENT_CLASS(bpf_obj_prog, - - TP_PROTO(const struct bpf_prog *prg, int ufd, - const struct filename *pname), - - TP_ARGS(prg, ufd, pname), - - TP_STRUCT__entry( - __array(u8, prog_tag, 8) - __field(int, ufd) - __string(path, pname->name) - ), - - TP_fast_assign( - BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag)); - memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag)); - __assign_str(path, pname->name); - __entry->ufd = ufd; - ), - - TP_printk("prog=%s path=%s ufd=%d", - __print_hex_str(__entry->prog_tag, 8), - __get_str(path), __entry->ufd) -); - -DEFINE_EVENT(bpf_obj_prog, bpf_obj_pin_prog, - - TP_PROTO(const struct bpf_prog *prg, int ufd, - const struct filename *pname), - - TP_ARGS(prg, ufd, pname) -); - -DEFINE_EVENT(bpf_obj_prog, bpf_obj_get_prog, - - TP_PROTO(const struct bpf_prog *prg, int ufd, - const struct filename *pname), - - TP_ARGS(prg, ufd, pname) -); - -DECLARE_EVENT_CLASS(bpf_obj_map, - - TP_PROTO(const struct bpf_map *map, int ufd, - const struct filename *pname), - - TP_ARGS(map, ufd, pname), - - TP_STRUCT__entry( - __field(u32, type) - __field(int, ufd) - __string(path, pname->name) - ), - - TP_fast_assign( - __assign_str(path, pname->name); - __entry->type = map->map_type; - __entry->ufd = ufd; - ), - - TP_printk("map type=%s ufd=%d path=%s", - __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), - __entry->ufd, __get_str(path)) -); - -DEFINE_EVENT(bpf_obj_map, bpf_obj_pin_map, - - TP_PROTO(const struct bpf_map *map, int ufd, - const struct filename *pname), - - TP_ARGS(map, ufd, pname) -); - -DEFINE_EVENT(bpf_obj_map, bpf_obj_get_map, - - TP_PROTO(const struct bpf_map *map, int ufd, - const struct filename *pname), - - TP_ARGS(map, ufd, pname) -); - -DECLARE_EVENT_CLASS(bpf_map_keyval, - - TP_PROTO(const struct bpf_map *map, int ufd, - const void *key, const void *val), - - TP_ARGS(map, ufd, key, val), - - TP_STRUCT__entry( - __field(u32, type) - __field(u32, key_len) - __dynamic_array(u8, key, map->key_size) - __field(bool, key_trunc) - __field(u32, val_len) - __dynamic_array(u8, val, map->value_size) - __field(bool, val_trunc) - __field(int, ufd) - ), - - TP_fast_assign( - memcpy(__get_dynamic_array(key), key, map->key_size); - memcpy(__get_dynamic_array(val), val, map->value_size); - __entry->type = map->map_type; - __entry->key_len = min(map->key_size, 16U); - __entry->key_trunc = map->key_size != __entry->key_len; - __entry->val_len = min(map->value_size, 16U); - __entry->val_trunc = map->value_size != __entry->val_len; - __entry->ufd = ufd; - ), - - TP_printk("map type=%s ufd=%d key=[%s%s] val=[%s%s]", - __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), - __entry->ufd, - __print_hex(__get_dynamic_array(key), __entry->key_len), - __entry->key_trunc ? " ..." : "", - __print_hex(__get_dynamic_array(val), __entry->val_len), - __entry->val_trunc ? " ..." : "") -); - -DEFINE_EVENT(bpf_map_keyval, bpf_map_lookup_elem, - - TP_PROTO(const struct bpf_map *map, int ufd, - const void *key, const void *val), - - TP_ARGS(map, ufd, key, val) -); - -DEFINE_EVENT(bpf_map_keyval, bpf_map_update_elem, - - TP_PROTO(const struct bpf_map *map, int ufd, - const void *key, const void *val), - - TP_ARGS(map, ufd, key, val) -); - -TRACE_EVENT(bpf_map_delete_elem, - - TP_PROTO(const struct bpf_map *map, int ufd, - const void *key), - - TP_ARGS(map, ufd, key), - - TP_STRUCT__entry( - __field(u32, type) - __field(u32, key_len) - __dynamic_array(u8, key, map->key_size) - __field(bool, key_trunc) - __field(int, ufd) - ), - - TP_fast_assign( - memcpy(__get_dynamic_array(key), key, map->key_size); - __entry->type = map->map_type; - __entry->key_len = min(map->key_size, 16U); - __entry->key_trunc = map->key_size != __entry->key_len; - __entry->ufd = ufd; - ), - - TP_printk("map type=%s ufd=%d key=[%s%s]", - __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), - __entry->ufd, - __print_hex(__get_dynamic_array(key), __entry->key_len), - __entry->key_trunc ? " ..." : "") -); - -TRACE_EVENT(bpf_map_next_key, - - TP_PROTO(const struct bpf_map *map, int ufd, - const void *key, const void *key_next), - - TP_ARGS(map, ufd, key, key_next), - - TP_STRUCT__entry( - __field(u32, type) - __field(u32, key_len) - __dynamic_array(u8, key, map->key_size) - __dynamic_array(u8, nxt, map->key_size) - __field(bool, key_trunc) - __field(bool, key_null) - __field(int, ufd) - ), - - TP_fast_assign( - if (key) - memcpy(__get_dynamic_array(key), key, map->key_size); - __entry->key_null = !key; - memcpy(__get_dynamic_array(nxt), key_next, map->key_size); - __entry->type = map->map_type; - __entry->key_len = min(map->key_size, 16U); - __entry->key_trunc = map->key_size != __entry->key_len; - __entry->ufd = ufd; - ), - - TP_printk("map type=%s ufd=%d key=[%s%s] next=[%s%s]", - __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), - __entry->ufd, - __entry->key_null ? "NULL" : __print_hex(__get_dynamic_array(key), - __entry->key_len), - __entry->key_trunc && !__entry->key_null ? " ..." : "", - __print_hex(__get_dynamic_array(nxt), __entry->key_len), - __entry->key_trunc ? " ..." : "") -); -#endif /* CONFIG_BPF_SYSCALL */ -#endif /* _TRACE_BPF_H */ - -#include diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9349a5db3cf2..90feeba3a1a1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1845,9 +1845,3 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, #include EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); - -/* These are only used within the BPF_SYSCALL code */ -#ifdef CONFIG_BPF_SYSCALL -EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type); -EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu); -#endif diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index a41343009ccc..ed13645bd80c 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -429,13 +429,6 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname) ret = bpf_obj_do_pin(pname, raw, type); if (ret != 0) bpf_any_put(raw, type); - if ((trace_bpf_obj_pin_prog_enabled() || - trace_bpf_obj_pin_map_enabled()) && !ret) { - if (type == BPF_TYPE_PROG) - trace_bpf_obj_pin_prog(raw, ufd, pname); - if (type == BPF_TYPE_MAP) - trace_bpf_obj_pin_map(raw, ufd, pname); - } out: putname(pname); return ret; @@ -502,15 +495,8 @@ int bpf_obj_get_user(const char __user *pathname, int flags) else goto out; - if (ret < 0) { + if (ret < 0) bpf_any_put(raw, type); - } else if (trace_bpf_obj_get_prog_enabled() || - trace_bpf_obj_get_map_enabled()) { - if (type == BPF_TYPE_PROG) - trace_bpf_obj_get_prog(raw, ret, pname); - if (type == BPF_TYPE_MAP) - trace_bpf_obj_get_map(raw, ret, pname); - } out: putname(pname); return ret; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0bd2944eafb9..263e13ede029 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -503,7 +503,6 @@ static int map_create(union bpf_attr *attr) return err; } - trace_bpf_map_create(map, err); return err; free_map: @@ -663,7 +662,6 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_to_user(uvalue, value, value_size) != 0) goto free_value; - trace_bpf_map_lookup_elem(map, ufd, key, value); err = 0; free_value: @@ -760,8 +758,6 @@ static int map_update_elem(union bpf_attr *attr) __this_cpu_dec(bpf_prog_active); preempt_enable(); out: - if (!err) - trace_bpf_map_update_elem(map, ufd, key, value); free_value: kfree(value); free_key: @@ -814,8 +810,6 @@ static int map_delete_elem(union bpf_attr *attr) __this_cpu_dec(bpf_prog_active); preempt_enable(); out: - if (!err) - trace_bpf_map_delete_elem(map, ufd, key); kfree(key); err_put: fdput(f); @@ -879,7 +873,6 @@ out: if (copy_to_user(unext_key, next_key, map->key_size) != 0) goto free_next_key; - trace_bpf_map_next_key(map, ufd, key, next_key); err = 0; free_next_key: @@ -1027,7 +1020,6 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) if (atomic_dec_and_test(&prog->aux->refcnt)) { int i; - trace_bpf_prog_put_rcu(prog); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); @@ -1194,11 +1186,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv) { - struct bpf_prog *prog = __bpf_prog_get(ufd, &type, attach_drv); - - if (!IS_ERR(prog)) - trace_bpf_prog_get_type(prog); - return prog; + return __bpf_prog_get(ufd, &type, attach_drv); } EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); @@ -1373,7 +1361,6 @@ static int bpf_prog_load(union bpf_attr *attr) } bpf_prog_kallsyms_add(prog); - trace_bpf_prog_load(prog, err); return err; free_used_maps: -- cgit v1.2.3 From e0cea7ce988cf48cc4052235d2ad2550b3bc4fa0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 01:08:14 +0200 Subject: bpf: implement ld_abs/ld_ind in native bpf The main part of this work is to finally allow removal of LD_ABS and LD_IND from the BPF core by reimplementing them through native eBPF instead. Both LD_ABS/LD_IND were carried over from cBPF and keeping them around in native eBPF caused way more trouble than actually worth it. To just list some of the security issues in the past: * fdfaf64e7539 ("x86: bpf_jit: support negative offsets") * 35607b02dbef ("sparc: bpf_jit: fix loads from negative offsets") * e0ee9c12157d ("x86: bpf_jit: fix two bugs in eBPF JIT compiler") * 07aee9439454 ("bpf, sparc: fix usage of wrong reg for load_skb_regs after call") * 6d59b7dbf72e ("bpf, s390x: do not reload skb pointers in non-skb context") * 87338c8e2cbb ("bpf, ppc64: do not reload skb pointers in non-skb context") For programs in native eBPF, LD_ABS/LD_IND are pretty much legacy these days due to their limitations and more efficient/flexible alternatives that have been developed over time such as direct packet access. LD_ABS/LD_IND only cover 1/2/4 byte loads into a register, the load happens in host endianness and its exception handling can yield unexpected behavior. The latter is explained in depth in f6b1b3bf0d5f ("bpf: fix subprog verifier bypass by div/mod by 0 exception") with similar cases of exceptions we had. In native eBPF more recent program types will disable LD_ABS/LD_IND altogether through may_access_skb() in verifier, and given the limitations in terms of exception handling, it's also disabled in programs that use BPF to BPF calls. In terms of cBPF, the LD_ABS/LD_IND is used in networking programs to access packet data. It is not used in seccomp-BPF but programs that use it for socket filtering or reuseport for demuxing with cBPF. This is mostly relevant for applications that have not yet migrated to native eBPF. The main complexity and source of bugs in LD_ABS/LD_IND is coming from their implementation in the various JITs. Most of them keep the model around from cBPF times by implementing a fastpath written in asm. They use typically two from the BPF program hidden CPU registers for caching the skb's headlen (skb->len - skb->data_len) and skb->data. Throughout the JIT phase this requires to keep track whether LD_ABS/LD_IND are used and if so, the two registers need to be recached each time a BPF helper would change the underlying packet data in native eBPF case. At least in eBPF case, available CPU registers are rare and the additional exit path out of the asm written JIT helper makes it also inflexible since not all parts of the JITer are in control from plain C. A LD_ABS/LD_IND implementation in eBPF therefore allows to significantly reduce the complexity in JITs with comparable performance results for them, e.g.: test_bpf tcpdump port 22 tcpdump complex x64 - before 15 21 10 14 19 18 - after 7 10 10 7 10 15 arm64 - before 40 91 92 40 91 151 - after 51 64 73 51 62 113 For cBPF we now track any usage of LD_ABS/LD_IND in bpf_convert_filter() and cache the skb's headlen and data in the cBPF prologue. The BPF_REG_TMP gets remapped from R8 to R2 since it's mainly just used as a local temporary variable. This allows to shrink the image on x86_64 also for seccomp programs slightly since mapping to %rsi is not an ereg. In callee-saved R8 and R9 we now track skb data and headlen, respectively. For normal prologue emission in the JITs this does not add any extra instructions since R8, R9 are pushed to stack in any case from eBPF side. cBPF uses the convert_bpf_ld_abs() emitter which probes the fast path inline already and falls back to bpf_skb_load_helper_{8,16,32}() helper relying on the cached skb data and headlen as well. R8 and R9 never need to be reloaded due to bpf_helper_changes_pkt_data() since all skb access in cBPF is read-only. Then, for the case of native eBPF, we use the bpf_gen_ld_abs() emitter, which calls the bpf_skb_load_helper_{8,16,32}_no_cache() helper unconditionally, does neither cache skb data and headlen nor has an inlined fast path. The reason for the latter is that native eBPF does not have any extra registers available anyway, but even if there were, it avoids any reload of skb data and headlen in the first place. Additionally, for the negative offsets, we provide an alternative bpf_skb_load_bytes_relative() helper in eBPF which operates similarly as bpf_skb_load_bytes() and allows for more flexibility. Tested myself on x64, arm64, s390x, from Sandipan on ppc64. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 + include/linux/filter.h | 4 +- kernel/bpf/core.c | 96 ++------------------ kernel/bpf/verifier.c | 24 +++++ net/core/filter.c | 236 ++++++++++++++++++++++++++++++++++++++++++++++--- 5 files changed, 262 insertions(+), 100 deletions(-) (limited to 'kernel/bpf/core.c') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d0e3d7ef36a8..0e00a13ff01b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -235,6 +235,8 @@ struct bpf_verifier_ops { struct bpf_insn_access_aux *info); int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, const struct bpf_prog *prog); + int (*gen_ld_abs)(const struct bpf_insn *orig, + struct bpf_insn *insn_buf); u32 (*convert_ctx_access)(enum bpf_access_type type, const struct bpf_insn *src, struct bpf_insn *dst, diff --git a/include/linux/filter.h b/include/linux/filter.h index b7f81e3a70cb..da7e16523128 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -47,7 +47,9 @@ struct xdp_buff; /* Additional register mappings for converted user programs. */ #define BPF_REG_A BPF_REG_0 #define BPF_REG_X BPF_REG_7 -#define BPF_REG_TMP BPF_REG_8 +#define BPF_REG_TMP BPF_REG_2 /* scratch reg */ +#define BPF_REG_D BPF_REG_8 /* data, callee-saved */ +#define BPF_REG_H BPF_REG_9 /* hlen, callee-saved */ /* Kernel hidden auxiliary/helper register for hardening step. * Only used by eBPF JITs. It's nothing more than a temporary diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 90feeba3a1a1..1127552c8033 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -634,23 +634,6 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off); break; - case BPF_LD | BPF_ABS | BPF_W: - case BPF_LD | BPF_ABS | BPF_H: - case BPF_LD | BPF_ABS | BPF_B: - *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); - *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); - *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0); - break; - - case BPF_LD | BPF_IND | BPF_W: - case BPF_LD | BPF_IND | BPF_H: - case BPF_LD | BPF_IND | BPF_B: - *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); - *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); - *to++ = BPF_ALU32_REG(BPF_ADD, BPF_REG_AX, from->src_reg); - *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0); - break; - case BPF_LD | BPF_IMM | BPF_DW: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); @@ -891,14 +874,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_3(LDX, MEM, W), \ INSN_3(LDX, MEM, DW), \ /* Immediate based. */ \ - INSN_3(LD, IMM, DW), \ - /* Misc (old cBPF carry-over). */ \ - INSN_3(LD, ABS, B), \ - INSN_3(LD, ABS, H), \ - INSN_3(LD, ABS, W), \ - INSN_3(LD, IND, B), \ - INSN_3(LD, IND, H), \ - INSN_3(LD, IND, W) + INSN_3(LD, IMM, DW) bool bpf_opcode_in_insntable(u8 code) { @@ -908,6 +884,13 @@ bool bpf_opcode_in_insntable(u8 code) [0 ... 255] = false, /* Now overwrite non-defaults ... */ BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), + /* UAPI exposed, but rewritten opcodes. cBPF carry-over. */ + [BPF_LD | BPF_ABS | BPF_B] = true, + [BPF_LD | BPF_ABS | BPF_H] = true, + [BPF_LD | BPF_ABS | BPF_W] = true, + [BPF_LD | BPF_IND | BPF_B] = true, + [BPF_LD | BPF_IND | BPF_H] = true, + [BPF_LD | BPF_IND | BPF_W] = true, }; #undef BPF_INSN_3_TBL #undef BPF_INSN_2_TBL @@ -938,8 +921,6 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) #undef BPF_INSN_3_LBL #undef BPF_INSN_2_LBL u32 tail_call_cnt = 0; - void *ptr; - int off; #define CONT ({ insn++; goto select_insn; }) #define CONT_JMP ({ insn++; goto select_insn; }) @@ -1266,67 +1247,6 @@ out: atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) (DST + insn->off)); CONT; - LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ - off = IMM; -load_word: - /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only - * appearing in the programs where ctx == skb - * (see may_access_skb() in the verifier). All programs - * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6, - * bpf_convert_filter() saves it in BPF_R6, internal BPF - * verifier will check that BPF_R6 == ctx. - * - * BPF_ABS and BPF_IND are wrappers of function calls, - * so they scratch BPF_R1-BPF_R5 registers, preserve - * BPF_R6-BPF_R9, and store return value into BPF_R0. - * - * Implicit input: - * ctx == skb == BPF_R6 == CTX - * - * Explicit input: - * SRC == any register - * IMM == 32-bit immediate - * - * Output: - * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness - */ - - ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = get_unaligned_be32(ptr); - CONT; - } - - return 0; - LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ - off = IMM; -load_half: - ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = get_unaligned_be16(ptr); - CONT; - } - - return 0; - LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ - off = IMM; -load_byte: - ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = *(u8 *)ptr; - CONT; - } - - return 0; - LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */ - off = IMM + SRC; - goto load_word; - LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */ - off = IMM + SRC; - goto load_half; - LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */ - off = IMM + SRC; - goto load_byte; default_label: /* If we ever reach this, we have a bug somewhere. Die hard here diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0d91f18b2eb5..6ba10a83909d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3884,6 +3884,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (!env->ops->gen_ld_abs) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } + if (env->subprog_cnt) { /* when program has LD_ABS insn JITs and interpreter assume * that r1 == ctx == skb which is not the case for callees @@ -5519,6 +5524,25 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + if (BPF_CLASS(insn->code) == BPF_LD && + (BPF_MODE(insn->code) == BPF_ABS || + BPF_MODE(insn->code) == BPF_IND)) { + cnt = env->ops->gen_ld_abs(insn, insn_buf); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (insn->code != (BPF_JMP | BPF_CALL)) continue; if (insn->src_reg == BPF_PSEUDO_CALL) diff --git a/net/core/filter.c b/net/core/filter.c index 865500f6180d..a49729842b3d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -162,6 +162,87 @@ BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) return 0; } +BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + u8 tmp, *ptr; + const int len = sizeof(tmp); + + if (offset >= 0) { + if (headlen - offset >= len) + return *(u8 *)(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return tmp; + } else { + ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); + if (likely(ptr)) + return *(u8 *)ptr; + } + + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len, + offset); +} + +BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + u16 tmp, *ptr; + const int len = sizeof(tmp); + + if (offset >= 0) { + if (headlen - offset >= len) + return get_unaligned_be16(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be16_to_cpu(tmp); + } else { + ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); + if (likely(ptr)) + return get_unaligned_be16(ptr); + } + + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len, + offset); +} + +BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + u32 tmp, *ptr; + const int len = sizeof(tmp); + + if (likely(offset >= 0)) { + if (headlen - offset >= len) + return get_unaligned_be32(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be32_to_cpu(tmp); + } else { + ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); + if (likely(ptr)) + return get_unaligned_be32(ptr); + } + + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len, + offset); +} + BPF_CALL_0(bpf_get_raw_cpu_id) { return raw_smp_processor_id(); @@ -354,26 +435,87 @@ static bool convert_bpf_extensions(struct sock_filter *fp, return true; } +static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) +{ + const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS); + int size = bpf_size_to_bytes(BPF_SIZE(fp->code)); + bool endian = BPF_SIZE(fp->code) == BPF_H || + BPF_SIZE(fp->code) == BPF_W; + bool indirect = BPF_MODE(fp->code) == BPF_IND; + const int ip_align = NET_IP_ALIGN; + struct bpf_insn *insn = *insnp; + int offset = fp->k; + + if (!indirect && + ((unaligned_ok && offset >= 0) || + (!unaligned_ok && offset >= 0 && + offset + ip_align >= 0 && + offset + ip_align % size == 0))) { + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); + *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); + *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian); + *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D, + offset); + if (endian) + *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8); + *insn++ = BPF_JMP_A(8); + } + + *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); + *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D); + *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H); + if (!indirect) { + *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset); + } else { + *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X); + if (fp->k) + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset); + } + + switch (BPF_SIZE(fp->code)) { + case BPF_B: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8); + break; + case BPF_H: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16); + break; + case BPF_W: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32); + break; + default: + return false; + } + + *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *insn = BPF_EXIT_INSN(); + + *insnp = insn; + return true; +} + /** * bpf_convert_filter - convert filter program * @prog: the user passed filter program * @len: the length of the user passed filter program * @new_prog: allocated 'struct bpf_prog' or NULL * @new_len: pointer to store length of converted program + * @seen_ld_abs: bool whether we've seen ld_abs/ind * * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn' * style extended BPF (eBPF). * Conversion workflow: * * 1) First pass for calculating the new program length: - * bpf_convert_filter(old_prog, old_len, NULL, &new_len) + * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs) * * 2) 2nd pass to remap in two passes: 1st pass finds new * jump offsets, 2nd pass remapping: - * bpf_convert_filter(old_prog, old_len, new_prog, &new_len); + * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs) */ static int bpf_convert_filter(struct sock_filter *prog, int len, - struct bpf_prog *new_prog, int *new_len) + struct bpf_prog *new_prog, int *new_len, + bool *seen_ld_abs) { int new_flen = 0, pass = 0, target, i, stack_off; struct bpf_insn *new_insn, *first_insn = NULL; @@ -412,12 +554,27 @@ do_pass: * do this ourself. Initial CTX is present in BPF_REG_ARG1. */ *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); + if (*seen_ld_abs) { + /* For packet access in classic BPF, cache skb->data + * in callee-saved BPF R8 and skb->len - skb->data_len + * (headlen) in BPF R9. Since classic BPF is read-only + * on CTX, we only need to cache it once. + */ + *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + BPF_REG_D, BPF_REG_CTX, + offsetof(struct sk_buff, data)); + *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX, + offsetof(struct sk_buff, len)); + *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX, + offsetof(struct sk_buff, data_len)); + *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP); + } } else { new_insn += 3; } for (i = 0; i < len; fp++, i++) { - struct bpf_insn tmp_insns[6] = { }; + struct bpf_insn tmp_insns[32] = { }; struct bpf_insn *insn = tmp_insns; if (addrs) @@ -460,6 +617,11 @@ do_pass: BPF_MODE(fp->code) == BPF_ABS && convert_bpf_extensions(fp, &insn)) break; + if (BPF_CLASS(fp->code) == BPF_LD && + convert_bpf_ld_abs(fp, &insn)) { + *seen_ld_abs = true; + break; + } if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { @@ -562,21 +724,31 @@ jmp_rest: break; /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */ - case BPF_LDX | BPF_MSH | BPF_B: - /* tmp = A */ - *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A); + case BPF_LDX | BPF_MSH | BPF_B: { + struct sock_filter tmp = { + .code = BPF_LD | BPF_ABS | BPF_B, + .k = fp->k, + }; + + *seen_ld_abs = true; + + /* X = A */ + *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = BPF_R0 = *(u8 *) (skb->data + K) */ - *insn++ = BPF_LD_ABS(BPF_B, fp->k); + convert_bpf_ld_abs(&tmp, &insn); + insn++; /* A &= 0xf */ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); /* A <<= 2 */ *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); + /* tmp = X */ + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X); /* X = A */ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = tmp */ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; - + } /* RET_K is remaped into 2 insns. RET_A case doesn't need an * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. */ @@ -658,6 +830,8 @@ jmp_rest: if (!new_prog) { /* Only calculating new length. */ *new_len = new_insn - first_insn; + if (*seen_ld_abs) + *new_len += 4; /* Prologue bits. */ return 0; } @@ -1019,6 +1193,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) struct sock_filter *old_prog; struct bpf_prog *old_fp; int err, new_len, old_len = fp->len; + bool seen_ld_abs = false; /* We are free to overwrite insns et al right here as it * won't be used at this point in time anymore internally @@ -1040,7 +1215,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) } /* 1st pass: calculate the new program length. */ - err = bpf_convert_filter(old_prog, old_len, NULL, &new_len); + err = bpf_convert_filter(old_prog, old_len, NULL, &new_len, + &seen_ld_abs); if (err) goto out_err_free; @@ -1059,7 +1235,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) fp->len = new_len; /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ - err = bpf_convert_filter(old_prog, old_len, fp, &new_len); + err = bpf_convert_filter(old_prog, old_len, fp, &new_len, + &seen_ld_abs); if (err) /* 2nd bpf_convert_filter() can fail only if it fails * to allocate memory, remapping must succeed. Note, @@ -4330,6 +4507,41 @@ static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, return insn - insn_buf; } +static int bpf_gen_ld_abs(const struct bpf_insn *orig, + struct bpf_insn *insn_buf) +{ + bool indirect = BPF_MODE(orig->code) == BPF_IND; + struct bpf_insn *insn = insn_buf; + + /* We're guaranteed here that CTX is in R6. */ + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX); + if (!indirect) { + *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm); + } else { + *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg); + if (orig->imm) + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm); + } + + switch (BPF_SIZE(orig->code)) { + case BPF_B: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache); + break; + case BPF_H: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache); + break; + case BPF_W: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache); + break; + } + + *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); + *insn++ = BPF_EXIT_INSN(); + + return insn - insn_buf; +} + static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { @@ -5599,6 +5811,7 @@ const struct bpf_verifier_ops sk_filter_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, + .gen_ld_abs = bpf_gen_ld_abs, }; const struct bpf_prog_ops sk_filter_prog_ops = { @@ -5610,6 +5823,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, + .gen_ld_abs = bpf_gen_ld_abs, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { -- cgit v1.2.3 From 6cb5fb3891db9d3f6feb0f7669946550c9abc420 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 May 2018 18:37:10 -0700 Subject: bpf: export bpf_event_output() bpf_event_output() is useful for offloads to add events to BPF event rings, export it. Note that export is placed near the stub since tracing is optional and kernel/bpf/core.c is always going to be built. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Reviewed-by: Jiong Wang Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/bpf/core.c') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1127552c8033..d0d7d9462368 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1719,6 +1719,7 @@ bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, { return -ENOTSUPP; } +EXPORT_SYMBOL_GPL(bpf_event_output); /* Always built-in helper functions. */ const struct bpf_func_proto bpf_tail_call_proto = { -- cgit v1.2.3 From 81110384441a59cff47430f20f049e69b98c17f4 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 14 May 2018 10:00:17 -0700 Subject: bpf: sockmap, add hash map support Sockmap is currently backed by an array and enforces keys to be four bytes. This works well for many use cases and was originally modeled after devmap which also uses four bytes keys. However, this has become limiting in larger use cases where a hash would be more appropriate. For example users may want to use the 5-tuple of the socket as the lookup key. To support this add hash support. Signed-off-by: John Fastabend Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 8 + include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 54 ++++- kernel/bpf/core.c | 1 + kernel/bpf/sockmap.c | 494 ++++++++++++++++++++++++++++++++++++++++++++-- kernel/bpf/verifier.c | 14 +- net/core/filter.c | 58 ++++++ 7 files changed, 611 insertions(+), 19 deletions(-) (limited to 'kernel/bpf/core.c') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a38e474bf7ee..ed0122b45b63 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -668,6 +668,7 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET) struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); +struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key); int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); #else static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) @@ -675,6 +676,12 @@ static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) return NULL; } +static inline struct sock *__sock_hash_lookup_elem(struct bpf_map *map, + void *key) +{ + return NULL; +} + static inline int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) @@ -724,6 +731,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; +extern const struct bpf_func_proto bpf_sock_hash_update_proto; /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index d7df1b323082..b67f8793de0d 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -47,6 +47,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 02e4112510f8..d94d333a8225 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -118,6 +118,7 @@ enum bpf_map_type { BPF_MAP_TYPE_SOCKMAP, BPF_MAP_TYPE_CPUMAP, BPF_MAP_TYPE_XSKMAP, + BPF_MAP_TYPE_SOCKHASH, }; enum bpf_prog_type { @@ -1828,7 +1829,6 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) * Description * Do FIB lookup in kernel tables using parameters in *params*. @@ -1855,6 +1855,53 @@ union bpf_attr { * Egress device index on success, 0 if packet needs to continue * up the stack for further processing or a negative error in case * of failure. + * + * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) + * Description + * Add an entry to, or update a sockhash *map* referencing sockets. + * The *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. + * if the verdeict eBPF program returns **SK_PASS**), redirect it + * to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -1926,7 +1973,10 @@ union bpf_attr { FN(skb_get_xfrm_state), \ FN(get_stack), \ FN(skb_load_bytes_relative), \ - FN(fib_lookup), + FN(fib_lookup), \ + FN(sock_hash_update), \ + FN(msg_redirect_hash), \ + FN(sk_redirect_hash), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d0d7d9462368..2194c6a9df42 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1707,6 +1707,7 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_sock_map_update_proto __weak; +const struct bpf_func_proto bpf_sock_hash_update_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index beab9ec9b023..56879c9fd3a4 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -60,6 +60,28 @@ struct bpf_stab { struct bpf_sock_progs progs; }; +struct bucket { + struct hlist_head head; + raw_spinlock_t lock; +}; + +struct bpf_htab { + struct bpf_map map; + struct bucket *buckets; + atomic_t count; + u32 n_buckets; + u32 elem_size; + struct bpf_sock_progs progs; +}; + +struct htab_elem { + struct rcu_head rcu; + struct hlist_node hash_node; + u32 hash; + struct sock *sk; + char key[0]; +}; + enum smap_psock_state { SMAP_TX_RUNNING, }; @@ -67,6 +89,8 @@ enum smap_psock_state { struct smap_psock_map_entry { struct list_head list; struct sock **entry; + struct htab_elem *hash_link; + struct bpf_htab *htab; }; struct smap_psock { @@ -195,6 +219,12 @@ out: rcu_read_unlock(); } +static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) +{ + atomic_dec(&htab->count); + kfree_rcu(l, rcu); +} + static void bpf_tcp_close(struct sock *sk, long timeout) { void (*close_fun)(struct sock *sk, long timeout); @@ -231,10 +261,16 @@ static void bpf_tcp_close(struct sock *sk, long timeout) } list_for_each_entry_safe(e, tmp, &psock->maps, list) { - osk = cmpxchg(e->entry, sk, NULL); - if (osk == sk) { - list_del(&e->list); - smap_release_sock(psock, sk); + if (e->entry) { + osk = cmpxchg(e->entry, sk, NULL); + if (osk == sk) { + list_del(&e->list); + smap_release_sock(psock, sk); + } + } else { + hlist_del_rcu(&e->hash_link->hash_node); + smap_release_sock(psock, e->hash_link->sk); + free_htab_elem(e->htab, e->hash_link); } } write_unlock_bh(&sk->sk_callback_lock); @@ -1527,12 +1563,14 @@ free_stab: return ERR_PTR(err); } -static void smap_list_remove(struct smap_psock *psock, struct sock **entry) +static void smap_list_remove(struct smap_psock *psock, + struct sock **entry, + struct htab_elem *hash_link) { struct smap_psock_map_entry *e, *tmp; list_for_each_entry_safe(e, tmp, &psock->maps, list) { - if (e->entry == entry) { + if (e->entry == entry || e->hash_link == hash_link) { list_del(&e->list); break; } @@ -1570,7 +1608,7 @@ static void sock_map_free(struct bpf_map *map) * to be null and queued for garbage collection. */ if (likely(psock)) { - smap_list_remove(psock, &stab->sock_map[i]); + smap_list_remove(psock, &stab->sock_map[i], NULL); smap_release_sock(psock, sock); } write_unlock_bh(&sock->sk_callback_lock); @@ -1629,7 +1667,7 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) if (psock->bpf_parse) smap_stop_sock(psock, sock); - smap_list_remove(psock, &stab->sock_map[k]); + smap_list_remove(psock, &stab->sock_map[k], NULL); smap_release_sock(psock, sock); out: write_unlock_bh(&sock->sk_callback_lock); @@ -1746,10 +1784,12 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, new = true; } - e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); - if (!e) { - err = -ENOMEM; - goto out_progs; + if (map_link) { + e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); + if (!e) { + err = -ENOMEM; + goto out_progs; + } } /* 3. At this point we have a reference to a valid psock that is @@ -1783,6 +1823,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, write_unlock_bh(&sock->sk_callback_lock); return err; out_free: + kfree(e); smap_release_sock(psock, sock); out_progs: if (verdict) @@ -1829,7 +1870,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, struct smap_psock *opsock = smap_psock_sk(osock); write_lock_bh(&osock->sk_callback_lock); - smap_list_remove(opsock, &stab->sock_map[i]); + smap_list_remove(opsock, &stab->sock_map[i], NULL); smap_release_sock(opsock, osock); write_unlock_bh(&osock->sk_callback_lock); } @@ -1846,6 +1887,10 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) struct bpf_stab *stab = container_of(map, struct bpf_stab, map); progs = &stab->progs; + } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + + progs = &htab->progs; } else { return -EINVAL; } @@ -1906,11 +1951,19 @@ static int sock_map_update_elem(struct bpf_map *map, static void sock_map_release(struct bpf_map *map) { - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct bpf_sock_progs *progs; struct bpf_prog *orig; - progs = &stab->progs; + if (map->map_type == BPF_MAP_TYPE_SOCKMAP) { + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + progs = &stab->progs; + } else { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + + progs = &htab->progs; + } + orig = xchg(&progs->bpf_parse, NULL); if (orig) bpf_prog_put(orig); @@ -1923,6 +1976,390 @@ static void sock_map_release(struct bpf_map *map) bpf_prog_put(orig); } +static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) +{ + struct bpf_htab *htab; + int i, err; + u64 cost; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->value_size != 4 || + attr->map_flags & ~SOCK_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + + err = bpf_tcp_ulp_register(); + if (err && err != -EEXIST) + return ERR_PTR(err); + + htab = kzalloc(sizeof(*htab), GFP_USER); + if (!htab) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&htab->map, attr); + + htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); + htab->elem_size = sizeof(struct htab_elem) + + round_up(htab->map.key_size, 8); + err = -EINVAL; + if (htab->n_buckets == 0 || + htab->n_buckets > U32_MAX / sizeof(struct bucket)) + goto free_htab; + + cost = (u64) htab->n_buckets * sizeof(struct bucket) + + (u64) htab->elem_size * htab->map.max_entries; + + if (cost >= U32_MAX - PAGE_SIZE) + goto free_htab; + + htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + err = bpf_map_precharge_memlock(htab->map.pages); + if (err) + goto free_htab; + + err = -ENOMEM; + htab->buckets = bpf_map_area_alloc( + htab->n_buckets * sizeof(struct bucket), + htab->map.numa_node); + if (!htab->buckets) + goto free_htab; + + for (i = 0; i < htab->n_buckets; i++) { + INIT_HLIST_HEAD(&htab->buckets[i].head); + raw_spin_lock_init(&htab->buckets[i].lock); + } + + return &htab->map; +free_htab: + kfree(htab); + return ERR_PTR(err); +} + +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &htab->buckets[hash & (htab->n_buckets - 1)]; +} + +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &__select_bucket(htab, hash)->head; +} + +static void sock_hash_free(struct bpf_map *map) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + int i; + + synchronize_rcu(); + + /* At this point no update, lookup or delete operations can happen. + * However, be aware we can still get a socket state event updates, + * and data ready callabacks that reference the psock from sk_user_data + * Also psock worker threads are still in-flight. So smap_release_sock + * will only free the psock after cancel_sync on the worker threads + * and a grace period expire to ensure psock is really safe to remove. + */ + rcu_read_lock(); + for (i = 0; i < htab->n_buckets; i++) { + struct hlist_head *head = select_bucket(htab, i); + struct hlist_node *n; + struct htab_elem *l; + + hlist_for_each_entry_safe(l, n, head, hash_node) { + struct sock *sock = l->sk; + struct smap_psock *psock; + + hlist_del_rcu(&l->hash_node); + write_lock_bh(&sock->sk_callback_lock); + psock = smap_psock_sk(sock); + /* This check handles a racing sock event that can get + * the sk_callback_lock before this case but after xchg + * causing the refcnt to hit zero and sock user data + * (psock) to be null and queued for garbage collection. + */ + if (likely(psock)) { + smap_list_remove(psock, NULL, l); + smap_release_sock(psock, sock); + } + write_unlock_bh(&sock->sk_callback_lock); + kfree(l); + } + } + rcu_read_unlock(); + bpf_map_area_free(htab->buckets); + kfree(htab); +} + +static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, + void *key, u32 key_size, u32 hash, + struct sock *sk, + struct htab_elem *old_elem) +{ + struct htab_elem *l_new; + + if (atomic_inc_return(&htab->count) > htab->map.max_entries) { + if (!old_elem) { + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + } + l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, + htab->map.numa_node); + if (!l_new) + return ERR_PTR(-ENOMEM); + + memcpy(l_new->key, key, key_size); + l_new->sk = sk; + l_new->hash = hash; + return l_new; +} + +static struct htab_elem *lookup_elem_raw(struct hlist_head *head, + u32 hash, void *key, u32 key_size) +{ + struct htab_elem *l; + + hlist_for_each_entry_rcu(l, head, hash_node) { + if (l->hash == hash && !memcmp(&l->key, key, key_size)) + return l; + } + + return NULL; +} + +static inline u32 htab_map_hash(const void *key, u32 key_len) +{ + return jhash(key, key_len, 0); +} + +static int sock_hash_get_next_key(struct bpf_map *map, + void *key, void *next_key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l, *next_l; + struct hlist_head *h; + u32 hash, key_size; + int i = 0; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + if (!key) + goto find_first_elem; + hash = htab_map_hash(key, key_size); + h = select_bucket(htab, hash); + + l = lookup_elem_raw(h, hash, key, key_size); + if (!l) + goto find_first_elem; + next_l = hlist_entry_safe( + rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), + struct htab_elem, hash_node); + if (next_l) { + memcpy(next_key, next_l->key, key_size); + return 0; + } + + /* no more elements in this hash list, go to the next bucket */ + i = hash & (htab->n_buckets - 1); + i++; + +find_first_elem: + /* iterate over buckets */ + for (; i < htab->n_buckets; i++) { + h = select_bucket(htab, i); + + /* pick first element in the bucket */ + next_l = hlist_entry_safe( + rcu_dereference_raw(hlist_first_rcu(h)), + struct htab_elem, hash_node); + if (next_l) { + /* if it's not empty, just return it */ + memcpy(next_key, next_l->key, key_size); + return 0; + } + } + + /* iterated over all buckets and all elements */ + return -ENOENT; +} + +static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, + struct bpf_map *map, + void *key, u64 map_flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct bpf_sock_progs *progs = &htab->progs; + struct htab_elem *l_new = NULL, *l_old; + struct smap_psock_map_entry *e = NULL; + struct hlist_head *head; + struct smap_psock *psock; + u32 key_size, hash; + struct sock *sock; + struct bucket *b; + int err; + + sock = skops->sk; + + if (sock->sk_type != SOCK_STREAM || + sock->sk_protocol != IPPROTO_TCP) + return -EOPNOTSUPP; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + + e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); + if (!e) + return -ENOMEM; + + WARN_ON_ONCE(!rcu_read_lock_held()); + key_size = map->key_size; + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + err = __sock_map_ctx_update_elem(map, progs, sock, NULL, key); + if (err) + goto err; + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_bh(&b->lock); + l_old = lookup_elem_raw(head, hash, key, key_size); + if (l_old && map_flags == BPF_NOEXIST) { + err = -EEXIST; + goto bucket_err; + } + if (!l_old && map_flags == BPF_EXIST) { + err = -ENOENT; + goto bucket_err; + } + + l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old); + if (IS_ERR(l_new)) { + err = PTR_ERR(l_new); + goto bucket_err; + } + + psock = smap_psock_sk(sock); + if (unlikely(!psock)) { + err = -EINVAL; + goto bucket_err; + } + + e->hash_link = l_new; + e->htab = container_of(map, struct bpf_htab, map); + list_add_tail(&e->list, &psock->maps); + + /* add new element to the head of the list, so that + * concurrent search will find it before old elem + */ + hlist_add_head_rcu(&l_new->hash_node, head); + if (l_old) { + psock = smap_psock_sk(l_old->sk); + + hlist_del_rcu(&l_old->hash_node); + smap_list_remove(psock, NULL, l_old); + smap_release_sock(psock, l_old->sk); + free_htab_elem(htab, l_old); + } + raw_spin_unlock_bh(&b->lock); + return 0; +bucket_err: + raw_spin_unlock_bh(&b->lock); +err: + kfree(e); + psock = smap_psock_sk(sock); + if (psock) + smap_release_sock(psock, sock); + return err; +} + +static int sock_hash_update_elem(struct bpf_map *map, + void *key, void *value, u64 flags) +{ + struct bpf_sock_ops_kern skops; + u32 fd = *(u32 *)value; + struct socket *socket; + int err; + + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + skops.sk = socket->sk; + if (!skops.sk) { + fput(socket->file); + return -EINVAL; + } + + err = sock_hash_ctx_update_elem(&skops, map, key, flags); + fput(socket->file); + return err; +} + +static int sock_hash_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct bucket *b; + struct htab_elem *l; + u32 hash, key_size; + int ret = -ENOENT; + + key_size = map->key_size; + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + raw_spin_lock_bh(&b->lock); + l = lookup_elem_raw(head, hash, key, key_size); + if (l) { + struct sock *sock = l->sk; + struct smap_psock *psock; + + hlist_del_rcu(&l->hash_node); + write_lock_bh(&sock->sk_callback_lock); + psock = smap_psock_sk(sock); + /* This check handles a racing sock event that can get the + * sk_callback_lock before this case but after xchg happens + * causing the refcnt to hit zero and sock user data (psock) + * to be null and queued for garbage collection. + */ + if (likely(psock)) { + smap_list_remove(psock, NULL, l); + smap_release_sock(psock, sock); + } + write_unlock_bh(&sock->sk_callback_lock); + free_htab_elem(htab, l); + ret = 0; + } + raw_spin_unlock_bh(&b->lock); + return ret; +} + +struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l; + u32 key_size, hash; + struct bucket *b; + struct sock *sk; + + key_size = map->key_size; + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + raw_spin_lock_bh(&b->lock); + l = lookup_elem_raw(head, hash, key, key_size); + sk = l ? l->sk : NULL; + raw_spin_unlock_bh(&b->lock); + return sk; +} + const struct bpf_map_ops sock_map_ops = { .map_alloc = sock_map_alloc, .map_free = sock_map_free, @@ -1933,6 +2370,15 @@ const struct bpf_map_ops sock_map_ops = { .map_release_uref = sock_map_release, }; +const struct bpf_map_ops sock_hash_ops = { + .map_alloc = sock_hash_alloc, + .map_free = sock_hash_free, + .map_lookup_elem = sock_map_lookup, + .map_get_next_key = sock_hash_get_next_key, + .map_update_elem = sock_hash_update_elem, + .map_delete_elem = sock_hash_delete_elem, +}; + BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, struct bpf_map *, map, void *, key, u64, flags) { @@ -1950,3 +2396,21 @@ const struct bpf_func_proto bpf_sock_map_update_proto = { .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; + +BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock, + struct bpf_map *, map, void *, key, u64, flags) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return sock_hash_ctx_update_elem(bpf_sock, map, key, flags); +} + +const struct bpf_func_proto bpf_sock_hash_update_proto = { + .func = bpf_sock_hash_update, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d92d9c37affd..a9e4b1372da6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2093,6 +2093,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_msg_redirect_map) goto error; break; + case BPF_MAP_TYPE_SOCKHASH: + if (func_id != BPF_FUNC_sk_redirect_hash && + func_id != BPF_FUNC_sock_hash_update && + func_id != BPF_FUNC_map_delete_elem && + func_id != BPF_FUNC_msg_redirect_hash) + goto error; + break; default: break; } @@ -2130,11 +2137,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, break; case BPF_FUNC_sk_redirect_map: case BPF_FUNC_msg_redirect_map: + case BPF_FUNC_sock_map_update: if (map->map_type != BPF_MAP_TYPE_SOCKMAP) goto error; break; - case BPF_FUNC_sock_map_update: - if (map->map_type != BPF_MAP_TYPE_SOCKMAP) + case BPF_FUNC_sk_redirect_hash: + case BPF_FUNC_msg_redirect_hash: + case BPF_FUNC_sock_hash_update: + if (map->map_type != BPF_MAP_TYPE_SOCKHASH) goto error; break; default: diff --git a/net/core/filter.c b/net/core/filter.c index 61a3ed6bac25..6d0d1560bd70 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2074,6 +2074,33 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, + struct bpf_map *, map, void *, key, u64, flags) +{ + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + /* If user passes invalid input drop the packet. */ + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + + tcb->bpf.flags = flags; + tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key); + if (!tcb->bpf.sk_redir) + return SK_DROP; + + return SK_PASS; +} + +static const struct bpf_func_proto bpf_sk_redirect_hash_proto = { + .func = bpf_sk_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { @@ -2108,6 +2135,31 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg, + struct bpf_map *, map, void *, key, u64, flags) +{ + /* If user passes invalid input drop the packet. */ + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + + msg->flags = flags; + msg->sk_redir = __sock_hash_lookup_elem(map, key); + if (!msg->sk_redir) + return SK_DROP; + + return SK_PASS; +} + +static const struct bpf_func_proto bpf_msg_redirect_hash_proto = { + .func = bpf_msg_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, struct bpf_map *, map, u32, key, u64, flags) { @@ -4502,6 +4554,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; + case BPF_FUNC_sock_hash_update: + return &bpf_sock_hash_update_proto; default: return bpf_base_func_proto(func_id); } @@ -4513,6 +4567,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_msg_redirect_map: return &bpf_msg_redirect_map_proto; + case BPF_FUNC_msg_redirect_hash: + return &bpf_msg_redirect_hash_proto; case BPF_FUNC_msg_apply_bytes: return &bpf_msg_apply_bytes_proto; case BPF_FUNC_msg_cork_bytes: @@ -4544,6 +4600,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_uid_proto; case BPF_FUNC_sk_redirect_map: return &bpf_sk_redirect_map_proto; + case BPF_FUNC_sk_redirect_hash: + return &bpf_sk_redirect_hash_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3 From 170a7e3ea0709eae12c8f944b9f33c54fe80c6c1 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 27 May 2018 12:24:08 +0100 Subject: bpf: bpf_prog_array_copy() should return -ENOENT if exclude_prog not found This makes is it possible for bpf prog detach to return -ENOENT. Acked-by: Yonghong Song Signed-off-by: Sean Young Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 11 +++++++++-- kernel/trace/bpf_trace.c | 2 ++ 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel/bpf/core.c') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b574dddc05b8..527587de8a67 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1616,6 +1616,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, int new_prog_cnt, carry_prog_cnt = 0; struct bpf_prog **existing_prog; struct bpf_prog_array *array; + bool found_exclude = false; int new_prog_idx = 0; /* Figure out how many existing progs we need to carry over to @@ -1624,14 +1625,20 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, if (old_array) { existing_prog = old_array->progs; for (; *existing_prog; existing_prog++) { - if (*existing_prog != exclude_prog && - *existing_prog != &dummy_bpf_prog.prog) + if (*existing_prog == exclude_prog) { + found_exclude = true; + continue; + } + if (*existing_prog != &dummy_bpf_prog.prog) carry_prog_cnt++; if (*existing_prog == include_prog) return -EEXIST; } } + if (exclude_prog && !found_exclude) + return -ENOENT; + /* How many progs (not NULL) will be in the new array? */ new_prog_cnt = carry_prog_cnt; if (include_prog) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 81fdf2fc94ac..af1486d9a0ed 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1006,6 +1006,8 @@ void perf_event_detach_bpf_prog(struct perf_event *event) old_array = event->tp_event->prog_array; ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); + if (ret == -ENOENT) + goto unlock; if (ret < 0) { bpf_prog_array_delete_safe(old_array, event->prog); } else { -- cgit v1.2.3 From bf6fa2c893c5237b48569a13fa3c673041430b6c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 3 Jun 2018 15:59:41 -0700 Subject: bpf: implement bpf_get_current_cgroup_id() helper bpf has been used extensively for tracing. For example, bcc contains an almost full set of bpf-based tools to trace kernel and user functions/events. Most tracing tools are currently either filtered based on pid or system-wide. Containers have been used quite extensively in industry and cgroup is often used together to provide resource isolation and protection. Several processes may run inside the same container. It is often desirable to get container-level tracing results as well, e.g. syscall count, function count, I/O activity, etc. This patch implements a new helper, bpf_get_current_cgroup_id(), which will return cgroup id based on the cgroup within which the current task is running. The later patch will provide an example to show that userspace can get the same cgroup id so it could configure a filter or policy in the bpf program based on task cgroup id. The helper is currently implemented for tracing. It can be added to other program types as well when needed. Acked-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 8 +++++++- kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 15 +++++++++++++++ kernel/trace/bpf_trace.c | 2 ++ 5 files changed, 26 insertions(+), 1 deletion(-) (limited to 'kernel/bpf/core.c') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bbe297436e5d..995c3b1e59bf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -746,6 +746,7 @@ extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; +extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f0b6608b1f1c..18712b0dbfe7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2070,6 +2070,11 @@ union bpf_attr { * **CONFIG_SOCK_CGROUP_DATA** configuration option. * Return * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_get_current_cgroup_id(void) + * Return + * A 64-bit integer containing the current cgroup id based + * on the cgroup within which the current task is running. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2151,7 +2156,8 @@ union bpf_attr { FN(lwt_seg6_action), \ FN(rc_repeat), \ FN(rc_keydown), \ - FN(skb_cgroup_id), + FN(skb_cgroup_id), \ + FN(get_current_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 527587de8a67..9f1493705f40 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1765,6 +1765,7 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_sock_map_update_proto __weak; const struct bpf_func_proto bpf_sock_hash_update_proto __weak; +const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 3d24e238221e..73065e2d23c2 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -179,3 +179,18 @@ const struct bpf_func_proto bpf_get_current_comm_proto = { .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE, }; + +#ifdef CONFIG_CGROUPS +BPF_CALL_0(bpf_get_current_cgroup_id) +{ + struct cgroup *cgrp = task_dfl_cgroup(current); + + return cgrp->kn->id.id; +} + +const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { + .func = bpf_get_current_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; +#endif diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 752992ce3513..e2ab5b7f29d2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -564,6 +564,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_prandom_u32_proto; case BPF_FUNC_probe_read_str: return &bpf_probe_read_str_proto; + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; default: return NULL; } -- cgit v1.2.3