Diffstat (limited to 'kernel')
-rw-r--r-- kernel/bpf/cpumap.c      | 113
-rw-r--r-- kernel/bpf/helpers.c     |   8
-rw-r--r-- kernel/bpf/syscall.c     | 135
-rw-r--r-- kernel/bpf/verifier.c    |  94
-rw-r--r-- kernel/trace/bpf_trace.c | 342
5 files changed, 495 insertions(+), 197 deletions(-)
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index ef28c64f1eb1..e42a1bdb7f53 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -68,11 +68,8 @@ struct bpf_cpu_map_entry {
struct bpf_cpumap_val value;
struct bpf_prog *prog;
- atomic_t refcnt; /* Control when this struct can be free'ed */
- struct rcu_head rcu;
-
- struct work_struct kthread_stop_wq;
struct completion kthread_running;
+ struct rcu_work free_work;
};
struct bpf_cpu_map {
@@ -117,11 +114,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
return &cmap->map;
}
-static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
-{
- atomic_inc(&rcpu->refcnt);
-}
-
static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
{
/* The tear-down procedure should have made sure that queue is
@@ -142,35 +134,6 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
}
}
-static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
-{
- if (atomic_dec_and_test(&rcpu->refcnt)) {
- if (rcpu->prog)
- bpf_prog_put(rcpu->prog);
- /* The queue should be empty at this point */
- __cpu_map_ring_cleanup(rcpu->queue);
- ptr_ring_cleanup(rcpu->queue, NULL);
- kfree(rcpu->queue);
- kfree(rcpu);
- }
-}
-
-/* called from workqueue, to workaround syscall using preempt_disable */
-static void cpu_map_kthread_stop(struct work_struct *work)
-{
- struct bpf_cpu_map_entry *rcpu;
-
- rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
-
- /* Wait for flush in __cpu_map_entry_free(), via full RCU barrier,
- * as it waits until all in-flight call_rcu() callbacks complete.
- */
- rcu_barrier();
-
- /* kthread_stop will wake_up_process and wait for it to complete */
- kthread_stop(rcpu->kthread);
-}
-
static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
struct list_head *listp,
struct xdp_cpumap_stats *stats)
@@ -395,7 +358,6 @@ static int cpu_map_kthread_run(void *data)
}
__set_current_state(TASK_RUNNING);
- put_cpu_map_entry(rcpu);
return 0;
}
@@ -472,9 +434,6 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
if (IS_ERR(rcpu->kthread))
goto free_prog;
- get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
- get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
-
/* Make sure kthread runs on a single CPU */
kthread_bind(rcpu->kthread, cpu);
wake_up_process(rcpu->kthread);
@@ -501,40 +460,40 @@ free_rcu:
return NULL;
}
-static void __cpu_map_entry_free(struct rcu_head *rcu)
+static void __cpu_map_entry_free(struct work_struct *work)
{
struct bpf_cpu_map_entry *rcpu;
/* This cpu_map_entry have been disconnected from map and one
- * RCU grace-period have elapsed. Thus, XDP cannot queue any
+ * RCU grace-period has elapsed. Thus, XDP cannot queue any
* new packets and cannot change/set flush_needed that can
* find this entry.
*/
- rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
+ rcpu = container_of(to_rcu_work(work), struct bpf_cpu_map_entry, free_work);
+ /* kthread_stop will wake_up_process and wait for it to complete.
+ * cpu_map_kthread_run() makes sure the pointer ring is empty
+ * before exiting.
+ */
+ kthread_stop(rcpu->kthread);
+
+ if (rcpu->prog)
+ bpf_prog_put(rcpu->prog);
+ /* The queue should be empty at this point */
+ __cpu_map_ring_cleanup(rcpu->queue);
+ ptr_ring_cleanup(rcpu->queue, NULL);
+ kfree(rcpu->queue);
free_percpu(rcpu->bulkq);
- /* Cannot kthread_stop() here, last put free rcpu resources */
- put_cpu_map_entry(rcpu);
+ kfree(rcpu);
}
-/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to
- * ensure any driver rcu critical sections have completed, but this
- * does not guarantee a flush has happened yet. Because driver side
- * rcu_read_lock/unlock only protects the running XDP program. The
- * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a
- * pending flush op doesn't fail.
- *
- * The bpf_cpu_map_entry is still used by the kthread, and there can
- * still be pending packets (in queue and percpu bulkq). A refcnt
- * makes sure to last user (kthread_stop vs. call_rcu) free memory
- * resources.
- *
- * The rcu callback __cpu_map_entry_free flush remaining packets in
- * percpu bulkq to queue. Due to caller map_delete_elem() disable
- * preemption, cannot call kthread_stop() to make sure queue is empty.
- * Instead a work_queue is started for stopping kthread,
- * cpu_map_kthread_stop, which waits for an RCU grace period before
- * stopping kthread, emptying the queue.
+/* After the xchg of the bpf_cpu_map_entry pointer, we need to make sure the old
+ * entry is no longer in use before freeing. We use queue_rcu_work() to call
+ * __cpu_map_entry_free() in a separate workqueue after waiting for an RCU grace
+ * period. This means that (a) all pending enqueue and flush operations have
+ * completed (because of the RCU callback), and (b) we are in a workqueue
+ * context where we can stop the kthread and wait for it to exit before freeing
+ * everything.
*/
static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
@@ -543,9 +502,8 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu)));
if (old_rcpu) {
- call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
- INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
- schedule_work(&old_rcpu->kthread_stop_wq);
+ INIT_RCU_WORK(&old_rcpu->free_work, __cpu_map_entry_free);
+ queue_rcu_work(system_wq, &old_rcpu->free_work);
}
}
@@ -557,7 +515,7 @@ static long cpu_map_delete_elem(struct bpf_map *map, void *key)
if (key_cpu >= map->max_entries)
return -EINVAL;
- /* notice caller map_delete_elem() use preempt_disable() */
+ /* notice caller map_delete_elem() uses rcu_read_lock() */
__cpu_map_entry_replace(cmap, key_cpu, NULL);
return 0;
}
@@ -608,16 +566,15 @@ static void cpu_map_free(struct bpf_map *map)
/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
* so the bpf programs (can be more than one that used this map) were
* disconnected from events. Wait for outstanding critical sections in
- * these programs to complete. The rcu critical section only guarantees
- * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map.
- * It does __not__ ensure pending flush operations (if any) are
- * complete.
+ * these programs to complete. synchronize_rcu() below not only
+ * guarantees no further "XDP/bpf-side" reads against
+ * bpf_cpu_map->cpu_map, but also ensures pending flush operations
+ * (if any) are completed.
*/
-
synchronize_rcu();
- /* For cpu_map the remote CPUs can still be using the entries
- * (struct bpf_cpu_map_entry).
+ /* The only possible user of bpf_cpu_map_entry is
+ * cpu_map_kthread_run().
*/
for (i = 0; i < cmap->map.max_entries; i++) {
struct bpf_cpu_map_entry *rcpu;
@@ -626,8 +583,8 @@ static void cpu_map_free(struct bpf_map *map)
if (!rcpu)
continue;
- /* bq flush and cleanup happens after RCU grace-period */
- __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
+ /* Stop kthread and cleanup entry directly */
+ __cpu_map_entry_free(&rcpu->free_work.work);
}
bpf_map_area_free(cmap->cpu_map);
bpf_map_area_free(cmap);
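
The conversion above is an instance of a generic pattern: queue_rcu_work() runs a work item only after a full RCU grace period has elapsed, so the callback executes in process context where it may sleep (here: kthread_stop()) once no RCU reader can still see the entry. A minimal self-contained sketch of the same pattern (type and function names are illustrative, not part of this patch):

#include <linux/slab.h>
#include <linux/workqueue.h>

struct demo_entry {
	struct rcu_work free_work;	/* a work_struct plus an rcu_head */
	void *payload;
};

/* Runs in workqueue context at least one RCU grace period after
 * queue_rcu_work(); sleeping is allowed here, unlike in a call_rcu()
 * callback.
 */
static void demo_entry_free(struct work_struct *work)
{
	struct demo_entry *e = container_of(to_rcu_work(work),
					    struct demo_entry, free_work);

	kfree(e->payload);
	kfree(e);
}

static void demo_entry_unpublish(struct demo_entry *e)
{
	/* The caller must already have removed all RCU-visible pointers to @e. */
	INIT_RCU_WORK(&e->free_work, demo_entry_free);
	queue_rcu_work(system_wq, &e->free_work);
}

This is why both the refcount and the separate kthread_stop_wq work item can go away: a single rcu_work waits out the grace period and provides the sleepable context in one step.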
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index eb91cae0612a..8bd3812fb8df 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -286,6 +286,7 @@ static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0");
BUILD_BUG_ON(sizeof(*l) != sizeof(__u32));
BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32));
+ preempt_disable();
arch_spin_lock(l);
}
@@ -294,6 +295,7 @@ static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
arch_spinlock_t *l = (void *)lock;
arch_spin_unlock(l);
+ preempt_enable();
}
#else
@@ -1913,7 +1915,11 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
if (rec)
bpf_obj_free_fields(rec, p);
- bpf_mem_free(&bpf_global_ma, p);
+
+ if (rec && rec->refcount_off >= 0)
+ bpf_mem_free_rcu(&bpf_global_ma, p);
+ else
+ bpf_mem_free(&bpf_global_ma, p);
}
__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
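
For context, the functions hardened above back the bpf_spin_lock()/bpf_spin_unlock() BPF helpers: arch_spin_lock() is the raw architecture lock with no preemption handling of its own, so a program preempted inside the critical section could deadlock a waiter spinning on the same CPU. A minimal BPF-side user might look like the sketch below (map and program names are illustrative); the critical section must stay short and cannot call sleeping helpers:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct val {
	struct bpf_spin_lock lock;
	long counter;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, struct val);
} counters SEC(".maps");

SEC("xdp")
int count_packets(struct xdp_md *ctx)
{
	__u32 key = 0;
	struct val *v;

	v = bpf_map_lookup_elem(&counters, &key);
	if (!v)
		return XDP_PASS;

	/* Short, non-sleeping critical section; with this patch the
	 * kernel side also runs it with preemption disabled.
	 */
	bpf_spin_lock(&v->lock);
	v->counter++;
	bpf_spin_unlock(&v->lock);

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";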
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cb658543bdb4..ebeb0695305a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -657,7 +657,6 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
if (!btf_is_kernel(field->kptr.btf)) {
pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
field->kptr.btf_id);
- WARN_ON_ONCE(!pointee_struct_meta);
migrate_disable();
__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
pointee_struct_meta->record :
@@ -2815,10 +2814,12 @@ static void bpf_link_free_id(int id)
/* Clean up bpf_link and corresponding anon_inode file and FD. After
* anon_inode is created, bpf_link can't be just kfree()'d due to deferred
- * anon_inode's release() call. This helper marksbpf_link as
+ * anon_inode's release() call. This helper marks bpf_link as
* defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt
* is not decremented, it's the responsibility of a calling code that failed
* to complete bpf_link initialization.
+ * This helper eventually calls link's dealloc callback, but does not call
+ * link's release callback.
*/
void bpf_link_cleanup(struct bpf_link_primer *primer)
{
@@ -3655,34 +3656,6 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
return fd;
}
-static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
- enum bpf_attach_type attach_type)
-{
- switch (prog->type) {
- case BPF_PROG_TYPE_CGROUP_SOCK:
- case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
- case BPF_PROG_TYPE_CGROUP_SOCKOPT:
- case BPF_PROG_TYPE_SK_LOOKUP:
- return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
- case BPF_PROG_TYPE_CGROUP_SKB:
- if (!capable(CAP_NET_ADMIN))
- /* cg-skb progs can be loaded by unpriv user.
- * check permissions at attach time.
- */
- return -EPERM;
- return prog->enforce_expected_attach_type &&
- prog->expected_attach_type != attach_type ?
- -EINVAL : 0;
- case BPF_PROG_TYPE_KPROBE:
- if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI &&
- attach_type != BPF_TRACE_KPROBE_MULTI)
- return -EINVAL;
- return 0;
- default:
- return 0;
- }
-}
-
static enum bpf_prog_type
attach_type_to_prog_type(enum bpf_attach_type attach_type)
{
@@ -3749,6 +3722,62 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
}
}
+static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
+ enum bpf_attach_type attach_type)
+{
+ enum bpf_prog_type ptype;
+
+ switch (prog->type) {
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ case BPF_PROG_TYPE_SK_LOOKUP:
+ return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ if (!capable(CAP_NET_ADMIN))
+ /* cg-skb progs can be loaded by unpriv user.
+ * check permissions at attach time.
+ */
+ return -EPERM;
+ return prog->enforce_expected_attach_type &&
+ prog->expected_attach_type != attach_type ?
+ -EINVAL : 0;
+ case BPF_PROG_TYPE_EXT:
+ return 0;
+ case BPF_PROG_TYPE_NETFILTER:
+ if (attach_type != BPF_NETFILTER)
+ return -EINVAL;
+ return 0;
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ if (attach_type != BPF_PERF_EVENT)
+ return -EINVAL;
+ return 0;
+ case BPF_PROG_TYPE_KPROBE:
+ if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI &&
+ attach_type != BPF_TRACE_KPROBE_MULTI)
+ return -EINVAL;
+ if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI &&
+ attach_type != BPF_TRACE_UPROBE_MULTI)
+ return -EINVAL;
+ if (attach_type != BPF_PERF_EVENT &&
+ attach_type != BPF_TRACE_KPROBE_MULTI &&
+ attach_type != BPF_TRACE_UPROBE_MULTI)
+ return -EINVAL;
+ return 0;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ if (attach_type != BPF_TCX_INGRESS &&
+ attach_type != BPF_TCX_EGRESS)
+ return -EINVAL;
+ return 0;
+ default:
+ ptype = attach_type_to_prog_type(attach_type);
+ if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type)
+ return -EINVAL;
+ return 0;
+ }
+}
+
#define BPF_PROG_ATTACH_LAST_FIELD expected_revision
#define BPF_F_ATTACH_MASK_BASE \
@@ -4852,10 +4881,9 @@ err_put:
return err;
}
-#define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies
+#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid
static int link_create(union bpf_attr *attr, bpfptr_t uattr)
{
- enum bpf_prog_type ptype;
struct bpf_prog *prog;
int ret;
@@ -4875,45 +4903,6 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
goto out;
switch (prog->type) {
- case BPF_PROG_TYPE_EXT:
- break;
- case BPF_PROG_TYPE_NETFILTER:
- if (attr->link_create.attach_type != BPF_NETFILTER) {
- ret = -EINVAL;
- goto out;
- }
- break;
- case BPF_PROG_TYPE_PERF_EVENT:
- case BPF_PROG_TYPE_TRACEPOINT:
- if (attr->link_create.attach_type != BPF_PERF_EVENT) {
- ret = -EINVAL;
- goto out;
- }
- break;
- case BPF_PROG_TYPE_KPROBE:
- if (attr->link_create.attach_type != BPF_PERF_EVENT &&
- attr->link_create.attach_type != BPF_TRACE_KPROBE_MULTI) {
- ret = -EINVAL;
- goto out;
- }
- break;
- case BPF_PROG_TYPE_SCHED_CLS:
- if (attr->link_create.attach_type != BPF_TCX_INGRESS &&
- attr->link_create.attach_type != BPF_TCX_EGRESS) {
- ret = -EINVAL;
- goto out;
- }
- break;
- default:
- ptype = attach_type_to_prog_type(attr->link_create.attach_type);
- if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) {
- ret = -EINVAL;
- goto out;
- }
- break;
- }
-
- switch (prog->type) {
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
@@ -4969,8 +4958,10 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
case BPF_PROG_TYPE_KPROBE:
if (attr->link_create.attach_type == BPF_PERF_EVENT)
ret = bpf_perf_link_attach(attr, prog);
- else
+ else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI)
ret = bpf_kprobe_multi_link_attach(attr, prog);
+ else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI)
+ ret = bpf_uprobe_multi_link_attach(attr, prog);
break;
default:
ret = -EINVAL;
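
From userspace, the new attach type is reached through the BPF_LINK_CREATE command. A hedged sketch of a raw-syscall attach follows (it assumes a uapi linux/bpf.h that already carries BPF_TRACE_UPROBE_MULTI and the link_create.uprobe_multi fields added by this series; error handling is elided):

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int uprobe_multi_link_create(int prog_fd, const char *path,
				    const unsigned long *offsets,
				    const __u64 *cookies, __u32 cnt, pid_t pid)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.link_create.prog_fd = prog_fd;
	attr.link_create.attach_type = BPF_TRACE_UPROBE_MULTI;
	attr.link_create.uprobe_multi.path = (__u64)(unsigned long)path;
	attr.link_create.uprobe_multi.offsets = (__u64)(unsigned long)offsets;
	attr.link_create.uprobe_multi.cookies = (__u64)(unsigned long)cookies;
	attr.link_create.uprobe_multi.cnt = cnt;
	attr.link_create.uprobe_multi.pid = pid;	/* 0 means no task filter */
	/* attr.link_create.uprobe_multi.flags could carry
	 * BPF_F_UPROBE_MULTI_RETURN for return probes.
	 */

	return syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
}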
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4ccca1f6c998..bb78212fa5b2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4990,20 +4990,22 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
struct bpf_reg_state *reg, u32 regno)
{
const char *targ_name = btf_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
- int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
+ int perm_flags;
const char *reg_name = "";
- /* Only unreferenced case accepts untrusted pointers */
- if (kptr_field->type == BPF_KPTR_UNREF)
- perm_flags |= PTR_UNTRUSTED;
+ if (btf_is_kernel(reg->btf)) {
+ perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
+
+ /* Only unreferenced case accepts untrusted pointers */
+ if (kptr_field->type == BPF_KPTR_UNREF)
+ perm_flags |= PTR_UNTRUSTED;
+ } else {
+ perm_flags = PTR_MAYBE_NULL | MEM_ALLOC;
+ }
if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
goto bad_type;
- if (!btf_is_kernel(reg->btf)) {
- verbose(env, "R%d must point to kernel BTF\n", regno);
- return -EINVAL;
- }
/* We need to verify reg->type and reg->btf, before accessing reg->btf */
reg_name = btf_type_name(reg->btf, reg->btf_id);
@@ -5016,7 +5018,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
if (__check_ptr_off_reg(env, reg, regno, true))
return -EACCES;
- /* A full type match is needed, as BTF can be vmlinux or module BTF, and
+ /* A full type match is needed, as BTF can be vmlinux, module or prog BTF, and
* we also need to take into account the reg->off.
*
* We want to support cases like:
@@ -5062,7 +5064,9 @@ bad_type:
*/
static bool in_rcu_cs(struct bpf_verifier_env *env)
{
- return env->cur_state->active_rcu_lock || !env->prog->aux->sleepable;
+ return env->cur_state->active_rcu_lock ||
+ env->cur_state->active_lock.ptr ||
+ !env->prog->aux->sleepable;
}
/* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */
@@ -7916,7 +7920,10 @@ found:
verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
return -EFAULT;
}
- /* Handled by helper specific checks */
+ if (meta->func_id == BPF_FUNC_kptr_xchg) {
+ if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
+ return -EACCES;
+ }
break;
case PTR_TO_BTF_ID | MEM_PERCPU:
case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED:
@@ -7968,17 +7975,6 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK)
return 0;
- if ((type_is_ptr_alloc_obj(type) || type_is_non_owning_ref(type)) && reg->off) {
- if (reg_find_field_offset(reg, reg->off, BPF_GRAPH_NODE_OR_ROOT))
- return __check_ptr_off_reg(env, reg, regno, true);
-
- verbose(env, "R%d must have zero offset when passed to release func\n",
- regno);
- verbose(env, "No graph node or root found at R%d type:%s off:%d\n", regno,
- btf_type_name(reg->btf, reg->btf_id), reg->off);
- return -EINVAL;
- }
-
/* Doing check_ptr_off_reg check for the offset will catch this
* because fixed_off_ok is false, but checking here allows us
* to give the user a better error message.
@@ -8013,6 +8009,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
case PTR_TO_BTF_ID | PTR_TRUSTED:
case PTR_TO_BTF_ID | MEM_RCU:
case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
+ case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
/* When referenced PTR_TO_BTF_ID is passed to release function,
* its fixed offset must be 0. In the other cases, fixed offset
* can be non-zero. This was already checked above. So pass
@@ -10479,6 +10476,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
struct bpf_verifier_state *state = env->cur_state;
+ struct btf_record *rec = reg_btf_record(reg);
if (!state->active_lock.ptr) {
verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
@@ -10491,6 +10489,9 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state
}
reg->type |= NON_OWN_REF;
+ if (rec->refcount_off >= 0)
+ reg->type |= MEM_RCU;
+
return 0;
}
@@ -11223,10 +11224,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
verbose(env, "arg#%d doesn't point to a type with bpf_refcount field\n", i);
return -EINVAL;
}
- if (rec->refcount_off >= 0) {
- verbose(env, "bpf_refcount_acquire calls are disabled for now\n");
- return -EINVAL;
- }
+
meta->arg_btf = reg->btf;
meta->arg_btf_id = reg->btf_id;
break;
@@ -11331,6 +11329,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
struct bpf_func_state *state;
struct bpf_reg_state *reg;
+ if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
+ verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
+ return -EACCES;
+ }
+
if (rcu_lock) {
verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
return -EINVAL;
@@ -14047,6 +14050,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return -EINVAL;
}
+ /* check src2 operand */
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ dst_reg = &regs[insn->dst_reg];
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0) {
verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
@@ -14058,12 +14067,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (err)
return err;
- if (is_pointer_value(env, insn->src_reg)) {
+ src_reg = &regs[insn->src_reg];
+ if (!(reg_is_pkt_pointer_any(dst_reg) && reg_is_pkt_pointer_any(src_reg)) &&
+ is_pointer_value(env, insn->src_reg)) {
verbose(env, "R%d pointer comparison prohibited\n",
insn->src_reg);
return -EACCES;
}
- src_reg = &regs[insn->src_reg];
} else {
if (insn->src_reg != BPF_REG_0) {
verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
@@ -14071,12 +14081,6 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
}
}
- /* check src2 operand */
- err = check_reg_arg(env, insn->dst_reg, SRC_OP);
- if (err)
- return err;
-
- dst_reg = &regs[insn->dst_reg];
is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
if (BPF_SRC(insn->code) == BPF_K) {
@@ -16692,7 +16696,8 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
- if (env->cur_state->active_rcu_lock) {
+ if (env->cur_state->active_rcu_lock &&
+ !in_rbtree_lock_required_cb(env)) {
verbose(env, "bpf_rcu_read_unlock is missing\n");
return -EINVAL;
}
@@ -16972,11 +16977,6 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
verbose(env, "tracing progs cannot use bpf_spin_lock yet\n");
return -EINVAL;
}
-
- if (prog->aux->sleepable) {
- verbose(env, "sleepable progs cannot use bpf_spin_lock yet\n");
- return -EINVAL;
- }
}
if (btf_record_has_field(map->record, BPF_TIMER)) {
@@ -18281,6 +18281,13 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
+ if (desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
+ !kptr_struct_meta) {
+ verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n",
+ insn_idx);
+ return -EFAULT;
+ }
+
insn_buf[0] = addr[0];
insn_buf[1] = addr[1];
insn_buf[2] = *insn;
@@ -18288,6 +18295,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
} else if (desc->func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
desc->func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
int struct_meta_reg = BPF_REG_3;
int node_offset_reg = BPF_REG_4;
@@ -18297,6 +18305,12 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
node_offset_reg = BPF_REG_5;
}
+ if (!kptr_struct_meta) {
+ verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n",
+ insn_idx);
+ return -EFAULT;
+ }
+
__fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg,
node_offset_reg, insn, insn_buf, cnt);
} else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
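
Taken together, the verifier changes above (re-enabling bpf_refcount_acquire(), marking non-owning references MEM_RCU, and counting a held bpf_spin_lock as an RCU-like critical section) make programs like the following verifiable again. This is a sketch modeled on the kernel selftests (tools/testing/selftests/bpf/progs/refcounted_kptr.c); bpf_experimental.h and the private()/__contains() conventions come from tools/testing/selftests/bpf and are assumptions here, not part of this diff:

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_experimental.h"	/* bpf_obj_new()/bpf_refcount_acquire() wrappers */

#ifndef container_of
#define container_of(ptr, type, member) \
	((type *)((void *)(ptr) - __builtin_offsetof(type, member)))
#endif
#define private(name) SEC(".bss." #name) __attribute__((aligned(8)))

struct node_data {
	long key;
	struct bpf_refcount ref;
	struct bpf_rb_node rb;
};

private(A) struct bpf_spin_lock glock;
private(A) struct bpf_rb_root groot __contains(node_data, rb);

static bool less(struct bpf_rb_node *a, const struct bpf_rb_node *b)
{
	struct node_data *node_a = container_of(a, struct node_data, rb);
	struct node_data *node_b = container_of(b, struct node_data, rb);

	return node_a->key < node_b->key;
}

SEC("tc")
long add_refcounted_node(void *ctx)
{
	struct node_data *n, *m;

	n = bpf_obj_new(typeof(*n));
	if (!n)
		return 1;

	m = bpf_refcount_acquire(n);	/* take a second owning reference */

	bpf_spin_lock(&glock);
	/* bpf_rbtree_add() turns @n into a non-owning reference; with this
	 * series it is also MEM_RCU, i.e. usable until the lock is dropped.
	 */
	bpf_rbtree_add(&groot, &n->rb, less);
	bpf_spin_unlock(&glock);

	bpf_obj_drop(m);	/* release the extra reference */
	return 0;
}

char _license[] SEC("license") = "GPL";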
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 792445e1f3f0..a7264b2c17ad 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -23,6 +23,7 @@
#include <linux/sort.h>
#include <linux/key.h>
#include <linux/verification.h>
+#include <linux/namei.h>
#include <net/bpf_sk_storage.h>
@@ -86,6 +87,9 @@ static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size,
static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx);
static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx);
+static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx);
+static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx);
+
/**
* trace_call_bpf - invoke BPF program
* @call: tracepoint event
@@ -1103,6 +1107,30 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_kmulti = {
.arg1_type = ARG_PTR_TO_CTX,
};
+BPF_CALL_1(bpf_get_func_ip_uprobe_multi, struct pt_regs *, regs)
+{
+ return bpf_uprobe_multi_entry_ip(current->bpf_ctx);
+}
+
+static const struct bpf_func_proto bpf_get_func_ip_proto_uprobe_multi = {
+ .func = bpf_get_func_ip_uprobe_multi,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_attach_cookie_uprobe_multi, struct pt_regs *, regs)
+{
+ return bpf_uprobe_multi_cookie(current->bpf_ctx);
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto_umulti = {
+ .func = bpf_get_attach_cookie_uprobe_multi,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx)
{
struct bpf_trace_run_ctx *run_ctx;
@@ -1545,13 +1573,17 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_override_return_proto;
#endif
case BPF_FUNC_get_func_ip:
- return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ?
- &bpf_get_func_ip_proto_kprobe_multi :
- &bpf_get_func_ip_proto_kprobe;
+ if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI)
+ return &bpf_get_func_ip_proto_kprobe_multi;
+ if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI)
+ return &bpf_get_func_ip_proto_uprobe_multi;
+ return &bpf_get_func_ip_proto_kprobe;
case BPF_FUNC_get_attach_cookie:
- return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ?
- &bpf_get_attach_cookie_proto_kmulti :
- &bpf_get_attach_cookie_proto_trace;
+ if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI)
+ return &bpf_get_attach_cookie_proto_kmulti;
+ if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI)
+ return &bpf_get_attach_cookie_proto_umulti;
+ return &bpf_get_attach_cookie_proto_trace;
default:
return bpf_tracing_func_proto(func_id, prog);
}
@@ -2970,3 +3002,301 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
return 0;
}
#endif
+
+#ifdef CONFIG_UPROBES
+struct bpf_uprobe_multi_link;
+
+struct bpf_uprobe {
+ struct bpf_uprobe_multi_link *link;
+ loff_t offset;
+ u64 cookie;
+ struct uprobe_consumer consumer;
+};
+
+struct bpf_uprobe_multi_link {
+ struct path path;
+ struct bpf_link link;
+ u32 cnt;
+ struct bpf_uprobe *uprobes;
+ struct task_struct *task;
+};
+
+struct bpf_uprobe_multi_run_ctx {
+ struct bpf_run_ctx run_ctx;
+ unsigned long entry_ip;
+ struct bpf_uprobe *uprobe;
+};
+
+static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes,
+ u32 cnt)
+{
+ u32 i;
+
+ for (i = 0; i < cnt; i++) {
+ uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset,
+ &uprobes[i].consumer);
+ }
+}
+
+static void bpf_uprobe_multi_link_release(struct bpf_link *link)
+{
+ struct bpf_uprobe_multi_link *umulti_link;
+
+ umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
+ bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt);
+}
+
+static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_uprobe_multi_link *umulti_link;
+
+ umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
+ if (umulti_link->task)
+ put_task_struct(umulti_link->task);
+ path_put(&umulti_link->path);
+ kvfree(umulti_link->uprobes);
+ kfree(umulti_link);
+}
+
+static const struct bpf_link_ops bpf_uprobe_multi_link_lops = {
+ .release = bpf_uprobe_multi_link_release,
+ .dealloc = bpf_uprobe_multi_link_dealloc,
+};
+
+static int uprobe_prog_run(struct bpf_uprobe *uprobe,
+ unsigned long entry_ip,
+ struct pt_regs *regs)
+{
+ struct bpf_uprobe_multi_link *link = uprobe->link;
+ struct bpf_uprobe_multi_run_ctx run_ctx = {
+ .entry_ip = entry_ip,
+ .uprobe = uprobe,
+ };
+ struct bpf_prog *prog = link->link.prog;
+ bool sleepable = prog->aux->sleepable;
+ struct bpf_run_ctx *old_run_ctx;
+ int err = 0;
+
+ if (link->task && current != link->task)
+ return 0;
+
+ if (sleepable)
+ rcu_read_lock_trace();
+ else
+ rcu_read_lock();
+
+ migrate_disable();
+
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+ err = bpf_prog_run(link->link.prog, regs);
+ bpf_reset_run_ctx(old_run_ctx);
+
+ migrate_enable();
+
+ if (sleepable)
+ rcu_read_unlock_trace();
+ else
+ rcu_read_unlock();
+ return err;
+}
+
+static bool
+uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx ctx,
+ struct mm_struct *mm)
+{
+ struct bpf_uprobe *uprobe;
+
+ uprobe = container_of(con, struct bpf_uprobe, consumer);
+ return uprobe->link->task->mm == mm;
+}
+
+static int
+uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs)
+{
+ struct bpf_uprobe *uprobe;
+
+ uprobe = container_of(con, struct bpf_uprobe, consumer);
+ return uprobe_prog_run(uprobe, instruction_pointer(regs), regs);
+}
+
+static int
+uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs)
+{
+ struct bpf_uprobe *uprobe;
+
+ uprobe = container_of(con, struct bpf_uprobe, consumer);
+ return uprobe_prog_run(uprobe, func, regs);
+}
+
+static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
+{
+ struct bpf_uprobe_multi_run_ctx *run_ctx;
+
+ run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx);
+ return run_ctx->entry_ip;
+}
+
+static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx)
+{
+ struct bpf_uprobe_multi_run_ctx *run_ctx;
+
+ run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx);
+ return run_ctx->uprobe->cookie;
+}
+
+int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_uprobe_multi_link *link = NULL;
+ unsigned long __user *uref_ctr_offsets;
+ unsigned long *ref_ctr_offsets = NULL;
+ struct bpf_link_primer link_primer;
+ struct bpf_uprobe *uprobes = NULL;
+ struct task_struct *task = NULL;
+ unsigned long __user *uoffsets;
+ u64 __user *ucookies;
+ void __user *upath;
+ u32 flags, cnt, i;
+ struct path path;
+ char *name;
+ pid_t pid;
+ int err;
+
+ /* no support for 32bit archs yet */
+ if (sizeof(u64) != sizeof(void *))
+ return -EOPNOTSUPP;
+
+ if (prog->expected_attach_type != BPF_TRACE_UPROBE_MULTI)
+ return -EINVAL;
+
+ flags = attr->link_create.uprobe_multi.flags;
+ if (flags & ~BPF_F_UPROBE_MULTI_RETURN)
+ return -EINVAL;
+
+ /*
+ * path, offsets and cnt are mandatory,
+ * ref_ctr_offsets and cookies are optional
+ */
+ upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path);
+ uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets);
+ cnt = attr->link_create.uprobe_multi.cnt;
+
+ if (!upath || !uoffsets || !cnt)
+ return -EINVAL;
+
+ uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets);
+ ucookies = u64_to_user_ptr(attr->link_create.uprobe_multi.cookies);
+
+ name = strndup_user(upath, PATH_MAX);
+ if (IS_ERR(name)) {
+ err = PTR_ERR(name);
+ return err;
+ }
+
+ err = kern_path(name, LOOKUP_FOLLOW, &path);
+ kfree(name);
+ if (err)
+ return err;
+
+ if (!d_is_reg(path.dentry)) {
+ err = -EBADF;
+ goto error_path_put;
+ }
+
+ pid = attr->link_create.uprobe_multi.pid;
+ if (pid) {
+ rcu_read_lock();
+ task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
+ rcu_read_unlock();
+ if (!task) {
+ err = -ESRCH;
+ goto error_path_put;
+ }
+ }
+
+ err = -ENOMEM;
+
+ link = kzalloc(sizeof(*link), GFP_KERNEL);
+ uprobes = kvcalloc(cnt, sizeof(*uprobes), GFP_KERNEL);
+
+ if (!uprobes || !link)
+ goto error_free;
+
+ if (uref_ctr_offsets) {
+ ref_ctr_offsets = kvcalloc(cnt, sizeof(*ref_ctr_offsets), GFP_KERNEL);
+ if (!ref_ctr_offsets)
+ goto error_free;
+ }
+
+ for (i = 0; i < cnt; i++) {
+ if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) {
+ err = -EFAULT;
+ goto error_free;
+ }
+ if (uref_ctr_offsets && __get_user(ref_ctr_offsets[i], uref_ctr_offsets + i)) {
+ err = -EFAULT;
+ goto error_free;
+ }
+ if (__get_user(uprobes[i].offset, uoffsets + i)) {
+ err = -EFAULT;
+ goto error_free;
+ }
+
+ uprobes[i].link = link;
+
+ if (flags & BPF_F_UPROBE_MULTI_RETURN)
+ uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler;
+ else
+ uprobes[i].consumer.handler = uprobe_multi_link_handler;
+
+ if (pid)
+ uprobes[i].consumer.filter = uprobe_multi_link_filter;
+ }
+
+ link->cnt = cnt;
+ link->uprobes = uprobes;
+ link->path = path;
+ link->task = task;
+
+ bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI,
+ &bpf_uprobe_multi_link_lops, prog);
+
+ for (i = 0; i < cnt; i++) {
+ err = uprobe_register_refctr(d_real_inode(link->path.dentry),
+ uprobes[i].offset,
+ ref_ctr_offsets ? ref_ctr_offsets[i] : 0,
+ &uprobes[i].consumer);
+ if (err) {
+ bpf_uprobe_unregister(&path, uprobes, i);
+ goto error_free;
+ }
+ }
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err)
+ goto error_free;
+
+ kvfree(ref_ctr_offsets);
+ return bpf_link_settle(&link_primer);
+
+error_free:
+ kvfree(ref_ctr_offsets);
+ kvfree(uprobes);
+ kfree(link);
+ if (task)
+ put_task_struct(task);
+error_path_put:
+ path_put(&path);
+ return err;
+}
+#else /* !CONFIG_UPROBES */
+int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ return -EOPNOTSUPP;
+}
+static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx)
+{
+ return 0;
+}
+static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
+{
+ return 0;
+}
+#endif /* CONFIG_UPROBES */
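
On the BPF side, the new proto lookups above mean bpf_get_func_ip() and bpf_get_attach_cookie() now resolve correctly from multi-uprobe programs. A sketch of such a program (the "uprobe.multi" section name assumes a libbpf recent enough to know the new attach type; attachment itself goes through BPF_LINK_CREATE as shown earlier):

#include <linux/ptrace.h>
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("uprobe.multi")
int BPF_UPROBE(trace_entry)
{
	/* Both values are resolved from bpf_uprobe_multi_run_ctx above. */
	__u64 ip = bpf_get_func_ip(ctx);
	__u64 cookie = bpf_get_attach_cookie(ctx);

	bpf_printk("uprobe hit: ip %llx cookie %llu", ip, cookie);
	return 0;
}

char _license[] SEC("license") = "GPL";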