Diffstat (limited to 'kernel/bpf')
-rw-r--r--  kernel/bpf/Kconfig                 2
-rw-r--r--  kernel/bpf/Makefile                3
-rw-r--r--  kernel/bpf/arena.c                30
-rw-r--r--  kernel/bpf/arraymap.c             21
-rw-r--r--  kernel/bpf/bpf_cgrp_storage.c      6
-rw-r--r--  kernel/bpf/bpf_inode_storage.c     6
-rw-r--r--  kernel/bpf/bpf_iter.c              6
-rw-r--r--  kernel/bpf/bpf_lru_list.c         10
-rw-r--r--  kernel/bpf/bpf_struct_ops.c       12
-rw-r--r--  kernel/bpf/bpf_task_storage.c      6
-rw-r--r--  kernel/bpf/btf.c                  99
-rw-r--r--  kernel/bpf/cgroup.c               11
-rw-r--r--  kernel/bpf/core.c                 81
-rw-r--r--  kernel/bpf/cpumap.c                6
-rw-r--r--  kernel/bpf/crypto.c                2
-rw-r--r--  kernel/bpf/devmap.c                2
-rw-r--r--  kernel/bpf/hashtab.c              43
-rw-r--r--  kernel/bpf/helpers.c             628
-rw-r--r--  kernel/bpf/inode.c                 6
-rw-r--r--  kernel/bpf/liveness.c            733
-rw-r--r--  kernel/bpf/local_storage.c         2
-rw-r--r--  kernel/bpf/log.c                  30
-rw-r--r--  kernel/bpf/memalloc.c              2
-rw-r--r--  kernel/bpf/rqspinlock.c            2
-rw-r--r--  kernel/bpf/stackmap.c             20
-rw-r--r--  kernel/bpf/syscall.c             125
-rw-r--r--  kernel/bpf/tnum.c                 63
-rw-r--r--  kernel/bpf/trampoline.c           18
-rw-r--r--  kernel/bpf/verifier.c            876
29 files changed, 2102 insertions, 749 deletions
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index 17067dcb4386..eb3de35734f0 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -3,7 +3,7 @@
# BPF interpreter that, for example, classic socket filters depend on.
config BPF
bool
- select CRYPTO_LIB_SHA1
+ select CRYPTO_LIB_SHA256
# Used by archs to tell that they support BPF JIT compiler plus which
# flavour. Only one of the two can be selected for a specific arch since
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 269c04a24664..7fd0badfacb1 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
endif
CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy)
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
@@ -62,3 +62,4 @@ CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_queue_stack_maps.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_lpm_trie.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_ringbuf.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_rqspinlock.o = $(CC_FLAGS_FTRACE)
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 5b37753799d2..1074ac4459f2 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -633,3 +633,33 @@ static int __init kfunc_init(void)
return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
}
late_initcall(kfunc_init);
+
+void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
+{
+ struct bpf_stream_stage ss;
+ struct bpf_prog *prog;
+ u64 user_vm_start;
+
+ /*
+ * The RCU read lock is held to safely traverse the latch tree, but we
+ * don't need its protection when accessing the prog, since it will not
+ * disappear while we are handling the fault.
+ */
+ rcu_read_lock();
+ prog = bpf_prog_ksym_find(fault_ip);
+ rcu_read_unlock();
+ if (!prog)
+ return;
+
+ /* Use main prog for stream access */
+ prog = prog->aux->main_prog_aux->prog;
+
+ user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena);
+ addr += clear_lo32(user_vm_start);
+
+ bpf_stream_stage(ss, prog, BPF_STDERR, ({
+ bpf_stream_printk(ss, "ERROR: Arena %s access at unmapped address 0x%lx\n",
+ write ? "WRITE" : "READ", addr);
+ bpf_stream_dump_stack(ss);
+ }));
+}
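
A note on the reporting mechanism used above: bpf_stream_stage() batches printk-style messages and a stack dump and emits them to one of the program's streams (BPF_STDERR here), so the arena fault report shows up alongside the program's other runtime errors. As a rough illustration only (not part of the patch, using just the calls visible in the hunk), another reporter built on the same pattern could look like:

static void report_example(struct bpf_prog *prog, unsigned long addr)
{
	struct bpf_stream_stage ss;

	/* Stage a message and a stack dump, then commit to the prog's stderr stream. */
	bpf_stream_stage(ss, prog, BPF_STDERR, ({
		bpf_stream_printk(ss, "ERROR: example condition at 0x%lx\n", addr);
		bpf_stream_dump_stack(ss);
	}));
}
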
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 3d080916faf9..80b1765a3159 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -12,6 +12,7 @@
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/btf_ids.h>
+#include <crypto/sha2.h>
#include "map_in_map.h"
@@ -174,6 +175,17 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
return array->value + (u64)array->elem_size * (index & array->index_mask);
}
+static int array_map_get_hash(struct bpf_map *map, u32 hash_buf_size,
+ void *hash_buf)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ sha256(array->value, (u64)array->elem_size * array->map.max_entries,
+ hash_buf);
+ memcpy(array->map.sha, hash_buf, sizeof(array->map.sha));
+ return 0;
+}
+
static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
u32 off)
{
@@ -431,7 +443,7 @@ static void *array_map_vmalloc_addr(struct bpf_array *array)
return (void *)round_down((unsigned long)array, PAGE_SIZE);
}
-static void array_map_free_timers_wq(struct bpf_map *map)
+static void array_map_free_internal_structs(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
@@ -439,12 +451,14 @@ static void array_map_free_timers_wq(struct bpf_map *map)
/* We don't reset or free fields other than timer and workqueue
* on uref dropping to zero.
*/
- if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE)) {
+ if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
for (i = 0; i < array->map.max_entries; i++) {
if (btf_record_has_field(map->record, BPF_TIMER))
bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
if (btf_record_has_field(map->record, BPF_WORKQUEUE))
bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i));
+ if (btf_record_has_field(map->record, BPF_TASK_WORK))
+ bpf_obj_free_task_work(map->record, array_map_elem_ptr(array, i));
}
}
}
@@ -783,7 +797,7 @@ const struct bpf_map_ops array_map_ops = {
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
- .map_release_uref = array_map_free_timers_wq,
+ .map_release_uref = array_map_free_internal_structs,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
@@ -800,6 +814,7 @@ const struct bpf_map_ops array_map_ops = {
.map_mem_usage = array_map_mem_usage,
.map_btf_id = &array_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
+ .map_get_hash = &array_map_get_hash,
};
const struct bpf_map_ops percpu_array_map_ops = {
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index 148da8f7ff36..0687a760974a 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -45,8 +45,7 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup)
{
struct bpf_local_storage *local_storage;
- migrate_disable();
- rcu_read_lock();
+ rcu_read_lock_dont_migrate();
local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
if (!local_storage)
goto out;
@@ -55,8 +54,7 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup)
bpf_local_storage_destroy(local_storage);
bpf_cgrp_storage_unlock();
out:
- rcu_read_unlock();
- migrate_enable();
+ rcu_read_unlock_migrate();
}
static struct bpf_local_storage_data *
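
The conversions in this file (and the analogous ones in the files below) replace the open-coded pairs with rcu_read_lock_dont_migrate()/rcu_read_unlock_migrate(). As a hedged sketch only, they can be read as bundling the two calls they replace; the real helpers live in the RCU headers and may skip migrate_disable() in configurations where the RCU read-side section already prevents migration:

/* Illustration only; names suffixed _sketch to avoid confusion with the real API. */
static inline void rcu_read_lock_dont_migrate_sketch(void)
{
	migrate_disable();
	rcu_read_lock();
}

static inline void rcu_read_unlock_migrate_sketch(void)
{
	rcu_read_unlock();
	migrate_enable();
}
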
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 15a3eb9b02d9..e54cce2b9175 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -62,8 +62,7 @@ void bpf_inode_storage_free(struct inode *inode)
if (!bsb)
return;
- migrate_disable();
- rcu_read_lock();
+ rcu_read_lock_dont_migrate();
local_storage = rcu_dereference(bsb->storage);
if (!local_storage)
@@ -71,8 +70,7 @@ void bpf_inode_storage_free(struct inode *inode)
bpf_local_storage_destroy(local_storage);
out:
- rcu_read_unlock();
- migrate_enable();
+ rcu_read_unlock_migrate();
}
static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 0cbcae727079..6ac35430c573 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -705,13 +705,11 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
migrate_enable();
rcu_read_unlock_trace();
} else {
- rcu_read_lock();
- migrate_disable();
+ rcu_read_lock_dont_migrate();
old_run_ctx = bpf_set_run_ctx(&run_ctx);
ret = bpf_prog_run(prog, ctx);
bpf_reset_run_ctx(old_run_ctx);
- migrate_enable();
- rcu_read_unlock();
+ rcu_read_unlock_migrate();
}
/* bpf program can only return 0 or 1:
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index 2d6e1c98d8ad..e7a2fc60523f 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -19,14 +19,6 @@
#define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING)
#define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET)
-static int get_next_cpu(int cpu)
-{
- cpu = cpumask_next(cpu, cpu_possible_mask);
- if (cpu >= nr_cpu_ids)
- cpu = cpumask_first(cpu_possible_mask);
- return cpu;
-}
-
/* Local list helpers */
static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l)
{
@@ -482,7 +474,7 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
- steal = get_next_cpu(steal);
+ steal = cpumask_next_wrap(steal, cpu_possible_mask);
} while (!node && steal != first_steal);
loc_l->next_steal = steal;
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 687a3e9c76f5..a41e6730edcf 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -1174,6 +1174,18 @@ void bpf_struct_ops_put(const void *kdata)
bpf_map_put(&st_map->map);
}
+u32 bpf_struct_ops_id(const void *kdata)
+{
+ struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map;
+
+ kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
+ return st_map->map.id;
+}
+EXPORT_SYMBOL_GPL(bpf_struct_ops_id);
+
static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 1109475953c0..a1dc1bf0848a 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -70,8 +70,7 @@ void bpf_task_storage_free(struct task_struct *task)
{
struct bpf_local_storage *local_storage;
- migrate_disable();
- rcu_read_lock();
+ rcu_read_lock_dont_migrate();
local_storage = rcu_dereference(task->bpf_storage);
if (!local_storage)
@@ -81,8 +80,7 @@ void bpf_task_storage_free(struct task_struct *task)
bpf_local_storage_destroy(local_storage);
bpf_task_storage_unlock();
out:
- rcu_read_unlock();
- migrate_enable();
+ rcu_read_unlock_migrate();
}
static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 64739308902f..0de8fc8a0e0b 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3478,60 +3478,45 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt,
return BTF_FIELD_FOUND;
}
-#define field_mask_test_name(field_type, field_type_str) \
- if (field_mask & field_type && !strcmp(name, field_type_str)) { \
- type = field_type; \
- goto end; \
- }
-
static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_type,
- u32 field_mask, u32 *seen_mask,
- int *align, int *sz)
-{
- int type = 0;
+ u32 field_mask, u32 *seen_mask, int *align, int *sz)
+{
+ const struct {
+ enum btf_field_type type;
+ const char *const name;
+ const bool is_unique;
+ } field_types[] = {
+ { BPF_SPIN_LOCK, "bpf_spin_lock", true },
+ { BPF_RES_SPIN_LOCK, "bpf_res_spin_lock", true },
+ { BPF_TIMER, "bpf_timer", true },
+ { BPF_WORKQUEUE, "bpf_wq", true },
+ { BPF_TASK_WORK, "bpf_task_work", true },
+ { BPF_LIST_HEAD, "bpf_list_head", false },
+ { BPF_LIST_NODE, "bpf_list_node", false },
+ { BPF_RB_ROOT, "bpf_rb_root", false },
+ { BPF_RB_NODE, "bpf_rb_node", false },
+ { BPF_REFCOUNT, "bpf_refcount", false },
+ };
+ int type = 0, i;
const char *name = __btf_name_by_offset(btf, var_type->name_off);
-
- if (field_mask & BPF_SPIN_LOCK) {
- if (!strcmp(name, "bpf_spin_lock")) {
- if (*seen_mask & BPF_SPIN_LOCK)
- return -E2BIG;
- *seen_mask |= BPF_SPIN_LOCK;
- type = BPF_SPIN_LOCK;
- goto end;
- }
- }
- if (field_mask & BPF_RES_SPIN_LOCK) {
- if (!strcmp(name, "bpf_res_spin_lock")) {
- if (*seen_mask & BPF_RES_SPIN_LOCK)
- return -E2BIG;
- *seen_mask |= BPF_RES_SPIN_LOCK;
- type = BPF_RES_SPIN_LOCK;
- goto end;
- }
- }
- if (field_mask & BPF_TIMER) {
- if (!strcmp(name, "bpf_timer")) {
- if (*seen_mask & BPF_TIMER)
- return -E2BIG;
- *seen_mask |= BPF_TIMER;
- type = BPF_TIMER;
- goto end;
- }
- }
- if (field_mask & BPF_WORKQUEUE) {
- if (!strcmp(name, "bpf_wq")) {
- if (*seen_mask & BPF_WORKQUEUE)
+ const char *field_type_name;
+ enum btf_field_type field_type;
+ bool is_unique;
+
+ for (i = 0; i < ARRAY_SIZE(field_types); ++i) {
+ field_type = field_types[i].type;
+ field_type_name = field_types[i].name;
+ is_unique = field_types[i].is_unique;
+ if (!(field_mask & field_type) || strcmp(name, field_type_name))
+ continue;
+ if (is_unique) {
+ if (*seen_mask & field_type)
return -E2BIG;
- *seen_mask |= BPF_WORKQUEUE;
- type = BPF_WORKQUEUE;
- goto end;
+ *seen_mask |= field_type;
}
+ type = field_type;
+ goto end;
}
- field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head");
- field_mask_test_name(BPF_LIST_NODE, "bpf_list_node");
- field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root");
- field_mask_test_name(BPF_RB_NODE, "bpf_rb_node");
- field_mask_test_name(BPF_REFCOUNT, "bpf_refcount");
/* Only return BPF_KPTR when all other types with matchable names fail */
if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) {
@@ -3545,8 +3530,6 @@ end:
return type;
}
-#undef field_mask_test_name
-
/* Repeat a number of fields for a specified number of times.
*
* Copy the fields starting from the first field and repeat them for
@@ -3693,6 +3676,7 @@ static int btf_find_field_one(const struct btf *btf,
case BPF_LIST_NODE:
case BPF_RB_NODE:
case BPF_REFCOUNT:
+ case BPF_TASK_WORK:
ret = btf_find_struct(btf, var_type, off, sz, field_type,
info_cnt ? &info[0] : &tmp);
if (ret < 0)
@@ -3985,6 +3969,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
rec->timer_off = -EINVAL;
rec->wq_off = -EINVAL;
rec->refcount_off = -EINVAL;
+ rec->task_work_off = -EINVAL;
for (i = 0; i < cnt; i++) {
field_type_size = btf_field_type_size(info_arr[i].type);
if (info_arr[i].off + field_type_size > value_size) {
@@ -4024,6 +4009,10 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
/* Cache offset for faster lookup at runtime */
rec->wq_off = rec->fields[i].offset;
break;
+ case BPF_TASK_WORK:
+ WARN_ON_ONCE(rec->task_work_off >= 0);
+ rec->task_work_off = rec->fields[i].offset;
+ break;
case BPF_REFCOUNT:
WARN_ON_ONCE(rec->refcount_off >= 0);
/* Cache offset for faster lookup at runtime */
@@ -6762,7 +6751,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* skip modifiers */
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (btf_type_is_small_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t))
+ if (btf_type_is_small_int(t) || btf_is_any_enum(t) || btf_type_is_struct(t))
/* accessing a scalar */
return true;
if (!btf_type_is_ptr(t)) {
@@ -7334,7 +7323,7 @@ static int __get_type_size(struct btf *btf, u32 btf_id,
if (btf_type_is_ptr(t))
/* kernel size of pointer. Not BPF's size of pointer*/
return sizeof(void *);
- if (btf_type_is_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t))
+ if (btf_type_is_int(t) || btf_is_any_enum(t) || btf_type_is_struct(t))
return t->size;
return -EINVAL;
}
@@ -7343,7 +7332,7 @@ static u8 __get_type_fmodel_flags(const struct btf_type *t)
{
u8 flags = 0;
- if (__btf_type_is_struct(t))
+ if (btf_type_is_struct(t))
flags |= BTF_FMODEL_STRUCT_ARG;
if (btf_type_is_signed_int(t))
flags |= BTF_FMODEL_SIGNED_ARG;
@@ -7384,7 +7373,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
return -EINVAL;
}
ret = __get_type_size(btf, func->type, &t);
- if (ret < 0 || __btf_type_is_struct(t)) {
+ if (ret < 0 || btf_type_is_struct(t)) {
bpf_log(log,
"The function %s return type %s is unsupported.\n",
tname, btf_type_str(t));
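
In the table-driven lookup above, the is_unique flag is what separates field types that may appear at most once per map value (spin locks, timers, workqueues, task_work) from types that may legitimately repeat (e.g. list nodes). A hypothetical illustration of the distinction, subject to the further checks done elsewhere in btf_parse_fields():

struct ok_value {
	struct bpf_timer t;		/* unique type: a single instance is accepted */
	struct bpf_list_node link_a;
	struct bpf_list_node link_b;	/* non-unique types may repeat */
};

struct rejected_value {
	struct bpf_timer t1;
	struct bpf_timer t2;		/* second bpf_timer: btf_get_field_type() returns -E2BIG */
};
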
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 180b630279b9..248f517d66d0 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -27,14 +27,15 @@ EXPORT_SYMBOL(cgroup_bpf_enabled_key);
/*
* cgroup bpf destruction makes heavy use of work items and there can be a lot
* of concurrent destructions. Use a separate workqueue so that cgroup bpf
- * destruction work items don't end up filling up max_active of system_wq
+ * destruction work items don't end up filling up max_active of system_percpu_wq
* which may lead to deadlock.
*/
static struct workqueue_struct *cgroup_bpf_destroy_wq;
static int __init cgroup_bpf_wq_init(void)
{
- cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1);
+ cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy",
+ WQ_PERCPU, 1);
if (!cgroup_bpf_destroy_wq)
panic("Failed to alloc workqueue for cgroup bpf destroy.\n");
return 0;
@@ -71,8 +72,7 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
u32 func_ret;
run_ctx.retval = retval;
- migrate_disable();
- rcu_read_lock();
+ rcu_read_lock_dont_migrate();
array = rcu_dereference(cgrp->effective[atype]);
item = &array->items[0];
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
@@ -88,8 +88,7 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
item++;
}
bpf_reset_run_ctx(old_run_ctx);
- rcu_read_unlock();
- migrate_enable();
+ rcu_read_unlock_migrate();
return run_ctx.retval;
}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5d1650af899d..d595fe512498 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -18,6 +18,7 @@
*/
#include <uapi/linux/btf.h>
+#include <crypto/sha1.h>
#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
@@ -38,6 +39,7 @@
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
#include <linux/execmem.h>
+#include <crypto/sha2.h>
#include <asm/barrier.h>
#include <linux/unaligned.h>
@@ -119,6 +121,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
fp->pages = size / PAGE_SIZE;
fp->aux = aux;
+ fp->aux->main_prog_aux = aux;
fp->aux->prog = fp;
fp->jit_requested = ebpf_jit_enabled();
fp->blinding_requested = bpf_jit_blinding_enabled(fp);
@@ -293,28 +296,18 @@ void __bpf_prog_free(struct bpf_prog *fp)
int bpf_prog_calc_tag(struct bpf_prog *fp)
{
- const u32 bits_offset = SHA1_BLOCK_SIZE - sizeof(__be64);
- u32 raw_size = bpf_prog_tag_scratch_size(fp);
- u32 digest[SHA1_DIGEST_WORDS];
- u32 ws[SHA1_WORKSPACE_WORDS];
- u32 i, bsize, psize, blocks;
+ size_t size = bpf_prog_insn_size(fp);
struct bpf_insn *dst;
bool was_ld_map;
- u8 *raw, *todo;
- __be32 *result;
- __be64 *bits;
+ u32 i;
- raw = vmalloc(raw_size);
- if (!raw)
+ dst = vmalloc(size);
+ if (!dst)
return -ENOMEM;
- sha1_init_raw(digest);
- memset(ws, 0, sizeof(ws));
-
/* We need to take out the map fd for the digest calculation
* since they are unstable from user space side.
*/
- dst = (void *)raw;
for (i = 0, was_ld_map = false; i < fp->len; i++) {
dst[i] = fp->insnsi[i];
if (!was_ld_map &&
@@ -334,33 +327,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
was_ld_map = false;
}
}
-
- psize = bpf_prog_insn_size(fp);
- memset(&raw[psize], 0, raw_size - psize);
- raw[psize++] = 0x80;
-
- bsize = round_up(psize, SHA1_BLOCK_SIZE);
- blocks = bsize / SHA1_BLOCK_SIZE;
- todo = raw;
- if (bsize - psize >= sizeof(__be64)) {
- bits = (__be64 *)(todo + bsize - sizeof(__be64));
- } else {
- bits = (__be64 *)(todo + bsize + bits_offset);
- blocks++;
- }
- *bits = cpu_to_be64((psize - 1) << 3);
-
- while (blocks--) {
- sha1_transform(digest, todo, ws);
- todo += SHA1_BLOCK_SIZE;
- }
-
- result = (__force __be32 *)digest;
- for (i = 0; i < SHA1_DIGEST_WORDS; i++)
- result[i] = cpu_to_be32(digest[i]);
- memcpy(fp->tag, result, sizeof(fp->tag));
-
- vfree(raw);
+ sha256((u8 *)dst, size, fp->digest);
+ vfree(dst);
return 0;
}
@@ -2366,8 +2334,7 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
const struct bpf_insn *insn)
{
/* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON
- * is not working properly, or interpreter is being used when
- * prog->jit_requested is not 0, so warn about it!
+ * is not working properly, so warn about it!
*/
WARN_ON_ONCE(1);
return 0;
@@ -2394,6 +2361,7 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map,
map->owner->type = prog_type;
map->owner->jited = fp->jited;
map->owner->xdp_has_frags = aux->xdp_has_frags;
+ map->owner->expected_attach_type = fp->expected_attach_type;
map->owner->attach_func_proto = aux->attach_func_proto;
for_each_cgroup_storage_type(i) {
map->owner->storage_cookie[i] =
@@ -2405,6 +2373,10 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map,
ret = map->owner->type == prog_type &&
map->owner->jited == fp->jited &&
map->owner->xdp_has_frags == aux->xdp_has_frags;
+ if (ret &&
+ map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
+ map->owner->expected_attach_type != fp->expected_attach_type)
+ ret = false;
for_each_cgroup_storage_type(i) {
if (!ret)
break;
@@ -2468,8 +2440,9 @@ out:
return ret;
}
-static void bpf_prog_select_func(struct bpf_prog *fp)
+static bool bpf_prog_select_interpreter(struct bpf_prog *fp)
{
+ bool select_interpreter = false;
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
u32 idx = (round_up(stack_depth, 32) / 32) - 1;
@@ -2478,15 +2451,16 @@ static void bpf_prog_select_func(struct bpf_prog *fp)
* But for non-JITed programs, we don't need bpf_func, so no bounds
* check needed.
*/
- if (!fp->jit_requested &&
- !WARN_ON_ONCE(idx >= ARRAY_SIZE(interpreters))) {
+ if (idx < ARRAY_SIZE(interpreters)) {
fp->bpf_func = interpreters[idx];
+ select_interpreter = true;
} else {
fp->bpf_func = __bpf_prog_ret0_warn;
}
#else
fp->bpf_func = __bpf_prog_ret0_warn;
#endif
+ return select_interpreter;
}
/**
@@ -2505,7 +2479,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
/* In case of BPF to BPF calls, verifier did all the prep
* work with regards to JITing, etc.
*/
- bool jit_needed = fp->jit_requested;
+ bool jit_needed = false;
if (fp->bpf_func)
goto finalize;
@@ -2514,7 +2488,8 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
bpf_prog_has_kfunc_call(fp))
jit_needed = true;
- bpf_prog_select_func(fp);
+ if (!bpf_prog_select_interpreter(fp))
+ jit_needed = true;
/* eBPF JITs can rewrite the program in case constant
* blinding is active. However, in case of error during
@@ -3024,7 +2999,10 @@ EXPORT_SYMBOL_GPL(bpf_event_output);
/* Always built-in helper functions. */
const struct bpf_func_proto bpf_tail_call_proto = {
- .func = NULL,
+ /* func is unused for tail_call, we set it to pass the
+ * get_helper_proto check
+ */
+ .func = BPF_PTR_POISON,
.gpl_only = false,
.ret_type = RET_VOID,
.arg1_type = ARG_PTR_TO_CTX,
@@ -3324,9 +3302,8 @@ static bool find_from_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp)
rcu_read_unlock();
if (!prog)
return true;
- if (bpf_is_subprog(prog))
- return true;
- ctxp->prog = prog;
+ /* Make sure we return the main prog if we found a subprog */
+ ctxp->prog = prog->aux->main_prog_aux->prog;
return false;
}
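
The bpf_prog_calc_tag() rewrite above replaces the open-coded SHA-1 padding and block loop with the one-shot sha256() library helper from <crypto/sha2.h>, which hashes a flat buffer in a single call. A minimal sketch of that usage pattern, mirroring the call in the hunk rather than adding anything to the patch:

#include <crypto/sha2.h>

static void digest_buffer(const u8 *buf, size_t len)
{
	u8 digest[SHA256_DIGEST_SIZE];	/* 32 bytes */

	/* One-shot hash of a contiguous buffer, as done for the insn image above. */
	sha256(buf, len, digest);
}
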
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index b2b7b8ec2c2a..703e5df1f4ef 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -186,7 +186,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
struct xdp_buff xdp;
int i, nframes = 0;
- xdp_set_return_frame_no_direct();
xdp.rxq = &rxq;
for (i = 0; i < n; i++) {
@@ -231,7 +230,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
}
}
- xdp_clear_return_frame_no_direct();
stats->pass += nframes;
return nframes;
@@ -255,6 +253,7 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
rcu_read_lock();
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+ xdp_set_return_frame_no_direct();
ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats);
if (unlikely(ret->skb_n))
@@ -264,6 +263,7 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
if (stats->redirect)
xdp_do_flush();
+ xdp_clear_return_frame_no_direct();
bpf_net_ctx_clear(bpf_net_ctx);
rcu_read_unlock();
@@ -550,7 +550,7 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu)));
if (old_rcpu) {
INIT_RCU_WORK(&old_rcpu->free_work, __cpu_map_entry_free);
- queue_rcu_work(system_wq, &old_rcpu->free_work);
+ queue_rcu_work(system_percpu_wq, &old_rcpu->free_work);
}
}
diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c
index 94854cd9c4cc..83c4d9943084 100644
--- a/kernel/bpf/crypto.c
+++ b/kernel/bpf/crypto.c
@@ -278,7 +278,7 @@ static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx,
siv_len = siv ? __bpf_dynptr_size(siv) : 0;
src_len = __bpf_dynptr_size(src);
dst_len = __bpf_dynptr_size(dst);
- if (!src_len || !dst_len)
+ if (!src_len || !dst_len || src_len > dst_len)
return -EINVAL;
if (siv_len != ctx->siv_len)
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 482d284a1553..2625601de76e 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -865,7 +865,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
struct bpf_dtab_netdev *dev;
dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev),
- GFP_NOWAIT | __GFP_NOWARN,
+ GFP_NOWAIT,
dtab->map.numa_node);
if (!dev)
return ERR_PTR(-ENOMEM);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 71f9931ac64c..c2fcd0cd51e5 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -215,7 +215,20 @@ static bool htab_has_extra_elems(struct bpf_htab *htab)
return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab);
}
-static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab)
+static void htab_free_internal_structs(struct bpf_htab *htab, struct htab_elem *elem)
+{
+ if (btf_record_has_field(htab->map.record, BPF_TIMER))
+ bpf_obj_free_timer(htab->map.record,
+ htab_elem_value(elem, htab->map.key_size));
+ if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
+ bpf_obj_free_workqueue(htab->map.record,
+ htab_elem_value(elem, htab->map.key_size));
+ if (btf_record_has_field(htab->map.record, BPF_TASK_WORK))
+ bpf_obj_free_task_work(htab->map.record,
+ htab_elem_value(elem, htab->map.key_size));
+}
+
+static void htab_free_prealloced_internal_structs(struct bpf_htab *htab)
{
u32 num_entries = htab->map.max_entries;
int i;
@@ -227,12 +240,7 @@ static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab)
struct htab_elem *elem;
elem = get_htab_elem(htab, i);
- if (btf_record_has_field(htab->map.record, BPF_TIMER))
- bpf_obj_free_timer(htab->map.record,
- htab_elem_value(elem, htab->map.key_size));
- if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
- bpf_obj_free_workqueue(htab->map.record,
- htab_elem_value(elem, htab->map.key_size));
+ htab_free_internal_structs(htab, elem);
cond_resched();
}
}
@@ -1490,7 +1498,7 @@ static void delete_all_elements(struct bpf_htab *htab)
}
}
-static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab)
+static void htab_free_malloced_internal_structs(struct bpf_htab *htab)
{
int i;
@@ -1502,28 +1510,23 @@ static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab)
hlist_nulls_for_each_entry(l, n, head, hash_node) {
/* We only free timer on uref dropping to zero */
- if (btf_record_has_field(htab->map.record, BPF_TIMER))
- bpf_obj_free_timer(htab->map.record,
- htab_elem_value(l, htab->map.key_size));
- if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
- bpf_obj_free_workqueue(htab->map.record,
- htab_elem_value(l, htab->map.key_size));
+ htab_free_internal_structs(htab, l);
}
cond_resched_rcu();
}
rcu_read_unlock();
}
-static void htab_map_free_timers_and_wq(struct bpf_map *map)
+static void htab_map_free_internal_structs(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
/* We only free timer and workqueue on uref dropping to zero */
- if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) {
+ if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
if (!htab_is_prealloc(htab))
- htab_free_malloced_timers_and_wq(htab);
+ htab_free_malloced_internal_structs(htab);
else
- htab_free_prealloced_timers_and_wq(htab);
+ htab_free_prealloced_internal_structs(htab);
}
}
@@ -2255,7 +2258,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
- .map_release_uref = htab_map_free_timers_and_wq,
+ .map_release_uref = htab_map_free_internal_structs,
.map_lookup_elem = htab_map_lookup_elem,
.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
.map_update_elem = htab_map_update_elem,
@@ -2276,7 +2279,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
- .map_release_uref = htab_map_free_timers_and_wq,
+ .map_release_uref = htab_map_free_internal_structs,
.map_lookup_elem = htab_lru_map_lookup_elem,
.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 6b4877e85a68..c9fab9a356df 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -25,6 +25,9 @@
#include <linux/kasan.h>
#include <linux/bpf_verifier.h>
#include <linux/uaccess.h>
+#include <linux/verification.h>
+#include <linux/task_work.h>
+#include <linux/irq_work.h>
#include "../../lib/kstrtox.h"
@@ -774,11 +777,9 @@ int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs)
{
int nest_level;
- preempt_disable();
nest_level = this_cpu_inc_return(bpf_bprintf_nest_level);
if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) {
this_cpu_dec(bpf_bprintf_nest_level);
- preempt_enable();
return -EBUSY;
}
*bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]);
@@ -791,7 +792,6 @@ void bpf_put_buffers(void)
if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0))
return;
this_cpu_dec(bpf_bprintf_nest_level);
- preempt_enable();
}
void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
@@ -1084,6 +1084,17 @@ const struct bpf_func_proto bpf_snprintf_proto = {
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
+static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx)
+{
+ if (map->map_type == BPF_MAP_TYPE_ARRAY) {
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ *arr_idx = ((char *)value - array->value) / array->elem_size;
+ return arr_idx;
+ }
+ return (void *)value - round_up(map->key_size, 8);
+}
+
struct bpf_async_cb {
struct bpf_map *map;
struct bpf_prog *prog;
@@ -1166,15 +1177,8 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
* bpf_map_delete_elem() on the same timer.
*/
this_cpu_write(hrtimer_running, t);
- if (map->map_type == BPF_MAP_TYPE_ARRAY) {
- struct bpf_array *array = container_of(map, struct bpf_array, map);
- /* compute the key */
- idx = ((char *)value - array->value) / array->elem_size;
- key = &idx;
- } else { /* hash or lru */
- key = value - round_up(map->key_size, 8);
- }
+ key = map_key_from_value(map, value, &idx);
callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
/* The verifier checked that return value is zero. */
@@ -1200,15 +1204,7 @@ static void bpf_wq_work(struct work_struct *work)
if (!callback_fn)
return;
- if (map->map_type == BPF_MAP_TYPE_ARRAY) {
- struct bpf_array *array = container_of(map, struct bpf_array, map);
-
- /* compute the key */
- idx = ((char *)value - array->value) / array->elem_size;
- key = &idx;
- } else { /* hash or lru */
- key = value - round_up(map->key_size, 8);
- }
+ key = map_key_from_value(map, value, &idx);
rcu_read_lock_trace();
migrate_disable();
@@ -1274,8 +1270,11 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
goto out;
}
- /* allocate hrtimer via map_kmalloc to use memcg accounting */
- cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
+ /* Allocate via bpf_map_kmalloc_node() for memcg accounting. Until
+ * kmalloc_nolock() is available, avoid locking issues by using
+ * __GFP_HIGH (GFP_ATOMIC & ~__GFP_RECLAIM).
+ */
+ cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node);
if (!cb) {
ret = -ENOMEM;
goto out;
@@ -1597,7 +1596,7 @@ void bpf_timer_cancel_and_free(void *val)
* timer callback.
*/
if (this_cpu_read(hrtimer_running)) {
- queue_work(system_unbound_wq, &t->cb.delete_work);
+ queue_work(system_dfl_wq, &t->cb.delete_work);
return;
}
@@ -1610,7 +1609,7 @@ void bpf_timer_cancel_and_free(void *val)
if (hrtimer_try_to_cancel(&t->timer) >= 0)
kfree_rcu(t, cb.rcu);
else
- queue_work(system_unbound_wq, &t->cb.delete_work);
+ queue_work(system_dfl_wq, &t->cb.delete_work);
} else {
bpf_timer_delete_work(&t->cb.delete_work);
}
@@ -1780,6 +1779,9 @@ static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *s
return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
case BPF_DYNPTR_TYPE_XDP:
return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len);
+ case BPF_DYNPTR_TYPE_SKB_META:
+ memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len);
+ return 0;
default:
WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
return -EFAULT;
@@ -1836,6 +1838,11 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src,
if (flags)
return -EINVAL;
return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
+ case BPF_DYNPTR_TYPE_SKB_META:
+ if (flags)
+ return -EINVAL;
+ memmove(bpf_skb_meta_pointer(dst->data, dst->offset + offset), src, len);
+ return 0;
default:
WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
return -EFAULT;
@@ -1882,6 +1889,7 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3
return (unsigned long)(ptr->data + ptr->offset + offset);
case BPF_DYNPTR_TYPE_SKB:
case BPF_DYNPTR_TYPE_XDP:
+ case BPF_DYNPTR_TYPE_SKB_META:
/* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */
return 0;
default:
@@ -2537,7 +2545,7 @@ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid)
{
struct cgroup *cgrp;
- cgrp = cgroup_get_from_id(cgid);
+ cgrp = __cgroup_get_from_id(cgid);
if (IS_ERR(cgrp))
return NULL;
return cgrp;
@@ -2710,6 +2718,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset,
bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false);
return buffer__opt;
}
+ case BPF_DYNPTR_TYPE_SKB_META:
+ return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset);
default:
WARN_ONCE(true, "unknown dynptr type %d\n", type);
return NULL;
@@ -3341,39 +3351,30 @@ __bpf_kfunc void __bpf_trap(void)
* __get_kernel_nofault instead of plain dereference to make them safe.
*/
-/**
- * bpf_strcmp - Compare two strings
- * @s1__ign: One string
- * @s2__ign: Another string
- *
- * Return:
- * * %0 - Strings are equal
- * * %-1 - @s1__ign is smaller
- * * %1 - @s2__ign is smaller
- * * %-EFAULT - Cannot read one of the strings
- * * %-E2BIG - One of strings is too large
- * * %-ERANGE - One of strings is outside of kernel address space
- */
-__bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign)
+static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case)
{
char c1, c2;
int i;
- if (!copy_from_kernel_nofault_allowed(s1__ign, 1) ||
- !copy_from_kernel_nofault_allowed(s2__ign, 1)) {
+ if (!copy_from_kernel_nofault_allowed(s1, 1) ||
+ !copy_from_kernel_nofault_allowed(s2, 1)) {
return -ERANGE;
}
guard(pagefault)();
for (i = 0; i < XATTR_SIZE_MAX; i++) {
- __get_kernel_nofault(&c1, s1__ign, char, err_out);
- __get_kernel_nofault(&c2, s2__ign, char, err_out);
+ __get_kernel_nofault(&c1, s1, char, err_out);
+ __get_kernel_nofault(&c2, s2, char, err_out);
+ if (ignore_case) {
+ c1 = tolower(c1);
+ c2 = tolower(c2);
+ }
if (c1 != c2)
return c1 < c2 ? -1 : 1;
if (c1 == '\0')
return 0;
- s1__ign++;
- s2__ign++;
+ s1++;
+ s2++;
}
return -E2BIG;
err_out:
@@ -3381,6 +3382,42 @@ err_out:
}
/**
+ * bpf_strcmp - Compare two strings
+ * @s1__ign: One string
+ * @s2__ign: Another string
+ *
+ * Return:
+ * * %0 - Strings are equal
+ * * %-1 - @s1__ign is smaller
+ * * %1 - @s2__ign is smaller
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of strings is too large
+ * * %-ERANGE - One of strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign)
+{
+ return __bpf_strcasecmp(s1__ign, s2__ign, false);
+}
+
+/**
+ * bpf_strcasecmp - Compare two strings, ignoring the case of the characters
+ * @s1__ign: One string
+ * @s2__ign: Another string
+ *
+ * Return:
+ * * %0 - Strings are equal
+ * * %-1 - @s1__ign is smaller
+ * * %1 - @s2__ign is smaller
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of strings is too large
+ * * %-ERANGE - One of strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strcasecmp(const char *s1__ign, const char *s2__ign)
+{
+ return __bpf_strcasecmp(s1__ign, s2__ign, true);
+}
+
+/**
* bpf_strnchr - Find a character in a length limited string
* @s__ign: The string to be searched
* @count: The number of characters to be searched
@@ -3664,10 +3701,17 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len
guard(pagefault)();
for (i = 0; i < XATTR_SIZE_MAX; i++) {
- for (j = 0; i + j < len && j < XATTR_SIZE_MAX; j++) {
+ for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) {
__get_kernel_nofault(&c2, s2__ign + j, char, err_out);
if (c2 == '\0')
return i;
+ /*
+ * We allow reading an extra byte from s2 (note the
+ * `i + j <= len` above) to cover the case when s2 is
+ * a suffix of the first len chars of s1.
+ */
+ if (i + j == len)
+ break;
__get_kernel_nofault(&c1, s1__ign + j, char, err_out);
if (c1 == '\0')
return -ENOENT;
@@ -3702,9 +3746,490 @@ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
{
return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX);
}
+#ifdef CONFIG_KEYS
+/**
+ * bpf_lookup_user_key - lookup a key by its serial
+ * @serial: key handle serial number
+ * @flags: lookup-specific flags
+ *
+ * Search a key with a given *serial* and the provided *flags*.
+ * If found, increment the reference count of the key by one, and
+ * return it in the bpf_key structure.
+ *
+ * The bpf_key structure must be passed to bpf_key_put() when done
+ * with it, so that the key reference count is decremented and the
+ * bpf_key structure is freed.
+ *
+ * Permission checks are deferred to the time the key is used by
+ * one of the available key-specific kfuncs.
+ *
+ * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested
+ * special keyring (e.g. session keyring), if it doesn't yet exist.
+ * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting
+ * for the key construction, and to retrieve uninstantiated keys (keys
+ * without data attached to them).
+ *
+ * Return: a bpf_key pointer with a valid key pointer if the key is found, a
+ * NULL pointer otherwise.
+ */
+__bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags)
+{
+ key_ref_t key_ref;
+ struct bpf_key *bkey;
+
+ if (flags & ~KEY_LOOKUP_ALL)
+ return NULL;
+
+ /*
+ * Permission check is deferred until the key is used, as the
+ * intent of the caller is unknown here.
+ */
+ key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK);
+ if (IS_ERR(key_ref))
+ return NULL;
+
+ bkey = kmalloc(sizeof(*bkey), GFP_KERNEL);
+ if (!bkey) {
+ key_put(key_ref_to_ptr(key_ref));
+ return NULL;
+ }
+
+ bkey->key = key_ref_to_ptr(key_ref);
+ bkey->has_ref = true;
+
+ return bkey;
+}
+
+/**
+ * bpf_lookup_system_key - lookup a key by a system-defined ID
+ * @id: key ID
+ *
+ * Obtain a bpf_key structure with a key pointer set to the passed key ID.
+ * The key pointer is marked as invalid, to prevent bpf_key_put() from
+ * attempting to decrement the key reference count on that pointer. The key
+ * pointer set in such way is currently understood only by
+ * verify_pkcs7_signature().
+ *
+ * Set *id* to one of the values defined in include/linux/verification.h:
+ * 0 for the primary keyring (immutable keyring of system keys);
+ * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring
+ * (where keys can be added only if they are vouched for by existing keys
+ * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform
+ * keyring (primarily used by the integrity subsystem to verify a kexec'ed
+ * kernel image and, possibly, the initramfs signature).
+ *
+ * Return: a bpf_key pointer with an invalid key pointer set from the
+ * pre-determined ID on success, a NULL pointer otherwise
+ */
+__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id)
+{
+ struct bpf_key *bkey;
+
+ if (system_keyring_id_check(id) < 0)
+ return NULL;
+
+ bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC);
+ if (!bkey)
+ return NULL;
+
+ bkey->key = (struct key *)(unsigned long)id;
+ bkey->has_ref = false;
+
+ return bkey;
+}
+
+/**
+ * bpf_key_put - decrement key reference count if key is valid and free bpf_key
+ * @bkey: bpf_key structure
+ *
+ * Decrement the reference count of the key inside *bkey*, if the pointer
+ * is valid, and free *bkey*.
+ */
+__bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
+{
+ if (bkey->has_ref)
+ key_put(bkey->key);
+
+ kfree(bkey);
+}
+
+/**
+ * bpf_verify_pkcs7_signature - verify a PKCS#7 signature
+ * @data_p: data to verify
+ * @sig_p: signature of the data
+ * @trusted_keyring: keyring with keys trusted for signature verification
+ *
+ * Verify the PKCS#7 signature *sig_p* against the supplied *data_p*
+ * with keys in a keyring referenced by *trusted_keyring*.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p,
+ struct bpf_dynptr *sig_p,
+ struct bpf_key *trusted_keyring)
+{
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
+ struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p;
+ struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p;
+ const void *data, *sig;
+ u32 data_len, sig_len;
+ int ret;
+
+ if (trusted_keyring->has_ref) {
+ /*
+ * Do the permission check deferred in bpf_lookup_user_key().
+ * See bpf_lookup_user_key() for more details.
+ *
+ * A call to key_task_permission() here would be redundant, as
+ * it is already done by keyring_search() called by
+ * find_asymmetric_key().
+ */
+ ret = key_validate(trusted_keyring->key);
+ if (ret < 0)
+ return ret;
+ }
+
+ data_len = __bpf_dynptr_size(data_ptr);
+ data = __bpf_dynptr_data(data_ptr, data_len);
+ sig_len = __bpf_dynptr_size(sig_ptr);
+ sig = __bpf_dynptr_data(sig_ptr, sig_len);
+
+ return verify_pkcs7_signature(data, data_len, sig, sig_len,
+ trusted_keyring->key,
+ VERIFYING_BPF_SIGNATURE, NULL,
+ NULL);
+#else
+ return -EOPNOTSUPP;
+#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
+}
+#endif /* CONFIG_KEYS */
+
+typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value);
+
+enum bpf_task_work_state {
+ /* bpf_task_work is ready to be used */
+ BPF_TW_STANDBY = 0,
+ /* irq work scheduling in progress */
+ BPF_TW_PENDING,
+ /* task work scheduling in progress */
+ BPF_TW_SCHEDULING,
+ /* task work is scheduled successfully */
+ BPF_TW_SCHEDULED,
+ /* callback is running */
+ BPF_TW_RUNNING,
+ /* associated BPF map value is deleted */
+ BPF_TW_FREED,
+};
+
+struct bpf_task_work_ctx {
+ enum bpf_task_work_state state;
+ refcount_t refcnt;
+ struct callback_head work;
+ struct irq_work irq_work;
+ /* bpf_prog that schedules task work */
+ struct bpf_prog *prog;
+ /* task for which callback is scheduled */
+ struct task_struct *task;
+ /* the map and map value associated with this context */
+ struct bpf_map *map;
+ void *map_val;
+ enum task_work_notify_mode mode;
+ bpf_task_work_callback_t callback_fn;
+ struct rcu_head rcu;
+} __aligned(8);
+
+/* Actual type for struct bpf_task_work */
+struct bpf_task_work_kern {
+ struct bpf_task_work_ctx *ctx;
+};
+
+static void bpf_task_work_ctx_reset(struct bpf_task_work_ctx *ctx)
+{
+ if (ctx->prog) {
+ bpf_prog_put(ctx->prog);
+ ctx->prog = NULL;
+ }
+ if (ctx->task) {
+ bpf_task_release(ctx->task);
+ ctx->task = NULL;
+ }
+}
+
+static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx)
+{
+ return refcount_inc_not_zero(&ctx->refcnt);
+}
+
+static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx)
+{
+ if (!refcount_dec_and_test(&ctx->refcnt))
+ return;
+
+ bpf_task_work_ctx_reset(ctx);
+
+ /* bpf_mem_free expects migration to be disabled */
+ migrate_disable();
+ bpf_mem_free(&bpf_global_ma, ctx);
+ migrate_enable();
+}
+
+static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx)
+{
+ /*
+ * Scheduled task_work callback holds ctx ref, so if we successfully
+ * cancelled, we put that ref on callback's behalf. If we couldn't
+ * cancel, callback will inevitably run or has already completed
+ * running, and it would have taken care of its ctx ref itself.
+ */
+ if (task_work_cancel(ctx->task, &ctx->work))
+ bpf_task_work_ctx_put(ctx);
+}
+
+static void bpf_task_work_callback(struct callback_head *cb)
+{
+ struct bpf_task_work_ctx *ctx = container_of(cb, struct bpf_task_work_ctx, work);
+ enum bpf_task_work_state state;
+ u32 idx;
+ void *key;
+
+ /* Read lock is needed to protect ctx and map key/value access */
+ guard(rcu_tasks_trace)();
+ /*
+ * This callback may start running before bpf_task_work_irq() switched to
+ * SCHEDULED state, so handle both transition variants SCHEDULING|SCHEDULED -> RUNNING.
+ */
+ state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING);
+ if (state == BPF_TW_SCHEDULED)
+ state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING);
+ if (state == BPF_TW_FREED) {
+ bpf_task_work_ctx_put(ctx);
+ return;
+ }
+
+ key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx);
+
+ migrate_disable();
+ ctx->callback_fn(ctx->map, key, ctx->map_val);
+ migrate_enable();
+
+ bpf_task_work_ctx_reset(ctx);
+ (void)cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY);
+
+ bpf_task_work_ctx_put(ctx);
+}
+
+static void bpf_task_work_irq(struct irq_work *irq_work)
+{
+ struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
+ enum bpf_task_work_state state;
+ int err;
+
+ guard(rcu_tasks_trace)();
+
+ if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) {
+ bpf_task_work_ctx_put(ctx);
+ return;
+ }
+
+ err = task_work_add(ctx->task, &ctx->work, ctx->mode);
+ if (err) {
+ bpf_task_work_ctx_reset(ctx);
+ /*
+ * try to switch back to STANDBY for another task_work reuse, but we might have
+ * gone to FREED already, which is fine as we already cleaned up after ourselves
+ */
+ (void)cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY);
+ bpf_task_work_ctx_put(ctx);
+ return;
+ }
+
+ /*
+	 * It's technically possible for the just-scheduled task_work callback to
+	 * complete running by now, going SCHEDULING -> RUNNING and then
+	 * dropping its ctx refcount. Instead of capturing an extra ref just to
+	 * protect the ctx->state access below, we rely on RCU protection to
+	 * perform the SCHEDULING -> SCHEDULED attempt below.
+ */
+ state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED);
+ if (state == BPF_TW_FREED)
+ bpf_task_work_cancel(ctx); /* clean up if we switched into FREED state */
+}
+
+static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *tw,
+ struct bpf_map *map)
+{
+ struct bpf_task_work_kern *twk = (void *)tw;
+ struct bpf_task_work_ctx *ctx, *old_ctx;
+
+ ctx = READ_ONCE(twk->ctx);
+ if (ctx)
+ return ctx;
+
+ ctx = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_task_work_ctx));
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+
+ memset(ctx, 0, sizeof(*ctx));
+ refcount_set(&ctx->refcnt, 1); /* map's own ref */
+ ctx->state = BPF_TW_STANDBY;
+
+ old_ctx = cmpxchg(&twk->ctx, NULL, ctx);
+ if (old_ctx) {
+ /*
+ * tw->ctx is set by concurrent BPF program, release allocated
+ * memory and try to reuse already set context.
+ */
+ bpf_mem_free(&bpf_global_ma, ctx);
+ return old_ctx;
+ }
+
+ return ctx; /* Success */
+}
+
+static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work *tw,
+ struct bpf_map *map)
+{
+ struct bpf_task_work_ctx *ctx;
+
+ ctx = bpf_task_work_fetch_ctx(tw, map);
+ if (IS_ERR(ctx))
+ return ctx;
+
+ /* try to get ref for task_work callback to hold */
+ if (!bpf_task_work_ctx_tryget(ctx))
+ return ERR_PTR(-EBUSY);
+
+ if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) {
+ /* lost acquiring race or map_release_uref() stole it from us, put ref and bail */
+ bpf_task_work_ctx_put(ctx);
+ return ERR_PTR(-EBUSY);
+ }
+
+ /*
+ * If no process or bpffs is holding a reference to the map, no new callbacks should be
+ * scheduled. This does not address any race or correctness issue, but rather is a policy
+ * choice: dropping user references should stop everything.
+ */
+ if (!atomic64_read(&map->usercnt)) {
+ /* drop ref we just got for task_work callback itself */
+ bpf_task_work_ctx_put(ctx);
+ /* transfer map's ref into cancel_and_free() */
+ bpf_task_work_cancel_and_free(tw);
+ return ERR_PTR(-EBUSY);
+ }
+
+ return ctx;
+}
+
+static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw,
+ struct bpf_map *map, bpf_task_work_callback_t callback_fn,
+ struct bpf_prog_aux *aux, enum task_work_notify_mode mode)
+{
+ struct bpf_prog *prog;
+ struct bpf_task_work_ctx *ctx;
+ int err;
+
+ BTF_TYPE_EMIT(struct bpf_task_work);
+
+ prog = bpf_prog_inc_not_zero(aux->prog);
+ if (IS_ERR(prog))
+ return -EBADF;
+ task = bpf_task_acquire(task);
+ if (!task) {
+ err = -EBADF;
+ goto release_prog;
+ }
+
+ ctx = bpf_task_work_acquire_ctx(tw, map);
+ if (IS_ERR(ctx)) {
+ err = PTR_ERR(ctx);
+ goto release_all;
+ }
+
+ ctx->task = task;
+ ctx->callback_fn = callback_fn;
+ ctx->prog = prog;
+ ctx->mode = mode;
+ ctx->map = map;
+ ctx->map_val = (void *)tw - map->record->task_work_off;
+ init_task_work(&ctx->work, bpf_task_work_callback);
+ init_irq_work(&ctx->irq_work, bpf_task_work_irq);
+
+ irq_work_queue(&ctx->irq_work);
+ return 0;
+
+release_all:
+ bpf_task_release(task);
+release_prog:
+ bpf_prog_put(prog);
+ return err;
+}
+
+/**
+ * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode
+ * @task: Task struct for which callback should be scheduled
+ * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
+ * @map__map: bpf_map that embeds struct bpf_task_work in the values
+ * @callback: pointer to BPF subprogram to call
+ * @aux__prog: user should pass NULL
+ *
+ * Return: 0 if task work has been scheduled successfully, negative error code otherwise
+ */
+__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw,
+ void *map__map, bpf_task_work_callback_t callback,
+ void *aux__prog)
+{
+ return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL);
+}
+
+/**
+ * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME mode
+ * @task: Task struct for which callback should be scheduled
+ * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
+ * @map__map: bpf_map that embeds struct bpf_task_work in the values
+ * @callback: pointer to BPF subprogram to call
+ * @aux__prog: user should pass NULL
+ *
+ * Return: 0 if task work has been scheduled successfully, negative error code otherwise
+ */
+__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw,
+ void *map__map, bpf_task_work_callback_t callback,
+ void *aux__prog)
+{
+ return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME);
+}
__bpf_kfunc_end_defs();
+static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work)
+{
+ struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
+
+ bpf_task_work_cancel(ctx); /* this might put task_work callback's ref */
+ bpf_task_work_ctx_put(ctx); /* and here we put map's own ref that was transferred to us */
+}
+
+void bpf_task_work_cancel_and_free(void *val)
+{
+ struct bpf_task_work_kern *twk = val;
+ struct bpf_task_work_ctx *ctx;
+ enum bpf_task_work_state state;
+
+ ctx = xchg(&twk->ctx, NULL);
+ if (!ctx)
+ return;
+
+ state = xchg(&ctx->state, BPF_TW_FREED);
+ if (state == BPF_TW_SCHEDULED) {
+ /* run in irq_work to avoid locks in NMI */
+ init_irq_work(&ctx->irq_work, bpf_task_work_cancel_scheduled);
+ irq_work_queue(&ctx->irq_work);
+ return;
+ }
+
+ bpf_task_work_ctx_put(ctx); /* put bpf map's ref */
+}
+
BTF_KFUNCS_START(generic_btf_ids)
#ifdef CONFIG_CRASH_DUMP
BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
@@ -3743,6 +4268,14 @@ BTF_ID_FLAGS(func, bpf_throw)
#ifdef CONFIG_BPF_EVENTS
BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS)
#endif
+#ifdef CONFIG_KEYS
+BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
+BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE)
+#endif
+#endif
BTF_KFUNCS_END(generic_btf_ids)
static const struct btf_kfunc_id_set generic_kfunc_set = {
@@ -3824,6 +4357,7 @@ BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
#endif
BTF_ID_FLAGS(func, __bpf_trap)
BTF_ID_FLAGS(func, bpf_strcmp);
+BTF_ID_FLAGS(func, bpf_strcasecmp);
BTF_ID_FLAGS(func, bpf_strchr);
BTF_ID_FLAGS(func, bpf_strchrnul);
BTF_ID_FLAGS(func, bpf_strnchr);
@@ -3838,6 +4372,8 @@ BTF_ID_FLAGS(func, bpf_strnstr);
BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
#endif
BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
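
The bpf_task_work kfuncs registered above are meant to be called from BPF programs on a map value that embeds struct bpf_task_work. A rough BPF-side sketch with a hypothetical map layout and attach point (the extern declaration mirrors the kernel signature added in this patch; not taken from the selftests):

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

struct elem {
	int counter;
	struct bpf_task_work tw;
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, struct elem);
} tw_map SEC(".maps");

extern int bpf_task_work_schedule_resume(struct task_struct *task,
					 struct bpf_task_work *tw, void *map,
					 int (*callback)(struct bpf_map *map, void *key, void *value),
					 void *aux) __ksym;

/* Runs later in the target task's context, not inside the tracing hook itself. */
static int deferred_cb(struct bpf_map *map, void *key, void *value)
{
	struct elem *e = value;

	e->counter++;
	return 0;
}

SEC("tp_btf/sched_switch")	/* hypothetical attach point */
int schedule_deferred_work(u64 *ctx)
{
	struct task_struct *task = bpf_get_current_task_btf();
	struct elem *e;
	int key = 0;

	e = bpf_map_lookup_elem(&tw_map, &key);
	if (!e)
		return 0;
	/* The aux argument is NULL per the kfunc documentation above. */
	bpf_task_work_schedule_resume(task, &e->tw, &tw_map, deferred_cb, NULL);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
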
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 5c2e96b19392..f90bdcc0a047 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -442,7 +442,7 @@ static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw,
umode_t mode;
int ret;
- dentry = user_path_create(path_fd, pathname, &path, 0);
+ dentry = start_creating_user_path(path_fd, pathname, &path, 0);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -471,7 +471,7 @@ static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw,
ret = -EPERM;
}
out:
- done_path_create(&path, dentry);
+ end_creating_path(&path, dentry);
return ret;
}
@@ -788,7 +788,7 @@ static void bpf_free_inode(struct inode *inode)
const struct super_operations bpf_super_ops = {
.statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.show_options = bpf_show_options,
.free_inode = bpf_free_inode,
};
diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c
new file mode 100644
index 000000000000..3c611aba7f52
--- /dev/null
+++ b/kernel/bpf/liveness.c
@@ -0,0 +1,733 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bpf_verifier.h>
+#include <linux/hashtable.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+
+/*
+ * This file implements live stack slots analysis. After accumulating
+ * stack usage data, the analysis answers queries about whether a
+ * particular stack slot may be read by an instruction or any of its
+ * successors. This data is consumed by the verifier states caching
+ * mechanism to decide which stack slots are important when looking for a
+ * visited state corresponding to the current state.
+ *
+ * The analysis is call chain sensitive, meaning that data is collected
+ * and queried for tuples (call chain, subprogram instruction index).
+ * Such sensitivity allows identifying if some subprogram call always
+ * leads to writes in the caller's stack.
+ *
+ * The basic idea is as follows:
+ * - As the verifier accumulates a set of visited states, the analysis instance
+ * accumulates a conservative estimate of stack slots that can be read
+ * or must be written for each visited tuple (call chain, instruction index).
+ * - If several states happen to visit the same instruction with the same
+ * call chain, stack usage information for the corresponding tuple is joined:
+ * - "may_read" set represents a union of all possibly read slots
+ * (any slot in "may_read" set might be read at or after the instruction);
+ * - "must_write" set represents an intersection of all possibly written slots
+ * (any slot in "must_write" set is guaranteed to be written by the instruction).
+ * - The analysis is split into two phases:
+ * - read and write marks accumulation;
+ * - read and write marks propagation.
+ * - The propagation phase is a textbook live variable data flow analysis:
+ *
+ * state[cc, i].live_after = U [state[cc, s].live_before for s in insn_successors(i)]
+ * state[cc, i].live_before =
+ * (state[cc, i].live_after / state[cc, i].must_write) U state[cc, i].may_read
+ *
+ * Where:
+ * - `U` stands for set union
+ * - `/` stands for set difference;
+ * - `cc` stands for a call chain;
+ * - `i` and `s` are instruction indexes;
+ *
+ * The above equations are computed for each call chain and instruction
+ * index until state stops changing.
+ * - Additionally, in order to transfer "must_write" information from a
+ * subprogram to call instructions invoking this subprogram,
+ * the "must_write_acc" set is tracked for each (cc, i) tuple.
+ * A set of stack slots that are guaranteed to be written by this
+ * instruction or any of its successors (within the subprogram).
+ * The equation for "must_write_acc" propagation looks as follows:
+ *
+ * state[cc, i].must_write_acc =
+ * ∩ [state[cc, s].must_write_acc for s in insn_successors(i)]
+ * U state[cc, i].must_write
+ *
+ * (An intersection of all "must_write_acc" for instruction successors
+ * plus all "must_write" slots for the instruction itself).
+ * - After the propagation phase completes for a subprogram, information from
+ * (cc, 0) tuple (subprogram entry) is transferred to the caller's call chain:
+ * - "must_write_acc" set is intersected with the call site's "must_write" set;
+ * - "may_read" set is added to the call site's "may_read" set.
+ * - Any live stack queries must be taken after the propagation phase.
+ * - Accumulation and propagation phases can be entered multiple times,
+ * at any point in time:
+ * - "may_read" set only grows;
+ * - "must_write" set only shrinks;
+ * - for each visited verifier state with zero branches, all relevant
+ * read and write marks are already recorded by the analysis instance.
+ *
+ * Technically, the analysis is facilitated by the following data structures:
+ * - Call chain: for given verifier state, the call chain is a tuple of call
+ * instruction indexes leading to the current subprogram plus the subprogram
+ * entry point index.
+ * - Function instance: for a given call chain, for each instruction in
+ * the current subprogram, a mapping between instruction index and a
+ * set of "may_read", "must_write" and other marks accumulated for this
+ * instruction.
+ * - A hash table mapping call chains to function instances.
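+ *
+ * Illustrative example (hypothetical three-insn subprogram, single
+ * frame, one stack slot "s"):
+ *   0: write s     must_write = {s}
+ *   1: read  s     may_read   = {s}
+ *   2: exit
+ * At the fixed point live_before(1) = {s} but live_before(0) = {},
+ * because the write at insn 0 screens the read at insn 1 from earlier
+ * states, while must_write_acc(0) ends up as {s}.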
+ */
+
+struct callchain {
+ u32 callsites[MAX_CALL_FRAMES]; /* instruction pointer for each frame */
+ /* cached subprog_info[*].start for functions owning the frames:
+ * - sp_starts[curframe] used to get insn relative index within current function;
+ * - sp_starts[0..curframe-1] used for fast callchain_frame_up().
+ */
+ u32 sp_starts[MAX_CALL_FRAMES];
+ u32 curframe; /* depth of callsites and sp_starts arrays */
+};
+
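+/*
+ * In the masks below bit N stands for the stack slot at offset
+ * -(N + 1) * 8 from the frame pointer, matching the convention used by
+ * bpf_fmt_stack_mask() when these masks are logged (e.g. mask 0x15
+ * covers slots fp-8, fp-24 and fp-40).
+ */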
+struct per_frame_masks {
+ u64 may_read; /* stack slots that may be read by this instruction */
+ u64 must_write; /* stack slots written by this instruction */
+ u64 must_write_acc; /* stack slots written by this instruction and its successors */
+ u64 live_before; /* stack slots that may be read by this insn and its successors */
+};
+
+/*
+ * A function instance created for a specific callchain.
+ * Encapsulates read and write marks for each instruction in the function.
+ * Marks are tracked for each frame in the callchain.
+ */
+struct func_instance {
+ struct hlist_node hl_node;
+ struct callchain callchain;
+ u32 insn_cnt; /* cached number of insns in the function */
+ bool updated;
+ bool must_write_dropped;
+ /* Per frame, per instruction masks, frames allocated lazily. */
+ struct per_frame_masks *frames[MAX_CALL_FRAMES];
+ /* For each instruction, a flag telling whether "must_write" has been initialized for it. */
+ bool *must_write_set;
+};
+
+struct live_stack_query {
+ struct func_instance *instances[MAX_CALL_FRAMES]; /* valid in range [0..curframe] */
+ u32 curframe;
+ u32 insn_idx;
+};
+
+struct bpf_liveness {
+ DECLARE_HASHTABLE(func_instances, 8); /* maps callchain to func_instance */
+ struct live_stack_query live_stack_query; /* cache to avoid repetitive ht lookups */
+ /* Cached instance corresponding to env->cur_state, avoids per-instruction ht lookup */
+ struct func_instance *cur_instance;
+ /*
+ * Below fields are used to accumulate stack write marks for instruction at
+ * @write_insn_idx before submitting the marks to @cur_instance.
+ */
+ u64 write_masks_acc[MAX_CALL_FRAMES];
+ u32 write_insn_idx;
+};
+
+/* Compute callchain corresponding to state @st at depth @frameno */
+static void compute_callchain(struct bpf_verifier_env *env, struct bpf_verifier_state *st,
+ struct callchain *callchain, u32 frameno)
+{
+ struct bpf_subprog_info *subprog_info = env->subprog_info;
+ u32 i;
+
+ memset(callchain, 0, sizeof(*callchain));
+ for (i = 0; i <= frameno; i++) {
+ callchain->sp_starts[i] = subprog_info[st->frame[i]->subprogno].start;
+ if (i < st->curframe)
+ callchain->callsites[i] = st->frame[i + 1]->callsite;
+ }
+ callchain->curframe = frameno;
+ callchain->callsites[callchain->curframe] = callchain->sp_starts[callchain->curframe];
+}
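+
+/*
+ * For example (hypothetical insn indexes): for a state where the main
+ * subprogram starting at insn 0 called a subprogram starting at insn 10
+ * from the call insn at index 3, compute_callchain() with frameno == 1
+ * yields callsites = {3, 10}, sp_starts = {0, 10}, curframe = 1.
+ */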
+
+static u32 hash_callchain(struct callchain *callchain)
+{
+ return jhash2(callchain->callsites, callchain->curframe, 0);
+}
+
+static bool same_callsites(struct callchain *a, struct callchain *b)
+{
+ int i;
+
+ if (a->curframe != b->curframe)
+ return false;
+ for (i = a->curframe; i >= 0; i--)
+ if (a->callsites[i] != b->callsites[i])
+ return false;
+ return true;
+}
+
+/*
+ * Find existing or allocate new function instance corresponding to @callchain.
+ * Instances are accumulated in env->liveness->func_instances and persist
+ * until the end of the verification process.
+ */
+static struct func_instance *__lookup_instance(struct bpf_verifier_env *env,
+ struct callchain *callchain)
+{
+ struct bpf_liveness *liveness = env->liveness;
+ struct bpf_subprog_info *subprog;
+ struct func_instance *result;
+ u32 subprog_sz, size, key;
+
+ key = hash_callchain(callchain);
+ hash_for_each_possible(liveness->func_instances, result, hl_node, key)
+ if (same_callsites(&result->callchain, callchain))
+ return result;
+
+ subprog = bpf_find_containing_subprog(env, callchain->sp_starts[callchain->curframe]);
+ subprog_sz = (subprog + 1)->start - subprog->start;
+ size = sizeof(struct func_instance);
+ result = kvzalloc(size, GFP_KERNEL_ACCOUNT);
+ if (!result)
+ return ERR_PTR(-ENOMEM);
+ result->must_write_set = kvcalloc(subprog_sz, sizeof(*result->must_write_set),
+ GFP_KERNEL_ACCOUNT);
+ if (!result->must_write_set)
+ return ERR_PTR(-ENOMEM);
+ memcpy(&result->callchain, callchain, sizeof(*callchain));
+ result->insn_cnt = subprog_sz;
+ hash_add(liveness->func_instances, &result->hl_node, key);
+ return result;
+}
+
+static struct func_instance *lookup_instance(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st,
+ u32 frameno)
+{
+ struct callchain callchain;
+
+ compute_callchain(env, st, &callchain, frameno);
+ return __lookup_instance(env, &callchain);
+}
+
+int bpf_stack_liveness_init(struct bpf_verifier_env *env)
+{
+ env->liveness = kvzalloc(sizeof(*env->liveness), GFP_KERNEL_ACCOUNT);
+ if (!env->liveness)
+ return -ENOMEM;
+ hash_init(env->liveness->func_instances);
+ return 0;
+}
+
+void bpf_stack_liveness_free(struct bpf_verifier_env *env)
+{
+ struct func_instance *instance;
+ struct hlist_node *tmp;
+ int bkt, i;
+
+ if (!env->liveness)
+ return;
+ hash_for_each_safe(env->liveness->func_instances, bkt, tmp, instance, hl_node) {
+ for (i = 0; i <= instance->callchain.curframe; i++)
+ kvfree(instance->frames[i]);
+ kvfree(instance->must_write_set);
+ kvfree(instance);
+ }
+ kvfree(env->liveness);
+}
+
+/*
+ * Convert absolute instruction index @insn_idx to an index relative
+ * to start of the function corresponding to @instance.
+ */
+static int relative_idx(struct func_instance *instance, u32 insn_idx)
+{
+ return insn_idx - instance->callchain.sp_starts[instance->callchain.curframe];
+}
+
+static struct per_frame_masks *get_frame_masks(struct func_instance *instance,
+ u32 frame, u32 insn_idx)
+{
+ if (!instance->frames[frame])
+ return NULL;
+
+ return &instance->frames[frame][relative_idx(instance, insn_idx)];
+}
+
+static struct per_frame_masks *alloc_frame_masks(struct bpf_verifier_env *env,
+ struct func_instance *instance,
+ u32 frame, u32 insn_idx)
+{
+ struct per_frame_masks *arr;
+
+ if (!instance->frames[frame]) {
+ arr = kvcalloc(instance->insn_cnt, sizeof(*arr), GFP_KERNEL_ACCOUNT);
+ instance->frames[frame] = arr;
+ if (!arr)
+ return ERR_PTR(-ENOMEM);
+ }
+ return get_frame_masks(instance, frame, insn_idx);
+}
+
+void bpf_reset_live_stack_callchain(struct bpf_verifier_env *env)
+{
+ env->liveness->cur_instance = NULL;
+}
+
+/* If @env->liveness->cur_instance is null, set it to instance corresponding to @env->cur_state. */
+static int ensure_cur_instance(struct bpf_verifier_env *env)
+{
+ struct bpf_liveness *liveness = env->liveness;
+ struct func_instance *instance;
+
+ if (liveness->cur_instance)
+ return 0;
+
+ instance = lookup_instance(env, env->cur_state, env->cur_state->curframe);
+ if (IS_ERR(instance))
+ return PTR_ERR(instance);
+
+ liveness->cur_instance = instance;
+ return 0;
+}
+
+/* Accumulate may_read masks for @frame at @insn_idx */
+static int mark_stack_read(struct bpf_verifier_env *env,
+ struct func_instance *instance, u32 frame, u32 insn_idx, u64 mask)
+{
+ struct per_frame_masks *masks;
+ u64 new_may_read;
+
+ masks = alloc_frame_masks(env, instance, frame, insn_idx);
+ if (IS_ERR(masks))
+ return PTR_ERR(masks);
+ new_may_read = masks->may_read | mask;
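+ /*
+ * Flag the instance for another propagation pass only if the added
+ * bits are not already reflected in the "live_before" set computed
+ * by previous passes.
+ */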
+ if (new_may_read != masks->may_read &&
+ ((new_may_read | masks->live_before) != masks->live_before))
+ instance->updated = true;
+ masks->may_read |= mask;
+ return 0;
+}
+
+int bpf_mark_stack_read(struct bpf_verifier_env *env, u32 frame, u32 insn_idx, u64 mask)
+{
+ int err;
+
+ err = ensure_cur_instance(env);
+ err = err ?: mark_stack_read(env, env->liveness->cur_instance, frame, insn_idx, mask);
+ return err;
+}
+
+static void reset_stack_write_marks(struct bpf_verifier_env *env,
+ struct func_instance *instance, u32 insn_idx)
+{
+ struct bpf_liveness *liveness = env->liveness;
+ int i;
+
+ liveness->write_insn_idx = insn_idx;
+ for (i = 0; i <= instance->callchain.curframe; i++)
+ liveness->write_masks_acc[i] = 0;
+}
+
+int bpf_reset_stack_write_marks(struct bpf_verifier_env *env, u32 insn_idx)
+{
+ struct bpf_liveness *liveness = env->liveness;
+ int err;
+
+ err = ensure_cur_instance(env);
+ if (err)
+ return err;
+
+ reset_stack_write_marks(env, liveness->cur_instance, insn_idx);
+ return 0;
+}
+
+void bpf_mark_stack_write(struct bpf_verifier_env *env, u32 frame, u64 mask)
+{
+ env->liveness->write_masks_acc[frame] |= mask;
+}
+
+static int commit_stack_write_marks(struct bpf_verifier_env *env,
+ struct func_instance *instance)
+{
+ struct bpf_liveness *liveness = env->liveness;
+ u32 idx, frame, curframe, old_must_write;
+ struct per_frame_masks *masks;
+ u64 mask;
+
+ if (!instance)
+ return 0;
+
+ curframe = instance->callchain.curframe;
+ idx = relative_idx(instance, liveness->write_insn_idx);
+ for (frame = 0; frame <= curframe; frame++) {
+ mask = liveness->write_masks_acc[frame];
+ /* avoid allocating frames for zero masks */
+ if (mask == 0 && !instance->must_write_set[idx])
+ continue;
+ masks = alloc_frame_masks(env, instance, frame, liveness->write_insn_idx);
+ if (IS_ERR(masks))
+ return PTR_ERR(masks);
+ old_must_write = masks->must_write;
+ /*
+ * If the instruction at this callchain is seen for the first time, set must_write equal
+ * to @mask. Otherwise take intersection with the previous value.
+ */
+ if (instance->must_write_set[idx])
+ mask &= old_must_write;
+ if (old_must_write != mask) {
+ masks->must_write = mask;
+ instance->updated = true;
+ }
+ if (old_must_write & ~mask)
+ instance->must_write_dropped = true;
+ }
+ instance->must_write_set[idx] = true;
+ liveness->write_insn_idx = 0;
+ return 0;
+}
+
+/*
+ * Merge stack write marks in @env->liveness->write_masks_acc
+ * with information already in @env->liveness->cur_instance.
+ */
+int bpf_commit_stack_write_marks(struct bpf_verifier_env *env)
+{
+ return commit_stack_write_marks(env, env->liveness->cur_instance);
+}
+
+static char *fmt_callchain(struct bpf_verifier_env *env, struct callchain *callchain)
+{
+ char *buf_end = env->tmp_str_buf + sizeof(env->tmp_str_buf);
+ char *buf = env->tmp_str_buf;
+ int i;
+
+ buf += snprintf(buf, buf_end - buf, "(");
+ for (i = 0; i <= callchain->curframe; i++)
+ buf += snprintf(buf, buf_end - buf, "%s%d", i ? "," : "", callchain->callsites[i]);
+ snprintf(buf, buf_end - buf, ")");
+ return env->tmp_str_buf;
+}
+
+static void log_mask_change(struct bpf_verifier_env *env, struct callchain *callchain,
+ char *pfx, u32 frame, u32 insn_idx, u64 old, u64 new)
+{
+ u64 changed_bits = old ^ new;
+ u64 new_ones = new & changed_bits;
+ u64 new_zeros = ~new & changed_bits;
+
+ if (!changed_bits)
+ return;
+ bpf_log(&env->log, "%s frame %d insn %d ", fmt_callchain(env, callchain), frame, insn_idx);
+ if (new_ones) {
+ bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_ones);
+ bpf_log(&env->log, "+%s %s ", pfx, env->tmp_str_buf);
+ }
+ if (new_zeros) {
+ bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_zeros);
+ bpf_log(&env->log, "-%s %s", pfx, env->tmp_str_buf);
+ }
+ bpf_log(&env->log, "\n");
+}
+
+int bpf_jmp_offset(struct bpf_insn *insn)
+{
+ u8 code = insn->code;
+
+ if (code == (BPF_JMP32 | BPF_JA))
+ return insn->imm;
+ return insn->off;
+}
+
+__diag_push();
+__diag_ignore_all("-Woverride-init", "Allow field initialization overrides for opcode_info_tbl");
+
+inline int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2])
+{
+ static const struct opcode_info {
+ bool can_jump;
+ bool can_fallthrough;
+ } opcode_info_tbl[256] = {
+ [0 ... 255] = {.can_jump = false, .can_fallthrough = true},
+ #define _J(code, ...) \
+ [BPF_JMP | code] = __VA_ARGS__, \
+ [BPF_JMP32 | code] = __VA_ARGS__
+
+ _J(BPF_EXIT, {.can_jump = false, .can_fallthrough = false}),
+ _J(BPF_JA, {.can_jump = true, .can_fallthrough = false}),
+ _J(BPF_JEQ, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JNE, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JLT, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JLE, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JGT, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JGE, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JSGT, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JSGE, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JSLT, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JSLE, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JCOND, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JSET, {.can_jump = true, .can_fallthrough = true}),
+ #undef _J
+ };
+ struct bpf_insn *insn = &prog->insnsi[idx];
+ const struct opcode_info *opcode_info;
+ int i = 0, insn_sz;
+
+ opcode_info = &opcode_info_tbl[BPF_CLASS(insn->code) | BPF_OP(insn->code)];
+ insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
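+ /*
+ * E.g. a conditional jump at index 7 with off == 5 gets succ = {8, 13};
+ * an unconditional BPF_JA gets only its jump target and BPF_EXIT gets
+ * no successors at all.
+ */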
+ if (opcode_info->can_fallthrough)
+ succ[i++] = idx + insn_sz;
+
+ if (opcode_info->can_jump)
+ succ[i++] = idx + bpf_jmp_offset(insn) + 1;
+
+ return i;
+}
+
+__diag_pop();
+
+static struct func_instance *get_outer_instance(struct bpf_verifier_env *env,
+ struct func_instance *instance)
+{
+ struct callchain callchain = instance->callchain;
+
+ /* Adjust @callchain to represent callchain one frame up */
+ callchain.callsites[callchain.curframe] = 0;
+ callchain.sp_starts[callchain.curframe] = 0;
+ callchain.curframe--;
+ callchain.callsites[callchain.curframe] = callchain.sp_starts[callchain.curframe];
+ return __lookup_instance(env, &callchain);
+}
+
+static u32 callchain_subprog_start(struct callchain *callchain)
+{
+ return callchain->sp_starts[callchain->curframe];
+}
+
+/*
+ * Transfer @may_read and @must_write_acc marks from the first instruction of @instance
+ * to the call instruction in the function instance calling @instance.
+ */
+static int propagate_to_outer_instance(struct bpf_verifier_env *env,
+ struct func_instance *instance)
+{
+ struct callchain *callchain = &instance->callchain;
+ u32 this_subprog_start, callsite, frame;
+ struct func_instance *outer_instance;
+ struct per_frame_masks *insn;
+ int err;
+
+ this_subprog_start = callchain_subprog_start(callchain);
+ outer_instance = get_outer_instance(env, instance);
+ callsite = callchain->callsites[callchain->curframe - 1];
+
+ reset_stack_write_marks(env, outer_instance, callsite);
+ for (frame = 0; frame < callchain->curframe; frame++) {
+ insn = get_frame_masks(instance, frame, this_subprog_start);
+ if (!insn)
+ continue;
+ bpf_mark_stack_write(env, frame, insn->must_write_acc);
+ err = mark_stack_read(env, outer_instance, frame, callsite, insn->live_before);
+ if (err)
+ return err;
+ }
+ commit_stack_write_marks(env, outer_instance);
+ return 0;
+}
+
+static inline bool update_insn(struct bpf_verifier_env *env,
+ struct func_instance *instance, u32 frame, u32 insn_idx)
+{
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
+ u64 new_before, new_after, must_write_acc;
+ struct per_frame_masks *insn, *succ_insn;
+ u32 succ_num, s, succ[2];
+ bool changed;
+
+ succ_num = bpf_insn_successors(env->prog, insn_idx, succ);
+ if (unlikely(succ_num == 0))
+ return false;
+
+ changed = false;
+ insn = get_frame_masks(instance, frame, insn_idx);
+ new_before = 0;
+ new_after = 0;
+ /*
+ * New "must_write_acc" is an intersection of all "must_write_acc"
+ * of successors plus all "must_write" slots of instruction itself.
+ */
+ must_write_acc = U64_MAX;
+ for (s = 0; s < succ_num; ++s) {
+ succ_insn = get_frame_masks(instance, frame, succ[s]);
+ new_after |= succ_insn->live_before;
+ must_write_acc &= succ_insn->must_write_acc;
+ }
+ must_write_acc |= insn->must_write;
+ /*
+ * New "live_before" is a union of all "live_before" of successors
+ * minus slots written by instruction plus slots read by instruction.
+ */
+ new_before = (new_after & ~insn->must_write) | insn->may_read;
+ changed |= new_before != insn->live_before;
+ changed |= must_write_acc != insn->must_write_acc;
+ if (unlikely(env->log.level & BPF_LOG_LEVEL2) &&
+ (insn->may_read || insn->must_write ||
+ insn_idx == callchain_subprog_start(&instance->callchain) ||
+ aux[insn_idx].prune_point)) {
+ log_mask_change(env, &instance->callchain, "live",
+ frame, insn_idx, insn->live_before, new_before);
+ log_mask_change(env, &instance->callchain, "written",
+ frame, insn_idx, insn->must_write_acc, must_write_acc);
+ }
+ insn->live_before = new_before;
+ insn->must_write_acc = must_write_acc;
+ return changed;
+}
+
+/* Fixed-point computation of @live_before and @must_write_acc marks */
+static int update_instance(struct bpf_verifier_env *env, struct func_instance *instance)
+{
+ u32 i, frame, po_start, po_end, cnt, this_subprog_start;
+ struct callchain *callchain = &instance->callchain;
+ int *insn_postorder = env->cfg.insn_postorder;
+ struct bpf_subprog_info *subprog;
+ struct per_frame_masks *insn;
+ bool changed;
+ int err;
+
+ this_subprog_start = callchain_subprog_start(callchain);
+ /*
+ * If must_write marks were updated, must_write_acc needs to be reset
+ * (to account for the case when the new must_write sets become smaller).
+ */
+ if (instance->must_write_dropped) {
+ for (frame = 0; frame <= callchain->curframe; frame++) {
+ if (!instance->frames[frame])
+ continue;
+
+ for (i = 0; i < instance->insn_cnt; i++) {
+ insn = get_frame_masks(instance, frame, this_subprog_start + i);
+ insn->must_write_acc = 0;
+ }
+ }
+ }
+
+ subprog = bpf_find_containing_subprog(env, this_subprog_start);
+ po_start = subprog->postorder_start;
+ po_end = (subprog + 1)->postorder_start;
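+ /*
+ * insn_postorder lists instructions in CFG post-order, so successors
+ * tend to be processed before their predecessors and this backward
+ * propagation usually converges in a few passes.
+ */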
+ cnt = 0;
+ /* repeat until fixed point is reached */
+ do {
+ cnt++;
+ changed = false;
+ for (frame = 0; frame <= instance->callchain.curframe; frame++) {
+ if (!instance->frames[frame])
+ continue;
+
+ for (i = po_start; i < po_end; i++)
+ changed |= update_insn(env, instance, frame, insn_postorder[i]);
+ }
+ } while (changed);
+
+ if (env->log.level & BPF_LOG_LEVEL2)
+ bpf_log(&env->log, "%s live stack update done in %d iterations\n",
+ fmt_callchain(env, callchain), cnt);
+
+ /* transfer marks accumulated for outer frames to outer func instance (caller) */
+ if (callchain->curframe > 0) {
+ err = propagate_to_outer_instance(env, instance);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/*
+ * Prepare all callchains within @env->cur_state for querying.
+ * This function should be called after each verifier.c:pop_stack()
+ * and whenever verifier.c:do_check_insn() processes subprogram exit.
+ * This would guarantee that visited verifier states with zero branches
+ * have their bpf_mark_stack_{read,write}() effects propagated in
+ * @env->liveness.
+ */
+int bpf_update_live_stack(struct bpf_verifier_env *env)
+{
+ struct func_instance *instance;
+ int err, frame;
+
+ bpf_reset_live_stack_callchain(env);
+ for (frame = env->cur_state->curframe; frame >= 0; --frame) {
+ instance = lookup_instance(env, env->cur_state, frame);
+ if (IS_ERR(instance))
+ return PTR_ERR(instance);
+
+ if (instance->updated) {
+ err = update_instance(env, instance);
+ if (err)
+ return err;
+ instance->updated = false;
+ instance->must_write_dropped = false;
+ }
+ }
+ return 0;
+}
+
+static bool is_live_before(struct func_instance *instance, u32 insn_idx, u32 frameno, u32 spi)
+{
+ struct per_frame_masks *masks;
+
+ masks = get_frame_masks(instance, frameno, insn_idx);
+ return masks && (masks->live_before & BIT(spi));
+}
+
+int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ struct live_stack_query *q = &env->liveness->live_stack_query;
+ struct func_instance *instance;
+ u32 frame;
+
+ memset(q, 0, sizeof(*q));
+ for (frame = 0; frame <= st->curframe; frame++) {
+ instance = lookup_instance(env, st, frame);
+ if (IS_ERR(instance))
+ return PTR_ERR(instance);
+ q->instances[frame] = instance;
+ }
+ q->curframe = st->curframe;
+ q->insn_idx = st->insn_idx;
+ return 0;
+}
+
+bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 spi)
+{
+ /*
+ * Slot is alive if it is live before q->insn_idx in the current func instance,
+ * or if, for some outer func instance, it is:
+ * - live before the callsite, if the callsite calls a callback; otherwise
+ * - live after the callsite (i.e. live before callsite + 1)
+ */
+ struct live_stack_query *q = &env->liveness->live_stack_query;
+ struct func_instance *instance, *curframe_instance;
+ u32 i, callsite;
+ bool alive;
+
+ curframe_instance = q->instances[q->curframe];
+ if (is_live_before(curframe_instance, q->insn_idx, frameno, spi))
+ return true;
+
+ for (i = frameno; i < q->curframe; i++) {
+ callsite = curframe_instance->callchain.callsites[i];
+ instance = q->instances[i];
+ alive = bpf_calls_callback(env, callsite)
+ ? is_live_before(instance, callsite, frameno, spi)
+ : is_live_before(instance, callsite + 1, frameno, spi);
+ if (alive)
+ return true;
+ }
+
+ return false;
+}
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 632d51b05fe9..c93a756e035c 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -165,7 +165,7 @@ static long cgroup_storage_update_elem(struct bpf_map *map, void *key,
}
new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size),
- __GFP_ZERO | GFP_NOWAIT | __GFP_NOWARN,
+ __GFP_ZERO | GFP_NOWAIT,
map->numa_node);
if (!new)
return -ENOMEM;
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 38050f4ee400..f50533169cc3 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -498,6 +498,8 @@ const char *dynptr_type_str(enum bpf_dynptr_type type)
return "skb";
case BPF_DYNPTR_TYPE_XDP:
return "xdp";
+ case BPF_DYNPTR_TYPE_SKB_META:
+ return "skb_meta";
case BPF_DYNPTR_TYPE_INVALID:
return "<invalid>";
default:
@@ -540,19 +542,6 @@ static char slot_type_char[] = {
[STACK_IRQ_FLAG] = 'f'
};
-static void print_liveness(struct bpf_verifier_env *env,
- enum bpf_reg_liveness live)
-{
- if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE))
- verbose(env, "_");
- if (live & REG_LIVE_READ)
- verbose(env, "r");
- if (live & REG_LIVE_WRITTEN)
- verbose(env, "w");
- if (live & REG_LIVE_DONE)
- verbose(env, "D");
-}
-
#define UNUM_MAX_DECIMAL U16_MAX
#define SNUM_MAX_DECIMAL S16_MAX
#define SNUM_MIN_DECIMAL S16_MIN
@@ -770,7 +759,6 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie
if (!print_all && !reg_scratched(env, i))
continue;
verbose(env, " R%d", i);
- print_liveness(env, reg->live);
verbose(env, "=");
print_reg_state(env, state, reg);
}
@@ -803,9 +791,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie
break;
types_buf[j] = '\0';
- verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
- print_liveness(env, reg->live);
- verbose(env, "=%s", types_buf);
+ verbose(env, " fp%d=%s", (-i - 1) * BPF_REG_SIZE, types_buf);
print_reg_state(env, state, reg);
break;
case STACK_DYNPTR:
@@ -814,7 +800,6 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie
reg = &state->stack[i].spilled_ptr;
verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
- print_liveness(env, reg->live);
verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type));
if (reg->id)
verbose_a("id=%d", reg->id);
@@ -829,9 +814,8 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie
if (!reg->ref_obj_id)
continue;
- verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
- print_liveness(env, reg->live);
- verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)",
+ verbose(env, " fp%d=iter_%s(ref_id=%d,state=%s,depth=%u)",
+ (-i - 1) * BPF_REG_SIZE,
iter_type_str(reg->iter.btf, reg->iter.btf_id),
reg->ref_obj_id, iter_state_str(reg->iter.state),
reg->iter.depth);
@@ -839,9 +823,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie
case STACK_MISC:
case STACK_ZERO:
default:
- verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
- print_liveness(env, reg->live);
- verbose(env, "=%s", types_buf);
+ verbose(env, " fp%d=%s", (-i - 1) * BPF_REG_SIZE, types_buf);
break;
}
}
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 889374722d0a..bd45dda9dc35 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -736,7 +736,7 @@ static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
/* Defer barriers into worker to let the rest of map memory to be freed */
memset(ma, 0, sizeof(*ma));
INIT_WORK(&copy->work, free_mem_alloc_deferred);
- queue_work(system_unbound_wq, &copy->work);
+ queue_work(system_dfl_wq, &copy->work);
}
void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
index 5ab354d55d82..a00561b1d3e5 100644
--- a/kernel/bpf/rqspinlock.c
+++ b/kernel/bpf/rqspinlock.c
@@ -471,7 +471,7 @@ queue:
* any MCS node. This is not the most elegant solution, but is
* simple enough.
*/
- if (unlikely(idx >= _Q_MAX_NODES)) {
+ if (unlikely(idx >= _Q_MAX_NODES || in_nmi())) {
lockevent_inc(lock_no_node);
RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
while (!queued_spin_trylock(lock)) {
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 3615c06b7dfa..4d53cdd1374c 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -314,7 +314,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
if (max_depth > sysctl_perf_event_max_stack)
max_depth = sysctl_perf_event_max_stack;
- trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
+ trace = get_perf_callchain(regs, kernel, user, max_depth,
false, false);
if (unlikely(!trace))
@@ -451,7 +451,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
else if (kernel && task)
trace = get_callchain_entry_for_task(task, max_depth);
else
- trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
+ trace = get_perf_callchain(regs, kernel, user, max_depth,
crosstask, false);
if (unlikely(!trace) || trace->nr < skip) {
@@ -646,7 +646,15 @@ static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
}
/* Called from syscall */
-int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
+static int stack_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ return bpf_stackmap_extract(map, key, value, true);
+}
+
+/* Called from syscall */
+int bpf_stackmap_extract(struct bpf_map *map, void *key, void *value,
+ bool delete)
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *bucket, *old_bucket;
@@ -663,7 +671,10 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
memcpy(value, bucket->data, trace_len);
memset(value + trace_len, 0, map->value_size - trace_len);
- old_bucket = xchg(&smap->buckets[id], bucket);
+ if (delete)
+ old_bucket = bucket;
+ else
+ old_bucket = xchg(&smap->buckets[id], bucket);
if (old_bucket)
pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
return 0;
@@ -754,6 +765,7 @@ const struct bpf_map_ops stack_trace_map_ops = {
.map_free = stack_map_free,
.map_get_next_key = stack_map_get_next_key,
.map_lookup_elem = stack_map_lookup_elem,
+ .map_lookup_and_delete_elem = stack_map_lookup_and_delete_elem,
.map_update_elem = stack_map_update_elem,
.map_delete_elem = stack_map_delete_elem,
.map_check_btf = map_check_no_btf,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0fbfa8532c39..a48fa86f82a7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*/
+#include <crypto/sha2.h>
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
#include <linux/bpf_trace.h>
@@ -38,6 +39,7 @@
#include <linux/tracepoint.h>
#include <linux/overflow.h>
#include <linux/cookie.h>
+#include <linux/verification.h>
#include <net/netfilter/nf_bpf_link.h>
#include <net/netkit.h>
@@ -318,7 +320,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
err = bpf_percpu_cgroup_storage_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
- err = bpf_stackmap_copy(map, key, value);
+ err = bpf_stackmap_extract(map, key, value, false);
} else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
err = bpf_fd_array_map_lookup_elem(map, key, value);
} else if (IS_FD_HASH(map)) {
@@ -672,6 +674,7 @@ void btf_record_free(struct btf_record *rec)
case BPF_TIMER:
case BPF_REFCOUNT:
case BPF_WORKQUEUE:
+ case BPF_TASK_WORK:
/* Nothing to release */
break;
default:
@@ -725,6 +728,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
case BPF_TIMER:
case BPF_REFCOUNT:
case BPF_WORKQUEUE:
+ case BPF_TASK_WORK:
/* Nothing to acquire */
break;
default:
@@ -783,6 +787,13 @@ void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj)
bpf_wq_cancel_and_free(obj + rec->wq_off);
}
+void bpf_obj_free_task_work(const struct btf_record *rec, void *obj)
+{
+ if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TASK_WORK)))
+ return;
+ bpf_task_work_cancel_and_free(obj + rec->task_work_off);
+}
+
void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
{
const struct btf_field *fields;
@@ -807,6 +818,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
case BPF_WORKQUEUE:
bpf_wq_cancel_and_free(field_ptr);
break;
+ case BPF_TASK_WORK:
+ bpf_task_work_cancel_and_free(field_ptr);
+ break;
case BPF_KPTR_UNREF:
WRITE_ONCE(*(u64 *)field_ptr, 0);
break;
@@ -860,6 +874,7 @@ static void bpf_map_free(struct bpf_map *map)
* the free of values or special fields allocated from bpf memory
* allocator.
*/
+ kfree(map->excl_prog_sha);
migrate_disable();
map->ops->map_free(map);
migrate_enable();
@@ -905,7 +920,7 @@ static void bpf_map_free_in_work(struct bpf_map *map)
/* Avoid spawning kworkers, since they all might contend
* for the same mutex like slab_mutex.
*/
- queue_work(system_unbound_wq, &map->work);
+ queue_work(system_dfl_wq, &map->work);
}
static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
@@ -1237,7 +1252,8 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
map->record = btf_parse_fields(btf, value_type,
BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
- BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR,
+ BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR |
+ BPF_TASK_WORK,
map->value_size);
if (!IS_ERR_OR_NULL(map->record)) {
int i;
@@ -1269,6 +1285,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
break;
case BPF_TIMER:
case BPF_WORKQUEUE:
+ case BPF_TASK_WORK:
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY) {
@@ -1338,9 +1355,9 @@ static bool bpf_net_capable(void)
return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
}
-#define BPF_MAP_CREATE_LAST_FIELD map_token_fd
+#define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size
/* called via syscall */
-static int map_create(union bpf_attr *attr, bool kernel)
+static int map_create(union bpf_attr *attr, bpfptr_t uattr)
{
const struct bpf_map_ops *ops;
struct bpf_token *token = NULL;
@@ -1534,7 +1551,29 @@ static int map_create(union bpf_attr *attr, bool kernel)
attr->btf_vmlinux_value_type_id;
}
- err = security_bpf_map_create(map, attr, token, kernel);
+ if (attr->excl_prog_hash) {
+ bpfptr_t uprog_hash = make_bpfptr(attr->excl_prog_hash, uattr.is_kernel);
+
+ if (attr->excl_prog_hash_size != SHA256_DIGEST_SIZE) {
+ err = -EINVAL;
+ goto free_map;
+ }
+
+ map->excl_prog_sha = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
+ if (!map->excl_prog_sha) {
+ err = -ENOMEM;
+ goto free_map;
+ }
+
+ if (copy_from_bpfptr(map->excl_prog_sha, uprog_hash, SHA256_DIGEST_SIZE)) {
+ err = -EFAULT;
+ goto free_map;
+ }
+ } else if (attr->excl_prog_hash_size) {
+ return -EINVAL;
+ }
+
+ err = security_bpf_map_create(map, attr, token, uattr.is_kernel);
if (err)
goto free_map_sec;
@@ -1627,7 +1666,8 @@ struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map)
}
EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
-int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
+int __weak bpf_stackmap_extract(struct bpf_map *map, void *key, void *value,
+ bool delete)
{
return -ENOTSUPP;
}
@@ -2158,7 +2198,8 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
} else if (map->map_type == BPF_MAP_TYPE_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
if (!bpf_map_is_offloaded(map)) {
bpf_disable_instrumentation();
rcu_read_lock();
@@ -2761,8 +2802,44 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
}
}
+static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr,
+ bool is_kernel)
+{
+ bpfptr_t usig = make_bpfptr(attr->signature, is_kernel);
+ struct bpf_dynptr_kern sig_ptr, insns_ptr;
+ struct bpf_key *key = NULL;
+ void *sig;
+ int err = 0;
+
+ if (system_keyring_id_check(attr->keyring_id) == 0)
+ key = bpf_lookup_system_key(attr->keyring_id);
+ else
+ key = bpf_lookup_user_key(attr->keyring_id, 0);
+
+ if (!key)
+ return -EINVAL;
+
+ sig = kvmemdup_bpfptr(usig, attr->signature_size);
+ if (IS_ERR(sig)) {
+ bpf_key_put(key);
+ return -ENOMEM;
+ }
+
+ bpf_dynptr_init(&sig_ptr, sig, BPF_DYNPTR_TYPE_LOCAL, 0,
+ attr->signature_size);
+ bpf_dynptr_init(&insns_ptr, prog->insnsi, BPF_DYNPTR_TYPE_LOCAL, 0,
+ prog->len * sizeof(struct bpf_insn));
+
+ err = bpf_verify_pkcs7_signature((struct bpf_dynptr *)&insns_ptr,
+ (struct bpf_dynptr *)&sig_ptr, key);
+
+ bpf_key_put(key);
+ kvfree(sig);
+ return err;
+}
+
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD fd_array_cnt
+#define BPF_PROG_LOAD_LAST_FIELD keyring_id
static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
@@ -2926,6 +3003,12 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
/* eBPF programs must be GPL compatible to use GPL-ed functions */
prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0;
+ if (attr->signature) {
+ err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel);
+ if (err)
+ goto free_prog;
+ }
+
prog->orig_prog = NULL;
prog->jited = 0;
@@ -5161,6 +5244,9 @@ static int bpf_map_get_info_by_fd(struct file *file,
info_len = min_t(u32, sizeof(info), info_len);
memset(&info, 0, sizeof(info));
+ if (copy_from_user(&info, uinfo, info_len))
+ return -EFAULT;
+
info.type = map->map_type;
info.id = map->id;
info.key_size = map->key_size;
@@ -5185,6 +5271,25 @@ static int bpf_map_get_info_by_fd(struct file *file,
return err;
}
+ if (info.hash) {
+ char __user *uhash = u64_to_user_ptr(info.hash);
+
+ if (!map->ops->map_get_hash)
+ return -EINVAL;
+
+ if (info.hash_size != SHA256_DIGEST_SIZE)
+ return -EINVAL;
+
+ err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha);
+ if (err != 0)
+ return err;
+
+ if (copy_to_user(uhash, map->sha, SHA256_DIGEST_SIZE) != 0)
+ return -EFAULT;
+ } else if (info.hash_size) {
+ return -EINVAL;
+ }
+
if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
return -EFAULT;
@@ -6008,7 +6113,7 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
switch (cmd) {
case BPF_MAP_CREATE:
- err = map_create(&attr, uattr.is_kernel);
+ err = map_create(&attr, uattr);
break;
case BPF_MAP_LOOKUP_ELEM:
err = map_lookup_elem(&attr);
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index fa353c5d550f..f8e70e9c3998 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -116,31 +116,55 @@ struct tnum tnum_xor(struct tnum a, struct tnum b)
return TNUM(v & ~mu, mu);
}
-/* Generate partial products by multiplying each bit in the multiplier (tnum a)
- * with the multiplicand (tnum b), and add the partial products after
- * appropriately bit-shifting them. Instead of directly performing tnum addition
- * on the generated partial products, equivalenty, decompose each partial
- * product into two tnums, consisting of the value-sum (acc_v) and the
- * mask-sum (acc_m) and then perform tnum addition on them. The following paper
- * explains the algorithm in more detail: https://arxiv.org/abs/2105.05398.
+/* Perform long multiplication, iterating through the bits in a using rshift:
+ * - if LSB(a) is a known 0, keep current accumulator
+ * - if LSB(a) is a known 1, add b to current accumulator
+ * - if LSB(a) is unknown, take a union of the above cases.
+ *
+ * For example:
+ *
+ * acc_0: acc_1:
+ *
+ * 11 * -> 11 * -> 11 * -> union(0011, 1001) == x0x1
+ * x1 01 11
+ * ------ ------ ------
+ * 11 11 11
+ * xx 00 11
+ * ------ ------ ------
+ * ???? 0011 1001
*/
struct tnum tnum_mul(struct tnum a, struct tnum b)
{
- u64 acc_v = a.value * b.value;
- struct tnum acc_m = TNUM(0, 0);
+ struct tnum acc = TNUM(0, 0);
while (a.value || a.mask) {
/* LSB of tnum a is a certain 1 */
if (a.value & 1)
- acc_m = tnum_add(acc_m, TNUM(0, b.mask));
+ acc = tnum_add(acc, b);
/* LSB of tnum a is uncertain */
- else if (a.mask & 1)
- acc_m = tnum_add(acc_m, TNUM(0, b.value | b.mask));
+ else if (a.mask & 1) {
+ /* acc = tnum_union(acc_0, acc_1), where acc_0 and
+ * acc_1 are partial accumulators for cases
+ * LSB(a) = certain 0 and LSB(a) = certain 1.
+ * acc_0 = acc + 0 * b = acc.
+ * acc_1 = acc + 1 * b = tnum_add(acc, b).
+ */
+
+ acc = tnum_union(acc, tnum_add(acc, b));
+ }
/* Note: no case for LSB is certain 0 */
a = tnum_rshift(a, 1);
b = tnum_lshift(b, 1);
}
- return tnum_add(TNUM(acc_v, 0), acc_m);
+ return acc;
+}
+
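+/* Returns true iff some concrete value is representable by both a and b,
+ * i.e. iff the bits known in both tnums agree. E.g. x1 and 01 overlap
+ * (both admit 01), while 10 and 01 do not.
+ */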
+bool tnum_overlap(struct tnum a, struct tnum b)
+{
+ u64 mu;
+
+ mu = ~a.mask & ~b.mask;
+ return (a.value & mu) == (b.value & mu);
}
/* Note that if a and b disagree - i.e. one has a 'known 1' where the other has
@@ -155,6 +179,19 @@ struct tnum tnum_intersect(struct tnum a, struct tnum b)
return TNUM(v & ~mu, mu);
}
+/* Returns a tnum with the uncertainty from both a and b, and in addition, new
+ * uncertainty at any position where a and b disagree. This represents a
+ * superset of the union of the concrete sets of both a and b. Despite the
+ * overapproximation, it is optimal.
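+ *
+ * E.g. (in binary) tnum_union(0011, 1001): v = 0011 & 1001 = 0001,
+ * mu = (0011 ^ 1001) | 0 | 0 = 1010, giving x0x1, the same union that
+ * appears in the tnum_mul() comment above.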
+ */
+struct tnum tnum_union(struct tnum a, struct tnum b)
+{
+ u64 v = a.value & b.value;
+ u64 mu = (a.value ^ b.value) | a.mask | b.mask;
+
+ return TNUM(v & ~mu, mu);
+}
+
struct tnum tnum_cast(struct tnum a, u8 size)
{
a.value &= (1ULL << (size * 8)) - 1;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 0e364614c3a2..5949095e51c3 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -899,8 +899,7 @@ static __always_inline u64 notrace bpf_prog_start_time(void)
static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
__acquires(RCU)
{
- rcu_read_lock();
- migrate_disable();
+ rcu_read_lock_dont_migrate();
run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
@@ -949,8 +948,7 @@ static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start,
update_prog_stats(prog, start);
this_cpu_dec(*(prog->active));
- migrate_enable();
- rcu_read_unlock();
+ rcu_read_unlock_migrate();
}
static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
@@ -960,8 +958,7 @@ static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
/* Runtime stats are exported via actual BPF_LSM_CGROUP
* programs, not the shims.
*/
- rcu_read_lock();
- migrate_disable();
+ rcu_read_lock_dont_migrate();
run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
@@ -974,8 +971,7 @@ static void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
{
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
- migrate_enable();
- rcu_read_unlock();
+ rcu_read_unlock_migrate();
}
u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
@@ -1033,8 +1029,7 @@ static u64 notrace __bpf_prog_enter(struct bpf_prog *prog,
struct bpf_tramp_run_ctx *run_ctx)
__acquires(RCU)
{
- rcu_read_lock();
- migrate_disable();
+ rcu_read_lock_dont_migrate();
run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
@@ -1048,8 +1043,7 @@ static void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start,
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
update_prog_stats(prog, start);
- migrate_enable();
- rcu_read_unlock();
+ rcu_read_unlock_migrate();
}
void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c4f69a9e9af6..73bba397672a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -674,6 +674,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
return BPF_DYNPTR_TYPE_SKB;
case DYNPTR_TYPE_XDP:
return BPF_DYNPTR_TYPE_XDP;
+ case DYNPTR_TYPE_SKB_META:
+ return BPF_DYNPTR_TYPE_SKB_META;
default:
return BPF_DYNPTR_TYPE_INVALID;
}
@@ -690,6 +692,8 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
return DYNPTR_TYPE_SKB;
case BPF_DYNPTR_TYPE_XDP:
return DYNPTR_TYPE_XDP;
+ case BPF_DYNPTR_TYPE_SKB_META:
+ return DYNPTR_TYPE_SKB_META;
default:
return 0;
}
@@ -783,8 +787,7 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
state->stack[spi - 1].spilled_ptr.ref_obj_id = id;
}
- state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
- state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi));
return 0;
}
@@ -801,29 +804,7 @@ static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_stat
__mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
__mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
- /* Why do we need to set REG_LIVE_WRITTEN for STACK_INVALID slot?
- *
- * While we don't allow reading STACK_INVALID, it is still possible to
- * do <8 byte writes marking some but not all slots as STACK_MISC. Then,
- * helpers or insns can do partial read of that part without failing,
- * but check_stack_range_initialized, check_stack_read_var_off, and
- * check_stack_read_fixed_off will do mark_reg_read for all 8-bytes of
- * the slot conservatively. Hence we need to prevent those liveness
- * marking walks.
- *
- * This was not a problem before because STACK_INVALID is only set by
- * default (where the default reg state has its reg->parent as NULL), or
- * in clean_live_states after REG_LIVE_DONE (at which point
- * mark_reg_read won't walk reg->parent chain), but not randomly during
- * verifier state exploration (like we did above). Hence, for our case
- * parentage chain will still be live (i.e. reg->parent may be
- * non-NULL), while earlier reg->parent was NULL, so we need
- * REG_LIVE_WRITTEN to screen off read marker propagation when it is
- * done later on reads or by mark_dynptr_read as well to unnecessary
- * mark registers in verifier state.
- */
- state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
- state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi));
}
static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
@@ -932,9 +913,7 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
__mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
- /* Same reason as unmark_stack_slots_dynptr above */
- state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
- state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi));
return 0;
}
@@ -1052,7 +1031,6 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
else
st->type |= PTR_UNTRUSTED;
}
- st->live |= REG_LIVE_WRITTEN;
st->ref_obj_id = i == 0 ? id : 0;
st->iter.btf = btf;
st->iter.btf_id = btf_id;
@@ -1062,6 +1040,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
for (j = 0; j < BPF_REG_SIZE; j++)
slot->slot_type[j] = STACK_ITER;
+ bpf_mark_stack_write(env, state->frameno, BIT(spi - i));
mark_stack_slot_scratched(env, spi - i);
}
@@ -1087,12 +1066,10 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
__mark_reg_not_init(env, st);
- /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */
- st->live |= REG_LIVE_WRITTEN;
-
for (j = 0; j < BPF_REG_SIZE; j++)
slot->slot_type[j] = STACK_INVALID;
+ bpf_mark_stack_write(env, state->frameno, BIT(spi - i));
mark_stack_slot_scratched(env, spi - i);
}
@@ -1182,9 +1159,9 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env,
slot = &state->stack[spi];
st = &slot->spilled_ptr;
+ bpf_mark_stack_write(env, reg->frameno, BIT(spi));
__mark_reg_known_zero(st);
st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
- st->live |= REG_LIVE_WRITTEN;
st->ref_obj_id = id;
st->irq.kfunc_class = kfunc_class;
@@ -1238,8 +1215,7 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r
__mark_reg_not_init(env, st);
- /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */
- st->live |= REG_LIVE_WRITTEN;
+ bpf_mark_stack_write(env, reg->frameno, BIT(spi));
for (i = 0; i < BPF_REG_SIZE; i++)
slot->slot_type[i] = STACK_INVALID;
@@ -1754,6 +1730,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
return err;
dst_state->speculative = src->speculative;
dst_state->in_sleepable = src->in_sleepable;
+ dst_state->cleaned = src->cleaned;
dst_state->curframe = src->curframe;
dst_state->branches = src->branches;
dst_state->parent = src->parent;
@@ -1946,9 +1923,24 @@ static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_stat
return 0;
visit = scc_visit_lookup(env, callchain);
if (!visit) {
- verifier_bug(env, "scc exit: no visit info for call chain %s",
- format_callchain(env, callchain));
- return -EFAULT;
+ /*
+ * If path traversal stops inside an SCC, corresponding bpf_scc_visit
+ * must exist for non-speculative paths. For non-speculative paths
+ * traversal stops when:
+ * a. Verification error is found, maybe_exit_scc() is not called.
+ * b. Top level BPF_EXIT is reached. Top level BPF_EXIT is not a member
+ * of any SCC.
+ * c. A checkpoint is reached and matched. Checkpoints are created by
+ * is_state_visited(), which calls maybe_enter_scc(), which allocates
+ * bpf_scc_visit instances for checkpoints within SCCs.
+ * (c) is the only case that can reach this point.
+ */
+ if (!st->speculative) {
+ verifier_bug(env, "scc exit: no visit info for call chain %s",
+ format_callchain(env, callchain));
+ return -EFAULT;
+ }
+ return 0;
}
if (visit->entry_state != st)
return 0;
@@ -2017,7 +2009,7 @@ static void free_backedges(struct bpf_scc_visit *visit)
for (backedge = visit->backedges; backedge; backedge = next) {
free_verifier_state(&backedge->state, false);
next = backedge->next;
- kvfree(backedge);
+ kfree(backedge);
}
visit->backedges = NULL;
}
@@ -2232,10 +2224,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
/* transfer reg's id which is unique for every map_lookup_elem
* as UID of the inner map.
*/
- if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER))
- reg->map_uid = reg->id;
- if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE))
+ if (btf_record_has_field(map->inner_map_meta->record,
+ BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
reg->map_uid = reg->id;
+ }
} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
reg->type = PTR_TO_XDP_SOCK;
} else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
@@ -2274,7 +2266,8 @@ static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg)
{
return base_type(reg->type) == PTR_TO_MEM &&
- (reg->type & DYNPTR_TYPE_SKB || reg->type & DYNPTR_TYPE_XDP);
+ (reg->type &
+ (DYNPTR_TYPE_SKB | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META));
}
/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
@@ -2873,8 +2866,6 @@ static void init_reg_state(struct bpf_verifier_env *env,
for (i = 0; i < MAX_BPF_REG; i++) {
mark_reg_not_init(env, regs, i);
- regs[i].live = REG_LIVE_NONE;
- regs[i].parent = NULL;
regs[i].subreg_def = DEF_NOT_SUBREG;
}
@@ -2958,7 +2949,7 @@ static int cmp_subprogs(const void *a, const void *b)
}
/* Find subprogram that contains instruction at 'off' */
-static struct bpf_subprog_info *find_containing_subprog(struct bpf_verifier_env *env, int off)
+struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off)
{
struct bpf_subprog_info *vals = env->subprog_info;
int l, r, m;
@@ -2983,7 +2974,7 @@ static int find_subprog(struct bpf_verifier_env *env, int off)
{
struct bpf_subprog_info *p;
- p = find_containing_subprog(env, off);
+ p = bpf_find_containing_subprog(env, off);
if (!p || p->start != off)
return -ENOENT;
return p - env->subprog_info;
@@ -3494,15 +3485,6 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
return 0;
}
-static int jmp_offset(struct bpf_insn *insn)
-{
- u8 code = insn->code;
-
- if (code == (BPF_JMP32 | BPF_JA))
- return insn->imm;
- return insn->off;
-}
-
static int check_subprogs(struct bpf_verifier_env *env)
{
int i, subprog_start, subprog_end, off, cur_subprog = 0;
@@ -3529,7 +3511,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
goto next;
if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
goto next;
- off = i + jmp_offset(&insn[i]) + 1;
+ off = i + bpf_jmp_offset(&insn[i]) + 1;
if (off < subprog_start || off >= subprog_end) {
verbose(env, "jump out of range from insn %d to %d\n", i, off);
return -EINVAL;
@@ -3555,69 +3537,15 @@ next:
return 0;
}
-/* Parentage chain of this register (or stack slot) should take care of all
- * issues like callee-saved registers, stack slot allocation time, etc.
- */
-static int mark_reg_read(struct bpf_verifier_env *env,
- const struct bpf_reg_state *state,
- struct bpf_reg_state *parent, u8 flag)
-{
- bool writes = parent == state->parent; /* Observe write marks */
- int cnt = 0;
-
- while (parent) {
- /* if read wasn't screened by an earlier write ... */
- if (writes && state->live & REG_LIVE_WRITTEN)
- break;
- if (verifier_bug_if(parent->live & REG_LIVE_DONE, env,
- "type %s var_off %lld off %d",
- reg_type_str(env, parent->type),
- parent->var_off.value, parent->off))
- return -EFAULT;
- /* The first condition is more likely to be true than the
- * second, checked it first.
- */
- if ((parent->live & REG_LIVE_READ) == flag ||
- parent->live & REG_LIVE_READ64)
- /* The parentage chain never changes and
- * this parent was already marked as LIVE_READ.
- * There is no need to keep walking the chain again and
- * keep re-marking all parents as LIVE_READ.
- * This case happens when the same register is read
- * multiple times without writes into it in-between.
- * Also, if parent has the stronger REG_LIVE_READ64 set,
- * then no need to set the weak REG_LIVE_READ32.
- */
- break;
- /* ... then we depend on parent's value */
- parent->live |= flag;
- /* REG_LIVE_READ64 overrides REG_LIVE_READ32. */
- if (flag == REG_LIVE_READ64)
- parent->live &= ~REG_LIVE_READ32;
- state = parent;
- parent = state->parent;
- writes = true;
- cnt++;
- }
-
- if (env->longest_mark_read_walk < cnt)
- env->longest_mark_read_walk = cnt;
- return 0;
-}
-
static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
int spi, int nr_slots)
{
- struct bpf_func_state *state = func(env, reg);
int err, i;
for (i = 0; i < nr_slots; i++) {
- struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr;
-
- err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64);
+ err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi - i));
if (err)
return err;
-
mark_stack_slot_scratched(env, spi - i);
}
return 0;
@@ -3663,7 +3591,7 @@ static int mark_irq_flag_read(struct bpf_verifier_env *env, struct bpf_reg_state
* code only. It returns TRUE if the source or destination register operates
* on 64-bit, otherwise return FALSE.
*/
-static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
+static bool is_reg64(struct bpf_insn *insn,
u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t)
{
u8 code, class, op;
@@ -3774,14 +3702,14 @@ static int insn_def_regno(const struct bpf_insn *insn)
}
/* Return TRUE if INSN has defined any 32-bit value explicitly. */
-static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn)
+static bool insn_has_def32(struct bpf_insn *insn)
{
int dst_reg = insn_def_regno(insn);
if (dst_reg == -1)
return false;
- return !is_reg64(env, insn, dst_reg, NULL, DST_OP);
+ return !is_reg64(insn, dst_reg, NULL, DST_OP);
}
static void mark_insn_zext(struct bpf_verifier_env *env,
@@ -3812,7 +3740,7 @@ static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *r
mark_reg_scratched(env, regno);
reg = &regs[regno];
- rw64 = is_reg64(env, insn, regno, reg, t);
+ rw64 = is_reg64(insn, regno, reg, t);
if (t == SRC_OP) {
/* check whether register used as source operand can be read */
if (reg->type == NOT_INIT) {
@@ -3826,15 +3754,13 @@ static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *r
if (rw64)
mark_insn_zext(env, reg);
- return mark_reg_read(env, reg, reg->parent,
- rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32);
+ return 0;
} else {
/* check whether register used as dest operand can be written to */
if (regno == BPF_REG_FP) {
verbose(env, "frame pointer is read only\n");
return -EACCES;
}
- reg->live |= REG_LIVE_WRITTEN;
reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1;
if (t == DST_OP)
mark_reg_unknown(env, regs, regno);
@@ -4195,7 +4121,7 @@ static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask)
}
}
/* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask */
-static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
+void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
{
DECLARE_BITMAP(mask, 64);
bool first = true;
@@ -4250,8 +4176,6 @@ static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_histo
}
}
-static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);
-
/* For given verifier state backtrack_insn() is called from the last insn to
* the first insn. Its purpose is to compute a bitmask of registers and
* stack slots that needs precision in the parent verifier state.
@@ -4278,7 +4202,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt));
verbose(env, "mark_precise: frame%d: regs=%s ",
bt->frame, env->tmp_str_buf);
- fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt));
+ bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt));
verbose(env, "stack=%s before ", env->tmp_str_buf);
verbose(env, "%d: ", idx);
verbose_insn(env, insn);
@@ -4479,7 +4403,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* backtracking, as these registers are set by the function
* invoking callback.
*/
- if (subseq_idx >= 0 && calls_callback(env, subseq_idx))
+ if (subseq_idx >= 0 && bpf_calls_callback(env, subseq_idx))
for (i = BPF_REG_1; i <= BPF_REG_5; i++)
bt_clear_reg(bt, i);
if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
@@ -4918,7 +4842,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env,
bt_frame_reg_mask(bt, fr));
verbose(env, "mark_precise: frame%d: parent state regs=%s ",
fr, env->tmp_str_buf);
- fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
+ bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
bt_frame_stack_mask(bt, fr));
verbose(env, "stack=%s: ", env->tmp_str_buf);
print_verifier_state(env, st, fr, true);
@@ -5041,12 +4965,7 @@ static void assign_scalar_id_before_mov(struct bpf_verifier_env *env,
/* Copy src state preserving dst->parent and dst->live fields */
static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src)
{
- struct bpf_reg_state *parent = dst->parent;
- enum bpf_reg_liveness live = dst->live;
-
*dst = *src;
- dst->parent = parent;
- dst->live = live;
}
static void save_register_state(struct bpf_verifier_env *env,
@@ -5057,8 +4976,6 @@ static void save_register_state(struct bpf_verifier_env *env,
int i;
copy_register_state(&state->stack[spi].spilled_ptr, reg);
- if (size == BPF_REG_SIZE)
- state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--)
state->stack[spi].slot_type[i - 1] = STACK_SPILL;
@@ -5152,6 +5069,18 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
if (err)
return err;
+ if (!(off % BPF_REG_SIZE) && size == BPF_REG_SIZE) {
+ /* only mark the slot as written if all 8 bytes were written
+ * otherwise read propagation may incorrectly stop too soon
+ * when stack slots are partially written.
+ * This heuristic means that read propagation will be
+ * conservative, since it will add reg_live_read marks
+	 * to stack slots all the way to the first state when a program
+	 * writes+reads less than 8 bytes.
+ */
+ bpf_mark_stack_write(env, state->frameno, BIT(spi));
+ }
+
check_fastcall_stack_contract(env, state, insn_idx, off);
mark_stack_slot_scratched(env, spi);
if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) {
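For reference, a minimal standalone sketch (not taken from this patch) of how a fixed, frame-relative stack offset maps to the slot index and the one-bit-per-slot mask passed to bpf_mark_stack_write() above; it matches the "-8,-24,-40" for 0x15 convention documented at bpf_fmt_stack_mask():

#include <stdio.h>

/* illustrative only: mirrors the verifier's slot-index (spi) convention,
 * with BPF_REG_SIZE == 8 and 'off' a negative frame-relative offset
 */
static unsigned long long stack_slot_mask(int off)
{
	int spi = (-off - 1) / 8;	/* -8 -> slot 0, -16 -> slot 1, ... */

	return 1ULL << spi;		/* one bit per 8-byte stack slot */
}

int main(void)
{
	printf("off -8  -> %#llx\n", stack_slot_mask(-8));	/* 0x1 */
	printf("off -24 -> %#llx\n", stack_slot_mask(-24));	/* 0x4 */
	printf("off -40 -> %#llx\n", stack_slot_mask(-40));	/* 0x10 */
	return 0;
}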
@@ -5195,17 +5124,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
for (i = 0; i < BPF_REG_SIZE; i++)
scrub_spilled_slot(&state->stack[spi].slot_type[i]);
- /* only mark the slot as written if all 8 bytes were written
- * otherwise read propagation may incorrectly stop too soon
- * when stack slots are partially written.
- * This heuristic means that read propagation will be
- * conservative, since it will add reg_live_read marks
- * to stack slots all the way to first state when programs
- * writes+reads less than 8 bytes
- */
- if (size == BPF_REG_SIZE)
- state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
-
/* when we zero initialize stack slots mark them as such */
if ((reg && register_is_null(reg)) ||
(!reg && is_bpf_st_mem(insn) && insn->imm == 0)) {
@@ -5398,7 +5316,6 @@ static void mark_reg_stack_read(struct bpf_verifier_env *env,
/* have read misc data from the stack */
mark_reg_unknown(env, state->regs, dst_regno);
}
- state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
}
/* Read the stack at 'off' and put the results into the register indicated by
@@ -5421,12 +5338,16 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
struct bpf_reg_state *reg;
u8 *stype, type;
int insn_flags = insn_stack_access_flags(reg_state->frameno, spi);
+ int err;
stype = reg_state->stack[spi].slot_type;
reg = &reg_state->stack[spi].spilled_ptr;
mark_stack_slot_scratched(env, spi);
check_fastcall_stack_contract(env, state, env->insn_idx, off);
+ err = bpf_mark_stack_read(env, reg_state->frameno, env->insn_idx, BIT(spi));
+ if (err)
+ return err;
if (is_spilled_reg(&reg_state->stack[spi])) {
u8 spill_size = 1;
@@ -5441,7 +5362,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
return -EACCES;
}
- mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
if (dst_regno < 0)
return 0;
@@ -5495,7 +5415,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
insn_flags = 0; /* not restoring original register state */
}
}
- state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
} else if (dst_regno >= 0) {
/* restore register state from stack */
copy_register_state(&state->regs[dst_regno], reg);
@@ -5503,7 +5422,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
* has its liveness marks cleared by is_state_visited()
* which resets stack/reg liveness for state transitions
*/
- state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
} else if (__is_pointer_value(env->allow_ptr_leaks, reg)) {
/* If dst_regno==-1, the caller is asking us whether
* it is acceptable to use this value as a SCALAR_VALUE
@@ -5515,7 +5433,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
off);
return -EACCES;
}
- mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
} else {
for (i = 0; i < size; i++) {
type = stype[(slot - i) % BPF_REG_SIZE];
@@ -5529,7 +5446,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
off, i, size);
return -EACCES;
}
- mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
if (dst_regno >= 0)
mark_reg_stack_read(env, reg_state, off, off + size, dst_regno);
insn_flags = 0; /* we are not restoring spilled register */
@@ -8157,10 +8073,10 @@ mark:
/* reading any byte out of 8-byte 'spill_slot' will cause
* the whole slot to be marked as 'read'
*/
- mark_reg_read(env, &state->stack[spi].spilled_ptr,
- state->stack[spi].spilled_ptr.parent,
- REG_LIVE_READ64);
- /* We do not set REG_LIVE_WRITTEN for stack slot, as we can not
+ err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi));
+ if (err)
+ return err;
+ /* We do not call bpf_mark_stack_write(), as we can not
 * be sure whether the stack slot is written to or not. Hence,
 * we must still conservatively propagate reads upwards even if
 * the helper may write to the entire memory range.
@@ -8515,38 +8431,70 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags)
return 0;
}
-static int process_timer_func(struct bpf_verifier_env *env, int regno,
- struct bpf_call_arg_meta *meta)
+/* Check if @regno is a pointer to a specific field in a map value */
+static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno,
+ enum btf_field_type field_type)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
bool is_const = tnum_is_const(reg->var_off);
struct bpf_map *map = reg->map_ptr;
u64 val = reg->var_off.value;
+ const char *struct_name = btf_field_type_name(field_type);
+ int field_off = -1;
if (!is_const) {
verbose(env,
- "R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n",
- regno);
+ "R%d doesn't have constant offset. %s has to be at the constant offset\n",
+ regno, struct_name);
return -EINVAL;
}
if (!map->btf) {
- verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n",
- map->name);
+ verbose(env, "map '%s' has to have BTF in order to use %s\n", map->name,
+ struct_name);
+ return -EINVAL;
+ }
+ if (!btf_record_has_field(map->record, field_type)) {
+ verbose(env, "map '%s' has no valid %s\n", map->name, struct_name);
return -EINVAL;
}
- if (!btf_record_has_field(map->record, BPF_TIMER)) {
- verbose(env, "map '%s' has no valid bpf_timer\n", map->name);
+ switch (field_type) {
+ case BPF_TIMER:
+ field_off = map->record->timer_off;
+ break;
+ case BPF_TASK_WORK:
+ field_off = map->record->task_work_off;
+ break;
+ default:
+ verifier_bug(env, "unsupported BTF field type: %s\n", struct_name);
return -EINVAL;
}
- if (map->record->timer_off != val + reg->off) {
- verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n",
- val + reg->off, map->record->timer_off);
+ if (field_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct %s' that is at %d\n",
+ val + reg->off, struct_name, field_off);
return -EINVAL;
}
+ return 0;
+}
+
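As an illustration of what the new check accepts (the map and struct below are hypothetical, not part of this patch), a BPF-side map value with a bpf_timer field looks like this; a register pointing at &val->t carries exactly the offset recorded in map->record->timer_off, which is what the field_off != val + reg->off comparison above verifies:

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

/* hypothetical layout; names are illustrative */
struct elem {
	long counter;		/* offset 0 */
	struct bpf_timer t;	/* record->timer_off == offsetof(struct elem, t) */
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 16);
	__type(key, int);
	__type(value, struct elem);
} timers SEC(".maps");

A pointer such as &val->counter (wrong offset), or one whose offset is not constant, is rejected with the messages above.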
+static int process_timer_func(struct bpf_verifier_env *env, int regno,
+ struct bpf_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_map *map = reg->map_ptr;
+ int err;
+
+ err = check_map_field_pointer(env, regno, BPF_TIMER);
+ if (err)
+ return err;
+
if (meta->map_ptr) {
verifier_bug(env, "Two map pointers in a timer helper");
return -EFAULT;
}
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n");
+ return -EOPNOTSUPP;
+ }
meta->map_uid = reg->map_uid;
meta->map_ptr = map;
return 0;
@@ -8569,6 +8517,26 @@ static int process_wq_func(struct bpf_verifier_env *env, int regno,
return 0;
}
+static int process_task_work_func(struct bpf_verifier_env *env, int regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_map *map = reg->map_ptr;
+ int err;
+
+ err = check_map_field_pointer(env, regno, BPF_TASK_WORK);
+ if (err)
+ return err;
+
+ if (meta->map.ptr) {
+ verifier_bug(env, "Two map pointers in a bpf_task_work helper");
+ return -EFAULT;
+ }
+ meta->map.uid = reg->map_uid;
+ meta->map.ptr = map;
+ return 0;
+}
+
static int process_kptr_func(struct bpf_verifier_env *env, int regno,
struct bpf_call_arg_meta *meta)
{
@@ -10398,6 +10366,8 @@ typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env,
struct bpf_func_state *callee,
int insn_idx);
+static bool is_task_work_add_kfunc(u32 func_id);
+
static int set_callee_state(struct bpf_verifier_env *env,
struct bpf_func_state *caller,
struct bpf_func_state *callee, int insn_idx);
@@ -10616,7 +10586,8 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins
env->subprog_info[subprog].is_async_cb = true;
async_cb = push_async_cb(env, env->subprog_info[subprog].start,
insn_idx, subprog,
- is_bpf_wq_set_callback_impl_kfunc(insn->imm));
+ is_bpf_wq_set_callback_impl_kfunc(insn->imm) ||
+ is_task_work_add_kfunc(insn->imm));
if (!async_cb)
return -EFAULT;
callee = async_cb->frame[0];
@@ -10717,6 +10688,8 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
/* and go analyze first insn of the callee */
*insn_idx = env->subprog_info[subprog].start - 1;
+ bpf_reset_live_stack_callchain(env);
+
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "caller:\n");
print_verifier_state(env, state, caller->frameno, true);
@@ -10842,7 +10815,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_async_callback_fn = true;
- callee->callback_ret_range = retval_range(0, 1);
+ callee->callback_ret_range = retval_range(0, 0);
return 0;
}
@@ -10929,6 +10902,36 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env,
return 0;
}
+static int set_task_work_schedule_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ struct bpf_map *map_ptr = caller->regs[BPF_REG_3].map_ptr;
+
+ /*
+ * callback_fn(struct bpf_map *map, void *key, void *value);
+ */
+ callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_1]);
+ callee->regs[BPF_REG_1].map_ptr = map_ptr;
+
+ callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+ callee->regs[BPF_REG_2].map_ptr = map_ptr;
+
+ callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_3]);
+ callee->regs[BPF_REG_3].map_ptr = map_ptr;
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ callee->in_async_callback_fn = true;
+ callee->callback_ret_range = retval_range(S32_MIN, S32_MAX);
+ return 0;
+}
+
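On the BPF program side, the callback set up by this state function would have the shape sketched below; the function name is made up and only the argument layout from the comment above (R1 = map, R2 = key, R3 = value) is assumed, since the scheduling kfunc's own prototype is not part of this hunk:

/* illustrative callback; any s32 return value is accepted per
 * callback_ret_range above (assumes vmlinux.h for struct bpf_map)
 */
static int task_work_cb(struct bpf_map *map, void *key, void *value)
{
	return 0;
}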
static bool is_rbtree_lock_required_kfunc(u32 btf_id);
/* Are we currently verifying the callback for a rbtree helper that must
@@ -10992,8 +10995,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
}
/* we are going to rely on register's precise value */
- err = mark_reg_read(env, r0, r0->parent, REG_LIVE_READ64);
- err = err ?: mark_chain_precision(env, BPF_REG_0);
+ err = mark_chain_precision(env, BPF_REG_0);
if (err)
return err;
@@ -11003,7 +11005,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
"At callback return", "R0");
return -EINVAL;
}
- if (!calls_callback(env, callee->callsite)) {
+ if (!bpf_calls_callback(env, callee->callsite)) {
verifier_bug(env, "in callback at %d, callsite %d !calls_callback",
*insn_idx, callee->callsite);
return -EFAULT;
@@ -11354,7 +11356,7 @@ static int get_helper_proto(struct bpf_verifier_env *env, int func_id,
return -EINVAL;
*ptr = env->ops->get_func_proto(func_id, env->prog);
- return *ptr ? 0 : -EINVAL;
+ return *ptr && (*ptr)->func ? 0 : -EINVAL;
}
static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
@@ -11641,7 +11643,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (dynptr_type == BPF_DYNPTR_TYPE_INVALID)
return -EFAULT;
- if (dynptr_type == BPF_DYNPTR_TYPE_SKB)
+ if (dynptr_type == BPF_DYNPTR_TYPE_SKB ||
+ dynptr_type == BPF_DYNPTR_TYPE_SKB_META)
/* this will trigger clear_all_pkt_pointers(), which will
* invalidate all dynptr slices associated with the skb
*/
@@ -11896,17 +11899,11 @@ static void __mark_btf_func_reg_size(struct bpf_verifier_env *env, struct bpf_re
if (regno == BPF_REG_0) {
/* Function return value */
- reg->live |= REG_LIVE_WRITTEN;
reg->subreg_def = reg_size == sizeof(u64) ?
DEF_NOT_SUBREG : env->insn_idx + 1;
- } else {
+ } else if (reg_size == sizeof(u64)) {
/* Function argument */
- if (reg_size == sizeof(u64)) {
- mark_insn_zext(env, reg);
- mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
- } else {
- mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32);
- }
+ mark_insn_zext(env, reg);
}
}
@@ -12059,6 +12056,7 @@ enum {
KF_ARG_RB_NODE_ID,
KF_ARG_WORKQUEUE_ID,
KF_ARG_RES_SPIN_LOCK_ID,
+ KF_ARG_TASK_WORK_ID,
};
BTF_ID_LIST(kf_arg_btf_ids)
@@ -12069,6 +12067,7 @@ BTF_ID(struct, bpf_rb_root)
BTF_ID(struct, bpf_rb_node)
BTF_ID(struct, bpf_wq)
BTF_ID(struct, bpf_res_spin_lock)
+BTF_ID(struct, bpf_task_work)
static bool __is_kfunc_ptr_arg_type(const struct btf *btf,
const struct btf_param *arg, int type)
@@ -12117,6 +12116,11 @@ static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg)
return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID);
}
+static bool is_kfunc_arg_task_work(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_TASK_WORK_ID);
+}
+
static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg)
{
return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID);
@@ -12204,6 +12208,7 @@ enum kfunc_ptr_arg_type {
KF_ARG_PTR_TO_WORKQUEUE,
KF_ARG_PTR_TO_IRQ_FLAG,
KF_ARG_PTR_TO_RES_SPIN_LOCK,
+ KF_ARG_PTR_TO_TASK_WORK,
};
enum special_kfunc_type {
@@ -12228,6 +12233,8 @@ enum special_kfunc_type {
KF_bpf_rbtree_right,
KF_bpf_dynptr_from_skb,
KF_bpf_dynptr_from_xdp,
+ KF_bpf_dynptr_from_skb_meta,
+ KF_bpf_xdp_pull_data,
KF_bpf_dynptr_slice,
KF_bpf_dynptr_slice_rdwr,
KF_bpf_dynptr_clone,
@@ -12252,6 +12259,8 @@ enum special_kfunc_type {
KF_bpf_res_spin_lock_irqsave,
KF_bpf_res_spin_unlock_irqrestore,
KF___bpf_trap,
+ KF_bpf_task_work_schedule_signal,
+ KF_bpf_task_work_schedule_resume,
};
BTF_ID_LIST(special_kfunc_list)
@@ -12277,9 +12286,13 @@ BTF_ID(func, bpf_rbtree_right)
#ifdef CONFIG_NET
BTF_ID(func, bpf_dynptr_from_skb)
BTF_ID(func, bpf_dynptr_from_xdp)
+BTF_ID(func, bpf_dynptr_from_skb_meta)
+BTF_ID(func, bpf_xdp_pull_data)
#else
BTF_ID_UNUSED
BTF_ID_UNUSED
+BTF_ID_UNUSED
+BTF_ID_UNUSED
#endif
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
@@ -12318,6 +12331,14 @@ BTF_ID(func, bpf_res_spin_unlock)
BTF_ID(func, bpf_res_spin_lock_irqsave)
BTF_ID(func, bpf_res_spin_unlock_irqrestore)
BTF_ID(func, __bpf_trap)
+BTF_ID(func, bpf_task_work_schedule_signal)
+BTF_ID(func, bpf_task_work_schedule_resume)
+
+static bool is_task_work_add_kfunc(u32 func_id)
+{
+ return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] ||
+ func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume];
+}
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -12349,6 +12370,11 @@ static bool is_kfunc_bpf_preempt_enable(struct bpf_kfunc_call_arg_meta *meta)
return meta->func_id == special_kfunc_list[KF_bpf_preempt_enable];
}
+static bool is_kfunc_pkt_changing(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_xdp_pull_data];
+}
+
static enum kfunc_ptr_arg_type
get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
struct bpf_kfunc_call_arg_meta *meta,
@@ -12408,6 +12434,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_wq(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_WORKQUEUE;
+ if (is_kfunc_arg_task_work(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_TASK_WORK;
+
if (is_kfunc_arg_irq_flag(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_IRQ_FLAG;
@@ -12751,7 +12780,8 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id)
static bool is_async_callback_calling_kfunc(u32 btf_id)
{
- return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl];
+ return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl] ||
+ is_task_work_add_kfunc(btf_id);
}
static bool is_bpf_throw_kfunc(struct bpf_insn *insn)
@@ -13132,7 +13162,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
verbose(env, "pointer in R%d isn't map pointer\n", regno);
return -EINVAL;
}
- if (meta->map.ptr && reg->map_ptr->record->wq_off >= 0) {
+ if (meta->map.ptr && (reg->map_ptr->record->wq_off >= 0 ||
+ reg->map_ptr->record->task_work_off >= 0)) {
/* Use map_uid (which is unique id of inner map) to reject:
* inner_map1 = bpf_map_lookup_elem(outer_map, key1)
* inner_map2 = bpf_map_lookup_elem(outer_map, key2)
@@ -13147,6 +13178,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
*/
if (meta->map.ptr != reg->map_ptr ||
meta->map.uid != reg->map_uid) {
+ if (reg->map_ptr->record->task_work_off >= 0) {
+ verbose(env,
+ "bpf_task_work pointer in R2 map_uid=%d doesn't match map pointer in R3 map_uid=%d\n",
+ meta->map.uid, reg->map_uid);
+ return -EINVAL;
+ }
verbose(env,
"workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
meta->map.uid, reg->map_uid);
@@ -13185,6 +13222,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
case KF_ARG_PTR_TO_CONST_STR:
case KF_ARG_PTR_TO_WORKQUEUE:
+ case KF_ARG_PTR_TO_TASK_WORK:
case KF_ARG_PTR_TO_IRQ_FLAG:
case KF_ARG_PTR_TO_RES_SPIN_LOCK:
break;
@@ -13253,6 +13291,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
dynptr_arg_type |= DYNPTR_TYPE_SKB;
} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) {
dynptr_arg_type |= DYNPTR_TYPE_XDP;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb_meta]) {
+ dynptr_arg_type |= DYNPTR_TYPE_SKB_META;
} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] &&
(dynptr_arg_type & MEM_UNINIT)) {
enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type;
@@ -13476,6 +13516,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (ret < 0)
return ret;
break;
+ case KF_ARG_PTR_TO_TASK_WORK:
+ if (reg->type != PTR_TO_MAP_VALUE) {
+ verbose(env, "arg#%d doesn't point to a map value\n", i);
+ return -EINVAL;
+ }
+ ret = process_task_work_func(env, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
case KF_ARG_PTR_TO_IRQ_FLAG:
if (reg->type != PTR_TO_STACK) {
verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i);
@@ -13842,6 +13891,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}
+ if (is_task_work_add_kfunc(meta.func_id)) {
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_task_work_schedule_callback_state);
+ if (err) {
+ verbose(env, "kfunc %s#%d failed callback verification\n",
+ func_name, meta.func_id);
+ return err;
+ }
+ }
+
rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
@@ -13901,6 +13960,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EACCES;
}
+ if (is_kfunc_rcu_protected(&meta) && !in_rcu_cs(env)) {
+ verbose(env, "kernel func %s requires RCU critical section protection\n", func_name);
+ return -EACCES;
+ }
+
/* In case of release function, we get register number of refcounted
* PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
*/
@@ -14014,6 +14078,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
/* Ensures we don't access the memory after a release_reference() */
if (meta.ref_obj_id)
regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
+
+ if (is_kfunc_rcu_protected(&meta))
+ regs[BPF_REG_0].type |= MEM_RCU;
} else {
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].btf = desc_btf;
@@ -14022,6 +14089,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache])
regs[BPF_REG_0].type |= PTR_UNTRUSTED;
+ else if (is_kfunc_rcu_protected(&meta))
+ regs[BPF_REG_0].type |= MEM_RCU;
if (is_iter_next_kfunc(&meta)) {
struct bpf_reg_state *cur_iter;
@@ -14066,6 +14135,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}
+ if (is_kfunc_pkt_changing(&meta))
+ clear_all_pkt_pointers(env);
+
nargs = btf_type_vlen(meta.func_proto);
args = (const struct btf_param *)(meta.func_proto + 1);
for (i = 0; i < nargs; i++) {
@@ -15645,7 +15717,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
*/
assign_scalar_id_before_mov(env, src_reg);
copy_register_state(dst_reg, src_reg);
- dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = DEF_NOT_SUBREG;
} else {
/* case: R1 = (s8, s16 s32)R2 */
@@ -15664,7 +15735,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (!no_sext)
dst_reg->id = 0;
coerce_reg_to_size_sx(dst_reg, insn->off >> 3);
- dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = DEF_NOT_SUBREG;
} else {
mark_reg_unknown(env, regs, insn->dst_reg);
@@ -15690,7 +15760,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
*/
if (!is_src_reg_u32)
dst_reg->id = 0;
- dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = env->insn_idx + 1;
} else {
/* case: W1 = (s8, s16)W2 */
@@ -15701,7 +15770,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
copy_register_state(dst_reg, src_reg);
if (!no_sext)
dst_reg->id = 0;
- dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = env->insn_idx + 1;
coerce_subreg_to_size_sx(dst_reg, insn->off >> 3);
}
@@ -15886,6 +15954,8 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta
*/
if (tnum_is_const(t1) && tnum_is_const(t2))
return t1.value == t2.value;
+ if (!tnum_overlap(t1, t2))
+ return 0;
/* non-overlapping ranges */
if (umin1 > umax2 || umax1 < umin2)
return 0;
@@ -15910,6 +15980,8 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta
*/
if (tnum_is_const(t1) && tnum_is_const(t2))
return t1.value != t2.value;
+ if (!tnum_overlap(t1, t2))
+ return 1;
/* non-overlapping ranges */
if (umin1 > umax2 || umax1 < umin2)
return 1;
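The tnum_overlap() checks added above let the JEQ/JNE analysis conclude early: two tnums can describe a common concrete value only if every bit that is known in both agrees. A minimal standalone sketch of that idea (not necessarily the exact helper added in tnum.c):

/* a tnum is (value, mask): mask bits are unknown, all other bits must
 * equal the corresponding bits of value
 */
struct tnum_sketch { unsigned long long value; unsigned long long mask; };

static int tnums_may_share_value(struct tnum_sketch a, struct tnum_sketch b)
{
	unsigned long long both_known = ~a.mask & ~b.mask;

	return (a.value & both_known) == (b.value & both_known);
}

When no common value exists, JEQ can never be taken (return 0) and JNE is always taken (return 1), matching the two early returns above.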
@@ -17117,9 +17189,8 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
}
if (frame->in_async_callback_fn) {
- /* enforce return zero from async callbacks like timer */
exit_ctx = "At async callback return";
- range = retval_range(0, 0);
+ range = frame->callback_ret_range;
goto enforce_retval;
}
@@ -17258,7 +17329,7 @@ static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off)
{
struct bpf_subprog_info *subprog;
- subprog = find_containing_subprog(env, off);
+ subprog = bpf_find_containing_subprog(env, off);
subprog->changes_pkt_data = true;
}
@@ -17266,7 +17337,7 @@ static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off)
{
struct bpf_subprog_info *subprog;
- subprog = find_containing_subprog(env, off);
+ subprog = bpf_find_containing_subprog(env, off);
subprog->might_sleep = true;
}
@@ -17280,8 +17351,8 @@ static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w)
{
struct bpf_subprog_info *caller, *callee;
- caller = find_containing_subprog(env, t);
- callee = find_containing_subprog(env, w);
+ caller = bpf_find_containing_subprog(env, t);
+ callee = bpf_find_containing_subprog(env, w);
caller->changes_pkt_data |= callee->changes_pkt_data;
caller->might_sleep |= callee->might_sleep;
}
@@ -17351,7 +17422,7 @@ static void mark_calls_callback(struct bpf_verifier_env *env, int idx)
env->insn_aux_data[idx].calls_callback = true;
}
-static bool calls_callback(struct bpf_verifier_env *env, int insn_idx)
+bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx)
{
return env->insn_aux_data[insn_idx].calls_callback;
}
@@ -17783,6 +17854,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
*/
if (ret == 0 && is_kfunc_sleepable(&meta))
mark_subprog_might_sleep(env, t);
+ if (ret == 0 && is_kfunc_pkt_changing(&meta))
+ mark_subprog_changes_pkt_data(env, t);
}
return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
@@ -17825,7 +17898,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
static int check_cfg(struct bpf_verifier_env *env)
{
int insn_cnt = env->prog->len;
- int *insn_stack, *insn_state, *insn_postorder;
+ int *insn_stack, *insn_state;
int ex_insn_beg, i, ret = 0;
insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
@@ -17838,14 +17911,6 @@ static int check_cfg(struct bpf_verifier_env *env)
return -ENOMEM;
}
- insn_postorder = env->cfg.insn_postorder =
- kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
- if (!insn_postorder) {
- kvfree(insn_state);
- kvfree(insn_stack);
- return -ENOMEM;
- }
-
ex_insn_beg = env->exception_callback_subprog
? env->subprog_info[env->exception_callback_subprog].start
: 0;
@@ -17863,7 +17928,6 @@ walk_cfg:
case DONE_EXPLORING:
insn_state[t] = EXPLORED;
env->cfg.cur_stack--;
- insn_postorder[env->cfg.cur_postorder++] = t;
break;
case KEEP_EXPLORING:
break;
@@ -17917,6 +17981,56 @@ err_free:
return ret;
}
+/*
+ * For each subprogram 'i', fill the env->cfg.insn_postorder sub-range
+ * [env->subprog_info[i].postorder_start, env->subprog_info[i+1].postorder_start)
+ * with the indices of subprogram 'i' instructions in postorder.
+ */
+static int compute_postorder(struct bpf_verifier_env *env)
+{
+ u32 cur_postorder, i, top, stack_sz, s, succ_cnt, succ[2];
+ int *stack = NULL, *postorder = NULL, *state = NULL;
+
+ postorder = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT);
+ state = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT);
+ stack = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT);
+ if (!postorder || !state || !stack) {
+ kvfree(postorder);
+ kvfree(state);
+ kvfree(stack);
+ return -ENOMEM;
+ }
+ cur_postorder = 0;
+ for (i = 0; i < env->subprog_cnt; i++) {
+ env->subprog_info[i].postorder_start = cur_postorder;
+ stack[0] = env->subprog_info[i].start;
+ stack_sz = 1;
+ do {
+ top = stack[stack_sz - 1];
+ state[top] |= DISCOVERED;
+ if (state[top] & EXPLORED) {
+ postorder[cur_postorder++] = top;
+ stack_sz--;
+ continue;
+ }
+ succ_cnt = bpf_insn_successors(env->prog, top, succ);
+ for (s = 0; s < succ_cnt; ++s) {
+ if (!state[succ[s]]) {
+ stack[stack_sz++] = succ[s];
+ state[succ[s]] |= DISCOVERED;
+ }
+ }
+ state[top] |= EXPLORED;
+ } while (stack_sz);
+ }
+ env->subprog_info[i].postorder_start = cur_postorder;
+ env->cfg.insn_postorder = postorder;
+ env->cfg.cur_postorder = cur_postorder;
+ kvfree(stack);
+ kvfree(state);
+ return 0;
+}
+
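A short sketch (not from this patch) of how a consumer would walk one subprogram's instructions using the ranges described in the comment above; it relies on the sentinel postorder_start written after the loop for index subprog_cnt, so idx + 1 is always valid:

/* illustrative only; assumes the verifier-internal types from bpf_verifier.h */
static void walk_subprog_postorder(struct bpf_verifier_env *env, u32 idx)
{
	u32 start = env->subprog_info[idx].postorder_start;
	u32 end = env->subprog_info[idx + 1].postorder_start;	/* sentinel entry */
	u32 p;

	for (p = start; p < end; p++) {
		int insn_idx = env->cfg.insn_postorder[p];

		/* ... visit insn_idx ... */
		(void)insn_idx;
	}
}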
static int check_abnormal_return(struct bpf_verifier_env *env)
{
int i;
@@ -18449,16 +18563,15 @@ static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
}
static void clean_func_state(struct bpf_verifier_env *env,
- struct bpf_func_state *st)
+ struct bpf_func_state *st,
+ u32 ip)
{
- enum bpf_reg_liveness live;
+ u16 live_regs = env->insn_aux_data[ip].live_regs_before;
int i, j;
for (i = 0; i < BPF_REG_FP; i++) {
- live = st->regs[i].live;
/* liveness must not touch this register anymore */
- st->regs[i].live |= REG_LIVE_DONE;
- if (!(live & REG_LIVE_READ))
+ if (!(live_regs & BIT(i)))
/* since the register is unused, clear its state
* to make further comparison simpler
*/
@@ -18466,10 +18579,7 @@ static void clean_func_state(struct bpf_verifier_env *env,
}
for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) {
- live = st->stack[i].spilled_ptr.live;
- /* liveness must not touch this stack slot anymore */
- st->stack[i].spilled_ptr.live |= REG_LIVE_DONE;
- if (!(live & REG_LIVE_READ)) {
+ if (!bpf_stack_slot_alive(env, st->frameno, i)) {
__mark_reg_not_init(env, &st->stack[i].spilled_ptr);
for (j = 0; j < BPF_REG_SIZE; j++)
st->stack[i].slot_type[j] = STACK_INVALID;
@@ -18480,10 +18590,14 @@ static void clean_func_state(struct bpf_verifier_env *env,
static void clean_verifier_state(struct bpf_verifier_env *env,
struct bpf_verifier_state *st)
{
- int i;
+ int i, ip;
- for (i = 0; i <= st->curframe; i++)
- clean_func_state(env, st->frame[i]);
+ bpf_live_stack_query_init(env, st);
+ st->cleaned = true;
+ for (i = 0; i <= st->curframe; i++) {
+ ip = frame_insn_idx(st, i);
+ clean_func_state(env, st->frame[i], ip);
+ }
}
/* the parentage chains form a tree.
@@ -18494,25 +18608,23 @@ static void clean_verifier_state(struct bpf_verifier_env *env,
* but a lot of states will get revised from liveness point of view when
* the verifier explores other branches.
* Example:
- * 1: r0 = 1
+ * 1: *(u64)(r10 - 8) = 1
* 2: if r1 == 100 goto pc+1
- * 3: r0 = 2
- * 4: exit
- * when the verifier reaches exit insn the register r0 in the state list of
- * insn 2 will be seen as !REG_LIVE_READ. Then the verifier pops the other_branch
- * of insn 2 and goes exploring further. At the insn 4 it will walk the
- * parentage chain from insn 4 into insn 2 and will mark r0 as REG_LIVE_READ.
+ * 3: *(u64)(r10 - 8) = 2
+ * 4: r0 = *(u64)(r10 - 8)
+ * 5: exit
+ * when the verifier reaches exit insn the stack slot -8 in the state list of
+ * insn 2 is not yet marked alive. Then the verifier pops the other_branch
+ * of insn 2 and goes exploring further. After the read at insn 4, liveness
+ * analysis would propagate a read mark for -8 up to insn 2.
*
* Since the verifier pushes the branch states as it sees them while exploring
* the program the condition of walking the branch instruction for the second
* time means that all states below this branch were already explored and
* their final liveness marks are already propagated.
* Hence when the verifier completes the search of state list in is_state_visited()
- * we can call this clean_live_states() function to mark all liveness states
- * as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state'
- * will not be used.
- * This function also clears the registers and stack for states that !READ
- * to simplify state merging.
+ * we can call this clean_live_states() function to clear the dead registers and stack
+ * slots to simplify state merging.
*
* Important note here that walking the same branch instruction in the callee
 * doesn't mean that the states are DONE. The verifier has to compare
@@ -18532,7 +18644,7 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn,
if (sl->state.insn_idx != insn ||
!same_callsites(&sl->state, cur))
continue;
- if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE)
+ if (sl->state.cleaned)
/* all regs in this state in all frames were already marked */
continue;
if (incomplete_read_marks(env, &sl->state))
@@ -18564,9 +18676,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
if (exact == EXACT)
return regs_exact(rold, rcur, idmap);
- if (!(rold->live & REG_LIVE_READ) && exact == NOT_EXACT)
- /* explored state didn't use this */
- return true;
if (rold->type == NOT_INIT) {
if (exact == NOT_EXACT || rcur->type == NOT_INIT)
/* explored state can't have used this */
@@ -18690,7 +18799,6 @@ static struct bpf_reg_state unbound_reg;
static __init int unbound_reg_init(void)
{
__mark_reg_unknown_imprecise(&unbound_reg);
- unbound_reg.live |= REG_LIVE_READ;
return 0;
}
late_initcall(unbound_reg_init);
@@ -18743,13 +18851,6 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
cur->stack[spi].slot_type[i % BPF_REG_SIZE]))
return false;
- if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)
- && exact == NOT_EXACT) {
- i += BPF_REG_SIZE - 1;
- /* explored state didn't use this */
- continue;
- }
-
if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
continue;
@@ -18992,91 +19093,6 @@ static bool states_equal(struct bpf_verifier_env *env,
return true;
}
-/* Return 0 if no propagation happened. Return negative error code if error
- * happened. Otherwise, return the propagated bit.
- */
-static int propagate_liveness_reg(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg,
- struct bpf_reg_state *parent_reg)
-{
- u8 parent_flag = parent_reg->live & REG_LIVE_READ;
- u8 flag = reg->live & REG_LIVE_READ;
- int err;
-
- /* When comes here, read flags of PARENT_REG or REG could be any of
- * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need
- * of propagation if PARENT_REG has strongest REG_LIVE_READ64.
- */
- if (parent_flag == REG_LIVE_READ64 ||
- /* Or if there is no read flag from REG. */
- !flag ||
- /* Or if the read flag from REG is the same as PARENT_REG. */
- parent_flag == flag)
- return 0;
-
- err = mark_reg_read(env, reg, parent_reg, flag);
- if (err)
- return err;
-
- return flag;
-}
-
-/* A write screens off any subsequent reads; but write marks come from the
- * straight-line code between a state and its parent. When we arrive at an
- * equivalent state (jump target or such) we didn't arrive by the straight-line
- * code, so read marks in the state must propagate to the parent regardless
- * of the state's write marks. That's what 'parent == state->parent' comparison
- * in mark_reg_read() is for.
- */
-static int propagate_liveness(struct bpf_verifier_env *env,
- const struct bpf_verifier_state *vstate,
- struct bpf_verifier_state *vparent,
- bool *changed)
-{
- struct bpf_reg_state *state_reg, *parent_reg;
- struct bpf_func_state *state, *parent;
- int i, frame, err = 0;
- bool tmp = false;
-
- changed = changed ?: &tmp;
- if (vparent->curframe != vstate->curframe) {
- WARN(1, "propagate_live: parent frame %d current frame %d\n",
- vparent->curframe, vstate->curframe);
- return -EFAULT;
- }
- /* Propagate read liveness of registers... */
- BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
- for (frame = 0; frame <= vstate->curframe; frame++) {
- parent = vparent->frame[frame];
- state = vstate->frame[frame];
- parent_reg = parent->regs;
- state_reg = state->regs;
- /* We don't need to worry about FP liveness, it's read-only */
- for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) {
- err = propagate_liveness_reg(env, &state_reg[i],
- &parent_reg[i]);
- if (err < 0)
- return err;
- *changed |= err > 0;
- if (err == REG_LIVE_READ64)
- mark_insn_zext(env, &parent_reg[i]);
- }
-
- /* Propagate stack slots. */
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
- i < parent->allocated_stack / BPF_REG_SIZE; i++) {
- parent_reg = &parent->stack[i].spilled_ptr;
- state_reg = &state->stack[i].spilled_ptr;
- err = propagate_liveness_reg(env, state_reg,
- parent_reg);
- *changed |= err > 0;
- if (err < 0)
- return err;
- }
- }
- return 0;
-}
-
/* find precise scalars in the previous equivalent state and
* propagate them into the current state
*/
@@ -19096,8 +19112,7 @@ static int propagate_precision(struct bpf_verifier_env *env,
first = true;
for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise ||
- !(state_reg->live & REG_LIVE_READ))
+ !state_reg->precise)
continue;
if (env->log.level & BPF_LOG_LEVEL2) {
if (first)
@@ -19114,8 +19129,7 @@ static int propagate_precision(struct bpf_verifier_env *env,
continue;
state_reg = &state->stack[i].spilled_ptr;
if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise ||
- !(state_reg->live & REG_LIVE_READ))
+ !state_reg->precise)
continue;
if (env->log.level & BPF_LOG_LEVEL2) {
if (first)
@@ -19165,9 +19179,6 @@ static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visi
changed = false;
for (backedge = visit->backedges; backedge; backedge = backedge->next) {
st = &backedge->state;
- err = propagate_liveness(env, st->equal_state, st, &changed);
- if (err)
- return err;
err = propagate_precision(env, st->equal_state, st, &changed);
if (err)
return err;
@@ -19191,7 +19202,7 @@ static bool states_maybe_looping(struct bpf_verifier_state *old,
fcur = cur->frame[fr];
for (i = 0; i < MAX_BPF_REG; i++)
if (memcmp(&fold->regs[i], &fcur->regs[i],
- offsetof(struct bpf_reg_state, parent)))
+ offsetof(struct bpf_reg_state, frameno)))
return false;
return true;
}
@@ -19289,7 +19300,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
struct bpf_verifier_state_list *sl;
struct bpf_verifier_state *cur = env->cur_state, *new;
bool force_new_state, add_new_state, loop;
- int i, j, n, err, states_cnt = 0;
+ int n, err, states_cnt = 0;
struct list_head *pos, *tmp, *head;
force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) ||
@@ -19404,7 +19415,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
goto hit;
}
}
- if (calls_callback(env, insn_idx)) {
+ if (bpf_calls_callback(env, insn_idx)) {
if (states_equal(env, &sl->state, cur, RANGE_WITHIN))
goto hit;
goto skip_inf_loop_check;
@@ -19447,25 +19458,15 @@ skip_inf_loop_check:
if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) {
hit:
sl->hit_cnt++;
- /* reached equivalent register/stack state,
- * prune the search.
- * Registers read by the continuation are read by us.
- * If we have any write marks in env->cur_state, they
- * will prevent corresponding reads in the continuation
- * from reaching our parent (an explored_state). Our
- * own state will get the read marks recorded, but
- * they'll be immediately forgotten as we're pruning
- * this state and will pop a new one.
- */
- err = propagate_liveness(env, &sl->state, cur, NULL);
/* if previous state reached the exit with precision and
* current state is equivalent to it (except precision marks)
* the precision needs to be propagated back in
* the current state.
*/
+ err = 0;
if (is_jmp_point(env, env->insn_idx))
- err = err ? : push_jmp_history(env, cur, 0, 0);
+ err = push_jmp_history(env, cur, 0, 0);
err = err ? : propagate_precision(env, &sl->state, cur, NULL);
if (err)
return err;
@@ -19553,7 +19554,7 @@ hit:
err = err ?: add_scc_backedge(env, &sl->state, backedge);
if (err) {
free_verifier_state(&backedge->state, false);
- kvfree(backedge);
+ kfree(backedge);
return err;
}
}
@@ -19636,7 +19637,7 @@ miss:
err = maybe_enter_scc(env, new);
if (err) {
free_verifier_state(new, false);
- kvfree(new_sl);
+ kfree(new_sl);
return err;
}
@@ -19645,38 +19646,6 @@ miss:
cur->dfs_depth = new->dfs_depth + 1;
clear_jmp_history(cur);
list_add(&new_sl->node, head);
-
- /* connect new state to parentage chain. Current frame needs all
- * registers connected. Only r6 - r9 of the callers are alive (pushed
- * to the stack implicitly by JITs) so in callers' frames connect just
- * r6 - r9 as an optimization. Callers will have r1 - r5 connected to
- * the state of the call instruction (with WRITTEN set), and r0 comes
- * from callee with its full parentage chain, anyway.
- */
- /* clear write marks in current state: the writes we did are not writes
- * our child did, so they don't screen off its reads from us.
- * (There are no read marks in current state, because reads always mark
- * their parent and current state never has children yet. Only
- * explored_states can get read marks.)
- */
- for (j = 0; j <= cur->curframe; j++) {
- for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++)
- cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i];
- for (i = 0; i < BPF_REG_FP; i++)
- cur->frame[j]->regs[i].live = REG_LIVE_NONE;
- }
-
- /* all stack frames are accessible from callee, clear them all */
- for (j = 0; j <= cur->curframe; j++) {
- struct bpf_func_state *frame = cur->frame[j];
- struct bpf_func_state *newframe = new->frame[j];
-
- for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) {
- frame->stack[i].spilled_ptr.live = REG_LIVE_NONE;
- frame->stack[i].spilled_ptr.parent =
- &newframe->stack[i].spilled_ptr;
- }
- }
return 0;
}
@@ -19812,6 +19781,9 @@ static int process_bpf_exit_full(struct bpf_verifier_env *env,
return PROCESS_BPF_EXIT;
if (env->cur_state->curframe) {
+ err = bpf_update_live_stack(env);
+ if (err)
+ return err;
/* exit from nested function */
err = prepare_func_exit(env, &env->insn_idx);
if (err)
@@ -19997,7 +19969,7 @@ static int do_check(struct bpf_verifier_env *env)
for (;;) {
struct bpf_insn *insn;
struct bpf_insn_aux_data *insn_aux;
- int err;
+ int err, marks_err;
/* reset current history entry on each new instruction */
env->cur_hist_ent = NULL;
@@ -20090,7 +20062,15 @@ static int do_check(struct bpf_verifier_env *env)
if (state->speculative && insn_aux->nospec)
goto process_bpf_exit;
+ err = bpf_reset_stack_write_marks(env, env->insn_idx);
+ if (err)
+ return err;
err = do_check_insn(env, &do_print_state);
+ if (err >= 0 || error_recoverable_with_nospec(err)) {
+ marks_err = bpf_commit_stack_write_marks(env);
+ if (marks_err)
+ return marks_err;
+ }
if (error_recoverable_with_nospec(err) && state->speculative) {
/* Prevent this speculative path from ever reaching the
* insn that would have been unsafe to execute.
@@ -20131,6 +20111,9 @@ process_bpf_exit:
err = update_branch_counts(env, env->cur_state);
if (err)
return err;
+ err = bpf_update_live_stack(env);
+ if (err)
+ return err;
err = pop_stack(env, &prev_insn_idx, &env->insn_idx,
pop_log);
if (err < 0) {
@@ -20193,8 +20176,11 @@ static int __add_used_btf(struct bpf_verifier_env *env, struct btf *btf)
if (env->used_btfs[i].btf == btf)
return i;
- if (env->used_btf_cnt >= MAX_USED_BTFS)
+ if (env->used_btf_cnt >= MAX_USED_BTFS) {
+ verbose(env, "The total number of btfs per program has reached the limit of %u\n",
+ MAX_USED_BTFS);
return -E2BIG;
+ }
btf_get(btf);
@@ -20360,6 +20346,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
{
enum bpf_prog_type prog_type = resolve_prog_type(prog);
+ if (map->excl_prog_sha &&
+ memcmp(map->excl_prog_sha, prog->digest, SHA256_DIGEST_SIZE)) {
+ verbose(env, "program's hash doesn't match map's excl_prog_hash\n");
+ return -EACCES;
+ }
+
if (btf_record_has_field(map->record, BPF_LIST_HEAD) ||
btf_record_has_field(map->record, BPF_RB_ROOT)) {
if (is_tracing_prog_type(prog_type)) {
@@ -20699,12 +20691,11 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
* [0, off) and [off, end) to new locations, so the patched range stays zero
*/
static void adjust_insn_aux_data(struct bpf_verifier_env *env,
- struct bpf_insn_aux_data *new_data,
struct bpf_prog *new_prog, u32 off, u32 cnt)
{
- struct bpf_insn_aux_data *old_data = env->insn_aux_data;
+ struct bpf_insn_aux_data *data = env->insn_aux_data;
struct bpf_insn *insn = new_prog->insnsi;
- u32 old_seen = old_data[off].seen;
+ u32 old_seen = data[off].seen;
u32 prog_len;
int i;
@@ -20712,22 +20703,20 @@ static void adjust_insn_aux_data(struct bpf_verifier_env *env,
* (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the
* original insn at old prog.
*/
- old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1);
+ data[off].zext_dst = insn_has_def32(insn + off + cnt - 1);
if (cnt == 1)
return;
prog_len = new_prog->len;
- memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
- memcpy(new_data + off + cnt - 1, old_data + off,
- sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
+ memmove(data + off + cnt - 1, data + off,
+ sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
+ memset(data + off, 0, sizeof(struct bpf_insn_aux_data) * (cnt - 1));
for (i = off; i < off + cnt - 1; i++) {
/* Expand insni[off]'s seen count to the patched range. */
- new_data[i].seen = old_seen;
- new_data[i].zext_dst = insn_has_def32(env, insn + i);
+ data[i].seen = old_seen;
+ data[i].zext_dst = insn_has_def32(insn + i);
}
- env->insn_aux_data = new_data;
- vfree(old_data);
}
static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len)
@@ -20765,10 +20754,14 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
struct bpf_insn_aux_data *new_data = NULL;
if (len > 1) {
- new_data = vzalloc(array_size(env->prog->len + len - 1,
- sizeof(struct bpf_insn_aux_data)));
+ new_data = vrealloc(env->insn_aux_data,
+ array_size(env->prog->len + len - 1,
+ sizeof(struct bpf_insn_aux_data)),
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!new_data)
return NULL;
+
+ env->insn_aux_data = new_data;
}
new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
@@ -20777,10 +20770,9 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
verbose(env,
"insn %d cannot be patched due to 16-bit range\n",
env->insn_aux_data[off].orig_idx);
- vfree(new_data);
return NULL;
}
- adjust_insn_aux_data(env, new_data, new_prog, off, len);
+ adjust_insn_aux_data(env, new_prog, off, len);
adjust_subprog_starts(env, off, len);
adjust_poke_descs(new_prog, off, len);
return new_prog;
@@ -21131,7 +21123,7 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
* BPF_STX + SRC_OP, so it is safe to pass NULL
* here.
*/
- if (is_reg64(env, &insn, load_reg, NULL, DST_OP)) {
+ if (is_reg64(&insn, load_reg, NULL, DST_OP)) {
if (class == BPF_LD &&
BPF_MODE(code) == BPF_IMM)
i++;
@@ -21400,10 +21392,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
continue;
case PTR_TO_ARENA:
if (BPF_MODE(insn->code) == BPF_MEMSX) {
- verbose(env, "sign extending loads from arena are not supported yet\n");
- return -EOPNOTSUPP;
+ if (!bpf_jit_supports_insn(insn, true)) {
+ verbose(env, "sign extending loads from arena are not supported yet\n");
+ return -EOPNOTSUPP;
+ }
+ insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32SX | BPF_SIZE(insn->code);
+ } else {
+ insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code);
}
- insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code);
env->prog->aux->num_exentries++;
continue;
default:
@@ -21578,6 +21574,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->func_info_cnt = prog->aux->func_info_cnt;
func[i]->aux->poke_tab = prog->aux->poke_tab;
func[i]->aux->size_poke_tab = prog->aux->size_poke_tab;
+ func[i]->aux->main_prog_aux = prog->aux;
for (j = 0; j < prog->aux->size_poke_tab; j++) {
struct bpf_jit_poke_descriptor *poke;
@@ -21608,6 +21605,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
if (BPF_CLASS(insn->code) == BPF_LDX &&
(BPF_MODE(insn->code) == BPF_PROBE_MEM ||
BPF_MODE(insn->code) == BPF_PROBE_MEM32 ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEM32SX ||
BPF_MODE(insn->code) == BPF_PROBE_MEMSX))
num_exentries++;
if ((BPF_CLASS(insn->code) == BPF_STX ||
@@ -23855,6 +23853,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
BTF_SET_START(btf_id_deny)
BTF_ID_UNUSED
#ifdef CONFIG_SMP
+BTF_ID(func, ___migrate_enable)
BTF_ID(func, migrate_disable)
BTF_ID(func, migrate_enable)
#endif
@@ -24084,67 +24083,6 @@ static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr,
return 0;
}
-static bool can_fallthrough(struct bpf_insn *insn)
-{
- u8 class = BPF_CLASS(insn->code);
- u8 opcode = BPF_OP(insn->code);
-
- if (class != BPF_JMP && class != BPF_JMP32)
- return true;
-
- if (opcode == BPF_EXIT || opcode == BPF_JA)
- return false;
-
- return true;
-}
-
-static bool can_jump(struct bpf_insn *insn)
-{
- u8 class = BPF_CLASS(insn->code);
- u8 opcode = BPF_OP(insn->code);
-
- if (class != BPF_JMP && class != BPF_JMP32)
- return false;
-
- switch (opcode) {
- case BPF_JA:
- case BPF_JEQ:
- case BPF_JNE:
- case BPF_JLT:
- case BPF_JLE:
- case BPF_JGT:
- case BPF_JGE:
- case BPF_JSGT:
- case BPF_JSGE:
- case BPF_JSLT:
- case BPF_JSLE:
- case BPF_JCOND:
- case BPF_JSET:
- return true;
- }
-
- return false;
-}
-
-static int insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2])
-{
- struct bpf_insn *insn = &prog->insnsi[idx];
- int i = 0, insn_sz;
- u32 dst;
-
- insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
- if (can_fallthrough(insn) && idx + 1 < prog->len)
- succ[i++] = idx + insn_sz;
-
- if (can_jump(insn)) {
- dst = idx + jmp_offset(insn) + 1;
- if (i == 0 || succ[0] != dst)
- succ[i++] = dst;
- }
-
- return i;
-}
-
/* Each field is a register bitmask */
struct insn_live_regs {
u16 use; /* registers read by instruction */
@@ -24342,7 +24280,7 @@ static int compute_live_registers(struct bpf_verifier_env *env)
u16 new_out = 0;
u16 new_in = 0;
- succ_num = insn_successors(env->prog, insn_idx, succ);
+ succ_num = bpf_insn_successors(env->prog, insn_idx, succ);
for (int s = 0; s < succ_num; ++s)
new_out |= state[succ[s]].in;
new_in = (new_out & ~live->def) | live->use;
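The two lines above are the textbook backward liveness step: out is the union of the successors' in sets, and in adds uses and removes definitions. A restatement of the per-instruction update, with one bit per register as in struct insn_live_regs (illustrative, not code from this patch):

/* in = use U (out \ def), as a u16 register bitmask */
static unsigned short live_in(unsigned short use, unsigned short def,
			      unsigned short out)
{
	return use | (out & ~def);
}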
@@ -24379,9 +24317,6 @@ static int compute_live_registers(struct bpf_verifier_env *env)
out:
kvfree(state);
- kvfree(env->cfg.insn_postorder);
- env->cfg.insn_postorder = NULL;
- env->cfg.cur_postorder = 0;
return err;
}
@@ -24511,7 +24446,7 @@ dfs_continue:
stack[stack_sz++] = w;
}
/* Visit 'w' successors */
- succ_cnt = insn_successors(env->prog, w, succ);
+ succ_cnt = bpf_insn_successors(env->prog, w, succ);
for (j = 0; j < succ_cnt; ++j) {
if (pre[succ[j]]) {
low[w] = min(low[w], low[succ[j]]);
@@ -24684,6 +24619,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (ret < 0)
goto skip_full_check;
+ ret = compute_postorder(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = bpf_stack_liveness_init(env);
+ if (ret)
+ goto skip_full_check;
+
ret = check_attach_btf_id(env);
if (ret)
goto skip_full_check;
@@ -24833,6 +24776,7 @@ err_unlock:
mutex_unlock(&bpf_verifier_lock);
vfree(env->insn_aux_data);
err_free_env:
+ bpf_stack_liveness_free(env);
kvfree(env->cfg.insn_postorder);
kvfree(env->scc_info);
kvfree(env);