From 1588c81b9f215170bd596984691eda2f05a84a1f Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 3 Dec 2025 15:37:43 -0800 Subject: bpf: Allow verifier to fixup kernel module kfuncs Allow verifier to fixup kfuncs in kernel module to support kfuncs with __prog arguments. Currently, special kfuncs and kfuncs with __prog arguments are kernel kfuncs. Allowing kernel module kfuncs should not affect existing kfunc fixup as kernel module kfuncs have BTF IDs greater than kernel kfuncs' BTF IDs. Signed-off-by: Amery Hung Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251203233748.668365-2-ameryhung@gmail.com --- kernel/bpf/verifier.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f0ca69f888fa..bb7eca1025c3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -22493,8 +22493,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (!bpf_jit_supports_far_kfunc_call()) insn->imm = BPF_CALL_IMM(desc->addr); - if (insn->off) - return 0; + if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] || desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; -- cgit v1.2.3 From b5709f6d26d65f6bb9711f4b5f98469fd507cb5b Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 3 Dec 2025 15:37:44 -0800 Subject: bpf: Support associating BPF program with struct_ops Add a new BPF command BPF_PROG_ASSOC_STRUCT_OPS to allow associating a BPF program with a struct_ops map. This command takes a file descriptor of a struct_ops map and a BPF program and set prog->aux->st_ops_assoc to the kdata of the struct_ops map. The command does not accept a struct_ops program nor a non-struct_ops map. Programs of a struct_ops map is automatically associated with the map during map update. If a program is shared between two struct_ops maps, prog->aux->st_ops_assoc will be poisoned to indicate that the associated struct_ops is ambiguous. The pointer, once poisoned, cannot be reset since we have lost track of associated struct_ops. For other program types, the associated struct_ops map, once set, cannot be changed later. This restriction may be lifted in the future if there is a use case. A kernel helper bpf_prog_get_assoc_struct_ops() can be used to retrieve the associated struct_ops pointer. The returned pointer, if not NULL, is guaranteed to be valid and point to a fully updated struct_ops struct. For struct_ops program reused in multiple struct_ops map, the return will be NULL. prog->aux->st_ops_assoc is protected by bumping the refcount for non-struct_ops programs and RCU for struct_ops programs. Since it would be inefficient to track programs associated with a struct_ops map, every non-struct_ops program will bump the refcount of the map to make sure st_ops_assoc stays valid. For a struct_ops program, it is protected by RCU as map_free will wait for an RCU grace period before disassociating the program with the map. The helper must be called in BPF program context or RCU read-side critical section. struct_ops implementers should note that the struct_ops returned may not be initialized nor attached yet. The struct_ops implementer will be responsible for tracking and checking the state of the associated struct_ops map if the use case expects an initialized or attached struct_ops. 
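As a rough illustration (not part of this patch), a kernel struct_ops implementer could consume the association from a kfunc along the lines of the sketch below. The struct_ops type "my_ops", its "ready" flag and the kfunc name are hypothetical, and the __prog-suffixed argument is assumed to be rewritten by the verifier to the calling program's bpf_prog_aux, as relied on by the previous patch:

__bpf_kfunc int bpf_my_ops_do_something(struct bpf_prog_aux *aux__prog)
{
	struct my_ops *ops;

	/* Called from BPF program context, so the RCU read side is held
	 * and a non-NULL return stays valid for the duration of this call.
	 */
	ops = bpf_prog_get_assoc_struct_ops(aux__prog);
	if (!ops)
		return -ENOENT; /* no association, or program shared between maps */

	/* The associated struct_ops may not be initialized or attached yet;
	 * the implementer tracks that state itself (hypothetical "ready" flag).
	 */
	if (!READ_ONCE(ops->ready))
		return -EAGAIN;

	return 0;
}

From user space, a non-struct_ops program would typically be associated once after load by issuing the new command, i.e. bpf(BPF_PROG_ASSOC_STRUCT_OPS, &attr, sizeof(attr)) with prog_assoc_struct_ops.map_fd and .prog_fd filled in and .flags left zero.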
Signed-off-by: Amery Hung Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20251203233748.668365-3-ameryhung@gmail.com --- include/linux/bpf.h | 16 ++++++++ include/uapi/linux/bpf.h | 17 ++++++++ kernel/bpf/bpf_struct_ops.c | 88 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/core.c | 3 ++ kernel/bpf/syscall.c | 46 ++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 17 ++++++++ 6 files changed, 187 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6498be4c44f8..28d8d6b7bb1e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1739,6 +1739,8 @@ struct bpf_prog_aux { struct rcu_head rcu; }; struct bpf_stream stream[2]; + struct mutex st_ops_assoc_mutex; + struct bpf_map __rcu *st_ops_assoc; }; struct bpf_prog { @@ -2041,6 +2043,9 @@ static inline void bpf_module_put(const void *data, struct module *owner) module_put(owner); } int bpf_struct_ops_link_create(union bpf_attr *attr); +int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map); +void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog); +void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux); u32 bpf_struct_ops_id(const void *kdata); #ifdef CONFIG_NET @@ -2088,6 +2093,17 @@ static inline int bpf_struct_ops_link_create(union bpf_attr *attr) { return -EOPNOTSUPP; } +static inline int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map) +{ + return -EOPNOTSUPP; +} +static inline void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog) +{ +} +static inline void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux) +{ + return NULL; +} static inline void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map) { } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f8d8513eda27..84ced3ed2d21 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -918,6 +918,16 @@ union bpf_iter_link_info { * Number of bytes read from the stream on success, or -1 if an * error occurred (in which case, *errno* is set appropriately). * + * BPF_PROG_ASSOC_STRUCT_OPS + * Description + * Associate a BPF program with a struct_ops map. The struct_ops + * map is identified by *map_fd* and the BPF program is + * identified by *prog_fd*. + * + * Return + * 0 on success or -1 if an error occurred (in which case, + * *errno* is set appropriately). + * * NOTES * eBPF objects (maps and programs) can be shared between processes. 
* @@ -974,6 +984,7 @@ enum bpf_cmd { BPF_PROG_BIND_MAP, BPF_TOKEN_CREATE, BPF_PROG_STREAM_READ_BY_FD, + BPF_PROG_ASSOC_STRUCT_OPS, __MAX_BPF_CMD, }; @@ -1894,6 +1905,12 @@ union bpf_attr { __u32 prog_fd; } prog_stream_read; + struct { + __u32 map_fd; + __u32 prog_fd; + __u32 flags; + } prog_assoc_struct_ops; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 278490683d28..c43346cb3d76 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -533,6 +533,17 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) } } +static void bpf_struct_ops_map_dissoc_progs(struct bpf_struct_ops_map *st_map) +{ + u32 i; + + for (i = 0; i < st_map->funcs_cnt; i++) { + if (!st_map->links[i]) + break; + bpf_prog_disassoc_struct_ops(st_map->links[i]->prog); + } +} + static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map) { int i; @@ -801,6 +812,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto reset_unlock; } + /* Poison pointer on error instead of return for backward compatibility */ + bpf_prog_assoc_struct_ops(prog, &st_map->map); + link = kzalloc(sizeof(*link), GFP_USER); if (!link) { bpf_prog_put(prog); @@ -980,6 +994,8 @@ static void bpf_struct_ops_map_free(struct bpf_map *map) if (btf_is_module(st_map->btf)) module_put(st_map->st_ops_desc->st_ops->owner); + bpf_struct_ops_map_dissoc_progs(st_map); + bpf_struct_ops_map_del_ksyms(st_map); /* The struct_ops's function may switch to another struct_ops. @@ -1396,6 +1412,78 @@ err_out: return err; } +int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map) +{ + struct bpf_map *st_ops_assoc; + + guard(mutex)(&prog->aux->st_ops_assoc_mutex); + + st_ops_assoc = rcu_dereference_protected(prog->aux->st_ops_assoc, + lockdep_is_held(&prog->aux->st_ops_assoc_mutex)); + if (st_ops_assoc && st_ops_assoc == map) + return 0; + + if (st_ops_assoc) { + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) + return -EBUSY; + + rcu_assign_pointer(prog->aux->st_ops_assoc, BPF_PTR_POISON); + } else { + /* + * struct_ops map does not track associated non-struct_ops programs. + * Bump the refcount to make sure st_ops_assoc is always valid. + */ + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) + bpf_map_inc(map); + + rcu_assign_pointer(prog->aux->st_ops_assoc, map); + } + + return 0; +} + +void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog) +{ + struct bpf_map *st_ops_assoc; + + guard(mutex)(&prog->aux->st_ops_assoc_mutex); + + st_ops_assoc = rcu_dereference_protected(prog->aux->st_ops_assoc, + lockdep_is_held(&prog->aux->st_ops_assoc_mutex)); + if (!st_ops_assoc || st_ops_assoc == BPF_PTR_POISON) + return; + + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) + bpf_map_put(st_ops_assoc); + + RCU_INIT_POINTER(prog->aux->st_ops_assoc, NULL); +} + +/* + * Get a reference to the struct_ops struct (i.e., kdata) associated with a + * program. Should only be called in BPF program context (e.g., in a kfunc). + * + * If the returned pointer is not NULL, it must points to a valid struct_ops. + * The struct_ops map is not guaranteed to be initialized nor attached. + * Kernel struct_ops implementers are responsible for tracking and checking + * the state of the struct_ops if the use case requires an initialized or + * attached struct_ops. 
+ */ +void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux) +{ + struct bpf_struct_ops_map *st_map; + struct bpf_map *st_ops_assoc; + + st_ops_assoc = rcu_dereference_check(aux->st_ops_assoc, bpf_rcu_lock_held()); + if (!st_ops_assoc || st_ops_assoc == BPF_PTR_POISON) + return NULL; + + st_map = (struct bpf_struct_ops_map *)st_ops_assoc; + + return &st_map->kvalue.data; +} +EXPORT_SYMBOL_GPL(bpf_prog_get_assoc_struct_ops); + void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c8ae6ab31651..67226145a4db 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -136,6 +136,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag mutex_init(&fp->aux->used_maps_mutex); mutex_init(&fp->aux->ext_mutex); mutex_init(&fp->aux->dst_mutex); + mutex_init(&fp->aux->st_ops_assoc_mutex); #ifdef CONFIG_BPF_SYSCALL bpf_prog_stream_init(fp); @@ -286,6 +287,7 @@ void __bpf_prog_free(struct bpf_prog *fp) if (fp->aux) { mutex_destroy(&fp->aux->used_maps_mutex); mutex_destroy(&fp->aux->dst_mutex); + mutex_destroy(&fp->aux->st_ops_assoc_mutex); kfree(fp->aux->poke_tab); kfree(fp->aux); } @@ -2896,6 +2898,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) #endif bpf_free_used_maps(aux); bpf_free_used_btfs(aux); + bpf_prog_disassoc_struct_ops(aux->prog); if (bpf_prog_is_dev_bound(aux)) bpf_prog_dev_bound_destroy(aux->prog); #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6589acc89ef8..3080cc48bfc3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -6122,6 +6122,49 @@ static int prog_stream_read(union bpf_attr *attr) return ret; } +#define BPF_PROG_ASSOC_STRUCT_OPS_LAST_FIELD prog_assoc_struct_ops.prog_fd + +static int prog_assoc_struct_ops(union bpf_attr *attr) +{ + struct bpf_prog *prog; + struct bpf_map *map; + int ret; + + if (CHECK_ATTR(BPF_PROG_ASSOC_STRUCT_OPS)) + return -EINVAL; + + if (attr->prog_assoc_struct_ops.flags) + return -EINVAL; + + prog = bpf_prog_get(attr->prog_assoc_struct_ops.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) { + ret = -EINVAL; + goto put_prog; + } + + map = bpf_map_get(attr->prog_assoc_struct_ops.map_fd); + if (IS_ERR(map)) { + ret = PTR_ERR(map); + goto put_prog; + } + + if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) { + ret = -EINVAL; + goto put_map; + } + + ret = bpf_prog_assoc_struct_ops(prog, map); + +put_map: + bpf_map_put(map); +put_prog: + bpf_prog_put(prog); + return ret; +} + static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; @@ -6261,6 +6304,9 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) case BPF_PROG_STREAM_READ_BY_FD: err = prog_stream_read(&attr); break; + case BPF_PROG_ASSOC_STRUCT_OPS: + err = prog_assoc_struct_ops(&attr); + break; default: err = -EINVAL; break; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index be7d8e060e10..6b92b0847ec2 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -918,6 +918,16 @@ union bpf_iter_link_info { * Number of bytes read from the stream on success, or -1 if an * error occurred (in which case, *errno* is set appropriately). * + * BPF_PROG_ASSOC_STRUCT_OPS + * Description + * Associate a BPF program with a struct_ops map. 
The struct_ops + * map is identified by *map_fd* and the BPF program is + * identified by *prog_fd*. + * + * Return + * 0 on success or -1 if an error occurred (in which case, + * *errno* is set appropriately). + * * NOTES * eBPF objects (maps and programs) can be shared between processes. * @@ -974,6 +984,7 @@ enum bpf_cmd { BPF_PROG_BIND_MAP, BPF_TOKEN_CREATE, BPF_PROG_STREAM_READ_BY_FD, + BPF_PROG_ASSOC_STRUCT_OPS, __MAX_BPF_CMD, }; @@ -1894,6 +1905,12 @@ union bpf_attr { __u32 prog_fd; } prog_stream_read; + struct { + __u32 map_fd; + __u32 prog_fd; + __u32 flags; + } prog_assoc_struct_ops; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF -- cgit v1.2.3 From 48e11bad9a1fd803a22d25c9a7d3aced1ba87817 Mon Sep 17 00:00:00 2001 From: Kohei Enju Date: Mon, 8 Dec 2025 22:14:31 +0900 Subject: bpf: cpumap: propagate underlying error in cpu_map_update_elem() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit 9216477449f3 ("bpf: cpumap: Add the possibility to attach an eBPF program to cpumap"), __cpu_map_entry_alloc() may fail with errors other than -ENOMEM, such as -EBADF or -EINVAL. However, __cpu_map_entry_alloc() returns NULL on all failures, and cpu_map_update_elem() unconditionally converts this NULL into -ENOMEM. As a result, user space always receives -ENOMEM regardless of the actual underlying error. Examples of unexpected behavior: - Nonexistent fd : -ENOMEM (should be -EBADF) - Non-BPF fd : -ENOMEM (should be -EINVAL) - Bad attach type : -ENOMEM (should be -EINVAL) Change __cpu_map_entry_alloc() to return ERR_PTR(err) instead of NULL and have cpu_map_update_elem() propagate this error. Signed-off-by: Kohei Enju Reviewed-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20251208131449.73036-2-enjuk@amazon.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumap.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 703e5df1f4ef..04171fbc39cb 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -430,7 +430,7 @@ static struct bpf_cpu_map_entry * __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, u32 cpu) { - int numa, err, i, fd = value->bpf_prog.fd; + int numa, err = -ENOMEM, i, fd = value->bpf_prog.fd; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; struct xdp_bulk_queue *bq; @@ -440,7 +440,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, rcpu = bpf_map_kmalloc_node(map, sizeof(*rcpu), gfp | __GFP_ZERO, numa); if (!rcpu) - return NULL; + return ERR_PTR(err); /* Alloc percpu bulkq */ rcpu->bulkq = bpf_map_alloc_percpu(map, sizeof(*rcpu->bulkq), @@ -468,16 +468,21 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, rcpu->value.qsize = value->qsize; gro_init(&rcpu->gro); - if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd)) - goto free_ptr_ring; + if (fd > 0) { + err = __cpu_map_load_bpf_program(rcpu, map, fd); + if (err) + goto free_ptr_ring; + } /* Setup kthread */ init_completion(&rcpu->kthread_running); rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, "cpumap/%d/map:%d", cpu, map->id); - if (IS_ERR(rcpu->kthread)) + if (IS_ERR(rcpu->kthread)) { + err = PTR_ERR(rcpu->kthread); goto free_prog; + } /* Make sure kthread runs on a single CPU */ kthread_bind(rcpu->kthread, cpu); @@ -503,7 +508,7 @@ free_bulkq: free_percpu(rcpu->bulkq); 
free_rcu: kfree(rcpu); - return NULL; + return ERR_PTR(err); } static void __cpu_map_entry_free(struct work_struct *work) @@ -596,8 +601,8 @@ static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value, } else { /* Updating qsize cause re-allocation of bpf_cpu_map_entry */ rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu); - if (!rcpu) - return -ENOMEM; + if (IS_ERR(rcpu)) + return PTR_ERR(rcpu); } rcu_read_lock(); __cpu_map_entry_replace(cmap, key_cpu, rcpu); -- cgit v1.2.3 From d18dec4b8990048ce75f0ece32bb96b3fbd3f422 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Tue, 2 Dec 2025 18:02:19 +0000 Subject: bpf: verifier improvement in 32bit shift sign extension pattern This patch improves the verifier to correctly compute bounds for sign extension compiler pattern composed of left shift by 32bits followed by a sign right shift by 32bits. Pattern in the verifier was limitted to positive value bounds and would reset bound computation for negative values. New code allows both positive and negative values for sign extension without compromising bound computation and verifier to pass. This change is required by GCC which generate such pattern, and was detected in the context of systemd, as described in the following GCC bugzilla: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119731 Three new tests were added in verifier_subreg.c. Signed-off-by: Cupertino Miranda Signed-off-by: Andrew Pinski Acked-by: Eduard Zingerman Cc: David Faust Cc: Jose Marchesi Cc: Elena Zannoni Link: https://lore.kernel.org/r/20251202180220.11128-2-cupertino.miranda@oracle.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bb7eca1025c3..a31c032b2dd6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15300,21 +15300,17 @@ static void __scalar64_min_max_lsh(struct bpf_reg_state *dst_reg, u64 umin_val, u64 umax_val) { /* Special case <<32 because it is a common compiler pattern to sign - * extend subreg by doing <<32 s>>32. In this case if 32bit bounds are - * positive we know this shift will also be positive so we can track - * bounds correctly. Otherwise we lose all sign bit information except - * what we can pick up from var_off. Perhaps we can generalize this - * later to shifts of any length. + * extend subreg by doing <<32 s>>32. smin/smax assignments are correct + * because s32 bounds don't flip sign when shifting to the left by + * 32bits. */ - if (umin_val == 32 && umax_val == 32 && dst_reg->s32_max_value >= 0) + if (umin_val == 32 && umax_val == 32) { dst_reg->smax_value = (s64)dst_reg->s32_max_value << 32; - else - dst_reg->smax_value = S64_MAX; - - if (umin_val == 32 && umax_val == 32 && dst_reg->s32_min_value >= 0) dst_reg->smin_value = (s64)dst_reg->s32_min_value << 32; - else + } else { + dst_reg->smax_value = S64_MAX; dst_reg->smin_value = S64_MIN; + } /* If we might shift our top bit out, then we know nothing */ if (dst_reg->umax_value > 1ULL << (63 - umax_val)) { -- cgit v1.2.3 From 6f0b824a61f212e9707ff68abcabfdfa4724b811 Mon Sep 17 00:00:00 2001 From: "T.J. 
Mercier" Date: Sun, 7 Dec 2025 01:10:04 -0800 Subject: bpf: Fix bpf_seq_read docs for increased buffer size Commit af65320948b8 ("bpf: Bump iter seq size to support BTF representation of large data structures") increased the fixed buffer size from PAGE_SIZE to PAGE_SIZE << 3, but the docs for the function didn't get updated at the same time. Update them. Signed-off-by: T.J. Mercier Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20251207091005.2829703-1-tjmercier@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index eec60b57bd3d..4b58d56ecab1 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -86,7 +86,7 @@ static bool bpf_iter_support_resched(struct seq_file *seq) /* bpf_seq_read, a customized and simpler version for bpf iterator. * The following are differences from seq_read(): - * . fixed buffer size (PAGE_SIZE) + * . fixed buffer size (PAGE_SIZE << 3) * . assuming NULL ->llseek() * . stop() may call bpf program, handling potential overflow there */ -- cgit v1.2.3 From 12a1fe6e12dbad39f2f0dad1a385625f0298eff4 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 16 Dec 2025 12:33:22 -0500 Subject: bpf/verifier: Do not limit maximum direct offset into arena map The verifier currently limits direct offsets into a map to 512MiB to avoid overflow during pointer arithmetic. However, this prevents arena maps from using direct addressing instructions to access data at the end of > 512MiB arena maps. This is necessary when moving arena globals to the end of the arena instead of the front. Refactor the verifier code to remove the offset calculation during direct value access calculations. This is possible because the only two map types that implement .map_direct_value_addr() are arrays and arenas, and they both do their own internal checks to ensure the offset is within bounds. Adjust selftests that expect the old error. These tests still fail because the verifier identifies the access as out of bounds for the map, so change them to expect an "invalid access to map value pointer" error instead. 
Signed-off-by: Emil Tsalapatis Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251216173325.98465-3-emil@etsalapatis.com --- kernel/bpf/verifier.c | 5 ----- tools/testing/selftests/bpf/verifier/direct_value_access.c | 4 ++-- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a31c032b2dd6..d6b8a77fbe3b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21132,11 +21132,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) } else { u32 off = insn[1].imm; - if (off >= BPF_MAX_VAR_OFF) { - verbose(env, "direct value offset of %u is not allowed\n", off); - return -EINVAL; - } - if (!map->ops->map_direct_value_addr) { verbose(env, "no direct value access support for this map type\n"); return -EINVAL; diff --git a/tools/testing/selftests/bpf/verifier/direct_value_access.c b/tools/testing/selftests/bpf/verifier/direct_value_access.c index c0648dc009b5..e569d119fb60 100644 --- a/tools/testing/selftests/bpf/verifier/direct_value_access.c +++ b/tools/testing/selftests/bpf/verifier/direct_value_access.c @@ -81,7 +81,7 @@ }, .fixup_map_array_48b = { 1 }, .result = REJECT, - .errstr = "direct value offset of 4294967295 is not allowed", + .errstr = "invalid access to map value pointer, value_size=48 off=4294967295", }, { "direct map access, write test 8", @@ -141,7 +141,7 @@ }, .fixup_map_array_48b = { 1 }, .result = REJECT, - .errstr = "direct value offset of 536870912 is not allowed", + .errstr = "invalid access to map value pointer, value_size=48 off=536870912", }, { "direct map access, write test 13", -- cgit v1.2.3 From 93f0d09697613beba922a387d21a09a41eeefef5 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 19 Dec 2025 10:44:17 -0800 Subject: bpf: move recursion detection logic to helpers BPF programs detect recursion by doing atomic inc/dec on a per-cpu active counter from the trampoline. Create two helpers for operations on this active counter, this makes it easy to changes the recursion detection logic in future. This commit makes no functional changes. Acked-by: Yonghong Song Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251219184422.2899902-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 10 ++++++++++ kernel/bpf/trampoline.c | 8 ++++---- kernel/trace/bpf_trace.c | 4 ++-- 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bb3847caeae1..2da986136d26 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2004,6 +2004,16 @@ struct bpf_struct_ops_common_value { enum bpf_struct_ops_state state; }; +static inline bool bpf_prog_get_recursion_context(struct bpf_prog *prog) +{ + return this_cpu_inc_return(*(prog->active)) == 1; +} + +static inline void bpf_prog_put_recursion_context(struct bpf_prog *prog) +{ + this_cpu_dec(*(prog->active)); +} + #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) /* This macro helps developer to register a struct_ops type and generate * type information correctly. 
Developers should use this macro to register diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 976d89011b15..2a125d063e62 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -949,7 +949,7 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); - if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { + if (unlikely(!bpf_prog_get_recursion_context(prog))) { bpf_prog_inc_misses_counter(prog); if (prog->aux->recursion_detected) prog->aux->recursion_detected(prog); @@ -993,7 +993,7 @@ static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start, bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); - this_cpu_dec(*(prog->active)); + bpf_prog_put_recursion_context(prog); rcu_read_unlock_migrate(); } @@ -1029,7 +1029,7 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); - if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { + if (unlikely(!bpf_prog_get_recursion_context(prog))) { bpf_prog_inc_misses_counter(prog); if (prog->aux->recursion_detected) prog->aux->recursion_detected(prog); @@ -1044,7 +1044,7 @@ void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); - this_cpu_dec(*(prog->active)); + bpf_prog_put_recursion_context(prog); migrate_enable(); rcu_read_unlock_trace(); } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index fe28d86f7c35..6e076485bf70 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2063,7 +2063,7 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) struct bpf_trace_run_ctx run_ctx; cant_sleep(); - if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { + if (unlikely(!bpf_prog_get_recursion_context(prog))) { bpf_prog_inc_misses_counter(prog); goto out; } @@ -2077,7 +2077,7 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) bpf_reset_run_ctx(old_run_ctx); out: - this_cpu_dec(*(prog->active)); + bpf_prog_put_recursion_context(prog); } #define UNPACK(...) __VA_ARGS__ -- cgit v1.2.3 From c3e34f88f9992866a1fb510850921a8fe299a97b Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 19 Dec 2025 10:44:18 -0800 Subject: bpf: arm64: Optimize recursion detection by not using atomics BPF programs detect recursion using a per-CPU 'active' flag in struct bpf_prog. The trampoline currently sets/clears this flag with atomic operations. On some arm64 platforms (e.g., Neoverse V2 with LSE), per-CPU atomic operations are relatively slow. Unlike x86_64 - where per-CPU updates can avoid cross-core atomicity, arm64 LSE atomics are always atomic across all cores, which is unnecessary overhead for strictly per-CPU state. This patch removes atomics from the recursion detection path on arm64 by changing 'active' to a per-CPU array of four u8 counters, one per context: {NMI, hard-irq, soft-irq, normal}. The running context uses a non-atomic increment/decrement on its element. After increment, recursion is detected by reading the array as a u32 and verifying that only the expected element changed; any change in another element indicates inter-context recursion, and a value > 1 in the same element indicates same-context recursion. For example, starting from {0,0,0,0}, a normal-context trigger changes the array to {0,0,0,1}. 
If an NMI arrives on the same CPU and triggers the program, the array becomes {1,0,0,1}. When the NMI context checks the u32 against the expected mask for normal (0x00000001), it observes 0x01000001 and correctly reports recursion. Same-context recursion is detected analogously. Acked-by: Yonghong Song Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251219184422.2899902-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 32 +++++++++++++++++++++++++++++--- kernel/bpf/core.c | 3 ++- 2 files changed, 31 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2da986136d26..da6a00dd313f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1746,6 +1746,8 @@ struct bpf_prog_aux { struct bpf_map __rcu *st_ops_assoc; }; +#define BPF_NR_CONTEXTS 4 /* normal, softirq, hardirq, NMI */ + struct bpf_prog { u16 pages; /* Number of allocated pages */ u16 jited:1, /* Is our filter JIT'ed? */ @@ -1772,7 +1774,7 @@ struct bpf_prog { u8 tag[BPF_TAG_SIZE]; }; struct bpf_prog_stats __percpu *stats; - int __percpu *active; + u8 __percpu *active; /* u8[BPF_NR_CONTEXTS] for recursion protection */ unsigned int (*bpf_func)(const void *ctx, const struct bpf_insn *insn); struct bpf_prog_aux *aux; /* Auxiliary fields */ @@ -2006,12 +2008,36 @@ struct bpf_struct_ops_common_value { static inline bool bpf_prog_get_recursion_context(struct bpf_prog *prog) { - return this_cpu_inc_return(*(prog->active)) == 1; +#ifdef CONFIG_ARM64 + u8 rctx = interrupt_context_level(); + u8 *active = this_cpu_ptr(prog->active); + u32 val; + + preempt_disable(); + active[rctx]++; + val = le32_to_cpu(*(__le32 *)active); + preempt_enable(); + if (val != BIT(rctx * 8)) + return false; + + return true; +#else + return this_cpu_inc_return(*(int __percpu *)(prog->active)) == 1; +#endif } static inline void bpf_prog_put_recursion_context(struct bpf_prog *prog) { - this_cpu_dec(*(prog->active)); +#ifdef CONFIG_ARM64 + u8 rctx = interrupt_context_level(); + u8 *active = this_cpu_ptr(prog->active); + + preempt_disable(); + active[rctx]--; + preempt_enable(); +#else + this_cpu_dec(*(int __percpu *)(prog->active)); +#endif } #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c66316e32563..e0b8a8a5aaa9 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -112,7 +112,8 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag vfree(fp); return NULL; } - fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags)); + fp->active = __alloc_percpu_gfp(sizeof(u8[BPF_NR_CONTEXTS]), 4, + bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags)); if (!fp->active) { vfree(fp); kfree(aux); -- cgit v1.2.3 From 94e948b7e684c0465bb3faca8fafee5caf421b84 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 16 Dec 2025 13:29:59 +0000 Subject: bpf: annotate file argument as __nullable in bpf_lsm_mmap_file As reported in [0], anonymous memory mappings are not backed by a struct file instance. Consequently, the struct file pointer passed to the security_mmap_file() LSM hook is NULL in such cases. The BPF verifier is currently unaware of this, allowing BPF LSM programs to dereference this struct file pointer without needing to perform an explicit NULL check. This leads to potential NULL pointer dereference and a kernel crash. Add a strong override for bpf_lsm_mmap_file() which annotates the struct file pointer parameter with the __nullable suffix. 
This explicitly informs the BPF verifier that this pointer (PTR_MAYBE_NULL) can be NULL, forcing BPF LSM programs to perform a check on it before dereferencing it. [0] https://lore.kernel.org/bpf/5e460d3c.4c3e9.19adde547d8.Coremail.kaiyanm@hust.edu.cn/ Reported-by: Kaiyan Mei Reported-by: Yinhao Hu Reviewed-by: Dongliang Mu Closes: https://lore.kernel.org/bpf/5e460d3c.4c3e9.19adde547d8.Coremail.kaiyanm@hust.edu.cn/ Signed-off-by: Matt Bobrowski Acked-by: Song Liu Link: https://lore.kernel.org/r/20251216133000.3690723-1-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- MAINTAINERS | 1 + kernel/bpf/Makefile | 12 +++++++++++- kernel/bpf/bpf_lsm.c | 5 +++-- kernel/bpf/bpf_lsm_proto.c | 19 +++++++++++++++++++ 4 files changed, 34 insertions(+), 3 deletions(-) create mode 100644 kernel/bpf/bpf_lsm_proto.c (limited to 'kernel') diff --git a/MAINTAINERS b/MAINTAINERS index cb1898a85b05..e7027fba97db 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4848,6 +4848,7 @@ S: Maintained F: Documentation/bpf/prog_lsm.rst F: include/linux/bpf_lsm.h F: kernel/bpf/bpf_lsm.c +F: kernel/bpf/bpf_lsm_proto.c F: kernel/trace/bpf_trace.c F: security/bpf/ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 232cbc97434d..79cf22860a99 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -42,7 +42,17 @@ endif ifeq ($(CONFIG_BPF_JIT),y) obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o obj-$(CONFIG_BPF_SYSCALL) += cpumask.o -obj-${CONFIG_BPF_LSM} += bpf_lsm.o +# bpf_lsm_proto.o must precede bpf_lsm.o. The current pahole logic +# deduplicates function prototypes within +# btf_encoder__add_saved_func() by keeping the first instance seen. We +# need the function prototype(s) in bpf_lsm_proto.o to take precedence +# over those within bpf_lsm.o. Having bpf_lsm_proto.o precede +# bpf_lsm.o ensures its DWARF CU is processed early, forcing the +# generated BTF to contain the overrides. +# +# Notably, this is a temporary workaround whilst the deduplication +# semantics within pahole are revisited accordingly. +obj-${CONFIG_BPF_LSM} += bpf_lsm_proto.o bpf_lsm.o endif ifneq ($(CONFIG_CRYPTO),) obj-$(CONFIG_BPF_SYSCALL) += crypto.o diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 7cb6e8d4282c..0c4a0c8e6f70 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -18,10 +18,11 @@ #include /* For every LSM hook that allows attachment of BPF programs, declare a nop - * function where a BPF program can be attached. + * function where a BPF program can be attached. Notably, we qualify each with + * weak linkage such that strong overrides can be implemented if need be. */ #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ -noinline RET bpf_lsm_##NAME(__VA_ARGS__) \ +__weak noinline RET bpf_lsm_##NAME(__VA_ARGS__) \ { \ return DEFAULT; \ } diff --git a/kernel/bpf/bpf_lsm_proto.c b/kernel/bpf/bpf_lsm_proto.c new file mode 100644 index 000000000000..44a54fd8045e --- /dev/null +++ b/kernel/bpf/bpf_lsm_proto.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2025 Google LLC. + */ + +#include +#include + +/* + * Strong definition of the mmap_file() BPF LSM hook. The __nullable suffix on + * the struct file pointer parameter name marks it as PTR_MAYBE_NULL. This + * explicitly enforces that BPF LSM programs check for NULL before attempting to + * dereference it. 
+ */ +int bpf_lsm_mmap_file(struct file *file__nullable, unsigned long reqprot, + unsigned long prot, unsigned long flags) +{ + return 0; +} -- cgit v1.2.3 From 342297d51146b1a3184e53925901a0cc282b3f76 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 22 Dec 2025 05:32:45 -0800 Subject: bpf: allow calling kfuncs from raw_tp programs Associate raw tracepoint program type with the kfunc tracing hook. This allows calling kfuncs from raw_tp programs. Signed-off-by: Puranjay Mohan Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20251222133250.1890587-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 0de8fc8a0e0b..539c9fdea41d 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -8681,6 +8681,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) return BTF_KFUNC_HOOK_STRUCT_OPS; case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_LSM: return BTF_KFUNC_HOOK_TRACING; -- cgit v1.2.3 From ac1c5bc7c4c7e20e2070e6eaa673fc3e11619dbb Mon Sep 17 00:00:00 2001 From: Daniel Gomez Date: Sat, 20 Dec 2025 04:48:09 +0100 Subject: bpf: crypto: replace -EEXIST with -EBUSY The -EEXIST error code is reserved by the module loading infrastructure to indicate that a module is already loaded. When a module's init function returns -EEXIST, userspace tools like kmod interpret this as "module already loaded" and treat the operation as successful, returning 0 to the user even though the module initialization actually failed. This follows the precedent set by commit 54416fd76770 ("netfilter: conntrack: helper: Replace -EEXIST by -EBUSY") which fixed the same issue in nf_conntrack_helper_register(). This affects bpf_crypto_skcipher module. While the configuration required to build it as a module is unlikely in practice, it is technically possible, so fix it for correctness. Signed-off-by: Daniel Gomez Acked-by: Vadim Fedorenko Link: https://lore.kernel.org/r/20251220-dev-module-init-eexists-bpf-v1-1-7f186663dbe7@samsung.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c index 83c4d9943084..1ab79a6dec84 100644 --- a/kernel/bpf/crypto.c +++ b/kernel/bpf/crypto.c @@ -60,7 +60,7 @@ struct bpf_crypto_ctx { int bpf_crypto_register_type(const struct bpf_crypto_type *type) { struct bpf_crypto_type_list *node; - int err = -EEXIST; + int err = -EBUSY; down_write(&bpf_crypto_types_sem); list_for_each_entry(node, &bpf_crypto_types, list) { -- cgit v1.2.3 From c336b0b327120052c331f6839ee60069065a7c74 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 22 Dec 2025 11:50:16 -0800 Subject: bpf: arena: populate vm_area without allocating memory vm_area_map_pages() may allocate memory while inserting pages into bpf arena's vm_area. In order to make bpf_arena_alloc_pages() kfunc non-sleepable change bpf arena to populate pages without allocating memory: - at arena creation time populate all page table levels except the last level - when new pages need to be inserted call apply_to_page_range() again with apply_range_set_cb() which will only set_pte_at() those pages and will not allocate memory. - when freeing pages call apply_to_existing_page_range with apply_range_clear_cb() to clear the pte for the page to be removed. 
This doesn't free intermediate page table levels. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251222195022.431211-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/arena.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 90 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 872dc0e41c65..55b198b9f1a3 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -2,11 +2,13 @@ /* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ #include #include +#include #include #include "linux/filter.h" #include #include #include +#include #include "range_tree.h" /* @@ -92,6 +94,68 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr) return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT; } +struct apply_range_data { + struct page **pages; + int i; +}; + +static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data) +{ + struct apply_range_data *d = data; + struct page *page; + + if (!data) + return 0; + /* sanity check */ + if (unlikely(!pte_none(ptep_get(pte)))) + return -EBUSY; + + page = d->pages[d->i]; + /* paranoia, similar to vmap_pages_pte_range() */ + if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page)))) + return -EINVAL; + + set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL)); + d->i++; + return 0; +} + +static void flush_vmap_cache(unsigned long start, unsigned long size) +{ + flush_cache_vmap(start, start + size); +} + +static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data) +{ + pte_t old_pte; + struct page *page; + + /* sanity check */ + old_pte = ptep_get(pte); + if (pte_none(old_pte) || !pte_present(old_pte)) + return 0; /* nothing to do */ + + /* get page and free it */ + page = pte_page(old_pte); + if (WARN_ON_ONCE(!page)) + return -EINVAL; + + pte_clear(&init_mm, addr, pte); + + /* ensure no stale TLB entries */ + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); + + __free_page(page); + + return 0; +} + +static int populate_pgtable_except_pte(struct bpf_arena *arena) +{ + return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena), + KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL); +} + static struct bpf_map *arena_map_alloc(union bpf_attr *attr) { struct vm_struct *kern_vm; @@ -144,6 +208,12 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) goto err; } mutex_init(&arena->lock); + err = populate_pgtable_except_pte(arena); + if (err) { + range_tree_destroy(&arena->rt); + bpf_map_area_free(arena); + goto err; + } return &arena->map; err: @@ -286,6 +356,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) if (ret) return VM_FAULT_SIGSEGV; + struct apply_range_data data = { .pages = &page, .i = 0 }; /* Account into memcg of the process that created bpf_arena */ ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page); if (ret) { @@ -293,12 +364,13 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) return VM_FAULT_SIGSEGV; } - ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page); + ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); __free_page(page); return VM_FAULT_SIGSEGV; } + flush_vmap_cache(kaddr, PAGE_SIZE); out: page_ref_add(page, 1); vmf->page = page; @@ -428,7 +500,8 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt /* user_vm_end/start are fixed before bpf prog runs */ long page_cnt_max = (arena->user_vm_end - 
arena->user_vm_start) >> PAGE_SHIFT; u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena); - struct page **pages; + struct page **pages = NULL; + long mapped = 0; long pgoff = 0; u32 uaddr32; int ret, i; @@ -450,7 +523,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt if (!pages) return 0; - guard(mutex)(&arena->lock); + mutex_lock(&arena->lock); if (uaddr) { ret = is_range_tree_set(&arena->rt, pgoff, page_cnt); @@ -465,6 +538,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt if (ret) goto out_free_pages; + struct apply_range_data data = { .pages = pages, .i = 0 }; ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages); if (ret) goto out; @@ -477,18 +551,24 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow * lower 32-bit and it's ok. */ - ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32, - kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages); - if (ret) { - for (i = 0; i < page_cnt; i++) + apply_to_page_range(&init_mm, kern_vm_start + uaddr32, + page_cnt << PAGE_SHIFT, apply_range_set_cb, &data); + mapped = data.i; + flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT); + if (mapped < page_cnt) { + for (i = mapped; i < page_cnt; i++) __free_page(pages[i]); goto out; } + mutex_unlock(&arena->lock); kvfree(pages); return clear_lo32(arena->user_vm_start) + uaddr32; out: - range_tree_set(&arena->rt, pgoff, page_cnt); + range_tree_set(&arena->rt, pgoff + mapped, page_cnt - mapped); out_free_pages: + mutex_unlock(&arena->lock); + if (mapped) + arena_free_pages(arena, uaddr32, mapped); kvfree(pages); return 0; } @@ -545,8 +625,8 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) * page_cnt is big it's faster to do the batched zap. */ zap_pages(arena, full_uaddr, 1); - vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE); - __free_page(page); + apply_to_existing_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_clear_cb, + NULL); } } -- cgit v1.2.3 From 360c35f8ffae0f184805d9eb7d126474345bac9b Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 22 Dec 2025 11:50:17 -0800 Subject: bpf: arena: use kmalloc_nolock() in place of kvcalloc() To make arena_alloc_pages() safe to be called from any context, replace kvcalloc() with kmalloc_nolock() so as it doesn't sleep or take any locks. kmalloc_nolock() returns NULL for allocations larger than KMALLOC_MAX_CACHE_SIZE, which is (PAGE_SIZE * 2) = 8KB on systems with 4KB pages. So, round down the allocation done by kmalloc_nolock to 1024 * 8 and reuse the array in a loop. 
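A quick worked example of the cap, assuming 4 KiB pages and 8-byte pointers (the common case the message refers to):

/* Illustrative arithmetic only:
 *   KMALLOC_MAX_CACHE_SIZE = PAGE_SIZE * 2           = 8192 bytes
 *   8192 / sizeof(struct page *) = 8192 / 8          = 1024 pointers
 * so a request for, say, 5000 pages reuses one 8 KiB pointer array
 * across DIV_ROUND_UP(5000, 1024) = 5 passes of the mapping loop.
 */
alloc_pages = min(page_cnt, KMALLOC_MAX_CACHE_SIZE / sizeof(struct page *));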
Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251222195022.431211-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/arena.c | 84 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 55b198b9f1a3..128efb68d47b 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -44,6 +44,8 @@ #define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1) #define KERN_VM_SZ (SZ_4G + GUARD_SZ) +static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt); + struct bpf_arena { struct bpf_map map; u64 user_vm_start; @@ -500,8 +502,10 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt /* user_vm_end/start are fixed before bpf prog runs */ long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena); + struct apply_range_data data; struct page **pages = NULL; - long mapped = 0; + long remaining, mapped = 0; + long alloc_pages; long pgoff = 0; u32 uaddr32; int ret, i; @@ -518,17 +522,19 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt return 0; } - /* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */ - pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL); + /* Cap allocation size to KMALLOC_MAX_CACHE_SIZE so kmalloc_nolock() can succeed. */ + alloc_pages = min(page_cnt, KMALLOC_MAX_CACHE_SIZE / sizeof(struct page *)); + pages = kmalloc_nolock(alloc_pages * sizeof(struct page *), 0, NUMA_NO_NODE); if (!pages) return 0; + data.pages = pages; mutex_lock(&arena->lock); if (uaddr) { ret = is_range_tree_set(&arena->rt, pgoff, page_cnt); if (ret) - goto out_free_pages; + goto out_unlock_free_pages; ret = range_tree_clear(&arena->rt, pgoff, page_cnt); } else { ret = pgoff = range_tree_find(&arena->rt, page_cnt); @@ -536,40 +542,60 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt ret = range_tree_clear(&arena->rt, pgoff, page_cnt); } if (ret) - goto out_free_pages; - - struct apply_range_data data = { .pages = pages, .i = 0 }; - ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages); - if (ret) - goto out; + goto out_unlock_free_pages; + remaining = page_cnt; uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE); - /* Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1 - * will not overflow 32-bit. Lower 32-bit need to represent - * contiguous user address range. - * Map these pages at kern_vm_start base. - * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow - * lower 32-bit and it's ok. - */ - apply_to_page_range(&init_mm, kern_vm_start + uaddr32, - page_cnt << PAGE_SHIFT, apply_range_set_cb, &data); - mapped = data.i; - flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT); - if (mapped < page_cnt) { - for (i = mapped; i < page_cnt; i++) - __free_page(pages[i]); - goto out; + + while (remaining) { + long this_batch = min(remaining, alloc_pages); + + /* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */ + memset(pages, 0, this_batch * sizeof(struct page *)); + + ret = bpf_map_alloc_pages(&arena->map, node_id, this_batch, pages); + if (ret) + goto out; + + /* + * Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1 + * will not overflow 32-bit. 
Lower 32-bit need to represent + * contiguous user address range. + * Map these pages at kern_vm_start base. + * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow + * lower 32-bit and it's ok. + */ + data.i = 0; + ret = apply_to_page_range(&init_mm, + kern_vm_start + uaddr32 + (mapped << PAGE_SHIFT), + this_batch << PAGE_SHIFT, apply_range_set_cb, &data); + if (ret) { + /* data.i pages were mapped, account them and free the remaining */ + mapped += data.i; + for (i = data.i; i < this_batch; i++) + __free_page(pages[i]); + goto out; + } + + mapped += this_batch; + remaining -= this_batch; } + flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT); mutex_unlock(&arena->lock); - kvfree(pages); + kfree_nolock(pages); return clear_lo32(arena->user_vm_start) + uaddr32; out: range_tree_set(&arena->rt, pgoff + mapped, page_cnt - mapped); -out_free_pages: mutex_unlock(&arena->lock); - if (mapped) + if (mapped) { + flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT); arena_free_pages(arena, uaddr32, mapped); - kvfree(pages); + } + goto out_free_pages; +out_unlock_free_pages: + mutex_unlock(&arena->lock); +out_free_pages: + kfree_nolock(pages); return 0; } -- cgit v1.2.3 From b8467290edab4bafae352bf3f317055669a1a458 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 22 Dec 2025 11:50:18 -0800 Subject: bpf: arena: make arena kfuncs any context safe Make arena related kfuncs any context safe by the following changes: bpf_arena_alloc_pages() and bpf_arena_reserve_pages(): Replace the usage of the mutex with a rqspinlock for range tree and use kmalloc_nolock() wherever needed. Use free_pages_nolock() to free pages from any context. apply_range_set/clear_cb() with apply_to_page_range() has already made populating the vm_area in bpf_arena_alloc_pages() any context safe. bpf_arena_free_pages(): defer the main logic to a workqueue if it is called from a non-sleepable context. specialize_kfunc() is used to replace the sleepable arena_free_pages() with bpf_arena_free_pages_non_sleepable() when the verifier detects the call is from a non-sleepable context. In the non-sleepable case, arena_free_pages() queues the address and the page count to be freed to a lock-less list of struct arena_free_spans and raises an irq_work. The irq_work handler calls schedules_work() as it is safe to be called from irq context. arena_free_worker() (the work queue handler) iterates these spans and clears ptes, flushes tlb, zaps pages, and calls __free_page(). 
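The deferral chain described above can be pictured with the following sketch of the irq_work handler; the actual bodies of arena_free_irq() and arena_free_worker() fall outside the hunks quoted below, so this only shows the assumed shape:

static void arena_free_irq(struct irq_work *iw)
{
	struct bpf_arena *arena = container_of(iw, struct bpf_arena, free_irq);

	/* irq_work runs in hard-irq context, where schedule_work() is safe.
	 * The heavy lifting (clearing ptes, flushing the TLB, zapping user
	 * mappings and __free_page()) happens later in arena_free_worker()
	 * in process context.
	 */
	schedule_work(&arena->free_work);
}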
Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251222195022.431211-4-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 16 ++++ kernel/bpf/arena.c | 248 +++++++++++++++++++++++++++++++++++++++++--------- kernel/bpf/verifier.c | 10 ++ 3 files changed, 233 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index da6a00dd313f..4e7d72dfbcd4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -673,6 +673,22 @@ void bpf_map_free_internal_structs(struct bpf_map *map, void *obj); int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit); +#if defined(CONFIG_MMU) && defined(CONFIG_64BIT) +void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id, + u64 flags); +void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt); +#else +static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, + int node_id, u64 flags) +{ + return NULL; +} + +static inline void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt) +{ +} +#endif + extern const struct bpf_map_ops bpf_map_offload_ops; /* bpf_type_flag contains a set of flags that are applicable to the values of diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 128efb68d47b..456ac989269d 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -4,7 +4,9 @@ #include #include #include +#include #include "linux/filter.h" +#include #include #include #include @@ -44,7 +46,7 @@ #define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1) #define KERN_VM_SZ (SZ_4G + GUARD_SZ) -static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt); +static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable); struct bpf_arena { struct bpf_map map; @@ -52,8 +54,23 @@ struct bpf_arena { u64 user_vm_end; struct vm_struct *kern_vm; struct range_tree rt; + /* protects rt */ + rqspinlock_t spinlock; struct list_head vma_list; + /* protects vma_list */ struct mutex lock; + struct irq_work free_irq; + struct work_struct free_work; + struct llist_head free_spans; +}; + +static void arena_free_worker(struct work_struct *work); +static void arena_free_irq(struct irq_work *iw); + +struct arena_free_span { + struct llist_node node; + unsigned long uaddr; + u32 page_cnt; }; u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) @@ -127,7 +144,7 @@ static void flush_vmap_cache(unsigned long start, unsigned long size) flush_cache_vmap(start, start + size); } -static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data) +static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages) { pte_t old_pte; struct page *page; @@ -137,17 +154,15 @@ static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data) if (pte_none(old_pte) || !pte_present(old_pte)) return 0; /* nothing to do */ - /* get page and free it */ page = pte_page(old_pte); if (WARN_ON_ONCE(!page)) return -EINVAL; pte_clear(&init_mm, addr, pte); - /* ensure no stale TLB entries */ - flush_tlb_kernel_range(addr, addr + PAGE_SIZE); - - __free_page(page); + /* Add page to the list so it is freed later */ + if (free_pages) + __llist_add(&page->pcp_llist, free_pages); return 0; } @@ -202,6 +217,9 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) arena->user_vm_end = arena->user_vm_start + vm_range; 
INIT_LIST_HEAD(&arena->vma_list); + init_llist_head(&arena->free_spans); + init_irq_work(&arena->free_irq, arena_free_irq); + INIT_WORK(&arena->free_work, arena_free_worker); bpf_map_init_from_attr(&arena->map, attr); range_tree_init(&arena->rt); err = range_tree_set(&arena->rt, 0, attr->max_entries); @@ -210,6 +228,7 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) goto err; } mutex_init(&arena->lock); + raw_res_spin_lock_init(&arena->spinlock); err = populate_pgtable_except_pte(arena); if (err) { range_tree_destroy(&arena->rt); @@ -256,6 +275,10 @@ static void arena_map_free(struct bpf_map *map) if (WARN_ON_ONCE(!list_empty(&arena->vma_list))) return; + /* Ensure no pending deferred frees */ + irq_work_sync(&arena->free_irq); + flush_work(&arena->free_work); + /* * free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area(). * It unmaps everything from vmalloc area and clears pgtables. @@ -339,12 +362,16 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) struct bpf_arena *arena = container_of(map, struct bpf_arena, map); struct page *page; long kbase, kaddr; + unsigned long flags; int ret; kbase = bpf_arena_get_kern_vm_start(arena); kaddr = kbase + (u32)(vmf->address); - guard(mutex)(&arena->lock); + if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) + /* Make a reasonable effort to address impossible case */ + return VM_FAULT_RETRY; + page = vmalloc_to_page((void *)kaddr); if (page) /* already have a page vmap-ed */ @@ -352,31 +379,35 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT) /* User space requested to segfault when page is not allocated by bpf prog */ - return VM_FAULT_SIGSEGV; + goto out_unlock_sigsegv; ret = range_tree_clear(&arena->rt, vmf->pgoff, 1); if (ret) - return VM_FAULT_SIGSEGV; + goto out_unlock_sigsegv; struct apply_range_data data = { .pages = &page, .i = 0 }; /* Account into memcg of the process that created bpf_arena */ ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); - return VM_FAULT_SIGSEGV; + goto out_unlock_sigsegv; } ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); - __free_page(page); - return VM_FAULT_SIGSEGV; + free_pages_nolock(page, 0); + goto out_unlock_sigsegv; } flush_vmap_cache(kaddr, PAGE_SIZE); out: page_ref_add(page, 1); + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); vmf->page = page; return 0; +out_unlock_sigsegv: + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); + return VM_FAULT_SIGSEGV; } static const struct vm_operations_struct arena_vm_ops = { @@ -497,7 +528,8 @@ static u64 clear_lo32(u64 val) * Allocate pages and vmap them into kernel vmalloc area. * Later the pages will be mmaped into user space vma. 
*/ -static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id) +static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id, + bool sleepable) { /* user_vm_end/start are fixed before bpf prog runs */ long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; @@ -506,6 +538,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt struct page **pages = NULL; long remaining, mapped = 0; long alloc_pages; + unsigned long flags; long pgoff = 0; u32 uaddr32; int ret, i; @@ -529,7 +562,8 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt return 0; data.pages = pages; - mutex_lock(&arena->lock); + if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) + goto out_free_pages; if (uaddr) { ret = is_range_tree_set(&arena->rt, pgoff, page_cnt); @@ -573,7 +607,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt /* data.i pages were mapped, account them and free the remaining */ mapped += data.i; for (i = data.i; i < this_batch; i++) - __free_page(pages[i]); + free_pages_nolock(pages[i], 0); goto out; } @@ -581,19 +615,19 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt remaining -= this_batch; } flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT); - mutex_unlock(&arena->lock); + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); kfree_nolock(pages); return clear_lo32(arena->user_vm_start) + uaddr32; out: range_tree_set(&arena->rt, pgoff + mapped, page_cnt - mapped); - mutex_unlock(&arena->lock); + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); if (mapped) { flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT); - arena_free_pages(arena, uaddr32, mapped); + arena_free_pages(arena, uaddr32, mapped, sleepable); } goto out_free_pages; out_unlock_free_pages: - mutex_unlock(&arena->lock); + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); out_free_pages: kfree_nolock(pages); return 0; @@ -608,42 +642,64 @@ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) { struct vma_list *vml; + guard(mutex)(&arena->lock); + /* iterate link list under lock */ list_for_each_entry(vml, &arena->vma_list, head) zap_page_range_single(vml->vma, uaddr, PAGE_SIZE * page_cnt, NULL); } -static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) +static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable) { u64 full_uaddr, uaddr_end; - long kaddr, pgoff, i; + long kaddr, pgoff; struct page *page; + struct llist_head free_pages; + struct llist_node *pos, *t; + struct arena_free_span *s; + unsigned long flags; + int ret = 0; /* only aligned lower 32-bit are relevant */ uaddr = (u32)uaddr; uaddr &= PAGE_MASK; + kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr; full_uaddr = clear_lo32(arena->user_vm_start) + uaddr; uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT)); if (full_uaddr >= uaddr_end) return; page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT; + pgoff = compute_pgoff(arena, uaddr); - guard(mutex)(&arena->lock); + if (!sleepable) + goto defer; + + ret = raw_res_spin_lock_irqsave(&arena->spinlock, flags); + + /* Can't proceed without holding the spinlock so defer the free */ + if (ret) + goto defer; - pgoff = compute_pgoff(arena, uaddr); - /* clear range */ range_tree_set(&arena->rt, pgoff, page_cnt); + init_llist_head(&free_pages); + /* clear ptes and collect 
struct pages */ + apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT, + apply_range_clear_cb, &free_pages); + + /* drop the lock to do the tlb flush and zap pages */ + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); + + /* ensure no stale TLB entries */ + flush_tlb_kernel_range(kaddr, kaddr + (page_cnt * PAGE_SIZE)); + if (page_cnt > 1) /* bulk zap if multiple pages being freed */ zap_pages(arena, full_uaddr, page_cnt); - kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr; - for (i = 0; i < page_cnt; i++, kaddr += PAGE_SIZE, full_uaddr += PAGE_SIZE) { - page = vmalloc_to_page((void *)kaddr); - if (!page) - continue; + llist_for_each_safe(pos, t, __llist_del_all(&free_pages)) { + page = llist_entry(pos, struct page, pcp_llist); if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */ /* Optimization for the common case of page_cnt==1: * If page wasn't mapped into some user vma there @@ -651,9 +707,25 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) * page_cnt is big it's faster to do the batched zap. */ zap_pages(arena, full_uaddr, 1); - apply_to_existing_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_clear_cb, - NULL); + __free_page(page); } + + return; + +defer: + s = kmalloc_nolock(sizeof(struct arena_free_span), 0, -1); + if (!s) + /* + * If allocation fails in non-sleepable context, pages are intentionally left + * inaccessible (leaked) until the arena is destroyed. Cleanup or retries are not + * possible here, so we intentionally omit them for safety. + */ + return; + + s->page_cnt = page_cnt; + s->uaddr = uaddr; + llist_add(&s->node, &arena->free_spans); + irq_work_queue(&arena->free_irq); } /* @@ -663,6 +735,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt) { long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; + unsigned long flags; long pgoff; int ret; @@ -673,15 +746,87 @@ static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt if (pgoff + page_cnt > page_cnt_max) return -EINVAL; - guard(mutex)(&arena->lock); + if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) + return -EBUSY; /* Cannot guard already allocated pages. */ ret = is_range_tree_set(&arena->rt, pgoff, page_cnt); - if (ret) - return -EBUSY; + if (ret) { + ret = -EBUSY; + goto out; + } /* "Allocate" the region to prevent it from being allocated. 
*/ - return range_tree_clear(&arena->rt, pgoff, page_cnt); + ret = range_tree_clear(&arena->rt, pgoff, page_cnt); +out: + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); + return ret; +} + +static void arena_free_worker(struct work_struct *work) +{ + struct bpf_arena *arena = container_of(work, struct bpf_arena, free_work); + struct llist_node *list, *pos, *t; + struct arena_free_span *s; + u64 arena_vm_start, user_vm_start; + struct llist_head free_pages; + struct page *page; + unsigned long full_uaddr; + long kaddr, page_cnt, pgoff; + unsigned long flags; + + if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) { + schedule_work(work); + return; + } + + init_llist_head(&free_pages); + arena_vm_start = bpf_arena_get_kern_vm_start(arena); + user_vm_start = bpf_arena_get_user_vm_start(arena); + + list = llist_del_all(&arena->free_spans); + llist_for_each(pos, list) { + s = llist_entry(pos, struct arena_free_span, node); + page_cnt = s->page_cnt; + kaddr = arena_vm_start + s->uaddr; + pgoff = compute_pgoff(arena, s->uaddr); + + /* clear ptes and collect pages in free_pages llist */ + apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT, + apply_range_clear_cb, &free_pages); + + range_tree_set(&arena->rt, pgoff, page_cnt); + } + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); + + /* Iterate the list again without holding spinlock to do the tlb flush and zap_pages */ + llist_for_each_safe(pos, t, list) { + s = llist_entry(pos, struct arena_free_span, node); + page_cnt = s->page_cnt; + full_uaddr = clear_lo32(user_vm_start) + s->uaddr; + kaddr = arena_vm_start + s->uaddr; + + /* ensure no stale TLB entries */ + flush_tlb_kernel_range(kaddr, kaddr + (page_cnt * PAGE_SIZE)); + + /* remove pages from user vmas */ + zap_pages(arena, full_uaddr, page_cnt); + + kfree_nolock(s); + } + + /* free all pages collected by apply_to_existing_page_range() in the first loop */ + llist_for_each_safe(pos, t, __llist_del_all(&free_pages)) { + page = llist_entry(pos, struct page, pcp_llist); + __free_page(page); + } +} + +static void arena_free_irq(struct irq_work *iw) +{ + struct bpf_arena *arena = container_of(iw, struct bpf_arena, free_irq); + + schedule_work(&arena->free_work); } __bpf_kfunc_start_defs(); @@ -695,9 +840,20 @@ __bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_ if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt) return NULL; - return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id); + return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true); } +void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, + int node_id, u64 flags) +{ + struct bpf_map *map = p__map; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt) + return NULL; + + return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false); +} __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt) { struct bpf_map *map = p__map; @@ -705,7 +861,17 @@ __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign) return; - arena_free_pages(arena, (long)ptr__ign, page_cnt); + arena_free_pages(arena, (long)ptr__ign, page_cnt, true); +} + +void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt) +{ + struct bpf_map *map = p__map; + struct bpf_arena *arena = 
container_of(map, struct bpf_arena, map); + + if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign) + return; + arena_free_pages(arena, (long)ptr__ign, page_cnt, false); } __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_cnt) @@ -724,9 +890,9 @@ __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_c __bpf_kfunc_end_defs(); BTF_KFUNCS_START(arena_kfuncs) -BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_RET | KF_ARENA_ARG2) -BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2) -BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_ARENA_RET | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_TRUSTED_ARGS | KF_ARENA_ARG2) BTF_KFUNCS_END(arena_kfuncs) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d6b8a77fbe3b..2de1a736ef69 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12380,6 +12380,8 @@ enum special_kfunc_type { KF___bpf_trap, KF_bpf_task_work_schedule_signal_impl, KF_bpf_task_work_schedule_resume_impl, + KF_bpf_arena_alloc_pages, + KF_bpf_arena_free_pages, }; BTF_ID_LIST(special_kfunc_list) @@ -12454,6 +12456,8 @@ BTF_ID(func, bpf_dynptr_file_discard) BTF_ID(func, __bpf_trap) BTF_ID(func, bpf_task_work_schedule_signal_impl) BTF_ID(func, bpf_task_work_schedule_resume_impl) +BTF_ID(func, bpf_arena_alloc_pages) +BTF_ID(func, bpf_arena_free_pages) static bool is_task_work_add_kfunc(u32 func_id) { @@ -22432,6 +22436,12 @@ static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) { if (!env->insn_aux_data[insn_idx].non_sleepable) addr = (unsigned long)bpf_dynptr_from_file_sleepable; + } else if (func_id == special_kfunc_list[KF_bpf_arena_alloc_pages]) { + if (env->insn_aux_data[insn_idx].non_sleepable) + addr = (unsigned long)bpf_arena_alloc_pages_non_sleepable; + } else if (func_id == special_kfunc_list[KF_bpf_arena_free_pages]) { + if (env->insn_aux_data[insn_idx].non_sleepable) + addr = (unsigned long)bpf_arena_free_pages_non_sleepable; } desc->addr = addr; return 0; -- cgit v1.2.3 From f597664454bde5ac45ceaf24da55b590ccfa60e3 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 29 Dec 2025 23:13:07 -0800 Subject: bpf: bpf_scc_visit instance and backedges accumulation for bpf_loop() Calls like bpf_loop() or bpf_for_each_map_elem() introduce loops that are not explicitly present in the control-flow graph. The verifier processes such calls by repeatedly interpreting the callback function body within the same verification path (until the current state converges with a previous state). Such loops require a bpf_scc_visit instance in order to allow the accumulation of the state graph backedges. Otherwise, certain checkpoint states created within the bodies of such loops will have incomplete precision marks. See the next patch for an example of a program that leads to the verifier accepting an unsafe program. 
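For context, below is a minimal, hypothetical sketch of such an implicit loop (it is not part of this patch or of the selftests; the names loop_cb, implicit_loop_example and acc are invented). The control-flow graph contains no back-edge at the bpf_loop() call site, yet the verifier must re-interpret loop_cb() on the same verification path until the state converges, which is why such call sites need their own bpf_scc_visit instance to accumulate backedges:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Invoked by the bpf_loop() helper once per iteration. */
static long loop_cb(__u64 index, void *ctx)
{
	__u64 *acc = ctx;	/* hypothetical accumulator passed as callback_ctx */

	*acc += index;
	return 0;		/* 0 = run next iteration, 1 = break out */
}

SEC("socket")
int implicit_loop_example(void *ctx)
{
	__u64 acc = 0;

	/* 16 iterations of loop_cb() with no explicit loop in the CFG */
	bpf_loop(16, loop_cb, &acc, 0);
	return 0;
}

char _license[] SEC("license") = "GPL";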
Fixes: 96c6aa4c63af ("bpf: compute SCCs in program control flow graph") Fixes: c9e31900b54c ("bpf: propagate read/precision marks over state graph backedges") Reported-by: Breno Leitao Signed-off-by: Eduard Zingerman Tested-by: Breno Leitao Link: https://lore.kernel.org/r/20251229-scc-for-callbacks-v1-1-ceadfe679900@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2de1a736ef69..0baae7828af2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19830,8 +19830,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) } } if (bpf_calls_callback(env, insn_idx)) { - if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) + if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) { + loop = true; goto hit; + } goto skip_inf_loop_check; } /* attempt to detect infinite loop to avoid unnecessary doomed work */ @@ -25071,15 +25073,18 @@ dfs_continue: } /* * Assign SCC number only if component has two or more elements, - * or if component has a self reference. + * or if component has a self reference, or if instruction is a + * callback calling function (implicit loop). */ - assign_scc = stack[stack_sz - 1] != w; - for (j = 0; j < succ->cnt; ++j) { + assign_scc = stack[stack_sz - 1] != w; /* two or more elements? */ + for (j = 0; j < succ->cnt; ++j) { /* self reference? */ if (succ->items[j] == w) { assign_scc = true; break; } } + if (bpf_calls_callback(env, w)) /* implicit loop? */ + assign_scc = true; /* Pop component elements from stack */ do { t = stack[--stack_sz]; -- cgit v1.2.3 From 840692326e92b5deb76c224931e8ca145ce7cfb8 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 30 Dec 2025 21:36:03 -0800 Subject: bpf: allow states pruning for misc/invalid slots in iterator loops Within an iterator or callback-based loop, it should be safe to prune the current state if the old state stack slot is marked as STACK_INVALID or STACK_MISC: - either all branches of the old state lead to a program exit; - or some branch of the old state leads to the current state. This is the same logic as applied in non-loop cases when states_equal() is called in NOT_EXACT mode. The test case that exercises stacksafe() and demonstrates the difference in verification performance is included in the next patch. I'm not sure if it is possible to prepare a test case that exercises regsafe(); it appears that the compute_live_registers() pass makes this impossible. Nevertheless, for code readability reasons, I think that stacksafe() and regsafe() should handle STACK_INVALID / NOT_INIT symmetrically. Hence, this commit changes both functions.
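As a rough illustration of the situation this targets (the actual test case is in the next patch; the program below is invented and only sketches the idea), consider a callback-based loop where an 8-byte stack slot is only ever partially written: the bytes touched by the narrow store become STACK_MISC and the untouched bytes stay STACK_INVALID, so under the old stacksafe() check such a slot in an old checkpoint could prevent pruning inside the loop:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

static long touch_cb(__u64 index, void *ctx)
{
	/* 4-byte store into an 8-byte slot: the written bytes become
	 * STACK_MISC, the remaining bytes of the slot stay STACK_INVALID
	 */
	*(__u32 *)ctx = (__u32)index;
	return 0;
}

SEC("socket")
int loop_pruning_example(void *ctx)
{
	__u64 scratch;	/* never fully initialized */

	bpf_loop(8, touch_cb, &scratch, 0);
	return 0;
}

char _license[] SEC("license") = "GPL";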
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251230-loop-stack-misc-pruning-v1-1-585cfd6cec51@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0baae7828af2..3d44c5d06623 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19086,11 +19086,9 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, if (exact == EXACT) return regs_exact(rold, rcur, idmap); - if (rold->type == NOT_INIT) { - if (exact == NOT_EXACT || rcur->type == NOT_INIT) - /* explored state can't have used this */ - return true; - } + if (rold->type == NOT_INIT) + /* explored state can't have used this */ + return true; /* Enforce that register types have to match exactly, including their * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general @@ -19259,7 +19257,7 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, spi = i / BPF_REG_SIZE; - if (exact != NOT_EXACT && + if (exact == EXACT && (i >= cur->allocated_stack || old->stack[spi].slot_type[i % BPF_REG_SIZE] != cur->stack[spi].slot_type[i % BPF_REG_SIZE])) -- cgit v1.2.3 From 1a5c01d2508a845825eece360c6145d7f436dbf8 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:27 -0800 Subject: bpf: Make KF_TRUSTED_ARGS the default for all kfuncs Change the verifier to make trusted args the default requirement for all kfuncs by removing is_kfunc_trusted_args() and treating it as if it always returned true. This works because: 1. Context pointers (xdp_md, __sk_buff, etc.) are handled through their own KF_ARG_PTR_TO_CTX case label and bypass the trusted check 2. Struct_ops callback arguments are already marked as PTR_TRUSTED during initialization and pass is_trusted_reg() 3. KF_RCU kfuncs are handled separately via is_kfunc_rcu() checks at call sites (always checked with || alongside is_kfunc_trusted_args) This simple change makes all kfuncs require trusted args by default while maintaining correct behavior for all existing special cases. Note: This change means kfuncs that previously accepted NULL pointers without KF_TRUSTED_ARGS will now reject NULL at verification time. Several netfilter kfuncs are affected: bpf_xdp_ct_lookup(), bpf_skb_ct_lookup(), bpf_xdp_ct_alloc(), and bpf_skb_ct_alloc() all accept NULL for their bpf_tuple and opts parameters internally (checked in __bpf_nf_ct_lookup), but after this change the verifier rejects NULL before the kfunc is even called. This is acceptable because these kfuncs don't work with NULL parameters in their proper usage. Such calls are now rejected at load time rather than returning an error at run time, which shouldn't make a difference to BPF programs that were using these kfuncs properly. Acked-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 184 +++++++++++++++++++++---------------------- fs/bpf_fs_kfuncs.c | 10 +-- kernel/bpf/verifier.c | 14 +--- 3 files changed, 97 insertions(+), 111 deletions(-) (limited to 'kernel') diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index e38941370b90..6cb6857bfa6f 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -50,7 +50,70 @@ A wrapper kfunc is often needed when we need to annotate parameters of the kfunc.
Otherwise one may directly make the kfunc visible to the BPF program by registering it with the BPF subsystem. See :ref:`BPF_kfunc_nodef`. -2.2 Annotating kfunc parameters +2.2 kfunc Parameters +-------------------- + +All kfuncs now require trusted arguments by default. This means that all +pointer arguments must be valid, and all pointers to BTF objects must be +passed in their unmodified form (at a zero offset, and without having been +obtained from walking another pointer, with exceptions described below). + +There are two types of pointers to kernel objects which are considered "trusted": + +1. Pointers which are passed as tracepoint or struct_ops callback arguments. +2. Pointers which were returned from a KF_ACQUIRE kfunc. + +Pointers to non-BTF objects (e.g. scalar pointers) may also be passed to +kfuncs, and may have a non-zero offset. + +The definition of "valid" pointers is subject to change at any time, and has +absolutely no ABI stability guarantees. + +As mentioned above, a nested pointer obtained from walking a trusted pointer is +no longer trusted, with one exception. If a struct type has a field that is +guaranteed to be valid (trusted or rcu, as in KF_RCU description below) as long +as its parent pointer is valid, the following macros can be used to express +that to the verifier: + +* ``BTF_TYPE_SAFE_TRUSTED`` +* ``BTF_TYPE_SAFE_RCU`` +* ``BTF_TYPE_SAFE_RCU_OR_NULL`` + +For example, + +.. code-block:: c + + BTF_TYPE_SAFE_TRUSTED(struct socket) { + struct sock *sk; + }; + +or + +.. code-block:: c + + BTF_TYPE_SAFE_RCU(struct task_struct) { + const cpumask_t *cpus_ptr; + struct css_set __rcu *cgroups; + struct task_struct __rcu *real_parent; + struct task_struct *group_leader; + }; + +In other words, you must: + +1. Wrap the valid pointer type in a ``BTF_TYPE_SAFE_*`` macro. + +2. Specify the type and name of the valid nested field. This field must match + the field in the original type definition exactly. + +A new type declared by a ``BTF_TYPE_SAFE_*`` macro also needs to be emitted so +that it appears in BTF. For example, ``BTF_TYPE_SAFE_TRUSTED(struct socket)`` +is emitted in the ``type_is_trusted()`` function as follows: + +.. code-block:: c + + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket)); + +2.3 Annotating kfunc parameters ------------------------------- Similar to BPF helpers, there is sometime need for additional context required @@ -58,7 +121,7 @@ by the verifier to make the usage of kernel functions safer and more useful. Hence, we can annotate a parameter by suffixing the name of the argument of the kfunc with a __tag, where tag may be one of the supported annotations. -2.2.1 __sz Annotation +2.3.1 __sz Annotation --------------------- This annotation is used to indicate a memory and size pair in the argument list. @@ -74,7 +137,7 @@ argument as its size. By default, without __sz annotation, the size of the type of the pointer is used. Without __sz annotation, a kfunc cannot accept a void pointer. -2.2.2 __k Annotation +2.3.2 __k Annotation -------------------- This annotation is only understood for scalar arguments, where it indicates that @@ -98,7 +161,7 @@ Hence, whenever a constant scalar argument is accepted by a kfunc which is not a size parameter, and the value of the constant matters for program safety, __k suffix should be used. 
-2.2.3 __uninit Annotation +2.3.3 __uninit Annotation ------------------------- This annotation is used to indicate that the argument will be treated as @@ -115,7 +178,7 @@ Here, the dynptr will be treated as an uninitialized dynptr. Without this annotation, the verifier will reject the program if the dynptr passed in is not initialized. -2.2.4 __opt Annotation +2.3.4 __opt Annotation ------------------------- This annotation is used to indicate that the buffer associated with an __sz or __szk @@ -135,7 +198,7 @@ Either way, the returned buffer is either NULL, or of size buffer_szk. Without t annotation, the verifier will reject the program if a null pointer is passed in with a nonzero size. -2.2.5 __str Annotation +2.3.5 __str Annotation ---------------------------- This annotation is used to indicate that the argument is a constant string. @@ -160,7 +223,7 @@ Or:: ... } -2.2.6 __prog Annotation +2.3.6 __prog Annotation --------------------------- This annotation is used to indicate that the argument needs to be fixed up to the bpf_prog_aux of the caller BPF program. Any value passed into this argument @@ -179,7 +242,7 @@ An example is given below:: .. _BPF_kfunc_nodef: -2.3 Using an existing kernel function +2.4 Using an existing kernel function ------------------------------------- When an existing function in the kernel is fit for consumption by BPF programs, @@ -187,7 +250,7 @@ it can be directly registered with the BPF subsystem. However, care must still be taken to review the context in which it will be invoked by the BPF program and whether it is safe to do so. -2.4 Annotating kfuncs +2.5 Annotating kfuncs --------------------- In addition to kfuncs' arguments, verifier may need more information about the @@ -216,7 +279,7 @@ protected. An example is given below:: ... } -2.4.1 KF_ACQUIRE flag +2.5.1 KF_ACQUIRE flag --------------------- The KF_ACQUIRE flag is used to indicate that the kfunc returns a pointer to a @@ -226,7 +289,7 @@ referenced kptr (by invoking bpf_kptr_xchg). If not, the verifier fails the loading of the BPF program until no lingering references remain in all possible explored states of the program. -2.4.2 KF_RET_NULL flag +2.5.2 KF_RET_NULL flag ---------------------- The KF_RET_NULL flag is used to indicate that the pointer returned by the kfunc @@ -235,87 +298,21 @@ returned from the kfunc before making use of it (dereferencing or passing to another helper). This flag is often used in pairing with KF_ACQUIRE flag, but both are orthogonal to each other. -2.4.3 KF_RELEASE flag +2.5.3 KF_RELEASE flag --------------------- The KF_RELEASE flag is used to indicate that the kfunc releases the pointer passed in to it. There can be only one referenced pointer that can be passed in. All copies of the pointer being released are invalidated as a result of -invoking kfunc with this flag. KF_RELEASE kfuncs automatically receive the -protection afforded by the KF_TRUSTED_ARGS flag described below. - -2.4.4 KF_TRUSTED_ARGS flag --------------------------- - -The KF_TRUSTED_ARGS flag is used for kfuncs taking pointer arguments. It -indicates that the all pointer arguments are valid, and that all pointers to -BTF objects have been passed in their unmodified form (that is, at a zero -offset, and without having been obtained from walking another pointer, with one -exception described below). - -There are two types of pointers to kernel objects which are considered "valid": - -1. Pointers which are passed as tracepoint or struct_ops callback arguments. -2. 
Pointers which were returned from a KF_ACQUIRE kfunc. - -Pointers to non-BTF objects (e.g. scalar pointers) may also be passed to -KF_TRUSTED_ARGS kfuncs, and may have a non-zero offset. - -The definition of "valid" pointers is subject to change at any time, and has -absolutely no ABI stability guarantees. - -As mentioned above, a nested pointer obtained from walking a trusted pointer is -no longer trusted, with one exception. If a struct type has a field that is -guaranteed to be valid (trusted or rcu, as in KF_RCU description below) as long -as its parent pointer is valid, the following macros can be used to express -that to the verifier: - -* ``BTF_TYPE_SAFE_TRUSTED`` -* ``BTF_TYPE_SAFE_RCU`` -* ``BTF_TYPE_SAFE_RCU_OR_NULL`` - -For example, - -.. code-block:: c - - BTF_TYPE_SAFE_TRUSTED(struct socket) { - struct sock *sk; - }; - -or - -.. code-block:: c - - BTF_TYPE_SAFE_RCU(struct task_struct) { - const cpumask_t *cpus_ptr; - struct css_set __rcu *cgroups; - struct task_struct __rcu *real_parent; - struct task_struct *group_leader; - }; - -In other words, you must: - -1. Wrap the valid pointer type in a ``BTF_TYPE_SAFE_*`` macro. - -2. Specify the type and name of the valid nested field. This field must match - the field in the original type definition exactly. - -A new type declared by a ``BTF_TYPE_SAFE_*`` macro also needs to be emitted so -that it appears in BTF. For example, ``BTF_TYPE_SAFE_TRUSTED(struct socket)`` -is emitted in the ``type_is_trusted()`` function as follows: - -.. code-block:: c - - BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket)); - +invoking kfunc with this flag. -2.4.5 KF_SLEEPABLE flag +2.5.4 KF_SLEEPABLE flag ----------------------- The KF_SLEEPABLE flag is used for kfuncs that may sleep. Such kfuncs can only be called by sleepable BPF programs (BPF_F_SLEEPABLE). -2.4.6 KF_DESTRUCTIVE flag +2.5.5 KF_DESTRUCTIVE flag -------------------------- The KF_DESTRUCTIVE flag is used to indicate functions calling which is @@ -324,18 +321,19 @@ rebooting or panicking. Due to this additional restrictions apply to these calls. At the moment they only require CAP_SYS_BOOT capability, but more can be added later. -2.4.7 KF_RCU flag +2.5.6 KF_RCU flag ----------------- -The KF_RCU flag is a weaker version of KF_TRUSTED_ARGS. The kfuncs marked with -KF_RCU expect either PTR_TRUSTED or MEM_RCU arguments. The verifier guarantees -that the objects are valid and there is no use-after-free. The pointers are not -NULL, but the object's refcount could have reached zero. The kfuncs need to -consider doing refcnt != 0 check, especially when returning a KF_ACQUIRE -pointer. Note as well that a KF_ACQUIRE kfunc that is KF_RCU should very likely -also be KF_RET_NULL. +The KF_RCU flag allows kfuncs to opt out of the default trusted args +requirement and accept RCU pointers with weaker guarantees. The kfuncs marked +with KF_RCU expect either PTR_TRUSTED or MEM_RCU arguments. The verifier +guarantees that the objects are valid and there is no use-after-free. The +pointers are not NULL, but the object's refcount could have reached zero. The +kfuncs need to consider doing refcnt != 0 check, especially when returning a +KF_ACQUIRE pointer. Note as well that a KF_ACQUIRE kfunc that is KF_RCU should +very likely also be KF_RET_NULL. -2.4.8 KF_RCU_PROTECTED flag +2.5.7 KF_RCU_PROTECTED flag --------------------------- The KF_RCU_PROTECTED flag is used to indicate that the kfunc must be invoked in @@ -354,7 +352,7 @@ RCU protection but do not take RCU protected arguments. .. 
_KF_deprecated_flag: -2.4.9 KF_DEPRECATED flag +2.5.8 KF_DEPRECATED flag ------------------------ The KF_DEPRECATED flag is used for kfuncs which are scheduled to be @@ -374,7 +372,7 @@ encouraged to make their use-cases known as early as possible, and participate in upstream discussions regarding whether to keep, change, deprecate, or remove those kfuncs if and when such discussions occur. -2.5 Registering the kfuncs +2.6 Registering the kfuncs -------------------------- Once the kfunc is prepared for use, the final step to making it visible is @@ -397,7 +395,7 @@ type. An example is shown below:: } late_initcall(init_subsystem); -2.6 Specifying no-cast aliases with ___init +2.7 Specifying no-cast aliases with ___init -------------------------------------------- The verifier will always enforce that the BTF type of a pointer passed to a diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c index 5ace2511fec5..abd5eaa4892e 100644 --- a/fs/bpf_fs_kfuncs.c +++ b/fs/bpf_fs_kfuncs.c @@ -68,10 +68,7 @@ __bpf_kfunc void bpf_put_file(struct file *file) * * Resolve the pathname for the supplied *path* and store it in *buf*. This BPF * kfunc is the safer variant of the legacy bpf_d_path() helper and should be - * used in place of bpf_d_path() whenever possible. It enforces KF_TRUSTED_ARGS - * semantics, meaning that the supplied *path* must itself hold a valid - * reference, or else the BPF program will be outright rejected by the BPF - * verifier. + * used in place of bpf_d_path() whenever possible. * * This BPF kfunc may only be called from BPF LSM programs. * @@ -377,9 +374,8 @@ static int bpf_fs_kfuncs_filter(const struct bpf_prog *prog, u32 kfunc_id) return -EACCES; } -/* bpf_[set|remove]_dentry_xattr.* hooks have KF_TRUSTED_ARGS and - * KF_SLEEPABLE, so they are only available to sleepable hooks with - * dentry arguments. +/* bpf_[set|remove]_dentry_xattr.* hooks have KF_SLEEPABLE, so they are only + * available to sleepable hooks with dentry arguments. * * Setting and removing xattr requires exclusive lock on dentry->d_inode. 
* Some hooks already locked d_inode, while some hooks have not locked diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3d44c5d06623..359a962d69a1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12040,11 +12040,6 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_RELEASE; } -static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta) -{ - return (meta->kfunc_flags & KF_TRUSTED_ARGS) || is_kfunc_release(meta); -} - static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_SLEEPABLE; @@ -13253,9 +13248,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } - if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) && - (register_is_null(reg) || type_may_be_null(reg->type)) && - !is_kfunc_arg_nullable(meta->btf, &args[i])) { + if ((register_is_null(reg) || type_may_be_null(reg->type)) && + !is_kfunc_arg_nullable(meta->btf, &args[i]) && + !is_kfunc_arg_optional(meta->btf, &args[i])) { verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); return -EACCES; } @@ -13320,9 +13315,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ fallthrough; case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: - if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) - break; - if (!is_trusted_reg(reg)) { if (!is_kfunc_rcu(meta)) { verbose(env, "R%d must be referenced or trusted\n", regno); -- cgit v1.2.3 From 7646c7afd9a95db0b0cb4ad066ed90f6024da67d Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:28 -0800 Subject: bpf: Remove redundant KF_TRUSTED_ARGS flag from all kfuncs Now that KF_TRUSTED_ARGS is the default for all kfuncs, remove the explicit KF_TRUSTED_ARGS flag from all kfunc definitions and remove the flag itself. 
Acked-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- fs/bpf_fs_kfuncs.c | 13 ++++++------- fs/verity/measure.c | 2 +- include/linux/bpf.h | 2 +- include/linux/btf.h | 3 +-- kernel/bpf/arena.c | 6 +++--- kernel/bpf/cpumask.c | 2 +- kernel/bpf/helpers.c | 20 ++++++++++---------- kernel/bpf/map_iter.c | 2 +- kernel/bpf/verifier.c | 2 +- kernel/sched/ext.c | 8 ++++---- mm/bpf_memcontrol.c | 10 +++++----- net/core/filter.c | 10 +++++----- net/core/xdp.c | 2 +- net/netfilter/nf_conntrack_bpf.c | 8 ++++---- net/netfilter/nf_flow_table_bpf.c | 2 +- net/netfilter/nf_nat_bpf.c | 2 +- net/sched/bpf_qdisc.c | 12 ++++++------ tools/testing/selftests/bpf/progs/cpumask_failure.c | 2 +- tools/testing/selftests/bpf/test_kmods/bpf_testmod.c | 20 ++++++++++---------- 19 files changed, 63 insertions(+), 65 deletions(-) (limited to 'kernel') diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c index abd5eaa4892e..e4e51a1d0de2 100644 --- a/fs/bpf_fs_kfuncs.c +++ b/fs/bpf_fs_kfuncs.c @@ -356,14 +356,13 @@ __bpf_kfunc int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__s __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_fs_kfunc_set_ids) -BTF_ID_FLAGS(func, bpf_get_task_exe_file, - KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_get_task_exe_file, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_put_file, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_path_d_path, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_get_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_set_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_remove_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_path_d_path) +BTF_ID_FLAGS(func, bpf_get_dentry_xattr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_set_dentry_xattr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_remove_dentry_xattr, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_fs_kfunc_set_ids) static int bpf_fs_kfuncs_filter(const struct bpf_prog *prog, u32 kfunc_id) diff --git a/fs/verity/measure.c b/fs/verity/measure.c index 388734132f01..6a35623ebdf0 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -162,7 +162,7 @@ __bpf_kfunc int bpf_get_fsverity_digest(struct file *file, struct bpf_dynptr *di __bpf_kfunc_end_defs(); BTF_KFUNCS_START(fsverity_set_ids) -BTF_ID_FLAGS(func, bpf_get_fsverity_digest, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_get_fsverity_digest) BTF_KFUNCS_END(fsverity_set_ids) static int bpf_get_fsverity_digest_filter(const struct bpf_prog *prog, u32 kfunc_id) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4e7d72dfbcd4..9efb2ddf331c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -753,7 +753,7 @@ enum bpf_type_flag { MEM_ALLOC = BIT(11 + BPF_BASE_TYPE_BITS), /* PTR was passed from the kernel in a trusted context, and may be - * passed to KF_TRUSTED_ARGS kfuncs or BPF helper functions. + * passed to kfuncs or BPF helper functions. * Confusingly, this is _not_ the opposite of PTR_UNTRUSTED above. * PTR_UNTRUSTED refers to a kptr that was read directly from a map * without invoking bpf_kptr_xchg(). 
What we really need to know is diff --git a/include/linux/btf.h b/include/linux/btf.h index f06976ffb63f..691f09784933 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -34,7 +34,7 @@ * * And the following kfunc: * - * BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) + * BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE) * * All invocations to the kfunc must pass the unmodified, unwalked task: * @@ -66,7 +66,6 @@ * return 0; * } */ -#define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ #define KF_RCU (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */ diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 456ac989269d..2274319a95e6 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -890,9 +890,9 @@ __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_c __bpf_kfunc_end_defs(); BTF_KFUNCS_START(arena_kfuncs) -BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_ARENA_RET | KF_ARENA_ARG2) -BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_ARENA_ARG2) -BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_TRUSTED_ARGS | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_ARENA_RET | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_ARENA_ARG2) BTF_KFUNCS_END(arena_kfuncs) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 9876c5fe6c2a..b8c805b4b06a 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -477,7 +477,7 @@ __bpf_kfunc_end_defs(); BTF_KFUNCS_START(cpumask_kfunc_btf_ids) BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_first_and, KF_RCU) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index db72b96f9c8c..2c15f77c74db 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4427,7 +4427,7 @@ BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_throw) #ifdef CONFIG_BPF_EVENTS -BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_send_signal_task) #endif #ifdef CONFIG_KEYS BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) @@ -4467,14 +4467,14 @@ BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU) BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY) #ifdef CONFIG_CGROUPS -BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY) -BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY) 
#endif -BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_dynptr_adjust) @@ -4510,8 +4510,8 @@ BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr) BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr) BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE) #endif #ifdef CONFIG_DMA_SHARED_BUFFER BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE) @@ -4536,10 +4536,10 @@ BTF_ID_FLAGS(func, bpf_strncasestr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif -BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_dynptr_from_file, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_stream_vprintk_impl) +BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl) +BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl) +BTF_ID_FLAGS(func, bpf_dynptr_from_file) BTF_ID_FLAGS(func, bpf_dynptr_file_discard) BTF_KFUNCS_END(common_btf_ids) diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 9575314f40a6..261a03ea73d3 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -214,7 +214,7 @@ __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map) __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_map_iter_kfunc_ids) -BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_map_sum_elem_count) BTF_KFUNCS_END(bpf_map_iter_kfunc_ids) static const struct btf_kfunc_id_set bpf_map_iter_kfunc_set = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 359a962d69a1..c9da70dd3e72 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12619,7 +12619,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, /* Enforce strict type matching for calls to kfuncs that are acquiring * or releasing a reference, or are no-cast aliases. We do _not_ - * enforce strict matching for plain KF_TRUSTED_ARGS kfuncs by default, + * enforce strict matching for kfuncs by default, * as we want to enable BPF programs to pass types that are bitwise * equivalent without forcing them to explicitly cast with something * like bpf_cast_to_kern_ctx(). 
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 94164f2dec6d..fd5423428dde 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -7229,9 +7229,9 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) -BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_exit_bstr) +BTF_ID_FLAGS(func, scx_bpf_error_bstr) +BTF_ID_FLAGS(func, scx_bpf_dump_bstr) BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) @@ -7250,7 +7250,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) #endif BTF_ID_FLAGS(func, scx_bpf_now) -BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_events) BTF_KFUNCS_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c index e8fa7f5855f9..716df49d7647 100644 --- a/mm/bpf_memcontrol.c +++ b/mm/bpf_memcontrol.c @@ -166,11 +166,11 @@ BTF_ID_FLAGS(func, bpf_get_root_mem_cgroup, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_get_mem_cgroup, KF_ACQUIRE | KF_RET_NULL | KF_RCU) BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_mem_cgroup_vm_events, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_mem_cgroup_memory_events, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_mem_cgroup_usage, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_mem_cgroup_page_state, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_mem_cgroup_flush_stats, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_mem_cgroup_vm_events) +BTF_ID_FLAGS(func, bpf_mem_cgroup_memory_events) +BTF_ID_FLAGS(func, bpf_mem_cgroup_usage) +BTF_ID_FLAGS(func, bpf_mem_cgroup_page_state) +BTF_ID_FLAGS(func, bpf_mem_cgroup_flush_stats, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_memcontrol_kfuncs) diff --git a/net/core/filter.c b/net/core/filter.c index 616e0520a0bb..d43df98e1ded 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -12438,11 +12438,11 @@ int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, } BTF_KFUNCS_START(bpf_kfunc_check_set_skb) -BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb) BTF_KFUNCS_END(bpf_kfunc_check_set_skb) BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta) -BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta) BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta) BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) @@ -12455,11 +12455,11 @@ BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path) BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr) BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk) -BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk) BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk) BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops) -BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp) BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops) static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { @@ -12554,7 +12554,7 @@ __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock) 
__bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids) -BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_sock_destroy) BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids) static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id) diff --git a/net/core/xdp.c b/net/core/xdp.c index 9100e160113a..fee6d080ee85 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -964,7 +964,7 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, __bpf_kfunc_end_defs(); BTF_KFUNCS_START(xdp_metadata_kfunc_ids) -#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) +#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC BTF_KFUNCS_END(xdp_metadata_kfunc_ids) diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 4a136fc3a9c0..a630139bd0c3 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -516,10 +516,10 @@ BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_skb_ct_lookup, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_ct_insert_entry, KF_ACQUIRE | KF_RET_NULL | KF_RELEASE) BTF_ID_FLAGS(func, bpf_ct_release, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_ct_set_timeout) +BTF_ID_FLAGS(func, bpf_ct_change_timeout) +BTF_ID_FLAGS(func, bpf_ct_set_status) +BTF_ID_FLAGS(func, bpf_ct_change_status) BTF_KFUNCS_END(nf_ct_kfunc_set) static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = { diff --git a/net/netfilter/nf_flow_table_bpf.c b/net/netfilter/nf_flow_table_bpf.c index 4a5f5195f2d2..cbd5b97a6329 100644 --- a/net/netfilter/nf_flow_table_bpf.c +++ b/net/netfilter/nf_flow_table_bpf.c @@ -105,7 +105,7 @@ __diag_pop() __bpf_kfunc_end_defs(); BTF_KFUNCS_START(nf_ft_kfunc_set) -BTF_ID_FLAGS(func, bpf_xdp_flow_lookup, KF_TRUSTED_ARGS | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_xdp_flow_lookup, KF_RET_NULL) BTF_KFUNCS_END(nf_ft_kfunc_set) static const struct btf_kfunc_id_set nf_flow_kfunc_set = { diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c index 481be15609b1..f9dd85ccea01 100644 --- a/net/netfilter/nf_nat_bpf.c +++ b/net/netfilter/nf_nat_bpf.c @@ -55,7 +55,7 @@ __bpf_kfunc int bpf_ct_set_nat_info(struct nf_conn___init *nfct, __bpf_kfunc_end_defs(); BTF_KFUNCS_START(nf_nat_kfunc_set) -BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_ct_set_nat_info) BTF_KFUNCS_END(nf_nat_kfunc_set) static const struct btf_kfunc_id_set nf_bpf_nat_kfunc_set = { diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c index adcb618a2bfc..b9771788b9b3 100644 --- a/net/sched/bpf_qdisc.c +++ b/net/sched/bpf_qdisc.c @@ -271,14 +271,14 @@ __bpf_kfunc void bpf_qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff __bpf_kfunc_end_defs(); BTF_KFUNCS_START(qdisc_kfunc_ids) -BTF_ID_FLAGS(func, bpf_skb_get_hash, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_skb_get_hash) BTF_ID_FLAGS(func, bpf_kfree_skb, KF_RELEASE) BTF_ID_FLAGS(func, bpf_qdisc_skb_drop, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_qdisc_init_prologue, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue, KF_TRUSTED_ARGS) 
-BTF_ID_FLAGS(func, bpf_qdisc_bstats_update, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb) +BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule) +BTF_ID_FLAGS(func, bpf_qdisc_init_prologue) +BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue) +BTF_ID_FLAGS(func, bpf_qdisc_bstats_update) BTF_KFUNCS_END(qdisc_kfunc_ids) BTF_SET_START(qdisc_common_kfunc_set) diff --git a/tools/testing/selftests/bpf/progs/cpumask_failure.c b/tools/testing/selftests/bpf/progs/cpumask_failure.c index 8a2fd596c8a3..61c32e91e8c3 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_failure.c +++ b/tools/testing/selftests/bpf/progs/cpumask_failure.c @@ -110,7 +110,7 @@ SEC("tp_btf/task_newtask") __failure __msg("NULL pointer passed to trusted arg0") int BPF_PROG(test_cpumask_null, struct task_struct *task, u64 clone_flags) { - /* NULL passed to KF_TRUSTED_ARGS kfunc. */ + /* NULL passed to kfunc. */ bpf_cpumask_empty(NULL); return 0; diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 90c4b1a51de6..1c41d03bd5a1 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -693,9 +693,9 @@ BTF_ID_FLAGS(func, bpf_kfunc_dynptr_test) BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_nonzero_offset_test, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_zero_offset_test, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_kfunc_nested_release_test, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_kfunc_trusted_vma_test, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_trusted_task_test, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_vma_test) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_task_test) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test) BTF_ID_FLAGS(func, bpf_kfunc_rcu_task_test, KF_RCU) BTF_ID_FLAGS(func, bpf_kfunc_ret_rcu_test, KF_RET_NULL | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_kfunc_ret_rcu_test_nostruct, KF_RET_NULL | KF_RCU_PROTECTED) @@ -1158,7 +1158,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass2) BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail1) BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail2) BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail3) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_RCU) BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE) BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg) BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset) @@ -1172,12 +1172,12 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_sendmsg, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_sock_sendmsg, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getsockname, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getpeername, KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_prologue, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_impl, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_prologue, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10) +BTF_ID_FLAGS(func, 
bpf_kfunc_multi_st_ops_test_1) +BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_impl) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) -- cgit v1.2.3 From 817593af7b9ba56c23d9dd1858232c5493a14f55 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 12:02:27 -0800 Subject: bpf: syscall: Introduce memcg enter/exit helpers Introduce bpf_map_memcg_enter() and bpf_map_memcg_exit() helpers to reduce code duplication in memcg context management. bpf_map_memcg_enter() gets the memcg from the map, sets it as active, and returns both the previous and the now active memcg. bpf_map_memcg_exit() restores the previous active memcg and releases the reference obtained during enter. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102200230.25168-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 15 +++++++++++++++ kernel/bpf/syscall.c | 54 +++++++++++++++++++++++++--------------------------- 2 files changed, 41 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9efb2ddf331c..4e9667ed6630 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2608,6 +2608,10 @@ struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long nr_pages, struct page **page_array); #ifdef CONFIG_MEMCG +void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg, + struct mem_cgroup **new_memcg); +void bpf_map_memcg_exit(struct mem_cgroup *old_memcg, + struct mem_cgroup *memcg); void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node); void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags, @@ -2632,6 +2636,17 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, kvcalloc(_n, _size, _flags) #define bpf_map_alloc_percpu(_map, _size, _align, _flags) \ __alloc_percpu_gfp(_size, _align, _flags) +static inline void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg, + struct mem_cgroup **new_memcg) +{ + *new_memcg = NULL; + *old_memcg = NULL; +} + +static inline void bpf_map_memcg_exit(struct mem_cgroup *old_memcg, + struct mem_cgroup *memcg) +{ +} #endif static inline int diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a4d38272d8bc..c77ab2e32659 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -505,17 +505,29 @@ static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map) return root_mem_cgroup; } +void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg, + struct mem_cgroup **new_memcg) +{ + *new_memcg = bpf_map_get_memcg(map); + *old_memcg = set_active_memcg(*new_memcg); +} + +void bpf_map_memcg_exit(struct mem_cgroup *old_memcg, + struct mem_cgroup *new_memcg) +{ + set_active_memcg(old_memcg); + mem_cgroup_put(new_memcg); +} + void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node) { struct mem_cgroup *memcg, *old_memcg; void *ptr; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node); - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -526,11 +538,9 @@ void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags struct mem_cgroup *memcg, *old_memcg; void *ptr; - 
memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node); - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -540,11 +550,9 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) struct mem_cgroup *memcg, *old_memcg; void *ptr; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = kzalloc(size, flags | __GFP_ACCOUNT); - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -555,11 +563,9 @@ void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, struct mem_cgroup *memcg, *old_memcg; void *ptr; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT); - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -570,11 +576,9 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, struct mem_cgroup *memcg, *old_memcg; void __percpu *ptr; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT); - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -612,12 +616,9 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long i, j; struct page *pg; int ret = 0; -#ifdef CONFIG_MEMCG struct mem_cgroup *memcg, *old_memcg; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); -#endif + bpf_map_memcg_enter(map, &old_memcg, &memcg); for (i = 0; i < nr_pages; i++) { pg = __bpf_alloc_page(nid); @@ -631,10 +632,7 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid, break; } -#ifdef CONFIG_MEMCG - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); -#endif + bpf_map_memcg_exit(old_memcg, memcg); return ret; } -- cgit v1.2.3 From e66fe1bc6d25d6fbced99de6c377f1b3d961a80e Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 12:02:28 -0800 Subject: bpf: arena: Reintroduce memcg accounting When arena allocations were converted from bpf_map_alloc_pages() to kmalloc_nolock() to support non-sleepable contexts, memcg accounting was inadvertently lost. This commit restores proper memory accounting for all arena-related allocations. All arena related allocations are accounted into memcg of the process that created bpf_arena. 
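A minimal sketch of the pattern, as used in the hunks below (illustration only): each arena allocation site wraps the allocation in the enter/exit helpers from the previous commit and passes __GFP_ACCOUNT, so the charge lands in the memcg of the arena creator.

	struct mem_cgroup *new_memcg, *old_memcg;

	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
	/* allocations in this window are charged to the arena creator's memcg */
	pages = kmalloc_nolock(alloc_pages * sizeof(struct page *), __GFP_ACCOUNT, NUMA_NO_NODE);
	...
	bpf_map_memcg_exit(old_memcg, new_memcg);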
Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102200230.25168-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/arena.c | 29 ++++++++++++++++++++++++++--- kernel/bpf/range_tree.c | 5 +++-- kernel/bpf/syscall.c | 3 --- 3 files changed, 29 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 2274319a95e6..42fae0a9f314 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -360,6 +360,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) { struct bpf_map *map = vmf->vma->vm_file->private_data; struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + struct mem_cgroup *new_memcg, *old_memcg; struct page *page; long kbase, kaddr; unsigned long flags; @@ -377,6 +378,8 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) /* already have a page vmap-ed */ goto out; + bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); + if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT) /* User space requested to segfault when page is not allocated by bpf prog */ goto out_unlock_sigsegv; @@ -400,12 +403,14 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) goto out_unlock_sigsegv; } flush_vmap_cache(kaddr, PAGE_SIZE); + bpf_map_memcg_exit(old_memcg, new_memcg); out: page_ref_add(page, 1); raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); vmf->page = page; return 0; out_unlock_sigsegv: + bpf_map_memcg_exit(old_memcg, new_memcg); raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); return VM_FAULT_SIGSEGV; } @@ -534,6 +539,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt /* user_vm_end/start are fixed before bpf prog runs */ long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena); + struct mem_cgroup *new_memcg, *old_memcg; struct apply_range_data data; struct page **pages = NULL; long remaining, mapped = 0; @@ -555,11 +561,14 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt return 0; } + bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); /* Cap allocation size to KMALLOC_MAX_CACHE_SIZE so kmalloc_nolock() can succeed. 
*/ alloc_pages = min(page_cnt, KMALLOC_MAX_CACHE_SIZE / sizeof(struct page *)); - pages = kmalloc_nolock(alloc_pages * sizeof(struct page *), 0, NUMA_NO_NODE); - if (!pages) + pages = kmalloc_nolock(alloc_pages * sizeof(struct page *), __GFP_ACCOUNT, NUMA_NO_NODE); + if (!pages) { + bpf_map_memcg_exit(old_memcg, new_memcg); return 0; + } data.pages = pages; if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) @@ -617,6 +626,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT); raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); kfree_nolock(pages); + bpf_map_memcg_exit(old_memcg, new_memcg); return clear_lo32(arena->user_vm_start) + uaddr32; out: range_tree_set(&arena->rt, pgoff + mapped, page_cnt - mapped); @@ -630,6 +640,7 @@ out_unlock_free_pages: raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); out_free_pages: kfree_nolock(pages); + bpf_map_memcg_exit(old_memcg, new_memcg); return 0; } @@ -651,6 +662,7 @@ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable) { + struct mem_cgroup *new_memcg, *old_memcg; u64 full_uaddr, uaddr_end; long kaddr, pgoff; struct page *page; @@ -671,6 +683,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT; pgoff = compute_pgoff(arena, uaddr); + bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); if (!sleepable) goto defer; @@ -709,11 +722,13 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, zap_pages(arena, full_uaddr, 1); __free_page(page); } + bpf_map_memcg_exit(old_memcg, new_memcg); return; defer: - s = kmalloc_nolock(sizeof(struct arena_free_span), 0, -1); + s = kmalloc_nolock(sizeof(struct arena_free_span), __GFP_ACCOUNT, -1); + bpf_map_memcg_exit(old_memcg, new_memcg); if (!s) /* * If allocation fails in non-sleepable context, pages are intentionally left @@ -735,6 +750,7 @@ defer: static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt) { long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; + struct mem_cgroup *new_memcg, *old_memcg; unsigned long flags; long pgoff; int ret; @@ -757,7 +773,9 @@ static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt } /* "Allocate" the region to prevent it from being allocated. 
*/ + bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); ret = range_tree_clear(&arena->rt, pgoff, page_cnt); + bpf_map_memcg_exit(old_memcg, new_memcg); out: raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); return ret; @@ -766,6 +784,7 @@ out: static void arena_free_worker(struct work_struct *work) { struct bpf_arena *arena = container_of(work, struct bpf_arena, free_work); + struct mem_cgroup *new_memcg, *old_memcg; struct llist_node *list, *pos, *t; struct arena_free_span *s; u64 arena_vm_start, user_vm_start; @@ -780,6 +799,8 @@ static void arena_free_worker(struct work_struct *work) return; } + bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); + init_llist_head(&free_pages); arena_vm_start = bpf_arena_get_kern_vm_start(arena); user_vm_start = bpf_arena_get_user_vm_start(arena); @@ -820,6 +841,8 @@ static void arena_free_worker(struct work_struct *work) page = llist_entry(pos, struct page, pcp_llist); __free_page(page); } + + bpf_map_memcg_exit(old_memcg, new_memcg); } static void arena_free_irq(struct irq_work *iw) diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c index 99c63d982c5d..2f28886f3ff7 100644 --- a/kernel/bpf/range_tree.c +++ b/kernel/bpf/range_tree.c @@ -149,7 +149,8 @@ int range_tree_clear(struct range_tree *rt, u32 start, u32 len) range_it_insert(rn, rt); /* Add a range */ - new_rn = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE); + new_rn = kmalloc_nolock(sizeof(struct range_node), __GFP_ACCOUNT, + NUMA_NO_NODE); if (!new_rn) return -ENOMEM; new_rn->rn_start = last + 1; @@ -234,7 +235,7 @@ int range_tree_set(struct range_tree *rt, u32 start, u32 len) right->rn_start = start; range_it_insert(right, rt); } else { - left = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE); + left = kmalloc_nolock(sizeof(struct range_node), __GFP_ACCOUNT, NUMA_NO_NODE); if (!left) return -ENOMEM; left->rn_start = start; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c77ab2e32659..6dd2ad2f9e81 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -616,9 +616,7 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long i, j; struct page *pg; int ret = 0; - struct mem_cgroup *memcg, *old_memcg; - bpf_map_memcg_enter(map, &old_memcg, &memcg); for (i = 0; i < nr_pages; i++) { pg = __bpf_alloc_page(nid); @@ -632,7 +630,6 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid, break; } - bpf_map_memcg_exit(old_memcg, memcg); return ret; } -- cgit v1.2.3 From a069190b590e108223cd841a1c2d0bfb92230ecc Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 14:15:12 -0800 Subject: bpf: Replace __opt annotation with __nullable for kfuncs The __opt annotation was originally introduced specifically for buffer/size argument pairs in bpf_dynptr_slice() and bpf_dynptr_slice_rdwr(), allowing the buffer pointer to be NULL while still validating the size as a constant. The __nullable annotation serves the same purpose but is more general and is already used throughout the BPF subsystem for raw tracepoints, struct_ops, and other kfuncs. This patch unifies the two annotations by replacing __opt with __nullable. The key change is in the verifier's get_kfunc_ptr_arg_type() function, where mem/size pair detection is now performed before the nullable check. This ensures that buffer/size pairs are correctly classified as KF_ARG_PTR_TO_MEM_SIZE even when the buffer is nullable, while adding an !arg_mem_size condition to the nullable check prevents interference with mem/size pair handling. 
When processing KF_ARG_PTR_TO_MEM_SIZE arguments, the verifier now uses is_kfunc_arg_nullable() instead of the removed is_kfunc_arg_optional() to determine whether to skip size validation for NULL buffers. This is the first documentation added for the __nullable annotation, which has been in use since it was introduced but was previously undocumented. No functional changes to verifier behavior - nullable buffer/size pairs continue to work exactly as before. Acked-by: Eduard Zingerman Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102221513.1961781-1-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 31 ++++++++++++++++++++----------- include/linux/bpf.h | 2 +- kernel/bpf/helpers.c | 28 ++++++++++++++-------------- kernel/bpf/verifier.c | 23 +++++++++-------------- 4 files changed, 44 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index 6cb6857bfa6f..3eb59a8f9f34 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -178,25 +178,34 @@ Here, the dynptr will be treated as an uninitialized dynptr. Without this annotation, the verifier will reject the program if the dynptr passed in is not initialized. -2.3.4 __opt Annotation -------------------------- +2.3.4 __nullable Annotation +--------------------------- -This annotation is used to indicate that the buffer associated with an __sz or __szk -argument may be null. If the function is passed a nullptr in place of the buffer, -the verifier will not check that length is appropriate for the buffer. The kfunc is -responsible for checking if this buffer is null before using it. +This annotation is used to indicate that the pointer argument may be NULL. +The verifier will allow passing NULL for such arguments. An example is given below:: - __bpf_kfunc void *bpf_dynptr_slice(..., void *buffer__opt, u32 buffer__szk) + __bpf_kfunc void bpf_task_release(struct task_struct *task__nullable) { ... } -Here, the buffer may be null. If buffer is not null, it at least of size buffer_szk. -Either way, the returned buffer is either NULL, or of size buffer_szk. Without this -annotation, the verifier will reject the program if a null pointer is passed in with -a nonzero size. +Here, the task pointer may be NULL. The kfunc is responsible for checking if +the pointer is NULL before dereferencing it. + +The __nullable annotation can be combined with other annotations. For example, +when used with __sz or __szk annotations for memory and size pairs, the +verifier will skip size validation when a NULL pointer is passed, but will +still process the size argument to extract constant size information when +needed:: + + __bpf_kfunc void *bpf_dynptr_slice(..., void *buffer__nullable, + u32 buffer__szk) + +Here, the buffer may be NULL. If the buffer is not NULL, it must be at least +buffer__szk bytes in size. The kfunc is responsible for checking if the buffer +is NULL before using it. 
2.3.5 __str Annotation ---------------------------- diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4e9667ed6630..a63e47d2109c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1434,7 +1434,7 @@ bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr); int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, void *src, u64 len, u64 flags); void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, - void *buffer__opt, u64 buffer__szk); + void *buffer__nullable, u64 buffer__szk); static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u64 offset, u64 len) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 2c15f77c74db..9eaa4185e0a7 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2709,14 +2709,14 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data. * @p: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr - * @buffer__opt: User-provided buffer to copy contents into. May be NULL + * @buffer__nullable: User-provided buffer to copy contents into. May be NULL * @buffer__szk: Size (in bytes) of the buffer if present. This is the * length of the requested slice. This must be a constant. * * For non-skb and non-xdp type dynptrs, there is no difference between * bpf_dynptr_slice and bpf_dynptr_data. * - * If buffer__opt is NULL, the call will fail if buffer_opt was needed. + * If buffer__nullable is NULL, the call will fail if buffer_opt was needed. * * If the intention is to write to the data slice, please use * bpf_dynptr_slice_rdwr. @@ -2734,7 +2734,7 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) * direct pointer) */ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, - void *buffer__opt, u64 buffer__szk) + void *buffer__nullable, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; enum bpf_dynptr_type type; @@ -2755,8 +2755,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, case BPF_DYNPTR_TYPE_RINGBUF: return ptr->data + ptr->offset + offset; case BPF_DYNPTR_TYPE_SKB: - if (buffer__opt) - return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__opt); + if (buffer__nullable) + return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__nullable); else return skb_pointer_if_linear(ptr->data, ptr->offset + offset, len); case BPF_DYNPTR_TYPE_XDP: @@ -2765,16 +2765,16 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, if (!IS_ERR_OR_NULL(xdp_ptr)) return xdp_ptr; - if (!buffer__opt) + if (!buffer__nullable) return NULL; - bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false); - return buffer__opt; + bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__nullable, len, false); + return buffer__nullable; } case BPF_DYNPTR_TYPE_SKB_META: return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset); case BPF_DYNPTR_TYPE_FILE: - err = bpf_file_fetch_bytes(ptr->data, offset, buffer__opt, buffer__szk); - return err ? NULL : buffer__opt; + err = bpf_file_fetch_bytes(ptr->data, offset, buffer__nullable, buffer__szk); + return err ? NULL : buffer__nullable; default: WARN_ONCE(true, "unknown dynptr type %d\n", type); return NULL; @@ -2785,14 +2785,14 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data. 
* @p: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr - * @buffer__opt: User-provided buffer to copy contents into. May be NULL + * @buffer__nullable: User-provided buffer to copy contents into. May be NULL * @buffer__szk: Size (in bytes) of the buffer if present. This is the * length of the requested slice. This must be a constant. * * For non-skb and non-xdp type dynptrs, there is no difference between * bpf_dynptr_slice and bpf_dynptr_data. * - * If buffer__opt is NULL, the call will fail if buffer_opt was needed. + * If buffer__nullable is NULL, the call will fail if buffer_opt was needed. * * The returned pointer is writable and may point to either directly the dynptr * data at the requested offset or to the buffer if unable to obtain a direct @@ -2824,7 +2824,7 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, * direct pointer) */ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, - void *buffer__opt, u64 buffer__szk) + void *buffer__nullable, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; @@ -2853,7 +2853,7 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, * will be copied out into the buffer and the user will need to call * bpf_dynptr_write() to commit changes. */ - return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk); + return bpf_dynptr_slice(p, offset, buffer__nullable, buffer__szk); } __bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c9da70dd3e72..9394b0de2ef0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12086,11 +12086,6 @@ static bool is_kfunc_arg_const_mem_size(const struct btf *btf, return btf_param_match_suffix(btf, arg, "__szk"); } -static bool is_kfunc_arg_optional(const struct btf *btf, const struct btf_param *arg) -{ - return btf_param_match_suffix(btf, arg, "__opt"); -} - static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg) { return btf_param_match_suffix(btf, arg, "__k"); @@ -12510,6 +12505,11 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) return KF_ARG_PTR_TO_CTX; + if (argno + 1 < nargs && + (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]) || + is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]))) + arg_mem_size = true; + /* In this function, we verify the kfunc's BTF as per the argument type, * leaving the rest of the verification with respect to the register * type to our caller. 
When a set of conditions hold in the BTF type of @@ -12518,7 +12518,8 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno)) return KF_ARG_PTR_TO_CTX; - if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg)) + if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg) && + !arg_mem_size) return KF_ARG_PTR_TO_NULL; if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno])) @@ -12575,11 +12576,6 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_callback(env, meta->btf, &args[argno])) return KF_ARG_PTR_TO_CALLBACK; - if (argno + 1 < nargs && - (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]) || - is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]))) - arg_mem_size = true; - /* This is the catch all argument type of register types supported by * check_helper_mem_access. However, we only allow when argument type is * pointer to scalar, or struct composed (recursively) of scalars. When @@ -13249,8 +13245,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } if ((register_is_null(reg) || type_may_be_null(reg->type)) && - !is_kfunc_arg_nullable(meta->btf, &args[i]) && - !is_kfunc_arg_optional(meta->btf, &args[i])) { + !is_kfunc_arg_nullable(meta->btf, &args[i])) { verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); return -EACCES; } @@ -13566,7 +13561,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ struct bpf_reg_state *size_reg = ®s[regno + 1]; const struct btf_param *size_arg = &args[i + 1]; - if (!register_is_null(buff_reg) || !is_kfunc_arg_optional(meta->btf, buff_arg)) { + if (!register_is_null(buff_reg) || !is_kfunc_arg_nullable(meta->btf, buff_arg)) { ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1); if (ret < 0) { verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1); -- cgit v1.2.3 From b25b48c7d37617601ebc8cf2633bee95aa82c697 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 6 Jan 2026 18:36:43 -0500 Subject: bpf: Check active lock count in in_sleepable_context() The in_sleepable_context() function is used to specialize the BPF code in do_misc_fixups(). With the addition of nonsleepable arena kfuncs, there are kfuncs whose specialization depends on whether we are holding a lock. We should use the nonsleepable version while holding a lock and the sleepable one when not. Add a check for active_locks to account for locking when specializing arena kfuncs. 
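For reference, the resulting helper (reassembled from the hunk below) treats a held spinlock the same way as RCU read sections, preempt-disabled sections and IRQ-disabled regions:

	static inline bool in_sleepable_context(struct bpf_verifier_env *env)
	{
		return !env->cur_state->active_rcu_locks &&
		       !env->cur_state->active_preempt_locks &&
		       !env->cur_state->active_locks &&
		       !env->cur_state->active_irq_id &&
		       in_sleepable(env);
	}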
Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260106-arena-under-lock-v2-1-378e9eab3066@etsalapatis.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9394b0de2ef0..7f82e27dd7e7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11466,6 +11466,7 @@ static inline bool in_sleepable_context(struct bpf_verifier_env *env) { return !env->cur_state->active_rcu_locks && !env->cur_state->active_preempt_locks && + !env->cur_state->active_locks && !env->cur_state->active_irq_id && in_sleepable(env); } -- cgit v1.2.3 From 39f77533b6c16e7fbd72e2560e13c9435d2602f5 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 6 Jan 2026 18:36:44 -0500 Subject: bpf: Allow calls to arena functions while holding spinlocks The bpf_arena_*_pages() kfuncs can be called from sleepable contexts, but the verifier still prevents BPF programs from calling them while holding a spinlock. Amend the verifier to allow for BPF programs calling arena page management functions while holding a lock. Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260106-arena-under-lock-v2-2-378e9eab3066@etsalapatis.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7f82e27dd7e7..53635ea2e41b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12373,6 +12373,7 @@ enum special_kfunc_type { KF_bpf_task_work_schedule_resume_impl, KF_bpf_arena_alloc_pages, KF_bpf_arena_free_pages, + KF_bpf_arena_reserve_pages, }; BTF_ID_LIST(special_kfunc_list) @@ -12449,6 +12450,7 @@ BTF_ID(func, bpf_task_work_schedule_signal_impl) BTF_ID(func, bpf_task_work_schedule_resume_impl) BTF_ID(func, bpf_arena_alloc_pages) BTF_ID(func, bpf_arena_free_pages) +BTF_ID(func, bpf_arena_reserve_pages) static bool is_task_work_add_kfunc(u32 func_id) { @@ -12884,10 +12886,17 @@ static bool is_bpf_res_spin_lock_kfunc(u32 btf_id) btf_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]; } +static bool is_bpf_arena_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_arena_alloc_pages] || + btf_id == special_kfunc_list[KF_bpf_arena_free_pages] || + btf_id == special_kfunc_list[KF_bpf_arena_reserve_pages]; +} + static bool kfunc_spin_allowed(u32 btf_id) { return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id) || - is_bpf_res_spin_lock_kfunc(btf_id); + is_bpf_res_spin_lock_kfunc(btf_id) || is_bpf_arena_kfunc(btf_id); } static bool is_sync_callback_calling_kfunc(u32 btf_id) -- cgit v1.2.3 From 2b421662c7887a0649fe409155a1f101562d0fa9 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:16 +0800 Subject: bpf: Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags and check them for following APIs: * 'map_lookup_elem()' * 'map_update_elem()' * 'generic_map_lookup_batch()' * 'generic_map_update_batch()' And, get the correct value size for these APIs. 
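A minimal user-space sketch of the intended flag encoding (libbpf wrappers assumed; per-map-type support is added in later patches of this series):

	/* look up only CPU 2's copy of a percpu value */
	__u64 flags = BPF_F_CPU | ((__u64)2 << 32);
	err = bpf_map_lookup_elem_flags(map_fd, &key, &value, flags);

	/* spread one value across every CPU's copy */
	err = bpf_map_update_elem(map_fd, &key, &value, BPF_F_ALL_CPUS);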
Acked-by: Andrii Nakryiko Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 23 ++++++++++++++++++++++- include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 31 +++++++++++++++++-------------- tools/include/uapi/linux/bpf.h | 2 ++ 4 files changed, 43 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a63e47d2109c..108bab1bda9d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3915,14 +3915,35 @@ bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) } #endif +static inline bool bpf_map_supports_cpu_flags(enum bpf_map_type map_type) +{ + return false; +} + static inline int bpf_map_check_op_flags(struct bpf_map *map, u64 flags, u64 allowed_flags) { - if (flags & ~allowed_flags) + u32 cpu; + + if ((u32)flags & ~allowed_flags) return -EINVAL; if ((flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) return -EINVAL; + if (!(flags & BPF_F_CPU) && flags >> 32) + return -EINVAL; + + if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) { + if (!bpf_map_supports_cpu_flags(map->map_type)) + return -EINVAL; + if ((flags & BPF_F_CPU) && (flags & BPF_F_ALL_CPUS)) + return -EINVAL; + + cpu = flags >> 32; + if ((flags & BPF_F_CPU) && cpu >= num_possible_cpus()) + return -ERANGE; + } + return 0; } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 84ced3ed2d21..2a2ade4be60f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1384,6 +1384,8 @@ enum { BPF_NOEXIST = 1, /* create new element if it didn't exist */ BPF_EXIST = 2, /* update existing element */ BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ + BPF_F_CPU = 8, /* cpu flag for percpu maps, upper 32-bit of flags is a cpu number */ + BPF_F_ALL_CPUS = 16, /* update value across all CPUs for percpu maps */ }; /* flags for BPF_MAP_CREATE command */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6dd2ad2f9e81..e8cfe9d67e64 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -133,12 +133,14 @@ bool bpf_map_write_active(const struct bpf_map *map) return atomic64_read(&map->writecnt) != 0; } -static u32 bpf_map_value_size(const struct bpf_map *map) -{ - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || - map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) +static u32 bpf_map_value_size(const struct bpf_map *map, u64 flags) +{ + if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) + return map->value_size; + else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) return round_up(map->value_size, 8) * num_possible_cpus(); else if (IS_FD_MAP(map)) return sizeof(u32); @@ -1729,7 +1731,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) return -EPERM; - err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK); + err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK | BPF_F_CPU); if (err) return err; @@ -1737,7 +1739,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (IS_ERR(key)) return PTR_ERR(key); - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->flags); err = -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); @@ -1804,7 
+1806,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) goto err_put; } - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->flags); value = kvmemdup_bpfptr(uvalue, value_size); if (IS_ERR(value)) { err = PTR_ERR(value); @@ -2000,11 +2002,12 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file, void *key, *value; int err = 0; - err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK); + err = bpf_map_check_op_flags(map, attr->batch.elem_flags, + BPF_F_LOCK | BPF_F_CPU | BPF_F_ALL_CPUS); if (err) return err; - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->batch.elem_flags); max_count = attr->batch.count; if (!max_count) @@ -2059,11 +2062,11 @@ int generic_map_lookup_batch(struct bpf_map *map, u32 value_size, cp, max_count; int err; - err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK); + err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK | BPF_F_CPU); if (err) return err; - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->batch.elem_flags); max_count = attr->batch.count; if (!max_count) @@ -2185,7 +2188,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) goto err_put; } - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, 0); err = -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6b92b0847ec2..b816bc53d2e1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1384,6 +1384,8 @@ enum { BPF_NOEXIST = 1, /* create new element if it didn't exist */ BPF_EXIST = 2, /* update existing element */ BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ + BPF_F_CPU = 8, /* cpu flag for percpu maps, upper 32-bit of flags is a cpu number */ + BPF_F_ALL_CPUS = 16, /* update value across all CPUs for percpu maps */ }; /* flags for BPF_MAP_CREATE command */ -- cgit v1.2.3 From 8eb76cb03f0f6c2bd7b15cf45dcffcd6bd07a360 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:17 +0800 Subject: bpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu_array maps Introduce support for the BPF_F_ALL_CPUS flag in percpu_array maps to allow updating values for all CPUs with a single value for both update_elem and update_batch APIs. Introduce support for the BPF_F_CPU flag in percpu_array maps to allow: * update value for specified CPU for both update_elem and update_batch APIs. * lookup value for specified CPU for both lookup_elem and lookup_batch APIs. The BPF_F_CPU flag is passed via: * map_flags of lookup_elem and update_elem APIs along with embedded cpu info. * elem_flags of lookup_batch and update_batch APIs along with embedded cpu info. 
Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-3-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 9 +++++++-- kernel/bpf/arraymap.c | 29 +++++++++++++++++++++++------ kernel/bpf/syscall.c | 2 +- 3 files changed, 31 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 108bab1bda9d..dc92d001d978 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2848,7 +2848,7 @@ int map_set_for_each_callback_args(struct bpf_verifier_env *env, struct bpf_func_state *callee); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); -int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); +int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, @@ -3917,7 +3917,12 @@ bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) static inline bool bpf_map_supports_cpu_flags(enum bpf_map_type map_type) { - return false; + switch (map_type) { + case BPF_MAP_TYPE_PERCPU_ARRAY: + return true; + default: + return false; + } } static inline int bpf_map_check_op_flags(struct bpf_map *map, u64 flags, u64 allowed_flags) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 1eeb31c5b317..67e9e811de3a 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -307,7 +307,7 @@ static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, return per_cpu_ptr(array->pptrs[index & array->index_mask], cpu); } -int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) +int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; @@ -325,11 +325,18 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) size = array->elem_size; rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; + if (map_flags & BPF_F_CPU) { + cpu = map_flags >> 32; + copy_map_value(map, value, per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(map, value); + goto unlock; + } for_each_possible_cpu(cpu) { copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(map, value + off); off += size; } +unlock: rcu_read_unlock(); return 0; } @@ -398,10 +405,11 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; void __percpu *pptr; - int cpu, off = 0; + void *ptr, *val; u32 size; + int cpu; - if (unlikely(map_flags > BPF_EXIST)) + if (unlikely((map_flags & BPF_F_LOCK) || (u32)map_flags > BPF_F_ALL_CPUS)) /* unknown flags */ return -EINVAL; @@ -422,11 +430,20 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, size = array->elem_size; rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; + if (map_flags & BPF_F_CPU) { + cpu = map_flags >> 32; + ptr = per_cpu_ptr(pptr, cpu); + copy_map_value(map, ptr, value); + bpf_obj_free_fields(array->map.record, ptr); + goto unlock; + } for_each_possible_cpu(cpu) { - copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off); - bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu)); - off += size; + ptr = per_cpu_ptr(pptr, cpu); + val = (map_flags & BPF_F_ALL_CPUS) ? 
value : value + size * cpu; + copy_map_value(map, ptr, val); + bpf_obj_free_fields(array->map.record, ptr); } +unlock: rcu_read_unlock(); return 0; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e8cfe9d67e64..b5341ab1eb84 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -318,7 +318,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { - err = bpf_percpu_array_copy(map, key, value); + err = bpf_percpu_array_copy(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { err = bpf_percpu_cgroup_storage_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { -- cgit v1.2.3 From c6936161fd55d0c6ffde01da726e66ff5f2934e8 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:18 +0800 Subject: bpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu_hash and lru_percpu_hash maps Introduce BPF_F_ALL_CPUS flag support for percpu_hash and lru_percpu_hash maps to allow updating values for all CPUs with a single value for both update_elem and update_batch APIs. Introduce BPF_F_CPU flag support for percpu_hash and lru_percpu_hash maps to allow: * update value for specified CPU for both update_elem and update_batch APIs. * lookup value for specified CPU for both lookup_elem and lookup_batch APIs. The BPF_F_CPU flag is passed via: * map_flags along with embedded cpu info. * elem_flags along with embedded cpu info. Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-4-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++- kernel/bpf/hashtab.c | 94 +++++++++++++++++++++++++++++++++++----------------- kernel/bpf/syscall.c | 2 +- 3 files changed, 68 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dc92d001d978..f5c259dcc425 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2847,7 +2847,7 @@ int map_set_for_each_callback_args(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee); -int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); +int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 flags); @@ -3919,6 +3919,8 @@ static inline bool bpf_map_supports_cpu_flags(enum bpf_map_type map_type) { switch (map_type) { case BPF_MAP_TYPE_PERCPU_ARRAY: + case BPF_MAP_TYPE_PERCPU_HASH: + case BPF_MAP_TYPE_LRU_PERCPU_HASH: return true; default: return false; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c8a9b27f8663..441ff5bc54ac 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -932,7 +932,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) } static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, - void *value, bool onallcpus) + void *value, bool onallcpus, u64 map_flags) { void *ptr; @@ -943,19 +943,28 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, bpf_obj_free_fields(htab->map.record, ptr); } else { u32 size = round_up(htab->map.value_size, 8); - int off = 0, cpu; + void *val; + int cpu; + + if (map_flags & BPF_F_CPU) { + cpu = map_flags >> 32; + ptr = 
per_cpu_ptr(pptr, cpu); + copy_map_value(&htab->map, ptr, value); + bpf_obj_free_fields(htab->map.record, ptr); + return; + } for_each_possible_cpu(cpu) { ptr = per_cpu_ptr(pptr, cpu); - copy_map_value_long(&htab->map, ptr, value + off); + val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu; + copy_map_value(&htab->map, ptr, val); bpf_obj_free_fields(htab->map.record, ptr); - off += size; } } } static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, - void *value, bool onallcpus) + void *value, bool onallcpus, u64 map_flags) { /* When not setting the initial value on all cpus, zero-fill element * values for other cpus. Otherwise, bpf program has no way to ensure @@ -973,7 +982,7 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu)); } } else { - pcpu_copy_value(htab, pptr, value, onallcpus); + pcpu_copy_value(htab, pptr, value, onallcpus, map_flags); } } @@ -985,7 +994,7 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, - struct htab_elem *old_elem) + struct htab_elem *old_elem, u64 map_flags) { u32 size = htab->map.value_size; bool prealloc = htab_is_prealloc(htab); @@ -1043,7 +1052,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, pptr = *(void __percpu **)ptr; } - pcpu_init_value(htab, pptr, value, onallcpus); + pcpu_init_value(htab, pptr, value, onallcpus, map_flags); if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); @@ -1147,7 +1156,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, } l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, - l_old); + l_old, map_flags); if (IS_ERR(l_new)) { /* all pre-allocated elements are in use or memory exhausted */ ret = PTR_ERR(l_new); @@ -1249,6 +1258,15 @@ err_lock_bucket: return ret; } +static int htab_map_check_update_flags(bool onallcpus, u64 map_flags) +{ + if (unlikely(!onallcpus && map_flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(onallcpus && ((map_flags & BPF_F_LOCK) || (u32)map_flags > BPF_F_ALL_CPUS))) + return -EINVAL; + return 0; +} + static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, void *value, u64 map_flags, bool percpu, bool onallcpus) @@ -1262,9 +1280,9 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, u32 key_size, hash; int ret; - if (unlikely(map_flags > BPF_EXIST)) - /* unknown flags */ - return -EINVAL; + ret = htab_map_check_update_flags(onallcpus, map_flags); + if (unlikely(ret)) + return ret; WARN_ON_ONCE(!bpf_rcu_lock_held()); @@ -1289,7 +1307,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, /* Update value in-place */ if (percpu) { pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), - value, onallcpus); + value, onallcpus, map_flags); } else { void **inner_map_pptr = htab_elem_value(l_old, key_size); @@ -1298,7 +1316,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, } } else { l_new = alloc_htab_elem(htab, key, value, key_size, - hash, percpu, onallcpus, NULL); + hash, percpu, onallcpus, NULL, map_flags); if (IS_ERR(l_new)) { ret = PTR_ERR(l_new); goto err; @@ -1324,9 +1342,9 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, u32 key_size, hash; int ret; - if (unlikely(map_flags > BPF_EXIST)) - /* unknown flags */ - return 
-EINVAL; + ret = htab_map_check_update_flags(onallcpus, map_flags); + if (unlikely(ret)) + return ret; WARN_ON_ONCE(!bpf_rcu_lock_held()); @@ -1363,10 +1381,10 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, /* per-cpu hash map can update value in-place */ pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), - value, onallcpus); + value, onallcpus, map_flags); } else { pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size), - value, onallcpus); + value, onallcpus, map_flags); hlist_nulls_add_head_rcu(&l_new->hash_node, head); l_new = NULL; } @@ -1678,9 +1696,9 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, void __user *ukeys = u64_to_user_ptr(attr->batch.keys); void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); u32 batch, max_count, size, bucket_size, map_id; + u64 elem_map_flags, map_flags, allowed_flags; u32 bucket_cnt, total, key_size, value_size; struct htab_elem *node_to_free = NULL; - u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; struct hlist_nulls_node *n; unsigned long flags = 0; @@ -1690,9 +1708,12 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, int ret = 0; elem_map_flags = attr->batch.elem_flags; - if ((elem_map_flags & ~BPF_F_LOCK) || - ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))) - return -EINVAL; + allowed_flags = BPF_F_LOCK; + if (!do_delete && is_percpu) + allowed_flags |= BPF_F_CPU; + ret = bpf_map_check_op_flags(map, elem_map_flags, allowed_flags); + if (ret) + return ret; map_flags = attr->batch.flags; if (map_flags) @@ -1715,7 +1736,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, key_size = htab->map.key_size; value_size = htab->map.value_size; size = round_up(value_size, 8); - if (is_percpu) + if (is_percpu && !(elem_map_flags & BPF_F_CPU)) value_size = size * num_possible_cpus(); total = 0; /* while experimenting with hash tables with sizes ranging from 10 to @@ -1798,10 +1819,17 @@ again_nocopy: void __percpu *pptr; pptr = htab_elem_get_ptr(l, map->key_size); - for_each_possible_cpu(cpu) { - copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu)); - check_and_init_map_value(&htab->map, dst_val + off); - off += size; + if (elem_map_flags & BPF_F_CPU) { + cpu = elem_map_flags >> 32; + copy_map_value(&htab->map, dst_val, per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(&htab->map, dst_val); + } else { + for_each_possible_cpu(cpu) { + copy_map_value_long(&htab->map, dst_val + off, + per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(&htab->map, dst_val + off); + off += size; + } } } else { value = htab_elem_value(l, key_size); @@ -2357,7 +2385,7 @@ static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *k return NULL; } -int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) +int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct htab_elem *l; void __percpu *pptr; @@ -2374,16 +2402,22 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) l = __htab_map_lookup_elem(map, key); if (!l) goto out; + ret = 0; /* We do not mark LRU map element here in order to not mess up * eviction heuristics when user space does a map walk. 
*/ pptr = htab_elem_get_ptr(l, map->key_size); + if (map_flags & BPF_F_CPU) { + cpu = map_flags >> 32; + copy_map_value(map, value, per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(map, value); + goto out; + } for_each_possible_cpu(cpu) { copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(map, value + off); off += size; } - ret = 0; out: rcu_read_unlock(); return ret; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b5341ab1eb84..5e3b5d828856 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -316,7 +316,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, bpf_disable_instrumentation(); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { - err = bpf_percpu_hash_copy(map, key, value); + err = bpf_percpu_hash_copy(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { -- cgit v1.2.3 From 8526397c3caf8d0289bab69bfb2c889e2b39b30a Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:19 +0800 Subject: bpf: Copy map value using copy_map_value_long for percpu_cgroup_storage maps Copy map value using 'copy_map_value_long()'. It's to keep consistent style with the way of other percpu maps. No functional change intended. Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-5-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/local_storage.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index c93a756e035c..2ab4b60ffe61 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -200,8 +200,7 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key, */ size = round_up(_map->value_size, 8); for_each_possible_cpu(cpu) { - bpf_long_memcpy(value + off, - per_cpu_ptr(storage->percpu_buf, cpu), size); + copy_map_value_long(_map, value + off, per_cpu_ptr(storage->percpu_buf, cpu)); off += size; } rcu_read_unlock(); @@ -234,8 +233,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key, */ size = round_up(_map->value_size, 8); for_each_possible_cpu(cpu) { - bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu), - value + off, size); + copy_map_value_long(_map, per_cpu_ptr(storage->percpu_buf, cpu), value + off); off += size; } rcu_read_unlock(); -- cgit v1.2.3 From 47c79f05aa0d289f760caf5ad6521fd8dbae37a1 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:20 +0800 Subject: bpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu_cgroup_storage maps Introduce BPF_F_ALL_CPUS flag support for percpu_cgroup_storage maps to allow updating values for all CPUs with a single value for update_elem API. Introduce BPF_F_CPU flag support for percpu_cgroup_storage maps to allow: * update value for specified CPU for update_elem API. * lookup value for specified CPU for lookup_elem API. The BPF_F_CPU flag is passed via map_flags along with embedded cpu info. 
Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-6-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 4 ++-- include/linux/bpf.h | 1 + kernel/bpf/local_storage.c | 23 ++++++++++++++++++----- kernel/bpf/syscall.c | 2 +- 4 files changed, 22 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index d1eb5c7729cb..2f535331f926 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -172,7 +172,7 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map); -int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value); +int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, void *value, u64 flags); @@ -470,7 +470,7 @@ static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( static inline void bpf_cgroup_storage_free( struct bpf_cgroup_storage *storage) {} static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, - void *value) { + void *value, u64 flags) { return 0; } static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f5c259dcc425..5936f8e2996f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3921,6 +3921,7 @@ static inline bool bpf_map_supports_cpu_flags(enum bpf_map_type map_type) case BPF_MAP_TYPE_PERCPU_ARRAY: case BPF_MAP_TYPE_PERCPU_HASH: case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: return true; default: return false; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 2ab4b60ffe61..1ccbf28b2ad9 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -180,7 +180,7 @@ static long cgroup_storage_update_elem(struct bpf_map *map, void *key, } int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key, - void *value) + void *value, u64 map_flags) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); struct bpf_cgroup_storage *storage; @@ -198,11 +198,17 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key, * access 'value_size' of them, so copying rounded areas * will not leak any kernel data */ + if (map_flags & BPF_F_CPU) { + cpu = map_flags >> 32; + copy_map_value(_map, value, per_cpu_ptr(storage->percpu_buf, cpu)); + goto unlock; + } size = round_up(_map->value_size, 8); for_each_possible_cpu(cpu) { copy_map_value_long(_map, value + off, per_cpu_ptr(storage->percpu_buf, cpu)); off += size; } +unlock: rcu_read_unlock(); return 0; } @@ -212,10 +218,11 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key, { struct bpf_cgroup_storage_map *map = map_to_storage(_map); struct bpf_cgroup_storage *storage; - int cpu, off = 0; + void *val; u32 size; + int cpu; - if (map_flags != BPF_ANY && map_flags != BPF_EXIST) + if ((u32)map_flags & ~(BPF_ANY | BPF_EXIST | BPF_F_CPU | BPF_F_ALL_CPUS)) return -EINVAL; rcu_read_lock(); @@ -231,11 +238,17 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key, * returned or zeros which were zero-filled by percpu_alloc, * so no kernel data leaks possible */ + if (map_flags & BPF_F_CPU) { + cpu = map_flags >> 32; + copy_map_value(_map, per_cpu_ptr(storage->percpu_buf, cpu), 
value); + goto unlock; + } size = round_up(_map->value_size, 8); for_each_possible_cpu(cpu) { - copy_map_value_long(_map, per_cpu_ptr(storage->percpu_buf, cpu), value + off); - off += size; + val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu; + copy_map_value(_map, per_cpu_ptr(storage->percpu_buf, cpu), val); } +unlock: rcu_read_unlock(); return 0; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5e3b5d828856..ecc0929ce462 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -320,7 +320,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { - err = bpf_percpu_cgroup_storage_copy(map, key, value); + err = bpf_percpu_cgroup_storage_copy(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_extract(map, key, value, false); } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { -- cgit v1.2.3 From b40a5d724f29fc2eed23ff353808a9aae616b48a Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Sat, 10 Jan 2026 08:25:50 +0000 Subject: bpf: crypto: Use the correct destructor kfunc type With CONFIG_CFI enabled, the kernel strictly enforces that indirect function calls use a function pointer type that matches the target function. I ran into the following type mismatch when running BPF self-tests: CFI failure at bpf_obj_free_fields+0x190/0x238 (target: bpf_crypto_ctx_release+0x0/0x94; expected type: 0xa488ebfc) Internal error: Oops - CFI: 00000000f2008228 [#1] SMP ... As bpf_crypto_ctx_release() is also used in BPF programs and using a void pointer as the argument would make the verifier unhappy, add a simple stub function with the correct type and register it as the destructor kfunc instead. Signed-off-by: Sami Tolvanen Acked-by: Yonghong Song Tested-by: Viktor Malik Link: https://lore.kernel.org/r/20260110082548.113748-7-samitolvanen@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/crypto.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c index 1ab79a6dec84..7e75a1936256 100644 --- a/kernel/bpf/crypto.c +++ b/kernel/bpf/crypto.c @@ -261,6 +261,12 @@ __bpf_kfunc void bpf_crypto_ctx_release(struct bpf_crypto_ctx *ctx) call_rcu(&ctx->rcu, crypto_free_cb); } +__bpf_kfunc void bpf_crypto_ctx_release_dtor(void *ctx) +{ + bpf_crypto_ctx_release(ctx); +} +CFI_NOSEAL(bpf_crypto_ctx_release_dtor); + static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx, const struct bpf_dynptr_kern *src, const struct bpf_dynptr_kern *dst, @@ -368,7 +374,7 @@ static const struct btf_kfunc_id_set crypt_kfunc_set = { BTF_ID_LIST(bpf_crypto_dtor_ids) BTF_ID(struct, bpf_crypto_ctx) -BTF_ID(func, bpf_crypto_ctx_release) +BTF_ID(func, bpf_crypto_ctx_release_dtor) static int __init crypto_kfunc_init(void) { -- cgit v1.2.3 From 99fde4d0626176d03cea35c64a063df73816e64d Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Sat, 10 Jan 2026 08:25:53 +0000 Subject: bpf, btf: Enforce destructor kfunc type with CFI Ensure that registered destructor kfuncs have the same type as btf_dtor_kfunc_t to avoid a kernel panic on systems with CONFIG_CFI enabled. 
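For illustration only (the names below are hypothetical, mirroring the bpf_crypto_ctx fix above): a destructor registered via register_btf_id_dtor_kfuncs() must match btf_dtor_kfunc_t, i.e. take a void pointer, so a typed release kfunc needs a small stub:

	__bpf_kfunc void my_ctx_release_dtor(void *ctx)
	{
		my_ctx_release(ctx);	/* the real, typed release kfunc */
	}
	CFI_NOSEAL(my_ctx_release_dtor);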
Signed-off-by: Sami Tolvanen Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20260110082548.113748-10-samitolvanen@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 539c9fdea41d..2c6076fc29b9 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -8846,6 +8846,13 @@ static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc */ if (!t || !btf_type_is_ptr(t)) return -EINVAL; + + if (IS_ENABLED(CONFIG_CFI_CLANG)) { + /* Ensure the destructor kfunc type matches btf_dtor_kfunc_t */ + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_void(t)) + return -EINVAL; + } } return 0; } -- cgit v1.2.3 From 7af3339948601e188d93d7e03326aeb8fcbd6bea Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 13 Jan 2026 13:48:26 +0000 Subject: bpf: Consistently use reg_state() for register access in the verifier Replace the pattern of declaring a local regs array from cur_regs() and then indexing into it with the more concise reg_state() helper. This simplifies the code by eliminating intermediate variables and makes register access more consistent throughout the verifier. Signed-off-by: Mykyta Yatsenko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20260113134826.2214860-1-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 42 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 53635ea2e41b..02a43cafbb25 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5654,8 +5654,8 @@ static int check_stack_write(struct bpf_verifier_env *env, static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, int off, int size, enum bpf_access_type type) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_map *map = regs[regno].map_ptr; + struct bpf_reg_state *reg = reg_state(env, regno); + struct bpf_map *map = reg->map_ptr; u32 cap = bpf_map_flags_to_cap(map); if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) { @@ -6168,8 +6168,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); int err; /* We may have added a variable offset to the packet pointer; but any @@ -6256,8 +6255,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off, int size, enum bpf_access_type t) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_insn_access_aux info = {}; bool valid; @@ -7453,8 +7451,7 @@ static int check_stack_access_within_bounds( int regno, int off, int access_size, enum bpf_access_type type) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = regs + regno; + struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = func(env, reg); s64 min_off, max_off; int err; @@ -8408,7 +8405,7 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) { bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK; const char *lock_str = is_res_lock ? 
"bpf_res_spin" : "bpf_spin"; - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_verifier_state *cur = env->cur_state; bool is_const = tnum_is_const(reg->var_off); bool is_irq = flags & PROCESS_LOCK_IRQ; @@ -8524,7 +8521,7 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, enum btf_field_type field_type) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); bool is_const = tnum_is_const(reg->var_off); struct bpf_map *map = reg->map_ptr; u64 val = reg->var_off.value; @@ -8571,7 +8568,7 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, static int process_timer_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_map *map = reg->map_ptr; int err; @@ -8595,7 +8592,7 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, static int process_wq_func(struct bpf_verifier_env *env, int regno, struct bpf_kfunc_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_map *map = reg->map_ptr; int err; @@ -8616,7 +8613,7 @@ static int process_wq_func(struct bpf_verifier_env *env, int regno, static int process_task_work_func(struct bpf_verifier_env *env, int regno, struct bpf_kfunc_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_map *map = reg->map_ptr; int err; @@ -8636,7 +8633,7 @@ static int process_task_work_func(struct bpf_verifier_env *env, int regno, static int process_kptr_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); struct btf_field *kptr_field; struct bpf_map *map_ptr; struct btf_record *rec; @@ -8709,7 +8706,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx, enum bpf_arg_type arg_type, int clone_ref_obj_id) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); int err; if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) { @@ -8829,7 +8826,7 @@ static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg_idx, static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx, struct bpf_kfunc_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); const struct btf_type *t; int spi, err, i, nr_slots, btf_id; @@ -9301,7 +9298,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, const u32 *arg_btf_id, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); enum bpf_reg_type expected, type = reg->type; const struct bpf_reg_types *compatible; int i, j; @@ -9714,7 +9711,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, int insn_idx) { u32 regno = BPF_REG_1 + arg; - struct 
bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); enum bpf_arg_type arg_type = fn->arg_type[arg]; enum bpf_reg_type type = reg->type; u32 *arg_btf_id = NULL; @@ -11247,7 +11244,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, int func_id, int insn_idx) { struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; - struct bpf_reg_state *regs = cur_regs(env), *reg; + struct bpf_reg_state *reg; struct bpf_map *map = meta->map_ptr; u64 val, max; int err; @@ -11259,7 +11256,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return -EINVAL; } - reg = ®s[BPF_REG_3]; + reg = reg_state(env, BPF_REG_3); val = reg->var_off.value; max = map->max_entries; @@ -11405,8 +11402,7 @@ static struct bpf_insn_aux_data *cur_aux(const struct bpf_verifier_env *env) static bool loop_flag_is_zero(struct bpf_verifier_env *env) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = ®s[BPF_REG_4]; + struct bpf_reg_state *reg = reg_state(env, BPF_REG_4); bool reg_is_null = register_is_null(reg); if (reg_is_null) @@ -12668,7 +12664,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, static int process_irq_flag(struct bpf_verifier_env *env, int regno, struct bpf_kfunc_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); int err, kfunc_class = IRQ_NATIVE_KFUNC; bool irq_save; -- cgit v1.2.3 From bffacdb80b93b7b5e96b26fad64cc490a6c7d6c7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 12 Jan 2026 12:13:57 -0800 Subject: bpf: Recognize special arithmetic shift in the verifier cilium bpf_wiregard.bpf.c when compiled with -O1 fails to load with the following verifier log: 192: (79) r2 = *(u64 *)(r10 -304) ; R2=pkt(r=40) R10=fp0 fp-304=pkt(r=40) ... 227: (85) call bpf_skb_store_bytes#9 ; R0=scalar() 228: (bc) w2 = w0 ; R0=scalar() R2=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) 229: (c4) w2 s>>= 31 ; R2=scalar(smin=0,smax=umax=0xffffffff,smin32=-1,smax32=0,var_off=(0x0; 0xffffffff)) 230: (54) w2 &= -134 ; R2=scalar(smin=0,smax=umax=umax32=0xffffff7a,smax32=0x7fffff7a,var_off=(0x0; 0xffffff7a)) ... 232: (66) if w2 s> 0xffffffff goto pc+125 ; R2=scalar(smin=umin=umin32=0x80000000,smax=umax=umax32=0xffffff7a,smax32=-134,var_off=(0x80000000; 0x7fffff7a)) ... 238: (79) r4 = *(u64 *)(r10 -304) ; R4=scalar() R10=fp0 fp-304=scalar() 239: (56) if w2 != 0xffffff78 goto pc+210 ; R2=0xffffff78 // -136 ... 258: (71) r1 = *(u8 *)(r4 +0) R4 invalid mem access 'scalar' The error might confuse most bpf authors, since fp-304 slot had 'pkt' pointer at insn 192 and became 'scalar' at 238. That happened because bpf_skb_store_bytes() clears all packet pointers including those in the stack. On the first glance it might look like a bug in the source code, since ctx->data pointer should have been reloaded after the call to bpf_skb_store_bytes(). The relevant part of cilium source code looks like this: // bpf/lib/nodeport.h int dsr_set_ipip6() { if (ctx_adjust_hroom(...)) return DROP_INVALID; // -134 if (ctx_store_bytes(...)) return DROP_WRITE_ERROR; // -141 return 0; } bool dsr_fail_needs_reply(int code) { if (code == DROP_FRAG_NEEDED) // -136 return true; return false; } tail_nodeport_ipv6_dsr() { ret = dsr_set_ipip6(...); if (!IS_ERR(ret)) { ... 
} else { if (dsr_fail_needs_reply(ret)) return dsr_reply_icmp6(...); } } The code doesn't have arithmetic shift by 31 and it reloads ctx->data every time it needs to access it. So it's not a bug in the source code. The reason is DAGCombiner::foldSelectCCToShiftAnd() LLVM transformation: // If this is a select where the false operand is zero and the compare is a // check of the sign bit, see if we can perform the "gzip trick": // select_cc setlt X, 0, A, 0 -> and (sra X, size(X)-1), A // select_cc setgt X, 0, A, 0 -> and (not (sra X, size(X)-1)), A The conditional branch in dsr_set_ipip6() and its return values are optimized into BPF_ARSH plus BPF_AND: 227: (85) call bpf_skb_store_bytes#9 228: (bc) w2 = w0 229: (c4) w2 s>>= 31 ; R2=scalar(smin=0,smax=umax=0xffffffff,smin32=-1,smax32=0,var_off=(0x0; 0xffffffff)) 230: (54) w2 &= -134 ; R2=scalar(smin=0,smax=umax=umax32=0xffffff7a,smax32=0x7fffff7a,var_off=(0x0; 0xffffff7a)) after insn 230 the register w2 can only be 0 or -134, but the verifier approximates it, since there is no way to represent two scalars in bpf_reg_state. After fallthough at insn 232 the w2 can only be -134, hence the branch at insn 239: (56) if w2 != -136 goto pc+210 should be always taken, and trapping insn 258 should never execute. LLVM generated correct code, but the verifier follows impossible path and rejects valid program. To fix this issue recognize this special LLVM optimization and fork the verifier state. So after insn 229: (c4) w2 s>>= 31 the verifier has two states to explore: one with w2 = 0 and another with w2 = 0xffffffff which makes the verifier accept bpf_wiregard.c A similar pattern exists were OR operation is used in place of the AND operation, the verifier detects that pattern as well by forking the state before the OR operation with a scalar in range [-1,0]. Note there are 20+ such patterns in bpf_wiregard.o compiled with -O1 and -O2, but they're rarely seen in other production bpf programs, so push_stack() approach is not a concern. Reported-by: Hao Sun Signed-off-by: Alexei Starovoitov Co-developed-by: Puranjay Mohan Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260112201424.816836-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 02a43cafbb25..9d3ad2876d8f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15491,6 +15491,35 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, } } +static int maybe_fork_scalars(struct bpf_verifier_env *env, struct bpf_insn *insn, + struct bpf_reg_state *dst_reg) +{ + struct bpf_verifier_state *branch; + struct bpf_reg_state *regs; + bool alu32; + + if (dst_reg->smin_value == -1 && dst_reg->smax_value == 0) + alu32 = false; + else if (dst_reg->s32_min_value == -1 && dst_reg->s32_max_value == 0) + alu32 = true; + else + return 0; + + branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); + if (IS_ERR(branch)) + return PTR_ERR(branch); + + regs = branch->frame[branch->curframe]->regs; + if (alu32) { + __mark_reg32_known(®s[insn->dst_reg], 0); + __mark_reg32_known(dst_reg, -1ull); + } else { + __mark_reg_known(®s[insn->dst_reg], 0); + __mark_reg_known(dst_reg, -1ull); + } + return 0; +} + /* WARNING: This function does calculations on 64-bit values, but the actual * execution may occur on 32-bit values. 
Therefore, things like bitshifts * need extra checks in the 32-bit case. @@ -15553,11 +15582,21 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, scalar_min_max_mul(dst_reg, &src_reg); break; case BPF_AND: + if (tnum_is_const(src_reg.var_off)) { + ret = maybe_fork_scalars(env, insn, dst_reg); + if (ret) + return ret; + } dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off); scalar32_min_max_and(dst_reg, &src_reg); scalar_min_max_and(dst_reg, &src_reg); break; case BPF_OR: + if (tnum_is_const(src_reg.var_off)) { + ret = maybe_fork_scalars(env, insn, dst_reg); + if (ret) + return ret; + } dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off); scalar32_min_max_or(dst_reg, &src_reg); scalar_min_max_or(dst_reg, &src_reg); -- cgit v1.2.3 From c9c9f6bf7fbcec4bc1cba845633e48035b883630 Mon Sep 17 00:00:00 2001 From: Song Chen Date: Mon, 5 Jan 2026 23:50:09 +0800 Subject: bpf: Remove an unused parameter in check_func_proto The func_id parameter is not needed in check_func_proto. This patch removes it. Signed-off-by: Song Chen Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260105155009.4581-1-chensong_2000@189.cn --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9d3ad2876d8f..092003cc7841 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10346,7 +10346,7 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn) return true; } -static int check_func_proto(const struct bpf_func_proto *fn, int func_id) +static int check_func_proto(const struct bpf_func_proto *fn) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && @@ -11521,7 +11521,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; - err = check_func_proto(fn, func_id); + err = check_func_proto(fn); if (err) { verifier_bug(env, "incorrect func proto %s#%d", func_id_name(func_id), func_id); return err; -- cgit v1.2.3 From 8c3070e159ba00424f0389ead694cacd85af260e Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Fri, 9 Jan 2026 20:59:58 +0800 Subject: btf: Optimize type lookup with binary search Improve btf_find_by_name_kind() performance by adding binary search support for sorted types. Falls back to linear search for compatibility. 
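The new lookup is in essence a lower-bound binary search over the name-sorted tail of the type table; a minimal standalone sketch of that idea (array type and names invented for the example):

#include <string.h>

struct named { const char *name; };

/* Return the index of the first entry whose name compares >= key, or n if none. */
static int lower_bound(const struct named *arr, int n, const char *key)
{
	int lo = 0, hi = n;

	while (lo < hi) {
		int mid = lo + (hi - lo) / 2;

		if (strcmp(arr[mid].name, key) < 0)
			lo = mid + 1;
		else
			hi = mid;
	}
	return lo;	/* caller still checks that arr[lo].name really equals key */
}

The kernel variant below additionally has to walk forward past same-name entries of the wrong BTF kind, which is why the bsearch helper returns the first candidate position rather than an exact match.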
Signed-off-by: Donglin Peng Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260109130003.3313716-7-dolinux.peng@gmail.com --- include/linux/btf.h | 1 + kernel/bpf/btf.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 82 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/btf.h b/include/linux/btf.h index 691f09784933..78dc79810c7d 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -219,6 +219,7 @@ bool btf_is_module(const struct btf *btf); bool btf_is_vmlinux(const struct btf *btf); struct module *btf_try_get_module(const struct btf *btf); u32 btf_nr_types(const struct btf *btf); +u32 btf_named_start_id(const struct btf *btf, bool own); struct btf *btf_base_btf(const struct btf *btf); bool btf_type_is_i32(const struct btf_type *t); bool btf_type_is_i64(const struct btf_type *t); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2c6076fc29b9..be8aa31e9e94 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -259,6 +259,7 @@ struct btf { void *nohdr_data; struct btf_header hdr; u32 nr_types; /* includes VOID for base BTF */ + u32 named_start_id; u32 types_size; u32 data_size; refcount_t refcnt; @@ -494,6 +495,11 @@ static bool btf_type_is_modifier(const struct btf_type *t) return false; } +static int btf_start_id(const struct btf *btf) +{ + return btf->start_id + (btf->base_btf ? 0 : 1); +} + bool btf_type_is_void(const struct btf_type *t) { return t == &btf_void; @@ -544,21 +550,84 @@ u32 btf_nr_types(const struct btf *btf) return total; } +/* + * btf_named_start_id - Get the named starting ID for the BTF + * @btf: Pointer to the target BTF object + * @own: Flag indicating whether to query only the current BTF (true = current BTF only, + * false = recursively traverse the base BTF chain) + * + * Return value rules: + * 1. For a sorted btf, return its named_start_id + * 2. Else for a split BTF, return its start_id + * 3. 
Else for a base BTF, return 1 + */ +u32 btf_named_start_id(const struct btf *btf, bool own) +{ + const struct btf *base_btf = btf; + + while (!own && base_btf->base_btf) + base_btf = base_btf->base_btf; + + return base_btf->named_start_id ?: (base_btf->start_id ?: 1); +} + +static s32 btf_find_by_name_kind_bsearch(const struct btf *btf, const char *name) +{ + const struct btf_type *t; + const char *tname; + s32 l, r, m; + + l = btf_named_start_id(btf, true); + r = btf_nr_types(btf) - 1; + while (l <= r) { + m = l + (r - l) / 2; + t = btf_type_by_id(btf, m); + tname = btf_name_by_offset(btf, t->name_off); + if (strcmp(tname, name) >= 0) { + if (l == r) + return r; + r = m; + } else { + l = m + 1; + } + } + + return btf_nr_types(btf); +} + s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) { + const struct btf *base_btf = btf_base_btf(btf); const struct btf_type *t; const char *tname; - u32 i, total; + s32 id, total; - total = btf_nr_types(btf); - for (i = 1; i < total; i++) { - t = btf_type_by_id(btf, i); - if (BTF_INFO_KIND(t->info) != kind) - continue; + if (base_btf) { + id = btf_find_by_name_kind(base_btf, name, kind); + if (id > 0) + return id; + } - tname = btf_name_by_offset(btf, t->name_off); - if (!strcmp(tname, name)) - return i; + total = btf_nr_types(btf); + if (btf->named_start_id > 0 && name[0]) { + id = btf_find_by_name_kind_bsearch(btf, name); + for (; id < total; id++) { + t = btf_type_by_id(btf, id); + tname = btf_name_by_offset(btf, t->name_off); + if (strcmp(tname, name) != 0) + return -ENOENT; + if (BTF_INFO_KIND(t->info) == kind) + return id; + } + } else { + for (id = btf_start_id(btf); id < total; id++) { + t = btf_type_by_id(btf, id); + if (BTF_INFO_KIND(t->info) != kind) + continue; + tname = btf_name_by_offset(btf, t->name_off); + if (strcmp(tname, name) == 0) + return id; + } } return -ENOENT; @@ -5791,6 +5860,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat goto errout; } env->btf = btf; + btf->named_start_id = 0; data = kvmalloc(attr->btf_size, GFP_KERNEL | __GFP_NOWARN); if (!data) { @@ -6210,6 +6280,7 @@ static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name btf->data = data; btf->data_size = data_size; btf->kernel_btf = true; + btf->named_start_id = 0; snprintf(btf->name, sizeof(btf->name), "%s", name); err = btf_parse_hdr(env); @@ -6327,6 +6398,7 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, btf->start_id = base_btf->nr_types; btf->start_str_off = base_btf->hdr.str_len; btf->kernel_btf = true; + btf->named_start_id = 0; snprintf(btf->name, sizeof(btf->name), "%s", module_name); btf->data = kvmemdup(data, data_size, GFP_KERNEL | __GFP_NOWARN); -- cgit v1.2.3 From 342bf525ba0d83374f318e19186d50b1e7160d0e Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Fri, 9 Jan 2026 20:59:59 +0800 Subject: btf: Verify BTF sorting This patch checks whether the BTF is sorted by name in ascending order. If sorted, binary search will be used when looking up types. Specifically, vmlinux and kernel module BTFs are always sorted during the build phase with anonymous types placed before named types, so we only need to identify the starting ID of named types. 
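The expected layout can be pictured with plain strings: an optional run of anonymous (empty-name) entries first, then named entries in ascending strcmp() order, and only the index where the named run begins needs to be remembered. A simplified standalone sketch of that verification (names and helper invented for the example):

#include <string.h>

/* Return the start index of the sorted named run, or -1 if the layout does
 * not match (an anonymous entry after a named one, or names out of order).
 */
static int find_named_start(const char * const *names, int n)
{
	int i, named_start = -1;

	for (i = 0; i < n; i++) {
		if (names[i][0] == '\0') {
			if (named_start != -1)
				return -1;
			continue;
		}
		if (named_start == -1)
			named_start = i;
		else if (strcmp(names[i - 1], names[i]) > 0)
			return -1;
	}
	return named_start;
}

The in-kernel check below differs slightly in that vmlinux BTF is trusted to be sorted already, so for it only the first named type ID is located.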
Signed-off-by: Donglin Peng Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260109130003.3313716-8-dolinux.peng@gmail.com --- kernel/bpf/btf.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index be8aa31e9e94..d5b3a37c8560 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -550,6 +550,47 @@ u32 btf_nr_types(const struct btf *btf) return total; } +/* + * Note that vmlinux and kernel module BTFs are always sorted + * during the building phase. + */ +static void btf_check_sorted(struct btf *btf) +{ + u32 i, n, named_start_id = 0; + + n = btf_nr_types(btf); + if (btf_is_vmlinux(btf)) { + for (i = btf_start_id(btf); i < n; i++) { + const struct btf_type *t = btf_type_by_id(btf, i); + const char *n = btf_name_by_offset(btf, t->name_off); + + if (n[0] != '\0') { + btf->named_start_id = i; + return; + } + } + return; + } + + for (i = btf_start_id(btf) + 1; i < n; i++) { + const struct btf_type *ta = btf_type_by_id(btf, i - 1); + const struct btf_type *tb = btf_type_by_id(btf, i); + const char *na = btf_name_by_offset(btf, ta->name_off); + const char *nb = btf_name_by_offset(btf, tb->name_off); + + if (strcmp(na, nb) > 0) + return; + + if (named_start_id == 0 && na[0] != '\0') + named_start_id = i - 1; + if (named_start_id == 0 && nb[0] != '\0') + named_start_id = i; + } + + if (named_start_id) + btf->named_start_id = named_start_id; +} + /* * btf_named_start_id - Get the named starting ID for the BTF * @btf: Pointer to the target BTF object @@ -6301,6 +6342,7 @@ static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name if (err) goto errout; + btf_check_sorted(btf); refcount_set(&btf->refcnt, 1); return btf; @@ -6435,6 +6477,7 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, } btf_verifier_env_free(env); + btf_check_sorted(btf); refcount_set(&btf->refcnt, 1); return btf; -- cgit v1.2.3 From dc893cfa390aa9d5fc83c908ea5e37a36e531892 Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Fri, 9 Jan 2026 21:00:00 +0800 Subject: bpf: Skip anonymous types in type lookup for performance Currently, vmlinux and kernel module BTFs are unconditionally sorted during the build phase, with named types placed at the end. Thus, anonymous types should be skipped when starting the search. In my vmlinux BTF, the number of anonymous types is 61,747, which means the loop count can be reduced by 61,747. Signed-off-by: Donglin Peng Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20260109130003.3313716-9-dolinux.peng@gmail.com --- kernel/bpf/btf.c | 10 ++++++---- kernel/bpf/verifier.c | 7 +------ 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d5b3a37c8560..364dd84bfc5a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3534,7 +3534,8 @@ const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type const struct btf_type *t; int len, id; - id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, 0); + id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, + btf_named_start_id(btf, false) - 1); if (id < 0) return ERR_PTR(id); @@ -7844,12 +7845,13 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) tname); return -EINVAL; } + /* Convert BTF function arguments into verifier types. * Only PTR_TO_CTX and SCALAR are supported atm. 
*/ for (i = 0; i < nargs; i++) { u32 tags = 0; - int id = 0; + int id = btf_named_start_id(btf, false) - 1; /* 'arg:' decl_tag takes precedence over derivation of * register type from BTF type itself @@ -9338,7 +9340,7 @@ bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id) } /* Attempt to find target candidates in vmlinux BTF first */ - cands = bpf_core_add_cands(cands, main_btf, 1); + cands = bpf_core_add_cands(cands, main_btf, btf_named_start_id(main_btf, true)); if (IS_ERR(cands)) return ERR_CAST(cands); @@ -9370,7 +9372,7 @@ check_modules: */ btf_get(mod_btf); spin_unlock_bh(&btf_idr_lock); - cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf)); + cands = bpf_core_add_cands(cands, mod_btf, btf_named_start_id(mod_btf, true)); btf_put(mod_btf); if (IS_ERR(cands)) return ERR_CAST(cands); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 092003cc7841..45733bae271d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20687,12 +20687,7 @@ static int find_btf_percpu_datasec(struct btf *btf) * types to look at only module's own BTF types. */ n = btf_nr_types(btf); - if (btf_is_module(btf)) - i = btf_nr_types(btf_vmlinux); - else - i = 1; - - for(; i < n; i++) { + for (i = btf_named_start_id(btf, true); i < n; i++) { t = btf_type_by_id(btf, i); if (BTF_INFO_KIND(t->info) != BTF_KIND_DATASEC) continue; -- cgit v1.2.3 From 434bcbc837a69baa2720b2ae5baba8b6e36898c0 Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Fri, 9 Jan 2026 21:00:01 +0800 Subject: bpf: Optimize the performance of find_bpffs_btf_enums Currently, vmlinux BTF is unconditionally sorted during the build phase. The function btf_find_by_name_kind executes the binary search branch, so find_bpffs_btf_enums can be optimized by using btf_find_by_name_kind. 
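For context, after this change each of the four enums is found with a direct by-name lookup instead of a full scan of all types; a minimal fragment showing the API shape being relied on (error handling trimmed, enum name picked arbitrarily):

	/* given: const struct btf *btf -- the (sorted) vmlinux BTF */
	const struct btf_type *t;
	s32 id;

	id = btf_find_by_name_kind(btf, "bpf_map_type", BTF_KIND_ENUM);
	if (id < 0)
		return -ESRCH;
	t = btf_type_by_id(btf, id);	/* btf_type of the named enum */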
Signed-off-by: Donglin Peng Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20260109130003.3313716-10-dolinux.peng@gmail.com --- kernel/bpf/inode.c | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 9f866a010dad..005ea3a2cda7 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -600,10 +600,17 @@ struct bpffs_btf_enums { static int find_bpffs_btf_enums(struct bpffs_btf_enums *info) { + struct { + const struct btf_type **type; + const char *name; + } btf_enums[] = { + {&info->cmd_t, "bpf_cmd"}, + {&info->map_t, "bpf_map_type"}, + {&info->prog_t, "bpf_prog_type"}, + {&info->attach_t, "bpf_attach_type"}, + }; const struct btf *btf; - const struct btf_type *t; - const char *name; - int i, n; + int i, id; memset(info, 0, sizeof(*info)); @@ -615,31 +622,16 @@ static int find_bpffs_btf_enums(struct bpffs_btf_enums *info) info->btf = btf; - for (i = 1, n = btf_nr_types(btf); i < n; i++) { - t = btf_type_by_id(btf, i); - if (!btf_type_is_enum(t)) - continue; - - name = btf_name_by_offset(btf, t->name_off); - if (!name) - continue; - - if (strcmp(name, "bpf_cmd") == 0) - info->cmd_t = t; - else if (strcmp(name, "bpf_map_type") == 0) - info->map_t = t; - else if (strcmp(name, "bpf_prog_type") == 0) - info->prog_t = t; - else if (strcmp(name, "bpf_attach_type") == 0) - info->attach_t = t; - else - continue; + for (i = 0; i < ARRAY_SIZE(btf_enums); i++) { + id = btf_find_by_name_kind(btf, btf_enums[i].name, + BTF_KIND_ENUM); + if (id < 0) + return -ESRCH; - if (info->cmd_t && info->map_t && info->prog_t && info->attach_t) - return 0; + *btf_enums[i].type = btf_type_by_id(btf, id); } - return -ESRCH; + return 0; } static bool find_btf_enum_const(const struct btf *btf, const struct btf_type *enum_t, -- cgit v1.2.3 From f8ade2342e22e7dbc71af496f07c900f8c69dd54 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 13 Jan 2026 08:39:47 +0000 Subject: bpf: return PTR_TO_BTF_ID | PTR_TRUSTED from BPF kfuncs by default Teach the BPF verifier to treat pointers to struct types returned from BPF kfuncs as implicitly trusted (PTR_TO_BTF_ID | PTR_TRUSTED) by default. Returning untrusted pointers to struct types from BPF kfuncs should be considered an exception only, and certainly not the norm. Update existing selftests to reflect the change in register type printing (e.g. `ptr_` becoming `trusted_ptr_` in verifier error messages). 
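From the program author's side the change is mostly visible in register type names; a hypothetical BPF-side sketch (both kfuncs and struct foo are invented for illustration, they are not real kernel APIs):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct foo;

/* Hypothetical kfuncs: a lookup returning a plain PTR_TO_BTF_ID and a
 * consumer registered with KF_TRUSTED_ARGS on the kernel side.
 */
struct foo *bpf_foo_lookup(u32 id) __ksym;
void bpf_foo_report(struct foo *f) __ksym;

SEC("tc")
int use_foo(struct __sk_buff *skb)
{
	struct foo *f = bpf_foo_lookup(1);

	if (!f)
		return 0;
	/* With this patch the verifier tracks f as trusted_ptr_foo rather
	 * than ptr_foo, so it can be used where trusted pointers are
	 * required (e.g. passed to a KF_TRUSTED_ARGS kfunc).
	 */
	bpf_foo_report(f);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";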
Link: https://lore.kernel.org/bpf/aV4nbCaMfIoM0awM@google.com/ Signed-off-by: Matt Bobrowski Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260113083949.2502978-1-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 46 ++++++++++++++-------- tools/testing/selftests/bpf/progs/map_kptr_fail.c | 4 +- .../struct_ops_kptr_return_fail__wrong_type.c | 2 +- .../selftests/bpf/progs/verifier_global_ptr_args.c | 2 +- tools/testing/selftests/bpf/verifier/calls.c | 2 +- 5 files changed, 34 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 45733bae271d..faa1ecc1fe9d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14212,26 +14212,38 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (is_kfunc_rcu_protected(&meta)) regs[BPF_REG_0].type |= MEM_RCU; } else { - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].btf = desc_btf; - regs[BPF_REG_0].type = PTR_TO_BTF_ID; - regs[BPF_REG_0].btf_id = ptr_type_id; + enum bpf_reg_type type = PTR_TO_BTF_ID; if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache]) - regs[BPF_REG_0].type |= PTR_UNTRUSTED; - else if (is_kfunc_rcu_protected(&meta)) - regs[BPF_REG_0].type |= MEM_RCU; - - if (is_iter_next_kfunc(&meta)) { - struct bpf_reg_state *cur_iter; - - cur_iter = get_iter_from_state(env->cur_state, &meta); - - if (cur_iter->type & MEM_RCU) /* KF_RCU_PROTECTED */ - regs[BPF_REG_0].type |= MEM_RCU; - else - regs[BPF_REG_0].type |= PTR_TRUSTED; + type |= PTR_UNTRUSTED; + else if (is_kfunc_rcu_protected(&meta) || + (is_iter_next_kfunc(&meta) && + (get_iter_from_state(env->cur_state, &meta) + ->type & MEM_RCU))) { + /* + * If the iterator's constructor (the _new + * function e.g., bpf_iter_task_new) has been + * annotated with BPF kfunc flag + * KF_RCU_PROTECTED and was called within a RCU + * read-side critical section, also propagate + * the MEM_RCU flag to the pointer returned from + * the iterator's next function (e.g., + * bpf_iter_task_next). + */ + type |= MEM_RCU; + } else { + /* + * Any PTR_TO_BTF_ID that is returned from a BPF + * kfunc should by default be treated as + * implicitly trusted. 
+ */ + type |= PTR_TRUSTED; } + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].type = type; + regs[BPF_REG_0].btf_id = ptr_type_id; } if (is_kfunc_ret_null(&meta)) { diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c index 4c0ff01f1a96..6443b320c732 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c +++ b/tools/testing/selftests/bpf/progs/map_kptr_fail.c @@ -272,7 +272,7 @@ int reject_untrusted_xchg(struct __sk_buff *ctx) SEC("?tc") __failure -__msg("invalid kptr access, R2 type=ptr_prog_test_ref_kfunc expected=ptr_prog_test_member") +__msg("invalid kptr access, R2 type=trusted_ptr_prog_test_ref_kfunc expected=ptr_prog_test_member") int reject_bad_type_xchg(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *ref_ptr; @@ -291,7 +291,7 @@ int reject_bad_type_xchg(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("invalid kptr access, R2 type=ptr_prog_test_ref_kfunc") +__failure __msg("invalid kptr access, R2 type=trusted_ptr_prog_test_ref_kfunc") int reject_member_of_ref_xchg(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *ref_ptr; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_kptr_return_fail__wrong_type.c b/tools/testing/selftests/bpf/progs/struct_ops_kptr_return_fail__wrong_type.c index 6a2dd5367802..c8d217e89eea 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_kptr_return_fail__wrong_type.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_kptr_return_fail__wrong_type.c @@ -12,7 +12,7 @@ void bpf_task_release(struct task_struct *p) __ksym; * reject programs returning a referenced kptr of the wrong type. */ SEC("struct_ops/test_return_ref_kptr") -__failure __msg("At program exit the register R0 is not a known value (ptr_or_null_)") +__failure __msg("At program exit the register R0 is not a known value (trusted_ptr_or_null_)") struct task_struct *BPF_PROG(kptr_return_fail__wrong_type, int dummy, struct task_struct *task, struct cgroup *cgrp) { diff --git a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c index 1204fbc58178..e7dae0cf9c17 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c @@ -72,7 +72,7 @@ int trusted_task_arg_nonnull_fail1(void *ctx) SEC("?tp_btf/task_newtask") __failure __log_level(2) -__msg("R1 type=ptr_or_null_ expected=ptr_, trusted_ptr_, rcu_ptr_") +__msg("R1 type=trusted_ptr_or_null_ expected=ptr_, trusted_ptr_, rcu_ptr_") __msg("Caller passes invalid args into func#1 ('subprog_trusted_task_nonnull')") int trusted_task_arg_nonnull_fail2(void *ctx) { diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index c8d640802cce..9ca83dce100d 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -220,7 +220,7 @@ }, .result_unpriv = REJECT, .result = REJECT, - .errstr = "variable ptr_ access var_off=(0x0; 0x7) disallowed", + .errstr = "variable trusted_ptr_ access var_off=(0x0; 0x7) disallowed", }, { "calls: invalid kfunc call: referenced arg needs refcounted PTR_TO_BTF_ID", -- cgit v1.2.3 From e3bd7bdf5ffe49d8381e42843f6e98cd0c78a1e8 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Sun, 11 Jan 2026 15:30:45 +0000 Subject: bpf: Return proper address for non-zero offsets in insn array The map_direct_value_addr() function of the instruction array map 
incorrectly adds offset to the resulting address. This is a bug, because later the resolve_pseudo_ldimm64() function adds the offset. Fix it. Corresponding selftests are added in a consequent commit. Fixes: 493d9e0d6083 ("bpf, x86: add support for indirect jumps") Signed-off-by: Anton Protopopov Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260111153047.8388-2-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_insn_array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c index c96630cb75bf..37b43102953e 100644 --- a/kernel/bpf/bpf_insn_array.c +++ b/kernel/bpf/bpf_insn_array.c @@ -126,7 +126,7 @@ static int insn_array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, return -EINVAL; /* from BPF's point of view, this map is a jump table */ - *imm = (unsigned long)insn_array->ips + off; + *imm = (unsigned long)insn_array->ips; return 0; } -- cgit v1.2.3 From 7e525860e7250355bcb01ae9779154512b5e5e88 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Sun, 11 Jan 2026 15:30:46 +0000 Subject: bpf: Return EACCES for incorrect access to insn array The insn_array_map_direct_value_addr() function currently returns -EINVAL when the offset within the map is invalid. Change this to return -EACCES, so that it is consistent with similar boundary access checks in the verifier. Signed-off-by: Anton Protopopov Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260111153047.8388-3-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_insn_array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c index 37b43102953e..c0286f25ca3c 100644 --- a/kernel/bpf/bpf_insn_array.c +++ b/kernel/bpf/bpf_insn_array.c @@ -123,7 +123,7 @@ static int insn_array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, if ((off % sizeof(long)) != 0 || (off / sizeof(long)) >= map->max_entries) - return -EINVAL; + return -EACCES; /* from BPF's point of view, this map is a jump table */ *imm = (unsigned long)insn_array->ips; -- cgit v1.2.3 From d1aab1ca576c90192ba961094d51b0be6355a4d6 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 14 Jan 2026 16:25:43 +0000 Subject: bpf: Properly mark live registers for indirect jumps For a `gotox rX` instruction the rX register should be marked as used in the compute_insn_live_regs() function. Fix this. Signed-off-by: Anton Protopopov Link: https://lore.kernel.org/r/20260114162544.83253-2-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 62ad7c79ce2d..7a375f608263 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -24848,6 +24848,12 @@ static void compute_insn_live_regs(struct bpf_verifier_env *env, case BPF_JMP32: switch (code) { case BPF_JA: + def = 0; + if (BPF_SRC(insn->code) == BPF_X) + use = dst; + else + use = 0; + break; case BPF_JCOND: def = 0; use = 0; -- cgit v1.2.3 From 276f3b6daf6024ae2742afd161e7418a5584a660 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 12 Jan 2026 13:11:56 +0100 Subject: arm64/ftrace,bpf: Fix partial regs after bpf_prog_run Mahe reported issue with bpf_override_return helper not working when executed from kprobe.multi bpf program on arm. 
The problem is that on arm we use alternate storage for pt_regs object that is passed to bpf_prog_run and if any register is changed (which is the case of bpf_override_return) it's not propagated back to actual pt_regs object. Fixing this by introducing and calling ftrace_partial_regs_update function to propagate the values of changed registers (ip and stack). Reported-by: Mahe Tardy Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Acked-by: Will Deacon Link: https://lore.kernel.org/bpf/20260112121157.854473-1-jolsa@kernel.org --- include/linux/ftrace_regs.h | 25 +++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 1 + 2 files changed, 26 insertions(+) (limited to 'kernel') diff --git a/include/linux/ftrace_regs.h b/include/linux/ftrace_regs.h index 15627ceea9bc..386fa48c4a95 100644 --- a/include/linux/ftrace_regs.h +++ b/include/linux/ftrace_regs.h @@ -33,6 +33,31 @@ struct ftrace_regs; #define ftrace_regs_get_frame_pointer(fregs) \ frame_pointer(&arch_ftrace_regs(fregs)->regs) +static __always_inline void +ftrace_partial_regs_update(struct ftrace_regs *fregs, struct pt_regs *regs) { } + +#else + +/* + * ftrace_partial_regs_update - update the original ftrace_regs from regs + * @fregs: The ftrace_regs to update from @regs + * @regs: The partial regs from ftrace_partial_regs() that was updated + * + * Some architectures have the partial regs living in the ftrace_regs + * structure, whereas other architectures need to make a different copy + * of the @regs. If a partial @regs is retrieved by ftrace_partial_regs() and + * if the code using @regs updates a field (like the instruction pointer or + * stack pointer) it may need to propagate that change to the original @fregs + * it retrieved the partial @regs from. Use this function to guarantee that + * update happens. + */ +static __always_inline void +ftrace_partial_regs_update(struct ftrace_regs *fregs, struct pt_regs *regs) +{ + ftrace_regs_set_instruction_pointer(fregs, instruction_pointer(regs)); + ftrace_regs_set_return_value(fregs, regs_return_value(regs)); +} + #endif /* HAVE_ARCH_FTRACE_REGS */ /* This can be overridden by the architectures */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6e076485bf70..3a17f79b20c2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2564,6 +2564,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx); err = bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); + ftrace_partial_regs_update(fregs, bpf_kprobe_multi_pt_regs_ptr()); rcu_read_unlock(); out: -- cgit v1.2.3 From af9e89d8dd39530c8bd14c33ddf6b502df1071b6 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 15 Jan 2026 07:11:40 -0800 Subject: bpf: Preserve id of register in sync_linked_regs() sync_linked_regs() copies the id of known_reg to reg when propagating bounds of known_reg to reg using the off of known_reg, but when known_reg was linked to reg like: known_reg = reg ; both known_reg and reg get same id known_reg += 4 ; known_reg gets off = 4, and its id gets BPF_ADD_CONST now when a call to sync_linked_regs() happens, let's say with the following: if known_reg >= 10 goto pc+2 known_reg's new bounds are propagated to reg but now reg gets BPF_ADD_CONST from the copy. This means if another link to reg is created like: another_reg = reg ; another_reg should get the id of reg but assign_scalar_id_before_mov() sees BPF_ADD_CONST on reg and assigns a new id to it. 
As reg has a new id now, known_reg's link to reg is broken. If we find new bounds for known_reg, they will not be propagated to reg. This can be seen in the selftest added in the next commit: 0: (85) call bpf_get_prandom_u32#7 ; R0=scalar() 1: (57) r0 &= 255 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) 2: (bf) r1 = r0 ; R0=scalar(id=1,smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) R1=scalar(id=1,smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) 3: (07) r1 += 4 ; R1=scalar(id=1+4,smin=umin=smin32=umin32=4,smax=umax=smax32=umax32=259,var_off=(0x0; 0x1ff)) 4: (a5) if r1 < 0xa goto pc+4 ; R1=scalar(id=1+4,smin=umin=smin32=umin32=10,smax=umax=smax32=umax32=259,var_off=(0x0; 0x1ff)) 5: (bf) r2 = r0 ; R0=scalar(id=2,smin=umin=smin32=umin32=6,smax=umax=smax32=umax32=255) R2=scalar(id=2,smin=umin=smin32=umin32=6,smax=umax=smax32=umax32=255) 6: (a5) if r1 < 0xe goto pc+2 ; R1=scalar(id=1+4,smin=umin=smin32=umin32=14,smax=umax=smax32=umax32=259,var_off=(0x0; 0x1ff)) 7: (35) if r0 >= 0xa goto pc+1 ; R0=scalar(id=2,smin=umin=smin32=umin32=6,smax=umax=smax32=umax32=9,var_off=(0x0; 0xf)) 8: (37) r0 /= 0 div by zero When 4 is verified, r1's bounds are propagated to r0 but r0 also gets BPF_ADD_CONST (bug). When 5 is verified, r0 gets a new id (2) and its link with r1 is broken. After 6 we know r1 has bounds [14, 259] and therefore r0 should have bounds [10, 255], therefore the branch at 7 is always taken. But because r0's id was changed to 2, r1's new bounds are not propagated to r0. The verifier still thinks r0 has bounds [6, 255] before 7 and execution can reach div by zero. Fix this by preserving id in sync_linked_regs() like off and subreg_def. Fixes: 98d7ca374ba4 ("bpf: Track delta between "linked" registers.") Signed-off-by: Puranjay Mohan Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260115151143.1344724-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7a375f608263..9de0ec0c3ed9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16871,6 +16871,7 @@ static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_s } else { s32 saved_subreg_def = reg->subreg_def; s32 saved_off = reg->off; + u32 saved_id = reg->id; fake_reg.type = SCALAR_VALUE; __mark_reg_known(&fake_reg, (s32)reg->off - (s32)known_reg->off); @@ -16878,10 +16879,11 @@ static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_s /* reg = known_reg; reg += delta */ copy_register_state(reg, known_reg); /* - * Must preserve off, id and add_const flag, + * Must preserve off, id and subreg_def flag, * otherwise another sync_linked_regs() will be incorrect. */ reg->off = saved_off; + reg->id = saved_id; reg->subreg_def = saved_subreg_def; scalar32_min_max_add(reg, &fake_reg); -- cgit v1.2.3 From 1700147697618a57ed0a2a97d9a477d131b8fc54 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Thu, 15 Jan 2026 18:45:09 +0000 Subject: bpf: Add __force annotations to silence sparse warnings Add __force annotations to casts that convert between __user and kernel address spaces. These casts are intentional: - In bpf_send_signal_common(), the value is stored in si_value.sival_ptr which is typed as void __user *, but the value comes from a BPF program parameter. 
- In the bpf_*_dynptr() kfuncs, user pointers are cast to const void * before being passed to copy helper functions that correctly handle the user address space through copy_from_user variants. Without __force, sparse reports: warning: cast removes address space '__user' of expression Reported-by: kernel test robot Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260115184509.3585759-1-mykyta.yatsenko5@gmail.com Closes: https://lore.kernel.org/oe-kbuild-all/202601131740.6C3BdBaB-lkp@intel.com/ --- kernel/trace/bpf_trace.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3a17f79b20c2..f73e08c223b5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -830,7 +830,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struc info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; - info.si_value.sival_ptr = (void *)(unsigned long)value; + info.si_value.sival_ptr = (void __user __force *)(unsigned long)value; siginfo = &info; } @@ -3518,7 +3518,7 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid __bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { - return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, + return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_data_nofault, NULL); } @@ -3532,7 +3532,7 @@ __bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off, __bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { - return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, + return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_str_nofault, NULL); } @@ -3546,14 +3546,14 @@ __bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 of __bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { - return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, + return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_data_sleepable, NULL); } __bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { - return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, + return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_str_sleepable, NULL); } @@ -3561,7 +3561,7 @@ __bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { - return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, + return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_data_sleepable, tsk); } @@ -3569,7 +3569,7 @@ __bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { - return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, + return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_str_sleepable, tsk); } -- cgit v1.2.3 From 4787eaf7c17131bf0cce93336f8f411f832ad05a Mon Sep 
17 00:00:00 2001 From: Tim Bird Date: Wed, 14 Jan 2026 18:31:29 -0700 Subject: bpf: Add SPDX license identifiers to a few files Add GPL-2.0 SPDX-License-Identifier lines to some files, and remove a reference to COPYING, and boilerplate warranty text, from offload.c. Signed-off-by: Tim Bird Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260115013129.598705-1-tim.bird@sony.com --- kernel/bpf/offload.c | 12 +----------- kernel/bpf/ringbuf.c | 1 + kernel/bpf/token.c | 1 + 3 files changed, 3 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 42ae8d595c2c..227f9b5f388b 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -1,16 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2017-2018 Netronome Systems, Inc. - * - * This software is licensed under the GNU General License Version 2, - * June 1991 as shown in the file COPYING in the top-level directory of this - * source tree. - * - * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" - * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, - * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE - * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME - * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. */ #include diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index f6a075ffac63..35ae64ade36b 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include #include #include diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index feecd8f4dbf9..7e4aa1e44b50 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include #include #include -- cgit v1.2.3 From ef7d4e42d16f74b123c86c9195ba5136046cee57 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 16 Jan 2026 06:14:35 -0800 Subject: bpf: verifier: Make sync_linked_regs() scratch registers sync_linked_regs() is called after a conditional jump to propagate new bounds of a register to all its liked registers. But the verifier log only prints the state of the register that is part of the conditional jump. Make sync_linked_regs() scratch the registers whose bounds have been updated by propagation from a known register. 
Before: 0: (85) call bpf_get_prandom_u32#7 ; R0=scalar() 1: (57) r0 &= 255 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) 2: (bf) r1 = r0 ; R0=scalar(id=1,smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) R1=scalar(id=1,smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) 3: (07) r1 += 4 ; R1=scalar(id=1+4,smin=umin=smin32=umin32=4,smax=umax=smax32=umax32=259,var_off=(0x0; 0x1ff)) 4: (a5) if r1 < 0xa goto pc+2 ; R1=scalar(id=1+4,smin=umin=smin32=umin32=10,smax=umax=smax32=umax32=259,var_off=(0x0; 0x1ff)) 5: (35) if r0 >= 0x6 goto pc+1 After: 0: (85) call bpf_get_prandom_u32#7 ; R0=scalar() 1: (57) r0 &= 255 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) 2: (bf) r1 = r0 ; R0=scalar(id=1,smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) R1=scalar(id=1,smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) 3: (07) r1 += 4 ; R1=scalar(id=1+4,smin=umin=smin32=umin32=4,smax=umax=smax32=umax32=259,var_off=(0x0; 0x1ff)) 4: (a5) if r1 < 0xa goto pc+2 ; R0=scalar(id=1+0,smin=umin=smin32=umin32=6,smax=umax=smax32=umax32=255) R1=scalar(id=1+4,smin=umin=smin32=umin32=10,smax=umax=smax32=umax32=259,var_off=(0x0; 0x1ff)) 5: (35) if r0 >= 0x6 goto pc+1 The conditional jump in 4 updates the bound of R1 and the new bounds are propogated to R0 as it is linked with the same id, before this change, verifier only printed the state for R1 but after it prints for both R0 and R1. Suggested-by: Andrii Nakryiko Signed-off-by: Puranjay Mohan Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20260116141436.3715322-1-puranjay@kernel.org --- kernel/bpf/verifier.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9de0ec0c3ed9..76e02e13890f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16846,8 +16846,8 @@ static void collect_linked_regs(struct bpf_verifier_state *vstate, u32 id, /* For all R in linked_regs, copy known_reg range into R * if R->id == known_reg->id. 
*/ -static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_state *known_reg, - struct linked_regs *linked_regs) +static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_state *vstate, + struct bpf_reg_state *known_reg, struct linked_regs *linked_regs) { struct bpf_reg_state fake_reg; struct bpf_reg_state *reg; @@ -16890,6 +16890,10 @@ static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_s scalar_min_max_add(reg, &fake_reg); reg->var_off = tnum_add(reg->var_off, fake_reg.var_off); } + if (e->is_reg) + mark_reg_scratched(env, e->regno); + else + mark_stack_slot_scratched(env, e->spi); } } @@ -17076,13 +17080,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id && !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) { - sync_linked_regs(this_branch, src_reg, &linked_regs); - sync_linked_regs(other_branch, &other_branch_regs[insn->src_reg], &linked_regs); + sync_linked_regs(env, this_branch, src_reg, &linked_regs); + sync_linked_regs(env, other_branch, &other_branch_regs[insn->src_reg], + &linked_regs); } if (dst_reg->type == SCALAR_VALUE && dst_reg->id && !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) { - sync_linked_regs(this_branch, dst_reg, &linked_regs); - sync_linked_regs(other_branch, &other_branch_regs[insn->dst_reg], &linked_regs); + sync_linked_regs(env, this_branch, dst_reg, &linked_regs); + sync_linked_regs(env, other_branch, &other_branch_regs[insn->dst_reg], + &linked_regs); } /* if one pointer register is compared to another pointer -- cgit v1.2.3 From 713edc71449f122491f8860be49b40f27d5f46b5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 20 Jan 2026 13:55:01 +0100 Subject: bpf: Remove leftover accounting in htab_map_mem_usage after rqspinlock After commit 4fa8d68aa53e ("bpf: Convert hashtab.c to rqspinlock") we no longer use HASHTAB_MAP_LOCK_{COUNT,MASK} as the per-CPU map_locked[HASHTAB_MAP_LOCK_COUNT] array got removed from struct bpf_htab. Right now it is still accounted for in htab_map_mem_usage. 
Signed-off-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/09703eb6bb249f12b1d5253b5a50a0c4fa239d27.1768913513.git.daniel@iogearbox.net --- kernel/bpf/hashtab.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 441ff5bc54ac..3b9d297a53be 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -82,9 +82,6 @@ struct bucket { rqspinlock_t raw_lock; }; -#define HASHTAB_MAP_LOCK_COUNT 8 -#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1) - struct bpf_htab { struct bpf_map map; struct bpf_mem_alloc ma; @@ -2237,11 +2234,11 @@ static u64 htab_map_mem_usage(const struct bpf_map *map) bool prealloc = htab_is_prealloc(htab); bool percpu = htab_is_percpu(htab); bool lru = htab_is_lru(htab); - u64 num_entries; - u64 usage = sizeof(struct bpf_htab); + u64 num_entries, usage; + + usage = sizeof(struct bpf_htab) + + sizeof(struct bucket) * htab->n_buckets; - usage += sizeof(struct bucket) * htab->n_buckets; - usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT; if (prealloc) { num_entries = map->max_entries; if (htab_has_extra_elems(htab)) -- cgit v1.2.3 From f81c07a6e98e3171d0c4c5ab79f5aeff71b42c44 Mon Sep 17 00:00:00 2001 From: Qiliang Yuan Date: Tue, 20 Jan 2026 10:32:34 +0800 Subject: bpf/verifier: Optimize ID mapping reset in states_equal Currently, reset_idmap_scratch() performs a 4.7KB memset() in every states_equal() call. Optimize this by using a counter to track used ID mappings, replacing the O(N) memset() with an O(1) reset and bounding the search loop in check_ids(). Signed-off-by: Qiliang Yuan Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20260120023234.77673-1-realwujing@gmail.com --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 23 ++++++++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 130bcbd66f60..8355b585cd18 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -692,6 +692,7 @@ struct bpf_id_pair { struct bpf_idmap { u32 tmp_id_gen; + u32 cnt; struct bpf_id_pair map[BPF_ID_MAP_SIZE]; }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 76e02e13890f..b67d8981b058 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19000,18 +19000,21 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) if (old_id == 0) /* cur_id == 0 as well */ return true; - for (i = 0; i < BPF_ID_MAP_SIZE; i++) { - if (!map[i].old) { - /* Reached an empty slot; haven't seen this id before */ - map[i].old = old_id; - map[i].cur = cur_id; - return true; - } + for (i = 0; i < idmap->cnt; i++) { if (map[i].old == old_id) return map[i].cur == cur_id; if (map[i].cur == cur_id) return false; } + + /* Reached the end of known mappings; haven't seen this id before */ + if (idmap->cnt < BPF_ID_MAP_SIZE) { + map[idmap->cnt].old = old_id; + map[idmap->cnt].cur = cur_id; + idmap->cnt++; + return true; + } + /* We ran out of idmap slots, which should be impossible */ WARN_ON_ONCE(1); return false; @@ -19520,8 +19523,10 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat static void reset_idmap_scratch(struct bpf_verifier_env *env) { - env->idmap_scratch.tmp_id_gen = env->id_gen; - memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map)); + struct bpf_idmap *idmap = 
&env->idmap_scratch; + + idmap->tmp_id_gen = env->id_gen; + idmap->cnt = 0; } static bool states_equal(struct bpf_verifier_env *env, -- cgit v1.2.3 From ea073d1818e228440275cc90047b4ef0fddd6eb5 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:26 -0800 Subject: bpf: Refactor btf_kfunc_id_set_contains btf_kfunc_id_set_contains() is called by fetch_kfunc_meta() in the BPF verifier to get the kfunc flags stored in the .BTF_ids ELF section. If it returns NULL instead of a valid pointer, it's interpreted as an illegal kfunc usage failing the verification. There are two potential reasons for btf_kfunc_id_set_contains() to return NULL: 1. Provided kfunc BTF id is not present in relevant kfunc id sets. 2. The kfunc is not allowed, as determined by the program type specific filter [1]. The filter functions accept a pointer to `struct bpf_prog`, so they might implicitly depend on earlier stages of verification, when bpf_prog members are set. For example, bpf_qdisc_kfunc_filter() in linux/net/sched/bpf_qdisc.c inspects prog->aux->st_ops [2], which is initialized in: check_attach_btf_id() -> check_struct_ops_btf_id() So far this hasn't been an issue, because fetch_kfunc_meta() is the only caller of btf_kfunc_id_set_contains(). However in subsequent patches of this series it is necessary to inspect kfunc flags earlier in BPF verifier, in the add_kfunc_call(). To resolve this, refactor btf_kfunc_id_set_contains() into two interface functions: * btf_kfunc_flags() that simply returns pointer to kfunc_flags without applying the filters * btf_kfunc_is_allowed() that both checks for kfunc_flags existence (which is a requirement for a kfunc to be allowed) and applies the prog filters See [3] for the previous version of this patch. [1] https://lore.kernel.org/all/20230519225157.760788-7-aditi.ghag@isovalent.com/ [2] https://lore.kernel.org/all/20250409214606.2000194-4-ameryhung@gmail.com/ [3] https://lore.kernel.org/bpf/20251029190113.3323406-3-ihor.solodrai@linux.dev/ Reviewed-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-2-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 4 +-- kernel/bpf/btf.c | 70 ++++++++++++++++++++++++++++++++++++++------------- kernel/bpf/verifier.c | 6 ++--- 3 files changed, 58 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/btf.h b/include/linux/btf.h index 78dc79810c7d..a2f4f383f5b6 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -575,8 +575,8 @@ const char *btf_name_by_offset(const struct btf *btf, u32 offset); const char *btf_str_by_offset(const struct btf *btf, u32 offset); struct btf *btf_parse_vmlinux(void); struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog); -u32 *btf_kfunc_id_set_contains(const struct btf *btf, u32 kfunc_btf_id, - const struct bpf_prog *prog); +u32 *btf_kfunc_flags(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog); +bool btf_kfunc_is_allowed(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog); u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog); int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 364dd84bfc5a..d10b3404260f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -8757,24 +8757,17 @@ end: return ret; } -static u32 *__btf_kfunc_id_set_contains(const struct btf *btf, - enum btf_kfunc_hook hook, - u32 kfunc_btf_id, - const 
struct bpf_prog *prog) +static u32 *btf_kfunc_id_set_contains(const struct btf *btf, + enum btf_kfunc_hook hook, + u32 kfunc_btf_id) { - struct btf_kfunc_hook_filter *hook_filter; struct btf_id_set8 *set; - u32 *id, i; + u32 *id; if (hook >= BTF_KFUNC_HOOK_MAX) return NULL; if (!btf->kfunc_set_tab) return NULL; - hook_filter = &btf->kfunc_set_tab->hook_filters[hook]; - for (i = 0; i < hook_filter->nr_filters; i++) { - if (hook_filter->filters[i](prog, kfunc_btf_id)) - return NULL; - } set = btf->kfunc_set_tab->sets[hook]; if (!set) return NULL; @@ -8785,6 +8778,28 @@ static u32 *__btf_kfunc_id_set_contains(const struct btf *btf, return id + 1; } +static bool __btf_kfunc_is_allowed(const struct btf *btf, + enum btf_kfunc_hook hook, + u32 kfunc_btf_id, + const struct bpf_prog *prog) +{ + struct btf_kfunc_hook_filter *hook_filter; + int i; + + if (hook >= BTF_KFUNC_HOOK_MAX) + return false; + if (!btf->kfunc_set_tab) + return false; + + hook_filter = &btf->kfunc_set_tab->hook_filters[hook]; + for (i = 0; i < hook_filter->nr_filters; i++) { + if (hook_filter->filters[i](prog, kfunc_btf_id)) + return false; + } + + return true; +} + static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) { switch (prog_type) { @@ -8832,6 +8847,26 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) } } +bool btf_kfunc_is_allowed(const struct btf *btf, + u32 kfunc_btf_id, + const struct bpf_prog *prog) +{ + enum bpf_prog_type prog_type = resolve_prog_type(prog); + enum btf_kfunc_hook hook; + u32 *kfunc_flags; + + kfunc_flags = btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id); + if (kfunc_flags && __btf_kfunc_is_allowed(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog)) + return true; + + hook = bpf_prog_type_to_kfunc_hook(prog_type); + kfunc_flags = btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id); + if (kfunc_flags && __btf_kfunc_is_allowed(btf, hook, kfunc_btf_id, prog)) + return true; + + return false; +} + /* Caution: * Reference to the module (obtained using btf_try_get_module) corresponding to * the struct btf *MUST* be held when calling this function from verifier @@ -8839,26 +8874,27 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) * keeping the reference for the duration of the call provides the necessary * protection for looking up a well-formed btf->kfunc_set_tab. 
*/ -u32 *btf_kfunc_id_set_contains(const struct btf *btf, - u32 kfunc_btf_id, - const struct bpf_prog *prog) +u32 *btf_kfunc_flags(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog) { enum bpf_prog_type prog_type = resolve_prog_type(prog); enum btf_kfunc_hook hook; u32 *kfunc_flags; - kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog); + kfunc_flags = btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id); if (kfunc_flags) return kfunc_flags; hook = bpf_prog_type_to_kfunc_hook(prog_type); - return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id, prog); + return btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id); } u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog) { - return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id, prog); + if (!__btf_kfunc_is_allowed(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id, prog)) + return NULL; + + return btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id); } static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b67d8981b058..c344371feb33 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13723,10 +13723,10 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env, *kfunc_name = func_name; func_proto = btf_type_by_id(desc_btf, func->type); - kfunc_flags = btf_kfunc_id_set_contains(desc_btf, func_id, env->prog); - if (!kfunc_flags) { + if (!btf_kfunc_is_allowed(desc_btf, func_id, env->prog)) return -EACCES; - } + + kfunc_flags = btf_kfunc_flags(desc_btf, func_id, env->prog); memset(meta, 0, sizeof(*meta)); meta->btf = desc_btf; -- cgit v1.2.3 From 08ca87d6324350a7abf5f05db5b63df9420dd29d Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:27 -0800 Subject: bpf: Introduce struct bpf_kfunc_meta There is code duplication between add_kfunc_call() and fetch_kfunc_meta() collecting information about a kfunc from BTF. Introduce struct bpf_kfunc_meta to hold common kfunc BTF data and implement fetch_kfunc_meta() to fill it in, instead of struct bpf_kfunc_call_arg_meta directly. Then use these in add_kfunc_call() and (new) fetch_kfunc_arg_meta() functions, and fixup previous usages of fetch_kfunc_meta() to fetch_kfunc_arg_meta(). Besides the code dedup, this change enables add_kfunc_call() to access kfunc->flags. 
Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-3-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 156 +++++++++++++++++++++++++++++--------------------- 1 file changed, 91 insertions(+), 65 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c344371feb33..87f0e113b356 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -294,6 +294,14 @@ struct bpf_call_arg_meta { s64 const_map_key; }; +struct bpf_kfunc_meta { + struct btf *btf; + const struct btf_type *proto; + const char *name; + const u32 *flags; + s32 id; +}; + struct bpf_kfunc_call_arg_meta { /* In parameters */ struct btf *btf; @@ -3263,16 +3271,68 @@ static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset) return btf_vmlinux ?: ERR_PTR(-ENOENT); } -static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) +static int fetch_kfunc_meta(struct bpf_verifier_env *env, + s32 func_id, + s16 offset, + struct bpf_kfunc_meta *kfunc) { const struct btf_type *func, *func_proto; + const char *func_name; + u32 *kfunc_flags; + struct btf *btf; + + if (func_id <= 0) { + verbose(env, "invalid kernel function btf_id %d\n", func_id); + return -EINVAL; + } + + btf = find_kfunc_desc_btf(env, offset); + if (IS_ERR(btf)) { + verbose(env, "failed to find BTF for kernel function\n"); + return PTR_ERR(btf); + } + + /* + * Note that kfunc_flags may be NULL at this point, which + * means that we couldn't find func_id in any relevant + * kfunc_id_set. This most likely indicates an invalid kfunc + * call. However we don't fail with an error here, + * and let the caller decide what to do with NULL kfunc->flags. + */ + kfunc_flags = btf_kfunc_flags(btf, func_id, env->prog); + + func = btf_type_by_id(btf, func_id); + if (!func || !btf_type_is_func(func)) { + verbose(env, "kernel btf_id %d is not a function\n", func_id); + return -EINVAL; + } + + func_name = btf_name_by_offset(btf, func->name_off); + func_proto = btf_type_by_id(btf, func->type); + if (!func_proto || !btf_type_is_func_proto(func_proto)) { + verbose(env, "kernel function btf_id %d does not have a valid func_proto\n", + func_id); + return -EINVAL; + } + + memset(kfunc, 0, sizeof(*kfunc)); + kfunc->btf = btf; + kfunc->id = func_id; + kfunc->name = func_name; + kfunc->proto = func_proto; + kfunc->flags = kfunc_flags; + + return 0; +} + +static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) +{ struct bpf_kfunc_btf_tab *btf_tab; struct btf_func_model func_model; struct bpf_kfunc_desc_tab *tab; struct bpf_prog_aux *prog_aux; + struct bpf_kfunc_meta kfunc; struct bpf_kfunc_desc *desc; - const char *func_name; - struct btf *desc_btf; unsigned long addr; int err; @@ -3322,12 +3382,6 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) prog_aux->kfunc_btf_tab = btf_tab; } - desc_btf = find_kfunc_desc_btf(env, offset); - if (IS_ERR(desc_btf)) { - verbose(env, "failed to find BTF for kernel function\n"); - return PTR_ERR(desc_btf); - } - if (find_kfunc_desc(env->prog, func_id, offset)) return 0; @@ -3336,24 +3390,13 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return -E2BIG; } - func = btf_type_by_id(desc_btf, func_id); - if (!func || !btf_type_is_func(func)) { - verbose(env, "kernel btf_id %u is not a function\n", - func_id); - return -EINVAL; - } - func_proto = btf_type_by_id(desc_btf, func->type); - if 
(!func_proto || !btf_type_is_func_proto(func_proto)) { - verbose(env, "kernel function btf_id %u does not have a valid func_proto\n", - func_id); - return -EINVAL; - } + err = fetch_kfunc_meta(env, func_id, offset, &kfunc); + if (err) + return err; - func_name = btf_name_by_offset(desc_btf, func->name_off); - addr = kallsyms_lookup_name(func_name); + addr = kallsyms_lookup_name(kfunc.name); if (!addr) { - verbose(env, "cannot find address for kernel function %s\n", - func_name); + verbose(env, "cannot find address for kernel function %s\n", kfunc.name); return -EINVAL; } @@ -3363,9 +3406,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return err; } - err = btf_distill_func_proto(&env->log, desc_btf, - func_proto, func_name, - &func_model); + err = btf_distill_func_proto(&env->log, kfunc.btf, kfunc.proto, kfunc.name, &func_model); if (err) return err; @@ -13696,44 +13737,28 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return 0; } -static int fetch_kfunc_meta(struct bpf_verifier_env *env, - struct bpf_insn *insn, - struct bpf_kfunc_call_arg_meta *meta, - const char **kfunc_name) +static int fetch_kfunc_arg_meta(struct bpf_verifier_env *env, + s32 func_id, + s16 offset, + struct bpf_kfunc_call_arg_meta *meta) { - const struct btf_type *func, *func_proto; - u32 func_id, *kfunc_flags; - const char *func_name; - struct btf *desc_btf; - - if (kfunc_name) - *kfunc_name = NULL; - - if (!insn->imm) - return -EINVAL; + struct bpf_kfunc_meta kfunc; + int err; - desc_btf = find_kfunc_desc_btf(env, insn->off); - if (IS_ERR(desc_btf)) - return PTR_ERR(desc_btf); + err = fetch_kfunc_meta(env, func_id, offset, &kfunc); + if (err) + return err; - func_id = insn->imm; - func = btf_type_by_id(desc_btf, func_id); - func_name = btf_name_by_offset(desc_btf, func->name_off); - if (kfunc_name) - *kfunc_name = func_name; - func_proto = btf_type_by_id(desc_btf, func->type); + memset(meta, 0, sizeof(*meta)); + meta->btf = kfunc.btf; + meta->func_id = kfunc.id; + meta->func_proto = kfunc.proto; + meta->func_name = kfunc.name; - if (!btf_kfunc_is_allowed(desc_btf, func_id, env->prog)) + if (!kfunc.flags || !btf_kfunc_is_allowed(kfunc.btf, kfunc.id, env->prog)) return -EACCES; - kfunc_flags = btf_kfunc_flags(desc_btf, func_id, env->prog); - - memset(meta, 0, sizeof(*meta)); - meta->btf = desc_btf; - meta->func_id = func_id; - meta->kfunc_flags = *kfunc_flags; - meta->func_proto = func_proto; - meta->func_name = func_name; + meta->kfunc_flags = *kfunc.flags; return 0; } @@ -13938,12 +13963,13 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (!insn->imm) return 0; - err = fetch_kfunc_meta(env, insn, &meta, &func_name); - if (err == -EACCES && func_name) - verbose(env, "calling kernel function %s is not allowed\n", func_name); + err = fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta); + if (err == -EACCES && meta.func_name) + verbose(env, "calling kernel function %s is not allowed\n", meta.func_name); if (err) return err; desc_btf = meta.btf; + func_name = meta.func_name; insn_aux = &env->insn_aux_data[insn_idx]; insn_aux->is_iter_next = is_iter_next_kfunc(&meta); @@ -17789,7 +17815,7 @@ static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call if (bpf_pseudo_kfunc_call(call)) { int err; - err = fetch_kfunc_meta(env, call, &meta, NULL); + err = fetch_kfunc_arg_meta(env, call->imm, call->off, &meta); if (err < 0) /* error would be reported later */ return false; @@ -18297,7 +18323,7 
@@ static int visit_insn(int t, struct bpf_verifier_env *env) } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { struct bpf_kfunc_call_arg_meta meta; - ret = fetch_kfunc_meta(env, insn, &meta, NULL); + ret = fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta); if (ret == 0 && is_iter_next_kfunc(&meta)) { mark_prune_point(env, t); /* Checking and saving state checkpoints at iter_next() call -- cgit v1.2.3 From 64e1360524b9ef5835714669b5876e122a23e6fc Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:28 -0800 Subject: bpf: Verifier support for KF_IMPLICIT_ARGS A kernel function bpf_foo marked with the KF_IMPLICIT_ARGS flag is expected to have two associated types in BTF: * `bpf_foo` with a function prototype that omits implicit arguments * `bpf_foo_impl` with a function prototype that matches the kernel declaration of `bpf_foo`, but doesn't have a ksym associated with its name In order to support kfuncs with implicit arguments, the verifier has to know how to resolve a call of `bpf_foo` to the correct BTF function prototype and address. To implement this, in add_kfunc_call() kfunc flags are checked for KF_IMPLICIT_ARGS. For such kfuncs the BTF func prototype is adjusted to the one found for the `bpf_foo_impl` function (func_name + "_impl" suffix, by convention) in BTF. This effectively changes the signature of the `bpf_foo` kfunc in the context of verification: from one without implicit args to one with the full argument list. The values of implicit arguments by design are provided by the verifier, and so they can only be of particular types. In this patch the only allowed implicit arg type is a pointer to struct bpf_prog_aux. In order for the verifier to correctly set an implicit bpf_prog_aux arg value at runtime, is_kfunc_arg_prog() is extended to check for the arg type. At the point when the prog arg is determined in check_kfunc_args(), the kfunc with implicit args already has a prototype with the full argument list, so the existing value-patching mechanism just works. If a new kfunc with KF_IMPLICIT_ARGS is declared for an existing kfunc that uses a __prog argument (a legacy case), the prototype substitution works in exactly the same way, assuming the kfunc follows the _impl naming convention. The difference is only in how the _impl prototype is added to the BTF, which is not the verifier's concern. See a subsequent resolve_btfids patch for details. The __prog suffix is still supported at this point, but will be removed in a subsequent patch, after current users are moved to KF_IMPLICIT_ARGS. Introduction of KF_IMPLICIT_ARGS revealed an issue with zero-extension tracking, because an explicit rX = 0 in place of the verifier-supplied argument is now absent if the arg is implicit (the BPF prog doesn't pass a dummy NULL anymore). To mitigate this, reset the subreg_def of all caller-saved registers in check_kfunc_call() [1].
[1] https://lore.kernel.org/bpf/b4a760ef828d40dac7ea6074d39452bb0dc82caa.camel@gmail.com/ Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-4-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 + kernel/bpf/verifier.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 55 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/btf.h b/include/linux/btf.h index a2f4f383f5b6..48108471c5b1 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -78,6 +78,7 @@ #define KF_ARENA_RET (1 << 13) /* kfunc returns an arena pointer */ #define KF_ARENA_ARG1 (1 << 14) /* kfunc takes an arena pointer as its first argument */ #define KF_ARENA_ARG2 (1 << 15) /* kfunc takes an arena pointer as its second argument */ +#define KF_IMPLICIT_ARGS (1 << 16) /* kfunc has implicit arguments supplied by the verifier */ /* * Tag marking a kernel function as a kfunc. This is meant to minimize the diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 87f0e113b356..adc24a2ce5b6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3271,6 +3271,34 @@ static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset) return btf_vmlinux ?: ERR_PTR(-ENOENT); } +#define KF_IMPL_SUFFIX "_impl" + +static const struct btf_type *find_kfunc_impl_proto(struct bpf_verifier_env *env, + struct btf *btf, + const char *func_name) +{ + char *buf = env->tmp_str_buf; + const struct btf_type *func; + s32 impl_id; + int len; + + len = snprintf(buf, TMP_STR_BUF_LEN, "%s%s", func_name, KF_IMPL_SUFFIX); + if (len < 0 || len >= TMP_STR_BUF_LEN) { + verbose(env, "function name %s%s is too long\n", func_name, KF_IMPL_SUFFIX); + return NULL; + } + + impl_id = btf_find_by_name_kind(btf, buf, BTF_KIND_FUNC); + if (impl_id <= 0) { + verbose(env, "cannot find function %s in BTF\n", buf); + return NULL; + } + + func = btf_type_by_id(btf, impl_id); + + return btf_type_by_id(btf, func->type); +} + static int fetch_kfunc_meta(struct bpf_verifier_env *env, s32 func_id, s16 offset, @@ -3308,7 +3336,16 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env, } func_name = btf_name_by_offset(btf, func->name_off); - func_proto = btf_type_by_id(btf, func->type); + + /* + * An actual prototype of a kfunc with KF_IMPLICIT_ARGS flag + * can be found through the counterpart _impl kfunc. 
+ */ + if (kfunc_flags && (*kfunc_flags & KF_IMPLICIT_ARGS)) + func_proto = find_kfunc_impl_proto(env, btf, func_name); + else + func_proto = btf_type_by_id(btf, func->type); + if (!func_proto || !btf_type_is_func_proto(func_proto)) { verbose(env, "kernel function btf_id %d does not have a valid func_proto\n", func_id); @@ -12174,9 +12211,11 @@ static bool is_kfunc_arg_irq_flag(const struct btf *btf, const struct btf_param return btf_param_match_suffix(btf, arg, "__irq_flag"); } +static bool is_kfunc_arg_prog_aux(const struct btf *btf, const struct btf_param *arg); + static bool is_kfunc_arg_prog(const struct btf *btf, const struct btf_param *arg) { - return btf_param_match_suffix(btf, arg, "__prog"); + return btf_param_match_suffix(btf, arg, "__prog") || is_kfunc_arg_prog_aux(btf, arg); } static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, @@ -12207,6 +12246,7 @@ enum { KF_ARG_WORKQUEUE_ID, KF_ARG_RES_SPIN_LOCK_ID, KF_ARG_TASK_WORK_ID, + KF_ARG_PROG_AUX_ID }; BTF_ID_LIST(kf_arg_btf_ids) @@ -12218,6 +12258,7 @@ BTF_ID(struct, bpf_rb_node) BTF_ID(struct, bpf_wq) BTF_ID(struct, bpf_res_spin_lock) BTF_ID(struct, bpf_task_work) +BTF_ID(struct, bpf_prog_aux) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -12298,6 +12339,11 @@ static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf return true; } +static bool is_kfunc_arg_prog_aux(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_PROG_AUX_ID); +} + /* Returns true if struct is composed of scalars, 4 levels of nesting allowed */ static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env, const struct btf *btf, @@ -14177,8 +14223,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } - for (i = 0; i < CALLER_SAVED_REGS; i++) - mark_reg_not_init(env, regs, caller_saved[i]); + for (i = 0; i < CALLER_SAVED_REGS; i++) { + u32 regno = caller_saved[i]; + + mark_reg_not_init(env, regs, regno); + regs[regno].subreg_def = DEF_NOT_SUBREG; + } /* Check return type */ t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL); -- cgit v1.2.3 From b97931a25a4bc74076ffb5c3d1a534c71ade4d55 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:32 -0800 Subject: bpf: Migrate bpf_wq_set_callback_impl() to KF_IMPLICIT_ARGS Implement bpf_wq_set_callback() with an implicit bpf_prog_aux argument, and remove bpf_wq_set_callback_impl(). Update special kfunc checks in the verifier accordingly. 
Reviewed-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-8-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 11 +++++------ kernel/bpf/verifier.c | 16 ++++++++-------- tools/testing/selftests/bpf/bpf_experimental.h | 5 ----- .../selftests/bpf/progs/verifier_async_cb_context.c | 4 ++-- tools/testing/selftests/bpf/progs/wq_failures.c | 4 ++-- 5 files changed, 17 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9eaa4185e0a7..c76a9003b221 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3120,12 +3120,11 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) return 0; } -__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, - int (callback_fn)(void *map, int *key, void *value), - unsigned int flags, - void *aux__prog) +__bpf_kfunc int bpf_wq_set_callback(struct bpf_wq *wq, + int (callback_fn)(void *map, int *key, void *value), + unsigned int flags, + struct bpf_prog_aux *aux) { - struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__prog; struct bpf_async_kern *async = (struct bpf_async_kern *)wq; if (flags) @@ -4488,7 +4487,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_memset) BTF_ID_FLAGS(func, bpf_modify_return_test_tp) #endif BTF_ID_FLAGS(func, bpf_wq_init) -BTF_ID_FLAGS(func, bpf_wq_set_callback_impl) +BTF_ID_FLAGS(func, bpf_wq_set_callback, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_wq_start) BTF_ID_FLAGS(func, bpf_preempt_disable) BTF_ID_FLAGS(func, bpf_preempt_enable) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index adc24a2ce5b6..51e8c9f70868 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -520,7 +520,7 @@ static bool is_async_callback_calling_kfunc(u32 btf_id); static bool is_callback_calling_kfunc(u32 btf_id); static bool is_bpf_throw_kfunc(struct bpf_insn *insn); -static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id); +static bool is_bpf_wq_set_callback_kfunc(u32 btf_id); static bool is_task_work_add_kfunc(u32 func_id); static bool is_sync_callback_calling_function(enum bpf_func_id func_id) @@ -562,7 +562,7 @@ static bool is_async_cb_sleepable(struct bpf_verifier_env *env, struct bpf_insn /* bpf_wq and bpf_task_work callbacks are always sleepable. 
*/ if (bpf_pseudo_kfunc_call(insn) && insn->off == 0 && - (is_bpf_wq_set_callback_impl_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm))) + (is_bpf_wq_set_callback_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm))) return true; verifier_bug(env, "unhandled async callback in is_async_cb_sleepable"); @@ -12437,7 +12437,7 @@ enum special_kfunc_type { KF_bpf_percpu_obj_new_impl, KF_bpf_percpu_obj_drop_impl, KF_bpf_throw, - KF_bpf_wq_set_callback_impl, + KF_bpf_wq_set_callback, KF_bpf_preempt_disable, KF_bpf_preempt_enable, KF_bpf_iter_css_task_new, @@ -12501,7 +12501,7 @@ BTF_ID(func, bpf_dynptr_clone) BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_ID(func, bpf_throw) -BTF_ID(func, bpf_wq_set_callback_impl) +BTF_ID(func, bpf_wq_set_callback) BTF_ID(func, bpf_preempt_disable) BTF_ID(func, bpf_preempt_enable) #ifdef CONFIG_CGROUPS @@ -12994,7 +12994,7 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id) static bool is_async_callback_calling_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl] || + return is_bpf_wq_set_callback_kfunc(btf_id) || is_task_work_add_kfunc(btf_id); } @@ -13004,9 +13004,9 @@ static bool is_bpf_throw_kfunc(struct bpf_insn *insn) insn->imm == special_kfunc_list[KF_bpf_throw]; } -static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id) +static bool is_bpf_wq_set_callback_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback]; } static bool is_callback_calling_kfunc(u32 btf_id) @@ -14085,7 +14085,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, meta.r0_rdonly = false; } - if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) { + if (is_bpf_wq_set_callback_kfunc(meta.func_id)) { err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_timer_callback_state); if (err) { diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 2cd9165c7348..68a49b1f77ae 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -580,11 +580,6 @@ extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym; extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym; -extern int bpf_wq_set_callback_impl(struct bpf_wq *wq, - int (callback_fn)(void *map, int *key, void *value), - unsigned int flags__k, void *aux__ign) __ksym; -#define bpf_wq_set_callback(timer, cb, flags) \ - bpf_wq_set_callback_impl(timer, cb, flags, NULL) struct bpf_iter_kmem_cache; extern int bpf_iter_kmem_cache_new(struct bpf_iter_kmem_cache *it) __weak __ksym; diff --git a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c index 7efa9521105e..5d5e1cd4d51d 100644 --- a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c +++ b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c @@ -96,7 +96,7 @@ int wq_non_sleepable_prog(void *ctx) if (bpf_wq_init(&val->w, &wq_map, 0) != 0) return 0; - if (bpf_wq_set_callback_impl(&val->w, wq_cb, 0, NULL) != 0) + if (bpf_wq_set_callback(&val->w, wq_cb, 0) != 0) return 0; return 0; } @@ -114,7 +114,7 @@ int wq_sleepable_prog(void *ctx) if (bpf_wq_init(&val->w, &wq_map, 0) != 0) return 0; - if (bpf_wq_set_callback_impl(&val->w, wq_cb, 0, NULL) != 0) + if 
(bpf_wq_set_callback(&val->w, wq_cb, 0) != 0) return 0; return 0; } diff --git a/tools/testing/selftests/bpf/progs/wq_failures.c b/tools/testing/selftests/bpf/progs/wq_failures.c index d06f6d40594a..3767f5595bbc 100644 --- a/tools/testing/selftests/bpf/progs/wq_failures.c +++ b/tools/testing/selftests/bpf/progs/wq_failures.c @@ -97,7 +97,7 @@ __failure /* check that the first argument of bpf_wq_set_callback() * is a correct bpf_wq pointer. */ -__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */ +__msg(": (85) call bpf_wq_set_callback#") /* anchor message */ __msg("arg#0 doesn't point to a map value") long test_wrong_wq_pointer(void *ctx) { @@ -123,7 +123,7 @@ __failure /* check that the first argument of bpf_wq_set_callback() * is a correct bpf_wq pointer. */ -__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */ +__msg(": (85) call bpf_wq_set_callback#") /* anchor message */ __msg("off 1 doesn't point to 'struct bpf_wq' that is at 0") long test_wrong_wq_pointer_offset(void *ctx) { -- cgit v1.2.3 From 6e663ffdf7600168338fdfa2fd1eed83395d58a3 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:34 -0800 Subject: bpf: Migrate bpf_task_work_schedule_* kfuncs to KF_IMPLICIT_ARGS Implement bpf_task_work_schedule_* with an implicit bpf_prog_aux argument, and remove corresponding _impl funcs from the kernel. Update special kfunc checks in the verifier accordingly. Update the selftests to use the new API with implicit argument. Reviewed-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-10-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 30 ++++++++++------------ kernel/bpf/verifier.c | 12 ++++----- tools/testing/selftests/bpf/progs/file_reader.c | 2 +- tools/testing/selftests/bpf/progs/task_work.c | 7 +++-- tools/testing/selftests/bpf/progs/task_work_fail.c | 8 +++--- .../testing/selftests/bpf/progs/task_work_stress.c | 4 +-- .../bpf/progs/verifier_async_cb_context.c | 4 +-- 7 files changed, 32 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index c76a9003b221..f2f974b5fb3b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4274,41 +4274,39 @@ release_prog: } /** - * bpf_task_work_schedule_signal_impl - Schedule BPF callback using task_work_add with TWA_SIGNAL + * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values * @callback: pointer to BPF subprogram to call - * @aux__prog: user should pass NULL + * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ -__bpf_kfunc int bpf_task_work_schedule_signal_impl(struct task_struct *task, - struct bpf_task_work *tw, void *map__map, - bpf_task_work_callback_t callback, - void *aux__prog) +__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + struct bpf_prog_aux *aux) { - return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL); + return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_SIGNAL); } /** - * 
bpf_task_work_schedule_resume_impl - Schedule BPF callback using task_work_add with TWA_RESUME + * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values * @callback: pointer to BPF subprogram to call - * @aux__prog: user should pass NULL + * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ -__bpf_kfunc int bpf_task_work_schedule_resume_impl(struct task_struct *task, - struct bpf_task_work *tw, void *map__map, - bpf_task_work_callback_t callback, - void *aux__prog) +__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + struct bpf_prog_aux *aux) { - return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME); + return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_RESUME); } static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep, @@ -4536,8 +4534,8 @@ BTF_ID_FLAGS(func, bpf_strncasestr); BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif BTF_ID_FLAGS(func, bpf_stream_vprintk_impl) -BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl) -BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl) +BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_dynptr_from_file) BTF_ID_FLAGS(func, bpf_dynptr_file_discard) BTF_KFUNCS_END(common_btf_ids) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 51e8c9f70868..8e8570e9d167 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12457,8 +12457,8 @@ enum special_kfunc_type { KF_bpf_dynptr_from_file, KF_bpf_dynptr_file_discard, KF___bpf_trap, - KF_bpf_task_work_schedule_signal_impl, - KF_bpf_task_work_schedule_resume_impl, + KF_bpf_task_work_schedule_signal, + KF_bpf_task_work_schedule_resume, KF_bpf_arena_alloc_pages, KF_bpf_arena_free_pages, KF_bpf_arena_reserve_pages, @@ -12534,16 +12534,16 @@ BTF_ID(func, bpf_res_spin_unlock_irqrestore) BTF_ID(func, bpf_dynptr_from_file) BTF_ID(func, bpf_dynptr_file_discard) BTF_ID(func, __bpf_trap) -BTF_ID(func, bpf_task_work_schedule_signal_impl) -BTF_ID(func, bpf_task_work_schedule_resume_impl) +BTF_ID(func, bpf_task_work_schedule_signal) +BTF_ID(func, bpf_task_work_schedule_resume) BTF_ID(func, bpf_arena_alloc_pages) BTF_ID(func, bpf_arena_free_pages) BTF_ID(func, bpf_arena_reserve_pages) static bool is_task_work_add_kfunc(u32 func_id) { - return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal_impl] || - func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume_impl]; + return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] || + func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume]; } static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) diff --git a/tools/testing/selftests/bpf/progs/file_reader.c b/tools/testing/selftests/bpf/progs/file_reader.c index 4d756b623557..462712ff3b8a 100644 --- a/tools/testing/selftests/bpf/progs/file_reader.c +++ b/tools/testing/selftests/bpf/progs/file_reader.c @@ -77,7 +77,7 @@ int on_open_validate_file_read(void *c) err = 1; return 0; } - 
bpf_task_work_schedule_signal_impl(task, &work->tw, &arrmap, task_work_callback, NULL); + bpf_task_work_schedule_signal(task, &work->tw, &arrmap, task_work_callback); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work.c b/tools/testing/selftests/bpf/progs/task_work.c index 663a80990f8f..a6009d105158 100644 --- a/tools/testing/selftests/bpf/progs/task_work.c +++ b/tools/testing/selftests/bpf/progs/task_work.c @@ -65,8 +65,7 @@ int oncpu_hash_map(struct pt_regs *args) work = bpf_map_lookup_elem(&hmap, &key); if (!work) return 0; - - bpf_task_work_schedule_resume_impl(task, &work->tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work); return 0; } @@ -80,7 +79,7 @@ int oncpu_array_map(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_signal_impl(task, &work->tw, &arrmap, process_work, NULL); + bpf_task_work_schedule_signal(task, &work->tw, &arrmap, process_work); return 0; } @@ -102,6 +101,6 @@ int oncpu_lru_map(struct pt_regs *args) work = bpf_map_lookup_elem(&lrumap, &key); if (!work || work->data[0]) return 0; - bpf_task_work_schedule_resume_impl(task, &work->tw, &lrumap, process_work, NULL); + bpf_task_work_schedule_resume(task, &work->tw, &lrumap, process_work); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work_fail.c b/tools/testing/selftests/bpf/progs/task_work_fail.c index 1270953fd092..82e4b8913333 100644 --- a/tools/testing/selftests/bpf/progs/task_work_fail.c +++ b/tools/testing/selftests/bpf/progs/task_work_fail.c @@ -53,7 +53,7 @@ int mismatch_map(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_resume_impl(task, &work->tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work); return 0; } @@ -65,7 +65,7 @@ int no_map_task_work(struct pt_regs *args) struct bpf_task_work tw; task = bpf_get_current_task_btf(); - bpf_task_work_schedule_resume_impl(task, &tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume(task, &tw, &hmap, process_work); return 0; } @@ -76,7 +76,7 @@ int task_work_null(struct pt_regs *args) struct task_struct *task; task = bpf_get_current_task_btf(); - bpf_task_work_schedule_resume_impl(task, NULL, &hmap, process_work, NULL); + bpf_task_work_schedule_resume(task, NULL, &hmap, process_work); return 0; } @@ -91,6 +91,6 @@ int map_null(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_resume_impl(task, &work->tw, NULL, process_work, NULL); + bpf_task_work_schedule_resume(task, &work->tw, NULL, process_work); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work_stress.c b/tools/testing/selftests/bpf/progs/task_work_stress.c index 55e555f7f41b..1d4378f351ef 100644 --- a/tools/testing/selftests/bpf/progs/task_work_stress.c +++ b/tools/testing/selftests/bpf/progs/task_work_stress.c @@ -51,8 +51,8 @@ int schedule_task_work(void *ctx) if (!work) return 0; } - err = bpf_task_work_schedule_signal_impl(bpf_get_current_task_btf(), &work->tw, &hmap, - process_work, NULL); + err = bpf_task_work_schedule_signal(bpf_get_current_task_btf(), &work->tw, &hmap, + process_work); if (err) __sync_fetch_and_add(&schedule_error, 1); else diff --git a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c index 5d5e1cd4d51d..39aff82549c9 100644 --- 
a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c +++ b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c @@ -156,7 +156,7 @@ int task_work_non_sleepable_prog(void *ctx) if (!task) return 0; - bpf_task_work_schedule_resume_impl(task, &val->tw, &task_work_map, task_work_cb, NULL); + bpf_task_work_schedule_resume(task, &val->tw, &task_work_map, task_work_cb); return 0; } @@ -176,6 +176,6 @@ int task_work_sleepable_prog(void *ctx) if (!task) return 0; - bpf_task_work_schedule_resume_impl(task, &val->tw, &task_work_map, task_work_cb, NULL); + bpf_task_work_schedule_resume(task, &val->tw, &task_work_map, task_work_cb); return 0; } -- cgit v1.2.3 From d806f3101276a1ed18d963944580e1ee1c7a3d26 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:35 -0800 Subject: bpf: Migrate bpf_stream_vprintk() to KF_IMPLICIT_ARGS Implement bpf_stream_vprintk with an implicit bpf_prog_aux argument, and remove bpf_stream_vprintk_impl from the kernel. Update the selftests to use the new API with the implicit argument. The bpf_stream_printk macro is changed to use the new bpf_stream_vprintk kfunc, and the extern declaration of bpf_stream_vprintk_impl is replaced accordingly. Reviewed-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-11-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 2 +- kernel/bpf/stream.c | 5 ++--- tools/lib/bpf/bpf_helpers.h | 6 +++--- tools/testing/selftests/bpf/progs/stream_fail.c | 6 +++--- 4 files changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f2f974b5fb3b..f8aa1320e2f7 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4533,7 +4533,7 @@ BTF_ID_FLAGS(func, bpf_strncasestr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif -BTF_ID_FLAGS(func, bpf_stream_vprintk_impl) +BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_dynptr_from_file) BTF_ID_FLAGS(func, bpf_dynptr_file_discard) BTF_KFUNCS_END(common_btf_ids) diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index 0b6bc3f30335..24730df55e69 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -212,14 +212,13 @@ __bpf_kfunc_start_defs(); * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the * enum in headers. */ -__bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args, - u32 len__sz, void *aux__prog) +__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, + u32 len__sz, struct bpf_prog_aux *aux) { struct bpf_bprintf_data data = { .get_bin_args = true, .get_buf = true, }; - struct bpf_prog_aux *aux = aux__prog; u32 fmt_size = strlen(fmt__str) + 1; struct bpf_stream *stream; u32 data_len = len__sz; diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index d4e4e388e625..c145da05a67c 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -315,8 +315,8 @@ enum libbpf_tristate { ___param, sizeof(___param)); \ }) -extern int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args, - __u32 len__sz, void *aux__prog) __weak __ksym; +extern int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, + __u32 len__sz) __weak __ksym; #define bpf_stream_printk(stream_id, fmt, args...)
\ ({ \ @@ -328,7 +328,7 @@ extern int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const vo ___bpf_fill(___param, args); \ _Pragma("GCC diagnostic pop") \ \ - bpf_stream_vprintk_impl(stream_id, ___fmt, ___param, sizeof(___param), NULL); \ + bpf_stream_vprintk(stream_id, ___fmt, ___param, sizeof(___param)); \ }) /* Use __bpf_printk when bpf_printk call has 3 or fewer fmt args diff --git a/tools/testing/selftests/bpf/progs/stream_fail.c b/tools/testing/selftests/bpf/progs/stream_fail.c index 3662515f0107..8e8249f3521c 100644 --- a/tools/testing/selftests/bpf/progs/stream_fail.c +++ b/tools/testing/selftests/bpf/progs/stream_fail.c @@ -10,7 +10,7 @@ SEC("syscall") __failure __msg("Possibly NULL pointer passed") int stream_vprintk_null_arg(void *ctx) { - bpf_stream_vprintk_impl(BPF_STDOUT, "", NULL, 0, NULL); + bpf_stream_vprintk(BPF_STDOUT, "", NULL, 0); return 0; } @@ -18,7 +18,7 @@ SEC("syscall") __failure __msg("R3 type=scalar expected=") int stream_vprintk_scalar_arg(void *ctx) { - bpf_stream_vprintk_impl(BPF_STDOUT, "", (void *)46, 0, NULL); + bpf_stream_vprintk(BPF_STDOUT, "", (void *)46, 0); return 0; } @@ -26,7 +26,7 @@ SEC("syscall") __failure __msg("arg#1 doesn't point to a const string") int stream_vprintk_string_arg(void *ctx) { - bpf_stream_vprintk_impl(BPF_STDOUT, ctx, NULL, 0, NULL); + bpf_stream_vprintk(BPF_STDOUT, ctx, NULL, 0); return 0; } -- cgit v1.2.3 From aed57a36387135bcb73f01bac3d0a286a657b006 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:37 -0800 Subject: bpf: Remove __prog kfunc arg annotation Now that all the __prog suffix users in the kernel tree migrated to KF_IMPLICIT_ARGS, remove it from the verifier. See prior discussion for context [1]. [1] https://lore.kernel.org/bpf/CAEf4BzbgPfRm9BX=TsZm-TsHFAHcwhPY4vTt=9OT-uhWqf8tqw@mail.gmail.com/ Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-13-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8e8570e9d167..919556614505 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12211,13 +12211,6 @@ static bool is_kfunc_arg_irq_flag(const struct btf *btf, const struct btf_param return btf_param_match_suffix(btf, arg, "__irq_flag"); } -static bool is_kfunc_arg_prog_aux(const struct btf *btf, const struct btf_param *arg); - -static bool is_kfunc_arg_prog(const struct btf *btf, const struct btf_param *arg) -{ - return btf_param_match_suffix(btf, arg, "__prog") || is_kfunc_arg_prog_aux(btf, arg); -} - static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, const struct btf_param *arg, const char *name) @@ -13280,8 +13273,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_arg_ignore(btf, &args[i])) continue; - if (is_kfunc_arg_prog(btf, &args[i])) { - /* Used to reject repeated use of __prog. 
*/ + if (is_kfunc_arg_prog_aux(btf, &args[i])) { + /* Reject repeated use bpf_prog_aux */ if (meta->arg_prog) { verifier_bug(env, "Only 1 prog->aux argument supported per-kfunc"); return -EFAULT; -- cgit v1.2.3 From 44fdd581d27366092e162b42f025d75d5a16c851 Mon Sep 17 00:00:00 2001 From: Yazhou Tang Date: Mon, 19 Jan 2026 16:54:57 +0800 Subject: bpf: Add range tracking for BPF_DIV and BPF_MOD This patch implements range tracking (interval analysis) for BPF_DIV and BPF_MOD operations when the divisor is a constant, covering both signed and unsigned variants. While LLVM typically optimizes integer division and modulo by constants into multiplication and shift sequences, this optimization is less effective for the BPF target when dealing with 64-bit arithmetic. Currently, the verifier does not track bounds for scalar division or modulo, treating the result as "unbounded". This leads to false positive rejections for safe code patterns. For example, the following code (compiled with -O2): ```c int test(struct pt_regs *ctx) { char buffer[6] = {1}; __u64 x = bpf_ktime_get_ns(); __u64 res = x % sizeof(buffer); char value = buffer[res]; bpf_printk("res = %llu, val = %d", res, value); return 0; } ``` Generates a raw `BPF_MOD64` instruction: ```asm ; __u64 res = x % sizeof(buffer); 1: 97 00 00 00 06 00 00 00 r0 %= 0x6 ; char value = buffer[res]; 2: 18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0x0 ll 4: 0f 01 00 00 00 00 00 00 r1 += r0 5: 91 14 00 00 00 00 00 00 r4 = *(s8 *)(r1 + 0x0) ``` Without this patch, the verifier fails with "math between map_value pointer and register with unbounded min value is not allowed" because it cannot deduce that `r0` is within [0, 5]. According to the BPF instruction set[1], the instruction's offset field (`insn->off`) is used to distinguish between signed (`off == 1`) and unsigned division (`off == 0`). Moreover, we also follow the BPF division and modulo runtime behavior (semantics) to handle special cases, such as division by zero and signed division overflow. - UDIV: dst = (src != 0) ? (dst / src) : 0 - SDIV: dst = (src == 0) ? 0 : ((src == -1 && dst == LLONG_MIN) ? LLONG_MIN : (dst / src)) - UMOD: dst = (src != 0) ? (dst % src) : dst - SMOD: dst = (src == 0) ? dst : ((src == -1 && dst == LLONG_MIN) ? 0: (dst s% src)) Here is the overview of the changes made in this patch (See the code comments for more details and examples): 1. For BPF_DIV: Firstly check whether the divisor is zero. If so, set the destination register to zero (matching runtime behavior). For non-zero constant divisors: goto `scalar(32)?_min_max_(u|s)div` functions. - General cases: compute the new range by dividing max_dividend and min_dividend by the constant divisor. - Overflow case (SIGNED_MIN / -1) in signed division: mark the result as unbounded if the dividend is not a single number. 2. For BPF_MOD: Firstly check whether the divisor is zero. If so, leave the destination register unchanged (matching runtime behavior). For non-zero constant divisors: goto `scalar(32)?_min_max_(u|s)mod` functions. - General case: For signed modulo, the result's sign matches the dividend's sign. And the result's absolute value is strictly bounded by `min(abs(dividend), abs(divisor) - 1)`. - Special care is taken when the divisor is SIGNED_MIN. By casting to unsigned before negation and subtracting 1, we avoid signed overflow and correctly calculate the maximum possible magnitude (`res_max_abs` in the code). 
- "Small dividend" case: If the dividend is already within the possible result range (e.g., [-2, 5] % 10), the operation is an identity function, and the destination register remains unchanged. 3. In `scalar(32)?_min_max_(u|s)(div|mod)` functions: After updating current range, reset other ranges and tnum to unbounded/unknown. e.g., in `scalar_min_max_sdiv`, signed 64-bit range is updated. Then reset unsigned 64-bit range and 32-bit range to unbounded, and tnum to unknown. Exception: in BPF_MOD's "small dividend" case, since the result remains unchanged, we do not reset other ranges/tnum. 4. Also updated existing selftests based on the expected BPF_DIV and BPF_MOD behavior. [1] https://www.kernel.org/doc/Documentation/bpf/standardization/instruction-set.rst Co-developed-by: Shenghao Yuan Signed-off-by: Shenghao Yuan Co-developed-by: Tianci Cao Signed-off-by: Tianci Cao Signed-off-by: Yazhou Tang Tested-by: syzbot@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20260119085458.182221-2-tangyazhou@zju.edu.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 299 +++++++++++++++++++++ .../bpf/progs/verifier_value_illegal_alu.c | 7 +- tools/testing/selftests/bpf/verifier/precise.c | 4 +- 3 files changed, 305 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 919556614505..f11dc5366e5b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2349,6 +2349,18 @@ static void __mark_reg32_unbounded(struct bpf_reg_state *reg) reg->u32_max_value = U32_MAX; } +static void reset_reg64_and_tnum(struct bpf_reg_state *reg) +{ + __mark_reg64_unbounded(reg); + reg->var_off = tnum_unknown; +} + +static void reset_reg32_and_tnum(struct bpf_reg_state *reg) +{ + __mark_reg32_unbounded(reg); + reg->var_off = tnum_unknown; +} + static void __update_reg32_bounds(struct bpf_reg_state *reg) { struct tnum var32_off = tnum_subreg(reg->var_off); @@ -15159,6 +15171,252 @@ static void scalar_min_max_mul(struct bpf_reg_state *dst_reg, } } +static void scalar32_min_max_udiv(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u32 *dst_umin = &dst_reg->u32_min_value; + u32 *dst_umax = &dst_reg->u32_max_value; + u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */ + + *dst_umin = *dst_umin / src_val; + *dst_umax = *dst_umax / src_val; + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + reset_reg64_and_tnum(dst_reg); +} + +static void scalar_min_max_udiv(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u64 *dst_umin = &dst_reg->umin_value; + u64 *dst_umax = &dst_reg->umax_value; + u64 src_val = src_reg->umin_value; /* non-zero, const divisor */ + + *dst_umin = div64_u64(*dst_umin, src_val); + *dst_umax = div64_u64(*dst_umax, src_val); + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + reset_reg32_and_tnum(dst_reg); +} + +static void scalar32_min_max_sdiv(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s32 *dst_smin = &dst_reg->s32_min_value; + s32 *dst_smax = &dst_reg->s32_max_value; + s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */ + s32 res1, res2; + + /* BPF div specification: S32_MIN / -1 = S32_MIN */ + if (*dst_smin == S32_MIN && src_val == -1) { + /* + * If the dividend range contains more than just S32_MIN, + * we cannot precisely track the result, so it becomes unbounded. 
+ * e.g., [S32_MIN, S32_MIN+10]/(-1), + * = {S32_MIN} U [-(S32_MIN+10), -(S32_MIN+1)] + * = {S32_MIN} U [S32_MAX-9, S32_MAX] = [S32_MIN, S32_MAX] + * Otherwise (if dividend is exactly S32_MIN), result remains S32_MIN. + */ + if (*dst_smax != S32_MIN) { + *dst_smin = S32_MIN; + *dst_smax = S32_MAX; + } + goto reset; + } + + res1 = *dst_smin / src_val; + res2 = *dst_smax / src_val; + *dst_smin = min(res1, res2); + *dst_smax = max(res1, res2); + +reset: + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->u32_min_value = 0; + dst_reg->u32_max_value = U32_MAX; + reset_reg64_and_tnum(dst_reg); +} + +static void scalar_min_max_sdiv(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s64 *dst_smin = &dst_reg->smin_value; + s64 *dst_smax = &dst_reg->smax_value; + s64 src_val = src_reg->smin_value; /* non-zero, const divisor */ + s64 res1, res2; + + /* BPF div specification: S64_MIN / -1 = S64_MIN */ + if (*dst_smin == S64_MIN && src_val == -1) { + /* + * If the dividend range contains more than just S64_MIN, + * we cannot precisely track the result, so it becomes unbounded. + * e.g., [S64_MIN, S64_MIN+10]/(-1), + * = {S64_MIN} U [-(S64_MIN+10), -(S64_MIN+1)] + * = {S64_MIN} U [S64_MAX-9, S64_MAX] = [S64_MIN, S64_MAX] + * Otherwise (if dividend is exactly S64_MIN), result remains S64_MIN. + */ + if (*dst_smax != S64_MIN) { + *dst_smin = S64_MIN; + *dst_smax = S64_MAX; + } + goto reset; + } + + res1 = div64_s64(*dst_smin, src_val); + res2 = div64_s64(*dst_smax, src_val); + *dst_smin = min(res1, res2); + *dst_smax = max(res1, res2); + +reset: + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + reset_reg32_and_tnum(dst_reg); +} + +static void scalar32_min_max_umod(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u32 *dst_umin = &dst_reg->u32_min_value; + u32 *dst_umax = &dst_reg->u32_max_value; + u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */ + u32 res_max = src_val - 1; + + /* + * If dst_umax <= res_max, the result remains unchanged. + * e.g., [2, 5] % 10 = [2, 5]. + */ + if (*dst_umax <= res_max) + return; + + *dst_umin = 0; + *dst_umax = min(*dst_umax, res_max); + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + reset_reg64_and_tnum(dst_reg); +} + +static void scalar_min_max_umod(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u64 *dst_umin = &dst_reg->umin_value; + u64 *dst_umax = &dst_reg->umax_value; + u64 src_val = src_reg->umin_value; /* non-zero, const divisor */ + u64 res_max = src_val - 1; + + /* + * If dst_umax <= res_max, the result remains unchanged. + * e.g., [2, 5] % 10 = [2, 5]. + */ + if (*dst_umax <= res_max) + return; + + *dst_umin = 0; + *dst_umax = min(*dst_umax, res_max); + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + reset_reg32_and_tnum(dst_reg); +} + +static void scalar32_min_max_smod(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s32 *dst_smin = &dst_reg->s32_min_value; + s32 *dst_smax = &dst_reg->s32_max_value; + s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */ + + /* + * Safe absolute value calculation: + * If src_val == S32_MIN (-2147483648), src_abs becomes 2147483648. + * Here use unsigned integer to avoid overflow. + */ + u32 src_abs = (src_val > 0) ? 
(u32)src_val : -(u32)src_val; + + /* + * Calculate the maximum possible absolute value of the result. + * Even if src_abs is 2147483648 (S32_MIN), subtracting 1 gives + * 2147483647 (S32_MAX), which fits perfectly in s32. + */ + s32 res_max_abs = src_abs - 1; + + /* + * If the dividend is already within the result range, + * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5]. + */ + if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs) + return; + + /* General case: result has the same sign as the dividend. */ + if (*dst_smin >= 0) { + *dst_smin = 0; + *dst_smax = min(*dst_smax, res_max_abs); + } else if (*dst_smax <= 0) { + *dst_smax = 0; + *dst_smin = max(*dst_smin, -res_max_abs); + } else { + *dst_smin = -res_max_abs; + *dst_smax = res_max_abs; + } + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->u32_min_value = 0; + dst_reg->u32_max_value = U32_MAX; + reset_reg64_and_tnum(dst_reg); +} + +static void scalar_min_max_smod(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s64 *dst_smin = &dst_reg->smin_value; + s64 *dst_smax = &dst_reg->smax_value; + s64 src_val = src_reg->smin_value; /* non-zero, const divisor */ + + /* + * Safe absolute value calculation: + * If src_val == S64_MIN (-2^63), src_abs becomes 2^63. + * Here use unsigned integer to avoid overflow. + */ + u64 src_abs = (src_val > 0) ? (u64)src_val : -(u64)src_val; + + /* + * Calculate the maximum possible absolute value of the result. + * Even if src_abs is 2^63 (S64_MIN), subtracting 1 gives + * 2^63 - 1 (S64_MAX), which fits perfectly in s64. + */ + s64 res_max_abs = src_abs - 1; + + /* + * If the dividend is already within the result range, + * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5]. + */ + if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs) + return; + + /* General case: result has the same sign as the dividend. */ + if (*dst_smin >= 0) { + *dst_smin = 0; + *dst_smax = min(*dst_smax, res_max_abs); + } else if (*dst_smax <= 0) { + *dst_smax = 0; + *dst_smin = max(*dst_smin, -res_max_abs); + } else { + *dst_smin = -res_max_abs; + *dst_smax = res_max_abs; + } + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + reset_reg32_and_tnum(dst_reg); +} + static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { @@ -15564,6 +15822,14 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, case BPF_MUL: return true; + /* + * Division and modulo operators range is only safe to compute when the + * divisor is a constant. + */ + case BPF_DIV: + case BPF_MOD: + return src_is_const; + /* Shift operators range is only computable if shift dimension operand * is a constant. Shifts greater than 31 or 63 are undefined. This * includes shifts by a negative number. 
@@ -15616,6 +15882,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state src_reg) { u8 opcode = BPF_OP(insn->code); + s16 off = insn->off; bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64); int ret; @@ -15667,6 +15934,38 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, scalar32_min_max_mul(dst_reg, &src_reg); scalar_min_max_mul(dst_reg, &src_reg); break; + case BPF_DIV: + /* BPF div specification: x / 0 = 0 */ + if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) { + ___mark_reg_known(dst_reg, 0); + break; + } + if (alu32) + if (off == 1) + scalar32_min_max_sdiv(dst_reg, &src_reg); + else + scalar32_min_max_udiv(dst_reg, &src_reg); + else + if (off == 1) + scalar_min_max_sdiv(dst_reg, &src_reg); + else + scalar_min_max_udiv(dst_reg, &src_reg); + break; + case BPF_MOD: + /* BPF mod specification: x % 0 = x */ + if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) + break; + if (alu32) + if (off == 1) + scalar32_min_max_smod(dst_reg, &src_reg); + else + scalar32_min_max_umod(dst_reg, &src_reg); + else + if (off == 1) + scalar_min_max_smod(dst_reg, &src_reg); + else + scalar_min_max_umod(dst_reg, &src_reg); + break; case BPF_AND: if (tnum_is_const(src_reg.var_off)) { ret = maybe_fork_scalars(env, insn, dst_reg); diff --git a/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c b/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c index 2129e4353fd9..4d8273c258d5 100644 --- a/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c +++ b/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c @@ -173,14 +173,15 @@ __naked void flow_keys_illegal_variable_offset_alu(void) asm volatile(" \ r6 = r1; \ r7 = *(u64*)(r6 + %[flow_keys_off]); \ - r8 = 8; \ - r8 /= 1; \ + call %[bpf_get_prandom_u32]; \ + r8 = r0; \ r8 &= 8; \ r7 += r8; \ r0 = *(u64*)(r7 + 0); \ exit; \ " : - : __imm_const(flow_keys_off, offsetof(struct __sk_buff, flow_keys)) + : __imm_const(flow_keys_off, offsetof(struct __sk_buff, flow_keys)), + __imm(bpf_get_prandom_u32) : __clobber_all); } diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c index 59a020c35647..061d98f6e9bb 100644 --- a/tools/testing/selftests/bpf/verifier/precise.c +++ b/tools/testing/selftests/bpf/verifier/precise.c @@ -229,11 +229,11 @@ { "precise: program doesn't prematurely prune branches", .insns = { - BPF_ALU64_IMM(BPF_MOV, BPF_REG_6, 0x400), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32), + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_0), BPF_ALU64_IMM(BPF_MOV, BPF_REG_7, 0), BPF_ALU64_IMM(BPF_MOV, BPF_REG_8, 0), BPF_ALU64_IMM(BPF_MOV, BPF_REG_9, 0x80000000), - BPF_ALU64_IMM(BPF_MOD, BPF_REG_6, 0x401), BPF_JMP_IMM(BPF_JA, 0, 0, 0), BPF_JMP_REG(BPF_JLE, BPF_REG_6, BPF_REG_9, 2), BPF_ALU64_IMM(BPF_MOD, BPF_REG_6, 1), -- cgit v1.2.3 From 802eef5afb1865bc5536a5302c068ba2215a1f72 Mon Sep 17 00:00:00 2001 From: Zesen Liu Date: Tue, 20 Jan 2026 16:28:46 +0800 Subject: bpf: Fix memory access flags in helper prototypes After commit 37cce22dbd51 ("bpf: verifier: Refactor helper access type tracking"), the verifier started relying on the access type flags in helper function prototypes to perform memory access optimizations. Currently, several helper functions utilizing ARG_PTR_TO_MEM lack the corresponding MEM_RDONLY or MEM_WRITE flags. 
This omission causes the verifier to incorrectly assume that the buffer contents are unchanged across the helper call. Consequently, the verifier may optimize away subsequent reads based on this wrong assumption, leading to correctness issues. For bpf_get_stack_proto_raw_tp, the original MEM_RDONLY was incorrect since the helper writes to the buffer. Change it to ARG_PTR_TO_UNINIT_MEM which correctly indicates write access to potentially uninitialized memory. Similar issues were recently addressed for specific helpers in commit ac44dcc788b9 ("bpf: Fix verifier assumptions of bpf_d_path's output buffer") and commit 2eb7648558a7 ("bpf: Specify access type of bpf_sysctl_get_name args"). Fix these prototypes by adding the correct memory access flags. Fixes: 37cce22dbd51 ("bpf: verifier: Refactor helper access type tracking") Co-developed-by: Shuran Liu Signed-off-by: Shuran Liu Co-developed-by: Peili Gao Signed-off-by: Peili Gao Co-developed-by: Haoran Ni Signed-off-by: Haoran Ni Signed-off-by: Zesen Liu Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260120-helper_proto-v3-1-27b0180b4e77@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 2 +- kernel/bpf/syscall.c | 2 +- kernel/trace/bpf_trace.c | 6 +++--- net/core/filter.c | 20 ++++++++++---------- 4 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f8aa1320e2f7..4d1af703cfcb 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1077,7 +1077,7 @@ const struct bpf_func_proto bpf_snprintf_proto = { .func = bpf_snprintf, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM_OR_NULL, + .arg1_type = ARG_PTR_TO_MEM_OR_NULL | MEM_WRITE, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_CONST_STR, .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ecc0929ce462..3c5c03d43f5f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -6451,7 +6451,7 @@ static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { .func = bpf_kallsyms_lookup_name, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f73e08c223b5..bd15ff62490b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1022,7 +1022,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = { .func = bpf_snprintf_btf, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, @@ -1526,7 +1526,7 @@ static const struct bpf_func_proto bpf_read_branch_records_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM_OR_NULL, + .arg2_type = ARG_PTR_TO_MEM_OR_NULL | MEM_WRITE, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; @@ -1661,7 +1661,7 @@ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; diff --git 
a/net/core/filter.c b/net/core/filter.c index d43df98e1ded..d14401193b01 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6399,7 +6399,7 @@ static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -6454,7 +6454,7 @@ static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -8008,9 +8008,9 @@ static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = { .gpl_only = true, /* __cookie_v4_init_sequence() is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, .arg1_size = sizeof(struct iphdr), - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -8040,9 +8040,9 @@ static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = { .gpl_only = true, /* __cookie_v6_init_sequence() is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, .arg1_size = sizeof(struct ipv6hdr), - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -8060,9 +8060,9 @@ static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = { .gpl_only = true, /* __cookie_v4_check is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, .arg1_size = sizeof(struct iphdr), - .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, .arg2_size = sizeof(struct tcphdr), }; @@ -8084,9 +8084,9 @@ static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = { .gpl_only = true, /* __cookie_v6_check is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, .arg1_size = sizeof(struct ipv6hdr), - .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, .arg2_size = sizeof(struct tcphdr), }; #endif /* CONFIG_SYN_COOKIES */ -- cgit v1.2.3 From ed4724212f6f472fcb529947730ee0ec21c2eb6c Mon Sep 17 00:00:00 2001 From: Zesen Liu Date: Tue, 20 Jan 2026 16:28:47 +0800 Subject: bpf: Require ARG_PTR_TO_MEM with memory flag Add check to ensure that ARG_PTR_TO_MEM is used with either MEM_WRITE or MEM_RDONLY. Using ARG_PTR_TO_MEM alone without flags does not make sense because: - If the helper does not change the argument, missing MEM_RDONLY causes the verifier to incorrectly reject a read-only buffer. - If the helper does change the argument, missing MEM_WRITE causes the verifier to incorrectly assume the memory is unchanged, leading to errors in code optimization. 
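As an illustration of the rule (a sketch only; the helper and its proto here are hypothetical and not part of this patch), a helper that merely reads its buffer argument now has to declare that explicitly:

static const struct bpf_func_proto bpf_example_dump_proto = {
	.func		= bpf_example_dump,	/* hypothetical helper, defined elsewhere via BPF_CALL_2() */
	.ret_type	= RET_INTEGER,
	/* The buffer is only read, so the access flag must be spelled out: */
	.arg1_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
};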
Co-developed-by: Shuran Liu Signed-off-by: Shuran Liu Co-developed-by: Peili Gao Signed-off-by: Peili Gao Co-developed-by: Haoran Ni Signed-off-by: Haoran Ni Signed-off-by: Zesen Liu Reviewed-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260120-helper_proto-v3-2-27b0180b4e77@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f11dc5366e5b..bca0ca82d164 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10441,10 +10441,27 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn) return true; } +static bool check_mem_arg_rw_flag_ok(const struct bpf_func_proto *fn) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { + enum bpf_arg_type arg_type = fn->arg_type[i]; + + if (base_type(arg_type) != ARG_PTR_TO_MEM) + continue; + if (!(arg_type & (MEM_WRITE | MEM_RDONLY))) + return false; + } + + return true; +} + static int check_func_proto(const struct bpf_func_proto *fn) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && + check_mem_arg_rw_flag_ok(fn) && check_btf_id_ok(fn) ? 0 : -EINVAL; } -- cgit v1.2.3 From c1f2c449de279967e04254f09104f657ba60ab3f Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 20 Jan 2026 15:59:10 +0000 Subject: bpf: Factor out timer deletion helper Move the timer deletion logic into a dedicated bpf_timer_delete() helper so it can be reused by later patches. Acked-by: Eduard Zingerman Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260120-timer_nolock-v6-1-670ffdd787b4@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4d1af703cfcb..fb6f9dbca084 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1558,18 +1558,10 @@ out: return cb; } -/* This function is called by map_delete/update_elem for individual element and - * by ops->map_release_uref when the user space reference to a map reaches zero. - */ -void bpf_timer_cancel_and_free(void *val) +static void bpf_timer_delete(struct bpf_hrtimer *t) { - struct bpf_hrtimer *t; - - t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val); - - if (!t) - return; - /* We check that bpf_map_delete/update_elem() was called from timer + /* + * We check that bpf_map_delete/update_elem() was called from timer * callback_fn. In such case we don't call hrtimer_cancel() (since it * will deadlock) and don't call hrtimer_try_to_cancel() (since it will * just return -1). Though callback_fn is still running on this cpu it's @@ -1618,6 +1610,21 @@ void bpf_timer_cancel_and_free(void *val) } } +/* + * This function is called by map_delete/update_elem for individual element and + * by ops->map_release_uref when the user space reference to a map reaches zero. + */ +void bpf_timer_cancel_and_free(void *val) +{ + struct bpf_hrtimer *t; + + t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val); + if (!t) + return; + + bpf_timer_delete(t); +} + /* This function is called by map_delete/update_elem for individual element and * by ops->map_release_uref when the user space reference to a map reaches zero. 
*/ -- cgit v1.2.3 From 57d31e72dbdd1f71455aa62a2505a8cf088f46c6 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 20 Jan 2026 15:59:11 +0000 Subject: bpf: Remove unnecessary arguments from bpf_async_set_callback() Remove unused arguments from __bpf_async_set_callback(). Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260120-timer_nolock-v6-2-670ffdd787b4@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index fb6f9dbca084..6eadb66b8c67 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1355,10 +1355,9 @@ static const struct bpf_func_proto bpf_timer_init_proto = { }; static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn, - struct bpf_prog_aux *aux, unsigned int flags, - enum bpf_async_type type) + struct bpf_prog *prog) { - struct bpf_prog *prev, *prog = aux->prog; + struct bpf_prog *prev; struct bpf_async_cb *cb; int ret = 0; @@ -1403,7 +1402,7 @@ out: BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn, struct bpf_prog_aux *, aux) { - return __bpf_async_set_callback(timer, callback_fn, aux, 0, BPF_ASYNC_TYPE_TIMER); + return __bpf_async_set_callback(timer, callback_fn, aux->prog); } static const struct bpf_func_proto bpf_timer_set_callback_proto = { @@ -3137,7 +3136,7 @@ __bpf_kfunc int bpf_wq_set_callback(struct bpf_wq *wq, if (flags) return -EINVAL; - return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ); + return __bpf_async_set_callback(async, callback_fn, aux->prog); } __bpf_kfunc void bpf_preempt_disable(void) -- cgit v1.2.3 From 8bb1e32b3fac1becb4c1c8079d720784b8e33e34 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 20 Jan 2026 15:59:12 +0000 Subject: bpf: Introduce lock-free bpf_async_update_prog_callback() Introduce bpf_async_update_prog_callback(): lock-free update of cb->prog and cb->callback_fn. This function allows updating prog and callback_fn fields of the struct bpf_async_cb without holding lock. For now use it under the lock from __bpf_async_set_callback(), in the next patches that lock will be removed. Lock-free algorithm: * Acquire a guard reference on prog to prevent it from being freed during the retry loop. * Retry loop: 1. Each iteration acquires a new prog reference and stores it in cb->prog via xchg. The previous prog is released. 2. The loop condition checks if both cb->prog and cb->callback_fn match what we just wrote. If either differs, a concurrent writer overwrote our value, and we must retry. 3. When we retry, our previously-stored prog was already released by the concurrent writer or will be released by us after overwriting. * Release guard reference. 
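Condensed into one place, the loop reads roughly as follows (an illustrative sketch; the function added by the patch below additionally handles a NULL prog, which is used to clear the callback):

static int publish_callback(struct bpf_async_cb *cb, void *callback_fn,
			    struct bpf_prog *prog)
{
	struct bpf_prog *prev;

	/* Guard reference: keeps prog alive across the retry loop. */
	prog = bpf_prog_inc_not_zero(prog);
	if (IS_ERR(prog))
		return PTR_ERR(prog);

	do {
		/* Reference that will be owned by cb->prog once published. */
		prog = bpf_prog_inc_not_zero(prog);
		prev = xchg(&cb->prog, prog);
		rcu_assign_pointer(cb->callback_fn, callback_fn);
		if (prev)
			bpf_prog_put(prev);	/* drop whatever we displaced */
		/* Retry if a concurrent writer overwrote either field. */
	} while (READ_ONCE(cb->prog) != prog ||
		 READ_ONCE(cb->callback_fn) != callback_fn);

	bpf_prog_put(prog);	/* drop the guard reference */
	return 0;
}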
Acked-by: Andrii Nakryiko Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260120-timer_nolock-v6-3-670ffdd787b4@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 67 +++++++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6eadb66b8c67..2a2df867bfe7 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1354,10 +1354,43 @@ static const struct bpf_func_proto bpf_timer_init_proto = { .arg3_type = ARG_ANYTHING, }; +static int bpf_async_update_prog_callback(struct bpf_async_cb *cb, void *callback_fn, + struct bpf_prog *prog) +{ + struct bpf_prog *prev; + + /* Acquire a guard reference on prog to prevent it from being freed during the loop */ + if (prog) { + prog = bpf_prog_inc_not_zero(prog); + if (IS_ERR(prog)) + return PTR_ERR(prog); + } + + do { + if (prog) + prog = bpf_prog_inc_not_zero(prog); + prev = xchg(&cb->prog, prog); + rcu_assign_pointer(cb->callback_fn, callback_fn); + + /* + * Release previous prog, make sure that if other CPU is contending, + * to set bpf_prog, references are not leaked as each iteration acquires and + * releases one reference. + */ + if (prev) + bpf_prog_put(prev); + + } while (READ_ONCE(cb->prog) != prog || READ_ONCE(cb->callback_fn) != callback_fn); + + if (prog) + bpf_prog_put(prog); + + return 0; +} + static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn, struct bpf_prog *prog) { - struct bpf_prog *prev; struct bpf_async_cb *cb; int ret = 0; @@ -1378,22 +1411,7 @@ static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback ret = -EPERM; goto out; } - prev = cb->prog; - if (prev != prog) { - /* Bump prog refcnt once. Every bpf_timer_set_callback() - * can pick different callback_fn-s within the same prog. - */ - prog = bpf_prog_inc_not_zero(prog); - if (IS_ERR(prog)) { - ret = PTR_ERR(prog); - goto out; - } - if (prev) - /* Drop prev prog refcnt when swapping with new prog */ - bpf_prog_put(prev); - cb->prog = prog; - } - rcu_assign_pointer(cb->callback_fn, callback_fn); + ret = bpf_async_update_prog_callback(cb, callback_fn, prog); out: __bpf_spin_unlock_irqrestore(&async->lock); return ret; @@ -1453,17 +1471,6 @@ static const struct bpf_func_proto bpf_timer_start_proto = { .arg3_type = ARG_ANYTHING, }; -static void drop_prog_refcnt(struct bpf_async_cb *async) -{ - struct bpf_prog *prog = async->prog; - - if (prog) { - bpf_prog_put(prog); - async->prog = NULL; - rcu_assign_pointer(async->callback_fn, NULL); - } -} - BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) { struct bpf_hrtimer *t, *cur_t; @@ -1514,7 +1521,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) goto out; } drop: - drop_prog_refcnt(&t->cb); + bpf_async_update_prog_callback(&t->cb, NULL, NULL); out: __bpf_spin_unlock_irqrestore(&timer->lock); /* Cancel the timer and wait for associated callback to finish @@ -1547,7 +1554,7 @@ static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *a cb = async->cb; if (!cb) goto out; - drop_prog_refcnt(cb); + bpf_async_update_prog_callback(cb, NULL, NULL); /* The subsequent bpf_timer_start/cancel() helpers won't be able to use * this timer, since it won't be initialized. 
*/ -- cgit v1.2.3 From 83c9030cdc45e0518d71065c25201a24eafc9818 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 20 Jan 2026 15:59:13 +0000 Subject: bpf: Simplify bpf_timer_cancel() Remove lock from the bpf_timer_cancel() helper. The lock does not protect from concurrent modification of the bpf_async_cb data fields as those are modified in the callback without locking. Use guard(rcu)() instead of pair of explicit lock()/unlock(). Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260120-timer_nolock-v6-4-670ffdd787b4@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 2a2df867bfe7..637677815365 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1471,7 +1471,7 @@ static const struct bpf_func_proto bpf_timer_start_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) +BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, async) { struct bpf_hrtimer *t, *cur_t; bool inc = false; @@ -1479,13 +1479,12 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) if (in_nmi()) return -EOPNOTSUPP; - rcu_read_lock(); - __bpf_spin_lock_irqsave(&timer->lock); - t = timer->timer; - if (!t) { - ret = -EINVAL; - goto out; - } + + guard(rcu)(); + + t = READ_ONCE(async->timer); + if (!t) + return -EINVAL; cur_t = this_cpu_read(hrtimer_running); if (cur_t == t) { @@ -1493,8 +1492,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) * its own timer the hrtimer_cancel() will deadlock * since it waits for callback_fn to finish. */ - ret = -EDEADLK; - goto out; + return -EDEADLK; } /* Only account in-flight cancellations when invoked from a timer @@ -1517,20 +1515,17 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) * cancelling and waiting for it synchronously, since it might * do the same. Bail! */ - ret = -EDEADLK; - goto out; + atomic_dec(&t->cancelling); + return -EDEADLK; } drop: bpf_async_update_prog_callback(&t->cb, NULL, NULL); -out: - __bpf_spin_unlock_irqrestore(&timer->lock); /* Cancel the timer and wait for associated callback to finish * if it was running. */ - ret = ret ?: hrtimer_cancel(&t->timer); + ret = hrtimer_cancel(&t->timer); if (inc) atomic_dec(&t->cancelling); - rcu_read_unlock(); return ret; } -- cgit v1.2.3 From eaedea154eb96b2f4ba8ec8e4397dab10758a705 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Tue, 20 Jan 2026 15:05:54 +0800 Subject: bpf, x86: inline bpf_get_current_task() for x86_64 Inline bpf_get_current_task() and bpf_get_current_task_btf() for x86_64 to obtain better performance. 
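In C terms, the three instructions added below simply fetch the current task pointer from the x86_64 per-CPU current_task variable (a sketch of what the inlined sequence computes, not code from the patch):

	/* r0 = &current_task; r0 = per-CPU address of r0; r0 = *(u64 *)r0 */
	struct task_struct *task = this_cpu_read(current_task);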
Signed-off-by: Menglong Dong Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260120070555.233486-2-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bca0ca82d164..9905ad40f4d3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18130,6 +18130,10 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) switch (imm) { #ifdef CONFIG_X86_64 case BPF_FUNC_get_smp_processor_id: +#ifdef CONFIG_SMP + case BPF_FUNC_get_current_task_btf: + case BPF_FUNC_get_current_task: +#endif return env->prog->jit_requested && bpf_jit_supports_percpu_insn(); #endif default: @@ -23715,6 +23719,24 @@ patch_map_ops_generic: insn = new_prog->insnsi + i + delta; goto next_insn; } + + /* Implement bpf_get_current_task() and bpf_get_current_task_btf() inline. */ + if ((insn->imm == BPF_FUNC_get_current_task || insn->imm == BPF_FUNC_get_current_task_btf) && + verifier_inlines_helper_call(env, insn->imm)) { + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)¤t_task); + insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); + insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); + cnt = 3; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } #endif /* Implement bpf_get_func_arg inline. */ if (prog_type == BPF_PROG_TYPE_TRACING && -- cgit v1.2.3 From 85c7f914714741de992fc19c2ba673f6c400a584 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 21 Jan 2026 12:43:47 +0800 Subject: bpf: support bpf_get_func_arg() for BPF_TRACE_RAW_TP For now, bpf_get_func_arg() and bpf_get_func_arg_cnt() is not supported by the BPF_TRACE_RAW_TP, which is not convenient to get the argument of the tracepoint, especially for the case that the position of the arguments in a tracepoint can change. The target tracepoint BTF type id is specified during loading time, therefore we can get the function argument count from the function prototype instead of the stack. Signed-off-by: Menglong Dong Acked-by: Yonghong Song Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20260121044348.113201-2-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 22 ++++++++++++++++++---- kernel/trace/bpf_trace.c | 10 ++++++++-- 2 files changed, 26 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9905ad40f4d3..c7f5234d5fd2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23741,8 +23741,15 @@ patch_map_ops_generic: /* Implement bpf_get_func_arg inline. 
*/ if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_arg) { - /* Load nr_args from ctx - 8 */ - insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + if (eatype == BPF_TRACE_RAW_TP) { + int nr_args = btf_type_vlen(prog->aux->attach_func_proto); + + /* skip 'void *__data' in btf_trace_##name() and save to reg0 */ + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1); + } else { + /* Load nr_args from ctx - 8 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + } insn_buf[1] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6); insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3); insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1); @@ -23794,8 +23801,15 @@ patch_map_ops_generic: /* Implement get_func_arg_cnt inline. */ if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_arg_cnt) { - /* Load nr_args from ctx - 8 */ - insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + if (eatype == BPF_TRACE_RAW_TP) { + int nr_args = btf_type_vlen(prog->aux->attach_func_proto); + + /* skip 'void *__data' in btf_trace_##name() and save to reg0 */ + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1); + } else { + /* Load nr_args from ctx - 8 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + } new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1); if (!new_prog) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index bd15ff62490b..0e9635bcd783 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1734,11 +1734,17 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_d_path: return &bpf_d_path_proto; case BPF_FUNC_get_func_arg: - return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_proto : NULL; + if (bpf_prog_has_trampoline(prog) || + prog->expected_attach_type == BPF_TRACE_RAW_TP) + return &bpf_get_func_arg_proto; + return NULL; case BPF_FUNC_get_func_ret: return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL; case BPF_FUNC_get_func_arg_cnt: - return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL; + if (bpf_prog_has_trampoline(prog) || + prog->expected_attach_type == BPF_TRACE_RAW_TP) + return &bpf_get_func_arg_cnt_proto; + return NULL; case BPF_FUNC_get_attach_cookie: if (prog->type == BPF_PROG_TYPE_TRACING && prog->expected_attach_type == BPF_TRACE_RAW_TP) -- cgit v1.2.3 From 1dc669646762726d59be15e2de354b06e3e0cbcf Mon Sep 17 00:00:00 2001 From: Yuzuki Ishiyama Date: Wed, 21 Jan 2026 12:33:27 +0900 Subject: bpf: add bpf_strncasecmp kfunc bpf_strncasecmp() function performs same like bpf_strcasecmp() except limiting the comparison to a specific length. Signed-off-by: Yuzuki Ishiyama Acked-by: Viktor Malik Acked-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260121033328.1850010-2-ishiyama@hpc.is.uec.ac.jp Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 637677815365..b54ec0e945aa 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3413,7 +3413,7 @@ __bpf_kfunc void __bpf_trap(void) * __get_kernel_nofault instead of plain dereference to make them safe. 
*/ -static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case) +static int __bpf_strncasecmp(const char *s1, const char *s2, bool ignore_case, size_t len) { char c1, c2; int i; @@ -3424,7 +3424,7 @@ static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case) } guard(pagefault)(); - for (i = 0; i < XATTR_SIZE_MAX; i++) { + for (i = 0; i < len && i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&c1, s1, char, err_out); __get_kernel_nofault(&c2, s2, char, err_out); if (ignore_case) { @@ -3438,7 +3438,7 @@ static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case) s1++; s2++; } - return -E2BIG; + return i == XATTR_SIZE_MAX ? -E2BIG : 0; err_out: return -EFAULT; } @@ -3458,7 +3458,7 @@ err_out: */ __bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign) { - return __bpf_strcasecmp(s1__ign, s2__ign, false); + return __bpf_strncasecmp(s1__ign, s2__ign, false, XATTR_SIZE_MAX); } /** @@ -3476,7 +3476,26 @@ __bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign) */ __bpf_kfunc int bpf_strcasecmp(const char *s1__ign, const char *s2__ign) { - return __bpf_strcasecmp(s1__ign, s2__ign, true); + return __bpf_strncasecmp(s1__ign, s2__ign, true, XATTR_SIZE_MAX); +} + +/* + * bpf_strncasecmp - Compare two length-limited strings, ignoring case + * @s1__ign: One string + * @s2__ign: Another string + * @len: The maximum number of characters to compare + * + * Return: + * * %0 - Strings are equal + * * %-1 - @s1__ign is smaller + * * %1 - @s2__ign is smaller + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of strings is too large + * * %-ERANGE - One of strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strncasecmp(const char *s1__ign, const char *s2__ign, size_t len) +{ + return __bpf_strncasecmp(s1__ign, s2__ign, true, len); } /** @@ -4526,6 +4545,7 @@ BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) BTF_ID_FLAGS(func, __bpf_trap) BTF_ID_FLAGS(func, bpf_strcmp); BTF_ID_FLAGS(func, bpf_strcasecmp); +BTF_ID_FLAGS(func, bpf_strncasecmp); BTF_ID_FLAGS(func, bpf_strchr); BTF_ID_FLAGS(func, bpf_strchrnul); BTF_ID_FLAGS(func, bpf_strnchr); -- cgit v1.2.3 From 82f3b142c99cf44c7b1e70b7720169c646b9760f Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 22 Jan 2026 03:59:11 -0800 Subject: rqspinlock: Fix TAS fallback lock entry creation The TAS fallback can be invoked directly when queued spin locks are disabled, and through the slow path when paravirt is enabled for queued spin locks. In the latter case, the res_spin_lock macro will attempt the fast path and already hold the entry when entering the slow path. This will lead to creation of extraneous entries that are not released, which may cause false positives for deadlock detection. Fix this by always preceding invocation of the TAS fallback in every case with the grabbing of the held lock entry, and add a comment to make note of this. 
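The two entry paths into the TAS fallback after this change, summarized as a call-flow sketch (an editorial restatement of the description above, not taken from the patch):

/*
 *  !CONFIG_QUEUED_SPINLOCKS:
 *      res_spin_lock()
 *          grab_held_lock_entry()
 *          resilient_tas_spin_lock()            <- entry already held
 *
 *  CONFIG_QUEUED_SPINLOCKS with paravirt:
 *      res_spin_lock() fast path                <- grabs the entry
 *          resilient_queued_spin_lock_slowpath()
 *              resilient_tas_spin_lock()        <- entry already held
 *
 *  Either way, resilient_tas_spin_lock() itself no longer calls
 *  grab_held_lock_entry(), so no duplicate entry is created.
 */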
Fixes: c9102a68c070 ("rqspinlock: Add a test-and-set fallback") Reported-by: Amery Hung Signed-off-by: Kumar Kartikeya Dwivedi Tested-by: Amery Hung Link: https://lore.kernel.org/r/20260122115911.3668985-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/asm-generic/rqspinlock.h | 2 +- kernel/bpf/rqspinlock.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/rqspinlock.h b/include/asm-generic/rqspinlock.h index 0f2dcbbfee2f..5c5cf2f7fc39 100644 --- a/include/asm-generic/rqspinlock.h +++ b/include/asm-generic/rqspinlock.h @@ -191,7 +191,7 @@ static __always_inline int res_spin_lock(rqspinlock_t *lock) #else -#define res_spin_lock(lock) resilient_tas_spin_lock(lock) +#define res_spin_lock(lock) ({ grab_held_lock_entry(lock); resilient_tas_spin_lock(lock); }) #endif /* CONFIG_QUEUED_SPINLOCKS */ diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c index f7d0c8d4644e..2fdfa828e3d3 100644 --- a/kernel/bpf/rqspinlock.c +++ b/kernel/bpf/rqspinlock.c @@ -265,10 +265,11 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock) RES_INIT_TIMEOUT(ts); /* - * The fast path is not invoked for the TAS fallback, so we must grab - * the deadlock detection entry here. + * We are either called directly from res_spin_lock after grabbing the + * deadlock detection entry when queued spinlocks are disabled, or from + * resilient_queued_spin_lock_slowpath after grabbing the deadlock + * detection entry. No need to obtain it here. */ - grab_held_lock_entry(lock); /* * Since the waiting loop's time is dependent on the amount of -- cgit v1.2.3 From 2d419c44658f75e7655794341a95c0687830f3df Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:19:56 +0800 Subject: bpf: add fsession support The fsession is something similar to kprobe session. It allows attaching a single BPF program to both the entry and the exit of the target functions. Introduce the struct bpf_fsession_link, which allows adding the link to both the fentry and fexit progs_hlist of the trampoline.
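From the BPF program side, the intended usage looks roughly like this (a hypothetical sketch: the "fsession/" section name and libbpf support are assumptions, and bpf_session_is_return() only gains fsession support in later patches of this series):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("fsession/do_unlinkat")	/* section name is an assumption */
int BPF_PROG(probe_unlinkat)
{
	/* The same program runs on both entry and exit of the target. */
	if (bpf_session_is_return(ctx))
		bpf_printk("do_unlinkat: exit");
	else
		bpf_printk("do_unlinkat: entry");
	return 0;
}

char _license[] SEC("license") = "GPL";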
Signed-off-by: Menglong Dong Co-developed-by: Leon Hwang Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260124062008.8657-2-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 19 ++++++++ include/uapi/linux/bpf.h | 1 + kernel/bpf/btf.c | 2 + kernel/bpf/syscall.c | 18 +++++++- kernel/bpf/trampoline.c | 53 ++++++++++++++++++---- kernel/bpf/verifier.c | 12 +++-- net/bpf/test_run.c | 1 + net/core/bpf_sk_storage.c | 1 + tools/include/uapi/linux/bpf.h | 1 + .../selftests/bpf/prog_tests/tracing_failure.c | 2 +- 10 files changed, 97 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5936f8e2996f..41228b0add52 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1309,6 +1309,7 @@ enum bpf_tramp_prog_type { BPF_TRAMP_MODIFY_RETURN, BPF_TRAMP_MAX, BPF_TRAMP_REPLACE, /* more than MAX */ + BPF_TRAMP_FSESSION, }; struct bpf_tramp_image { @@ -1875,6 +1876,11 @@ struct bpf_tracing_link { struct bpf_prog *tgt_prog; }; +struct bpf_fsession_link { + struct bpf_tracing_link link; + struct bpf_tramp_link fexit; +}; + struct bpf_raw_tp_link { struct bpf_link link; struct bpf_raw_event_map *btp; @@ -2169,6 +2175,19 @@ static inline void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_op #endif +static inline int bpf_fsession_cnt(struct bpf_tramp_links *links) +{ + struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY]; + int cnt = 0; + + for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) { + if (fentries.links[i]->link.prog->expected_attach_type == BPF_TRACE_FSESSION) + cnt++; + } + + return cnt; +} + int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, const struct bpf_ctx_arg_aux *info, u32 cnt); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2a2ade4be60f..44e7dbc278e3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1145,6 +1145,7 @@ enum bpf_attach_type { BPF_NETKIT_PEER, BPF_TRACE_KPROBE_SESSION, BPF_TRACE_UPROBE_SESSION, + BPF_TRACE_FSESSION, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d10b3404260f..8959f3bc1e92 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6219,6 +6219,7 @@ static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: + case BPF_TRACE_FSESSION: /* allow u64* as ctx */ if (btf_is_int(t) && t->size == 8) return 0; @@ -6820,6 +6821,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, fallthrough; case BPF_LSM_CGROUP: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: /* When LSM programs are attached to void LSM hooks * they use FEXIT trampolines and when attached to * int LSM hooks, they use MODIFY_RETURN trampolines. 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3c5c03d43f5f..b9184545c3fd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3577,6 +3577,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, case BPF_PROG_TYPE_TRACING: if (prog->expected_attach_type != BPF_TRACE_FENTRY && prog->expected_attach_type != BPF_TRACE_FEXIT && + prog->expected_attach_type != BPF_TRACE_FSESSION && prog->expected_attach_type != BPF_MODIFY_RETURN) { err = -EINVAL; goto out_put_prog; @@ -3626,7 +3627,21 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); } - link = kzalloc(sizeof(*link), GFP_USER); + if (prog->expected_attach_type == BPF_TRACE_FSESSION) { + struct bpf_fsession_link *fslink; + + fslink = kzalloc(sizeof(*fslink), GFP_USER); + if (fslink) { + bpf_link_init(&fslink->fexit.link, BPF_LINK_TYPE_TRACING, + &bpf_tracing_link_lops, prog, attach_type); + fslink->fexit.cookie = bpf_cookie; + link = &fslink->link; + } else { + link = NULL; + } + } else { + link = kzalloc(sizeof(*link), GFP_USER); + } if (!link) { err = -ENOMEM; goto out_put_prog; @@ -4350,6 +4365,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_TRACE_RAW_TP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: case BPF_MODIFY_RETURN: return BPF_PROG_TYPE_TRACING; case BPF_LSM_MAC: diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 2a125d063e62..edf9da43762d 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -109,10 +109,17 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog) enum bpf_attach_type eatype = prog->expected_attach_type; enum bpf_prog_type ptype = prog->type; - return (ptype == BPF_PROG_TYPE_TRACING && - (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || - eatype == BPF_MODIFY_RETURN)) || - (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); + switch (ptype) { + case BPF_PROG_TYPE_TRACING: + if (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || + eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION) + return true; + return false; + case BPF_PROG_TYPE_LSM: + return eatype == BPF_LSM_MAC; + default: + return false; + } } void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym) @@ -559,6 +566,8 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: return BPF_TRAMP_FEXIT; + case BPF_TRACE_FSESSION: + return BPF_TRAMP_FSESSION; case BPF_LSM_MAC: if (!prog->aux->attach_func_proto->type) /* The function returns void, we cannot modify its @@ -594,8 +603,10 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { + struct bpf_fsession_link *fslink = NULL; enum bpf_tramp_prog_type kind; struct bpf_tramp_link *link_exiting; + struct hlist_head *prog_list; int err = 0; int cnt = 0, i; @@ -621,24 +632,43 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, BPF_MOD_JUMP, NULL, link->link.prog->bpf_func); } + if (kind == BPF_TRAMP_FSESSION) { + prog_list = &tr->progs_hlist[BPF_TRAMP_FENTRY]; + cnt++; + } else { + prog_list = &tr->progs_hlist[kind]; + } if (cnt >= BPF_MAX_TRAMP_LINKS) return -E2BIG; if (!hlist_unhashed(&link->tramp_hlist)) /* prog already linked */ return -EBUSY; - hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) { + hlist_for_each_entry(link_exiting, prog_list, tramp_hlist) { if (link_exiting->link.prog 
!= link->link.prog) continue; /* prog already linked */ return -EBUSY; } - hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]); - tr->progs_cnt[kind]++; + hlist_add_head(&link->tramp_hlist, prog_list); + if (kind == BPF_TRAMP_FSESSION) { + tr->progs_cnt[BPF_TRAMP_FENTRY]++; + fslink = container_of(link, struct bpf_fsession_link, link.link); + hlist_add_head(&fslink->fexit.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]); + tr->progs_cnt[BPF_TRAMP_FEXIT]++; + } else { + tr->progs_cnt[kind]++; + } err = bpf_trampoline_update(tr, true /* lock_direct_mutex */); if (err) { hlist_del_init(&link->tramp_hlist); - tr->progs_cnt[kind]--; + if (kind == BPF_TRAMP_FSESSION) { + tr->progs_cnt[BPF_TRAMP_FENTRY]--; + hlist_del_init(&fslink->fexit.tramp_hlist); + tr->progs_cnt[BPF_TRAMP_FEXIT]--; + } else { + tr->progs_cnt[kind]--; + } } return err; } @@ -672,6 +702,13 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, guard(mutex)(&tgt_prog->aux->ext_mutex); tgt_prog->aux->is_extended = false; return err; + } else if (kind == BPF_TRAMP_FSESSION) { + struct bpf_fsession_link *fslink = + container_of(link, struct bpf_fsession_link, link.link); + + hlist_del_init(&fslink->fexit.tramp_hlist); + tr->progs_cnt[BPF_TRAMP_FEXIT]--; + kind = BPF_TRAMP_FENTRY; } hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c7f5234d5fd2..41bbed6418b5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17848,6 +17848,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char switch (env->prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: range = retval_range(0, 0); break; case BPF_TRACE_RAW_TP: @@ -23774,6 +23775,7 @@ patch_map_ops_generic: if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_ret) { if (eatype == BPF_TRACE_FEXIT || + eatype == BPF_TRACE_FSESSION || eatype == BPF_MODIFY_RETURN) { /* Load nr_args from ctx - 8 */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); @@ -24725,7 +24727,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (tgt_prog->type == BPF_PROG_TYPE_TRACING && prog_extension && (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY || - tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) { + tgt_prog->expected_attach_type == BPF_TRACE_FEXIT || + tgt_prog->expected_attach_type == BPF_TRACE_FSESSION)) { /* Program extensions can extend all program types * except fentry/fexit. The reason is the following. * The fentry/fexit programs are used for performance @@ -24740,7 +24743,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, * beyond reasonable stack size. Hence extending fentry * is not allowed. 
*/ - bpf_log(log, "Cannot extend fentry/fexit\n"); + bpf_log(log, "Cannot extend fentry/fexit/fsession\n"); return -EINVAL; } } else { @@ -24824,6 +24827,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_LSM_CGROUP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: if (!btf_type_is_func(t)) { bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); @@ -24990,6 +24994,7 @@ static bool can_be_sleepable(struct bpf_prog *prog) case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: case BPF_TRACE_ITER: + case BPF_TRACE_FSESSION: return true; default: return false; @@ -25071,9 +25076,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) tgt_info.tgt_name); return -EINVAL; } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT || + prog->expected_attach_type == BPF_TRACE_FSESSION || prog->expected_attach_type == BPF_MODIFY_RETURN) && btf_id_set_contains(&noreturn_deny, btf_id)) { - verbose(env, "Attaching fexit/fmod_ret to __noreturn function '%s' is rejected.\n", + verbose(env, "Attaching fexit/fsession/fmod_ret to __noreturn function '%s' is rejected.\n", tgt_info.tgt_name); return -EINVAL; } diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 26cfcfdc45eb..178c4738e63b 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -685,6 +685,7 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: if (bpf_fentry_test1(1) != 2 || bpf_fentry_test2(2, 3) != 5 || bpf_fentry_test3(4, 5, 6) != 15 || diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 850dd736ccd1..de111818f3a0 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -365,6 +365,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) return true; case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage", strlen("bpf_sk_storage")); default: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b816bc53d2e1..3ca7d76e05f0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1145,6 +1145,7 @@ enum bpf_attach_type { BPF_NETKIT_PEER, BPF_TRACE_KPROBE_SESSION, BPF_TRACE_UPROBE_SESSION, + BPF_TRACE_FSESSION, __MAX_BPF_ATTACH_TYPE }; diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c index 10e231965589..f9f9e1cb87bf 100644 --- a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c +++ b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c @@ -73,7 +73,7 @@ static void test_tracing_deny(void) static void test_fexit_noreturns(void) { test_tracing_fail_prog("fexit_noreturns", - "Attaching fexit/fmod_ret to __noreturn function 'do_exit' is rejected."); + "Attaching fexit/fsession/fmod_ret to __noreturn function 'do_exit' is rejected."); } void test_tracing_failure(void) -- cgit v1.2.3 From f1b56b3cbdb2d2a51d8ea91008eddf4d1d9f277b Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:19:57 +0800 Subject: bpf: use the least significant byte for the nr_args in trampoline For now, ((u64 *)ctx)[-1] is used to store the nr_args in the trampoline. However, 1 byte is enough to store such information. Therefore, we use only the least significant byte of ((u64 *)ctx)[-1] to store the nr_args, and reserve the rest for other usages. 
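Expressed in C, the convention introduced here is (sketch only; the remaining bits are reserved and are claimed by later patches in this series):

	/* ((u64 *)ctx)[-1] now packs more than just the argument count. */
	u64 word    = ((u64 *)ctx)[-1];
	u64 nr_args = word & 0xFF;	/* bits 0-7: number of arguments */
	/* bits 8-63: reserved (cookie index and is_return flag come later) */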
Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20260124062008.8657-3-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 39 +++++++++++++++++++++++---------------- kernel/trace/bpf_trace.c | 6 +++--- 2 files changed, 26 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 41bbed6418b5..2081343a848d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23747,19 +23747,21 @@ patch_map_ops_generic: /* skip 'void *__data' in btf_trace_##name() and save to reg0 */ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1); + cnt = 1; } else { /* Load nr_args from ctx - 8 */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); + cnt = 2; } - insn_buf[1] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6); - insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3); - insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1); - insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0); - insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); - insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0); - insn_buf[7] = BPF_JMP_A(1); - insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL); - cnt = 9; + insn_buf[cnt++] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6); + insn_buf[cnt++] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3); + insn_buf[cnt++] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1); + insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0); + insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); + insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, 0); + insn_buf[cnt++] = BPF_JMP_A(1); + insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL); new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) @@ -23779,12 +23781,13 @@ patch_map_ops_generic: eatype == BPF_MODIFY_RETURN) { /* Load nr_args from ctx - 8 */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); - insn_buf[1] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); - insn_buf[2] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1); - insn_buf[3] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); - insn_buf[4] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0); - insn_buf[5] = BPF_MOV64_IMM(BPF_REG_0, 0); - cnt = 6; + insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); + insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); + insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1); + insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); + insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0); + insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0); + cnt = 7; } else { insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP); cnt = 1; @@ -23808,15 +23811,19 @@ patch_map_ops_generic: /* skip 'void *__data' in btf_trace_##name() and save to reg0 */ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1); + cnt = 1; } else { /* Load nr_args from ctx - 8 */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); + cnt = 2; } - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1); + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; + delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; goto next_insn; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0e9635bcd783..d466a1503da3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1194,7 +1194,7 @@ const struct bpf_func_proto 
bpf_get_branch_snapshot_proto = { BPF_CALL_3(get_func_arg, void *, ctx, u32, n, u64 *, value) { /* This helper call is inlined by verifier. */ - u64 nr_args = ((u64 *)ctx)[-1]; + u64 nr_args = ((u64 *)ctx)[-1] & 0xFF; if ((u64) n >= nr_args) return -EINVAL; @@ -1214,7 +1214,7 @@ static const struct bpf_func_proto bpf_get_func_arg_proto = { BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value) { /* This helper call is inlined by verifier. */ - u64 nr_args = ((u64 *)ctx)[-1]; + u64 nr_args = ((u64 *)ctx)[-1] & 0xFF; *value = ((u64 *)ctx)[nr_args]; return 0; @@ -1231,7 +1231,7 @@ static const struct bpf_func_proto bpf_get_func_ret_proto = { BPF_CALL_1(get_func_arg_cnt, void *, ctx) { /* This helper call is inlined by verifier. */ - return ((u64 *)ctx)[-1]; + return ((u64 *)ctx)[-1] & 0xFF; } static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = { -- cgit v1.2.3 From 8fe4dc4f6456b3d2c9e6f8aeb1f978b7bff0f6c8 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:19:58 +0800 Subject: bpf: change prototype of bpf_session_{cookie,is_return} Add the function argument of "void *ctx" to bpf_session_cookie() and bpf_session_is_return(), which is a preparation of the next patch. The two kfunc is seldom used now, so it will not introduce much effect to change their function prototype. Signed-off-by: Menglong Dong Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20260124062008.8657-4-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +++++- kernel/trace/bpf_trace.c | 4 ++-- tools/testing/selftests/bpf/bpf_kfuncs.h | 3 --- .../selftests/bpf/progs/kprobe_multi_session_cookie.c | 15 +++++++-------- tools/testing/selftests/bpf/progs/uprobe_multi_session.c | 7 +++---- .../selftests/bpf/progs/uprobe_multi_session_cookie.c | 15 +++++++-------- .../selftests/bpf/progs/uprobe_multi_session_recursive.c | 11 +++++------ 7 files changed, 29 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2081343a848d..0fa73d56cb8b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12484,6 +12484,7 @@ enum special_kfunc_type { KF_bpf_arena_alloc_pages, KF_bpf_arena_free_pages, KF_bpf_arena_reserve_pages, + KF_bpf_session_is_return, }; BTF_ID_LIST(special_kfunc_list) @@ -12561,6 +12562,7 @@ BTF_ID(func, bpf_task_work_schedule_resume) BTF_ID(func, bpf_arena_alloc_pages) BTF_ID(func, bpf_arena_free_pages) BTF_ID(func, bpf_arena_reserve_pages) +BTF_ID(func, bpf_session_is_return) static bool is_task_work_add_kfunc(u32 func_id) { @@ -12615,7 +12617,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg = ®s[regno]; bool arg_mem_size = false; - if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) + if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || + meta->func_id == special_kfunc_list[KF_bpf_session_is_return] || + meta->func_id == special_kfunc_list[KF_bpf_session_cookie]) return KF_ARG_PTR_TO_CTX; if (argno + 1 < nargs && diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d466a1503da3..13f0a2de33b7 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3323,7 +3323,7 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) __bpf_kfunc_start_defs(); -__bpf_kfunc bool bpf_session_is_return(void) +__bpf_kfunc bool bpf_session_is_return(void *ctx) { struct bpf_session_run_ctx *session_ctx; @@ -3331,7 +3331,7 @@ __bpf_kfunc bool bpf_session_is_return(void) return 
session_ctx->is_return; } -__bpf_kfunc __u64 *bpf_session_cookie(void) +__bpf_kfunc __u64 *bpf_session_cookie(void *ctx) { struct bpf_session_run_ctx *session_ctx; diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index e0189254bb6e..7dad01439391 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -79,9 +79,6 @@ extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr, struct bpf_dynptr *sig_ptr, struct bpf_key *trusted_keyring) __ksym; -extern bool bpf_session_is_return(void) __ksym __weak; -extern __u64 *bpf_session_cookie(void) __ksym __weak; - struct dentry; /* Description * Returns xattr of a dentry diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c b/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c index 0835b5edf685..ad627016e3e5 100644 --- a/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c +++ b/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c @@ -1,9 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include -#include "bpf_kfuncs.h" char _license[] SEC("license") = "GPL"; @@ -23,16 +22,16 @@ int BPF_PROG(trigger) return 0; } -static int check_cookie(__u64 val, __u64 *result) +static int check_cookie(struct pt_regs *ctx, __u64 val, __u64 *result) { __u64 *cookie; if (bpf_get_current_pid_tgid() >> 32 != pid) return 1; - cookie = bpf_session_cookie(); + cookie = bpf_session_cookie(ctx); - if (bpf_session_is_return()) + if (bpf_session_is_return(ctx)) *result = *cookie == val ? val : 0; else *cookie = val; @@ -42,17 +41,17 @@ static int check_cookie(__u64 val, __u64 *result) SEC("kprobe.session/bpf_fentry_test1") int test_kprobe_1(struct pt_regs *ctx) { - return check_cookie(1, &test_kprobe_1_result); + return check_cookie(ctx, 1, &test_kprobe_1_result); } SEC("kprobe.session/bpf_fentry_test1") int test_kprobe_2(struct pt_regs *ctx) { - return check_cookie(2, &test_kprobe_2_result); + return check_cookie(ctx, 2, &test_kprobe_2_result); } SEC("kprobe.session/bpf_fentry_test1") int test_kprobe_3(struct pt_regs *ctx) { - return check_cookie(3, &test_kprobe_3_result); + return check_cookie(ctx, 3, &test_kprobe_3_result); } diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_session.c b/tools/testing/selftests/bpf/progs/uprobe_multi_session.c index 30bff90b68dc..6e46bb00ff58 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_multi_session.c +++ b/tools/testing/selftests/bpf/progs/uprobe_multi_session.c @@ -1,9 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include -#include "bpf_kfuncs.h" #include "bpf_misc.h" char _license[] SEC("license") = "GPL"; @@ -51,7 +50,7 @@ static int uprobe_multi_check(void *ctx, bool is_return) SEC("uprobe.session//proc/self/exe:uprobe_multi_func_*") int uprobe(struct pt_regs *ctx) { - return uprobe_multi_check(ctx, bpf_session_is_return()); + return uprobe_multi_check(ctx, bpf_session_is_return(ctx)); } static __always_inline bool verify_sleepable_user_copy(void) @@ -67,5 +66,5 @@ int uprobe_sleepable(struct pt_regs *ctx) { if (verify_sleepable_user_copy()) uprobe_multi_sleep_result++; - return uprobe_multi_check(ctx, bpf_session_is_return()); + return uprobe_multi_check(ctx, bpf_session_is_return(ctx)); } diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c b/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c index 5befdf944dc6..b5db196614a9 100644 --- 
a/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c +++ b/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c @@ -1,9 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include -#include "bpf_kfuncs.h" char _license[] SEC("license") = "GPL"; @@ -13,16 +12,16 @@ __u64 test_uprobe_1_result = 0; __u64 test_uprobe_2_result = 0; __u64 test_uprobe_3_result = 0; -static int check_cookie(__u64 val, __u64 *result) +static int check_cookie(struct pt_regs *ctx, __u64 val, __u64 *result) { __u64 *cookie; if (bpf_get_current_pid_tgid() >> 32 != pid) return 1; - cookie = bpf_session_cookie(); + cookie = bpf_session_cookie(ctx); - if (bpf_session_is_return()) + if (bpf_session_is_return(ctx)) *result = *cookie == val ? val : 0; else *cookie = val; @@ -32,17 +31,17 @@ static int check_cookie(__u64 val, __u64 *result) SEC("uprobe.session//proc/self/exe:uprobe_multi_func_1") int uprobe_1(struct pt_regs *ctx) { - return check_cookie(1, &test_uprobe_1_result); + return check_cookie(ctx, 1, &test_uprobe_1_result); } SEC("uprobe.session//proc/self/exe:uprobe_multi_func_2") int uprobe_2(struct pt_regs *ctx) { - return check_cookie(2, &test_uprobe_2_result); + return check_cookie(ctx, 2, &test_uprobe_2_result); } SEC("uprobe.session//proc/self/exe:uprobe_multi_func_3") int uprobe_3(struct pt_regs *ctx) { - return check_cookie(3, &test_uprobe_3_result); + return check_cookie(ctx, 3, &test_uprobe_3_result); } diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c b/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c index 8fbcd69fae22..3ce309248a04 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c +++ b/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c @@ -1,9 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include -#include "bpf_kfuncs.h" #include "bpf_misc.h" char _license[] SEC("license") = "GPL"; @@ -16,11 +15,11 @@ int idx_return = 0; __u64 test_uprobe_cookie_entry[6]; __u64 test_uprobe_cookie_return[3]; -static int check_cookie(void) +static int check_cookie(struct pt_regs *ctx) { - __u64 *cookie = bpf_session_cookie(); + __u64 *cookie = bpf_session_cookie(ctx); - if (bpf_session_is_return()) { + if (bpf_session_is_return(ctx)) { if (idx_return >= ARRAY_SIZE(test_uprobe_cookie_return)) return 1; test_uprobe_cookie_return[idx_return++] = *cookie; @@ -40,5 +39,5 @@ int uprobe_recursive(struct pt_regs *ctx) if (bpf_get_current_pid_tgid() >> 32 != pid) return 1; - return check_cookie(); + return check_cookie(ctx); } -- cgit v1.2.3 From 27d89baa6da8e5e546585c53a959176d1302d46e Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:19:59 +0800 Subject: bpf: support fsession for bpf_session_is_return If fsession exists, we will use the bit (1 << BPF_TRAMP_IS_RETURN_SHIFT) in ((u64 *)ctx)[-1] to store the "is_return" flag. 
The logic of bpf_session_is_return() for fsession is implemented in the verifier by inline following code: bool bpf_session_is_return(void *ctx) { return (((u64 *)ctx)[-1] >> BPF_TRAMP_IS_RETURN_SHIFT) & 1; } Signed-off-by: Menglong Dong Co-developed-by: Leon Hwang Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260124062008.8657-5-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ kernel/bpf/verifier.c | 13 +++++++++++++ kernel/trace/bpf_trace.c | 39 ++++++++++++++++++++++++++------------- 3 files changed, 41 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 41228b0add52..29eecd79352e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1229,6 +1229,8 @@ enum { #endif }; +#define BPF_TRAMP_IS_RETURN_SHIFT 63 + struct bpf_tramp_links { struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS]; int nr_links; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0fa73d56cb8b..d04aea235a12 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23011,6 +23011,19 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); *cnt = 1; + } else if (desc->func_id == special_kfunc_list[KF_bpf_session_is_return] && + env->prog->expected_attach_type == BPF_TRACE_FSESSION) { + /* + * inline the bpf_session_is_return() for fsession: + * bool bpf_session_is_return(void *ctx) + * { + * return (((u64 *)ctx)[-1] >> BPF_TRAMP_IS_RETURN_SHIFT) & 1; + * } + */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_IS_RETURN_SHIFT); + insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1); + *cnt = 3; } if (env->insn_aux_data[insn_idx].arg_prog) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 13f0a2de33b7..f7baeb8278ca 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1286,7 +1286,8 @@ static bool is_kprobe_multi(const struct bpf_prog *prog) static inline bool is_kprobe_session(const struct bpf_prog *prog) { - return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; + return prog->type == BPF_PROG_TYPE_KPROBE && + prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; } static inline bool is_uprobe_multi(const struct bpf_prog *prog) @@ -1297,7 +1298,14 @@ static inline bool is_uprobe_multi(const struct bpf_prog *prog) static inline bool is_uprobe_session(const struct bpf_prog *prog) { - return prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION; + return prog->type == BPF_PROG_TYPE_KPROBE && + prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION; +} + +static inline bool is_trace_fsession(const struct bpf_prog *prog) +{ + return prog->type == BPF_PROG_TYPE_TRACING && + prog->expected_attach_type == BPF_TRACE_FSESSION; } static const struct bpf_func_proto * @@ -3341,34 +3349,39 @@ __bpf_kfunc __u64 *bpf_session_cookie(void *ctx) __bpf_kfunc_end_defs(); -BTF_KFUNCS_START(kprobe_multi_kfunc_set_ids) +BTF_KFUNCS_START(session_kfunc_set_ids) BTF_ID_FLAGS(func, bpf_session_is_return) BTF_ID_FLAGS(func, bpf_session_cookie) -BTF_KFUNCS_END(kprobe_multi_kfunc_set_ids) +BTF_KFUNCS_END(session_kfunc_set_ids) -static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id) +static int bpf_session_filter(const struct bpf_prog *prog, u32 kfunc_id) { - if (!btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id)) + if 
(!btf_id_set8_contains(&session_kfunc_set_ids, kfunc_id)) return 0; - if (!is_kprobe_session(prog) && !is_uprobe_session(prog)) + if (!is_kprobe_session(prog) && !is_uprobe_session(prog) && !is_trace_fsession(prog)) return -EACCES; return 0; } -static const struct btf_kfunc_id_set bpf_kprobe_multi_kfunc_set = { +static const struct btf_kfunc_id_set bpf_session_kfunc_set = { .owner = THIS_MODULE, - .set = &kprobe_multi_kfunc_set_ids, - .filter = bpf_kprobe_multi_filter, + .set = &session_kfunc_set_ids, + .filter = bpf_session_filter, }; -static int __init bpf_kprobe_multi_kfuncs_init(void) +static int __init bpf_trace_kfuncs_init(void) { - return register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kprobe_multi_kfunc_set); + int err = 0; + + err = err ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_session_kfunc_set); + err = err ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_session_kfunc_set); + + return err; } -late_initcall(bpf_kprobe_multi_kfuncs_init); +late_initcall(bpf_trace_kfuncs_init); typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struct *tsk); -- cgit v1.2.3 From eeee4239dbb155a98f8ba737324ac081acde8417 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:20:00 +0800 Subject: bpf: support fsession for bpf_session_cookie Implement session cookie for fsession. The session cookies will be stored in the stack, and the layout of the stack will look like this: return value -> 8 bytes argN -> 8 bytes ... arg1 -> 8 bytes nr_args -> 8 bytes ip (optional) -> 8 bytes cookie2 -> 8 bytes cookie1 -> 8 bytes The offset of the cookie for the current bpf program, which is in 8-byte units, is stored in the "(((u64 *)ctx)[-1] >> BPF_TRAMP_COOKIE_INDEX_SHIFT) & 0xFF". Therefore, we can get the session cookie with ((u64 *)ctx)[-offset]. Implement and inline the bpf_session_cookie() for the fsession in the verifier. 
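For reference, a consolidated C sketch of the per-call metadata word that this and the previous patch read from ((u64 *)ctx)[-1]. The placement of nr_args in the low bits follows the existing trampoline layout and is an assumption of this sketch, not something these patches change:

        /*
         * bit  63      is_return flag   (BPF_TRAMP_IS_RETURN_SHIFT)
         * bits 8..15   cookie slot index in 8-byte units
         *              (BPF_TRAMP_COOKIE_INDEX_SHIFT, masked with 0xFF)
         * low bits     nr_args (assumed, per the existing trampoline layout)
         */
        u64 word    = ((u64 *)ctx)[-1];
        bool is_ret = (word >> BPF_TRAMP_IS_RETURN_SHIFT) & 1;
        u64 off     = (word >> BPF_TRAMP_COOKIE_INDEX_SHIFT) & 0xFF;
        u64 *cookie = &((u64 *)ctx)[-off];      /* "off" 8-byte slots below ctx */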
Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20260124062008.8657-6-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 15 +++++++++++++++ kernel/bpf/verifier.c | 20 ++++++++++++++++++++ 2 files changed, 35 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 29eecd79352e..4427c6e98331 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1229,6 +1229,7 @@ enum { #endif }; +#define BPF_TRAMP_COOKIE_INDEX_SHIFT 8 #define BPF_TRAMP_IS_RETURN_SHIFT 63 struct bpf_tramp_links { @@ -1782,6 +1783,7 @@ struct bpf_prog { enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ call_get_func_ip:1, /* Do we call get_func_ip() */ + call_session_cookie:1, /* Do we call bpf_session_cookie() */ tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */ sleepable:1; /* BPF program is sleepable */ enum bpf_prog_type type; /* Type of BPF program */ @@ -2190,6 +2192,19 @@ static inline int bpf_fsession_cnt(struct bpf_tramp_links *links) return cnt; } +static inline int bpf_fsession_cookie_cnt(struct bpf_tramp_links *links) +{ + struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY]; + int cnt = 0; + + for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) { + if (fentries.links[i]->link.prog->call_session_cookie) + cnt++; + } + + return cnt; +} + int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, const struct bpf_ctx_arg_aux *info, u32 cnt); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d04aea235a12..c2f2650db9fd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14406,6 +14406,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return err; } + if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie]) + env->prog->call_session_cookie = true; + return 0; } @@ -23024,6 +23027,23 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_IS_RETURN_SHIFT); insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1); *cnt = 3; + } else if (desc->func_id == special_kfunc_list[KF_bpf_session_cookie] && + env->prog->expected_attach_type == BPF_TRACE_FSESSION) { + /* + * inline bpf_session_cookie() for fsession: + * __u64 *bpf_session_cookie(void *ctx) + * { + * u64 off = (((u64 *)ctx)[-1] >> BPF_TRAMP_COOKIE_INDEX_SHIFT) & 0xFF; + * return &((u64 *)ctx)[-off]; + * } + */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_COOKIE_INDEX_SHIFT); + insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); + insn_buf[3] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); + insn_buf[4] = BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1); + insn_buf[5] = BPF_ALU64_IMM(BPF_NEG, BPF_REG_0, 0); + *cnt = 6; } if (env->insn_aux_data[insn_idx].arg_prog) { -- cgit v1.2.3 From 752b807028e63f1473b84eb1350e131eca5e5249 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 27 Jan 2026 08:51:10 +0000 Subject: bpf: add new BPF_CGROUP_ITER_CHILDREN control option Currently, the BPF cgroup iterator supports walking descendants in either pre-order (BPF_CGROUP_ITER_DESCENDANTS_PRE) or post-order (BPF_CGROUP_ITER_DESCENDANTS_POST). These modes perform an exhaustive depth-first search (DFS) of the hierarchy. 
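For illustration, with the child-only order added below, a BPF program can walk just the first level using the existing open-coded css iterator. This is only a sketch: the bpf_for_each() convenience macro comes from the selftests' bpf_experimental.h, cgrp is assumed to be a trusted struct cgroup pointer (e.g. obtained via bpf_cgroup_from_id()), and the RCU locking mirrors existing css-iterator selftests rather than anything in this patch:

        struct cgroup_subsys_state *child;

        bpf_rcu_read_lock();
        bpf_for_each(css, child, &cgrp->self, BPF_CGROUP_ITER_CHILDREN) {
                /* visits only the immediate children of cgrp; unlike the
                 * descendants orders, cgrp itself is never returned */
        }
        bpf_rcu_read_unlock();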
In scenarios where a BPF program may need to inspect only the direct children of a given parent cgroup, a full DFS is unnecessarily expensive. This patch introduces a new BPF cgroup iterator control option, BPF_CGROUP_ITER_CHILDREN. This control option restricts the traversal to the immediate children of a specified parent cgroup, allowing for more targeted and efficient iteration, particularly when exhaustive depth-first search (DFS) traversal is not required. Signed-off-by: Matt Bobrowski Link: https://lore.kernel.org/r/20260127085112.3608687-1-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 8 ++++++++ kernel/bpf/cgroup_iter.c | 26 +++++++++++++++++++++----- tools/include/uapi/linux/bpf.h | 8 ++++++++ 3 files changed, 37 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 44e7dbc278e3..c8d400b7680a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -119,6 +119,14 @@ enum bpf_cgroup_iter_order { BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ + /* + * Walks the immediate children of the specified parent + * cgroup_subsys_state. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE, + * BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP + * the iterator does not include the specified parent as one of the + * returned iterator elements. + */ + BPF_CGROUP_ITER_CHILDREN, }; union bpf_iter_link_info { diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c index f04a468cf6a7..fd51fe3d92cc 100644 --- a/kernel/bpf/cgroup_iter.c +++ b/kernel/bpf/cgroup_iter.c @@ -8,12 +8,13 @@ #include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */ -/* cgroup_iter provides four modes of traversal to the cgroup hierarchy. +/* cgroup_iter provides five modes of traversal to the cgroup hierarchy. * * 1. Walk the descendants of a cgroup in pre-order. * 2. Walk the descendants of a cgroup in post-order. * 3. Walk the ancestors of a cgroup. * 4. Show the given cgroup only. + * 5. Walk the children of a given parent cgroup. * * For walking descendants, cgroup_iter can walk in either pre-order or * post-order. 
For walking ancestors, the iter walks up from a cgroup to @@ -78,6 +79,8 @@ static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos) return css_next_descendant_pre(NULL, p->start_css); else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) return css_next_descendant_post(NULL, p->start_css); + else if (p->order == BPF_CGROUP_ITER_CHILDREN) + return css_next_child(NULL, p->start_css); else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */ return p->start_css; } @@ -113,6 +116,8 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) return css_next_descendant_post(curr, p->start_css); else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) return curr->parent; + else if (p->order == BPF_CGROUP_ITER_CHILDREN) + return css_next_child(curr, p->start_css); else /* BPF_CGROUP_ITER_SELF_ONLY */ return NULL; } @@ -200,11 +205,16 @@ static int bpf_iter_attach_cgroup(struct bpf_prog *prog, int order = linfo->cgroup.order; struct cgroup *cgrp; - if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE && - order != BPF_CGROUP_ITER_DESCENDANTS_POST && - order != BPF_CGROUP_ITER_ANCESTORS_UP && - order != BPF_CGROUP_ITER_SELF_ONLY) + switch (order) { + case BPF_CGROUP_ITER_DESCENDANTS_PRE: + case BPF_CGROUP_ITER_DESCENDANTS_POST: + case BPF_CGROUP_ITER_ANCESTORS_UP: + case BPF_CGROUP_ITER_SELF_ONLY: + case BPF_CGROUP_ITER_CHILDREN: + break; + default: return -EINVAL; + } if (fd && id) return -EINVAL; @@ -257,6 +267,8 @@ show_order: seq_puts(seq, "order: descendants_post\n"); else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP) seq_puts(seq, "order: ancestors_up\n"); + else if (aux->cgroup.order == BPF_CGROUP_ITER_CHILDREN) + seq_puts(seq, "order: children\n"); else /* BPF_CGROUP_ITER_SELF_ONLY */ seq_puts(seq, "order: self_only\n"); } @@ -320,6 +332,7 @@ __bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it, case BPF_CGROUP_ITER_DESCENDANTS_PRE: case BPF_CGROUP_ITER_DESCENDANTS_POST: case BPF_CGROUP_ITER_ANCESTORS_UP: + case BPF_CGROUP_ITER_CHILDREN: break; default: return -EINVAL; @@ -345,6 +358,9 @@ __bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *i case BPF_CGROUP_ITER_DESCENDANTS_POST: kit->pos = css_next_descendant_post(kit->pos, kit->start); break; + case BPF_CGROUP_ITER_CHILDREN: + kit->pos = css_next_child(kit->pos, kit->start); + break; case BPF_CGROUP_ITER_ANCESTORS_UP: kit->pos = kit->pos ? kit->pos->parent : kit->start; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3ca7d76e05f0..5e38b4887de6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -119,6 +119,14 @@ enum bpf_cgroup_iter_order { BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ + /* + * Walks the immediate children of the specified parent + * cgroup_subsys_state. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE, + * BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP + * the iterator does not include the specified parent as one of the + * returned iterator elements. 
+ */ + BPF_CGROUP_ITER_CHILDREN, }; union bpf_iter_link_info { -- cgit v1.2.3 From ae23bc81ddf7c17b663c4ed1b21e35527b0a7131 Mon Sep 17 00:00:00 2001 From: Guillaume Gonnet Date: Tue, 27 Jan 2026 17:02:00 +0100 Subject: bpf: Fix tcx/netkit detach permissions when prog fd isn't given This commit fixes a security issue where BPF_PROG_DETACH on tcx or netkit devices could be executed by any user when no program fd was provided, bypassing permission checks. The fix adds a capability check for CAP_NET_ADMIN or CAP_SYS_ADMIN in this case. Fixes: e420bed02507 ("bpf: Add fd-based tcx multi-prog infra with link support") Signed-off-by: Guillaume Gonnet Link: https://lore.kernel.org/r/20260127160200.10395-1-ggonnet.linux@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +++++ include/linux/bpf_mprog.h | 10 ++++++++++ kernel/bpf/syscall.c | 7 ++----- 3 files changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4427c6e98331..9272a237cced 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3362,6 +3362,11 @@ static inline void bpf_prog_report_arena_violation(bool write, unsigned long add } #endif /* CONFIG_BPF_SYSCALL */ +static inline bool bpf_net_capable(void) +{ + return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN); +} + static __always_inline int bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr) { diff --git a/include/linux/bpf_mprog.h b/include/linux/bpf_mprog.h index 929225f7b095..0b9f4caeeb0a 100644 --- a/include/linux/bpf_mprog.h +++ b/include/linux/bpf_mprog.h @@ -340,4 +340,14 @@ static inline bool bpf_mprog_supported(enum bpf_prog_type type) return false; } } + +static inline bool bpf_mprog_detach_empty(enum bpf_prog_type type) +{ + switch (type) { + case BPF_PROG_TYPE_SCHED_CLS: + return bpf_net_capable(); + default: + return false; + } +} #endif /* __BPF_MPROG_H */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b9184545c3fd..5f59dd47a5b1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1363,11 +1363,6 @@ free_map_tab: return ret; } -static bool bpf_net_capable(void) -{ - return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN); -} - #define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size /* called via syscall */ static int map_create(union bpf_attr *attr, bpfptr_t uattr) @@ -4579,6 +4574,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) return PTR_ERR(prog); + } else if (!bpf_mprog_detach_empty(ptype)) { + return -EPERM; } } else if (is_cgroup_prog_type(ptype, 0, false)) { if (attr->attach_flags || attr->relative_fd) -- cgit v1.2.3 From 4be42c92220128b3128854a2d6b0f0ad0bcedbdb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:02 +0100 Subject: ftrace,bpf: Remove FTRACE_OPS_FL_JMP ftrace_ops flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At the moment the we allow the jmp attach only for ftrace_ops that has FTRACE_OPS_FL_JMP set. This conflicts with following changes where we use single ftrace_ops object for all direct call sites, so all could be be attached via just call or jmp. We already limit the jmp attach support with config option and bit (LSB) set on the trampoline address. It turns out that's actually enough to limit the jmp attach for architecture and only for chosen addresses (with LSB bit set). 
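The resulting caller-side pattern looks roughly as follows (a sketch taken from the bpf trampoline conversion in the diff below, where tr is the bpf trampoline and new_addr its image address):

        unsigned long addr = (unsigned long)new_addr;

        if (bpf_trampoline_use_jmp(tr->flags))  /* per-address decision */
                addr = ftrace_jmp_set(addr);    /* tag the LSB, no ops flag needed */
        ret = register_ftrace_direct(tr->fops, addr);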
Each user of register_ftrace_direct or modify_ftrace_direct can set the trampoline bit (LSB) to indicate it has to be attached by jmp. The bpf trampoline generation code uses trampoline flags to generate jmp-attach specific code and ftrace inner code uses the trampoline bit (LSB) to handle return from jmp attachment, so there's no harm to remove the FTRACE_OPS_FL_JMP bit. The fexit/fmodret performance stays the same (did not drop), current code: fentry : 77.904 ± 0.546M/s fexit : 62.430 ± 0.554M/s fmodret : 66.503 ± 0.902M/s with this change: fentry : 80.472 ± 0.061M/s fexit : 63.995 ± 0.127M/s fmodret : 67.362 ± 0.175M/s Fixes: 25e4e3565d45 ("ftrace: Introduce FTRACE_OPS_FL_JMP") Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-2-jolsa@kernel.org --- include/linux/ftrace.h | 1 - kernel/bpf/trampoline.c | 32 ++++++++++++++------------------ kernel/trace/ftrace.c | 14 -------------- 3 files changed, 14 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index a3a8989e3268..cc869c59c1a6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -359,7 +359,6 @@ enum { FTRACE_OPS_FL_DIRECT = BIT(17), FTRACE_OPS_FL_SUBOP = BIT(18), FTRACE_OPS_FL_GRAPH = BIT(19), - FTRACE_OPS_FL_JMP = BIT(20), }; #ifndef CONFIG_DYNAMIC_FTRACE_WITH_ARGS diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index edf9da43762d..468de930b2ed 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -221,10 +221,15 @@ static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, int ret; if (tr->func.ftrace_managed) { + unsigned long addr = (unsigned long) new_addr; + + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + if (lock_direct_mutex) - ret = modify_ftrace_direct(tr->fops, (long)new_addr); + ret = modify_ftrace_direct(tr->fops, addr); else - ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr); + ret = modify_ftrace_direct_nolock(tr->fops, addr); } else { ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, new_addr); @@ -247,10 +252,15 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) } if (tr->func.ftrace_managed) { + unsigned long addr = (unsigned long) new_addr; + + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + ret = ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1); if (ret) return ret; - ret = register_ftrace_direct(tr->fops, (long)new_addr); + ret = register_ftrace_direct(tr->fops, addr); } else { ret = bpf_trampoline_update_fentry(tr, 0, NULL, new_addr); } @@ -506,13 +516,6 @@ again: if (err) goto out_free; -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP - if (bpf_trampoline_use_jmp(tr->flags)) - tr->fops->flags |= FTRACE_OPS_FL_JMP; - else - tr->fops->flags &= ~FTRACE_OPS_FL_JMP; -#endif - WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) /* progs already running at this address */ @@ -540,15 +543,8 @@ again: tr->cur_image = im; out: /* If any error happens, restore previous flags */ - if (err) { + if (err) tr->flags = orig_flags; -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP - if (bpf_trampoline_use_jmp(tr->flags)) - tr->fops->flags |= FTRACE_OPS_FL_JMP; - else - tr->fops->flags &= ~FTRACE_OPS_FL_JMP; -#endif - } kfree(tlinks); return err; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ef2d5dca6f70..6040aacc97da 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ 
-6046,15 +6046,8 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) if (ftrace_hash_empty(hash)) return -EINVAL; - /* This is a "raw" address, and this should never happen. */ - if (WARN_ON_ONCE(ftrace_is_jmp(addr))) - return -EINVAL; - mutex_lock(&direct_mutex); - if (ops->flags & FTRACE_OPS_FL_JMP) - addr = ftrace_jmp_set(addr); - /* Make sure requested entries are not already registered.. */ size = 1 << hash->size_bits; for (i = 0; i < size; i++) { @@ -6175,13 +6168,6 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) lockdep_assert_held_once(&direct_mutex); - /* This is a "raw" address, and this should never happen. */ - if (WARN_ON_ONCE(ftrace_is_jmp(addr))) - return -EINVAL; - - if (ops->flags & FTRACE_OPS_FL_JMP) - addr = ftrace_jmp_set(addr); - /* Enable the tmp_ops to have the same functions as the direct ops */ ftrace_ops_init(&tmp_ops); tmp_ops.func_hash = ops->func_hash; -- cgit v1.2.3 From 676bfeae7bd55ca405670798711ac7c46b48655d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:03 +0100 Subject: ftrace: Make alloc_and_copy_ftrace_hash direct friendly Make alloc_and_copy_ftrace_hash to copy also direct address for each hash entry. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-3-jolsa@kernel.org --- kernel/trace/ftrace.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6040aacc97da..2a63860c172e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1212,7 +1212,7 @@ static void __add_hash_entry(struct ftrace_hash *hash, } static struct ftrace_func_entry * -add_hash_entry(struct ftrace_hash *hash, unsigned long ip) +add_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigned long direct) { struct ftrace_func_entry *entry; @@ -1221,11 +1221,18 @@ add_hash_entry(struct ftrace_hash *hash, unsigned long ip) return NULL; entry->ip = ip; + entry->direct = direct; __add_hash_entry(hash, entry); return entry; } +static struct ftrace_func_entry * +add_hash_entry(struct ftrace_hash *hash, unsigned long ip) +{ + return add_hash_entry_direct(hash, ip, 0); +} + static void free_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry) @@ -1398,7 +1405,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(entry, &hash->buckets[i], hlist) { - if (add_hash_entry(new_hash, entry->ip) == NULL) + if (add_hash_entry_direct(new_hash, entry->ip, entry->direct) == NULL) goto free_hash; } } -- cgit v1.2.3 From 0e860d07c29d70205d5ad48456ac7a133ccfb293 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:04 +0100 Subject: ftrace: Export some of hash related functions We are going to use these functions in following changes. 
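A minimal sketch of how callers are expected to use the exported helpers (mirroring the later bpf trampoline patch in this series; ip and direct_addr are placeholders, error handling trimmed):

        struct ftrace_hash *hash;

        hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
        if (!hash)
                return -ENOMEM;
        if (!add_ftrace_hash_entry_direct(hash, ip, direct_addr)) {
                free_ftrace_hash(hash);
                return -ENOMEM;
        }
        /* ... hand the hash to the update_ftrace_direct_*() helpers
         *     added by the following patches ... */
        free_ftrace_hash(hash);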
Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-4-jolsa@kernel.org --- include/linux/ftrace.h | 9 +++++++++ kernel/trace/ftrace.c | 13 ++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index cc869c59c1a6..a77b57a4aba2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -82,6 +82,7 @@ static inline void early_trace_init(void) { } struct module; struct ftrace_hash; +struct ftrace_func_entry; #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_MODULES) && \ defined(CONFIG_DYNAMIC_FTRACE) @@ -405,6 +406,14 @@ enum ftrace_ops_cmd { typedef int (*ftrace_ops_func_t)(struct ftrace_ops *op, enum ftrace_ops_cmd cmd); #ifdef CONFIG_DYNAMIC_FTRACE + +#define FTRACE_HASH_DEFAULT_BITS 10 + +struct ftrace_hash *alloc_ftrace_hash(int size_bits); +void free_ftrace_hash(struct ftrace_hash *hash); +struct ftrace_func_entry *add_ftrace_hash_entry_direct(struct ftrace_hash *hash, + unsigned long ip, unsigned long direct); + /* The hash used to know what functions callbacks trace */ struct ftrace_ops_hash { struct ftrace_hash __rcu *notrace_hash; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2a63860c172e..bf0ca563e91f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -68,7 +68,6 @@ }) /* hash bits for specific function selection */ -#define FTRACE_HASH_DEFAULT_BITS 10 #define FTRACE_HASH_MAX_BITS 12 #ifdef CONFIG_DYNAMIC_FTRACE @@ -1211,8 +1210,8 @@ static void __add_hash_entry(struct ftrace_hash *hash, hash->count++; } -static struct ftrace_func_entry * -add_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigned long direct) +struct ftrace_func_entry * +add_ftrace_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigned long direct) { struct ftrace_func_entry *entry; @@ -1230,7 +1229,7 @@ add_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigned long static struct ftrace_func_entry * add_hash_entry(struct ftrace_hash *hash, unsigned long ip) { - return add_hash_entry_direct(hash, ip, 0); + return add_ftrace_hash_entry_direct(hash, ip, 0); } static void @@ -1291,7 +1290,7 @@ static void clear_ftrace_mod_list(struct list_head *head) mutex_unlock(&ftrace_lock); } -static void free_ftrace_hash(struct ftrace_hash *hash) +void free_ftrace_hash(struct ftrace_hash *hash) { if (!hash || hash == EMPTY_HASH) return; @@ -1331,7 +1330,7 @@ void ftrace_free_filter(struct ftrace_ops *ops) } EXPORT_SYMBOL_GPL(ftrace_free_filter); -static struct ftrace_hash *alloc_ftrace_hash(int size_bits) +struct ftrace_hash *alloc_ftrace_hash(int size_bits) { struct ftrace_hash *hash; int size; @@ -1405,7 +1404,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(entry, &hash->buckets[i], hlist) { - if (add_hash_entry_direct(new_hash, entry->ip, entry->direct) == NULL) + if (add_ftrace_hash_entry_direct(new_hash, entry->ip, entry->direct) == NULL) goto free_hash; } } -- cgit v1.2.3 From 05dc5e9c1fe156fd9dddc4c2f81e8fc6c7e50eb5 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:05 +0100 Subject: ftrace: Add update_ftrace_direct_add function Adding update_ftrace_direct_add function that adds all entries (ip -> addr) provided in hash argument to direct ftrace ops and updates its attachments. 
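A usage sketch (shared_ops, ip1..ip3 and tramp1..tramp3 are placeholders; error handling trimmed) of attaching several sites in one call and then extending the same ops later:

        struct ftrace_hash *hash;
        int err;

        hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
        add_ftrace_hash_entry_direct(hash, ip1, tramp1);
        add_ftrace_hash_entry_direct(hash, ip2, tramp2);
        err = update_ftrace_direct_add(&shared_ops, hash);      /* registers the ops */
        free_ftrace_hash(hash);

        /* later: attach one more site to the already registered ops */
        hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
        add_ftrace_hash_entry_direct(hash, ip3, tramp3);
        err = update_ftrace_direct_add(&shared_ops, hash);      /* just updates it */
        free_ftrace_hash(hash);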
The difference to current register_ftrace_direct is - hash argument that allows to register multiple ip -> direct entries at once - we can call update_ftrace_direct_add multiple times on the same ftrace_ops object, becase after first registration with register_ftrace_function_nolock, it uses ftrace_update_ops to update the ftrace_ops object This change will allow us to have simple ftrace_ops for all bpf direct interface users in following changes. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-5-jolsa@kernel.org --- include/linux/ftrace.h | 7 +++ kernel/trace/ftrace.c | 140 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 147 insertions(+) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index a77b57a4aba2..43a6cfaf5aae 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -543,6 +543,8 @@ int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr, int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr); int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr); +int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash); + void ftrace_stub_direct_tramp(void); #else @@ -569,6 +571,11 @@ static inline int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned l return -ENODEV; } +static inline int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash) +{ + return -ENODEV; +} + /* * This must be implemented by the architecture. * It is the way the ftrace direct_ops helper, when called diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bf0ca563e91f..97bd988b0a62 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6278,6 +6278,146 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) return err; } EXPORT_SYMBOL_GPL(modify_ftrace_direct); + +static unsigned long hash_count(struct ftrace_hash *hash) +{ + return hash ? hash->count : 0; +} + +/** + * hash_add - adds two struct ftrace_hash and returns the result + * @a: struct ftrace_hash object + * @b: struct ftrace_hash object + * + * Returns struct ftrace_hash object on success, NULL on error. + */ +static struct ftrace_hash *hash_add(struct ftrace_hash *a, struct ftrace_hash *b) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *add; + int size; + + size = hash_count(a) + hash_count(b); + if (size > 32) + size = 32; + + add = alloc_and_copy_ftrace_hash(fls(size), a); + if (!add) + return NULL; + + size = 1 << b->size_bits; + for (int i = 0; i < size; i++) { + hlist_for_each_entry(entry, &b->buckets[i], hlist) { + if (add_ftrace_hash_entry_direct(add, entry->ip, entry->direct) == NULL) { + free_ftrace_hash(add); + return NULL; + } + } + } + return add; +} + +/** + * update_ftrace_direct_add - Updates @ops by adding direct + * callers provided in @hash + * @ops: The address of the struct ftrace_ops object + * @hash: The address of the struct ftrace_hash object + * + * This is used to add custom direct callers (ip -> addr) to @ops, + * specified in @hash. The @ops will be either registered or updated. + * + * Returns: zero on success. 
Non zero on error, which includes: + * -EINVAL - The @hash is empty + */ +int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash) +{ + struct ftrace_hash *old_direct_functions = NULL; + struct ftrace_hash *new_direct_functions; + struct ftrace_hash *old_filter_hash; + struct ftrace_hash *new_filter_hash = NULL; + struct ftrace_func_entry *entry; + int err = -EINVAL; + int size; + bool reg; + + if (!hash_count(hash)) + return -EINVAL; + + mutex_lock(&direct_mutex); + + /* Make sure requested entries are not already registered. */ + size = 1 << hash->size_bits; + for (int i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + if (__ftrace_lookup_ip(direct_functions, entry->ip)) + goto out_unlock; + } + } + + old_filter_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL; + + /* If there's nothing in filter_hash we need to register the ops. */ + reg = hash_count(old_filter_hash) == 0; + if (reg) { + if (ops->func || ops->trampoline) + goto out_unlock; + if (ops->flags & FTRACE_OPS_FL_ENABLED) + goto out_unlock; + } + + err = -ENOMEM; + new_filter_hash = hash_add(old_filter_hash, hash); + if (!new_filter_hash) + goto out_unlock; + + new_direct_functions = hash_add(direct_functions, hash); + if (!new_direct_functions) + goto out_unlock; + + old_direct_functions = direct_functions; + rcu_assign_pointer(direct_functions, new_direct_functions); + + if (reg) { + ops->func = call_direct_funcs; + ops->flags |= MULTI_FLAGS; + ops->trampoline = FTRACE_REGS_ADDR; + ops->local_hash.filter_hash = new_filter_hash; + + err = register_ftrace_function_nolock(ops); + if (err) { + /* restore old filter on error */ + ops->local_hash.filter_hash = old_filter_hash; + + /* cleanup for possible another register call */ + ops->func = NULL; + ops->trampoline = 0; + } else { + new_filter_hash = old_filter_hash; + } + } else { + err = ftrace_update_ops(ops, new_filter_hash, EMPTY_HASH); + /* + * new_filter_hash is dup-ed, so we need to release it anyway, + * old_filter_hash either stays on error or is already released + */ + } + + if (err) { + /* reset direct_functions and free the new one */ + rcu_assign_pointer(direct_functions, old_direct_functions); + old_direct_functions = new_direct_functions; + } + + out_unlock: + mutex_unlock(&direct_mutex); + + if (old_direct_functions && old_direct_functions != EMPTY_HASH) + call_rcu_tasks(&old_direct_functions->rcu, register_ftrace_direct_cb); + free_ftrace_hash(new_filter_hash); + + return err; +} + #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** -- cgit v1.2.3 From 8d2c1233f37149e4d1223d06ca7054a7f88c0a24 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:06 +0100 Subject: ftrace: Add update_ftrace_direct_del function Adding update_ftrace_direct_del function that removes all entries (ip -> addr) provided in hash argument to direct ftrace ops and updates its attachments. The difference to current unregister_ftrace_direct is - hash argument that allows to unregister multiple ip -> direct entries at once - we can call update_ftrace_direct_del multiple times on the same ftrace_ops object, becase we do not need to unregister all entries at once, we can do it gradualy with the help of ftrace_update_ops function This change will allow us to have simple ftrace_ops for all bpf direct interface users in following changes. 
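A matching removal sketch, reusing the placeholder names from the addition sketch above; entries can be detached in smaller batches, and the ops is only unregistered once its filter hash becomes empty:

        hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
        add_ftrace_hash_entry_direct(hash, ip1, tramp1);        /* must match the registered direct addr */
        err = update_ftrace_direct_del(&shared_ops, hash);      /* ip2 and ip3 stay attached */
        free_ftrace_hash(hash);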
Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-6-jolsa@kernel.org --- include/linux/ftrace.h | 6 +++ kernel/trace/ftrace.c | 127 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 43a6cfaf5aae..cfb957731433 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -544,6 +544,7 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr); int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr); int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash); +int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash); void ftrace_stub_direct_tramp(void); @@ -576,6 +577,11 @@ static inline int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace return -ENODEV; } +static inline int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash) +{ + return -ENODEV; +} + /* * This must be implemented by the architecture. * It is the way the ftrace direct_ops helper, when called diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 97bd988b0a62..244bb7553d3d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6418,6 +6418,133 @@ int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash) return err; } +/** + * hash_sub - substracts @b from @a and returns the result + * @a: struct ftrace_hash object + * @b: struct ftrace_hash object + * + * Returns struct ftrace_hash object on success, NULL on error. + */ +static struct ftrace_hash *hash_sub(struct ftrace_hash *a, struct ftrace_hash *b) +{ + struct ftrace_func_entry *entry, *del; + struct ftrace_hash *sub; + int size; + + sub = alloc_and_copy_ftrace_hash(a->size_bits, a); + if (!sub) + return NULL; + + size = 1 << b->size_bits; + for (int i = 0; i < size; i++) { + hlist_for_each_entry(entry, &b->buckets[i], hlist) { + del = __ftrace_lookup_ip(sub, entry->ip); + if (WARN_ON_ONCE(!del)) { + free_ftrace_hash(sub); + return NULL; + } + remove_hash_entry(sub, del); + kfree(del); + } + } + return sub; +} + +/** + * update_ftrace_direct_del - Updates @ops by removing its direct + * callers provided in @hash + * @ops: The address of the struct ftrace_ops object + * @hash: The address of the struct ftrace_hash object + * + * This is used to delete custom direct callers (ip -> addr) in + * @ops specified via @hash. The @ops will be either unregistered + * updated. + * + * Returns: zero on success. Non zero on error, which includes: + * -EINVAL - The @hash is empty + * -EINVAL - The @ops is not registered + */ +int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash) +{ + struct ftrace_hash *old_direct_functions = NULL; + struct ftrace_hash *new_direct_functions; + struct ftrace_hash *new_filter_hash = NULL; + struct ftrace_hash *old_filter_hash; + struct ftrace_func_entry *entry; + struct ftrace_func_entry *del; + unsigned long size; + int err = -EINVAL; + + if (!hash_count(hash)) + return -EINVAL; + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + if (direct_functions == EMPTY_HASH) + return -EINVAL; + + mutex_lock(&direct_mutex); + + old_filter_hash = ops->func_hash ? 
ops->func_hash->filter_hash : NULL; + + if (!hash_count(old_filter_hash)) + goto out_unlock; + + /* Make sure requested entries are already registered. */ + size = 1 << hash->size_bits; + for (int i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + del = __ftrace_lookup_ip(direct_functions, entry->ip); + if (!del || del->direct != entry->direct) + goto out_unlock; + } + } + + err = -ENOMEM; + new_filter_hash = hash_sub(old_filter_hash, hash); + if (!new_filter_hash) + goto out_unlock; + + new_direct_functions = hash_sub(direct_functions, hash); + if (!new_direct_functions) + goto out_unlock; + + /* If there's nothing left, we need to unregister the ops. */ + if (ftrace_hash_empty(new_filter_hash)) { + err = unregister_ftrace_function(ops); + if (!err) { + /* cleanup for possible another register call */ + ops->func = NULL; + ops->trampoline = 0; + ftrace_free_filter(ops); + ops->func_hash->filter_hash = NULL; + } + } else { + err = ftrace_update_ops(ops, new_filter_hash, EMPTY_HASH); + /* + * new_filter_hash is dup-ed, so we need to release it anyway, + * old_filter_hash either stays on error or is already released + */ + } + + if (err) { + /* free the new_direct_functions */ + old_direct_functions = new_direct_functions; + } else { + rcu_assign_pointer(direct_functions, new_direct_functions); + } + + out_unlock: + mutex_unlock(&direct_mutex); + + if (old_direct_functions && old_direct_functions != EMPTY_HASH) + call_rcu_tasks(&old_direct_functions->rcu, register_ftrace_direct_cb); + free_ftrace_hash(new_filter_hash); + + return err; +} + #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** -- cgit v1.2.3 From e93672f770d72f4c33d1dd9fb0633f95948c0cc9 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:07 +0100 Subject: ftrace: Add update_ftrace_direct_mod function Adding update_ftrace_direct_mod function that modifies all entries (ip -> direct) provided in hash argument to direct ftrace ops and updates its attachments. The difference to current modify_ftrace_direct is: - hash argument that allows to modify multiple ip -> direct entries at once This change will allow us to have simple ftrace_ops for all bpf direct interface users in following changes. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-7-jolsa@kernel.org --- include/linux/ftrace.h | 6 ++++ kernel/trace/ftrace.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index cfb957731433..b8461e541b9d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -545,6 +545,7 @@ int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr); int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash); int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash); +int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, bool do_direct_lock); void ftrace_stub_direct_tramp(void); @@ -582,6 +583,11 @@ static inline int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace return -ENODEV; } +static inline int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, bool do_direct_lock) +{ + return -ENODEV; +} + /* * This must be implemented by the architecture. 
* It is the way the ftrace direct_ops helper, when called diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 244bb7553d3d..794ab5e22398 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6545,6 +6545,100 @@ int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash) return err; } +/** + * update_ftrace_direct_mod - Updates @ops by modifing its direct + * callers provided in @hash + * @ops: The address of the struct ftrace_ops object + * @hash: The address of the struct ftrace_hash object + * @do_direct_lock: If true lock the direct_mutex + * + * This is used to modify custom direct callers (ip -> addr) in + * @ops specified via @hash. + * + * This can be called from within ftrace ops_func callback with + * direct_mutex already locked, in which case @do_direct_lock + * needs to be false. + * + * Returns: zero on success. Non zero on error, which includes: + * -EINVAL - The @hash is empty + * -EINVAL - The @ops is not registered + */ +int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, bool do_direct_lock) +{ + struct ftrace_func_entry *entry, *tmp; + static struct ftrace_ops tmp_ops = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_STUB, + }; + struct ftrace_hash *orig_hash; + unsigned long size, i; + int err = -EINVAL; + + if (!hash_count(hash)) + return -EINVAL; + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + if (direct_functions == EMPTY_HASH) + return -EINVAL; + + /* + * We can be called from within ops_func callback with direct_mutex + * already taken. + */ + if (do_direct_lock) + mutex_lock(&direct_mutex); + + orig_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL; + if (!orig_hash) + goto unlock; + + /* Enable the tmp_ops to have the same functions as the direct ops */ + ftrace_ops_init(&tmp_ops); + tmp_ops.func_hash = ops->func_hash; + + err = register_ftrace_function_nolock(&tmp_ops); + if (err) + goto unlock; + + /* + * Call __ftrace_hash_update_ipmodify() here, so that we can call + * ops->ops_func for the ops. This is needed because the above + * register_ftrace_function_nolock() worked on tmp_ops. + */ + err = __ftrace_hash_update_ipmodify(ops, orig_hash, orig_hash, true); + if (err) + goto out; + + /* + * Now the ftrace_ops_list_func() is called to do the direct callers. + * We can safely change the direct functions attached to each entry. + */ + mutex_lock(&ftrace_lock); + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + tmp = __ftrace_lookup_ip(direct_functions, entry->ip); + if (!tmp) + continue; + tmp->direct = entry->direct; + } + } + + mutex_unlock(&ftrace_lock); + +out: + /* Removing the tmp_ops will add the updated direct callers to the functions */ + unregister_ftrace_function(&tmp_ops); + +unlock: + if (do_direct_lock) + mutex_unlock(&direct_mutex); + return err; +} + #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** -- cgit v1.2.3 From 7d0452497c292153e690652e6df218fead21185f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:08 +0100 Subject: bpf: Add trampoline ip hash table Following changes need to lookup trampoline based on its ip address, adding hash table for that. 
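The ip-based lookup that later patches build on top of this table looks roughly like this (a sketch of direct_ops_ip_lookup() from the final patch of the series; trampoline_mutex locking omitted):

        head = &trampoline_ip_table[hash_64(ip, TRAMPOLINE_HASH_BITS)];
        hlist_for_each_entry(tr, head, hlist_ip) {
                if (tr->ip == ip)
                        return tr;      /* trampoline attached at this ftrace site */
        }
        return NULL;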
Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251230145010.103439-8-jolsa@kernel.org --- include/linux/bpf.h | 7 +++++-- kernel/bpf/trampoline.c | 30 +++++++++++++++++++----------- 2 files changed, 24 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9272a237cced..5524f9429e76 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1329,14 +1329,17 @@ struct bpf_tramp_image { }; struct bpf_trampoline { - /* hlist for trampoline_table */ - struct hlist_node hlist; + /* hlist for trampoline_key_table */ + struct hlist_node hlist_key; + /* hlist for trampoline_ip_table */ + struct hlist_node hlist_ip; struct ftrace_ops *fops; /* serializes access to fields of this trampoline */ struct mutex mutex; refcount_t refcnt; u32 flags; u64 key; + unsigned long ip; struct { struct btf_func_model model; void *addr; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 468de930b2ed..1d4f618c2402 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -24,9 +24,10 @@ const struct bpf_prog_ops bpf_extension_prog_ops = { #define TRAMPOLINE_HASH_BITS 10 #define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS) -static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; +static struct hlist_head trampoline_key_table[TRAMPOLINE_TABLE_SIZE]; +static struct hlist_head trampoline_ip_table[TRAMPOLINE_TABLE_SIZE]; -/* serializes access to trampoline_table */ +/* serializes access to trampoline tables */ static DEFINE_MUTEX(trampoline_mutex); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS @@ -142,15 +143,15 @@ void bpf_image_ksym_del(struct bpf_ksym *ksym) PAGE_SIZE, true, ksym->name); } -static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) +static struct bpf_trampoline *bpf_trampoline_lookup(u64 key, unsigned long ip) { struct bpf_trampoline *tr; struct hlist_head *head; int i; mutex_lock(&trampoline_mutex); - head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)]; - hlist_for_each_entry(tr, head, hlist) { + head = &trampoline_key_table[hash_64(key, TRAMPOLINE_HASH_BITS)]; + hlist_for_each_entry(tr, head, hlist_key) { if (tr->key == key) { refcount_inc(&tr->refcnt); goto out; @@ -171,8 +172,12 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) #endif tr->key = key; - INIT_HLIST_NODE(&tr->hlist); - hlist_add_head(&tr->hlist, head); + tr->ip = ftrace_location(ip); + INIT_HLIST_NODE(&tr->hlist_key); + INIT_HLIST_NODE(&tr->hlist_ip); + hlist_add_head(&tr->hlist_key, head); + head = &trampoline_ip_table[hash_64(tr->ip, TRAMPOLINE_HASH_BITS)]; + hlist_add_head(&tr->hlist_ip, head); refcount_set(&tr->refcnt, 1); mutex_init(&tr->mutex); for (i = 0; i < BPF_TRAMP_MAX; i++) @@ -883,7 +888,7 @@ void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) prog->aux->attach_btf_id); bpf_lsm_find_cgroup_shim(prog, &bpf_func); - tr = bpf_trampoline_lookup(key); + tr = bpf_trampoline_lookup(key, 0); if (WARN_ON_ONCE(!tr)) return; @@ -903,7 +908,7 @@ struct bpf_trampoline *bpf_trampoline_get(u64 key, { struct bpf_trampoline *tr; - tr = bpf_trampoline_lookup(key); + tr = bpf_trampoline_lookup(key, tgt_info->tgt_addr); if (!tr) return NULL; @@ -939,7 +944,8 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) * fexit progs. The fentry-only trampoline will be freed via * multiple rcu callbacks. 
*/ - hlist_del(&tr->hlist); + hlist_del(&tr->hlist_key); + hlist_del(&tr->hlist_ip); if (tr->fops) { ftrace_free_filter(tr->fops); kfree(tr->fops); @@ -1212,7 +1218,9 @@ static int __init init_trampolines(void) int i; for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++) - INIT_HLIST_HEAD(&trampoline_table[i]); + INIT_HLIST_HEAD(&trampoline_key_table[i]); + for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++) + INIT_HLIST_HEAD(&trampoline_ip_table[i]); return 0; } late_initcall(init_trampolines); -- cgit v1.2.3 From 956747efd82aa60e0ac3e6aef2b9a17adda6f3b1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:09 +0100 Subject: ftrace: Factor ftrace_ops ops_func interface We are going to remove "ftrace_ops->private == bpf_trampoline" setup in following changes. Adding ip argument to ftrace_ops_func_t callback function, so we can use it to look up the trampoline. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-9-jolsa@kernel.org --- include/linux/ftrace.h | 2 +- kernel/bpf/trampoline.c | 3 ++- kernel/trace/ftrace.c | 6 +++--- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b8461e541b9d..705db0a6d995 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -403,7 +403,7 @@ enum ftrace_ops_cmd { * Negative on failure. The return value is dependent on the * callback. */ -typedef int (*ftrace_ops_func_t)(struct ftrace_ops *op, enum ftrace_ops_cmd cmd); +typedef int (*ftrace_ops_func_t)(struct ftrace_ops *op, unsigned long ip, enum ftrace_ops_cmd cmd); #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 1d4f618c2402..64556aa0007b 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -33,7 +33,8 @@ static DEFINE_MUTEX(trampoline_mutex); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex); -static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd) +static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip, + enum ftrace_ops_cmd cmd) { struct bpf_trampoline *tr = ops->private; int ret = 0; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 794ab5e22398..ee2d7b8a4372 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2075,7 +2075,7 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, */ if (!ops->ops_func) return -EBUSY; - ret = ops->ops_func(ops, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF); + ret = ops->ops_func(ops, rec->ip, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF); if (ret) return ret; } else if (is_ipmodify) { @@ -9061,7 +9061,7 @@ static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops) if (!op->ops_func) return -EBUSY; - ret = op->ops_func(op, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER); + ret = op->ops_func(op, ip, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER); if (ret) return ret; } @@ -9108,7 +9108,7 @@ static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops) /* The cleanup is optional, ignore any errors */ if (found_op && op->ops_func) - op->ops_func(op, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER); + op->ops_func(op, ip, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER); } } mutex_unlock(&direct_mutex); -- cgit v1.2.3 From 424f6a3610965d125634bc65c5d6d506582d4f7e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:10 +0100 
Subject: bpf,x86: Use single ftrace_ops for direct calls Using single ftrace_ops for direct calls update instead of allocating ftrace_ops object for each trampoline. With single ftrace_ops object we can use update_ftrace_direct_* api that allows multiple ip sites updates on single ftrace_ops object. Adding HAVE_SINGLE_FTRACE_DIRECT_OPS config option to be enabled on each arch that supports this. At the moment we can enable this only on x86 arch, because arm relies on ftrace_ops object representing just single trampoline image (stored in ftrace_ops::direct_call). Archs that do not support this will continue to use *_ftrace_direct api. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-10-jolsa@kernel.org --- arch/x86/Kconfig | 1 + kernel/bpf/trampoline.c | 220 +++++++++++++++++++++++++++++++++++++++++------- kernel/trace/Kconfig | 3 + kernel/trace/ftrace.c | 7 +- 4 files changed, 200 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 80527299f859..53bf2cf7ff6f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -336,6 +336,7 @@ config X86 select SCHED_SMT if SMP select ARCH_SUPPORTS_SCHED_CLUSTER if SMP select ARCH_SUPPORTS_SCHED_MC if SMP + select HAVE_SINGLE_FTRACE_DIRECT_OPS if X86_64 && DYNAMIC_FTRACE_WITH_DIRECT_CALLS config INSTRUCTION_DECODER def_bool y diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 64556aa0007b..952cd7932461 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -33,12 +33,40 @@ static DEFINE_MUTEX(trampoline_mutex); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex); +#ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS +static struct bpf_trampoline *direct_ops_ip_lookup(struct ftrace_ops *ops, unsigned long ip) +{ + struct hlist_head *head_ip; + struct bpf_trampoline *tr; + + mutex_lock(&trampoline_mutex); + head_ip = &trampoline_ip_table[hash_64(ip, TRAMPOLINE_HASH_BITS)]; + hlist_for_each_entry(tr, head_ip, hlist_ip) { + if (tr->ip == ip) + goto out; + } + tr = NULL; +out: + mutex_unlock(&trampoline_mutex); + return tr; +} +#else +static struct bpf_trampoline *direct_ops_ip_lookup(struct ftrace_ops *ops, unsigned long ip) +{ + return ops->private; +} +#endif /* CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS */ + static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip, enum ftrace_ops_cmd cmd) { - struct bpf_trampoline *tr = ops->private; + struct bpf_trampoline *tr; int ret = 0; + tr = direct_ops_ip_lookup(ops, ip); + if (!tr) + return -EINVAL; + if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) { /* This is called inside register_ftrace_direct_multi(), so * tr->mutex is already locked. @@ -144,6 +172,162 @@ void bpf_image_ksym_del(struct bpf_ksym *ksym) PAGE_SIZE, true, ksym->name); } +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +#ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS +/* + * We have only single direct_ops which contains all the direct call + * sites and is the only global ftrace_ops for all trampolines. + * + * We use 'update_ftrace_direct_*' api for attachment. 
+ */ +struct ftrace_ops direct_ops = { + .ops_func = bpf_tramp_ftrace_ops_func, +}; + +static int direct_ops_alloc(struct bpf_trampoline *tr) +{ + tr->fops = &direct_ops; + return 0; +} + +static void direct_ops_free(struct bpf_trampoline *tr) { } + +static struct ftrace_hash *hash_from_ip(struct bpf_trampoline *tr, void *ptr) +{ + unsigned long ip, addr = (unsigned long) ptr; + struct ftrace_hash *hash; + + ip = ftrace_location(tr->ip); + if (!ip) + return NULL; + hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + if (!hash) + return NULL; + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + if (!add_ftrace_hash_entry_direct(hash, ip, addr)) { + free_ftrace_hash(hash); + return NULL; + } + return hash; +} + +static int direct_ops_add(struct bpf_trampoline *tr, void *addr) +{ + struct ftrace_hash *hash = hash_from_ip(tr, addr); + int err; + + if (!hash) + return -ENOMEM; + err = update_ftrace_direct_add(tr->fops, hash); + free_ftrace_hash(hash); + return err; +} + +static int direct_ops_del(struct bpf_trampoline *tr, void *addr) +{ + struct ftrace_hash *hash = hash_from_ip(tr, addr); + int err; + + if (!hash) + return -ENOMEM; + err = update_ftrace_direct_del(tr->fops, hash); + free_ftrace_hash(hash); + return err; +} + +static int direct_ops_mod(struct bpf_trampoline *tr, void *addr, bool lock_direct_mutex) +{ + struct ftrace_hash *hash = hash_from_ip(tr, addr); + int err; + + if (!hash) + return -ENOMEM; + err = update_ftrace_direct_mod(tr->fops, hash, lock_direct_mutex); + free_ftrace_hash(hash); + return err; +} +#else +/* + * We allocate ftrace_ops object for each trampoline and it contains + * call site specific for that trampoline. + * + * We use *_ftrace_direct api for attachment. + */ +static int direct_ops_alloc(struct bpf_trampoline *tr) +{ + tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL); + if (!tr->fops) + return -ENOMEM; + tr->fops->private = tr; + tr->fops->ops_func = bpf_tramp_ftrace_ops_func; + return 0; +} + +static void direct_ops_free(struct bpf_trampoline *tr) +{ + if (!tr->fops) + return; + ftrace_free_filter(tr->fops); + kfree(tr->fops); +} + +static int direct_ops_add(struct bpf_trampoline *tr, void *ptr) +{ + unsigned long addr = (unsigned long) ptr; + struct ftrace_ops *ops = tr->fops; + int ret; + + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + + ret = ftrace_set_filter_ip(ops, tr->ip, 0, 1); + if (ret) + return ret; + return register_ftrace_direct(ops, addr); +} + +static int direct_ops_del(struct bpf_trampoline *tr, void *addr) +{ + return unregister_ftrace_direct(tr->fops, (long)addr, false); +} + +static int direct_ops_mod(struct bpf_trampoline *tr, void *ptr, bool lock_direct_mutex) +{ + unsigned long addr = (unsigned long) ptr; + struct ftrace_ops *ops = tr->fops; + + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + if (lock_direct_mutex) + return modify_ftrace_direct(ops, addr); + return modify_ftrace_direct_nolock(ops, addr); +} +#endif /* CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS */ +#else +static void direct_ops_free(struct bpf_trampoline *tr) { } + +static int direct_ops_alloc(struct bpf_trampoline *tr) +{ + return 0; +} + +static int direct_ops_add(struct bpf_trampoline *tr, void *addr) +{ + return -ENODEV; +} + +static int direct_ops_del(struct bpf_trampoline *tr, void *addr) +{ + return -ENODEV; +} + +static int direct_ops_mod(struct bpf_trampoline *tr, void *ptr, bool lock_direct_mutex) +{ + return -ENODEV; +} +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + 
static struct bpf_trampoline *bpf_trampoline_lookup(u64 key, unsigned long ip) { struct bpf_trampoline *tr; @@ -161,16 +345,11 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key, unsigned long ip) tr = kzalloc(sizeof(*tr), GFP_KERNEL); if (!tr) goto out; -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS - tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL); - if (!tr->fops) { + if (direct_ops_alloc(tr)) { kfree(tr); tr = NULL; goto out; } - tr->fops->private = tr; - tr->fops->ops_func = bpf_tramp_ftrace_ops_func; -#endif tr->key = key; tr->ip = ftrace_location(ip); @@ -213,7 +392,7 @@ static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, int ret; if (tr->func.ftrace_managed) - ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false); + ret = direct_ops_del(tr, old_addr); else ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, NULL); @@ -227,15 +406,7 @@ static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, int ret; if (tr->func.ftrace_managed) { - unsigned long addr = (unsigned long) new_addr; - - if (bpf_trampoline_use_jmp(tr->flags)) - addr = ftrace_jmp_set(addr); - - if (lock_direct_mutex) - ret = modify_ftrace_direct(tr->fops, addr); - else - ret = modify_ftrace_direct_nolock(tr->fops, addr); + ret = direct_ops_mod(tr, new_addr, lock_direct_mutex); } else { ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, new_addr); @@ -258,15 +429,7 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) } if (tr->func.ftrace_managed) { - unsigned long addr = (unsigned long) new_addr; - - if (bpf_trampoline_use_jmp(tr->flags)) - addr = ftrace_jmp_set(addr); - - ret = ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1); - if (ret) - return ret; - ret = register_ftrace_direct(tr->fops, addr); + ret = direct_ops_add(tr, new_addr); } else { ret = bpf_trampoline_update_fentry(tr, 0, NULL, new_addr); } @@ -947,10 +1110,7 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) */ hlist_del(&tr->hlist_key); hlist_del(&tr->hlist_ip); - if (tr->fops) { - ftrace_free_filter(tr->fops); - kfree(tr->fops); - } + direct_ops_free(tr); kfree(tr); out: mutex_unlock(&trampoline_mutex); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index bfa2ec46e075..d7042a09fe46 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -50,6 +50,9 @@ config HAVE_DYNAMIC_FTRACE_WITH_REGS config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS bool +config HAVE_SINGLE_FTRACE_DIRECT_OPS + bool + config HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS bool diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ee2d7b8a4372..8574932e66b6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2631,8 +2631,13 @@ unsigned long ftrace_find_rec_direct(unsigned long ip) static void call_direct_funcs(unsigned long ip, unsigned long pip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { - unsigned long addr = READ_ONCE(ops->direct_call); + unsigned long addr; +#ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS + addr = ftrace_find_rec_direct(ip); +#else + addr = READ_ONCE(ops->direct_call); +#endif if (!addr) return; -- cgit v1.2.3 From cd3b6a3d49f8061d0c4c7e4226783051fe592ae7 Mon Sep 17 00:00:00 2001 From: Luis Gerhorst Date: Tue, 27 Jan 2026 12:59:11 +0100 Subject: bpf: Fix verifier_bug_if to account for BPF_CALL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The BPF verifier assumes `insn_aux->nospec_result` is only set for direct memory writes (e.g., `*(u32*)(r1+off) = r2`). 
However, the assertion fails to account for helper calls (e.g., `bpf_skb_load_bytes_relative`) that perform writes to stack memory. Make the check more precise to resolve this. The problem is that `BPF_CALL` instructions have `BPF_CLASS(insn->code) == BPF_JMP`, which triggers the warning check: - Helpers like `bpf_skb_load_bytes_relative` write to stack memory - `check_helper_call()` loops through `meta.access_size`, calling `check_mem_access(..., BPF_WRITE)` - `check_stack_write()` sets `insn_aux->nospec_result = 1` - Since `BPF_CALL` is encoded as `BPF_JMP | BPF_CALL`, the warning fires Execution flow: ``` 1. Drop capabilities → Enable Spectre mitigation 2. Load BPF program └─> do_check() ├─> check_cond_jmp_op() → Marks dead branch as speculative │ └─> push_stack(..., speculative=true) ├─> pop_stack() → state->speculative = 1 ├─> check_helper_call() → Processes helper in dead branch │ └─> check_mem_access(..., BPF_WRITE) │ └─> insn_aux->nospec_result = 1 └─> Checks: state->speculative && insn_aux->nospec_result └─> BPF_CLASS(insn->code) == BPF_JMP → WARNING ``` To fix the assert, it would be nice to be able to reuse bpf_insn_successors() here, but bpf_insn_successors()->cnt is not exactly what we want as it may also be 1 for BPF_JA. Instead, we could check opcode_info.can_jump, but then we would have to share the table between the functions. This would mean moving the table out of the function and adding bpf_opcode_info(). As the verifier_bug_if() only runs for insns with nospec_result set, the impact on verification time would likely still be negligible. However, I assume sharing bpf_opcode_info() between liveness.c and verifier.c will not be worth it. It seems as only adjust_jmp_off() could also be simplified using it, and there imm/off is touched. Thus it is maybe better to rely on exact opcode/class matching there. Therefore, to avoid this sharing only for a verifier_bug_if(), just check the opcode. This should now cover all opcodes for which can_jump in bpf_insn_successors() is true. Parts of the description and example are taken from the bug report. Fixes: dadb59104c64 ("bpf: Fix aux usage after do_check_insn()") Signed-off-by: Luis Gerhorst Reported-by: Yinhao Hu Reported-by: Kaiyan Mei Reported-by: Dongliang Mu Closes: https://lore.kernel.org/bpf/7678017d-b760-4053-a2d8-a6879b0dbeeb@hust.edu.cn/ Link: https://lore.kernel.org/r/20260127115912.3026761-2-luis.gerhorst@fau.de Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c2f2650db9fd..e7ff8394e0da 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21065,17 +21065,19 @@ static int do_check(struct bpf_verifier_env *env) * may skip a nospec patched-in after the jump. This can * currently never happen because nospec_result is only * used for the write-ops - * `*(size*)(dst_reg+off)=src_reg|imm32` which must - * never skip the following insn. Still, add a warning - * to document this in case nospec_result is used - * elsewhere in the future. + * `*(size*)(dst_reg+off)=src_reg|imm32` and helper + * calls. These must never skip the following insn + * (i.e., bpf_insn_successors()'s opcode_info.can_jump + * is false). Still, add a warning to document this in + * case nospec_result is used elsewhere in the future. * * All non-branch instructions have a single * fall-through edge. For these, nospec_result should * already work. 
*/ - if (verifier_bug_if(BPF_CLASS(insn->code) == BPF_JMP || - BPF_CLASS(insn->code) == BPF_JMP32, env, + if (verifier_bug_if((BPF_CLASS(insn->code) == BPF_JMP || + BPF_CLASS(insn->code) == BPF_JMP32) && + BPF_OP(insn->code) != BPF_CALL, env, "speculation barrier after jump instruction may not have the desired effect")) return -EFAULT; process_bpf_exit: -- cgit v1.2.3 From 0f0c332992b8a5d2ae7b611b94c4e02ef8d54b97 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 30 Jan 2026 09:12:07 +0100 Subject: bpf: Allow sleepable programs to use tail calls Allowing sleepable programs to use tail calls. Making sure we can't mix sleepable and non-sleepable bpf programs in tail call map (BPF_MAP_TYPE_PROG_ARRAY) and allowing it to be used in sleepable programs. Sleepable programs can be preempted and sleep which might bring new source of race conditions, but both direct and indirect tail calls should not be affected. Direct tail calls work by patching direct jump to callee into bpf caller program, so no problem there. We atomically switch from nop to jump instruction. Indirect tail call reads the callee from the map and then jumps to it. The callee bpf program can't disappear (be released) from the caller, because it is executed under rcu lock (rcu_read_lock_trace). Signed-off-by: Jiri Olsa Acked-by: Leon Hwang Link: https://lore.kernel.org/r/20260130081208.1130204-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/core.c | 4 +++- kernel/bpf/verifier.c | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5524f9429e76..3b0ceb759075 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -287,6 +287,7 @@ struct bpf_map_owner { enum bpf_prog_type type; bool jited; bool xdp_has_frags; + bool sleepable; u64 storage_cookie[MAX_BPF_CGROUP_STORAGE_TYPE]; const struct btf_type *attach_func_proto; enum bpf_attach_type expected_attach_type; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e0b8a8a5aaa9..5ebece600aeb 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2401,6 +2401,7 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, map->owner->type = prog_type; map->owner->jited = fp->jited; map->owner->xdp_has_frags = aux->xdp_has_frags; + map->owner->sleepable = fp->sleepable; map->owner->expected_attach_type = fp->expected_attach_type; map->owner->attach_func_proto = aux->attach_func_proto; for_each_cgroup_storage_type(i) { @@ -2412,7 +2413,8 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, } else { ret = map->owner->type == prog_type && map->owner->jited == fp->jited && - map->owner->xdp_has_frags == aux->xdp_has_frags; + map->owner->xdp_has_frags == aux->xdp_has_frags && + map->owner->sleepable == fp->sleepable; if (ret && map->map_type == BPF_MAP_TYPE_PROG_ARRAY && map->owner->expected_attach_type != fp->expected_attach_type) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e7ff8394e0da..f185ebc6748d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21386,6 +21386,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_STACK: case BPF_MAP_TYPE_ARENA: case BPF_MAP_TYPE_INSN_ARRAY: + case BPF_MAP_TYPE_PROG_ARRAY: break; default: verbose(env, -- cgit v1.2.3 From 98c4fd2963cb2bc5eae22b5365e6bbf3a5cd0f14 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 30 Jan 2026 20:42:10 +0000 Subject: bpf: Introduce struct bpf_map_desc in verifier Introduce struct bpf_map_desc to 
hold bpf_map pointer and map uid. Use this struct in both bpf_call_arg_meta and bpf_kfunc_call_arg_meta instead of having different representations: - bpf_call_arg_meta had separate map_ptr and map_uid fields - bpf_kfunc_call_arg_meta had an anonymous inline struct This unifies the map fields layout across both metadata structures, making the code more consistent and preparing for further refactoring of map field pointer validation. Acked-by: Eduard Zingerman Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260130-verif_special_fields-v2-1-2c59e637da7d@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 79 ++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f185ebc6748d..c79463d50b37 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -272,8 +272,13 @@ static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn) insn->src_reg == BPF_PSEUDO_KFUNC_CALL; } +struct bpf_map_desc { + struct bpf_map *ptr; + int uid; +}; + struct bpf_call_arg_meta { - struct bpf_map *map_ptr; + struct bpf_map_desc map; bool raw_mode; bool pkt_access; u8 release_regno; @@ -283,7 +288,6 @@ struct bpf_call_arg_meta { u64 msize_max_value; int ref_obj_id; int dynptr_id; - int map_uid; int func_id; struct btf *btf; u32 btf_id; @@ -351,10 +355,7 @@ struct bpf_kfunc_call_arg_meta { u8 spi; u8 frameno; } iter; - struct { - struct bpf_map *ptr; - int uid; - } map; + struct bpf_map_desc map; u64 mem_size; }; @@ -8666,7 +8667,7 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, if (err) return err; - if (meta->map_ptr) { + if (meta->map.ptr) { verifier_bug(env, "Two map pointers in a timer helper"); return -EFAULT; } @@ -8674,8 +8675,8 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n"); return -EOPNOTSUPP; } - meta->map_uid = reg->map_uid; - meta->map_ptr = map; + meta->map.uid = reg->map_uid; + meta->map.ptr = map; return 0; } @@ -8739,7 +8740,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, return -EINVAL; } rec = map_ptr->record; - meta->map_ptr = map_ptr; + meta->map.ptr = map_ptr; } if (!tnum_is_const(reg->var_off)) { @@ -9246,13 +9247,13 @@ static int resolve_map_arg_type(struct bpf_verifier_env *env, const struct bpf_call_arg_meta *meta, enum bpf_arg_type *arg_type) { - if (!meta->map_ptr) { + if (!meta->map.ptr) { /* kernel subsystem misconfigured verifier */ verifier_bug(env, "invalid map_ptr to access map->type"); return -EFAULT; } - switch (meta->map_ptr->map_type) { + switch (meta->map.ptr->map_type) { case BPF_MAP_TYPE_SOCKMAP: case BPF_MAP_TYPE_SOCKHASH: if (*arg_type == ARG_PTR_TO_MAP_VALUE) { @@ -9906,7 +9907,7 @@ skip_type_check: switch (base_type(arg_type)) { case ARG_CONST_MAP_PTR: /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ - if (meta->map_ptr) { + if (meta->map.ptr) { /* Use map_uid (which is unique id of inner map) to reject: * inner_map1 = bpf_map_lookup_elem(outer_map, key1) * inner_map2 = bpf_map_lookup_elem(outer_map, key2) @@ -9919,23 +9920,23 @@ skip_type_check: * * Comparing map_ptr is enough to distinguish normal and outer maps. 
*/ - if (meta->map_ptr != reg->map_ptr || - meta->map_uid != reg->map_uid) { + if (meta->map.ptr != reg->map_ptr || + meta->map.uid != reg->map_uid) { verbose(env, "timer pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n", - meta->map_uid, reg->map_uid); + meta->map.uid, reg->map_uid); return -EINVAL; } } - meta->map_ptr = reg->map_ptr; - meta->map_uid = reg->map_uid; + meta->map.ptr = reg->map_ptr; + meta->map.uid = reg->map_uid; break; case ARG_PTR_TO_MAP_KEY: /* bpf_map_xxx(..., map_ptr, ..., key) call: * check that [key, key + map->key_size) are within * stack limits and initialized */ - if (!meta->map_ptr) { + if (!meta->map.ptr) { /* in function declaration map_ptr must come before * map_key, so that it's verified and known before * we have to check map_key here. Otherwise it means @@ -9944,11 +9945,11 @@ skip_type_check: verifier_bug(env, "invalid map_ptr to access map->key"); return -EFAULT; } - key_size = meta->map_ptr->key_size; + key_size = meta->map.ptr->key_size; err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL); if (err) return err; - if (can_elide_value_nullness(meta->map_ptr->map_type)) { + if (can_elide_value_nullness(meta->map.ptr->map_type)) { err = get_constant_map_key(env, reg, key_size, &meta->const_map_key); if (err < 0) { meta->const_map_key = -1; @@ -9966,13 +9967,13 @@ skip_type_check: /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity */ - if (!meta->map_ptr) { + if (!meta->map.ptr) { /* kernel subsystem misconfigured verifier */ verifier_bug(env, "invalid map_ptr to access map->value"); return -EFAULT; } meta->raw_mode = arg_type & MEM_UNINIT; - err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, + err = check_helper_mem_access(env, regno, meta->map.ptr->value_size, arg_type & MEM_WRITE ? 
BPF_WRITE : BPF_READ, false, meta); break; @@ -11310,7 +11311,7 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, int func_id, int insn_idx) { struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; - struct bpf_map *map = meta->map_ptr; + struct bpf_map *map = meta->map.ptr; if (func_id != BPF_FUNC_tail_call && func_id != BPF_FUNC_map_lookup_elem && @@ -11343,11 +11344,11 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, } if (!aux->map_ptr_state.map_ptr) - bpf_map_ptr_store(aux, meta->map_ptr, - !meta->map_ptr->bypass_spec_v1, false); - else if (aux->map_ptr_state.map_ptr != meta->map_ptr) - bpf_map_ptr_store(aux, meta->map_ptr, - !meta->map_ptr->bypass_spec_v1, true); + bpf_map_ptr_store(aux, meta->map.ptr, + !meta->map.ptr->bypass_spec_v1, false); + else if (aux->map_ptr_state.map_ptr != meta->map.ptr) + bpf_map_ptr_store(aux, meta->map.ptr, + !meta->map.ptr->bypass_spec_v1, true); return 0; } @@ -11357,7 +11358,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, { struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; struct bpf_reg_state *reg; - struct bpf_map *map = meta->map_ptr; + struct bpf_map *map = meta->map.ptr; u64 val, max; int err; @@ -11913,22 +11914,22 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() */ - if (meta.map_ptr == NULL) { + if (meta.map.ptr == NULL) { verifier_bug(env, "unexpected null map_ptr"); return -EFAULT; } if (func_id == BPF_FUNC_map_lookup_elem && - can_elide_value_nullness(meta.map_ptr->map_type) && + can_elide_value_nullness(meta.map.ptr->map_type) && meta.const_map_key >= 0 && - meta.const_map_key < meta.map_ptr->max_entries) + meta.const_map_key < meta.map.ptr->max_entries) ret_flag &= ~PTR_MAYBE_NULL; - regs[BPF_REG_0].map_ptr = meta.map_ptr; - regs[BPF_REG_0].map_uid = meta.map_uid; + regs[BPF_REG_0].map_ptr = meta.map.ptr; + regs[BPF_REG_0].map_uid = meta.map.uid; regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; if (!type_may_be_null(ret_flag) && - btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) { + btf_record_has_field(meta.map.ptr->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) { regs[BPF_REG_0].id = ++env->id_gen; } break; @@ -12031,7 +12032,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (type_may_be_null(regs[BPF_REG_0].type)) regs[BPF_REG_0].id = ++env->id_gen; - if (helper_multiple_ref_obj_use(func_id, meta.map_ptr)) { + if (helper_multiple_ref_obj_use(func_id, meta.map.ptr)) { verifier_bug(env, "func %s#%d sets ref_obj_id more than once", func_id_name(func_id), func_id); return -EFAULT; @@ -12043,7 +12044,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; - } else if (is_acquire_function(func_id, meta.map_ptr)) { + } else if (is_acquire_function(func_id, meta.map.ptr)) { int id = acquire_reference(env, insn_idx); if (id < 0) @@ -12058,7 +12059,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (err) return err; - err = check_map_func_compatibility(env, meta.map_ptr, func_id); + err = check_map_func_compatibility(env, meta.map.ptr, func_id); if (err) return err; -- cgit v1.2.3 From 
f4e72ad7c161d6ee1466b42ce0acc5c4eb6164dd Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 30 Jan 2026 20:42:11 +0000 Subject: bpf: Consolidate special map field validation in verifier Consolidate all logic for verifying special map fields in the single function check_map_field_pointer(). Acked-by: Eduard Zingerman Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260130-verif_special_fields-v2-2-2c59e637da7d@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 70 ++++++++------------------------------------------- 1 file changed, 11 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c79463d50b37..256cc5c1a7df 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8610,7 +8610,8 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) /* Check if @regno is a pointer to a specific field in a map value */ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, - enum btf_field_type field_type) + enum btf_field_type field_type, + struct bpf_map_desc *map_desc) { struct bpf_reg_state *reg = reg_state(env, regno); bool is_const = tnum_is_const(reg->var_off); @@ -8653,72 +8654,23 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, val + reg->off, struct_name, field_off); return -EINVAL; } + if (map_desc->ptr) { + verifier_bug(env, "Two map pointers in a %s helper", struct_name); + return -EFAULT; + } + map_desc->uid = reg->map_uid; + map_desc->ptr = map; return 0; } static int process_timer_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *reg = reg_state(env, regno); - struct bpf_map *map = reg->map_ptr; - int err; - - err = check_map_field_pointer(env, regno, BPF_TIMER); - if (err) - return err; - - if (meta->map.ptr) { - verifier_bug(env, "Two map pointers in a timer helper"); - return -EFAULT; - } if (IS_ENABLED(CONFIG_PREEMPT_RT)) { verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n"); return -EOPNOTSUPP; } - meta->map.uid = reg->map_uid; - meta->map.ptr = map; - return 0; -} - -static int process_wq_func(struct bpf_verifier_env *env, int regno, - struct bpf_kfunc_call_arg_meta *meta) -{ - struct bpf_reg_state *reg = reg_state(env, regno); - struct bpf_map *map = reg->map_ptr; - int err; - - err = check_map_field_pointer(env, regno, BPF_WORKQUEUE); - if (err) - return err; - - if (meta->map.ptr) { - verifier_bug(env, "Two map pointers in a bpf_wq helper"); - return -EFAULT; - } - - meta->map.uid = reg->map_uid; - meta->map.ptr = map; - return 0; -} - -static int process_task_work_func(struct bpf_verifier_env *env, int regno, - struct bpf_kfunc_call_arg_meta *meta) -{ - struct bpf_reg_state *reg = reg_state(env, regno); - struct bpf_map *map = reg->map_ptr; - int err; - - err = check_map_field_pointer(env, regno, BPF_TASK_WORK); - if (err) - return err; - - if (meta->map.ptr) { - verifier_bug(env, "Two map pointers in a bpf_task_work helper"); - return -EFAULT; - } - meta->map.uid = reg->map_uid; - meta->map.ptr = map; - return 0; + return check_map_field_pointer(env, regno, BPF_TIMER, &meta->map); } static int process_kptr_func(struct bpf_verifier_env *env, int regno, @@ -13754,7 +13706,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "arg#%d doesn't point to a map value\n", i); return -EINVAL; } - ret = process_wq_func(env, regno, meta); + ret = check_map_field_pointer(env, regno, 
BPF_WORKQUEUE, &meta->map); if (ret < 0) return ret; break; @@ -13763,7 +13715,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "arg#%d doesn't point to a map value\n", i); return -EINVAL; } - ret = process_task_work_func(env, regno, meta); + ret = check_map_field_pointer(env, regno, BPF_TASK_WORK, &meta->map); if (ret < 0) return ret; break; -- cgit v1.2.3 From 8798902f2b8bcae6f90229a1a1496b48ddda2972 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sat, 31 Jan 2026 22:49:48 +0800 Subject: bpf: Add bpf_jit_supports_fsession() The added fsession does not prevent running on those architectures, that haven't added fsession support. For example, try to run fsession tests on arm64: test_fsession_basic:PASS:fsession_test__open_and_load 0 nsec test_fsession_basic:PASS:fsession_attach 0 nsec check_result:FAIL:test_run_opts err unexpected error: -14 (errno 14) In order to prevent such errors, add bpf_jit_supports_fsession() to guard those architectures. Fixes: 2d419c44658f ("bpf: add fsession support") Acked-by: Puranjay Mohan Tested-by: Puranjay Mohan Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260131144950.16294-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 5 ++++ include/linux/filter.h | 1 + kernel/bpf/core.c | 5 ++++ kernel/bpf/verifier.c | 5 ++++ .../selftests/bpf/prog_tests/fsession_test.c | 32 ++++++++++++++++------ 5 files changed, 40 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 5a075e06cf45..070ba80e39d7 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -4112,3 +4112,8 @@ bool bpf_jit_supports_timed_may_goto(void) { return true; } + +bool bpf_jit_supports_fsession(void) +{ + return true; +} diff --git a/include/linux/filter.h b/include/linux/filter.h index fd54fed8f95f..4e1cb4f91f49 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1167,6 +1167,7 @@ bool bpf_jit_supports_arena(void); bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena); bool bpf_jit_supports_private_stack(void); bool bpf_jit_supports_timed_may_goto(void); +bool bpf_jit_supports_fsession(void); u64 bpf_arch_uaddress_limit(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); u64 arch_bpf_timed_may_goto(void); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5ebece600aeb..dc906dfdff94 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3144,6 +3144,11 @@ bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) return false; } +bool __weak bpf_jit_supports_fsession(void) +{ + return false; +} + u64 __weak bpf_arch_uaddress_limit(void) { #if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 256cc5c1a7df..6b62b6d57175 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -24828,6 +24828,11 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: + if (prog->expected_attach_type == BPF_TRACE_FSESSION && + !bpf_jit_supports_fsession()) { + bpf_log(log, "JIT does not support fsession\n"); + return -EOPNOTSUPP; + } if (!btf_type_is_func(t)) { bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); diff --git a/tools/testing/selftests/bpf/prog_tests/fsession_test.c b/tools/testing/selftests/bpf/prog_tests/fsession_test.c 
index 0c4b428e1cee..a299aeb8cc2e 100644 --- a/tools/testing/selftests/bpf/prog_tests/fsession_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fsession_test.c @@ -29,8 +29,16 @@ static void test_fsession_basic(void) struct fsession_test *skel = NULL; int err; - skel = fsession_test__open_and_load(); - if (!ASSERT_OK_PTR(skel, "fsession_test__open_and_load")) + skel = fsession_test__open(); + if (!ASSERT_OK_PTR(skel, "fsession_test__open")) + return; + + err = fsession_test__load(skel); + if (err == -EOPNOTSUPP) { + test__skip(); + goto cleanup; + } + if (!ASSERT_OK(err, "fsession_test__load")) goto cleanup; err = fsession_test__attach(skel); @@ -47,8 +55,16 @@ static void test_fsession_reattach(void) struct fsession_test *skel = NULL; int err; - skel = fsession_test__open_and_load(); - if (!ASSERT_OK_PTR(skel, "fsession_test__open_and_load")) + skel = fsession_test__open(); + if (!ASSERT_OK_PTR(skel, "fsession_test__open")) + return; + + err = fsession_test__load(skel); + if (err == -EOPNOTSUPP) { + test__skip(); + goto cleanup; + } + if (!ASSERT_OK(err, "fsession_test__load")) goto cleanup; /* first attach */ @@ -94,6 +110,10 @@ static void test_fsession_cookie(void) bpf_program__set_autoload(skel->progs.test6, false); err = fsession_test__load(skel); + if (err == -EOPNOTSUPP) { + test__skip(); + goto cleanup; + } if (!ASSERT_OK(err, "fsession_test__load")) goto cleanup; @@ -111,10 +131,6 @@ cleanup: void test_fsession_test(void) { -#if !defined(__x86_64__) - test__skip(); - return; -#endif if (test__start_subtest("fsession_test")) test_fsession_basic(); if (test__start_subtest("fsession_reattach")) -- cgit v1.2.3 From 6b95cc562de2889a9333843dda073ba875f9e808 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 2 Feb 2026 08:58:49 +0100 Subject: ftrace: Fix direct_functions leak in update_ftrace_direct_del Alexei reported memory leak in update_ftrace_direct_del. We miss cleanup of the replaced direct_functions in the success path in update_ftrace_direct_del, adding that. Fixes: 8d2c1233f371 ("ftrace: Add update_ftrace_direct_del function") Reported-by: Alexei Starovoitov Closes: https://lore.kernel.org/bpf/aX_BxG5EJTJdCMT9@krava/T/#m7c13f5a95f862ed7ab78e905fbb678d635306a0c Signed-off-by: Jiri Olsa Acked-by: Steven Rostedt (Google) Link: https://lore.kernel.org/r/20260202075849.1684369-1-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/ftrace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8574932e66b6..b12dbd93ae1c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6537,6 +6537,7 @@ int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash) /* free the new_direct_functions */ old_direct_functions = new_direct_functions; } else { + old_direct_functions = direct_functions; rcu_assign_pointer(direct_functions, new_direct_functions); } -- cgit v1.2.3 From d95d76aa772bf94df353b015b1cb38303d4a415d Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sun, 1 Feb 2026 22:52:48 +0100 Subject: bpf: Replace snprintf("%s") with strscpy Replace snprintf("%s") with the faster and more direct strscpy(). 
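For readers unfamiliar with the idiom being replaced, the sketch below contrasts the two patterns in plain userspace C. The toy_strscpy() helper is only a stand-in written for this sketch to mimic the kernel's strscpy() return convention; it is not the kernel implementation, and in btf.c the actual change is simply strscpy(btf->name, name).

```
/*
 * Illustrative only: a userspace model of why strscpy() is preferred over
 * snprintf(dst, size, "%s", src) for plain string copies. toy_strscpy() is
 * an assumption of this sketch, not kernel code.
 */
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

static ssize_t toy_strscpy(char *dst, const char *src, size_t size)
{
	size_t len = strnlen(src, size);

	if (!size)
		return -1;		/* kernel strscpy() returns -E2BIG */
	if (len == size) {		/* source does not fit: truncate */
		memcpy(dst, src, size - 1);
		dst[size - 1] = '\0';
		return -1;
	}
	memcpy(dst, src, len + 1);	/* copy including the NUL */
	return (ssize_t)len;
}

int main(void)
{
	char name[8];

	/* Old idiom: goes through the format-string machinery just to copy. */
	snprintf(name, sizeof(name), "%s", "module_with_long_name");
	printf("snprintf copy: \"%s\"\n", name);

	/* New idiom: bounded copy, truncation is visible in the return value. */
	if (toy_strscpy(name, "module_with_long_name", sizeof(name)) < 0)
		printf("strscpy-like copy truncated: \"%s\"\n", name);
	return 0;
}
```

Unlike snprintf(), which returns the would-be length on truncation, strscpy() reports truncation as an error, so the copy is both more direct (no format parsing) and easier to check.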
Signed-off-by: Thorsten Blum Link: https://lore.kernel.org/r/20260201215247.677121-2-thorsten.blum@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8959f3bc1e92..7708958e3fb8 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -6324,7 +6325,7 @@ static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name btf->data_size = data_size; btf->kernel_btf = true; btf->named_start_id = 0; - snprintf(btf->name, sizeof(btf->name), "%s", name); + strscpy(btf->name, name); err = btf_parse_hdr(env); if (err) @@ -6443,7 +6444,7 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, btf->start_str_off = base_btf->hdr.str_len; btf->kernel_btf = true; btf->named_start_id = 0; - snprintf(btf->name, sizeof(btf->name), "%s", module_name); + strscpy(btf->name, module_name); btf->data = kvmemdup(data, data_size, GFP_KERNEL | __GFP_NOWARN); if (!btf->data) { -- cgit v1.2.3 From 3cd5c890652ba1f0682adc291b5446245259b692 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Tue, 3 Feb 2026 08:50:57 -0800 Subject: bpf: Let the verifier assign ids on stack fills The next commit will allow clearing of scalar ids if no other register/stack slot has that id. This is because if only one register has a unique id, it can't participate in bounds propagation and is equivalent to having no id. But if the id of a stack slot is cleared by clear_singular_ids() in the next commit, reading that stack slot into a register will not establish a link because the stack slot's id is cleared. This can happen in a situation where a register is spilled and later loses its id due to a multiply operation (for example) and then the stack slot's id becomes singular and can be cleared. Make sure that scalar stack slots have an id before we read them into a register. Acked-by: Eduard Zingerman Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260203165102.2302462-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6b62b6d57175..17b499956156 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5518,6 +5518,12 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, */ s32 subreg_def = state->regs[dst_regno].subreg_def; + if (env->bpf_capable && size == 4 && spill_size == 4 && + get_reg_width(reg) <= 32) + /* Ensure stack slot has an ID to build a relation + * with the destination register on fill. + */ + assign_scalar_id_before_mov(env, reg); copy_register_state(&state->regs[dst_regno], reg); state->regs[dst_regno].subreg_def = subreg_def; @@ -5563,6 +5569,11 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, } } else if (dst_regno >= 0) { /* restore register state from stack */ + if (env->bpf_capable) + /* Ensure stack slot has an ID to build a relation + * with the destination register on fill. 
+ */ + assign_scalar_id_before_mov(env, reg); copy_register_state(&state->regs[dst_regno], reg); /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() -- cgit v1.2.3 From b2a0aa3a87396483b468b7c81be2fddb29171d74 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Tue, 3 Feb 2026 08:50:58 -0800 Subject: bpf: Clear singular ids for scalars in is_state_visited() The verifier assigns ids to scalar registers/stack slots when they are linked through a mov or stack spill/fill instruction. These ids are later used to propagate newly found bounds from one register to all registers that share the same id. The verifier also compares the ids of these registers in current state and cached state when making pruning decisions. When an ID becomes singular (i.e., only a single register or stack slot has that ID), it can no longer participate in bounds propagation. During comparisons between current and cached states for pruning decisions, however, such stale IDs can prevent pruning of otherwise equivalent states. Find and clear all singular ids before caching a state in is_state_visited(). struct bpf_idset which is currently unused has been repurposed for this use case. Acked-by: Eduard Zingerman Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260203165102.2302462-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 7 +++-- kernel/bpf/verifier.c | 68 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 8355b585cd18..746025df82c8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -697,8 +697,11 @@ struct bpf_idmap { }; struct bpf_idset { - u32 count; - u32 ids[BPF_ID_MAP_SIZE]; + u32 num_ids; + struct { + u32 id; + u32 cnt; + } entries[BPF_ID_MAP_SIZE]; }; /* see verifier.c:compute_scc_callchain() */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 17b499956156..d92e10d4c2cc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19461,6 +19461,72 @@ static void clean_verifier_state(struct bpf_verifier_env *env, * doesn't meant that the states are DONE. The verifier has to compare * the callsites */ + +/* Find id in idset and increment its count, or add new entry */ +static void idset_cnt_inc(struct bpf_idset *idset, u32 id) +{ + u32 i; + + for (i = 0; i < idset->num_ids; i++) { + if (idset->entries[i].id == id) { + idset->entries[i].cnt++; + return; + } + } + /* New id */ + if (idset->num_ids < BPF_ID_MAP_SIZE) { + idset->entries[idset->num_ids].id = id; + idset->entries[idset->num_ids].cnt = 1; + idset->num_ids++; + } +} + +/* Find id in idset and return its count, or 0 if not found */ +static u32 idset_cnt_get(struct bpf_idset *idset, u32 id) +{ + u32 i; + + for (i = 0; i < idset->num_ids; i++) { + if (idset->entries[i].id == id) + return idset->entries[i].cnt; + } + return 0; +} + +/* + * Clear singular scalar ids in a state. + * A register with a non-zero id is called singular if no other register shares + * the same base id. Such registers can be treated as independent (id=0). 
+ */ +static void clear_singular_ids(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) +{ + struct bpf_idset *idset = &env->idset_scratch; + struct bpf_func_state *func; + struct bpf_reg_state *reg; + + idset->num_ids = 0; + + bpf_for_each_reg_in_vstate(st, func, reg, ({ + if (reg->type != SCALAR_VALUE) + continue; + if (!reg->id) + continue; + idset_cnt_inc(idset, reg->id & ~BPF_ADD_CONST); + })); + + bpf_for_each_reg_in_vstate(st, func, reg, ({ + if (reg->type != SCALAR_VALUE) + continue; + if (!reg->id) + continue; + if (idset_cnt_get(idset, reg->id & ~BPF_ADD_CONST) == 1) { + reg->id = 0; + reg->off = 0; + } + })); +} + static void clean_live_states(struct bpf_verifier_env *env, int insn, struct bpf_verifier_state *cur) { @@ -20459,6 +20525,8 @@ miss: if (env->bpf_capable) mark_all_scalars_imprecise(env, cur); + clear_singular_ids(env, cur); + /* add new state to the head of linked list */ new = &new_sl->state; err = copy_verifier_state(new, cur); -- cgit v1.2.3 From a24d6f955d4f68a98daa905a7dd090675a50eca8 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Tue, 3 Feb 2026 08:50:59 -0800 Subject: bpf: Relax maybe_widen_reg() constraints The maybe_widen_reg() function widens imprecise scalar registers to unknown when their values differ between the cached and current states. Previously, it used regs_exact() which also compared register IDs via check_ids(), requiring registers to have matching IDs (or mapped IDs) to be considered exact. For scalar widening purposes, what matters is whether the value tracking (bounds, tnum, var_off) is the same, not whether the IDs match. Two scalars with identical value constraints but different IDs represent the same abstract value and don't need to be widened. Introduce scalars_exact_for_widen() that only compares the value-tracking portion of bpf_reg_state (fields before 'id'). This allows the verifier to preserve more scalar value information during state merging when IDs differ but actual tracked values are identical, reducing unnecessary widening and potentially improving verification precision. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260203165102.2302462-4-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d92e10d4c2cc..80dc01350c77 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8995,15 +8995,24 @@ static bool regs_exact(const struct bpf_reg_state *rold, const struct bpf_reg_state *rcur, struct bpf_idmap *idmap); +/* + * Check if scalar registers are exact for the purpose of not widening. 
+ * More lenient than regs_exact() + */ +static bool scalars_exact_for_widen(const struct bpf_reg_state *rold, + const struct bpf_reg_state *rcur) +{ + return !memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)); +} + static void maybe_widen_reg(struct bpf_verifier_env *env, - struct bpf_reg_state *rold, struct bpf_reg_state *rcur, - struct bpf_idmap *idmap) + struct bpf_reg_state *rold, struct bpf_reg_state *rcur) { if (rold->type != SCALAR_VALUE) return; if (rold->type != rcur->type) return; - if (rold->precise || rcur->precise || regs_exact(rold, rcur, idmap)) + if (rold->precise || rcur->precise || scalars_exact_for_widen(rold, rcur)) return; __mark_reg_unknown(env, rcur); } @@ -9015,7 +9024,6 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, struct bpf_func_state *fold, *fcur; int i, fr, num_slots; - reset_idmap_scratch(env); for (fr = old->curframe; fr >= 0; fr--) { fold = old->frame[fr]; fcur = cur->frame[fr]; @@ -9023,8 +9031,7 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, for (i = 0; i < MAX_BPF_REG; i++) maybe_widen_reg(env, &fold->regs[i], - &fcur->regs[i], - &env->idmap_scratch); + &fcur->regs[i]); num_slots = min(fold->allocated_stack / BPF_REG_SIZE, fcur->allocated_stack / BPF_REG_SIZE); @@ -9035,8 +9042,7 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, maybe_widen_reg(env, &fold->stack[i].spilled_ptr, - &fcur->stack[i].spilled_ptr, - &env->idmap_scratch); + &fcur->stack[i].spilled_ptr); } } return 0; -- cgit v1.2.3 From b0388bafa4949bd30af7b3be5ee415f2a25ac014 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Tue, 3 Feb 2026 08:51:00 -0800 Subject: bpf: Relax scalar id equivalence for state pruning Scalar register IDs are used by the verifier to track relationships between registers and enable bounds propagation across those relationships. Once an ID becomes singular (i.e. only a single register/stack slot carries it), it can no longer contribute to bounds propagation and effectively becomes stale. The previous commit makes the verifier clear such ids before caching the state. When comparing the current and cached states for pruning, these stale IDs can cause technically equivalent states to be considered different and thus prevent pruning. For example, in the selftest added in the next commit, two registers - r6 and r7 are not linked to any other registers and get cached with id=0, in the current state, they are both linked to each other with id=A. Before this commit, check_scalar_ids would give temporary ids to r6 and r7 (say tid1 and tid2) and then check_ids() would map tid1->A, and when it would see tid2->A, it would not consider these state equivalent. Relax scalar ID equivalence by treating rold->id == 0 as "independent": if the old state did not rely on any ID relationships for a register, then any ID/linking present in the current state only adds constraints and is always safe to accept for pruning. Implement this by returning true immediately in check_scalar_ids() when old_id == 0. Maintain correctness for the opposite direction (old_id != 0 && cur_id == 0) by still allocating a temporary ID for cur_id == 0. This avoids incorrectly allowing multiple independent current registers (id==0) to satisfy a single linked old ID during mapping. 
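To make the mapping argument above concrete, here is a tiny self-contained model of the old-to-cur ID mapping. The fixed-size table, the helper names and the temporary-ID scheme are simplifications assumed for this sketch; they are not the verifier's actual bpf_idmap handling.

```
/*
 * Toy model of check_ids()/check_scalar_ids(): old IDs must map one-to-one
 * onto cur IDs. Shows why a linked old pair (X, X) must not be satisfied by
 * two independent cur registers (0, 0), while old id 0 always matches.
 */
#include <stdbool.h>
#include <stdio.h>

struct idmap { unsigned old_id[8], cur_id[8]; int cnt; unsigned tmp_gen; };

static bool check_ids(unsigned old_id, unsigned cur_id, struct idmap *m)
{
	for (int i = 0; i < m->cnt; i++) {
		if (m->old_id[i] == old_id)
			return m->cur_id[i] == cur_id;
		if (m->cur_id[i] == cur_id)
			return false;
	}
	/* table is large enough for this demo, no bounds handling */
	m->old_id[m->cnt] = old_id;
	m->cur_id[m->cnt] = cur_id;
	m->cnt++;
	return true;
}

static bool check_scalar_ids(unsigned old_id, unsigned cur_id, struct idmap *m)
{
	if (!old_id)			/* old register was independent: always safe */
		return true;
	if (!cur_id)			/* give independent cur regs distinct temp IDs */
		cur_id = 1000 + ++m->tmp_gen;
	return check_ids(old_id, cur_id, m);
}

int main(void)
{
	struct idmap m = {0};
	unsigned X = 5;

	/* old: r6.id = X, r7.id = X (linked); cur: r6.id = 0, r7.id = 0 */
	bool r6_ok = check_scalar_ids(X, 0, &m);	/* maps X -> temp1 */
	bool r7_ok = check_scalar_ids(X, 0, &m);	/* X already mapped to temp1 */
	printf("r6 ok=%d, r7 ok=%d -> states not equivalent\n", r6_ok, r7_ok);

	/* old id 0 (independent) is satisfied regardless of cur linking */
	printf("old id 0 vs cur id 7: %d\n", check_scalar_ids(0, 7, &m));
	return 0;
}
```

With the relaxation, the first case (old pair linked, cur pair independent) still correctly fails, while an old register that never carried an ID no longer forces an ID match and therefore no longer blocks pruning.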
Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260203165102.2302462-5-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 63 +++++++++++++++++----- .../selftests/bpf/progs/verifier_scalar_ids.c | 8 +-- 2 files changed, 56 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 80dc01350c77..da03bbbc1620 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19387,13 +19387,29 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) return false; } -/* Similar to check_ids(), but allocate a unique temporary ID - * for 'old_id' or 'cur_id' of zero. - * This makes pairs like '0 vs unique ID', 'unique ID vs 0' valid. +/* + * Compare scalar register IDs for state equivalence. + * + * When old_id == 0, the old register is independent - not linked to any + * other register. Any linking in the current state only adds constraints, + * making it more restrictive. Since the old state didn't rely on any ID + * relationships for this register, it's always safe to accept cur regardless + * of its ID. Hence, return true immediately. + * + * When old_id != 0 but cur_id == 0, we need to ensure that different + * independent registers in cur don't incorrectly satisfy the ID matching + * requirements of linked registers in old. + * + * Example: if old has r6.id=X and r7.id=X (linked), but cur has r6.id=0 + * and r7.id=0 (both independent), without temp IDs both would map old_id=X + * to cur_id=0 and pass. With temp IDs: r6 maps X->temp1, r7 tries to map + * X->temp2, but X is already mapped to temp1, so the check fails correctly. */ static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) { - old_id = old_id ? old_id : ++idmap->tmp_id_gen; + if (!old_id) + return true; + cur_id = cur_id ? cur_id : ++idmap->tmp_id_gen; return check_ids(old_id, cur_id, idmap); @@ -19618,11 +19634,21 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, } if (!rold->precise && exact == NOT_EXACT) return true; - if ((rold->id & BPF_ADD_CONST) != (rcur->id & BPF_ADD_CONST)) - return false; - if ((rold->id & BPF_ADD_CONST) && (rold->off != rcur->off)) - return false; - /* Why check_ids() for scalar registers? + /* + * Linked register tracking uses rold->id to detect relationships. + * When rold->id == 0, the register is independent and any linking + * in rcur only adds constraints. When rold->id != 0, we must verify + * id mapping and (for BPF_ADD_CONST) offset consistency. + * + * +------------------+-----------+------------------+---------------+ + * | | rold->id | rold + ADD_CONST | rold->id == 0 | + * |------------------+-----------+------------------+---------------| + * | rcur->id | range,ids | false | range | + * | rcur + ADD_CONST | false | range,ids,off | range | + * | rcur->id == 0 | range,ids | false | range | + * +------------------+-----------+------------------+---------------+ + * + * Why check_ids() for scalar registers? * * Consider the following BPF code: * 1: r6 = ... unbound scalar, ID=a ... @@ -19646,9 +19672,22 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, * --- * Also verify that new value satisfies old value range knowledge. 
*/ - return range_within(rold, rcur) && - tnum_in(rold->var_off, rcur->var_off) && - check_scalar_ids(rold->id, rcur->id, idmap); + + /* ADD_CONST mismatch: different linking semantics */ + if ((rold->id & BPF_ADD_CONST) && !(rcur->id & BPF_ADD_CONST)) + return false; + + if (rold->id && !(rold->id & BPF_ADD_CONST) && (rcur->id & BPF_ADD_CONST)) + return false; + + /* Both have offset linkage: offsets must match */ + if ((rold->id & BPF_ADD_CONST) && rold->off != rcur->off) + return false; + + if (!check_scalar_ids(rold->id, rcur->id, idmap)) + return false; + + return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: case PTR_TO_MEM: diff --git a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c index c0ce690ddb68..c8f8820336b7 100644 --- a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c +++ b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c @@ -723,9 +723,9 @@ __success __log_level(2) /* The exit instruction should be reachable from two states, * use two matches and "processed .. insns" to ensure this. */ -__msg("13: (95) exit") -__msg("13: (95) exit") -__msg("processed 18 insns") +__msg("15: (95) exit") +__msg("15: (95) exit") +__msg("processed 20 insns") __flag(BPF_F_TEST_STATE_FREQ) __naked void two_old_ids_one_cur_id(void) { @@ -734,9 +734,11 @@ __naked void two_old_ids_one_cur_id(void) "call %[bpf_ktime_get_ns];" "r0 &= 0xff;" "r6 = r0;" + "r8 = r0;" "call %[bpf_ktime_get_ns];" "r0 &= 0xff;" "r7 = r0;" + "r9 = r0;" "r0 = 0;" /* Maybe make r{6,7} IDs identical */ "if r6 > r7 goto l0_%=;" -- cgit v1.2.3 From 63328bb23f2693fe36e8bcdb972c6040e84d16e4 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 3 Feb 2026 13:04:21 -0500 Subject: bpf: Add bpf_stream_print_stack stack dumping kfunc Add a new kfunc called bpf_stream_print_stack to be used by programs that need to print out their current BPF stack. The kfunc is essentially a wrapper around the existing bpf_stream_dump_stack functionality used to generate stack traces for error events like may_goto violations and BPF-side arena page faults. Signed-off-by: Emil Tsalapatis Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260203180424.14057-2-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 1 + kernel/bpf/stream.c | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b54ec0e945aa..c30a9f68af6b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4562,6 +4562,7 @@ BTF_ID_FLAGS(func, bpf_strncasestr); BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, bpf_stream_print_stack, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_dynptr_from_file) diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index 24730df55e69..be9ce98e9469 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -245,6 +245,25 @@ __bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const vo return ret; } +/* Directly trigger a stack dump from the program. */ +__bpf_kfunc int bpf_stream_print_stack(int stream_id, struct bpf_prog_aux *aux) +{ + struct bpf_stream_stage ss; + struct bpf_prog *prog; + + /* Make sure the stream ID is valid. 
*/ + if (!bpf_stream_get(stream_id, aux)) + return -ENOENT; + + prog = aux->main_prog_aux->prog; + + bpf_stream_stage(ss, prog, stream_id, ({ + bpf_stream_dump_stack(ss); + })); + + return 0; +} + __bpf_kfunc_end_defs(); /* Added kfunc to common_btf_ids */ -- cgit v1.2.3 From 9ddfa24e16747da8d98464b4285ee66e37ddc5c0 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 3 Feb 2026 13:04:23 -0500 Subject: bpf: Allow BPF stream kfuncs while holding a lock The BPF stream kfuncs bpf_stream_vprintk and bpf_stream_print_stack do not sleep and so are safe to call while holding a lock. Amend the verifier to allow that. Signed-off-by: Emil Tsalapatis Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260203180424.14057-4-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index da03bbbc1620..6a616dc4dc54 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12455,6 +12455,8 @@ enum special_kfunc_type { KF_bpf_arena_free_pages, KF_bpf_arena_reserve_pages, KF_bpf_session_is_return, + KF_bpf_stream_vprintk, + KF_bpf_stream_print_stack, }; BTF_ID_LIST(special_kfunc_list) @@ -12533,6 +12535,8 @@ BTF_ID(func, bpf_arena_alloc_pages) BTF_ID(func, bpf_arena_free_pages) BTF_ID(func, bpf_arena_reserve_pages) BTF_ID(func, bpf_session_is_return) +BTF_ID(func, bpf_stream_vprintk) +BTF_ID(func, bpf_stream_print_stack) static bool is_task_work_add_kfunc(u32 func_id) { @@ -12977,10 +12981,17 @@ static bool is_bpf_arena_kfunc(u32 btf_id) btf_id == special_kfunc_list[KF_bpf_arena_reserve_pages]; } +static bool is_bpf_stream_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_stream_vprintk] || + btf_id == special_kfunc_list[KF_bpf_stream_print_stack]; +} + static bool kfunc_spin_allowed(u32 btf_id) { return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id) || - is_bpf_res_spin_lock_kfunc(btf_id) || is_bpf_arena_kfunc(btf_id); + is_bpf_res_spin_lock_kfunc(btf_id) || is_bpf_arena_kfunc(btf_id) || + is_bpf_stream_kfunc(btf_id); } static bool is_sync_callback_calling_kfunc(u32 btf_id) -- cgit v1.2.3 From 1bfbc267ec915e58c3723a2237bf067f0c4dffa8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sat, 31 Jan 2026 18:53:55 -0800 Subject: bpf: Enable bpf_timer and bpf_wq in any context Refactor bpf_timer and bpf_wq to allow calling them from any context: - add refcnt to bpf_async_cb - map_delete_elem or map_free will drop refcnt to zero via bpf_async_cancel_and_free() - once refcnt is zero timer/wq_start is not allowed to make sure that callback cannot rearm itself - if in_hardirq defer to start/cancel operations to irq_work Co-developed-by: Mykyta Yatsenko Signed-off-by: Mykyta Yatsenko Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20260201025403.66625-2-alexei.starovoitov@gmail.com --- kernel/bpf/helpers.c | 408 ++++++++++++++++++++++++++++----------------------- 1 file changed, 225 insertions(+), 183 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index c30a9f68af6b..354aa607df3e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1095,16 +1095,34 @@ static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx) return (void *)value - round_up(map->key_size, 8); } +enum bpf_async_type { + BPF_ASYNC_TYPE_TIMER = 0, + 
BPF_ASYNC_TYPE_WQ, +}; + +enum bpf_async_op { + BPF_ASYNC_START, + BPF_ASYNC_CANCEL +}; + +struct bpf_async_cmd { + struct llist_node node; + u64 nsec; + u32 mode; + enum bpf_async_op op; +}; + struct bpf_async_cb { struct bpf_map *map; struct bpf_prog *prog; void __rcu *callback_fn; void *value; - union { - struct rcu_head rcu; - struct work_struct delete_work; - }; + struct rcu_head rcu; u64 flags; + struct irq_work worker; + refcount_t refcnt; + enum bpf_async_type type; + struct llist_head async_cmds; }; /* BPF map elements can contain 'struct bpf_timer'. @@ -1132,7 +1150,6 @@ struct bpf_hrtimer { struct bpf_work { struct bpf_async_cb cb; struct work_struct work; - struct work_struct delete_work; }; /* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */ @@ -1142,20 +1159,12 @@ struct bpf_async_kern { struct bpf_hrtimer *timer; struct bpf_work *work; }; - /* bpf_spin_lock is used here instead of spinlock_t to make - * sure that it always fits into space reserved by struct bpf_timer - * regardless of LOCKDEP and spinlock debug flags. - */ - struct bpf_spin_lock lock; } __attribute__((aligned(8))); -enum bpf_async_type { - BPF_ASYNC_TYPE_TIMER = 0, - BPF_ASYNC_TYPE_WQ, -}; - static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running); +static void bpf_async_refcount_put(struct bpf_async_cb *cb); + static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) { struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer); @@ -1219,45 +1228,73 @@ static void bpf_async_cb_rcu_free(struct rcu_head *rcu) { struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu); + /* + * Drop the last reference to prog only after RCU GP, as set_callback() + * may race with cancel_and_free() + */ + if (cb->prog) + bpf_prog_put(cb->prog); + kfree_nolock(cb); } -static void bpf_wq_delete_work(struct work_struct *work) +/* Callback from call_rcu_tasks_trace, chains to call_rcu for final free */ +static void bpf_async_cb_rcu_tasks_trace_free(struct rcu_head *rcu) { - struct bpf_work *w = container_of(work, struct bpf_work, delete_work); + struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu); + struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb); + struct bpf_work *w = container_of(cb, struct bpf_work, cb); + bool retry = false; - cancel_work_sync(&w->work); + /* + * bpf_async_cancel_and_free() tried to cancel timer/wq, but it + * could have raced with timer/wq_start. Now refcnt is zero and + * srcu/rcu GP completed. Cancel timer/wq again. + */ + switch (cb->type) { + case BPF_ASYNC_TYPE_TIMER: + if (hrtimer_try_to_cancel(&t->timer) < 0) + retry = true; + break; + case BPF_ASYNC_TYPE_WQ: + if (!cancel_work(&w->work)) + retry = true; + break; + } + if (retry) { + /* + * hrtimer or wq callback may still be running. It must be + * in rcu_tasks_trace or rcu CS, so wait for GP again. + * It won't retry forever, since refcnt zero prevents all + * operations on timer/wq. + */ + call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free); + return; + } - call_rcu(&w->cb.rcu, bpf_async_cb_rcu_free); + /* rcu_trace_implies_rcu_gp() is true and will remain so */ + bpf_async_cb_rcu_free(rcu); } -static void bpf_timer_delete_work(struct work_struct *work) +static void bpf_async_refcount_put(struct bpf_async_cb *cb) { - struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work); + if (!refcount_dec_and_test(&cb->refcnt)) + return; - /* Cancel the timer and wait for callback to complete if it was running. 
- * If hrtimer_cancel() can be safely called it's safe to call - * call_rcu() right after for both preallocated and non-preallocated - * maps. The async->cb = NULL was already done and no code path can see - * address 't' anymore. Timer if armed for existing bpf_hrtimer before - * bpf_timer_cancel_and_free will have been cancelled. - */ - hrtimer_cancel(&t->timer); - call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free); + call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free); } +static void bpf_async_cancel_and_free(struct bpf_async_kern *async); +static void bpf_async_irq_worker(struct irq_work *work); + static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, enum bpf_async_type type) { - struct bpf_async_cb *cb; + struct bpf_async_cb *cb, *old_cb; struct bpf_hrtimer *t; struct bpf_work *w; clockid_t clockid; size_t size; - int ret = 0; - - if (in_nmi()) - return -EOPNOTSUPP; switch (type) { case BPF_ASYNC_TYPE_TIMER: @@ -1270,18 +1307,13 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u return -EINVAL; } - __bpf_spin_lock_irqsave(&async->lock); - t = async->timer; - if (t) { - ret = -EBUSY; - goto out; - } + old_cb = READ_ONCE(async->cb); + if (old_cb) + return -EBUSY; cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node); - if (!cb) { - ret = -ENOMEM; - goto out; - } + if (!cb) + return -ENOMEM; switch (type) { case BPF_ASYNC_TYPE_TIMER: @@ -1289,7 +1321,6 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u t = (struct bpf_hrtimer *)cb; atomic_set(&t->cancelling, 0); - INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work); hrtimer_setup(&t->timer, bpf_timer_cb, clockid, HRTIMER_MODE_REL_SOFT); cb->value = (void *)async - map->record->timer_off; break; @@ -1297,16 +1328,24 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u w = (struct bpf_work *)cb; INIT_WORK(&w->work, bpf_wq_work); - INIT_WORK(&w->delete_work, bpf_wq_delete_work); cb->value = (void *)async - map->record->wq_off; break; } cb->map = map; cb->prog = NULL; cb->flags = flags; + cb->worker = IRQ_WORK_INIT(bpf_async_irq_worker); + init_llist_head(&cb->async_cmds); + refcount_set(&cb->refcnt, 1); /* map's reference */ + cb->type = type; rcu_assign_pointer(cb->callback_fn, NULL); - WRITE_ONCE(async->cb, cb); + old_cb = cmpxchg(&async->cb, NULL, cb); + if (old_cb) { + /* Lost the race to initialize this bpf_async_kern, drop the allocated object */ + kfree_nolock(cb); + return -EBUSY; + } /* Guarantee the order between async->cb and map->usercnt. So * when there are concurrent uref release and bpf timer init, either * bpf_timer_cancel_and_free() called by uref release reads a no-NULL @@ -1317,13 +1356,11 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u /* maps with timers must be either held by user space * or pinned in bpffs. 
*/ - WRITE_ONCE(async->cb, NULL); - kfree_nolock(cb); - ret = -EPERM; + bpf_async_cancel_and_free(async); + return -EPERM; } -out: - __bpf_spin_unlock_irqrestore(&async->lock); - return ret; + + return 0; } BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map, @@ -1354,8 +1391,9 @@ static const struct bpf_func_proto bpf_timer_init_proto = { .arg3_type = ARG_ANYTHING, }; -static int bpf_async_update_prog_callback(struct bpf_async_cb *cb, void *callback_fn, - struct bpf_prog *prog) +static int bpf_async_update_prog_callback(struct bpf_async_cb *cb, + struct bpf_prog *prog, + void *callback_fn) { struct bpf_prog *prev; @@ -1380,7 +1418,8 @@ static int bpf_async_update_prog_callback(struct bpf_async_cb *cb, void *callbac if (prev) bpf_prog_put(prev); - } while (READ_ONCE(cb->prog) != prog || READ_ONCE(cb->callback_fn) != callback_fn); + } while (READ_ONCE(cb->prog) != prog || + (void __force *)READ_ONCE(cb->callback_fn) != callback_fn); if (prog) bpf_prog_put(prog); @@ -1388,33 +1427,36 @@ static int bpf_async_update_prog_callback(struct bpf_async_cb *cb, void *callbac return 0; } +static int bpf_async_schedule_op(struct bpf_async_cb *cb, enum bpf_async_op op, + u64 nsec, u32 timer_mode) +{ + WARN_ON_ONCE(!in_hardirq()); + + struct bpf_async_cmd *cmd = kmalloc_nolock(sizeof(*cmd), 0, NUMA_NO_NODE); + + if (!cmd) { + bpf_async_refcount_put(cb); + return -ENOMEM; + } + init_llist_node(&cmd->node); + cmd->nsec = nsec; + cmd->mode = timer_mode; + cmd->op = op; + if (llist_add(&cmd->node, &cb->async_cmds)) + irq_work_queue(&cb->worker); + return 0; +} + static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn, struct bpf_prog *prog) { struct bpf_async_cb *cb; - int ret = 0; - if (in_nmi()) - return -EOPNOTSUPP; - __bpf_spin_lock_irqsave(&async->lock); - cb = async->cb; - if (!cb) { - ret = -EINVAL; - goto out; - } - if (!atomic64_read(&cb->map->usercnt)) { - /* maps with timers must be either held by user space - * or pinned in bpffs. Otherwise timer might still be - * running even when bpf prog is detached and user space - * is gone, since map_release_uref won't ever be called. 
- */ - ret = -EPERM; - goto out; - } - ret = bpf_async_update_prog_callback(cb, callback_fn, prog); -out: - __bpf_spin_unlock_irqrestore(&async->lock); - return ret; + cb = READ_ONCE(async->cb); + if (!cb) + return -EINVAL; + + return bpf_async_update_prog_callback(cb, prog, callback_fn); } BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn, @@ -1431,22 +1473,17 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = { .arg2_type = ARG_PTR_TO_FUNC, }; -BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags) +BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, async, u64, nsecs, u64, flags) { struct bpf_hrtimer *t; - int ret = 0; - enum hrtimer_mode mode; + u32 mode; - if (in_nmi()) - return -EOPNOTSUPP; if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN)) return -EINVAL; - __bpf_spin_lock_irqsave(&timer->lock); - t = timer->timer; - if (!t || !t->cb.prog) { - ret = -EINVAL; - goto out; - } + + t = READ_ONCE(async->timer); + if (!t || !READ_ONCE(t->cb.prog)) + return -EINVAL; if (flags & BPF_F_TIMER_ABS) mode = HRTIMER_MODE_ABS_SOFT; @@ -1456,10 +1493,20 @@ BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, fla if (flags & BPF_F_TIMER_CPU_PIN) mode |= HRTIMER_MODE_PINNED; - hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode); -out: - __bpf_spin_unlock_irqrestore(&timer->lock); - return ret; + /* + * bpf_async_cancel_and_free() could have dropped refcnt to zero. In + * such case BPF progs are not allowed to arm the timer to prevent UAF. + */ + if (!refcount_inc_not_zero(&t->cb.refcnt)) + return -ENOENT; + + if (!in_hardirq()) { + hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode); + bpf_async_refcount_put(&t->cb); + return 0; + } else { + return bpf_async_schedule_op(&t->cb, BPF_ASYNC_START, nsecs, mode); + } } static const struct bpf_func_proto bpf_timer_start_proto = { @@ -1477,11 +1524,9 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, async) bool inc = false; int ret = 0; - if (in_nmi()) + if (in_hardirq()) return -EOPNOTSUPP; - guard(rcu)(); - t = READ_ONCE(async->timer); if (!t) return -EINVAL; @@ -1536,78 +1581,85 @@ static const struct bpf_func_proto bpf_timer_cancel_proto = { .arg1_type = ARG_PTR_TO_TIMER, }; -static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *async) +static void bpf_async_process_op(struct bpf_async_cb *cb, u32 op, + u64 timer_nsec, u32 timer_mode) +{ + switch (cb->type) { + case BPF_ASYNC_TYPE_TIMER: { + struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb); + + switch (op) { + case BPF_ASYNC_START: + hrtimer_start(&t->timer, ns_to_ktime(timer_nsec), timer_mode); + break; + case BPF_ASYNC_CANCEL: + hrtimer_try_to_cancel(&t->timer); + break; + } + break; + } + case BPF_ASYNC_TYPE_WQ: { + struct bpf_work *w = container_of(cb, struct bpf_work, cb); + + switch (op) { + case BPF_ASYNC_START: + schedule_work(&w->work); + break; + case BPF_ASYNC_CANCEL: + cancel_work(&w->work); + break; + } + break; + } + } + bpf_async_refcount_put(cb); +} + +static void bpf_async_irq_worker(struct irq_work *work) +{ + struct bpf_async_cb *cb = container_of(work, struct bpf_async_cb, worker); + struct llist_node *pos, *n, *list; + + list = llist_del_all(&cb->async_cmds); + if (!list) + return; + + list = llist_reverse_order(list); + llist_for_each_safe(pos, n, list) { + struct bpf_async_cmd *cmd; + + cmd = container_of(pos, struct bpf_async_cmd, node); + bpf_async_process_op(cb, cmd->op, cmd->nsec, cmd->mode); + kfree_nolock(cmd); + } +} + 
+static void bpf_async_cancel_and_free(struct bpf_async_kern *async) { struct bpf_async_cb *cb; - /* Performance optimization: read async->cb without lock first. */ if (!READ_ONCE(async->cb)) - return NULL; + return; - __bpf_spin_lock_irqsave(&async->lock); - /* re-read it under lock */ - cb = async->cb; + cb = xchg(&async->cb, NULL); if (!cb) - goto out; - bpf_async_update_prog_callback(cb, NULL, NULL); - /* The subsequent bpf_timer_start/cancel() helpers won't be able to use - * this timer, since it won't be initialized. - */ - WRITE_ONCE(async->cb, NULL); -out: - __bpf_spin_unlock_irqrestore(&async->lock); - return cb; -} + return; -static void bpf_timer_delete(struct bpf_hrtimer *t) -{ /* - * We check that bpf_map_delete/update_elem() was called from timer - * callback_fn. In such case we don't call hrtimer_cancel() (since it - * will deadlock) and don't call hrtimer_try_to_cancel() (since it will - * just return -1). Though callback_fn is still running on this cpu it's - * safe to do kfree(t) because bpf_timer_cb() read everything it needed - * from 't'. The bpf subprog callback_fn won't be able to access 't', - * since async->cb = NULL was already done. The timer will be - * effectively cancelled because bpf_timer_cb() will return - * HRTIMER_NORESTART. - * - * However, it is possible the timer callback_fn calling us armed the - * timer _before_ calling us, such that failing to cancel it here will - * cause it to possibly use struct hrtimer after freeing bpf_hrtimer. - * Therefore, we _need_ to cancel any outstanding timers before we do - * call_rcu, even though no more timers can be armed. - * - * Moreover, we need to schedule work even if timer does not belong to - * the calling callback_fn, as on two different CPUs, we can end up in a - * situation where both sides run in parallel, try to cancel one - * another, and we end up waiting on both sides in hrtimer_cancel - * without making forward progress, since timer1 depends on time2 - * callback to finish, and vice versa. - * - * CPU 1 (timer1_cb) CPU 2 (timer2_cb) - * bpf_timer_cancel_and_free(timer2) bpf_timer_cancel_and_free(timer1) - * - * To avoid these issues, punt to workqueue context when we are in a - * timer callback. + * No refcount_inc_not_zero(&cb->refcnt) here. Dropping the last + * refcnt. Either synchronously or asynchronously in irq_work. */ - if (this_cpu_read(hrtimer_running)) { - queue_work(system_dfl_wq, &t->cb.delete_work); - return; - } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) { - /* If the timer is running on other CPU, also use a kworker to - * wait for the completion of the timer instead of trying to - * acquire a sleepable lock in hrtimer_cancel() to wait for its - * completion. - */ - if (hrtimer_try_to_cancel(&t->timer) >= 0) - call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free); - else - queue_work(system_dfl_wq, &t->cb.delete_work); + if (!in_hardirq()) { + bpf_async_process_op(cb, BPF_ASYNC_CANCEL, 0, 0); } else { - bpf_timer_delete_work(&t->cb.delete_work); + (void)bpf_async_schedule_op(cb, BPF_ASYNC_CANCEL, 0, 0); + /* + * bpf_async_schedule_op() either enqueues allocated cmd into llist + * or fails with ENOMEM and drop the last refcnt. + * This is unlikely, but safe, since bpf_async_cb_rcu_tasks_trace_free() + * callback will do additional timer/wq_cancel due to races anyway. 
+ */ } } @@ -1617,33 +1669,16 @@ static void bpf_timer_delete(struct bpf_hrtimer *t) */ void bpf_timer_cancel_and_free(void *val) { - struct bpf_hrtimer *t; - - t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val); - if (!t) - return; - - bpf_timer_delete(t); + bpf_async_cancel_and_free(val); } -/* This function is called by map_delete/update_elem for individual element and +/* + * This function is called by map_delete/update_elem for individual element and * by ops->map_release_uref when the user space reference to a map reaches zero. */ void bpf_wq_cancel_and_free(void *val) { - struct bpf_work *work; - - BTF_TYPE_EMIT(struct bpf_wq); - - work = (struct bpf_work *)__bpf_async_cancel_and_free(val); - if (!work) - return; - /* Trigger cancel of the sleepable work, but *do not* wait for - * it to finish if it was running as we might not be in a - * sleepable context. - * kfree will be called once the work has finished. - */ - schedule_work(&work->delete_work); + bpf_async_cancel_and_free(val); } BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr) @@ -3116,16 +3151,23 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) struct bpf_async_kern *async = (struct bpf_async_kern *)wq; struct bpf_work *w; - if (in_nmi()) - return -EOPNOTSUPP; if (flags) return -EINVAL; + w = READ_ONCE(async->work); if (!w || !READ_ONCE(w->cb.prog)) return -EINVAL; - schedule_work(&w->work); - return 0; + if (!refcount_inc_not_zero(&w->cb.refcnt)) + return -ENOENT; + + if (!in_hardirq()) { + schedule_work(&w->work); + bpf_async_refcount_put(&w->cb); + return 0; + } else { + return bpf_async_schedule_op(&w->cb, BPF_ASYNC_START, 0, 0); + } } __bpf_kfunc int bpf_wq_set_callback(struct bpf_wq *wq, -- cgit v1.2.3 From 19bd300e22c2be7df838fc4ea41b4e20ccd5c20f Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Sat, 31 Jan 2026 18:53:56 -0800 Subject: bpf: Add verifier support for bpf_timer argument in kfuncs Extend the verifier to recognize struct bpf_timer as a valid kfunc argument type. Previously, bpf_timer was only supported in BPF helpers. This prepares for adding timer-related kfuncs in subsequent patches. 
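For illustration only, here is what this enables on the BPF side: passing a pointer to a struct bpf_timer embedded in a map value directly to a kfunc, such as the bpf_timer_cancel_async() kfunc added later in this series. This is a minimal sketch, not taken from the selftests; the map layout, section name, and program name are hypothetical.

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  /* Kfunc introduced by a later patch in this series; declared the way
   * BPF programs usually declare kfuncs.
   */
  extern int bpf_timer_cancel_async(struct bpf_timer *timer) __ksym;

  struct elem {
          struct bpf_timer t;
  };

  struct {
          __uint(type, BPF_MAP_TYPE_ARRAY);
          __uint(max_entries, 1);
          __type(key, int);
          __type(value, struct elem);
  } timer_map SEC(".maps");

  SEC("tc") /* hypothetical attach point */
  int cancel_pending_timer(void *ctx)
  {
          int key = 0;
          struct elem *val;

          val = bpf_map_lookup_elem(&timer_map, &key);
          if (!val)
                  return 0;

          /* &val->t is a PTR_TO_MAP_VALUE pointing at a BPF_TIMER field;
           * with this patch the verifier accepts it as a kfunc argument.
           */
          bpf_timer_cancel_async(&val->t);
          return 0;
  }

  char _license[] SEC("license") = "GPL";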
Signed-off-by: Mykyta Yatsenko Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260201025403.66625-3-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 55 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6a616dc4dc54..40a8252140fb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8675,13 +8675,25 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, } static int process_timer_func(struct bpf_verifier_env *env, int regno, - struct bpf_call_arg_meta *meta) + struct bpf_map_desc *map) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) { verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n"); return -EOPNOTSUPP; } - return check_map_field_pointer(env, regno, BPF_TIMER, &meta->map); + return check_map_field_pointer(env, regno, BPF_TIMER, map); +} + +static int process_timer_helper(struct bpf_verifier_env *env, int regno, + struct bpf_call_arg_meta *meta) +{ + return process_timer_func(env, regno, &meta->map); +} + +static int process_timer_kfunc(struct bpf_verifier_env *env, int regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + return process_timer_func(env, regno, &meta->map); } static int process_kptr_func(struct bpf_verifier_env *env, int regno, @@ -9973,7 +9985,7 @@ skip_type_check: } break; case ARG_PTR_TO_TIMER: - err = process_timer_func(env, regno, meta); + err = process_timer_helper(env, regno, meta); if (err) return err; break; @@ -12238,7 +12250,8 @@ enum { KF_ARG_WORKQUEUE_ID, KF_ARG_RES_SPIN_LOCK_ID, KF_ARG_TASK_WORK_ID, - KF_ARG_PROG_AUX_ID + KF_ARG_PROG_AUX_ID, + KF_ARG_TIMER_ID }; BTF_ID_LIST(kf_arg_btf_ids) @@ -12251,6 +12264,7 @@ BTF_ID(struct, bpf_wq) BTF_ID(struct, bpf_res_spin_lock) BTF_ID(struct, bpf_task_work) BTF_ID(struct, bpf_prog_aux) +BTF_ID(struct, bpf_timer) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -12294,6 +12308,11 @@ static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_par return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID); } +static bool is_kfunc_arg_timer(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_TIMER_ID); +} + static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) { return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); @@ -12393,6 +12412,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_NULL, KF_ARG_PTR_TO_CONST_STR, KF_ARG_PTR_TO_MAP, + KF_ARG_PTR_TO_TIMER, KF_ARG_PTR_TO_WORKQUEUE, KF_ARG_PTR_TO_IRQ_FLAG, KF_ARG_PTR_TO_RES_SPIN_LOCK, @@ -12646,6 +12666,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_wq(meta->btf, &args[argno])) return KF_ARG_PTR_TO_WORKQUEUE; + if (is_kfunc_arg_timer(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_TIMER; + if (is_kfunc_arg_task_work(meta->btf, &args[argno])) return KF_ARG_PTR_TO_TASK_WORK; @@ -13439,6 +13462,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_REFCOUNTED_KPTR: case KF_ARG_PTR_TO_CONST_STR: case KF_ARG_PTR_TO_WORKQUEUE: + case KF_ARG_PTR_TO_TIMER: case KF_ARG_PTR_TO_TASK_WORK: case KF_ARG_PTR_TO_IRQ_FLAG: case KF_ARG_PTR_TO_RES_SPIN_LOCK: @@ -13738,6 +13762,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret < 0) return ret; break; + case 
KF_ARG_PTR_TO_TIMER: + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "arg#%d doesn't point to a map value\n", i); + return -EINVAL; + } + ret = process_timer_kfunc(env, regno, meta); + if (ret < 0) + return ret; + break; case KF_ARG_PTR_TO_TASK_WORK: if (reg->type != PTR_TO_MAP_VALUE) { verbose(env, "arg#%d doesn't point to a map value\n", i); @@ -21429,20 +21462,6 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } - if (btf_record_has_field(map->record, BPF_TIMER)) { - if (is_tracing_prog_type(prog_type)) { - verbose(env, "tracing progs cannot use bpf_timer yet\n"); - return -EINVAL; - } - } - - if (btf_record_has_field(map->record, BPF_WORKQUEUE)) { - if (is_tracing_prog_type(prog_type)) { - verbose(env, "tracing progs cannot use bpf_wq yet\n"); - return -EINVAL; - } - } - if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) && !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); -- cgit v1.2.3 From a7e172aa4ca276d12fe87ffddff9cbd2d95ea51c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sat, 31 Jan 2026 18:53:57 -0800 Subject: bpf: Introduce bpf_timer_cancel_async() kfunc Introduce bpf_timer_cancel_async() that wraps hrtimer_try_to_cancel() and executes it either synchronously or defers to irq_work. Co-developed-by: Mykyta Yatsenko Signed-off-by: Mykyta Yatsenko Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260201025403.66625-4-alexei.starovoitov@gmail.com --- kernel/bpf/helpers.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 354aa607df3e..d4aedac14a60 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4426,6 +4426,53 @@ __bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr) return 0; } +/** + * bpf_timer_cancel_async - try to deactivate a timer + * @timer: bpf_timer to stop + * + * Returns: + * + * * 0 when the timer was not active + * * 1 when the timer was active + * * -1 when the timer is currently executing the callback function and + * cannot be stopped + * * -ECANCELED when the timer will be cancelled asynchronously + * * -ENOMEM when out of memory + * * -EINVAL when the timer was not initialized + * * -ENOENT when this kfunc is racing with timer deletion + */ +__bpf_kfunc int bpf_timer_cancel_async(struct bpf_timer *timer) +{ + struct bpf_async_kern *async = (void *)timer; + struct bpf_async_cb *cb; + int ret; + + cb = READ_ONCE(async->cb); + if (!cb) + return -EINVAL; + + /* + * Unlike hrtimer_start() it's ok to synchronously call + * hrtimer_try_to_cancel() when refcnt reached zero, but deferring to + * irq_work is not, since irq callback may execute after RCU GP and + * cb could be freed at that time. Check for refcnt zero for + * consistency. + */ + if (!refcount_inc_not_zero(&cb->refcnt)) + return -ENOENT; + + if (!in_hardirq()) { + struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb); + + ret = hrtimer_try_to_cancel(&t->timer); + bpf_async_refcount_put(cb); + return ret; + } else { + ret = bpf_async_schedule_op(cb, BPF_ASYNC_CANCEL, 0, 0); + return ret ? 
ret : -ECANCELED; + } +} + __bpf_kfunc_end_defs(); static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work) @@ -4609,6 +4656,7 @@ BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_dynptr_from_file) BTF_ID_FLAGS(func, bpf_dynptr_file_discard) +BTF_ID_FLAGS(func, bpf_timer_cancel_async) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { -- cgit v1.2.3 From f06581392e9d56ac86d8fcc29c0931441ee82f4a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 3 Feb 2026 21:30:06 +0000 Subject: bpf: Use sk_is_inet() and sk_is_unix() in __cgroup_bpf_run_filter_sock_addr(). sk->sk_family should be read with READ_ONCE() in __cgroup_bpf_run_filter_sock_addr() due to IPV6_ADDRFORM. Also, the comment there is a bit stale since commit 859051dd165e ("bpf: Implement cgroup sockaddr hooks for unix sockets"), and the kdoc has the same comment. Let's use sk_is_inet() and sk_is_unix() and remove the comment. Acked-by: Stanislav Fomichev Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260203213442.682838-2-kuniyu@google.com --- kernel/bpf/cgroup.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 69988af44b37..b029f0369ecf 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1680,11 +1680,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct cgroup *cgrp; int ret; - /* Check socket family since not all sockets represent network - * endpoint (e.g. AF_UNIX). - */ - if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 && - sk->sk_family != AF_UNIX) + if (!sk_is_inet(sk) && !sk_is_unix(sk)) return 0; if (!ctx.uaddr) { -- cgit v1.2.3 From 7d49635e3775da946e536bc81ab55b2bca6b791d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 3 Feb 2026 21:51:44 -0800 Subject: bpf: Tighten conditions when timer/wq can be called synchronously Though hrtimer_start/cancel() inlines all of the smaller helpers in hrtimer.c and only call timerqueue_add/del() from lib/timerqueue.c where everything is not traceable and not kprobe-able (because all files in lib/ are not traceable), there are tracepoints within hrtimer that are called with locks held. Therefore prevent the deadlock by tightening conditions when timer/wq can be called synchronously. hrtimer/wq are using raw_spin_lock_irqsave(), so irqs_disabled() is enough. 
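One concrete path this guards against, sketched for illustration (the tracepoint name is real, but whether a given program hits this exact chain depends on where it is attached):

  /*
   * hrtimer_start_range_ns()                  some kernel hrtimer is armed
   *   raw_spin_lock_irqsave(&cpu_base->lock)  IRQs now disabled
   *   enqueue_hrtimer()
   *     trace_hrtimer_start()                 tracepoint fires under the lock
   *       BPF tracing prog
   *         bpf_timer_start()
   *           hrtimer_start()                 would contend on the same
   *                                           per-CPU base lock
   *
   * irqs_disabled() is true on this path, so defer_timer_wq_op() makes
   * bpf_timer_start() queue the operation to irq_work instead of calling
   * hrtimer_start() synchronously.
   */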
Fixes: 1bfbc267ec91 ("bpf: Enable bpf_timer and bpf_wq in any context") Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260204055147.54960-2-alexei.starovoitov@gmail.com --- kernel/bpf/helpers.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index d4aedac14a60..0517e9a8fc7c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1430,8 +1430,6 @@ static int bpf_async_update_prog_callback(struct bpf_async_cb *cb, static int bpf_async_schedule_op(struct bpf_async_cb *cb, enum bpf_async_op op, u64 nsec, u32 timer_mode) { - WARN_ON_ONCE(!in_hardirq()); - struct bpf_async_cmd *cmd = kmalloc_nolock(sizeof(*cmd), 0, NUMA_NO_NODE); if (!cmd) { @@ -1473,6 +1471,11 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = { .arg2_type = ARG_PTR_TO_FUNC, }; +static bool defer_timer_wq_op(void) +{ + return in_hardirq() || irqs_disabled(); +} + BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, async, u64, nsecs, u64, flags) { struct bpf_hrtimer *t; @@ -1500,7 +1503,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, async, u64, nsecs, u64, fla if (!refcount_inc_not_zero(&t->cb.refcnt)) return -ENOENT; - if (!in_hardirq()) { + if (!defer_timer_wq_op()) { hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode); bpf_async_refcount_put(&t->cb); return 0; @@ -1524,7 +1527,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, async) bool inc = false; int ret = 0; - if (in_hardirq()) + if (defer_timer_wq_op()) return -EOPNOTSUPP; t = READ_ONCE(async->timer); @@ -1650,7 +1653,7 @@ static void bpf_async_cancel_and_free(struct bpf_async_kern *async) * refcnt. Either synchronously or asynchronously in irq_work. */ - if (!in_hardirq()) { + if (!defer_timer_wq_op()) { bpf_async_process_op(cb, BPF_ASYNC_CANCEL, 0, 0); } else { (void)bpf_async_schedule_op(cb, BPF_ASYNC_CANCEL, 0, 0); @@ -3161,7 +3164,7 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) if (!refcount_inc_not_zero(&w->cb.refcnt)) return -ENOENT; - if (!in_hardirq()) { + if (!defer_timer_wq_op()) { schedule_work(&w->work); bpf_async_refcount_put(&w->cb); return 0; @@ -4461,7 +4464,7 @@ __bpf_kfunc int bpf_timer_cancel_async(struct bpf_timer *timer) if (!refcount_inc_not_zero(&cb->refcnt)) return -ENOENT; - if (!in_hardirq()) { + if (!defer_timer_wq_op()) { struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb); ret = hrtimer_try_to_cancel(&t->timer); -- cgit v1.2.3 From 64873307e888505ccc45ef049dccdcfef42d2f54 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 3 Feb 2026 21:51:46 -0800 Subject: bpf: Add a recursion check to prevent loops in bpf_timer Do not schedule timer/wq operation on a cpu that is in irq_work callback that is processing async_cmds queue. Otherwise the following loop is possible: bpf_timer_start() -> bpf_async_schedule_op() -> irq_work_queue(). irqrestore -> bpf_async_irq_worker() -> tracepoint -> bpf_timer_start(). 
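From the BPF program side the only visible change is a new error return. A minimal sketch, assuming 'timer_map' and 'struct elem' as in the earlier sketch and a timer that was already initialized and given a callback; the attach point is again hypothetical:

  SEC("tp_btf/softirq_entry") /* hypothetical attach point */
  int rearm_timer(void *ctx)
  {
          int key = 0;
          struct elem *val = bpf_map_lookup_elem(&timer_map, &key);

          if (!val)
                  return 0;

          if (bpf_timer_start(&val->t, 1000 /* ns */, 0) < 0) {
                  /* May now be -EDEADLK when this program is entered from a
                   * tracepoint hit while bpf_async_irq_worker() is draining
                   * this cb's queue on the same CPU. Give up instead of
                   * retrying in a loop.
                   */
          }
          return 0;
  }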
Fixes: 1bfbc267ec91 ("bpf: Enable bpf_timer and bpf_wq in any context") Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260204055147.54960-4-alexei.starovoitov@gmail.com --- kernel/bpf/helpers.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 0517e9a8fc7c..01052f8664eb 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1427,9 +1427,23 @@ static int bpf_async_update_prog_callback(struct bpf_async_cb *cb, return 0; } +static DEFINE_PER_CPU(struct bpf_async_cb *, async_cb_running); + static int bpf_async_schedule_op(struct bpf_async_cb *cb, enum bpf_async_op op, u64 nsec, u32 timer_mode) { + /* + * Do not schedule another operation on this cpu if it's in irq_work + * callback that is processing async_cmds queue. Otherwise the following + * loop is possible: + * bpf_timer_start() -> bpf_async_schedule_op() -> irq_work_queue(). + * irqrestore -> bpf_async_irq_worker() -> tracepoint -> bpf_timer_start(). + */ + if (this_cpu_read(async_cb_running) == cb) { + bpf_async_refcount_put(cb); + return -EDEADLK; + } + struct bpf_async_cmd *cmd = kmalloc_nolock(sizeof(*cmd), 0, NUMA_NO_NODE); if (!cmd) { @@ -1628,6 +1642,7 @@ static void bpf_async_irq_worker(struct irq_work *work) return; list = llist_reverse_order(list); + this_cpu_write(async_cb_running, cb); llist_for_each_safe(pos, n, list) { struct bpf_async_cmd *cmd; @@ -1635,6 +1650,7 @@ static void bpf_async_irq_worker(struct irq_work *work) bpf_async_process_op(cb, cmd->op, cmd->nsec, cmd->mode); kfree_nolock(cmd); } + this_cpu_write(async_cb_running, NULL); } static void bpf_async_cancel_and_free(struct bpf_async_kern *async) -- cgit v1.2.3 From 9d21199842247ab05c675fb9b6c6ca393a5c0024 Mon Sep 17 00:00:00 2001 From: Tianci Cao Date: Wed, 4 Feb 2026 19:15:02 +0800 Subject: bpf: Add bitwise tracking for BPF_END This patch implements bitwise tracking (tnum analysis) for BPF_END (byte swap) operation. Currently, the BPF verifier does not track value for BPF_END operation, treating the result as completely unknown. This limits the verifier's ability to prove safety of programs that perform endianness conversions, which are common in networking code. For example, the following code pattern for port number validation: int test(struct pt_regs *ctx) { __u64 x = bpf_get_prandom_u32(); x &= 0x3f00; // Range: [0, 0x3f00], var_off: (0x0; 0x3f00) x = bswap16(x); // Should swap to range [0, 0x3f], var_off: (0x0; 0x3f) if (x > 0x3f) goto trap; return 0; trap: return *(u64 *)NULL; // Should be unreachable } Currently generates verifier output: 1: (54) w0 &= 16128 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=16128,var_off=(0x0; 0x3f00)) 2: (d7) r0 = bswap16 r0 ; R0=scalar() 3: (25) if r0 > 0x3f goto pc+2 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=63,var_off=(0x0; 0x3f)) Without this patch, even though the verifier knows `x` has certain bits set, after bswap16, it loses all tracking information and treats port as having a completely unknown value [0, 65535]. According to the BPF instruction set[1], there are 3 kinds of BPF_END: 1. `bswap(16|32|64)`: opcode=0xd7 (BPF_END | BPF_ALU64 | BPF_TO_LE) - do unconditional swap 2. `le(16|32|64)`: opcode=0xd4 (BPF_END | BPF_ALU | BPF_TO_LE) - on big-endian: do swap - on little-endian: truncation (16/32-bit) or no-op (64-bit) 3. 
`be(16|32|64)`: opcode=0xdc (BPF_END | BPF_ALU | BPF_TO_BE) - on little-endian: do swap - on big-endian: truncation (16/32-bit) or no-op (64-bit) Since BPF_END operations are inherently bit-wise permutations, tnum (bitwise tracking) offers the most efficient and precise mechanism for value analysis. By implementing `tnum_bswap16`, `tnum_bswap32`, and `tnum_bswap64`, we can derive exact `var_off` values concisely, directly reflecting the bit-level changes. Here is the overview of changes: 1. In `tnum_bswap(16|32|64)` (kernel/bpf/tnum.c): Call `swab(16|32|64)` function on the value and mask of `var_off`, and do truncation for 16/32-bit cases. 2. In `adjust_scalar_min_max_vals` (kernel/bpf/verifier.c): Call helper function `scalar_byte_swap`. - Only do byte swap when * alu64 (unconditional swap) OR * switching between big-endian and little-endian machines. - If need do byte swap: * Firstly call `tnum_bswap(16|32|64)` to update `var_off`. * Then reset the bound since byte swap scrambles the range. - For 16/32-bit cases, truncate dst register to match the swapped size. This enables better verification of networking code that frequently uses byte swaps for protocol processing, reducing false positive rejections. [1] https://www.kernel.org/doc/Documentation/bpf/standardization/instruction-set.rst Co-developed-by: Shenghao Yuan Signed-off-by: Shenghao Yuan Co-developed-by: Yazhou Tang Signed-off-by: Yazhou Tang Signed-off-by: Tianci Cao Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260204111503.77871-2-ziye@zju.edu.cn Signed-off-by: Alexei Starovoitov --- include/linux/tnum.h | 5 +++++ kernel/bpf/tnum.c | 16 ++++++++++++++ kernel/bpf/verifier.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 78 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/tnum.h b/include/linux/tnum.h index c52b862dad45..fa4654ffb621 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -63,6 +63,11 @@ struct tnum tnum_union(struct tnum t1, struct tnum t2); /* Return @a with all but the lowest @size bytes cleared */ struct tnum tnum_cast(struct tnum a, u8 size); +/* Swap the bytes of a tnum */ +struct tnum tnum_bswap16(struct tnum a); +struct tnum tnum_bswap32(struct tnum a); +struct tnum tnum_bswap64(struct tnum a); + /* Returns true if @a is a known constant */ static inline bool tnum_is_const(struct tnum a) { diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index f8e70e9c3998..26fbfbb01700 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -8,6 +8,7 @@ */ #include #include +#include #define TNUM(_v, _m) (struct tnum){.value = _v, .mask = _m} /* A completely unknown value */ @@ -253,3 +254,18 @@ struct tnum tnum_const_subreg(struct tnum a, u32 value) { return tnum_with_subreg(a, tnum_const(value)); } + +struct tnum tnum_bswap16(struct tnum a) +{ + return TNUM(swab16(a.value & 0xFFFF), swab16(a.mask & 0xFFFF)); +} + +struct tnum tnum_bswap32(struct tnum a) +{ + return TNUM(swab32(a.value & 0xFFFFFFFF), swab32(a.mask & 0xFFFFFFFF)); +} + +struct tnum tnum_bswap64(struct tnum a) +{ + return TNUM(swab64(a.value), swab64(a.mask)); +} diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 40a8252140fb..92e03a5a50f5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15832,6 +15832,48 @@ static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg, __update_reg_bounds(dst_reg); } +static void scalar_byte_swap(struct bpf_reg_state *dst_reg, struct bpf_insn *insn) +{ + /* + * Byte swap operation - update var_off using 
tnum_bswap. + * Three cases: + * 1. bswap(16|32|64): opcode=0xd7 (BPF_END | BPF_ALU64 | BPF_TO_LE) + * unconditional swap + * 2. to_le(16|32|64): opcode=0xd4 (BPF_END | BPF_ALU | BPF_TO_LE) + * swap on big-endian, truncation or no-op on little-endian + * 3. to_be(16|32|64): opcode=0xdc (BPF_END | BPF_ALU | BPF_TO_BE) + * swap on little-endian, truncation or no-op on big-endian + */ + + bool alu64 = BPF_CLASS(insn->code) == BPF_ALU64; + bool to_le = BPF_SRC(insn->code) == BPF_TO_LE; + bool is_big_endian; +#ifdef CONFIG_CPU_BIG_ENDIAN + is_big_endian = true; +#else + is_big_endian = false; +#endif + /* Apply bswap if alu64 or switch between big-endian and little-endian machines */ + bool need_bswap = alu64 || (to_le == is_big_endian); + + if (need_bswap) { + if (insn->imm == 16) + dst_reg->var_off = tnum_bswap16(dst_reg->var_off); + else if (insn->imm == 32) + dst_reg->var_off = tnum_bswap32(dst_reg->var_off); + else if (insn->imm == 64) + dst_reg->var_off = tnum_bswap64(dst_reg->var_off); + /* + * Byteswap scrambles the range, so we must reset bounds. + * Bounds will be re-derived from the new tnum later. + */ + __mark_reg_unbounded(dst_reg); + } + /* For bswap16/32, truncate dst register to match the swapped size */ + if (insn->imm == 16 || insn->imm == 32) + coerce_reg_to_size(dst_reg, insn->imm / 8); +} + static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, const struct bpf_reg_state *src_reg) { @@ -15858,6 +15900,7 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, case BPF_XOR: case BPF_OR: case BPF_MUL: + case BPF_END: return true; /* @@ -16047,12 +16090,23 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, else scalar_min_max_arsh(dst_reg, &src_reg); break; + case BPF_END: + scalar_byte_swap(dst_reg, insn); + break; default: break; } - /* ALU32 ops are zero extended into 64bit register */ - if (alu32) + /* + * ALU32 ops are zero extended into 64bit register. + * + * BPF_END is already handled inside the helper (truncation), + * so skip zext here to avoid unexpected zero extension. + * e.g., le64: opcode=(BPF_END|BPF_ALU|BPF_TO_LE), imm=0x40 + * This is a 64bit byte swap operation with alu32==true, + * but we should not zero extend the result. + */ + if (alu32 && opcode != BPF_END) zext_32_to_64(dst_reg); reg_bounds_sync(dst_reg); return 0; @@ -16232,7 +16286,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check dest operand */ - if (opcode == BPF_NEG && + if ((opcode == BPF_NEG || opcode == BPF_END) && regs[insn->dst_reg].type == SCALAR_VALUE) { err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); err = err ?: adjust_scalar_min_max_vals(env, insn, -- cgit v1.2.3 From 7a433e519364c3c19643e5c857f4fbfaebec441c Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 4 Feb 2026 07:17:37 -0800 Subject: bpf: Support negative offsets, BPF_SUB, and alu32 for linked register tracking Previously, the verifier only tracked positive constant deltas between linked registers using BPF_ADD. This limitation meant patterns like: r1 = r0; r1 += -4; if r1 s>= 0 goto l0_%=; // r1 >= 0 implies r0 >= 4 // verifier couldn't propagate bounds back to r0 if r0 != 0 goto l0_%=; r0 /= 0; // Verifier thinks this is reachable l0_%=: Similar limitation exists for 32-bit registers. With this change, the verifier can now track negative deltas in reg->off enabling bound propagation for the above pattern. For alu32, we make sure the destination register has the upper 32 bits as 0s before creating the link. 
BPF_ADD_CONST is split into BPF_ADD_CONST64 and BPF_ADD_CONST32, the latter is used in case of alu32 and sync_linked_regs uses this to zext the result if known_reg has this flag. Signed-off-by: Puranjay Mohan Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260204151741.2678118-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 6 ++- kernel/bpf/verifier.c | 50 +++++++++++++++++----- .../testing/selftests/bpf/progs/verifier_bounds.c | 2 +- 3 files changed, 45 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 746025df82c8..ef8e45a362d9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -147,8 +147,12 @@ struct bpf_reg_state { * registers. Example: * r1 = r2; both will have r1->id == r2->id == N * r1 += 10; r1->id == N | BPF_ADD_CONST and r1->off == 10 + * r3 = r2; both will have r3->id == r2->id == N + * w3 += 10; r3->id == N | BPF_ADD_CONST32 and r3->off == 10 */ -#define BPF_ADD_CONST (1U << 31) +#define BPF_ADD_CONST64 (1U << 31) +#define BPF_ADD_CONST32 (1U << 30) +#define BPF_ADD_CONST (BPF_ADD_CONST64 | BPF_ADD_CONST32) u32 id; /* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned * from a pointer-cast helper, bpf_sk_fullsock() and diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 92e03a5a50f5..edf5342b982f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16209,6 +16209,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, verbose(env, "verifier internal error: no src_reg\n"); return -EFAULT; } + /* + * For alu32 linked register tracking, we need to check dst_reg's + * umax_value before the ALU operation. After adjust_scalar_min_max_vals(), + * alu32 ops will have zero-extended the result, making umax_value <= U32_MAX. + */ + u64 dst_umax = dst_reg->umax_value; + err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); if (err) return err; @@ -16218,26 +16225,44 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * r1 += 0x1 * if r2 < 1000 goto ... * use r1 in memory access - * So for 64-bit alu remember constant delta between r2 and r1 and - * update r1 after 'if' condition. + * So remember constant delta between r2 and r1 and update r1 after + * 'if' condition. */ if (env->bpf_capable && - BPF_OP(insn->code) == BPF_ADD && !alu32 && - dst_reg->id && is_reg_const(src_reg, false)) { - u64 val = reg_const_value(src_reg, false); + (BPF_OP(insn->code) == BPF_ADD || BPF_OP(insn->code) == BPF_SUB) && + dst_reg->id && is_reg_const(src_reg, alu32)) { + u64 val = reg_const_value(src_reg, alu32); + s32 off; + + if (!alu32 && ((s64)val < S32_MIN || (s64)val > S32_MAX)) + goto clear_id; + + if (alu32 && (dst_umax > U32_MAX)) + goto clear_id; - if ((dst_reg->id & BPF_ADD_CONST) || - /* prevent overflow in sync_linked_regs() later */ - val > (u32)S32_MAX) { + off = (s32)val; + + if (BPF_OP(insn->code) == BPF_SUB) { + /* Negating S32_MIN would overflow */ + if (off == S32_MIN) + goto clear_id; + off = -off; + } + + if (dst_reg->id & BPF_ADD_CONST) { /* * If the register already went through rX += val * we cannot accumulate another val into rx->off. 
*/ +clear_id: dst_reg->off = 0; dst_reg->id = 0; } else { - dst_reg->id |= BPF_ADD_CONST; - dst_reg->off = val; + if (alu32) + dst_reg->id |= BPF_ADD_CONST32; + else + dst_reg->id |= BPF_ADD_CONST64; + dst_reg->off = off; } } else { /* @@ -17334,7 +17359,7 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s u32 saved_id = reg->id; fake_reg.type = SCALAR_VALUE; - __mark_reg_known(&fake_reg, (s32)reg->off - (s32)known_reg->off); + __mark_reg_known(&fake_reg, (s64)reg->off - (s64)known_reg->off); /* reg = known_reg; reg += delta */ copy_register_state(reg, known_reg); @@ -17349,6 +17374,9 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s scalar32_min_max_add(reg, &fake_reg); scalar_min_max_add(reg, &fake_reg); reg->var_off = tnum_add(reg->var_off, fake_reg.var_off); + if (known_reg->id & BPF_ADD_CONST32) + zext_32_to_64(reg); + reg_bounds_sync(reg); } if (e->is_reg) mark_reg_scratched(env, e->regno); diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index 411a18437d7e..560531404bce 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -1477,7 +1477,7 @@ __naked void sub64_full_overflow(void) SEC("socket") __description("64-bit subtraction, partial overflow, result in unbounded reg") __success __log_level(2) -__msg("3: (1f) r3 -= r2 {{.*}} R3=scalar()") +__msg("3: (1f) r3 -= r2 {{.*}} R3=scalar(id=1-1)") __retval(0) __naked void sub64_partial_overflow(void) { -- cgit v1.2.3 From 81502d7f20bf862b706f5174979bed88d3ab82b3 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 4 Feb 2026 16:38:52 -0800 Subject: bpf: Check for running wq callback when freeing bpf_async_cb When freeing a bpf_async_cb in bpf_async_cb_rcu_tasks_trace_free(), in case the wq callback is not scheduled, doing cancel_work() currently returns false and leads to retry of RCU tasks trace grace period. If the callback is never scheduled, we keep retrying indefinitely and don't put the prog reference. Since the only race we care about here is against a potentially running wq callback in the first grace period, it should finish by the second grace period, hence check work_busy() result to detect presence of running wq callback if it's not pending, otherwise free the object immediately without retrying. Reasoning behind the check and its correctness with racing wq callback invocation: cancel_work is supposed to be synchronized, hence calling it first and getting false would mean that work is definitely not pending, at this point, either the work is not scheduled at all or already running, or we race and it already finished by the time we checked for it using work_busy(). In case it is running, we synchronize using pool->lock to check the current work running there, if we match, it means we extend the wait by another grace period using retry = true, otherwise either the work already finished running or was never scheduled, so we can free the bpf_async_cb right away. 
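The resulting check, restated as a sketch for readability (workqueue API exactly as used in the one-line diff below; 'w' is the bpf_work containing the cb):

  case BPF_ASYNC_TYPE_WQ:
          if (!cancel_work(&w->work)) {
                  /* Not pending: the callback was either never scheduled,
                   * already finished, or is running right now. Only the
                   * last case needs to extend the wait by another grace
                   * period.
                   */
                  if (work_busy(&w->work))
                          retry = true;
          }
          break;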
Fixes: 1bfbc267ec91 ("bpf: Enable bpf_timer and bpf_wq in any context") Reported-by: Alexei Starovoitov Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260205003853.527571-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 01052f8664eb..b7aec34540c2 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1257,7 +1257,7 @@ static void bpf_async_cb_rcu_tasks_trace_free(struct rcu_head *rcu) retry = true; break; case BPF_ASYNC_TYPE_WQ: - if (!cancel_work(&w->work)) + if (!cancel_work(&w->work) && work_busy(&w->work)) retry = true; break; } -- cgit v1.2.3 From 5000a097f82c7695b7760c5b67c95f0eab4d209b Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 4 Feb 2026 16:38:53 -0800 Subject: bpf: Reset prog callback in bpf_async_cancel_and_free() Replace prog and callback in bpf_async_cb after removing visibility of bpf_async_cb in bpf_async_cancel_and_free() to increase the chances the scheduled async callbacks short-circuit execution and exit early, and not starting a RCU tasks trace section. This improves the overall time spent in running the wq selftest. Suggested-by: Alexei Starovoitov Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260205003853.527571-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b7aec34540c2..a4f039cee88b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1664,6 +1664,7 @@ static void bpf_async_cancel_and_free(struct bpf_async_kern *async) if (!cb) return; + bpf_async_update_prog_callback(cb, NULL, NULL); /* * No refcount_inc_not_zero(&cb->refcnt) here. Dropping the last * refcnt. Either synchronously or asynchronously in irq_work. -- cgit v1.2.3 From ea1535e28bb3773fc0b3cbd1f3842b808016990c Mon Sep 17 00:00:00 2001 From: KP Singh Date: Thu, 5 Feb 2026 07:38:07 +0100 Subject: bpf: Limit bpf program signature size Practical BPF signatures are significantly smaller than KMALLOC_MAX_CACHE_SIZE Allowing larger sizes opens the door for abuse by passing excessive size values and forcing the kernel into expensive allocation paths (via kmalloc_large or vmalloc). Fixes: 349271568303 ("bpf: Implement signature verification for BPF programs") Reported-by: Chris Mason Signed-off-by: KP Singh Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260205063807.690823-1-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5f59dd47a5b1..93bc0f4c65c5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2813,6 +2813,13 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr void *sig; int err = 0; + /* + * Don't attempt to use kmalloc_large or vmalloc for signatures. + * Practical signature for BPF program should be below this limit. 
+ */ + if (attr->signature_size > KMALLOC_MAX_CACHE_SIZE) + return -EINVAL; + if (system_keyring_id_check(attr->keyring_id) == 0) key = bpf_lookup_system_key(attr->keyring_id); else -- cgit v1.2.3 From a2c86aa621c22f2a7e26c654f936d65cfff0aa91 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Thu, 5 Feb 2026 08:07:55 +0100 Subject: bpf: Require frozen map for calculating map hash Currently, bpf_map_get_info_by_fd calculates and caches the hash of the map regardless of the map's frozen state. This leads to a TOCTOU bug where userspace can call BPF_OBJ_GET_INFO_BY_FD to cache the hash and then modify the map contents before freezing. Therefore, a trusted loader can be tricked into verifying the stale hash while loading the modified contents. Fix this by returning -EPERM if the map is not frozen when the hash is requested. This ensures the hash is only generated for the final, immutable state of the map. Fixes: ea2e6467ac36 ("bpf: Return hashes of maps in BPF_OBJ_GET_INFO_BY_FD") Reported-by: Toshi Piazza Signed-off-by: KP Singh Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260205070755.695776-1-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 93bc0f4c65c5..683c332dbafb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5328,6 +5328,9 @@ static int bpf_map_get_info_by_fd(struct file *file, if (info.hash_size != SHA256_DIGEST_SIZE) return -EINVAL; + if (!READ_ONCE(map->frozen)) + return -EPERM; + err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha); if (err != 0) return err; -- cgit v1.2.3 From 1ace9bac1ad2bc6a0a70baaa16d22b7e783e88c5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 5 Feb 2026 11:02:33 -0800 Subject: bpf: Prevent reentrance into call_rcu_tasks_trace() call_rcu_tasks_trace() is not safe from in_nmi() and not reentrant. To prevent deadlock on raw_spin_lock_rcu_node(rtpcp) or memory corruption defer to irq_work when IRQs are disabled. call_rcu_tasks_generic() protects itself with local_irq_save(). Note when bpf_async_cb->refcnt drops to zero it's safe to reuse bpf_async_cb->worker for a different irq_work callback, since bpf_async_schedule_op() -> irq_work_queue(&cb->worker); is only called when refcnt >= 1. 
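The reentrance being avoided, sketched for illustration (the exact traceable call site inside RCU may differ; the point is only that the last refcount drop can happen with the rtpcp lock held and IRQs off):

  /*
   * call_rcu_tasks_trace()
   *   call_rcu_tasks_generic()
   *     raw_spin_lock_rcu_node(rtpcp)        IRQs disabled, lock held
   *       -> a tracing BPF prog fires on this path
   *         -> drops the last bpf_async_cb refcount
   *           -> call_rcu_tasks_trace() again on the same lock: deadlock
   *
   * With the fix, bpf_async_refcount_put() sees irqs_disabled(), queues
   * cb->worker instead, and the irq_work callback issues
   * call_rcu_tasks_trace() only after IRQs are re-enabled, i.e. after the
   * locked region has been left.
   */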
Fixes: 1bfbc267ec91 ("bpf: Enable bpf_timer and bpf_wq in any context") Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260205190233.912-1-alexei.starovoitov@gmail.com --- kernel/bpf/helpers.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a4f039cee88b..0458597134da 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1276,12 +1276,24 @@ static void bpf_async_cb_rcu_tasks_trace_free(struct rcu_head *rcu) bpf_async_cb_rcu_free(rcu); } +static void worker_for_call_rcu(struct irq_work *work) +{ + struct bpf_async_cb *cb = container_of(work, struct bpf_async_cb, worker); + + call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free); +} + static void bpf_async_refcount_put(struct bpf_async_cb *cb) { if (!refcount_dec_and_test(&cb->refcnt)) return; - call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free); + if (irqs_disabled()) { + cb->worker = IRQ_WORK_INIT(worker_for_call_rcu); + irq_work_queue(&cb->worker); + } else { + call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free); + } } static void bpf_async_cancel_and_free(struct bpf_async_kern *async); -- cgit v1.2.3 From 0ccef7079ea8d5f7b896b14be7e400022ff240c6 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:28:59 -0800 Subject: bpf: Select bpf_local_storage_map_bucket based on bpf_local_storage A later bpf_local_storage refactor will acquire all locks before performing any update. To simplified the number of locks needed to take in bpf_local_storage_map_update(), determine the bucket based on the local_storage an selem belongs to instead of the selem pointer. Currently, when a new selem needs to be created to replace the old selem in bpf_local_storage_map_update(), locks of both buckets need to be acquired to prevent racing. This can be simplified if the two selem belongs to the same bucket so that only one bucket needs to be locked. Therefore, instead of hashing selem, hashing the local_storage pointer the selem belongs. Performance wise, this is slightly better as update now requires locking one bucket. It should not change the level of contention on one bucket as the pointers to local storages of selems in a map are just as unique as pointers to selems. 
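The keying change, sketched (hash_ptr() and the smap fields as in select_bucket() in the diff below; old_selem/new_selem stand for the element being replaced and its replacement during an update):

  /* Old keying: buckets chosen by selem pointer, so the old and new selem
   * usually hash to different buckets and the replace spans two bucket
   * locks.
   */
  b_old = &smap->buckets[hash_ptr(old_selem, smap->bucket_log)];
  b_new = &smap->buckets[hash_ptr(new_selem, smap->bucket_log)];

  /* New keying: both selems belong to the same local_storage, so they hash
   * to the same bucket and a single bucket lock covers the replacement.
   */
  b = &smap->buckets[hash_ptr(local_storage, smap->bucket_log)];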
Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-2-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 1 + kernel/bpf/bpf_local_storage.c | 17 +++++++++++------ net/core/bpf_sk_storage.c | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 66432248cd81..2638487425b8 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -179,6 +179,7 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); void bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); struct bpf_local_storage_elem * diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index e2fe6c32822b..91b28f4e3130 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -19,9 +19,9 @@ static struct bpf_local_storage_map_bucket * select_bucket(struct bpf_local_storage_map *smap, - struct bpf_local_storage_elem *selem) + struct bpf_local_storage *local_storage) { - return &smap->buckets[hash_ptr(selem, smap->bucket_log)]; + return &smap->buckets[hash_ptr(local_storage, smap->bucket_log)]; } static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) @@ -349,6 +349,7 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) { + struct bpf_local_storage *local_storage; struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; unsigned long flags; @@ -357,8 +358,10 @@ static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) /* selem has already be unlinked from smap */ return; + local_storage = rcu_dereference_check(selem->local_storage, + bpf_rcu_lock_held()); smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - b = select_bucket(smap, selem); + b = select_bucket(smap, local_storage); raw_spin_lock_irqsave(&b->lock, flags); if (likely(selem_linked_to_map(selem))) hlist_del_init_rcu(&selem->map_node); @@ -366,11 +369,13 @@ static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) } void bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { - struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem); + struct bpf_local_storage_map_bucket *b; unsigned long flags; + b = select_bucket(smap, local_storage); raw_spin_lock_irqsave(&b->lock, flags); hlist_add_head_rcu(&selem->map_node, &b->list); raw_spin_unlock_irqrestore(&b->lock, flags); @@ -448,7 +453,7 @@ int bpf_local_storage_alloc(void *owner, storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; bpf_selem_link_storage_nolock(storage, first_selem); - bpf_selem_link_map(smap, first_selem); + bpf_selem_link_map(smap, storage, first_selem); owner_storage_ptr = (struct bpf_local_storage **)owner_storage(smap, owner); @@ -576,7 +581,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, alloc_selem = NULL; /* First, link the new selem to the map */ - bpf_selem_link_map(smap, selem); + bpf_selem_link_map(smap, local_storage, selem); /* Second, link (and publish) the new selem to local_storage */ 
bpf_selem_link_storage_nolock(local_storage, selem); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index de111818f3a0..e36273e4fcbd 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -191,7 +191,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) } if (new_sk_storage) { - bpf_selem_link_map(smap, copy_selem); + bpf_selem_link_map(smap, new_sk_storage, copy_selem); bpf_selem_link_storage_nolock(new_sk_storage, copy_selem); } else { ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC); -- cgit v1.2.3 From 1b7e0cae85accc9e728004511193e995cd040300 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:00 -0800 Subject: bpf: Convert bpf_selem_unlink_map to failable To prepare for changing bpf_local_storage_map_bucket::lock to rqspinlock, convert bpf_selem_unlink_map() to failable. It still always succeeds and returns 0 for now. Since some operations updating local storage cannot fail in the middle, open-code bpf_selem_unlink_map() to take the b->lock before the operation. There are two such locations: - bpf_local_storage_alloc() The first selem will be unlinked from smap if cmpxchg owner_storage_ptr fails, which should not fail. Therefore, hold b->lock when linking until allocation complete. Helpers that assume b->lock is held by callers are introduced: bpf_selem_link_map_nolock() and bpf_selem_unlink_map_nolock(). - bpf_local_storage_update() The three step update process: link_map(new_selem), link_storage(new_selem), and unlink_map(old_selem) should not fail in the middle. In bpf_selem_unlink(), bpf_selem_unlink_map() and bpf_selem_unlink_storage() should either all succeed or fail as a whole instead of failing in the middle. So, return if unlink_map() failed. Remove the selem_linked_to_map_lockless() check as an selem in the common paths (not bpf_local_storage_map_free() or bpf_local_storage_destroy()), will be unlinked under b->lock and local_storage->lock and therefore no other threads can unlink the selem from map at the same time. In bpf_local_storage_destroy(), ignore the return of bpf_selem_unlink_map() for now. A later patch will allow bpf_local_storage_destroy() to unlink selems even when failing to acquire locks. Note that while this patch removes all callers of selem_linked_to_map(), a later patch that introduces bpf_selem_unlink_nofail() will use it again. 
Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-3-ameryhung@gmail.com --- kernel/bpf/bpf_local_storage.c | 57 +++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 91b28f4e3130..6fa71502c7d7 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -61,11 +61,6 @@ static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem) return !hlist_unhashed(&selem->snode); } -static bool selem_linked_to_map_lockless(const struct bpf_local_storage_elem *selem) -{ - return !hlist_unhashed_lockless(&selem->map_node); -} - static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) { return !hlist_unhashed(&selem->map_node); @@ -347,25 +342,27 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, hlist_add_head_rcu(&selem->snode, &local_storage->list); } -static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) +static int bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) { struct bpf_local_storage *local_storage; struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; unsigned long flags; - if (unlikely(!selem_linked_to_map_lockless(selem))) - /* selem has already be unlinked from smap */ - return; - local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); b = select_bucket(smap, local_storage); raw_spin_lock_irqsave(&b->lock, flags); - if (likely(selem_linked_to_map(selem))) - hlist_del_init_rcu(&selem->map_node); + hlist_del_init_rcu(&selem->map_node); raw_spin_unlock_irqrestore(&b->lock, flags); + + return 0; +} + +static void bpf_selem_unlink_map_nolock(struct bpf_local_storage_elem *selem) +{ + hlist_del_init_rcu(&selem->map_node); } void bpf_selem_link_map(struct bpf_local_storage_map *smap, @@ -381,13 +378,24 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, raw_spin_unlock_irqrestore(&b->lock, flags); } +static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b, + struct bpf_local_storage_elem *selem) +{ + hlist_add_head_rcu(&selem->map_node, &b->list); +} + void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) { + int err; + /* Always unlink from map before unlinking from local_storage * because selem will be freed after successfully unlinked from * the local_storage. 
*/ - bpf_selem_unlink_map(selem); + err = bpf_selem_unlink_map(selem); + if (err) + return; + bpf_selem_unlink_storage(selem, reuse_now); } @@ -429,6 +437,8 @@ int bpf_local_storage_alloc(void *owner, { struct bpf_local_storage *prev_storage, *storage; struct bpf_local_storage **owner_storage_ptr; + struct bpf_local_storage_map_bucket *b; + unsigned long flags; int err; err = mem_charge(smap, owner, sizeof(*storage)); @@ -453,7 +463,10 @@ int bpf_local_storage_alloc(void *owner, storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; bpf_selem_link_storage_nolock(storage, first_selem); - bpf_selem_link_map(smap, storage, first_selem); + + b = select_bucket(smap, storage); + raw_spin_lock_irqsave(&b->lock, flags); + bpf_selem_link_map_nolock(b, first_selem); owner_storage_ptr = (struct bpf_local_storage **)owner_storage(smap, owner); @@ -469,10 +482,12 @@ int bpf_local_storage_alloc(void *owner, */ prev_storage = cmpxchg(owner_storage_ptr, NULL, storage); if (unlikely(prev_storage)) { - bpf_selem_unlink_map(first_selem); + bpf_selem_unlink_map_nolock(first_selem); + raw_spin_unlock_irqrestore(&b->lock, flags); err = -EAGAIN; goto uncharge; } + raw_spin_unlock_irqrestore(&b->lock, flags); return 0; @@ -494,8 +509,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, struct bpf_local_storage_data *old_sdata = NULL; struct bpf_local_storage_elem *alloc_selem, *selem = NULL; struct bpf_local_storage *local_storage; + struct bpf_local_storage_map_bucket *b; HLIST_HEAD(old_selem_free_list); - unsigned long flags; + unsigned long flags, b_flags; int err; /* BPF_EXIST and BPF_NOEXIST cannot be both set */ @@ -579,20 +595,25 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, goto unlock; } + b = select_bucket(smap, local_storage); + + raw_spin_lock_irqsave(&b->lock, b_flags); + alloc_selem = NULL; /* First, link the new selem to the map */ - bpf_selem_link_map(smap, local_storage, selem); + bpf_selem_link_map_nolock(b, selem); /* Second, link (and publish) the new selem to local_storage */ bpf_selem_link_storage_nolock(local_storage, selem); /* Third, remove old selem, SELEM(old_sdata) */ if (old_sdata) { - bpf_selem_unlink_map(SELEM(old_sdata)); + bpf_selem_unlink_map_nolock(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), &old_selem_free_list); } + raw_spin_unlock_irqrestore(&b->lock, b_flags); unlock: raw_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&old_selem_free_list, false); -- cgit v1.2.3 From fd103ffc57c9a3b8b76bc852ffae5eb630a6ded4 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:01 -0800 Subject: bpf: Convert bpf_selem_link_map to failable To prepare for changing bpf_local_storage_map_bucket::lock to rqspinlock, convert bpf_selem_link_map() to failable. It still always succeeds and returns 0 until the change happens. No functional change. 
Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-4-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 6 +++--- kernel/bpf/bpf_local_storage.c | 8 +++++--- net/core/bpf_sk_storage.c | 9 ++++++++- 3 files changed, 16 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 2638487425b8..709506e982a6 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -178,9 +178,9 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); -void bpf_selem_link_map(struct bpf_local_storage_map *smap, - struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem); +int bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem); struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 6fa71502c7d7..2f94ca4a4475 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -365,9 +365,9 @@ static void bpf_selem_unlink_map_nolock(struct bpf_local_storage_elem *selem) hlist_del_init_rcu(&selem->map_node); } -void bpf_selem_link_map(struct bpf_local_storage_map *smap, - struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem) +int bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map_bucket *b; unsigned long flags; @@ -376,6 +376,8 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, raw_spin_lock_irqsave(&b->lock, flags); hlist_add_head_rcu(&selem->map_node, &b->list); raw_spin_unlock_irqrestore(&b->lock, flags); + + return 0; } static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b, diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index e36273e4fcbd..0b85d8f2c17e 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -191,7 +191,14 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) } if (new_sk_storage) { - bpf_selem_link_map(smap, new_sk_storage, copy_selem); + ret = bpf_selem_link_map(smap, new_sk_storage, copy_selem); + if (ret) { + bpf_selem_free(copy_selem, true); + atomic_sub(smap->elem_size, + &newsk->sk_omem_alloc); + bpf_map_put(map); + goto out; + } bpf_selem_link_storage_nolock(new_sk_storage, copy_selem); } else { ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC); -- cgit v1.2.3 From 403e935f915896243ff93f9a2ff44e5bb6619032 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:02 -0800 Subject: bpf: Convert bpf_selem_unlink to failable To prepare changing both bpf_local_storage_map_bucket::lock and bpf_local_storage::lock to rqspinlock, convert bpf_selem_unlink() to failable. It still always succeeds and returns 0 until the change happens. No functional change. Open code bpf_selem_unlink_storage() in the only caller, bpf_selem_unlink(), since unlink_map and unlink_storage must be done together after all the necessary locks are acquired. For bpf_local_storage_map_free(), ignore the return from bpf_selem_unlink() for now. 
A later patch will allow it to unlink selems even when failing to acquire locks. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-5-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 2 +- kernel/bpf/bpf_cgrp_storage.c | 3 +- kernel/bpf/bpf_inode_storage.c | 4 +-- kernel/bpf/bpf_local_storage.c | 71 +++++++++++++++++++-------------------- kernel/bpf/bpf_task_storage.c | 4 +-- net/core/bpf_sk_storage.c | 4 +-- 6 files changed, 39 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 709506e982a6..f74e0f7656a1 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -176,7 +176,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); -void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); +int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); int bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage *local_storage, diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 0687a760974a..8fef24fcac68 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -118,8 +118,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata), false); - return 0; + return bpf_selem_unlink(SELEM(sdata), false); } static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index e54cce2b9175..cedc99184dad 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -110,9 +110,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata), false); - - return 0; + return bpf_selem_unlink(SELEM(sdata), false); } static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 2f94ca4a4475..38f5c3db2957 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -308,33 +308,6 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor return free_local_storage; } -static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, - bool reuse_now) -{ - struct bpf_local_storage *local_storage; - bool free_local_storage = false; - HLIST_HEAD(selem_free_list); - unsigned long flags; - - if (unlikely(!selem_linked_to_storage_lockless(selem))) - /* selem has already been unlinked from sk */ - return; - - local_storage = rcu_dereference_check(selem->local_storage, - bpf_rcu_lock_held()); - - raw_spin_lock_irqsave(&local_storage->lock, flags); - if (likely(selem_linked_to_storage(selem))) - free_local_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, &selem_free_list); - raw_spin_unlock_irqrestore(&local_storage->lock, flags); - - bpf_selem_free_list(&selem_free_list, reuse_now); - - if (free_local_storage) - bpf_local_storage_free(local_storage, reuse_now); -} - void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { @@ -386,19 +359,43 @@ static void 
bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b, hlist_add_head_rcu(&selem->map_node, &b->list); } -void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) +int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) { - int err; + struct bpf_local_storage *local_storage; + bool free_local_storage = false; + HLIST_HEAD(selem_free_list); + unsigned long flags; + int err = 0; - /* Always unlink from map before unlinking from local_storage - * because selem will be freed after successfully unlinked from - * the local_storage. - */ - err = bpf_selem_unlink_map(selem); - if (err) - return; + if (unlikely(!selem_linked_to_storage_lockless(selem))) + /* selem has already been unlinked from sk */ + return 0; + + local_storage = rcu_dereference_check(selem->local_storage, + bpf_rcu_lock_held()); + + raw_spin_lock_irqsave(&local_storage->lock, flags); + if (likely(selem_linked_to_storage(selem))) { + /* Always unlink from map before unlinking from local_storage + * because selem will be freed after successfully unlinked from + * the local_storage. + */ + err = bpf_selem_unlink_map(selem); + if (err) + goto out; + + free_local_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, &selem_free_list); + } +out: + raw_spin_unlock_irqrestore(&local_storage->lock, flags); - bpf_selem_unlink_storage(selem, reuse_now); + bpf_selem_free_list(&selem_free_list, reuse_now); + + if (free_local_storage) + bpf_local_storage_free(local_storage, reuse_now); + + return err; } void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index a1dc1bf0848a..ab902364ac23 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -167,9 +167,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map, if (!nobusy) return -EBUSY; - bpf_selem_unlink(SELEM(sdata), false); - - return 0; + return bpf_selem_unlink(SELEM(sdata), false); } static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 0b85d8f2c17e..d7b5c4551997 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -40,9 +40,7 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata), false); - - return 0; + return bpf_selem_unlink(SELEM(sdata), false); } /* Called by __sk_destruct() & bpf_sk_storage_clone() */ -- cgit v1.2.3 From 8dabe34b9d5b1135b084268396ed2267107e64c1 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:03 -0800 Subject: bpf: Change local_storage->lock and b->lock to rqspinlock Change bpf_local_storage::lock and bpf_local_storage_map_bucket::lock from raw_spin_lock to rqspinlock. Finally, propagate errors from raw_res_spin_lock_irqsave() to syscall return or BPF helper return. In bpf_local_storage_destroy(), ignore return from raw_res_spin_lock_irqsave() for now. A later patch will correctly handle errors correctly in bpf_local_storage_destroy() so that it can unlink selems even when failing to acquire locks. For __bpf_local_storage_map_cache(), instead of handling the error, skip updating the cache. 
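As a rough userspace model of the new locking contract, with pthread_mutex_trylock() standing in for raw_res_spin_lock_irqsave() (which can return an error instead of spinning forever) and all function names invented for the sketch: the cache path simply gives up on a lock failure, while the update path hands the error back to its caller.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t storage_lock = PTHREAD_MUTEX_INITIALIZER;
static int cached_value;

/* Stand-in for a lock acquisition that may fail rather than block forever. */
static int storage_trylock(void)
{
        return pthread_mutex_trylock(&storage_lock) ? -EBUSY : 0;
}

static void storage_unlock(void)
{
        pthread_mutex_unlock(&storage_lock);
}

/* Cache path: best effort, a lock failure is silently skipped. */
static void insert_cache(int value)
{
        if (storage_trylock())
                return;
        cached_value = value;
        storage_unlock();
}

/* Update path: a lock failure is propagated to the caller. */
static int update_storage(int value)
{
        int err = storage_trylock();

        if (err)
                return err;
        cached_value = value;
        storage_unlock();
        return 0;
}

int main(void)
{
        insert_cache(1);
        return update_storage(2) ? 1 : 0;
}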
Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-6-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 5 +-- kernel/bpf/bpf_local_storage.c | 64 +++++++++++++++++++++++++++------------ 2 files changed, 47 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index f74e0f7656a1..fa50b7afee18 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -15,12 +15,13 @@ #include #include #include +#include #define BPF_LOCAL_STORAGE_CACHE_SIZE 16 struct bpf_local_storage_map_bucket { struct hlist_head list; - raw_spinlock_t lock; + rqspinlock_t lock; }; /* Thp map is not the primary owner of a bpf_local_storage_elem. @@ -94,7 +95,7 @@ struct bpf_local_storage { * bpf_local_storage_elem. */ struct rcu_head rcu; - raw_spinlock_t lock; /* Protect adding/removing from the "list" */ + rqspinlock_t lock; /* Protect adding/removing from the "list" */ bool use_kmalloc_nolock; }; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 38f5c3db2957..1138e2293b50 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -321,14 +321,18 @@ static int bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; unsigned long flags; + int err; local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); b = select_bucket(smap, local_storage); - raw_spin_lock_irqsave(&b->lock, flags); + err = raw_res_spin_lock_irqsave(&b->lock, flags); + if (err) + return err; + hlist_del_init_rcu(&selem->map_node); - raw_spin_unlock_irqrestore(&b->lock, flags); + raw_res_spin_unlock_irqrestore(&b->lock, flags); return 0; } @@ -344,11 +348,16 @@ int bpf_selem_link_map(struct bpf_local_storage_map *smap, { struct bpf_local_storage_map_bucket *b; unsigned long flags; + int err; b = select_bucket(smap, local_storage); - raw_spin_lock_irqsave(&b->lock, flags); + + err = raw_res_spin_lock_irqsave(&b->lock, flags); + if (err) + return err; + hlist_add_head_rcu(&selem->map_node, &b->list); - raw_spin_unlock_irqrestore(&b->lock, flags); + raw_res_spin_unlock_irqrestore(&b->lock, flags); return 0; } @@ -365,7 +374,7 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) bool free_local_storage = false; HLIST_HEAD(selem_free_list); unsigned long flags; - int err = 0; + int err; if (unlikely(!selem_linked_to_storage_lockless(selem))) /* selem has already been unlinked from sk */ @@ -374,7 +383,10 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); - raw_spin_lock_irqsave(&local_storage->lock, flags); + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); + if (err) + return err; + if (likely(selem_linked_to_storage(selem))) { /* Always unlink from map before unlinking from local_storage * because selem will be freed after successfully unlinked from @@ -388,7 +400,7 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) local_storage, selem, &selem_free_list); } out: - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&selem_free_list, reuse_now); @@ -403,16 +415,20 
@@ void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { unsigned long flags; + int err; /* spinlock is needed to avoid racing with the * parallel delete. Otherwise, publishing an already * deleted sdata to the cache will become a use-after-free * problem in the next bpf_local_storage_lookup(). */ - raw_spin_lock_irqsave(&local_storage->lock, flags); + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); + if (err) + return; + if (selem_linked_to_storage(selem)) rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem)); - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); } static int check_flags(const struct bpf_local_storage_data *old_sdata, @@ -457,14 +473,17 @@ int bpf_local_storage_alloc(void *owner, RCU_INIT_POINTER(storage->smap, smap); INIT_HLIST_HEAD(&storage->list); - raw_spin_lock_init(&storage->lock); + raw_res_spin_lock_init(&storage->lock); storage->owner = owner; storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; bpf_selem_link_storage_nolock(storage, first_selem); b = select_bucket(smap, storage); - raw_spin_lock_irqsave(&b->lock, flags); + err = raw_res_spin_lock_irqsave(&b->lock, flags); + if (err) + goto uncharge; + bpf_selem_link_map_nolock(b, first_selem); owner_storage_ptr = @@ -482,11 +501,11 @@ int bpf_local_storage_alloc(void *owner, prev_storage = cmpxchg(owner_storage_ptr, NULL, storage); if (unlikely(prev_storage)) { bpf_selem_unlink_map_nolock(first_selem); - raw_spin_unlock_irqrestore(&b->lock, flags); + raw_res_spin_unlock_irqrestore(&b->lock, flags); err = -EAGAIN; goto uncharge; } - raw_spin_unlock_irqrestore(&b->lock, flags); + raw_res_spin_unlock_irqrestore(&b->lock, flags); return 0; @@ -569,7 +588,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (!alloc_selem) return ERR_PTR(-ENOMEM); - raw_spin_lock_irqsave(&local_storage->lock, flags); + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); + if (err) + goto free_selem; /* Recheck local_storage->list under local_storage->lock */ if (unlikely(hlist_empty(&local_storage->list))) { @@ -596,7 +617,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, b = select_bucket(smap, local_storage); - raw_spin_lock_irqsave(&b->lock, b_flags); + err = raw_res_spin_lock_irqsave(&b->lock, b_flags); + if (err) + goto unlock; alloc_selem = NULL; /* First, link the new selem to the map */ @@ -612,9 +635,10 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, &old_selem_free_list); } - raw_spin_unlock_irqrestore(&b->lock, b_flags); + raw_res_spin_unlock_irqrestore(&b->lock, b_flags); unlock: - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); +free_selem: bpf_selem_free_list(&old_selem_free_list, false); if (alloc_selem) { mem_uncharge(smap, owner, smap->elem_size); @@ -699,7 +723,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * when unlinking elem from the local_storage->list and * the map's bucket->list. */ - raw_spin_lock_irqsave(&local_storage->lock, flags); + raw_res_spin_lock_irqsave(&local_storage->lock, flags); hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { /* Always unlink from map before unlinking from * local_storage. 
@@ -714,7 +738,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) free_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, &free_selem_list); } - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&free_selem_list, true); @@ -761,7 +785,7 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, for (i = 0; i < nbuckets; i++) { INIT_HLIST_HEAD(&smap->buckets[i].list); - raw_spin_lock_init(&smap->buckets[i].lock); + raw_res_spin_lock_init(&smap->buckets[i].lock); } smap->elem_size = offsetof(struct bpf_local_storage_elem, -- cgit v1.2.3 From 4a98c2efa613f0b01bc3aa0acb8c3ff7ae29b6f9 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:04 -0800 Subject: bpf: Remove task local storage percpu counter The percpu counter in task local storage is no longer needed as the underlying bpf_local_storage can now handle deadlock with the help of rqspinlock. Remove the percpu counter and related migrate_{disable, enable}. Since the percpu counter is removed, merge back bpf_task_storage_get() and bpf_task_storage_get_recur(). This will allow the bpf syscalls and helpers to run concurrently on the same CPU, removing the spurious -EBUSY error. bpf_task_storage_get(..., F_CREATE) will now always succeed with enough free memory unless being called recursively. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-7-ameryhung@gmail.com --- kernel/bpf/bpf_task_storage.c | 150 +++++------------------------------------- kernel/bpf/helpers.c | 4 -- 2 files changed, 18 insertions(+), 136 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index ab902364ac23..dd858226ada2 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -20,29 +20,6 @@ DEFINE_BPF_STORAGE_CACHE(task_cache); -static DEFINE_PER_CPU(int, bpf_task_storage_busy); - -static void bpf_task_storage_lock(void) -{ - cant_migrate(); - this_cpu_inc(bpf_task_storage_busy); -} - -static void bpf_task_storage_unlock(void) -{ - this_cpu_dec(bpf_task_storage_busy); -} - -static bool bpf_task_storage_trylock(void) -{ - cant_migrate(); - if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) { - this_cpu_dec(bpf_task_storage_busy); - return false; - } - return true; -} - static struct bpf_local_storage __rcu **task_storage_ptr(void *owner) { struct task_struct *task = owner; @@ -70,17 +47,15 @@ void bpf_task_storage_free(struct task_struct *task) { struct bpf_local_storage *local_storage; - rcu_read_lock_dont_migrate(); + rcu_read_lock(); local_storage = rcu_dereference(task->bpf_storage); if (!local_storage) goto out; - bpf_task_storage_lock(); bpf_local_storage_destroy(local_storage); - bpf_task_storage_unlock(); out: - rcu_read_unlock_migrate(); + rcu_read_unlock(); } static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) @@ -106,9 +81,7 @@ static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) goto out; } - bpf_task_storage_lock(); sdata = task_storage_lookup(task, map, true); - bpf_task_storage_unlock(); put_pid(pid); return sdata ? 
sdata->data : NULL; out: @@ -143,11 +116,9 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, goto out; } - bpf_task_storage_lock(); sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, map_flags, true, GFP_ATOMIC); - bpf_task_storage_unlock(); err = PTR_ERR_OR_ZERO(sdata); out: @@ -155,8 +126,7 @@ out: return err; } -static int task_storage_delete(struct task_struct *task, struct bpf_map *map, - bool nobusy) +static int task_storage_delete(struct task_struct *task, struct bpf_map *map) { struct bpf_local_storage_data *sdata; @@ -164,9 +134,6 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map, if (!sdata) return -ENOENT; - if (!nobusy) - return -EBUSY; - return bpf_selem_unlink(SELEM(sdata), false); } @@ -192,111 +159,50 @@ static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) goto out; } - bpf_task_storage_lock(); - err = task_storage_delete(task, map, true); - bpf_task_storage_unlock(); + err = task_storage_delete(task, map); out: put_pid(pid); return err; } -/* Called by bpf_task_storage_get*() helpers */ -static void *__bpf_task_storage_get(struct bpf_map *map, - struct task_struct *task, void *value, - u64 flags, gfp_t gfp_flags, bool nobusy) +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, + task, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; - sdata = task_storage_lookup(task, map, nobusy); + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task) + return (unsigned long)NULL; + + sdata = task_storage_lookup(task, map, true); if (sdata) - return sdata->data; + return (unsigned long)sdata->data; /* only allocate new storage, when the task is refcounted */ if (refcount_read(&task->usage) && - (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) { + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) { sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST, false, gfp_flags); - return IS_ERR(sdata) ? NULL : sdata->data; + return IS_ERR(sdata) ? 
(unsigned long)NULL : (unsigned long)sdata->data; } - return NULL; -} - -/* *gfp_flags* is a hidden argument provided by the verifier */ -BPF_CALL_5(bpf_task_storage_get_recur, struct bpf_map *, map, struct task_struct *, - task, void *, value, u64, flags, gfp_t, gfp_flags) -{ - bool nobusy; - void *data; - - WARN_ON_ONCE(!bpf_rcu_lock_held()); - if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task) - return (unsigned long)NULL; - - nobusy = bpf_task_storage_trylock(); - data = __bpf_task_storage_get(map, task, value, flags, - gfp_flags, nobusy); - if (nobusy) - bpf_task_storage_unlock(); - return (unsigned long)data; -} - -/* *gfp_flags* is a hidden argument provided by the verifier */ -BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, - task, void *, value, u64, flags, gfp_t, gfp_flags) -{ - void *data; - - WARN_ON_ONCE(!bpf_rcu_lock_held()); - if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task) - return (unsigned long)NULL; - - bpf_task_storage_lock(); - data = __bpf_task_storage_get(map, task, value, flags, - gfp_flags, true); - bpf_task_storage_unlock(); - return (unsigned long)data; -} - -BPF_CALL_2(bpf_task_storage_delete_recur, struct bpf_map *, map, struct task_struct *, - task) -{ - bool nobusy; - int ret; - - WARN_ON_ONCE(!bpf_rcu_lock_held()); - if (!task) - return -EINVAL; - - nobusy = bpf_task_storage_trylock(); - /* This helper must only be called from places where the lifetime of the task - * is guaranteed. Either by being refcounted or by being protected - * by an RCU read-side critical section. - */ - ret = task_storage_delete(task, map, nobusy); - if (nobusy) - bpf_task_storage_unlock(); - return ret; + return (unsigned long)NULL; } BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, task) { - int ret; - WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!task) return -EINVAL; - bpf_task_storage_lock(); /* This helper must only be called from places where the lifetime of the task * is guaranteed. Either by being refcounted or by being protected * by an RCU read-side critical section. 
*/ - ret = task_storage_delete(task, map, true); - bpf_task_storage_unlock(); - return ret; + return task_storage_delete(task, map); } static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) @@ -311,7 +217,7 @@ static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) static void task_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy); + bpf_local_storage_map_free(map, &task_cache, NULL); } BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map) @@ -330,17 +236,6 @@ const struct bpf_map_ops task_storage_map_ops = { .map_owner_storage_ptr = task_storage_ptr, }; -const struct bpf_func_proto bpf_task_storage_get_recur_proto = { - .func = bpf_task_storage_get_recur, - .gpl_only = false, - .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, - .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], - .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, - .arg4_type = ARG_ANYTHING, -}; - const struct bpf_func_proto bpf_task_storage_get_proto = { .func = bpf_task_storage_get, .gpl_only = false, @@ -352,15 +247,6 @@ const struct bpf_func_proto bpf_task_storage_get_proto = { .arg4_type = ARG_ANYTHING, }; -const struct bpf_func_proto bpf_task_storage_delete_recur_proto = { - .func = bpf_task_storage_delete_recur, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, - .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], -}; - const struct bpf_func_proto bpf_task_storage_delete_proto = { .func = bpf_task_storage_delete, .gpl_only = false, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 0458597134da..7ac32798eb04 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2167,12 +2167,8 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_cgroup_classid_curr_proto; #endif case BPF_FUNC_task_storage_get: - if (bpf_prog_check_recur(prog)) - return &bpf_task_storage_get_recur_proto; return &bpf_task_storage_get_proto; case BPF_FUNC_task_storage_delete: - if (bpf_prog_check_recur(prog)) - return &bpf_task_storage_delete_recur_proto; return &bpf_task_storage_delete_proto; default: break; -- cgit v1.2.3 From 5254de7b9607dd341795cd693185f719a9e7298a Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:05 -0800 Subject: bpf: Remove cgroup local storage percpu counter The percpu counter in cgroup local storage is no longer needed as the underlying bpf_local_storage can now handle deadlock with the help of rqspinlock. Remove the percpu counter and related migrate_{disable, enable}. 
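The guard being deleted here and in the previous patch can be modeled in userspace with a thread-local counter, a rough stand-in for the per-CPU bpf_cgrp_storage_busy / bpf_task_storage_busy variables (storage_delete() below is invented): re-entering the storage code on the same thread makes the trylock fail, which is exactly where the spurious -EBUSY came from.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Per-thread stand-in for the removed per-CPU busy counter. */
static _Thread_local int storage_busy;

static bool storage_trylock(void)
{
        if (++storage_busy != 1) {
                --storage_busy;
                return false;   /* re-entered on this thread: would deadlock */
        }
        return true;
}

static void storage_unlock(void)
{
        --storage_busy;
}

static int storage_delete(void)
{
        int ret;

        if (!storage_trylock())
                return -EBUSY;  /* the spurious error rqspinlock removes */
        ret = 0;                /* ... do the real delete under the guard ... */
        storage_unlock();
        return ret;
}

int main(void)
{
        printf("delete returned %d\n", storage_delete());
        return 0;
}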
Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-8-ameryhung@gmail.com --- kernel/bpf/bpf_cgrp_storage.c | 59 ++++++------------------------------------- 1 file changed, 8 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 8fef24fcac68..4d84611d8222 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -11,29 +11,6 @@ DEFINE_BPF_STORAGE_CACHE(cgroup_cache); -static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy); - -static void bpf_cgrp_storage_lock(void) -{ - cant_migrate(); - this_cpu_inc(bpf_cgrp_storage_busy); -} - -static void bpf_cgrp_storage_unlock(void) -{ - this_cpu_dec(bpf_cgrp_storage_busy); -} - -static bool bpf_cgrp_storage_trylock(void) -{ - cant_migrate(); - if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) { - this_cpu_dec(bpf_cgrp_storage_busy); - return false; - } - return true; -} - static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner) { struct cgroup *cg = owner; @@ -45,16 +22,14 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup) { struct bpf_local_storage *local_storage; - rcu_read_lock_dont_migrate(); + rcu_read_lock(); local_storage = rcu_dereference(cgroup->bpf_cgrp_storage); if (!local_storage) goto out; - bpf_cgrp_storage_lock(); bpf_local_storage_destroy(local_storage); - bpf_cgrp_storage_unlock(); out: - rcu_read_unlock_migrate(); + rcu_read_unlock(); } static struct bpf_local_storage_data * @@ -83,9 +58,7 @@ static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key) if (IS_ERR(cgroup)) return ERR_CAST(cgroup); - bpf_cgrp_storage_lock(); sdata = cgroup_storage_lookup(cgroup, map, true); - bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return sdata ? 
sdata->data : NULL; } @@ -102,10 +75,8 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, if (IS_ERR(cgroup)) return PTR_ERR(cgroup); - bpf_cgrp_storage_lock(); sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, value, map_flags, false, GFP_ATOMIC); - bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return PTR_ERR_OR_ZERO(sdata); } @@ -131,9 +102,7 @@ static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) if (IS_ERR(cgroup)) return PTR_ERR(cgroup); - bpf_cgrp_storage_lock(); err = cgroup_storage_delete(cgroup, map); - bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return err; } @@ -150,7 +119,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) static void cgroup_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &cgroup_cache, &bpf_cgrp_storage_busy); + bpf_local_storage_map_free(map, &cgroup_cache, NULL); } /* *gfp_flags* is a hidden argument provided by the verifier */ @@ -158,7 +127,6 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; - bool nobusy; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) @@ -167,38 +135,27 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, if (!cgroup) return (unsigned long)NULL; - nobusy = bpf_cgrp_storage_trylock(); - - sdata = cgroup_storage_lookup(cgroup, map, nobusy); + sdata = cgroup_storage_lookup(cgroup, map, true); if (sdata) - goto unlock; + goto out; /* only allocate new storage, when the cgroup is refcounted */ if (!percpu_ref_is_dying(&cgroup->self.refcnt) && - (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST, false, gfp_flags); -unlock: - if (nobusy) - bpf_cgrp_storage_unlock(); +out: return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data; } BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgroup) { - int ret; - WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!cgroup) return -EINVAL; - if (!bpf_cgrp_storage_trylock()) - return -EBUSY; - - ret = cgroup_storage_delete(cgroup, map); - bpf_cgrp_storage_unlock(); - return ret; + return cgroup_storage_delete(cgroup, map); } const struct bpf_map_ops cgrp_storage_map_ops = { -- cgit v1.2.3 From 3417dffb58331d1e7e4f3e30ec95cc0f8114ff10 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:06 -0800 Subject: bpf: Remove unused percpu counter from bpf_local_storage_map_free Percpu locks have been removed from cgroup and task local storage. Now that all local storage no longer use percpu variables as locks preventing recursion, there is no need to pass them to bpf_local_storage_map_free(). Remove the argument from the function. 
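Sketching the cleanup in plain C (invented names, not the kernel functions): once every caller passes NULL for the counter, the parameter and the conditional bookkeeping around it carry no information and can be dropped from the signature.

#include <stdio.h>

/* Before: the free path had to bump an optional recursion counter. */
static void map_free_old(int *busy_counter)
{
        if (busy_counter)
                ++*busy_counter;
        puts("freeing elems");
        if (busy_counter)
                --*busy_counter;
}

/* After: every caller passed NULL, so the parameter simply goes away. */
static void map_free(void)
{
        puts("freeing elems");
}

int main(void)
{
        map_free_old(NULL);
        map_free();
        return 0;
}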
Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-9-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 3 +-- kernel/bpf/bpf_cgrp_storage.c | 2 +- kernel/bpf/bpf_inode_storage.c | 2 +- kernel/bpf/bpf_local_storage.c | 7 +------ kernel/bpf/bpf_task_storage.c | 2 +- net/core/bpf_sk_storage.c | 2 +- 6 files changed, 6 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index fa50b7afee18..fba3354988d3 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -166,8 +166,7 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, void bpf_local_storage_destroy(struct bpf_local_storage *local_storage); void bpf_local_storage_map_free(struct bpf_map *map, - struct bpf_local_storage_cache *cache, - int __percpu *busy_counter); + struct bpf_local_storage_cache *cache); int bpf_local_storage_map_check_btf(const struct bpf_map *map, const struct btf *btf, diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 4d84611d8222..853183eead2c 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -119,7 +119,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) static void cgroup_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &cgroup_cache, NULL); + bpf_local_storage_map_free(map, &cgroup_cache); } /* *gfp_flags* is a hidden argument provided by the verifier */ diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index cedc99184dad..470f4b02c79e 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -184,7 +184,7 @@ static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr) static void inode_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &inode_cache, NULL); + bpf_local_storage_map_free(map, &inode_cache); } const struct bpf_map_ops inode_storage_map_ops = { diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 1138e2293b50..76e812a40380 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -807,8 +807,7 @@ free_smap: } void bpf_local_storage_map_free(struct bpf_map *map, - struct bpf_local_storage_cache *cache, - int __percpu *busy_counter) + struct bpf_local_storage_cache *cache) { struct bpf_local_storage_map_bucket *b; struct bpf_local_storage_elem *selem; @@ -841,11 +840,7 @@ void bpf_local_storage_map_free(struct bpf_map *map, while ((selem = hlist_entry_safe( rcu_dereference_raw(hlist_first_rcu(&b->list)), struct bpf_local_storage_elem, map_node))) { - if (busy_counter) - this_cpu_inc(*busy_counter); bpf_selem_unlink(selem, true); - if (busy_counter) - this_cpu_dec(*busy_counter); cond_resched_rcu(); } rcu_read_unlock(); diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index dd858226ada2..4d53aebe6784 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -217,7 +217,7 @@ static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) static void task_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &task_cache, NULL); + bpf_local_storage_map_free(map, &task_cache); } BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index d7b5c4551997..d2164165a994 
100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -60,7 +60,7 @@ out: static void bpf_sk_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &sk_cache, NULL); + bpf_local_storage_map_free(map, &sk_cache); } static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) -- cgit v1.2.3 From c8be3da14718f1e732afcb61e8ee5b78e8d93808 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:07 -0800 Subject: bpf: Prepare for bpf_selem_unlink_nofail() The next patch will introduce bpf_selem_unlink_nofail() to handle rqspinlock errors. bpf_selem_unlink_nofail() will allow an selem to be partially unlinked from map or local storage. Save memory allocation method in selem so that later an selem can be correctly freed even when SDATA(selem)->smap is init to NULL. In addition, keep track of memory charge to the owner in local storage so that later bpf_selem_unlink_nofail() can return the correct memory charge to the owner. Updating local_storage->mem_charge is protected by local_storage->lock. Finally, extract miscellaneous tasks performed when unlinking an selem from local_storage into bpf_selem_unlink_storage_nolock_misc(). It will be reused by bpf_selem_unlink_nofail(). This patch also takes the chance to remove local_storage->smap, which is no longer used since commit f484f4a3e058 ("bpf: Replace bpf memory allocator with kmalloc_nolock() in local storage"). Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-10-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 5 +-- kernel/bpf/bpf_local_storage.c | 69 +++++++++++++++++++-------------------- 2 files changed, 37 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index fba3354988d3..a34ed7fa81d8 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -80,7 +80,8 @@ struct bpf_local_storage_elem { * after raw_spin_unlock */ }; - /* 8 bytes hole */ + bool use_kmalloc_nolock; + /* 7 bytes hole */ /* The data is stored in another cacheline to minimize * the number of cachelines access during a cache hit. */ @@ -89,13 +90,13 @@ struct bpf_local_storage_elem { struct bpf_local_storage { struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE]; - struct bpf_local_storage_map __rcu *smap; struct hlist_head list; /* List of bpf_local_storage_elem */ void *owner; /* The object that owns the above "list" of * bpf_local_storage_elem. */ struct rcu_head rcu; rqspinlock_t lock; /* Protect adding/removing from the "list" */ + u64 mem_charge; /* Copy of mem charged to owner. 
Protected by "lock" */ bool use_kmalloc_nolock; }; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 76e812a40380..0e9ae41a9759 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -85,6 +85,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, if (selem) { RCU_INIT_POINTER(SDATA(selem)->smap, smap); + selem->use_kmalloc_nolock = smap->use_kmalloc_nolock; if (value) { /* No need to call check_and_init_map_value as memory is zero init */ @@ -214,7 +215,7 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - if (!smap->use_kmalloc_nolock) { + if (!selem->use_kmalloc_nolock) { /* * No uptr will be unpin even when reuse_now == false since uptr * is only supported in task local storage, where @@ -251,6 +252,30 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) bpf_selem_free(selem, reuse_now); } +static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem, + struct bpf_local_storage_map *smap, + struct bpf_local_storage *local_storage, + bool free_local_storage) +{ + void *owner = local_storage->owner; + u32 uncharge = smap->elem_size; + + if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) == + SDATA(selem)) + RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); + + uncharge += free_local_storage ? sizeof(*local_storage) : 0; + mem_uncharge(smap, local_storage->owner, uncharge); + local_storage->mem_charge -= uncharge; + + if (free_local_storage) { + local_storage->owner = NULL; + + /* After this RCU_INIT, owner may be freed and cannot be used */ + RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); + } +} + /* local_storage->lock must be held and selem->local_storage == local_storage. * The caller must ensure selem->smap is still valid to be * dereferenced for its smap->elem_size and smap->cache_idx. @@ -261,56 +286,30 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor { struct bpf_local_storage_map *smap; bool free_local_storage; - void *owner; smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - owner = local_storage->owner; - - /* All uncharging on the owner must be done first. - * The owner may be freed once the last selem is unlinked - * from local_storage. - */ - mem_uncharge(smap, owner, smap->elem_size); free_local_storage = hlist_is_singular_node(&selem->snode, &local_storage->list); - if (free_local_storage) { - mem_uncharge(smap, owner, sizeof(struct bpf_local_storage)); - local_storage->owner = NULL; - /* After this RCU_INIT, owner may be freed and cannot be used */ - RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); + bpf_selem_unlink_storage_nolock_misc(selem, smap, local_storage, + free_local_storage); - /* local_storage is not freed now. local_storage->lock is - * still held and raw_spin_unlock_bh(&local_storage->lock) - * will be done by the caller. - * - * Although the unlock will be done under - * rcu_read_lock(), it is more intuitive to - * read if the freeing of the storage is done - * after the raw_spin_unlock_bh(&local_storage->lock). - * - * Hence, a "bool free_local_storage" is returned - * to the caller which then calls then frees the storage after - * all the RCU grace periods have expired. 
- */ hlist_del_init_rcu(&selem->snode); - if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) == - SDATA(selem)) - RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); hlist_add_head(&selem->free_node, free_selem_list); - if (rcu_access_pointer(local_storage->smap) == smap) - RCU_INIT_POINTER(local_storage->smap, NULL); - return free_local_storage; } void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { + struct bpf_local_storage_map *smap; + + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); + local_storage->mem_charge += smap->elem_size; + RCU_INIT_POINTER(selem->local_storage, local_storage); hlist_add_head_rcu(&selem->snode, &local_storage->list); } @@ -471,10 +470,10 @@ int bpf_local_storage_alloc(void *owner, goto uncharge; } - RCU_INIT_POINTER(storage->smap, smap); INIT_HLIST_HEAD(&storage->list); raw_res_spin_lock_init(&storage->lock); storage->owner = owner; + storage->mem_charge = sizeof(*storage); storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; bpf_selem_link_storage_nolock(storage, first_selem); -- cgit v1.2.3 From 5d800f87d0a5ea1b156c47a4b9fd128479335153 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:08 -0800 Subject: bpf: Support lockless unlink when freeing map or local storage Introduce bpf_selem_unlink_nofail() to properly handle errors returned from rqspinlock in bpf_local_storage_map_free() and bpf_local_storage_destroy() where the operation must succeed. The idea of bpf_selem_unlink_nofail() is to allow an selem to be partially linked and use an atomic operation on a bit field, selem->state, to determine when and who can free the selem if any unlink under lock fails. An selem initially is fully linked to a map and a local storage. Under normal circumstances, bpf_selem_unlink_nofail() will be able to grab locks and unlink a selem from map and local storage in sequence, just like bpf_selem_unlink(), and then free it after an RCU grace period. However, if any of the lock attempts fails, it will only clear SDATA(selem)->smap or selem->local_storage depending on the caller and set SELEM_MAP_UNLINKED or SELEM_STORAGE_UNLINKED according to the caller. Then, after both map_free() and destroy() see the selem and the state becomes SELEM_UNLINKED, one of the two racing callers can succeed in cmpxchg-ing the state from SELEM_UNLINKED to SELEM_TOFREE, ensuring no double free or memory leak. To make sure bpf_obj_free_fields() is done only once and when the map is still present, it is called when unlinking an selem from b->list under b->lock. To make sure uncharging memory is done only when the owner is still present in map_free(), block destroy() from returning until there is no pending map_free(). Since smap may not be valid in destroy(), bpf_selem_unlink_nofail() skips bpf_selem_unlink_storage_nolock_misc() when called from destroy(). This is okay as bpf_local_storage_destroy() will return the remaining amount of memory charge tracked by mem_charge to the owner to uncharge. It is also safe to skip clearing local_storage->owner and owner_storage as the owner is being freed and no users or bpf programs should be able to reference the owner and use local_storage. Finally, accesses of selem, SDATA(selem)->smap and selem->local_storage are racy. Callers will protect these fields with RCU.
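The ownership handoff described above can be modeled with C11 atomics in userspace (a simplified sketch; the struct, flag values and unlink_nofail() below mirror the patch only loosely and are not the kernel code): each side ORs in its bit when it could not fully unlink, and only the caller whose cmpxchg moves the state from "both bits set" to TOFREE frees the element, so there is neither a double free nor a leak.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

#define ELEM_MAP_UNLINKED     (1 << 0)
#define ELEM_STORAGE_UNLINKED (1 << 1)
#define ELEM_UNLINKED         (ELEM_MAP_UNLINKED | ELEM_STORAGE_UNLINKED)
#define ELEM_TOFREE           (1 << 2)

struct elem {
        atomic_int state;
        int payload;
};

/* Called by either the map-free side or the owner-destroy side. */
static void unlink_nofail(struct elem *e, bool from_map_free, bool fully_unlinked)
{
        int expected = ELEM_UNLINKED;

        if (fully_unlinked) {
                /* Took both locks: nobody else can still reach the element. */
                free(e);
                return;
        }

        /* Record which side has finished with the element. */
        atomic_fetch_or(&e->state,
                        from_map_free ? ELEM_MAP_UNLINKED : ELEM_STORAGE_UNLINKED);

        /* Only the caller that observes both bits wins the right to free. */
        if (atomic_compare_exchange_strong(&e->state, &expected, ELEM_TOFREE))
                free(e);
}

int main(void)
{
        struct elem *e = calloc(1, sizeof(*e));

        atomic_init(&e->state, 0);
        unlink_nofail(e, true, false);  /* map_free side, lock failed      */
        unlink_nofail(e, false, false); /* destroy side frees exactly once */
        return 0;
}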
Acked-by: Alexei Starovoitov Co-developed-by: Martin KaFai Lau Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-11-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 9 ++- kernel/bpf/bpf_local_storage.c | 116 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 118 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index a34ed7fa81d8..69a5d8aa765d 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -68,6 +68,11 @@ struct bpf_local_storage_data { u8 data[] __aligned(8); }; +#define SELEM_MAP_UNLINKED (1 << 0) +#define SELEM_STORAGE_UNLINKED (1 << 1) +#define SELEM_UNLINKED (SELEM_MAP_UNLINKED | SELEM_STORAGE_UNLINKED) +#define SELEM_TOFREE (1 << 2) + /* Linked to bpf_local_storage and bpf_local_storage_map */ struct bpf_local_storage_elem { struct hlist_node map_node; /* Linked to bpf_local_storage_map */ @@ -80,8 +85,9 @@ struct bpf_local_storage_elem { * after raw_spin_unlock */ }; + atomic_t state; bool use_kmalloc_nolock; - /* 7 bytes hole */ + /* 3 bytes hole */ /* The data is stored in another cacheline to minimize * the number of cachelines access during a cache hit. */ @@ -97,6 +103,7 @@ struct bpf_local_storage { struct rcu_head rcu; rqspinlock_t lock; /* Protect adding/removing from the "list" */ u64 mem_charge; /* Copy of mem charged to owner. Protected by "lock" */ + refcount_t owner_refcnt;/* Used to pin owner when map_free is uncharging */ bool use_kmalloc_nolock; }; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 0e9ae41a9759..294605c78271 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -85,6 +85,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, if (selem) { RCU_INIT_POINTER(SDATA(selem)->smap, smap); + atomic_set(&selem->state, 0); selem->use_kmalloc_nolock = smap->use_kmalloc_nolock; if (value) { @@ -194,9 +195,11 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) /* The bpf_local_storage_map_free will wait for rcu_barrier */ smap = rcu_dereference_check(SDATA(selem)->smap, 1); - migrate_disable(); - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); - migrate_enable(); + if (smap) { + migrate_disable(); + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + migrate_enable(); + } kfree_nolock(selem); } @@ -221,7 +224,8 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, * is only supported in task local storage, where * smap->use_kmalloc_nolock == true. */ - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + if (smap) + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); __bpf_selem_free(selem, reuse_now); return; } @@ -255,7 +259,7 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem, struct bpf_local_storage_map *smap, struct bpf_local_storage *local_storage, - bool free_local_storage) + bool free_local_storage, bool pin_owner) { void *owner = local_storage->owner; u32 uncharge = smap->elem_size; @@ -264,6 +268,9 @@ static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem * SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); + if (pin_owner && !refcount_inc_not_zero(&local_storage->owner_refcnt)) + return; + uncharge += free_local_storage ? 
sizeof(*local_storage) : 0; mem_uncharge(smap, local_storage->owner, uncharge); local_storage->mem_charge -= uncharge; @@ -274,6 +281,9 @@ static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem * /* After this RCU_INIT, owner may be freed and cannot be used */ RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); } + + if (pin_owner) + refcount_dec(&local_storage->owner_refcnt); } /* local_storage->lock must be held and selem->local_storage == local_storage. @@ -293,7 +303,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor &local_storage->list); bpf_selem_unlink_storage_nolock_misc(selem, smap, local_storage, - free_local_storage); + free_local_storage, false); hlist_del_init_rcu(&selem->snode); @@ -409,6 +419,94 @@ out: return err; } +/* + * Unlink an selem from map and local storage with lockless fallback if callers + * are racing or rqspinlock returns error. It should only be called by + * bpf_local_storage_destroy() or bpf_local_storage_map_free(). + */ +static void bpf_selem_unlink_nofail(struct bpf_local_storage_elem *selem, + struct bpf_local_storage_map_bucket *b) +{ + bool in_map_free = !!b, free_storage = false; + struct bpf_local_storage *local_storage; + struct bpf_local_storage_map *smap; + unsigned long flags; + int err, unlink = 0; + + local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); + + if (smap) { + b = b ? : select_bucket(smap, local_storage); + err = raw_res_spin_lock_irqsave(&b->lock, flags); + if (!err) { + /* + * Call bpf_obj_free_fields() under b->lock to make sure it is done + * exactly once for an selem. Safe to free special fields immediately + * as no BPF program should be referencing the selem. + */ + if (likely(selem_linked_to_map(selem))) { + hlist_del_init_rcu(&selem->map_node); + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + unlink++; + } + raw_res_spin_unlock_irqrestore(&b->lock, flags); + } + /* + * Highly unlikely scenario: resource leak + * + * When map_free(selem1), destroy(selem1) and destroy(selem2) are racing + * and both selem belong to the same bucket, if destroy(selem2) acquired + * b->lock and block for too long, neither map_free(selem1) and + * destroy(selem1) will be able to free the special field associated + * with selem1 as raw_res_spin_lock_irqsave() returns -ETIMEDOUT. + */ + WARN_ON_ONCE(err && in_map_free); + if (!err || in_map_free) + RCU_INIT_POINTER(SDATA(selem)->smap, NULL); + } + + if (local_storage) { + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); + if (!err) { + if (likely(selem_linked_to_storage(selem))) { + free_storage = hlist_is_singular_node(&selem->snode, + &local_storage->list); + /* + * Okay to skip clearing owner_storage and storage->owner in + * destroy() since the owner is going away. No user or bpf + * programs should be able to reference it. + */ + if (smap && in_map_free) + bpf_selem_unlink_storage_nolock_misc( + selem, smap, local_storage, + free_storage, true); + hlist_del_init_rcu(&selem->snode); + unlink++; + } + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); + } + if (!err || !in_map_free) + RCU_INIT_POINTER(selem->local_storage, NULL); + } + + if (unlink != 2) + atomic_or(in_map_free ? SELEM_MAP_UNLINKED : SELEM_STORAGE_UNLINKED, &selem->state); + + /* + * Normally, an selem can be unlinked under local_storage->lock and b->lock, and + * then freed after an RCU grace period. 
However, if destroy() and map_free() are + * racing or rqspinlock returns errors in unlikely situations (unlink != 2), free + * the selem only after both map_free() and destroy() see the selem. + */ + if (unlink == 2 || + atomic_cmpxchg(&selem->state, SELEM_UNLINKED, SELEM_TOFREE) == SELEM_UNLINKED) + bpf_selem_free(selem, true); + + if (free_storage) + bpf_local_storage_free(local_storage, true); +} + void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem) @@ -475,6 +573,7 @@ int bpf_local_storage_alloc(void *owner, storage->owner = owner; storage->mem_charge = sizeof(*storage); storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; + refcount_set(&storage->owner_refcnt, 1); bpf_selem_link_storage_nolock(storage, first_selem); @@ -743,6 +842,11 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) if (free_storage) bpf_local_storage_free(local_storage, true); + + if (!refcount_dec_and_test(&local_storage->owner_refcnt)) { + while (refcount_read(&local_storage->owner_refcnt)) + cpu_relax(); + } } u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) -- cgit v1.2.3 From 0be08389c7f2f9db0194ee666d2e4870886e6ffb Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:09 -0800 Subject: bpf: Switch to bpf_selem_unlink_nofail in bpf_local_storage_{map_free, destroy} Take care of rqspinlock errors in bpf_local_storage_{map_free, destroy}() properly by switching to bpf_selem_unlink_nofail(). Both functions iterate their own RCU-protected list of selems and call bpf_selem_unlink_nofail(). In map_free(), to prevent an infinite loop when both map_free() and destroy() fail to remove a selem from b->list (extremely unlikely), switch to hlist_for_each_entry_rcu(). In destroy(), also switch to hlist_for_each_entry_rcu() since we no longer iterate local_storage->list under local_storage->lock. bpf_selem_unlink() now becomes dedicated to the helper and syscall paths, so reuse_now should always be false. Remove the argument and hardcode it.
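The changed calling convention for the destroy path can be shown with a small self-contained sketch (invented names; owner_mem_alloc plays the role of sk->sk_omem_alloc and the constant charge values are made up): destroy() reports how much is still charged and the owner's teardown code settles the account, which matches the bpf_sk_storage_free() hunk in the diff below.

#include <stdatomic.h>
#include <stdio.h>

/* Owner-side memory accounting, in the spirit of sk->sk_omem_alloc. */
static atomic_uint owner_mem_alloc = 128;

/*
 * destroy() may fail to take some locks, so instead of uncharging piecemeal
 * it reports how many bytes are still charged and lets the owner settle up.
 */
static unsigned int storage_destroy(void)
{
        unsigned int remaining_charge = 96;     /* tracked like mem_charge */

        /* ... unlink whatever elements we can ... */
        return remaining_charge;
}

static void owner_free(void)
{
        unsigned int uncharge = storage_destroy();

        if (uncharge)
                atomic_fetch_sub(&owner_mem_alloc, uncharge);
        printf("owner charge left: %u\n", atomic_load(&owner_mem_alloc));
}

int main(void)
{
        owner_free();
        return 0;
}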
Acked-by: Alexei Starovoitov Co-developed-by: Martin KaFai Lau Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-12-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 4 +-- kernel/bpf/bpf_cgrp_storage.c | 2 +- kernel/bpf/bpf_inode_storage.c | 2 +- kernel/bpf/bpf_local_storage.c | 63 ++++++++++++++++++--------------------- kernel/bpf/bpf_task_storage.c | 2 +- net/core/bpf_sk_storage.c | 7 +++-- 6 files changed, 39 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 69a5d8aa765d..85efa9772530 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -171,7 +171,7 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, return SDATA(selem); } -void bpf_local_storage_destroy(struct bpf_local_storage *local_storage); +u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage); void bpf_local_storage_map_free(struct bpf_map *map, struct bpf_local_storage_cache *cache); @@ -184,7 +184,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); -int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); +int bpf_selem_unlink(struct bpf_local_storage_elem *selem); int bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage *local_storage, diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 853183eead2c..c2a2ead1f466 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -89,7 +89,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map) if (!sdata) return -ENOENT; - return bpf_selem_unlink(SELEM(sdata), false); + return bpf_selem_unlink(SELEM(sdata)); } static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 470f4b02c79e..e86734609f3d 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -110,7 +110,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map) if (!sdata) return -ENOENT; - return bpf_selem_unlink(SELEM(sdata), false); + return bpf_selem_unlink(SELEM(sdata)); } static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 294605c78271..b28f07d3a0db 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -377,7 +377,11 @@ static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b, hlist_add_head_rcu(&selem->map_node, &b->list); } -int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) +/* + * Unlink an selem from map and local storage with lock held. + * This is the common path used by local storages to delete an selem. 
+ */ +int bpf_selem_unlink(struct bpf_local_storage_elem *selem) { struct bpf_local_storage *local_storage; bool free_local_storage = false; @@ -411,10 +415,10 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) out: raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); - bpf_selem_free_list(&selem_free_list, reuse_now); + bpf_selem_free_list(&selem_free_list, false); if (free_local_storage) - bpf_local_storage_free(local_storage, reuse_now); + bpf_local_storage_free(local_storage, false); return err; } @@ -804,13 +808,13 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, return 0; } -void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) +/* + * Destroy local storage when the owner is going away. Caller must uncharge memory + * if memory charging is used. + */ +u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage) { struct bpf_local_storage_elem *selem; - bool free_storage = false; - HLIST_HEAD(free_selem_list); - struct hlist_node *n; - unsigned long flags; /* Neither the bpf_prog nor the bpf_map's syscall * could be modifying the local_storage->list now. @@ -821,32 +825,20 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * when unlinking elem from the local_storage->list and * the map's bucket->list. */ - raw_res_spin_lock_irqsave(&local_storage->lock, flags); - hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { - /* Always unlink from map before unlinking from - * local_storage. - */ - bpf_selem_unlink_map(selem); - /* If local_storage list has only one element, the - * bpf_selem_unlink_storage_nolock() will return true. - * Otherwise, it will return false. The current loop iteration - * intends to remove all local storage. So the last iteration - * of the loop will set the free_cgroup_storage to true. - */ - free_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, &free_selem_list); - } - raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); - - bpf_selem_free_list(&free_selem_list, true); - - if (free_storage) - bpf_local_storage_free(local_storage, true); + hlist_for_each_entry_rcu(selem, &local_storage->list, snode) + bpf_selem_unlink_nofail(selem, NULL); if (!refcount_dec_and_test(&local_storage->owner_refcnt)) { while (refcount_read(&local_storage->owner_refcnt)) cpu_relax(); + /* + * Paired with refcount_dec() in bpf_selem_unlink_nofail() + * to make sure destroy() sees the correct local_storage->mem_charge. 
+ */ + smp_mb(); } + + return local_storage->mem_charge; } u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) @@ -940,11 +932,14 @@ void bpf_local_storage_map_free(struct bpf_map *map, rcu_read_lock(); /* No one is adding to b->list now */ - while ((selem = hlist_entry_safe( - rcu_dereference_raw(hlist_first_rcu(&b->list)), - struct bpf_local_storage_elem, map_node))) { - bpf_selem_unlink(selem, true); - cond_resched_rcu(); +restart: + hlist_for_each_entry_rcu(selem, &b->list, map_node) { + bpf_selem_unlink_nofail(selem, b); + + if (need_resched()) { + cond_resched_rcu(); + goto restart; + } } rcu_read_unlock(); } diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 4d53aebe6784..605506792b5b 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -134,7 +134,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map) if (!sdata) return -ENOENT; - return bpf_selem_unlink(SELEM(sdata), false); + return bpf_selem_unlink(SELEM(sdata)); } static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index d2164165a994..1eb3e060994e 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -40,20 +40,23 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) if (!sdata) return -ENOENT; - return bpf_selem_unlink(SELEM(sdata), false); + return bpf_selem_unlink(SELEM(sdata)); } /* Called by __sk_destruct() & bpf_sk_storage_clone() */ void bpf_sk_storage_free(struct sock *sk) { struct bpf_local_storage *sk_storage; + u32 uncharge; rcu_read_lock_dont_migrate(); sk_storage = rcu_dereference(sk->sk_bpf_storage); if (!sk_storage) goto out; - bpf_local_storage_destroy(sk_storage); + uncharge = bpf_local_storage_destroy(sk_storage); + if (uncharge) + atomic_sub(uncharge, &sk->sk_omem_alloc); out: rcu_read_unlock_migrate(); } -- cgit v1.2.3
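
For context, a minimal sketch of how an owner's teardown path is expected to consume the charge now returned by bpf_local_storage_destroy(), mirroring the bpf_sk_storage_free() hunk above. The owner type and field names below are purely illustrative, not from this series; only bpf_local_storage_destroy() and its new u32 return value come from the patch:

#include <linux/atomic.h>
#include <linux/bpf_local_storage.h>
#include <linux/rcupdate.h>

/* Hypothetical owner; real owners are sock, task_struct, inode and cgroup. */
struct example_owner {
	struct bpf_local_storage __rcu *bpf_storage;
	atomic_t omem_alloc;	/* bytes charged to this owner */
};

static void example_owner_storage_free(struct example_owner *owner)
{
	struct bpf_local_storage *storage;
	u32 uncharge;

	rcu_read_lock();
	storage = rcu_dereference(owner->bpf_storage);
	if (storage) {
		/*
		 * Unlink every selem of this owner. Per the comment added on
		 * bpf_local_storage_destroy(), the caller must uncharge the
		 * returned number of bytes if memory charging is used.
		 */
		uncharge = bpf_local_storage_destroy(storage);
		if (uncharge)
			atomic_sub(uncharge, &owner->omem_alloc);
	}
	rcu_read_unlock();
}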