From 4bf79f9be434e000c8e12fe83b2f4402480f1460 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Jul 2024 13:23:53 -0700 Subject: bpf: Track equal scalars history on per-instruction level Use bpf_verifier_state->jmp_history to track which registers were updated by find_equal_scalars() (renamed to collect_linked_regs()) when conditional jump was verified. Use recorded information in backtrack_insn() to propagate precision. E.g. for the following program: while verifying instructions 1: r1 = r0 | 2: if r1 < 8 goto ... | push r0,r1 as linked registers in jmp_history 3: if r0 > 16 goto ... | push r0,r1 as linked registers in jmp_history 4: r2 = r10 | 5: r2 += r0 v mark_chain_precision(r0) while doing mark_chain_precision(r0) 5: r2 += r0 | mark r0 precise 4: r2 = r10 | 3: if r0 > 16 goto ... | mark r0,r1 as precise 2: if r1 < 8 goto ... | mark r0,r1 as precise 1: r1 = r0 v Technically, do this as follows: - Use 10 bits to identify each register that gains range because of sync_linked_regs(): - 3 bits for frame number; - 6 bits for register or stack slot number; - 1 bit to indicate if register is spilled. - Use u64 as a vector of 6 such records + 4 bits for vector length. - Augment struct bpf_jmp_history_entry with a field 'linked_regs' representing such vector. - When doing check_cond_jmp_op() remember up to 6 registers that gain range because of sync_linked_regs() in such a vector. - Don't propagate range information and reset IDs for registers that don't fit in 6-value vector. - Push a pair {instruction index, linked registers vector} to bpf_verifier_state->jmp_history. - When doing backtrack_insn() check if any of recorded linked registers is currently marked precise, if so mark all linked registers as precise. This also requires fixes for two test_verifier tests: - precise: test 1 - precise: test 2 Both tests contain the following instruction sequence: 19: (bf) r2 = r9 ; R2=scalar(id=3) R9=scalar(id=3) 20: (a5) if r2 < 0x8 goto pc+1 ; R2=scalar(id=3,umin=8) 21: (95) exit 22: (07) r2 += 1 ; R2_w=scalar(id=3+1,...) 23: (bf) r1 = r10 ; R1_w=fp0 R10=fp0 24: (07) r1 += -8 ; R1_w=fp-8 25: (b7) r3 = 0 ; R3_w=0 26: (85) call bpf_probe_read_kernel#113 The call to bpf_probe_read_kernel() at (26) forces r2 to be precise. Previously, this forced all registers with same id to become precise immediately when mark_chain_precision() is called. After this change, the precision is propagated to registers sharing same id only when 'if' instruction is backtracked. Hence verification log for both tests is changed: regs=r2,r9 -> regs=r2 for instructions 25..20. Fixes: 904e6ddf4133 ("bpf: Use scalar ids in mark_chain_precision()") Reported-by: Hao Sun Suggested-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240718202357.1746514-2-eddyz87@gmail.com Closes: https://lore.kernel.org/bpf/CAEf4BzZ0xidVCqB47XnkXcNhkPWF6_nTV7yt+_Lf0kcFEut2Mg@mail.gmail.com/ --- include/linux/bpf_verifier.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6503c85b10a3..731a0a4ac13c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -371,6 +371,10 @@ struct bpf_jmp_history_entry { u32 prev_idx : 22; /* special flags, e.g., whether insn is doing register stack spill/load */ u32 flags : 10; + /* additional registers that need precision tracking when this + * jump is backtracked, vector of six 10-bit records + */ + u64 linked_regs; }; /* Maximum number of register states that can exist at once */ -- cgit v1.2.3 From e42ac14180554fa23a3312d4f921dc4ea7972fb7 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 22 Jul 2024 11:30:45 -0700 Subject: bpf: Check unsupported ops from the bpf_struct_ops's cfi_stubs The bpf_tcp_ca struct_ops currently uses a "u32 unsupported_ops[]" array to track which ops is not supported. After cfi_stubs had been added, the function pointer in cfi_stubs is also NULL for the unsupported ops. Thus, the "u32 unsupported_ops[]" becomes redundant. This observation was originally brought up in the bpf/cfi discussion: https://lore.kernel.org/bpf/CAADnVQJoEkdjyCEJRPASjBw1QGsKYrF33QdMGc1RZa9b88bAEA@mail.gmail.com/ The recent bpf qdisc patch (https://lore.kernel.org/bpf/20240714175130.4051012-6-amery.hung@bytedance.com/) also needs to specify quite many unsupported ops. It is a good time to clean it up. This patch removes the need of "u32 unsupported_ops[]" and tests for null-ness in the cfi_stubs instead. Testing the cfi_stubs is done in a new function bpf_struct_ops_supported(). The verifier will call bpf_struct_ops_supported() when loading the struct_ops program. The ".check_member" is removed from the bpf_tcp_ca in this patch. ".check_member" could still be useful for other subsytems to enforce other restrictions (e.g. sched_ext checks for prog->sleepable). To keep the same error return, ENOTSUPP is used. Cc: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240722183049.2254692-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- include/linux/bpf.h | 5 +++++ kernel/bpf/bpf_struct_ops.c | 7 +++++++ kernel/bpf/verifier.c | 10 +++++++++- net/ipv4/bpf_tcp_ca.c | 26 -------------------------- 4 files changed, 21 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3b94ec161e8c..4c54864316ee 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1795,6 +1795,7 @@ struct bpf_struct_ops_common_value { #define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) bool bpf_struct_ops_get(const void *kdata); void bpf_struct_ops_put(const void *kdata); +int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff); int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value); int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, @@ -1851,6 +1852,10 @@ static inline void bpf_module_put(const void *data, struct module *owner) { module_put(owner); } +static inline int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff) +{ + return -ENOTSUPP; +} static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value) diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index bb3eabc0dc76..fda3dd2ee984 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -1040,6 +1040,13 @@ void bpf_struct_ops_put(const void *kdata) bpf_map_put(&st_map->map); } +int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff) +{ + void *func_ptr = *(void **)(st_ops->cfi_stubs + moff); + + return func_ptr ? 0 : -ENOTSUPP; +} + static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f8c474e8e597..247a038d3a14 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21132,6 +21132,7 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) u32 btf_id, member_idx; struct btf *btf; const char *mname; + int err; if (!prog->gpl_compatible) { verbose(env, "struct ops programs must have a GPL compatible license\n"); @@ -21179,8 +21180,15 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) return -EINVAL; } + err = bpf_struct_ops_supported(st_ops, __btf_member_bit_offset(t, member) / 8); + if (err) { + verbose(env, "attach to unsupported member %s of struct %s\n", + mname, st_ops->name); + return err; + } + if (st_ops->check_member) { - int err = st_ops->check_member(t, member, prog); + err = st_ops->check_member(t, member, prog); if (err) { verbose(env, "attach to unsupported member %s of struct %s\n", diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 3f88d0961e5b..554804774628 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -14,10 +14,6 @@ /* "extern" is to avoid sparse warning. It is only used in bpf_struct_ops.c. */ static struct bpf_struct_ops bpf_tcp_congestion_ops; -static u32 unsupported_ops[] = { - offsetof(struct tcp_congestion_ops, get_info), -}; - static const struct btf_type *tcp_sock_type; static u32 tcp_sock_id, sock_id; static const struct btf_type *tcp_congestion_ops_type; @@ -45,18 +41,6 @@ static int bpf_tcp_ca_init(struct btf *btf) return 0; } -static bool is_unsupported(u32 member_offset) -{ - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(unsupported_ops); i++) { - if (member_offset == unsupported_ops[i]) - return true; - } - - return false; -} - static bool bpf_tcp_ca_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -251,15 +235,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t, return 0; } -static int bpf_tcp_ca_check_member(const struct btf_type *t, - const struct btf_member *member, - const struct bpf_prog *prog) -{ - if (is_unsupported(__btf_member_bit_offset(t, member) / 8)) - return -ENOTSUPP; - return 0; -} - static int bpf_tcp_ca_reg(void *kdata, struct bpf_link *link) { return tcp_register_congestion_control(kdata); @@ -354,7 +329,6 @@ static struct bpf_struct_ops bpf_tcp_congestion_ops = { .reg = bpf_tcp_ca_reg, .unreg = bpf_tcp_ca_unreg, .update = bpf_tcp_ca_update, - .check_member = bpf_tcp_ca_check_member, .init_member = bpf_tcp_ca_init_member, .init = bpf_tcp_ca_init, .validate = bpf_tcp_ca_validate, -- cgit v1.2.3 From 5d99e198be279045e6ecefe220f5c52f8ce9bfd5 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 19 Jul 2024 19:00:52 +0800 Subject: bpf, lsm: Add check for BPF LSM return value A bpf prog returning a positive number attached to file_alloc_security hook makes kernel panic. This happens because file system can not filter out the positive number returned by the LSM prog using IS_ERR, and misinterprets this positive number as a file pointer. Given that hook file_alloc_security never returned positive number before the introduction of BPF LSM, and other BPF LSM hooks may encounter similar issues, this patch adds LSM return value check in verifier, to ensure no unexpected value is returned. Fixes: 520b7aa00d8c ("bpf: lsm: Initialize the BPF LSM hooks") Reported-by: Xin Liu Signed-off-by: Xu Kuohai Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240719110059.797546-3-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- include/linux/bpf.h | 1 + include/linux/bpf_lsm.h | 8 +++++++ kernel/bpf/bpf_lsm.c | 34 +++++++++++++++++++++++++++- kernel/bpf/btf.c | 5 ++++- kernel/bpf/verifier.c | 60 +++++++++++++++++++++++++++++++++++++++++-------- 5 files changed, 97 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4c54864316ee..5739cc9986f8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -927,6 +927,7 @@ struct bpf_insn_access_aux { }; }; struct bpf_verifier_log *log; /* for verbose logs */ + bool is_retval; /* is accessing function return value ? */ }; static inline void diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index 1de7ece5d36d..aefcd6564251 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -9,6 +9,7 @@ #include #include +#include #include #ifdef CONFIG_BPF_LSM @@ -45,6 +46,8 @@ void bpf_inode_storage_free(struct inode *inode); void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func); +int bpf_lsm_get_retval_range(const struct bpf_prog *prog, + struct bpf_retval_range *range); #else /* !CONFIG_BPF_LSM */ static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id) @@ -78,6 +81,11 @@ static inline void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, { } +static inline int bpf_lsm_get_retval_range(const struct bpf_prog *prog, + struct bpf_retval_range *range) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 1f596ad6257c..6292ac5f9bd1 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -417,3 +416,36 @@ const struct bpf_verifier_ops lsm_verifier_ops = { .get_func_proto = bpf_lsm_func_proto, .is_valid_access = btf_ctx_access, }; + +/* hooks return 0 or 1 */ +BTF_SET_START(bool_lsm_hooks) +#ifdef CONFIG_SECURITY_NETWORK_XFRM +BTF_ID(func, bpf_lsm_xfrm_state_pol_flow_match) +#endif +#ifdef CONFIG_AUDIT +BTF_ID(func, bpf_lsm_audit_rule_known) +#endif +BTF_ID(func, bpf_lsm_inode_xattr_skipcap) +BTF_SET_END(bool_lsm_hooks) + +int bpf_lsm_get_retval_range(const struct bpf_prog *prog, + struct bpf_retval_range *retval_range) +{ + /* no return value range for void hooks */ + if (!prog->aux->attach_func_proto->type) + return -EINVAL; + + if (btf_id_set_contains(&bool_lsm_hooks, prog->aux->attach_btf_id)) { + retval_range->minval = 0; + retval_range->maxval = 1; + } else { + /* All other available LSM hooks, except task_prctl, return 0 + * on success and negative error code on failure. + * To keep things simple, we only allow bpf progs to return 0 + * or negative errno for task_prctl too. + */ + retval_range->minval = -MAX_ERRNO; + retval_range->maxval = 0; + } + return 0; +} diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 520f49f422fe..95426d5b634e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6416,8 +6416,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (arg == nr_args) { switch (prog->expected_attach_type) { - case BPF_LSM_CGROUP: case BPF_LSM_MAC: + /* mark we are accessing the return value */ + info->is_retval = true; + fallthrough; + case BPF_LSM_CGROUP: case BPF_TRACE_FEXIT: /* When LSM programs are attached to void LSM hooks * they use FEXIT trampolines and when attached to diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 247a038d3a14..8caa7322f031 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2334,6 +2334,25 @@ static void mark_reg_unknown(struct bpf_verifier_env *env, __mark_reg_unknown(env, regs + regno); } +static int __mark_reg_s32_range(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, + u32 regno, + s32 s32_min, + s32 s32_max) +{ + struct bpf_reg_state *reg = regs + regno; + + reg->s32_min_value = max_t(s32, reg->s32_min_value, s32_min); + reg->s32_max_value = min_t(s32, reg->s32_max_value, s32_max); + + reg->smin_value = max_t(s64, reg->smin_value, s32_min); + reg->smax_value = min_t(s64, reg->smax_value, s32_max); + + reg_bounds_sync(reg); + + return reg_bounds_sanity_check(env, reg, "s32_range"); +} + static void __mark_reg_not_init(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) { @@ -5607,11 +5626,12 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, /* check access to 'struct bpf_context' fields. Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type, - struct btf **btf, u32 *btf_id) + struct btf **btf, u32 *btf_id, bool *is_retval) { struct bpf_insn_access_aux info = { .reg_type = *reg_type, .log = &env->log, + .is_retval = false, }; if (env->ops->is_valid_access && @@ -5624,6 +5644,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, * type of narrower access. */ *reg_type = info.reg_type; + *is_retval = info.is_retval; if (base_type(*reg_type) == PTR_TO_BTF_ID) { *btf = info.btf; @@ -6792,6 +6813,17 @@ static int check_stack_access_within_bounds( return grow_stack_state(env, state, -min_off /* size */); } +static bool get_func_retval_range(struct bpf_prog *prog, + struct bpf_retval_range *range) +{ + if (prog->type == BPF_PROG_TYPE_LSM && + prog->expected_attach_type == BPF_LSM_MAC && + !bpf_lsm_get_retval_range(prog, range)) { + return true; + } + return false; +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -6896,6 +6928,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { + bool is_retval = false; + struct bpf_retval_range range; enum bpf_reg_type reg_type = SCALAR_VALUE; struct btf *btf = NULL; u32 btf_id = 0; @@ -6911,7 +6945,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return err; err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf, - &btf_id); + &btf_id, &is_retval); if (err) verbose_linfo(env, insn_idx, "; "); if (!err && t == BPF_READ && value_regno >= 0) { @@ -6920,7 +6954,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * case, we know the offset is zero. */ if (reg_type == SCALAR_VALUE) { - mark_reg_unknown(env, regs, value_regno); + if (is_retval && get_func_retval_range(env->prog, &range)) { + err = __mark_reg_s32_range(env, regs, value_regno, + range.minval, range.maxval); + if (err) + return err; + } else { + mark_reg_unknown(env, regs, value_regno); + } } else { mark_reg_known_zero(env, regs, value_regno); @@ -15762,12 +15803,13 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char case BPF_PROG_TYPE_LSM: if (env->prog->expected_attach_type != BPF_LSM_CGROUP) { - /* Regular BPF_PROG_TYPE_LSM programs can return - * any value. - */ - return 0; - } - if (!env->prog->aux->attach_func_proto->type) { + /* no range found, any return value is allowed */ + if (!get_func_retval_range(env->prog, &range)) + return 0; + /* no restricted range, any return value is allowed */ + if (range.minval == S32_MIN && range.maxval == S32_MAX) + return 0; + } else if (!env->prog->aux->attach_func_proto->type) { /* Make sure programs that attach to void * hooks don't try to modify return value. */ -- cgit v1.2.3 From 28ead3eaabc16ecc907cfb71876da028080f6356 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 19 Jul 2024 19:00:53 +0800 Subject: bpf: Prevent tail call between progs attached to different hooks bpf progs can be attached to kernel functions, and the attached functions can take different parameters or return different return values. If prog attached to one kernel function tail calls prog attached to another kernel function, the ctx access or return value verification could be bypassed. For example, if prog1 is attached to func1 which takes only 1 parameter and prog2 is attached to func2 which takes two parameters. Since verifier assumes the bpf ctx passed to prog2 is constructed based on func2's prototype, verifier allows prog2 to access the second parameter from the bpf ctx passed to it. The problem is that verifier does not prevent prog1 from passing its bpf ctx to prog2 via tail call. In this case, the bpf ctx passed to prog2 is constructed from func1 instead of func2, that is, the assumption for ctx access verification is bypassed. Another example, if BPF LSM prog1 is attached to hook file_alloc_security, and BPF LSM prog2 is attached to hook bpf_lsm_audit_rule_known. Verifier knows the return value rules for these two hooks, e.g. it is legal for bpf_lsm_audit_rule_known to return positive number 1, and it is illegal for file_alloc_security to return positive number. So verifier allows prog2 to return positive number 1, but does not allow prog1 to return positive number. The problem is that verifier does not prevent prog1 from calling prog2 via tail call. In this case, prog2's return value 1 will be used as the return value for prog1's hook file_alloc_security. That is, the return value rule is bypassed. This patch adds restriction for tail call to prevent such bypasses. Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20240719110059.797546-4-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- include/linux/bpf.h | 1 + kernel/bpf/core.c | 21 ++++++++++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5739cc9986f8..f16d0753f518 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -294,6 +294,7 @@ struct bpf_map { * same prog type, JITed flag and xdp_has_frags flag. */ struct { + const struct btf_type *attach_func_proto; spinlock_t lock; enum bpf_prog_type type; bool jited; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7ee62e38faf0..4e07cc057d6f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2302,6 +2302,7 @@ bool bpf_prog_map_compatible(struct bpf_map *map, { enum bpf_prog_type prog_type = resolve_prog_type(fp); bool ret; + struct bpf_prog_aux *aux = fp->aux; if (fp->kprobe_override) return false; @@ -2311,7 +2312,7 @@ bool bpf_prog_map_compatible(struct bpf_map *map, * in the case of devmap and cpumap). Until device checks * are implemented, prohibit adding dev-bound programs to program maps. */ - if (bpf_prog_is_dev_bound(fp->aux)) + if (bpf_prog_is_dev_bound(aux)) return false; spin_lock(&map->owner.lock); @@ -2321,12 +2322,26 @@ bool bpf_prog_map_compatible(struct bpf_map *map, */ map->owner.type = prog_type; map->owner.jited = fp->jited; - map->owner.xdp_has_frags = fp->aux->xdp_has_frags; + map->owner.xdp_has_frags = aux->xdp_has_frags; + map->owner.attach_func_proto = aux->attach_func_proto; ret = true; } else { ret = map->owner.type == prog_type && map->owner.jited == fp->jited && - map->owner.xdp_has_frags == fp->aux->xdp_has_frags; + map->owner.xdp_has_frags == aux->xdp_has_frags; + if (ret && + map->owner.attach_func_proto != aux->attach_func_proto) { + switch (prog_type) { + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_STRUCT_OPS: + ret = false; + break; + default: + break; + } + } } spin_unlock(&map->owner.lock); -- cgit v1.2.3 From 92de36080c93296ef9005690705cba260b9bd68a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jul 2024 08:34:39 -0700 Subject: bpf: Fail verification for sign-extension of packet data/data_end/data_meta syzbot reported a kernel crash due to commit 1f1e864b6555 ("bpf: Handle sign-extenstin ctx member accesses"). The reason is due to sign-extension of 32-bit load for packet data/data_end/data_meta uapi field. The original code looks like: r2 = *(s32 *)(r1 + 76) /* load __sk_buff->data */ r3 = *(u32 *)(r1 + 80) /* load __sk_buff->data_end */ r0 = r2 r0 += 8 if r3 > r0 goto +1 ... Note that __sk_buff->data load has 32-bit sign extension. After verification and convert_ctx_accesses(), the final asm code looks like: r2 = *(u64 *)(r1 +208) r2 = (s32)r2 r3 = *(u64 *)(r1 +80) r0 = r2 r0 += 8 if r3 > r0 goto pc+1 ... Note that 'r2 = (s32)r2' may make the kernel __sk_buff->data address invalid which may cause runtime failure. Currently, in C code, typically we have void *data = (void *)(long)skb->data; void *data_end = (void *)(long)skb->data_end; ... and it will generate r2 = *(u64 *)(r1 +208) r3 = *(u64 *)(r1 +80) r0 = r2 r0 += 8 if r3 > r0 goto pc+1 If we allow sign-extension, void *data = (void *)(long)(int)skb->data; void *data_end = (void *)(long)skb->data_end; ... the generated code looks like r2 = *(u64 *)(r1 +208) r2 <<= 32 r2 s>>= 32 r3 = *(u64 *)(r1 +80) r0 = r2 r0 += 8 if r3 > r0 goto pc+1 and this will cause verification failure since "r2 <<= 32" is not allowed as "r2" is a packet pointer. To fix this issue for case r2 = *(s32 *)(r1 + 76) /* load __sk_buff->data */ this patch added additional checking in is_valid_access() callback function for packet data/data_end/data_meta access. If those accesses are with sign-extenstion, the verification will fail. [1] https://lore.kernel.org/bpf/000000000000c90eee061d236d37@google.com/ Reported-by: syzbot+ad9ec60c8eaf69e6f99c@syzkaller.appspotmail.com Fixes: 1f1e864b6555 ("bpf: Handle sign-extenstin ctx member accesses") Acked-by: Eduard Zingerman Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240723153439.2429035-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- include/linux/bpf.h | 1 + kernel/bpf/verifier.c | 5 +++-- net/core/filter.c | 21 ++++++++++++++++----- 3 files changed, 20 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f16d0753f518..f560ea0c2b36 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -920,6 +920,7 @@ static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); */ struct bpf_insn_access_aux { enum bpf_reg_type reg_type; + bool is_ldsx; union { int ctx_field_size; struct { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c419005a29dc..2ab59db5837f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5626,12 +5626,13 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, /* check access to 'struct bpf_context' fields. Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type, - struct btf **btf, u32 *btf_id, bool *is_retval) + struct btf **btf, u32 *btf_id, bool *is_retval, bool is_ldsx) { struct bpf_insn_access_aux info = { .reg_type = *reg_type, .log = &env->log, .is_retval = false, + .is_ldsx = is_ldsx, }; if (env->ops->is_valid_access && @@ -6945,7 +6946,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return err; err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf, - &btf_id, &is_retval); + &btf_id, &is_retval, is_ldsx); if (err) verbose_linfo(env, insn_idx, "; "); if (!err && t == BPF_READ && value_regno >= 0) { diff --git a/net/core/filter.c b/net/core/filter.c index f3c72cf86099..78a6f746ea0b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8579,13 +8579,16 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (off + size > offsetofend(struct __sk_buff, cb[4])) return false; break; + case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, data_end): + if (info->is_ldsx || size != size_default) + return false; + break; case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]): case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): - case bpf_ctx_range(struct __sk_buff, data): - case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range(struct __sk_buff, data_end): if (size != size_default) return false; break; @@ -9029,6 +9032,14 @@ static bool xdp_is_valid_access(int off, int size, } } return false; + } else { + switch (off) { + case offsetof(struct xdp_md, data_meta): + case offsetof(struct xdp_md, data): + case offsetof(struct xdp_md, data_end): + if (info->is_ldsx) + return false; + } } switch (off) { @@ -9354,12 +9365,12 @@ static bool flow_dissector_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, data): - if (size != size_default) + if (info->is_ldsx || size != size_default) return false; info->reg_type = PTR_TO_PACKET; return true; case bpf_ctx_range(struct __sk_buff, data_end): - if (size != size_default) + if (info->is_ldsx || size != size_default) return false; info->reg_type = PTR_TO_PACKET_END; return true; -- cgit v1.2.3 From 5b5f51bff1b66cedb62b5ba74a1878341204e057 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 22 Jul 2024 16:38:36 -0700 Subject: bpf: no_caller_saved_registers attribute for helper calls GCC and LLVM define a no_caller_saved_registers function attribute. This attribute means that function scratches only some of the caller saved registers defined by ABI. For BPF the set of such registers could be defined as follows: - R0 is scratched only if function is non-void; - R1-R5 are scratched only if corresponding parameter type is defined in the function prototype. This commit introduces flag bpf_func_prot->allow_nocsr. If this flag is set for some helper function, verifier assumes that it follows no_caller_saved_registers calling convention. The contract between kernel and clang allows to simultaneously use such functions and maintain backwards compatibility with old kernels that don't understand no_caller_saved_registers calls (nocsr for short): - clang generates a simple pattern for nocsr calls, e.g.: r1 = 1; r2 = 2; *(u64 *)(r10 - 8) = r1; *(u64 *)(r10 - 16) = r2; call %[to_be_inlined] r2 = *(u64 *)(r10 - 16); r1 = *(u64 *)(r10 - 8); r0 = r1; r0 += r2; exit; - kernel removes unnecessary spills and fills, if called function is inlined by verifier or current JIT (with assumption that patch inserted by verifier or JIT honors nocsr contract, e.g. does not scratch r3-r5 for the example above), e.g. the code above would be transformed to: r1 = 1; r2 = 2; call %[to_be_inlined] r0 = r1; r0 += r2; exit; Technically, the transformation is split into the following phases: - function mark_nocsr_patterns(), called from bpf_check() searches and marks potential patterns in instruction auxiliary data; - upon stack read or write access, function check_nocsr_stack_contract() is used to verify if stack offsets, presumably reserved for nocsr patterns, are used only from those patterns; - function remove_nocsr_spills_fills(), called from bpf_check(), applies the rewrite for valid patterns. See comment in mark_nocsr_pattern_for_call() for more details. Suggested-by: Alexei Starovoitov Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- include/linux/bpf.h | 6 + include/linux/bpf_verifier.h | 14 ++ kernel/bpf/verifier.c | 317 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 334 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f560ea0c2b36..b9425e410bcb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -808,6 +808,12 @@ struct bpf_func_proto { bool gpl_only; bool pkt_access; bool might_sleep; + /* set to true if helper follows contract for gcc/llvm + * attribute no_caller_saved_registers: + * - void functions do not scratch r0 + * - functions taking N arguments scratch only registers r1-rN + */ + bool allow_nocsr; enum bpf_return_type ret_type; union { struct { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 731a0a4ac13c..5cea15c81b8a 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -576,6 +576,14 @@ struct bpf_insn_aux_data { bool is_iter_next; /* bpf_iter__next() kfunc call */ bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */ u8 alu_state; /* used in combination with alu_limit */ + /* true if STX or LDX instruction is a part of a spill/fill + * pattern for a no_caller_saved_registers call. + */ + u8 nocsr_pattern:1; + /* for CALL instructions, a number of spill/fill pairs in the + * no_caller_saved_registers pattern. + */ + u8 nocsr_spills_num:3; /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ @@ -645,6 +653,10 @@ struct bpf_subprog_info { u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. stack depth used by this function */ u16 stack_extra; + /* offsets in range [stack_depth .. nocsr_stack_off) + * are used for no_caller_saved_registers spills and fills. + */ + s16 nocsr_stack_off; bool has_tail_call: 1; bool tail_call_reachable: 1; bool has_ld_abs: 1; @@ -652,6 +664,8 @@ struct bpf_subprog_info { bool is_async_cb: 1; bool is_exception_cb: 1; bool args_cached: 1; + /* true if nocsr stack region is used by functions that can't be inlined */ + bool keep_nocsr_stack: 1; u8 arg_cnt; struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS]; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3ed67452f8c6..7587336967cc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4579,6 +4579,31 @@ static int get_reg_width(struct bpf_reg_state *reg) return fls64(reg->umax_value); } +/* See comment for mark_nocsr_pattern_for_call() */ +static void check_nocsr_stack_contract(struct bpf_verifier_env *env, struct bpf_func_state *state, + int insn_idx, int off) +{ + struct bpf_subprog_info *subprog = &env->subprog_info[state->subprogno]; + struct bpf_insn_aux_data *aux = env->insn_aux_data; + int i; + + if (subprog->nocsr_stack_off <= off || aux[insn_idx].nocsr_pattern) + return; + /* access to the region [max_stack_depth .. nocsr_stack_off) + * from something that is not a part of the nocsr pattern, + * disable nocsr rewrites for current subprogram by setting + * nocsr_stack_off to a value smaller than any possible offset. + */ + subprog->nocsr_stack_off = S16_MIN; + /* reset nocsr aux flags within subprogram, + * happens at most once per subprogram + */ + for (i = subprog->start; i < (subprog + 1)->start; ++i) { + aux[i].nocsr_spills_num = 0; + aux[i].nocsr_pattern = 0; + } +} + /* check_stack_{read,write}_fixed_off functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ @@ -4627,6 +4652,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, if (err) return err; + check_nocsr_stack_contract(env, state, insn_idx, off); mark_stack_slot_scratched(env, spi); if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) { bool reg_value_fits; @@ -4761,6 +4787,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, return err; } + check_nocsr_stack_contract(env, state, insn_idx, min_off); /* Variable offset writes destroy any spilled pointers in range. */ for (i = min_off; i < max_off; i++) { u8 new_type, *stype; @@ -4899,6 +4926,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, reg = ®_state->stack[spi].spilled_ptr; mark_stack_slot_scratched(env, spi); + check_nocsr_stack_contract(env, state, env->insn_idx, off); if (is_spilled_reg(®_state->stack[spi])) { u8 spill_size = 1; @@ -5059,6 +5087,7 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env, min_off = reg->smin_value + off; max_off = reg->smax_value + off; mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno); + check_nocsr_stack_contract(env, ptr_state, env->insn_idx, min_off); return 0; } @@ -6772,10 +6801,20 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env, struct bpf_func_state *state, enum bpf_access_type t) { - int min_valid_off; + struct bpf_insn_aux_data *aux = &env->insn_aux_data[env->insn_idx]; + int min_valid_off, max_bpf_stack; + + /* If accessing instruction is a spill/fill from nocsr pattern, + * add room for all caller saved registers below MAX_BPF_STACK. + * In case if nocsr rewrite won't happen maximal stack depth + * would be checked by check_max_stack_depth_subprog(). + */ + max_bpf_stack = MAX_BPF_STACK; + if (aux->nocsr_pattern) + max_bpf_stack += CALLER_SAVED_REGS * BPF_REG_SIZE; if (t == BPF_WRITE || env->allow_uninit_stack) - min_valid_off = -MAX_BPF_STACK; + min_valid_off = -max_bpf_stack; else min_valid_off = -state->allocated_stack; @@ -16061,6 +16100,232 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, return ret; } +/* Bitmask with 1s for all caller saved registers */ +#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1) + +/* Return a bitmask specifying which caller saved registers are + * clobbered by a call to a helper *as if* this helper follows + * no_caller_saved_registers contract: + * - includes R0 if function is non-void; + * - includes R1-R5 if corresponding parameter has is described + * in the function prototype. + */ +static u32 helper_nocsr_clobber_mask(const struct bpf_func_proto *fn) +{ + u8 mask; + int i; + + mask = 0; + if (fn->ret_type != RET_VOID) + mask |= BIT(BPF_REG_0); + for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i) + if (fn->arg_type[i] != ARG_DONTCARE) + mask |= BIT(BPF_REG_1 + i); + return mask; +} + +/* True if do_misc_fixups() replaces calls to helper number 'imm', + * replacement patch is presumed to follow no_caller_saved_registers contract + * (see mark_nocsr_pattern_for_call() below). + */ +static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) +{ + return false; +} + +/* GCC and LLVM define a no_caller_saved_registers function attribute. + * This attribute means that function scratches only some of + * the caller saved registers defined by ABI. + * For BPF the set of such registers could be defined as follows: + * - R0 is scratched only if function is non-void; + * - R1-R5 are scratched only if corresponding parameter type is defined + * in the function prototype. + * + * The contract between kernel and clang allows to simultaneously use + * such functions and maintain backwards compatibility with old + * kernels that don't understand no_caller_saved_registers calls + * (nocsr for short): + * + * - for nocsr calls clang allocates registers as-if relevant r0-r5 + * registers are not scratched by the call; + * + * - as a post-processing step, clang visits each nocsr call and adds + * spill/fill for every live r0-r5; + * + * - stack offsets used for the spill/fill are allocated as lowest + * stack offsets in whole function and are not used for any other + * purposes; + * + * - when kernel loads a program, it looks for such patterns + * (nocsr function surrounded by spills/fills) and checks if + * spill/fill stack offsets are used exclusively in nocsr patterns; + * + * - if so, and if verifier or current JIT inlines the call to the + * nocsr function (e.g. a helper call), kernel removes unnecessary + * spill/fill pairs; + * + * - when old kernel loads a program, presence of spill/fill pairs + * keeps BPF program valid, albeit slightly less efficient. + * + * For example: + * + * r1 = 1; + * r2 = 2; + * *(u64 *)(r10 - 8) = r1; r1 = 1; + * *(u64 *)(r10 - 16) = r2; r2 = 2; + * call %[to_be_inlined] --> call %[to_be_inlined] + * r2 = *(u64 *)(r10 - 16); r0 = r1; + * r1 = *(u64 *)(r10 - 8); r0 += r2; + * r0 = r1; exit; + * r0 += r2; + * exit; + * + * The purpose of mark_nocsr_pattern_for_call is to: + * - look for such patterns; + * - mark spill and fill instructions in env->insn_aux_data[*].nocsr_pattern; + * - mark set env->insn_aux_data[*].nocsr_spills_num for call instruction; + * - update env->subprog_info[*]->nocsr_stack_off to find an offset + * at which nocsr spill/fill stack slots start; + * - update env->subprog_info[*]->keep_nocsr_stack. + * + * The .nocsr_pattern and .nocsr_stack_off are used by + * check_nocsr_stack_contract() to check if every stack access to + * nocsr spill/fill stack slot originates from spill/fill + * instructions, members of nocsr patterns. + * + * If such condition holds true for a subprogram, nocsr patterns could + * be rewritten by remove_nocsr_spills_fills(). + * Otherwise nocsr patterns are not changed in the subprogram + * (code, presumably, generated by an older clang version). + * + * For example, it is *not* safe to remove spill/fill below: + * + * r1 = 1; + * *(u64 *)(r10 - 8) = r1; r1 = 1; + * call %[to_be_inlined] --> call %[to_be_inlined] + * r1 = *(u64 *)(r10 - 8); r0 = *(u64 *)(r10 - 8); <---- wrong !!! + * r0 = *(u64 *)(r10 - 8); r0 += r1; + * r0 += r1; exit; + * exit; + */ +static void mark_nocsr_pattern_for_call(struct bpf_verifier_env *env, + struct bpf_subprog_info *subprog, + int insn_idx, s16 lowest_off) +{ + struct bpf_insn *insns = env->prog->insnsi, *stx, *ldx; + struct bpf_insn *call = &env->prog->insnsi[insn_idx]; + const struct bpf_func_proto *fn; + u32 clobbered_regs_mask = ALL_CALLER_SAVED_REGS; + u32 expected_regs_mask; + bool can_be_inlined = false; + s16 off; + int i; + + if (bpf_helper_call(call)) { + if (get_helper_proto(env, call->imm, &fn) < 0) + /* error would be reported later */ + return; + clobbered_regs_mask = helper_nocsr_clobber_mask(fn); + can_be_inlined = fn->allow_nocsr && + (verifier_inlines_helper_call(env, call->imm) || + bpf_jit_inlines_helper_call(call->imm)); + } + + if (clobbered_regs_mask == ALL_CALLER_SAVED_REGS) + return; + + /* e.g. if helper call clobbers r{0,1}, expect r{2,3,4,5} in the pattern */ + expected_regs_mask = ~clobbered_regs_mask & ALL_CALLER_SAVED_REGS; + + /* match pairs of form: + * + * *(u64 *)(r10 - Y) = rX (where Y % 8 == 0) + * ... + * call %[to_be_inlined] + * ... + * rX = *(u64 *)(r10 - Y) + */ + for (i = 1, off = lowest_off; i <= ARRAY_SIZE(caller_saved); ++i, off += BPF_REG_SIZE) { + if (insn_idx - i < 0 || insn_idx + i >= env->prog->len) + break; + stx = &insns[insn_idx - i]; + ldx = &insns[insn_idx + i]; + /* must be a stack spill/fill pair */ + if (stx->code != (BPF_STX | BPF_MEM | BPF_DW) || + ldx->code != (BPF_LDX | BPF_MEM | BPF_DW) || + stx->dst_reg != BPF_REG_10 || + ldx->src_reg != BPF_REG_10) + break; + /* must be a spill/fill for the same reg */ + if (stx->src_reg != ldx->dst_reg) + break; + /* must be one of the previously unseen registers */ + if ((BIT(stx->src_reg) & expected_regs_mask) == 0) + break; + /* must be a spill/fill for the same expected offset, + * no need to check offset alignment, BPF_DW stack access + * is always 8-byte aligned. + */ + if (stx->off != off || ldx->off != off) + break; + expected_regs_mask &= ~BIT(stx->src_reg); + env->insn_aux_data[insn_idx - i].nocsr_pattern = 1; + env->insn_aux_data[insn_idx + i].nocsr_pattern = 1; + } + if (i == 1) + return; + + /* Conditionally set 'nocsr_spills_num' to allow forward + * compatibility when more helper functions are marked as + * nocsr at compile time than current kernel supports, e.g: + * + * 1: *(u64 *)(r10 - 8) = r1 + * 2: call A ;; assume A is nocsr for current kernel + * 3: r1 = *(u64 *)(r10 - 8) + * 4: *(u64 *)(r10 - 8) = r1 + * 5: call B ;; assume B is not nocsr for current kernel + * 6: r1 = *(u64 *)(r10 - 8) + * + * There is no need to block nocsr rewrite for such program. + * Set 'nocsr_pattern' for both calls to keep check_nocsr_stack_contract() happy, + * don't set 'nocsr_spills_num' for call B so that remove_nocsr_spills_fills() + * does not remove spill/fill pair {4,6}. + */ + if (can_be_inlined) + env->insn_aux_data[insn_idx].nocsr_spills_num = i - 1; + else + subprog->keep_nocsr_stack = 1; + subprog->nocsr_stack_off = min(subprog->nocsr_stack_off, off); +} + +static int mark_nocsr_patterns(struct bpf_verifier_env *env) +{ + struct bpf_subprog_info *subprog = env->subprog_info; + struct bpf_insn *insn; + s16 lowest_off; + int s, i; + + for (s = 0; s < env->subprog_cnt; ++s, ++subprog) { + /* find lowest stack spill offset used in this subprog */ + lowest_off = 0; + for (i = subprog->start; i < (subprog + 1)->start; ++i) { + insn = env->prog->insnsi + i; + if (insn->code != (BPF_STX | BPF_MEM | BPF_DW) || + insn->dst_reg != BPF_REG_10) + continue; + lowest_off = min(lowest_off, insn->off); + } + /* use this offset to find nocsr patterns */ + for (i = subprog->start; i < (subprog + 1)->start; ++i) { + insn = env->prog->insnsi + i; + if (insn->code != (BPF_JMP | BPF_CALL)) + continue; + mark_nocsr_pattern_for_call(env, subprog, i, lowest_off); + } + } + return 0; +} + /* Visits the instruction at index t and returns one of the following: * < 0 - an error occurred * DONE_EXPLORING - the instruction was fully explored @@ -19209,9 +19474,11 @@ static int opt_remove_dead_code(struct bpf_verifier_env *env) return 0; } +static const struct bpf_insn NOP = BPF_JMP_IMM(BPF_JA, 0, 0, 0); + static int opt_remove_nops(struct bpf_verifier_env *env) { - const struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0); + const struct bpf_insn ja = NOP; struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; int i, err; @@ -20957,6 +21224,40 @@ static int optimize_bpf_loop(struct bpf_verifier_env *env) return 0; } +/* Remove unnecessary spill/fill pairs, members of nocsr pattern, + * adjust subprograms stack depth when possible. + */ +static int remove_nocsr_spills_fills(struct bpf_verifier_env *env) +{ + struct bpf_subprog_info *subprog = env->subprog_info; + struct bpf_insn_aux_data *aux = env->insn_aux_data; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + u32 spills_num; + bool modified = false; + int i, j; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (aux[i].nocsr_spills_num > 0) { + spills_num = aux[i].nocsr_spills_num; + /* NOPs would be removed by opt_remove_nops() */ + for (j = 1; j <= spills_num; ++j) { + *(insn - j) = NOP; + *(insn + j) = NOP; + } + modified = true; + } + if ((subprog + 1)->start == i + 1) { + if (modified && !subprog->keep_nocsr_stack) + subprog->stack_depth = -subprog->nocsr_stack_off; + subprog++; + modified = false; + } + } + + return 0; +} + static void free_states(struct bpf_verifier_env *env) { struct bpf_verifier_state_list *sl, *sln; @@ -21871,6 +22172,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; + ret = mark_nocsr_patterns(env); + if (ret < 0) + goto skip_full_check; + ret = do_check_main(env); ret = ret ?: do_check_subprogs(env); @@ -21880,6 +22185,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 skip_full_check: kvfree(env->explored_states); + /* might decrease stack depth, keep it before passes that + * allocate additional slots. + */ + if (ret == 0) + ret = remove_nocsr_spills_fills(env); + if (ret == 0) ret = check_max_stack_depth(env); -- cgit v1.2.3 From 1cbe8143fd2f588031a5f157d15ae15ce12215e2 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Tue, 30 Jul 2024 13:37:33 +0800 Subject: bpf: kprobe: Remove unused declaring of bpf_kprobe_override After the commit 66665ad2f102 ("tracing/kprobe: bpf: Compare instruction pointer with original one"), "bpf_kprobe_override" is not used anywhere anymore, and we can remove it now. Fixes: 66665ad2f102 ("tracing/kprobe: bpf: Compare instruction pointer with original one") Signed-off-by: Menglong Dong Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240730053733.885785-1-dongml2@chinatelecom.cn --- include/linux/trace_events.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 9df3e2973626..9435185c10ef 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -880,7 +880,6 @@ do { \ struct perf_event; DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); -DECLARE_PER_CPU(int, bpf_kprobe_override); extern int perf_trace_init(struct perf_event *event); extern void perf_trace_destroy(struct perf_event *event); -- cgit v1.2.3 From 7f6287417baf57754f47687c6ea1a749a0686ab0 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Mon, 19 Aug 2024 18:28:05 +0200 Subject: bpf: Allow bpf_current_task_under_cgroup() with BPF_CGROUP_* The helper bpf_current_task_under_cgroup() currently is only allowed for tracing programs, allow its usage also in the BPF_CGROUP_* program types. Move the code from kernel/trace/bpf_trace.c to kernel/bpf/helpers.c, so it compiles also without CONFIG_BPF_EVENTS. This will be used in systemd-networkd to monitor the sysctl writes, and filter it's own writes from others: https://github.com/systemd/systemd/pull/32212 Signed-off-by: Matteo Croce Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240819162805.78235-3-technoboy85@gmail.com --- include/linux/bpf.h | 1 + kernel/bpf/cgroup.c | 2 ++ kernel/bpf/helpers.c | 23 +++++++++++++++++++++++ kernel/trace/bpf_trace.c | 27 ++------------------------- 4 files changed, 28 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b9425e410bcb..f0192c173ed8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3206,6 +3206,7 @@ extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; extern const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto; extern const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto; +extern const struct bpf_func_proto bpf_current_task_under_cgroup_proto; extern const struct bpf_func_proto bpf_msg_redirect_hash_proto; extern const struct bpf_func_proto bpf_msg_redirect_map_proto; extern const struct bpf_func_proto bpf_sk_redirect_hash_proto; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 8ba73042a239..e7113d700b87 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -2581,6 +2581,8 @@ cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_curr_proto; #endif + case BPF_FUNC_current_task_under_cgroup: + return &bpf_current_task_under_cgroup_proto; default: return NULL; } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 26b9649ab4ce..12e3aa40b180 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2458,6 +2458,29 @@ __bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task, return ret; } +BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct cgroup *cgrp; + + if (unlikely(idx >= array->map.max_entries)) + return -E2BIG; + + cgrp = READ_ONCE(array->ptrs[idx]); + if (unlikely(!cgrp)) + return -EAGAIN; + + return task_under_cgroup_hierarchy(current, cgrp); +} + +const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { + .func = bpf_current_task_under_cgroup, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + /** * bpf_task_get_cgroup1 - Acquires the associated cgroup of a task within a * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d557bb11e0ff..b69a39316c0c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -797,29 +797,6 @@ const struct bpf_func_proto bpf_task_pt_regs_proto = { .ret_btf_id = &bpf_task_pt_regs_ids[0], }; -BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) -{ - struct bpf_array *array = container_of(map, struct bpf_array, map); - struct cgroup *cgrp; - - if (unlikely(idx >= array->map.max_entries)) - return -E2BIG; - - cgrp = READ_ONCE(array->ptrs[idx]); - if (unlikely(!cgrp)) - return -EAGAIN; - - return task_under_cgroup_hierarchy(current, cgrp); -} - -static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { - .func = bpf_current_task_under_cgroup, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_ANYTHING, -}; - struct send_signal_irq_work { struct irq_work irq_work; struct task_struct *task; @@ -1480,8 +1457,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_numa_node_id_proto; case BPF_FUNC_perf_event_read: return &bpf_perf_event_read_proto; - case BPF_FUNC_current_task_under_cgroup: - return &bpf_current_task_under_cgroup_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_probe_write_user: @@ -1510,6 +1485,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_cgrp_storage_get_proto; case BPF_FUNC_cgrp_storage_delete: return &bpf_cgrp_storage_delete_proto; + case BPF_FUNC_current_task_under_cgroup: + return &bpf_current_task_under_cgroup_proto; #endif case BPF_FUNC_send_signal: return &bpf_send_signal_proto; -- cgit v1.2.3 From 496ddd19a0fad22f250fc7a7b7a8000155418934 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 8 Aug 2024 16:22:28 -0700 Subject: bpf: extract iterator argument type and name validation logic Verifier enforces that all iterator structs are named `bpf_iter_` and that whenever iterator is passed to a kfunc it's passed as a valid PTR -> STRUCT chain (with potentially const modifiers in between). We'll need this check for upcoming changes, so instead of duplicating the logic, extract it into a helper function. Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240808232230.2848712-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 5 +++++ kernel/bpf/btf.c | 50 ++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 41 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index cffb43133c68..b8a583194c4a 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -580,6 +580,7 @@ bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type); bool btf_types_are_same(const struct btf *btf1, u32 id1, const struct btf *btf2, u32 id2); +int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx); #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) @@ -654,6 +655,10 @@ static inline bool btf_types_are_same(const struct btf *btf1, u32 id1, { return false; } +static inline int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx) +{ + return -EOPNOTSUPP; +} #endif static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index b12db397303e..c9338fb397fc 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -8047,15 +8047,44 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE) BTF_TRACING_TYPE_xxx #undef BTF_TRACING_TYPE +/* Validate well-formedness of iter argument type. + * On success, return positive BTF ID of iter state's STRUCT type. + * On error, negative error is returned. + */ +int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx) +{ + const struct btf_param *arg; + const struct btf_type *t; + const char *name; + int btf_id; + + if (btf_type_vlen(func) <= arg_idx) + return -EINVAL; + + arg = &btf_params(func)[arg_idx]; + t = btf_type_skip_modifiers(btf, arg->type, NULL); + if (!t || !btf_type_is_ptr(t)) + return -EINVAL; + t = btf_type_skip_modifiers(btf, t->type, &btf_id); + if (!t || !__btf_type_is_struct(t)) + return -EINVAL; + + name = btf_name_by_offset(btf, t->name_off); + if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1)) + return -EINVAL; + + return btf_id; +} + static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name, const struct btf_type *func, u32 func_flags) { u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY); - const char *name, *sfx, *iter_name; - const struct btf_param *arg; + const char *sfx, *iter_name; const struct btf_type *t; char exp_name[128]; u32 nr_args; + int btf_id; /* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */ if (!flags || (flags & (flags - 1))) @@ -8066,28 +8095,21 @@ static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name, if (nr_args < 1) return -EINVAL; - arg = &btf_params(func)[0]; - t = btf_type_skip_modifiers(btf, arg->type, NULL); - if (!t || !btf_type_is_ptr(t)) - return -EINVAL; - t = btf_type_skip_modifiers(btf, t->type, NULL); - if (!t || !__btf_type_is_struct(t)) - return -EINVAL; - - name = btf_name_by_offset(btf, t->name_off); - if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1)) - return -EINVAL; + btf_id = btf_check_iter_arg(btf, func, 0); + if (btf_id < 0) + return btf_id; /* sizeof(struct bpf_iter_) should be a multiple of 8 to * fit nicely in stack slots */ + t = btf_type_by_id(btf, btf_id); if (t->size == 0 || (t->size % 8)) return -EINVAL; /* validate bpf_iter__{new,next,destroy}(struct bpf_iter_ *) * naming pattern */ - iter_name = name + sizeof(ITER_PREFIX) - 1; + iter_name = btf_name_by_offset(btf, t->name_off) + sizeof(ITER_PREFIX) - 1; if (flags & KF_ITER_NEW) sfx = "new"; else if (flags & KF_ITER_NEXT) -- cgit v1.2.3 From ae010757a55b57c8b82628e8ea9b7da2269131d9 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 22 Aug 2024 01:41:07 -0700 Subject: bpf: rename nocsr -> bpf_fastcall in verifier Attribute used by LLVM implementation of the feature had been changed from no_caller_saved_registers to bpf_fastcall (see [1]). This commit replaces references to nocsr by references to bpf_fastcall to keep LLVM and Kernel parts in sync. [1] https://github.com/llvm/llvm-project/pull/105417 Acked-by: Yonghong Song Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240822084112.3257995-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 +- include/linux/bpf_verifier.h | 18 +++--- kernel/bpf/helpers.c | 2 +- kernel/bpf/verifier.c | 143 +++++++++++++++++++++---------------------- 4 files changed, 84 insertions(+), 85 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f0192c173ed8..00dc4dd28cbd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -808,12 +808,12 @@ struct bpf_func_proto { bool gpl_only; bool pkt_access; bool might_sleep; - /* set to true if helper follows contract for gcc/llvm - * attribute no_caller_saved_registers: + /* set to true if helper follows contract for llvm + * attribute bpf_fastcall: * - void functions do not scratch r0 * - functions taking N arguments scratch only registers r1-rN */ - bool allow_nocsr; + bool allow_fastcall; enum bpf_return_type ret_type; union { struct { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5cea15c81b8a..634a302a39e3 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -577,13 +577,13 @@ struct bpf_insn_aux_data { bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */ u8 alu_state; /* used in combination with alu_limit */ /* true if STX or LDX instruction is a part of a spill/fill - * pattern for a no_caller_saved_registers call. + * pattern for a bpf_fastcall call. */ - u8 nocsr_pattern:1; + u8 fastcall_pattern:1; /* for CALL instructions, a number of spill/fill pairs in the - * no_caller_saved_registers pattern. + * bpf_fastcall pattern. */ - u8 nocsr_spills_num:3; + u8 fastcall_spills_num:3; /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ @@ -653,10 +653,10 @@ struct bpf_subprog_info { u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. stack depth used by this function */ u16 stack_extra; - /* offsets in range [stack_depth .. nocsr_stack_off) - * are used for no_caller_saved_registers spills and fills. + /* offsets in range [stack_depth .. fastcall_stack_off) + * are used for bpf_fastcall spills and fills. */ - s16 nocsr_stack_off; + s16 fastcall_stack_off; bool has_tail_call: 1; bool tail_call_reachable: 1; bool has_ld_abs: 1; @@ -664,8 +664,8 @@ struct bpf_subprog_info { bool is_async_cb: 1; bool is_exception_cb: 1; bool args_cached: 1; - /* true if nocsr stack region is used by functions that can't be inlined */ - bool keep_nocsr_stack: 1; + /* true if bpf_fastcall stack region is used by functions that can't be inlined */ + bool keep_fastcall_stack: 1; u8 arg_cnt; struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS]; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ccca6fe0367c..4105b4af6e03 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -158,7 +158,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = { .func = bpf_get_smp_processor_id, .gpl_only = false, .ret_type = RET_INTEGER, - .allow_nocsr = true, + .allow_fastcall = true, }; BPF_CALL_0(bpf_get_numa_node_id) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 920d7c5fe944..0dfd91f36417 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4579,28 +4579,28 @@ static int get_reg_width(struct bpf_reg_state *reg) return fls64(reg->umax_value); } -/* See comment for mark_nocsr_pattern_for_call() */ -static void check_nocsr_stack_contract(struct bpf_verifier_env *env, struct bpf_func_state *state, - int insn_idx, int off) +/* See comment for mark_fastcall_pattern_for_call() */ +static void check_fastcall_stack_contract(struct bpf_verifier_env *env, + struct bpf_func_state *state, int insn_idx, int off) { struct bpf_subprog_info *subprog = &env->subprog_info[state->subprogno]; struct bpf_insn_aux_data *aux = env->insn_aux_data; int i; - if (subprog->nocsr_stack_off <= off || aux[insn_idx].nocsr_pattern) + if (subprog->fastcall_stack_off <= off || aux[insn_idx].fastcall_pattern) return; - /* access to the region [max_stack_depth .. nocsr_stack_off) - * from something that is not a part of the nocsr pattern, - * disable nocsr rewrites for current subprogram by setting - * nocsr_stack_off to a value smaller than any possible offset. + /* access to the region [max_stack_depth .. fastcall_stack_off) + * from something that is not a part of the fastcall pattern, + * disable fastcall rewrites for current subprogram by setting + * fastcall_stack_off to a value smaller than any possible offset. */ - subprog->nocsr_stack_off = S16_MIN; - /* reset nocsr aux flags within subprogram, + subprog->fastcall_stack_off = S16_MIN; + /* reset fastcall aux flags within subprogram, * happens at most once per subprogram */ for (i = subprog->start; i < (subprog + 1)->start; ++i) { - aux[i].nocsr_spills_num = 0; - aux[i].nocsr_pattern = 0; + aux[i].fastcall_spills_num = 0; + aux[i].fastcall_pattern = 0; } } @@ -4652,7 +4652,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, if (err) return err; - check_nocsr_stack_contract(env, state, insn_idx, off); + check_fastcall_stack_contract(env, state, insn_idx, off); mark_stack_slot_scratched(env, spi); if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) { bool reg_value_fits; @@ -4787,7 +4787,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, return err; } - check_nocsr_stack_contract(env, state, insn_idx, min_off); + check_fastcall_stack_contract(env, state, insn_idx, min_off); /* Variable offset writes destroy any spilled pointers in range. */ for (i = min_off; i < max_off; i++) { u8 new_type, *stype; @@ -4926,7 +4926,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, reg = ®_state->stack[spi].spilled_ptr; mark_stack_slot_scratched(env, spi); - check_nocsr_stack_contract(env, state, env->insn_idx, off); + check_fastcall_stack_contract(env, state, env->insn_idx, off); if (is_spilled_reg(®_state->stack[spi])) { u8 spill_size = 1; @@ -5087,7 +5087,7 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env, min_off = reg->smin_value + off; max_off = reg->smax_value + off; mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno); - check_nocsr_stack_contract(env, ptr_state, env->insn_idx, min_off); + check_fastcall_stack_contract(env, ptr_state, env->insn_idx, min_off); return 0; } @@ -6804,13 +6804,13 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env, struct bpf_insn_aux_data *aux = &env->insn_aux_data[env->insn_idx]; int min_valid_off, max_bpf_stack; - /* If accessing instruction is a spill/fill from nocsr pattern, + /* If accessing instruction is a spill/fill from bpf_fastcall pattern, * add room for all caller saved registers below MAX_BPF_STACK. - * In case if nocsr rewrite won't happen maximal stack depth + * In case if bpf_fastcall rewrite won't happen maximal stack depth * would be checked by check_max_stack_depth_subprog(). */ max_bpf_stack = MAX_BPF_STACK; - if (aux->nocsr_pattern) + if (aux->fastcall_pattern) max_bpf_stack += CALLER_SAVED_REGS * BPF_REG_SIZE; if (t == BPF_WRITE || env->allow_uninit_stack) @@ -16118,12 +16118,12 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, /* Return a bitmask specifying which caller saved registers are * clobbered by a call to a helper *as if* this helper follows - * no_caller_saved_registers contract: + * bpf_fastcall contract: * - includes R0 if function is non-void; * - includes R1-R5 if corresponding parameter has is described * in the function prototype. */ -static u32 helper_nocsr_clobber_mask(const struct bpf_func_proto *fn) +static u32 helper_fastcall_clobber_mask(const struct bpf_func_proto *fn) { u8 mask; int i; @@ -16138,8 +16138,8 @@ static u32 helper_nocsr_clobber_mask(const struct bpf_func_proto *fn) } /* True if do_misc_fixups() replaces calls to helper number 'imm', - * replacement patch is presumed to follow no_caller_saved_registers contract - * (see mark_nocsr_pattern_for_call() below). + * replacement patch is presumed to follow bpf_fastcall contract + * (see mark_fastcall_pattern_for_call() below). */ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) { @@ -16153,7 +16153,7 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) } } -/* GCC and LLVM define a no_caller_saved_registers function attribute. +/* LLVM define a bpf_fastcall function attribute. * This attribute means that function scratches only some of * the caller saved registers defined by ABI. * For BPF the set of such registers could be defined as follows: @@ -16163,13 +16163,12 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) * * The contract between kernel and clang allows to simultaneously use * such functions and maintain backwards compatibility with old - * kernels that don't understand no_caller_saved_registers calls - * (nocsr for short): + * kernels that don't understand bpf_fastcall calls: * - * - for nocsr calls clang allocates registers as-if relevant r0-r5 + * - for bpf_fastcall calls clang allocates registers as-if relevant r0-r5 * registers are not scratched by the call; * - * - as a post-processing step, clang visits each nocsr call and adds + * - as a post-processing step, clang visits each bpf_fastcall call and adds * spill/fill for every live r0-r5; * * - stack offsets used for the spill/fill are allocated as lowest @@ -16177,11 +16176,11 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) * purposes; * * - when kernel loads a program, it looks for such patterns - * (nocsr function surrounded by spills/fills) and checks if - * spill/fill stack offsets are used exclusively in nocsr patterns; + * (bpf_fastcall function surrounded by spills/fills) and checks if + * spill/fill stack offsets are used exclusively in fastcall patterns; * * - if so, and if verifier or current JIT inlines the call to the - * nocsr function (e.g. a helper call), kernel removes unnecessary + * bpf_fastcall function (e.g. a helper call), kernel removes unnecessary * spill/fill pairs; * * - when old kernel loads a program, presence of spill/fill pairs @@ -16200,22 +16199,22 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) * r0 += r2; * exit; * - * The purpose of mark_nocsr_pattern_for_call is to: + * The purpose of mark_fastcall_pattern_for_call is to: * - look for such patterns; - * - mark spill and fill instructions in env->insn_aux_data[*].nocsr_pattern; - * - mark set env->insn_aux_data[*].nocsr_spills_num for call instruction; - * - update env->subprog_info[*]->nocsr_stack_off to find an offset - * at which nocsr spill/fill stack slots start; - * - update env->subprog_info[*]->keep_nocsr_stack. + * - mark spill and fill instructions in env->insn_aux_data[*].fastcall_pattern; + * - mark set env->insn_aux_data[*].fastcall_spills_num for call instruction; + * - update env->subprog_info[*]->fastcall_stack_off to find an offset + * at which bpf_fastcall spill/fill stack slots start; + * - update env->subprog_info[*]->keep_fastcall_stack. * - * The .nocsr_pattern and .nocsr_stack_off are used by - * check_nocsr_stack_contract() to check if every stack access to - * nocsr spill/fill stack slot originates from spill/fill - * instructions, members of nocsr patterns. + * The .fastcall_pattern and .fastcall_stack_off are used by + * check_fastcall_stack_contract() to check if every stack access to + * fastcall spill/fill stack slot originates from spill/fill + * instructions, members of fastcall patterns. * - * If such condition holds true for a subprogram, nocsr patterns could - * be rewritten by remove_nocsr_spills_fills(). - * Otherwise nocsr patterns are not changed in the subprogram + * If such condition holds true for a subprogram, fastcall patterns could + * be rewritten by remove_fastcall_spills_fills(). + * Otherwise bpf_fastcall patterns are not changed in the subprogram * (code, presumably, generated by an older clang version). * * For example, it is *not* safe to remove spill/fill below: @@ -16228,9 +16227,9 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) * r0 += r1; exit; * exit; */ -static void mark_nocsr_pattern_for_call(struct bpf_verifier_env *env, - struct bpf_subprog_info *subprog, - int insn_idx, s16 lowest_off) +static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env, + struct bpf_subprog_info *subprog, + int insn_idx, s16 lowest_off) { struct bpf_insn *insns = env->prog->insnsi, *stx, *ldx; struct bpf_insn *call = &env->prog->insnsi[insn_idx]; @@ -16245,8 +16244,8 @@ static void mark_nocsr_pattern_for_call(struct bpf_verifier_env *env, if (get_helper_proto(env, call->imm, &fn) < 0) /* error would be reported later */ return; - clobbered_regs_mask = helper_nocsr_clobber_mask(fn); - can_be_inlined = fn->allow_nocsr && + clobbered_regs_mask = helper_fastcall_clobber_mask(fn); + can_be_inlined = fn->allow_fastcall && (verifier_inlines_helper_call(env, call->imm) || bpf_jit_inlines_helper_call(call->imm)); } @@ -16289,36 +16288,36 @@ static void mark_nocsr_pattern_for_call(struct bpf_verifier_env *env, if (stx->off != off || ldx->off != off) break; expected_regs_mask &= ~BIT(stx->src_reg); - env->insn_aux_data[insn_idx - i].nocsr_pattern = 1; - env->insn_aux_data[insn_idx + i].nocsr_pattern = 1; + env->insn_aux_data[insn_idx - i].fastcall_pattern = 1; + env->insn_aux_data[insn_idx + i].fastcall_pattern = 1; } if (i == 1) return; - /* Conditionally set 'nocsr_spills_num' to allow forward + /* Conditionally set 'fastcall_spills_num' to allow forward * compatibility when more helper functions are marked as - * nocsr at compile time than current kernel supports, e.g: + * bpf_fastcall at compile time than current kernel supports, e.g: * * 1: *(u64 *)(r10 - 8) = r1 - * 2: call A ;; assume A is nocsr for current kernel + * 2: call A ;; assume A is bpf_fastcall for current kernel * 3: r1 = *(u64 *)(r10 - 8) * 4: *(u64 *)(r10 - 8) = r1 - * 5: call B ;; assume B is not nocsr for current kernel + * 5: call B ;; assume B is not bpf_fastcall for current kernel * 6: r1 = *(u64 *)(r10 - 8) * - * There is no need to block nocsr rewrite for such program. - * Set 'nocsr_pattern' for both calls to keep check_nocsr_stack_contract() happy, - * don't set 'nocsr_spills_num' for call B so that remove_nocsr_spills_fills() + * There is no need to block bpf_fastcall rewrite for such program. + * Set 'fastcall_pattern' for both calls to keep check_fastcall_stack_contract() happy, + * don't set 'fastcall_spills_num' for call B so that remove_fastcall_spills_fills() * does not remove spill/fill pair {4,6}. */ if (can_be_inlined) - env->insn_aux_data[insn_idx].nocsr_spills_num = i - 1; + env->insn_aux_data[insn_idx].fastcall_spills_num = i - 1; else - subprog->keep_nocsr_stack = 1; - subprog->nocsr_stack_off = min(subprog->nocsr_stack_off, off); + subprog->keep_fastcall_stack = 1; + subprog->fastcall_stack_off = min(subprog->fastcall_stack_off, off); } -static int mark_nocsr_patterns(struct bpf_verifier_env *env) +static int mark_fastcall_patterns(struct bpf_verifier_env *env) { struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn; @@ -16335,12 +16334,12 @@ static int mark_nocsr_patterns(struct bpf_verifier_env *env) continue; lowest_off = min(lowest_off, insn->off); } - /* use this offset to find nocsr patterns */ + /* use this offset to find fastcall patterns */ for (i = subprog->start; i < (subprog + 1)->start; ++i) { insn = env->prog->insnsi + i; if (insn->code != (BPF_JMP | BPF_CALL)) continue; - mark_nocsr_pattern_for_call(env, subprog, i, lowest_off); + mark_fastcall_pattern_for_call(env, subprog, i, lowest_off); } } return 0; @@ -21244,10 +21243,10 @@ static int optimize_bpf_loop(struct bpf_verifier_env *env) return 0; } -/* Remove unnecessary spill/fill pairs, members of nocsr pattern, +/* Remove unnecessary spill/fill pairs, members of fastcall pattern, * adjust subprograms stack depth when possible. */ -static int remove_nocsr_spills_fills(struct bpf_verifier_env *env) +static int remove_fastcall_spills_fills(struct bpf_verifier_env *env) { struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn_aux_data *aux = env->insn_aux_data; @@ -21258,8 +21257,8 @@ static int remove_nocsr_spills_fills(struct bpf_verifier_env *env) int i, j; for (i = 0; i < insn_cnt; i++, insn++) { - if (aux[i].nocsr_spills_num > 0) { - spills_num = aux[i].nocsr_spills_num; + if (aux[i].fastcall_spills_num > 0) { + spills_num = aux[i].fastcall_spills_num; /* NOPs would be removed by opt_remove_nops() */ for (j = 1; j <= spills_num; ++j) { *(insn - j) = NOP; @@ -21268,8 +21267,8 @@ static int remove_nocsr_spills_fills(struct bpf_verifier_env *env) modified = true; } if ((subprog + 1)->start == i + 1) { - if (modified && !subprog->keep_nocsr_stack) - subprog->stack_depth = -subprog->nocsr_stack_off; + if (modified && !subprog->keep_fastcall_stack) + subprog->stack_depth = -subprog->fastcall_stack_off; subprog++; modified = false; } @@ -22192,7 +22191,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; - ret = mark_nocsr_patterns(env); + ret = mark_fastcall_patterns(env); if (ret < 0) goto skip_full_check; @@ -22209,7 +22208,7 @@ skip_full_check: * allocate additional slots. */ if (ret == 0) - ret = remove_nocsr_spills_fills(env); + ret = remove_fastcall_spills_fills(env); if (ret == 0) ret = check_max_stack_depth(env); -- cgit v1.2.3 From d59232afb0344e33e9399f308d9b4a03876e7676 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 13 Aug 2024 21:24:22 +0000 Subject: bpf: Rename ARG_PTR_TO_KPTR -> ARG_KPTR_XCHG_DEST ARG_PTR_TO_KPTR is currently only used by the bpf_kptr_xchg helper. Although it limits reg types for that helper's first arg to PTR_TO_MAP_VALUE, any arbitrary mapval won't do: further custom verification logic ensures that the mapval reg being xchgd-into is pointing to a kptr field. If this is not the case, it's not safe to xchg into that reg's pointee. Let's rename the bpf_arg_type to more accurately describe the fairly specific expectations that this arg type encodes. This is a nonfunctional change. Acked-by: Martin KaFai Lau Signed-off-by: Dave Marchevsky Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20240813212424.2871455-4-amery.hung@bytedance.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- kernel/bpf/helpers.c | 2 +- kernel/bpf/verifier.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 00dc4dd28cbd..dc63083f76b7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -744,7 +744,7 @@ enum bpf_arg_type { ARG_PTR_TO_STACK, /* pointer to stack */ ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ - ARG_PTR_TO_KPTR, /* pointer to referenced kptr */ + ARG_KPTR_XCHG_DEST, /* pointer to destination that kptrs are bpf_kptr_xchg'd into */ ARG_PTR_TO_DYNPTR, /* pointer to bpf_dynptr. See bpf_type_flag for dynptr type */ __BPF_ARG_TYPE_MAX, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4105b4af6e03..bf55b81b3e06 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1636,7 +1636,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .ret_btf_id = BPF_PTR_POISON, - .arg1_type = ARG_PTR_TO_KPTR, + .arg1_type = ARG_KPTR_XCHG_DEST, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE, .arg2_btf_id = BPF_PTR_POISON, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 148536446457..5fe2a8978bb8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8412,7 +8412,7 @@ static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } }; -static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types kptr_xchg_dest_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types dynptr_types = { .types = { PTR_TO_STACK, @@ -8444,7 +8444,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_STACK] = &stack_ptr_types, [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, [ARG_PTR_TO_TIMER] = &timer_types, - [ARG_PTR_TO_KPTR] = &kptr_types, + [ARG_KPTR_XCHG_DEST] = &kptr_xchg_dest_types, [ARG_PTR_TO_DYNPTR] = &dynptr_types, }; @@ -9044,7 +9044,7 @@ skip_type_check: return err; break; } - case ARG_PTR_TO_KPTR: + case ARG_KPTR_XCHG_DEST: err = process_kptr_func(env, regno, meta); if (err) return err; -- cgit v1.2.3 From 6f606ffd6dd7583d8194ee3d858ba4da2eff26a3 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Aug 2024 14:08:23 -0700 Subject: bpf: Move insn_buf[16] to bpf_verifier_env This patch moves the 'struct bpf_insn insn_buf[16]' stack usage to the bpf_verifier_env. A '#define INSN_BUF_SIZE 16' is also added to replace the ARRAY_SIZE(insn_buf) usages. Both convert_ctx_accesses() and do_misc_fixup() are changed to use the env->insn_buf. It is a refactoring work for adding the epilogue_buf[16] in a later patch. With this patch, the stack size usage decreased. Before: ./kernel/bpf/verifier.c:22133:5: warning: stack frame size (2584) After: ./kernel/bpf/verifier.c:22184:5: warning: stack frame size (2264) Reviewed-by: Eduard Zingerman Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240829210833.388152-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 3 +++ kernel/bpf/verifier.c | 15 ++++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 279b4a640644..0ad2d189c546 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -23,6 +23,8 @@ * (in the "-8,-16,...,-512" form) */ #define TMP_STR_BUF_LEN 320 +/* Patch buffer size */ +#define INSN_BUF_SIZE 16 /* Liveness marks, used for registers and spilled-regs (in stack slots). * Read marks propagate upwards until they find a write mark; they record that @@ -780,6 +782,7 @@ struct bpf_verifier_env { * e.g., in reg_type_str() to generate reg_type string */ char tmp_str_buf[TMP_STR_BUF_LEN]; + struct bpf_insn insn_buf[INSN_BUF_SIZE]; }; static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f32e3b9bb4e5..261849384ea8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19677,7 +19677,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) const struct bpf_verifier_ops *ops = env->ops; int i, cnt, size, ctx_field_size, delta = 0; const int insn_cnt = env->prog->len; - struct bpf_insn insn_buf[16], *insn; + struct bpf_insn *insn_buf = env->insn_buf; + struct bpf_insn *insn; u32 target_size, size_default, off; struct bpf_prog *new_prog; enum bpf_access_type type; @@ -19690,7 +19691,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, env->prog); - if (cnt >= ARRAY_SIZE(insn_buf)) { + if (cnt >= INSN_BUF_SIZE) { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } else if (cnt) { @@ -19837,7 +19838,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) target_size = 0; cnt = convert_ctx_access(type, insn, insn_buf, env->prog, &target_size); - if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || + if (cnt == 0 || cnt >= INSN_BUF_SIZE || (ctx_field_size && !target_size)) { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; @@ -19846,7 +19847,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (is_narrower_load && size < target_size) { u8 shift = bpf_ctx_narrow_access_offset( off, size, size_default) * 8; - if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) { + if (shift && cnt + 1 >= INSN_BUF_SIZE) { verbose(env, "bpf verifier narrow ctx load misconfigured\n"); return -EINVAL; } @@ -20391,7 +20392,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) const int insn_cnt = prog->len; const struct bpf_map_ops *ops; struct bpf_insn_aux_data *aux; - struct bpf_insn insn_buf[16]; + struct bpf_insn *insn_buf = env->insn_buf; struct bpf_prog *new_prog; struct bpf_map *map_ptr; int i, ret, cnt, delta = 0, cur_subprog = 0; @@ -20510,7 +20511,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) (BPF_MODE(insn->code) == BPF_ABS || BPF_MODE(insn->code) == BPF_IND)) { cnt = env->ops->gen_ld_abs(insn, insn_buf); - if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + if (cnt == 0 || cnt >= INSN_BUF_SIZE) { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -20803,7 +20804,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) cnt = ops->map_gen_lookup(map_ptr, insn_buf); if (cnt == -EOPNOTSUPP) goto patch_map_ops_generic; - if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) { + if (cnt <= 0 || cnt >= INSN_BUF_SIZE) { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } -- cgit v1.2.3 From 169c31761c8d7f606f3ee628829c27998626c4f0 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Aug 2024 14:08:25 -0700 Subject: bpf: Add gen_epilogue to bpf_verifier_ops This patch adds a .gen_epilogue to the bpf_verifier_ops. It is similar to the existing .gen_prologue. Instead of allowing a subsystem to run code at the beginning of a bpf prog, it allows the subsystem to run code just before the bpf prog exit. One of the use case is to allow the upcoming bpf qdisc to ensure that the skb->dev is the same as the qdisc->dev_queue->dev. The bpf qdisc struct_ops implementation could either fix it up or drop the skb. Another use case could be in bpf_tcp_ca.c to enforce snd_cwnd has sane value (e.g. non zero). The epilogue can do the useful thing (like checking skb->dev) if it can access the bpf prog's ctx. Unlike prologue, r1 may not hold the ctx pointer. This patch saves the r1 in the stack if the .gen_epilogue has returned some instructions in the "epilogue_buf". The existing .gen_prologue is done in convert_ctx_accesses(). The new .gen_epilogue is done in the convert_ctx_accesses() also. When it sees the (BPF_JMP | BPF_EXIT) instruction, it will be patched with the earlier generated "epilogue_buf". The epilogue patching is only done for the main prog. Only one epilogue will be patched to the main program. When the bpf prog has multiple BPF_EXIT instructions, a BPF_JA is used to goto the earlier patched epilogue. Majority of the archs support (BPF_JMP32 | BPF_JA): x86, arm, s390, risv64, loongarch, powerpc and arc. This patch keeps it simple and always use (BPF_JMP32 | BPF_JA). A new macro BPF_JMP32_A is added to generate the (BPF_JMP32 | BPF_JA) insn. Acked-by: Eduard Zingerman Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240829210833.388152-4-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ include/linux/bpf_verifier.h | 1 + include/linux/filter.h | 10 ++++++++++ kernel/bpf/verifier.c | 46 +++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 58 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dc63083f76b7..6f87fb014fba 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -974,6 +974,8 @@ struct bpf_verifier_ops { struct bpf_insn_access_aux *info); int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, const struct bpf_prog *prog); + int (*gen_epilogue)(struct bpf_insn *insn, const struct bpf_prog *prog, + s16 ctx_stack_off); int (*gen_ld_abs)(const struct bpf_insn *orig, struct bpf_insn *insn_buf); u32 (*convert_ctx_access)(enum bpf_access_type type, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 0ad2d189c546..2e20207315a9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -783,6 +783,7 @@ struct bpf_verifier_env { */ char tmp_str_buf[TMP_STR_BUF_LEN]; struct bpf_insn insn_buf[INSN_BUF_SIZE]; + struct bpf_insn epilogue_buf[INSN_BUF_SIZE]; }; static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) diff --git a/include/linux/filter.h b/include/linux/filter.h index b6672ff61407..99b6fc83825b 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -437,6 +437,16 @@ static inline bool insn_is_cast_user(const struct bpf_insn *insn) .off = OFF, \ .imm = 0 }) +/* Unconditional jumps, gotol pc + imm32 */ + +#define BPF_JMP32_A(IMM) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_JA, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + /* Relative call */ #define BPF_CALL_REL(TGT) \ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 03e974129c05..fdb1d39c8b58 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19677,15 +19677,39 @@ apply_patch_buffer: */ static int convert_ctx_accesses(struct bpf_verifier_env *env) { + struct bpf_subprog_info *subprogs = env->subprog_info; const struct bpf_verifier_ops *ops = env->ops; - int i, cnt, size, ctx_field_size, delta = 0; + int i, cnt, size, ctx_field_size, delta = 0, epilogue_cnt = 0; const int insn_cnt = env->prog->len; + struct bpf_insn *epilogue_buf = env->epilogue_buf; struct bpf_insn *insn_buf = env->insn_buf; struct bpf_insn *insn; u32 target_size, size_default, off; struct bpf_prog *new_prog; enum bpf_access_type type; bool is_narrower_load; + int epilogue_idx = 0; + + if (ops->gen_epilogue) { + epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog, + -(subprogs[0].stack_depth + 8)); + if (epilogue_cnt >= INSN_BUF_SIZE) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } else if (epilogue_cnt) { + /* Save the ARG_PTR_TO_CTX for the epilogue to use */ + cnt = 0; + subprogs[0].stack_depth += 8; + insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1, + -subprogs[0].stack_depth); + insn_buf[cnt++] = env->prog->insnsi[0]; + new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + env->prog = new_prog; + delta += cnt - 1; + } + } if (ops->gen_prologue || env->seen_direct_write) { if (!ops->gen_prologue) { @@ -19742,6 +19766,25 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code); env->prog->aux->num_exentries++; continue; + } else if (insn->code == (BPF_JMP | BPF_EXIT) && + epilogue_cnt && + i + delta < subprogs[1].start) { + /* Generate epilogue for the main prog */ + if (epilogue_idx) { + /* jump back to the earlier generated epilogue */ + insn_buf[0] = BPF_JMP32_A(epilogue_idx - i - delta - 1); + cnt = 1; + } else { + memcpy(insn_buf, epilogue_buf, + epilogue_cnt * sizeof(*epilogue_buf)); + cnt = epilogue_cnt; + /* epilogue_idx cannot be 0. It must have at + * least one ctx ptr saving insn before the + * epilogue. + */ + epilogue_idx = i + delta; + } + goto patch_insn_buf; } else { continue; } @@ -19878,6 +19921,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->dst_reg, insn->dst_reg, size * 8, 0); +patch_insn_buf: new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; -- cgit v1.2.3 From 940ce73bdec5a020fec058ba947d2bf627462c53 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 4 Sep 2024 11:08:44 -0700 Subject: bpf: Remove the insn_buf array stack usage from the inline_bpf_loop() This patch removes the insn_buf array stack usage from the inline_bpf_loop(). Instead, the env->insn_buf is used. The usage in inline_bpf_loop() needs more than 16 insn, so the INSN_BUF_SIZE needs to be increased from 16 to 32. The compiler stack size warning on the verifier is gone after this change. Cc: Eduard Zingerman Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240904180847.56947-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- kernel/bpf/verifier.c | 83 ++++++++++++++++++++++---------------------- 2 files changed, 43 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 2e20207315a9..8458632824a4 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -24,7 +24,7 @@ */ #define TMP_STR_BUF_LEN 320 /* Patch buffer size */ -#define INSN_BUF_SIZE 16 +#define INSN_BUF_SIZE 32 /* Liveness marks, used for registers and spilled-regs (in stack slots). * Read marks propagate upwards until they find a write mark; they record that diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 217eb0eafa2a..8bfb14d1eb7a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21232,7 +21232,7 @@ static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env, int position, s32 stack_base, u32 callback_subprogno, - u32 *cnt) + u32 *total_cnt) { s32 r6_offset = stack_base + 0 * BPF_REG_SIZE; s32 r7_offset = stack_base + 1 * BPF_REG_SIZE; @@ -21241,55 +21241,56 @@ static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env, int reg_loop_cnt = BPF_REG_7; int reg_loop_ctx = BPF_REG_8; + struct bpf_insn *insn_buf = env->insn_buf; struct bpf_prog *new_prog; u32 callback_start; u32 call_insn_offset; s32 callback_offset; + u32 cnt = 0; /* This represents an inlined version of bpf_iter.c:bpf_loop, * be careful to modify this code in sync. */ - struct bpf_insn insn_buf[] = { - /* Return error and jump to the end of the patch if - * expected number of iterations is too big. - */ - BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2), - BPF_MOV32_IMM(BPF_REG_0, -E2BIG), - BPF_JMP_IMM(BPF_JA, 0, 0, 16), - /* spill R6, R7, R8 to use these as loop vars */ - BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset), - BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset), - BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset), - /* initialize loop vars */ - BPF_MOV64_REG(reg_loop_max, BPF_REG_1), - BPF_MOV32_IMM(reg_loop_cnt, 0), - BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3), - /* loop header, - * if reg_loop_cnt >= reg_loop_max skip the loop body - */ - BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5), - /* callback call, - * correct callback offset would be set after patching - */ - BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt), - BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx), - BPF_CALL_REL(0), - /* increment loop counter */ - BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1), - /* jump to loop header if callback returned 0 */ - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6), - /* return value of bpf_loop, - * set R0 to the number of iterations - */ - BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt), - /* restore original values of R6, R7, R8 */ - BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset), - BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset), - BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset), - }; - *cnt = ARRAY_SIZE(insn_buf); - new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt); + /* Return error and jump to the end of the patch if + * expected number of iterations is too big. + */ + insn_buf[cnt++] = BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2); + insn_buf[cnt++] = BPF_MOV32_IMM(BPF_REG_0, -E2BIG); + insn_buf[cnt++] = BPF_JMP_IMM(BPF_JA, 0, 0, 16); + /* spill R6, R7, R8 to use these as loop vars */ + insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset); + insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset); + insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset); + /* initialize loop vars */ + insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_max, BPF_REG_1); + insn_buf[cnt++] = BPF_MOV32_IMM(reg_loop_cnt, 0); + insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3); + /* loop header, + * if reg_loop_cnt >= reg_loop_max skip the loop body + */ + insn_buf[cnt++] = BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5); + /* callback call, + * correct callback offset would be set after patching + */ + insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt); + insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx); + insn_buf[cnt++] = BPF_CALL_REL(0); + /* increment loop counter */ + insn_buf[cnt++] = BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1); + /* jump to loop header if callback returned 0 */ + insn_buf[cnt++] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6); + /* return value of bpf_loop, + * set R0 to the number of iterations + */ + insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt); + /* restore original values of R6, R7, R8 */ + insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset); + insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset); + insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset); + + *total_cnt = cnt; + new_prog = bpf_patch_insn_data(env, position, insn_buf, cnt); if (!new_prog) return new_prog; -- cgit v1.2.3 From 1ae497c78f01855f3695b58481311ffdd429b028 Mon Sep 17 00:00:00 2001 From: Shung-Hsi Yu Date: Thu, 5 Sep 2024 13:52:32 +0800 Subject: bpf: use type_may_be_null() helper for nullable-param check Commit 980ca8ceeae6 ("bpf: check bpf_dummy_struct_ops program params for test runs") does bitwise AND between reg_type and PTR_MAYBE_NULL, which is correct, but due to type difference the compiler complains: net/bpf/bpf_dummy_struct_ops.c:118:31: warning: bitwise operation between different enumeration types ('const enum bpf_reg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion] 118 | if (info && (info->reg_type & PTR_MAYBE_NULL)) | ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~~~~ Workaround the warning by moving the type_may_be_null() helper from verifier.c into bpf_verifier.h, and reuse it here to check whether param is nullable. Fixes: 980ca8ceeae6 ("bpf: check bpf_dummy_struct_ops program params for test runs") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202404241956.HEiRYwWq-lkp@intel.com/ Signed-off-by: Shung-Hsi Yu Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240905055233.70203-1-shung-hsi.yu@suse.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 +++++ kernel/bpf/verifier.c | 5 ----- net/bpf/bpf_dummy_struct_ops.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 8458632824a4..4513372c5bc8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -927,6 +927,11 @@ static inline bool type_is_sk_pointer(enum bpf_reg_type type) type == PTR_TO_XDP_SOCK; } +static inline bool type_may_be_null(u32 type) +{ + return type & PTR_MAYBE_NULL; +} + static inline void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno) { env->scratched_regs |= 1U << regno; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b806afeba212..53d0556fbbf3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -383,11 +383,6 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env, verbose(env, " should have been in [%d, %d]\n", range.minval, range.maxval); } -static bool type_may_be_null(u32 type) -{ - return type & PTR_MAYBE_NULL; -} - static bool reg_not_null(const struct bpf_reg_state *reg) { enum bpf_reg_type type; diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index 3ea52b05adfb..f71f67c6896b 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -115,7 +115,7 @@ static int check_test_run_args(struct bpf_prog *prog, struct bpf_dummy_ops_test_ offset = btf_ctx_arg_offset(bpf_dummy_ops_btf, func_proto, arg_no); info = find_ctx_arg_info(prog->aux, offset); - if (info && (info->reg_type & PTR_MAYBE_NULL)) + if (info && type_may_be_null(info->reg_type)) continue; return -EINVAL; -- cgit v1.2.3 From 45b8fc3096542a53bfd245a9ad8ef870384b4897 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 29 Aug 2024 10:42:27 -0700 Subject: lib/buildid: rename build_id_parse() into build_id_parse_nofault() Make it clear that build_id_parse() assumes that it can take no page fault by renaming it and current few users to build_id_parse_nofault(). Also add build_id_parse() stub which for now falls back to non-sleepable implementation, but will be changed in subsequent patches to take advantage of sleepable context. PROCMAP_QUERY ioctl() on /proc//maps file is using build_id_parse() and will automatically take advantage of more reliable sleepable context implementation. Reviewed-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240829174232.3133883-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/buildid.h | 4 ++-- kernel/bpf/stackmap.c | 2 +- kernel/events/core.c | 2 +- lib/buildid.c | 25 ++++++++++++++++++++++--- 4 files changed, 26 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buildid.h b/include/linux/buildid.h index 20aa3c2d89f7..014a88c41073 100644 --- a/include/linux/buildid.h +++ b/include/linux/buildid.h @@ -7,8 +7,8 @@ #define BUILD_ID_SIZE_MAX 20 struct vm_area_struct; -int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, - __u32 *size); +int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size); +int build_id_parse_nofault(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size); int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size); #if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_VMCORE_INFO) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index c99f8e5234ac..770ae8e88016 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -156,7 +156,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, goto build_id_valid; } vma = find_vma(current->mm, ips[i]); - if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) { + if (!vma || build_id_parse_nofault(vma, id_offs[i].build_id, NULL)) { /* per entry fall back to ips */ id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; diff --git a/kernel/events/core.c b/kernel/events/core.c index c973e3c11e03..c78a77f9dce4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8851,7 +8851,7 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; if (atomic_read(&nr_build_id_events)) - build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size); + build_id_parse_nofault(vma, mmap_event->build_id, &mmap_event->build_id_size); perf_iterate_sb(perf_event_mmap_output, mmap_event, diff --git a/lib/buildid.c b/lib/buildid.c index e8fc4aeb01f2..c1cbd34f3685 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -293,10 +293,12 @@ static int get_build_id_64(struct freader *r, unsigned char *build_id, __u32 *si * @build_id: buffer to store build id, at least BUILD_ID_SIZE long * @size: returns actual build id size in case of success * - * Return: 0 on success, -EINVAL otherwise + * Assumes no page fault can be taken, so if relevant portions of ELF file are + * not already paged in, fetching of build ID fails. + * + * Return: 0 on success; negative error, otherwise */ -int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, - __u32 *size) +int build_id_parse_nofault(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size) { const Elf32_Ehdr *ehdr; struct freader r; @@ -335,6 +337,23 @@ out: return ret; } +/* + * Parse build ID of ELF file mapped to VMA + * @vma: vma object + * @build_id: buffer to store build id, at least BUILD_ID_SIZE long + * @size: returns actual build id size in case of success + * + * Assumes faultable context and can cause page faults to bring in file data + * into page cache. + * + * Return: 0 on success; negative error, otherwise + */ +int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size) +{ + /* fallback to non-faultable version for now */ + return build_id_parse_nofault(vma, build_id, size); +} + /** * build_id_parse_buf - Get build ID from a buffer * @buf: ELF note section(s) to parse -- cgit v1.2.3 From d4dd9775ec242425576af93daadb80a34083a53c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 29 Aug 2024 10:42:31 -0700 Subject: bpf: wire up sleepable bpf_get_stack() and bpf_get_task_stack() helpers Add sleepable implementations of bpf_get_stack() and bpf_get_task_stack() helpers and allow them to be used from sleepable BPF program (e.g., sleepable uprobes). Note, the stack trace IPs capturing itself is not sleepable (that would need to be a separate project), only build ID fetching is sleepable and thus more reliable, as it will wait for data to be paged in, if necessary. For that we make use of sleepable build_id_parse() implementation. Now that build ID related internals in kernel/bpf/stackmap.c can be used both in sleepable and non-sleepable contexts, we need to add additional rcu_read_lock()/rcu_read_unlock() protection around fetching perf_callchain_entry, but with the refactoring in previous commit it's now pretty straightforward. We make sure to do rcu_read_unlock (in sleepable mode only) right before stack_map_get_build_id_offset() call which can sleep. By that time we don't have any more use of perf_callchain_entry. Note, bpf_get_task_stack() will fail for user mode if task != current. And for kernel mode build ID are irrelevant. So in that sense adding sleepable bpf_get_task_stack() implementation is a no-op. It feel right to wire this up for symmetry and completeness, but I'm open to just dropping it until we support `user && crosstask` condition. Reviewed-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240829174232.3133883-10-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ kernel/bpf/stackmap.c | 90 ++++++++++++++++++++++++++++++++++++++---------- kernel/trace/bpf_trace.c | 5 +-- 3 files changed, 77 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6f87fb014fba..7f3e8477d5e7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3200,7 +3200,9 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; +extern const struct bpf_func_proto bpf_get_stack_sleepable_proto; extern const struct bpf_func_proto bpf_get_task_stack_proto; +extern const struct bpf_func_proto bpf_get_task_stack_sleepable_proto; extern const struct bpf_func_proto bpf_get_stackid_proto_pe; extern const struct bpf_func_proto bpf_get_stack_proto_pe; extern const struct bpf_func_proto bpf_sock_map_update_proto; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 6457222b0b46..3615c06b7dfa 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -124,6 +124,12 @@ free_smap: return ERR_PTR(err); } +static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, bool may_fault) +{ + return may_fault ? build_id_parse(vma, build_id, NULL) + : build_id_parse_nofault(vma, build_id, NULL); +} + /* * Expects all id_offs[i].ip values to be set to correct initial IPs. * They will be subsequently: @@ -135,7 +141,7 @@ free_smap: * BPF_STACK_BUILD_ID_IP. */ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, - u32 trace_nr, bool user) + u32 trace_nr, bool user, bool may_fault) { int i; struct mmap_unlock_irq_work *work = NULL; @@ -166,7 +172,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, goto build_id_valid; } vma = find_vma(current->mm, ip); - if (!vma || build_id_parse_nofault(vma, id_offs[i].build_id, NULL)) { + if (!vma || fetch_build_id(vma, id_offs[i].build_id, may_fault)) { /* per entry fall back to ips */ id_offs[i].status = BPF_STACK_BUILD_ID_IP; memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); @@ -257,7 +263,7 @@ static long __bpf_get_stackid(struct bpf_map *map, id_offs = (struct bpf_stack_build_id *)new_bucket->data; for (i = 0; i < trace_nr; i++) id_offs[i].ip = ips[i]; - stack_map_get_build_id_offset(id_offs, trace_nr, user); + stack_map_get_build_id_offset(id_offs, trace_nr, user, false /* !may_fault */); trace_len = trace_nr * sizeof(struct bpf_stack_build_id); if (hash_matches && bucket->nr == trace_nr && memcmp(bucket->data, new_bucket->data, trace_len) == 0) { @@ -398,7 +404,7 @@ const struct bpf_func_proto bpf_get_stackid_proto_pe = { static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, struct perf_callchain_entry *trace_in, - void *buf, u32 size, u64 flags) + void *buf, u32 size, u64 flags, bool may_fault) { u32 trace_nr, copy_len, elem_size, num_elem, max_depth; bool user_build_id = flags & BPF_F_USER_BUILD_ID; @@ -416,8 +422,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, if (kernel && user_build_id) goto clear; - elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id) - : sizeof(u64); + elem_size = user_build_id ? sizeof(struct bpf_stack_build_id) : sizeof(u64); if (unlikely(size % elem_size)) goto clear; @@ -438,6 +443,9 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, if (sysctl_perf_event_max_stack < max_depth) max_depth = sysctl_perf_event_max_stack; + if (may_fault) + rcu_read_lock(); /* need RCU for perf's callchain below */ + if (trace_in) trace = trace_in; else if (kernel && task) @@ -445,28 +453,35 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, else trace = get_perf_callchain(regs, 0, kernel, user, max_depth, crosstask, false); - if (unlikely(!trace)) - goto err_fault; - if (trace->nr < skip) + if (unlikely(!trace) || trace->nr < skip) { + if (may_fault) + rcu_read_unlock(); goto err_fault; + } trace_nr = trace->nr - skip; trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem; copy_len = trace_nr * elem_size; ips = trace->ip + skip; - if (user && user_build_id) { + if (user_build_id) { struct bpf_stack_build_id *id_offs = buf; u32 i; for (i = 0; i < trace_nr; i++) id_offs[i].ip = ips[i]; - stack_map_get_build_id_offset(buf, trace_nr, user); } else { memcpy(buf, ips, copy_len); } + /* trace/ips should not be dereferenced after this point */ + if (may_fault) + rcu_read_unlock(); + + if (user_build_id) + stack_map_get_build_id_offset(buf, trace_nr, user, may_fault); + if (size > copy_len) memset(buf + copy_len, 0, size - copy_len); return copy_len; @@ -481,7 +496,7 @@ clear: BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, u64, flags) { - return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); + return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */); } const struct bpf_func_proto bpf_get_stack_proto = { @@ -494,8 +509,24 @@ const struct bpf_func_proto bpf_get_stack_proto = { .arg4_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, - u32, size, u64, flags) +BPF_CALL_4(bpf_get_stack_sleepable, struct pt_regs *, regs, void *, buf, u32, size, + u64, flags) +{ + return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, true /* may_fault */); +} + +const struct bpf_func_proto bpf_get_stack_sleepable_proto = { + .func = bpf_get_stack_sleepable, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + +static long __bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, + u64 flags, bool may_fault) { struct pt_regs *regs; long res = -EINVAL; @@ -505,12 +536,18 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, regs = task_pt_regs(task); if (regs) - res = __bpf_get_stack(regs, task, NULL, buf, size, flags); + res = __bpf_get_stack(regs, task, NULL, buf, size, flags, may_fault); put_task_stack(task); return res; } +BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, + u32, size, u64, flags) +{ + return __bpf_get_task_stack(task, buf, size, flags, false /* !may_fault */); +} + const struct bpf_func_proto bpf_get_task_stack_proto = { .func = bpf_get_task_stack, .gpl_only = false, @@ -522,6 +559,23 @@ const struct bpf_func_proto bpf_get_task_stack_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_task_stack_sleepable, struct task_struct *, task, void *, buf, + u32, size, u64, flags) +{ + return __bpf_get_task_stack(task, buf, size, flags, true /* !may_fault */); +} + +const struct bpf_func_proto bpf_get_task_stack_sleepable_proto = { + .func = bpf_get_task_stack_sleepable, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, void *, buf, u32, size, u64, flags) { @@ -533,7 +587,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, __u64 nr_kernel; if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) - return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); + return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */); if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_USER_BUILD_ID))) @@ -553,7 +607,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, __u64 nr = trace->nr; trace->nr = nr_kernel; - err = __bpf_get_stack(regs, NULL, trace, buf, size, flags); + err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */); /* restore nr */ trace->nr = nr; @@ -565,7 +619,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, goto clear; flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; - err = __bpf_get_stack(regs, NULL, trace, buf, size, flags); + err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */); } return err; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 98e395f1baae..68b5905c6eb5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1507,7 +1507,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; case BPF_FUNC_get_task_stack: - return &bpf_get_task_stack_proto; + return prog->sleepable ? &bpf_get_task_stack_sleepable_proto + : &bpf_get_task_stack_proto; case BPF_FUNC_copy_from_user: return &bpf_copy_from_user_proto; case BPF_FUNC_copy_from_user_task: @@ -1563,7 +1564,7 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto; case BPF_FUNC_get_stack: - return &bpf_get_stack_proto; + return prog->sleepable ? &bpf_get_stack_sleepable_proto : &bpf_get_stack_proto; #ifdef CONFIG_BPF_KPROBE_OVERRIDE case BPF_FUNC_override_return: return &bpf_override_return_proto; -- cgit v1.2.3 From 32556ce93bc45c730829083cb60f95a2728ea48b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 13 Sep 2024 21:17:48 +0200 Subject: bpf: Fix helper writes to read-only maps Lonial found an issue that despite user- and BPF-side frozen BPF map (like in case of .rodata), it was still possible to write into it from a BPF program side through specific helpers having ARG_PTR_TO_{LONG,INT} as arguments. In check_func_arg() when the argument is as mentioned, the meta->raw_mode is never set. Later, check_helper_mem_access(), under the case of PTR_TO_MAP_VALUE as register base type, it assumes BPF_READ for the subsequent call to check_map_access_type() and given the BPF map is read-only it succeeds. The helpers really need to be annotated as ARG_PTR_TO_{LONG,INT} | MEM_UNINIT when results are written into them as opposed to read out of them. The latter indicates that it's okay to pass a pointer to uninitialized memory as the memory is written to anyway. However, ARG_PTR_TO_{LONG,INT} is a special case of ARG_PTR_TO_FIXED_SIZE_MEM just with additional alignment requirement. So it is better to just get rid of the ARG_PTR_TO_{LONG,INT} special cases altogether and reuse the fixed size memory types. For this, add MEM_ALIGNED to additionally ensure alignment given these helpers write directly into the args via * = val. The .arg*_size has been initialized reflecting the actual sizeof(*). MEM_ALIGNED can only be used in combination with MEM_FIXED_SIZE annotated argument types, since in !MEM_FIXED_SIZE cases the verifier does not know the buffer size a priori and therefore cannot blindly write * = val. Fixes: 57c3bb725a3d ("bpf: Introduce ARG_PTR_TO_{INT,LONG} arg types") Reported-by: Lonial Con Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20240913191754.13290-3-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 +++++-- kernel/bpf/helpers.c | 6 ++++-- kernel/bpf/syscall.c | 3 ++- kernel/bpf/verifier.c | 41 +++++------------------------------------ kernel/trace/bpf_trace.c | 6 ++++-- net/core/filter.c | 6 ++++-- 6 files changed, 24 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7f3e8477d5e7..0c3893c47171 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -695,6 +695,11 @@ enum bpf_type_flag { /* DYNPTR points to xdp_buff */ DYNPTR_TYPE_XDP = BIT(16 + BPF_BASE_TYPE_BITS), + /* Memory must be aligned on some architectures, used in combination with + * MEM_FIXED_SIZE. + */ + MEM_ALIGNED = BIT(17 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; @@ -732,8 +737,6 @@ enum bpf_arg_type { ARG_ANYTHING, /* any (initialized) argument is ok */ ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ - ARG_PTR_TO_INT, /* pointer to int */ - ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ ARG_PTR_TO_RINGBUF_MEM, /* pointer to dynamically reserved ringbuf memory */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5404bb964d83..5e97fdc6b20f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -537,7 +537,8 @@ const struct bpf_func_proto bpf_strtol_proto = { .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_LONG, + .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg4_size = sizeof(s64), }; BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags, @@ -563,7 +564,8 @@ const struct bpf_func_proto bpf_strtoul_proto = { .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_LONG, + .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg4_size = sizeof(u64), }; BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6988e432fc3d..a7cd3719e26a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5954,7 +5954,8 @@ static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { .arg1_type = ARG_PTR_TO_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_LONG, + .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg4_size = sizeof(u64), }; static const struct bpf_func_proto * diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 068f763dcefb..7d62fd576089 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8301,16 +8301,6 @@ static bool arg_type_is_dynptr(enum bpf_arg_type type) return base_type(type) == ARG_PTR_TO_DYNPTR; } -static int int_ptr_type_to_size(enum bpf_arg_type type) -{ - if (type == ARG_PTR_TO_INT) - return sizeof(u32); - else if (type == ARG_PTR_TO_LONG) - return sizeof(u64); - - return -EINVAL; -} - static int resolve_map_arg_type(struct bpf_verifier_env *env, const struct bpf_call_arg_meta *meta, enum bpf_arg_type *arg_type) @@ -8383,16 +8373,6 @@ static const struct bpf_reg_types mem_types = { }, }; -static const struct bpf_reg_types int_ptr_types = { - .types = { - PTR_TO_STACK, - PTR_TO_PACKET, - PTR_TO_PACKET_META, - PTR_TO_MAP_KEY, - PTR_TO_MAP_VALUE, - }, -}; - static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE, @@ -8453,8 +8433,6 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types, [ARG_PTR_TO_MEM] = &mem_types, [ARG_PTR_TO_RINGBUF_MEM] = &ringbuf_mem_types, - [ARG_PTR_TO_INT] = &int_ptr_types, - [ARG_PTR_TO_LONG] = &int_ptr_types, [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, [ARG_PTR_TO_FUNC] = &func_ptr_types, [ARG_PTR_TO_STACK] = &stack_ptr_types, @@ -9017,9 +8995,11 @@ skip_type_check: */ meta->raw_mode = arg_type & MEM_UNINIT; if (arg_type & MEM_FIXED_SIZE) { - err = check_helper_mem_access(env, regno, - fn->arg_size[arg], false, - meta); + err = check_helper_mem_access(env, regno, fn->arg_size[arg], false, meta); + if (err) + return err; + if (arg_type & MEM_ALIGNED) + err = check_ptr_alignment(env, reg, 0, fn->arg_size[arg], true); } break; case ARG_CONST_SIZE: @@ -9044,17 +9024,6 @@ skip_type_check: if (err) return err; break; - case ARG_PTR_TO_INT: - case ARG_PTR_TO_LONG: - { - int size = int_ptr_type_to_size(arg_type); - - err = check_helper_mem_access(env, regno, size, false, meta); - if (err) - return err; - err = check_ptr_alignment(env, reg, 0, size, true); - break; - } case ARG_PTR_TO_CONST_STR: { err = check_reg_const_str(env, reg, regno); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 68b5905c6eb5..b3283c250e27 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1202,7 +1202,8 @@ static const struct bpf_func_proto bpf_get_func_arg_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_LONG, + .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg3_size = sizeof(u64), }; BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value) @@ -1218,7 +1219,8 @@ static const struct bpf_func_proto bpf_get_func_ret_proto = { .func = get_func_ret, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_LONG, + .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg2_size = sizeof(u64), }; BPF_CALL_1(get_func_arg_cnt, void *, ctx) diff --git a/net/core/filter.c b/net/core/filter.c index ecf2ddf633bf..d0550c87e520 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6346,7 +6346,8 @@ static const struct bpf_func_proto bpf_skb_check_mtu_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_INT, + .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg3_size = sizeof(u32), .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -6357,7 +6358,8 @@ static const struct bpf_func_proto bpf_xdp_check_mtu_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_INT, + .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg3_size = sizeof(u32), .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; -- cgit v1.2.3