diff options
Diffstat (limited to 'kernel/bpf/helpers.c')
| -rw-r--r-- | kernel/bpf/helpers.c | 1082 |
1 files changed, 1031 insertions, 51 deletions
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b71e428ad936..8eb117c52817 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -24,6 +24,10 @@ #include <linux/bpf_mem_alloc.h> #include <linux/kasan.h> #include <linux/bpf_verifier.h> +#include <linux/uaccess.h> +#include <linux/verification.h> +#include <linux/task_work.h> +#include <linux/irq_work.h> #include "../../lib/kstrtox.h" @@ -763,30 +767,19 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, return -EINVAL; } -/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary - * arguments representation. - */ -#define MAX_BPRINTF_BIN_ARGS 512 - /* Support executing three nested bprintf helper calls on a given CPU */ #define MAX_BPRINTF_NEST_LEVEL 3 -struct bpf_bprintf_buffers { - char bin_args[MAX_BPRINTF_BIN_ARGS]; - char buf[MAX_BPRINTF_BUF]; -}; static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs); static DEFINE_PER_CPU(int, bpf_bprintf_nest_level); -static int try_get_buffers(struct bpf_bprintf_buffers **bufs) +int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs) { int nest_level; - preempt_disable(); nest_level = this_cpu_inc_return(bpf_bprintf_nest_level); if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) { this_cpu_dec(bpf_bprintf_nest_level); - preempt_enable(); return -EBUSY; } *bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]); @@ -794,14 +787,18 @@ static int try_get_buffers(struct bpf_bprintf_buffers **bufs) return 0; } -void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) +void bpf_put_buffers(void) { - if (!data->bin_args && !data->buf) - return; if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0)) return; this_cpu_dec(bpf_bprintf_nest_level); - preempt_enable(); +} + +void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) +{ + if (!data->bin_args && !data->buf) + return; + bpf_put_buffers(); } /* @@ -818,7 +815,7 @@ void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) * In argument preparation mode, if 0 is returned, safe temporary buffers are * allocated and bpf_bprintf_cleanup should be called to free them after use. */ -int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, +int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args, u32 num_args, struct bpf_bprintf_data *data) { bool get_buffers = (data->get_bin_args && num_args) || data->get_buf; @@ -834,7 +831,7 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, return -EINVAL; fmt_size = fmt_end - fmt; - if (get_buffers && try_get_buffers(&buffers)) + if (get_buffers && bpf_try_get_buffers(&buffers)) return -EBUSY; if (data->get_bin_args) { @@ -884,6 +881,13 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, if (fmt[i] == 'p') { sizeof_cur_arg = sizeof(long); + if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) || + ispunct(fmt[i + 1])) { + if (tmp_buf) + cur_arg = raw_args[num_spec]; + goto nocopy_fmt; + } + if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') && fmt[i + 2] == 's') { fmt_ptype = fmt[i + 1]; @@ -891,11 +895,9 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, goto fmt_str; } - if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) || - ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' || + if (fmt[i + 1] == 'K' || fmt[i + 1] == 'x' || fmt[i + 1] == 's' || fmt[i + 1] == 'S') { - /* just kernel pointers */ if (tmp_buf) cur_arg = raw_args[num_spec]; i++; @@ -1082,6 +1084,17 @@ const struct bpf_func_proto bpf_snprintf_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx) +{ + if (map->map_type == BPF_MAP_TYPE_ARRAY) { + struct bpf_array *array = container_of(map, struct bpf_array, map); + + *arr_idx = ((char *)value - array->value) / array->elem_size; + return arr_idx; + } + return (void *)value - round_up(map->key_size, 8); +} + struct bpf_async_cb { struct bpf_map *map; struct bpf_prog *prog; @@ -1164,15 +1177,8 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) * bpf_map_delete_elem() on the same timer. */ this_cpu_write(hrtimer_running, t); - if (map->map_type == BPF_MAP_TYPE_ARRAY) { - struct bpf_array *array = container_of(map, struct bpf_array, map); - /* compute the key */ - idx = ((char *)value - array->value) / array->elem_size; - key = &idx; - } else { /* hash or lru */ - key = value - round_up(map->key_size, 8); - } + key = map_key_from_value(map, value, &idx); callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0); /* The verifier checked that return value is zero. */ @@ -1198,15 +1204,7 @@ static void bpf_wq_work(struct work_struct *work) if (!callback_fn) return; - if (map->map_type == BPF_MAP_TYPE_ARRAY) { - struct bpf_array *array = container_of(map, struct bpf_array, map); - - /* compute the key */ - idx = ((char *)value - array->value) / array->elem_size; - key = &idx; - } else { /* hash or lru */ - key = value - round_up(map->key_size, 8); - } + key = map_key_from_value(map, value, &idx); rcu_read_lock_trace(); migrate_disable(); @@ -1217,13 +1215,20 @@ static void bpf_wq_work(struct work_struct *work) rcu_read_unlock_trace(); } +static void bpf_async_cb_rcu_free(struct rcu_head *rcu) +{ + struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu); + + kfree_nolock(cb); +} + static void bpf_wq_delete_work(struct work_struct *work) { struct bpf_work *w = container_of(work, struct bpf_work, delete_work); cancel_work_sync(&w->work); - kfree_rcu(w, cb.rcu); + call_rcu(&w->cb.rcu, bpf_async_cb_rcu_free); } static void bpf_timer_delete_work(struct work_struct *work) @@ -1232,13 +1237,13 @@ static void bpf_timer_delete_work(struct work_struct *work) /* Cancel the timer and wait for callback to complete if it was running. * If hrtimer_cancel() can be safely called it's safe to call - * kfree_rcu(t) right after for both preallocated and non-preallocated + * call_rcu() right after for both preallocated and non-preallocated * maps. The async->cb = NULL was already done and no code path can see * address 't' anymore. Timer if armed for existing bpf_hrtimer before * bpf_timer_cancel_and_free will have been cancelled. */ hrtimer_cancel(&t->timer); - kfree_rcu(t, cb.rcu); + call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free); } static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, @@ -1272,8 +1277,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u goto out; } - /* allocate hrtimer via map_kmalloc to use memcg accounting */ - cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node); + cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node); if (!cb) { ret = -ENOMEM; goto out; @@ -1314,7 +1318,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u * or pinned in bpffs. */ WRITE_ONCE(async->cb, NULL); - kfree(cb); + kfree_nolock(cb); ret = -EPERM; } out: @@ -1579,7 +1583,7 @@ void bpf_timer_cancel_and_free(void *val) * timer _before_ calling us, such that failing to cancel it here will * cause it to possibly use struct hrtimer after freeing bpf_hrtimer. * Therefore, we _need_ to cancel any outstanding timers before we do - * kfree_rcu, even though no more timers can be armed. + * call_rcu, even though no more timers can be armed. * * Moreover, we need to schedule work even if timer does not belong to * the calling callback_fn, as on two different CPUs, we can end up in a @@ -1595,7 +1599,7 @@ void bpf_timer_cancel_and_free(void *val) * timer callback. */ if (this_cpu_read(hrtimer_running)) { - queue_work(system_unbound_wq, &t->cb.delete_work); + queue_work(system_dfl_wq, &t->cb.delete_work); return; } @@ -1606,9 +1610,9 @@ void bpf_timer_cancel_and_free(void *val) * completion. */ if (hrtimer_try_to_cancel(&t->timer) >= 0) - kfree_rcu(t, cb.rcu); + call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free); else - queue_work(system_unbound_wq, &t->cb.delete_work); + queue_work(system_dfl_wq, &t->cb.delete_work); } else { bpf_timer_delete_work(&t->cb.delete_work); } @@ -1778,6 +1782,9 @@ static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *s return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len); case BPF_DYNPTR_TYPE_XDP: return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len); + case BPF_DYNPTR_TYPE_SKB_META: + memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len); + return 0; default: WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type); return -EFAULT; @@ -1834,6 +1841,11 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, if (flags) return -EINVAL; return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len); + case BPF_DYNPTR_TYPE_SKB_META: + if (flags) + return -EINVAL; + memmove(bpf_skb_meta_pointer(dst->data, dst->offset + offset), src, len); + return 0; default: WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type); return -EFAULT; @@ -1880,6 +1892,7 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3 return (unsigned long)(ptr->data + ptr->offset + offset); case BPF_DYNPTR_TYPE_SKB: case BPF_DYNPTR_TYPE_XDP: + case BPF_DYNPTR_TYPE_SKB_META: /* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */ return 0; default: @@ -2535,7 +2548,7 @@ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid) { struct cgroup *cgrp; - cgrp = cgroup_get_from_id(cgid); + cgrp = __cgroup_get_from_id(cgid); if (IS_ERR(cgrp)) return NULL; return cgrp; @@ -2708,6 +2721,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false); return buffer__opt; } + case BPF_DYNPTR_TYPE_SKB_META: + return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset); default: WARN_ONCE(true, "unknown dynptr type %d\n", type); return NULL; @@ -2906,6 +2921,52 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, return 0; } +/** + * bpf_dynptr_memset() - Fill dynptr memory with a constant byte. + * @p: Destination dynptr - where data will be filled + * @offset: Offset into the dynptr to start filling from + * @size: Number of bytes to fill + * @val: Constant byte to fill the memory with + * + * Fills the @size bytes of the memory area pointed to by @p + * at @offset with the constant byte @val. + * Returns 0 on success; negative error, otherwise. + */ + __bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u32 offset, u32 size, u8 val) + { + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + u32 chunk_sz, write_off; + char buf[256]; + void* slice; + int err; + + slice = bpf_dynptr_slice_rdwr(p, offset, NULL, size); + if (likely(slice)) { + memset(slice, val, size); + return 0; + } + + if (__bpf_dynptr_is_rdonly(ptr)) + return -EINVAL; + + err = bpf_dynptr_check_off_len(ptr, offset, size); + if (err) + return err; + + /* Non-linear data under the dynptr, write from a local buffer */ + chunk_sz = min_t(u32, sizeof(buf), size); + memset(buf, val, chunk_sz); + + for (write_off = 0; write_off < size; write_off += chunk_sz) { + chunk_sz = min_t(u32, sizeof(buf), size - write_off); + err = __bpf_dynptr_write(ptr, offset + write_off, buf, chunk_sz, 0); + if (err) + return err; + } + + return 0; +} + __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj) { return obj; @@ -2938,9 +2999,16 @@ static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp) struct bpf_throw_ctx *ctx = cookie; struct bpf_prog *prog; - if (!is_bpf_text_address(ip)) - return !ctx->cnt; + /* + * The RCU read lock is held to safely traverse the latch tree, but we + * don't need its protection when accessing the prog, since it has an + * active stack frame on the current stack trace, and won't disappear. + */ + rcu_read_lock(); prog = bpf_prog_ksym_find(ip); + rcu_read_unlock(); + if (!prog) + return !ctx->cnt; ctx->cnt++; if (bpf_is_subprog(prog)) return true; @@ -3278,8 +3346,893 @@ __bpf_kfunc void __bpf_trap(void) { } +/* + * Kfuncs for string operations. + * + * Since strings are not necessarily %NUL-terminated, we cannot directly call + * in-kernel implementations. Instead, we open-code the implementations using + * __get_kernel_nofault instead of plain dereference to make them safe. + */ + +static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case) +{ + char c1, c2; + int i; + + if (!copy_from_kernel_nofault_allowed(s1, 1) || + !copy_from_kernel_nofault_allowed(s2, 1)) { + return -ERANGE; + } + + guard(pagefault)(); + for (i = 0; i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&c1, s1, char, err_out); + __get_kernel_nofault(&c2, s2, char, err_out); + if (ignore_case) { + c1 = tolower(c1); + c2 = tolower(c2); + } + if (c1 != c2) + return c1 < c2 ? -1 : 1; + if (c1 == '\0') + return 0; + s1++; + s2++; + } + return -E2BIG; +err_out: + return -EFAULT; +} + +/** + * bpf_strcmp - Compare two strings + * @s1__ign: One string + * @s2__ign: Another string + * + * Return: + * * %0 - Strings are equal + * * %-1 - @s1__ign is smaller + * * %1 - @s2__ign is smaller + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of strings is too large + * * %-ERANGE - One of strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign) +{ + return __bpf_strcasecmp(s1__ign, s2__ign, false); +} + +/** + * bpf_strcasecmp - Compare two strings, ignoring the case of the characters + * @s1__ign: One string + * @s2__ign: Another string + * + * Return: + * * %0 - Strings are equal + * * %-1 - @s1__ign is smaller + * * %1 - @s2__ign is smaller + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of strings is too large + * * %-ERANGE - One of strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strcasecmp(const char *s1__ign, const char *s2__ign) +{ + return __bpf_strcasecmp(s1__ign, s2__ign, true); +} + +/** + * bpf_strnchr - Find a character in a length limited string + * @s__ign: The string to be searched + * @count: The number of characters to be searched + * @c: The character to search for + * + * Note that the %NUL-terminator is considered part of the string, and can + * be searched for. + * + * Return: + * * >=0 - Index of the first occurrence of @c within @s__ign + * * %-ENOENT - @c not found in the first @count characters of @s__ign + * * %-EFAULT - Cannot read @s__ign + * * %-E2BIG - @s__ign is too large + * * %-ERANGE - @s__ign is outside of kernel address space + */ +__bpf_kfunc int bpf_strnchr(const char *s__ign, size_t count, char c) +{ + char sc; + int i; + + if (!copy_from_kernel_nofault_allowed(s__ign, 1)) + return -ERANGE; + + guard(pagefault)(); + for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&sc, s__ign, char, err_out); + if (sc == c) + return i; + if (sc == '\0') + return -ENOENT; + s__ign++; + } + return i == XATTR_SIZE_MAX ? -E2BIG : -ENOENT; +err_out: + return -EFAULT; +} + +/** + * bpf_strchr - Find the first occurrence of a character in a string + * @s__ign: The string to be searched + * @c: The character to search for + * + * Note that the %NUL-terminator is considered part of the string, and can + * be searched for. + * + * Return: + * * >=0 - The index of the first occurrence of @c within @s__ign + * * %-ENOENT - @c not found in @s__ign + * * %-EFAULT - Cannot read @s__ign + * * %-E2BIG - @s__ign is too large + * * %-ERANGE - @s__ign is outside of kernel address space + */ +__bpf_kfunc int bpf_strchr(const char *s__ign, char c) +{ + return bpf_strnchr(s__ign, XATTR_SIZE_MAX, c); +} + +/** + * bpf_strchrnul - Find and return a character in a string, or end of string + * @s__ign: The string to be searched + * @c: The character to search for + * + * Return: + * * >=0 - Index of the first occurrence of @c within @s__ign or index of + * the null byte at the end of @s__ign when @c is not found + * * %-EFAULT - Cannot read @s__ign + * * %-E2BIG - @s__ign is too large + * * %-ERANGE - @s__ign is outside of kernel address space + */ +__bpf_kfunc int bpf_strchrnul(const char *s__ign, char c) +{ + char sc; + int i; + + if (!copy_from_kernel_nofault_allowed(s__ign, 1)) + return -ERANGE; + + guard(pagefault)(); + for (i = 0; i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&sc, s__ign, char, err_out); + if (sc == '\0' || sc == c) + return i; + s__ign++; + } + return -E2BIG; +err_out: + return -EFAULT; +} + +/** + * bpf_strrchr - Find the last occurrence of a character in a string + * @s__ign: The string to be searched + * @c: The character to search for + * + * Return: + * * >=0 - Index of the last occurrence of @c within @s__ign + * * %-ENOENT - @c not found in @s__ign + * * %-EFAULT - Cannot read @s__ign + * * %-E2BIG - @s__ign is too large + * * %-ERANGE - @s__ign is outside of kernel address space + */ +__bpf_kfunc int bpf_strrchr(const char *s__ign, int c) +{ + char sc; + int i, last = -ENOENT; + + if (!copy_from_kernel_nofault_allowed(s__ign, 1)) + return -ERANGE; + + guard(pagefault)(); + for (i = 0; i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&sc, s__ign, char, err_out); + if (sc == c) + last = i; + if (sc == '\0') + return last; + s__ign++; + } + return -E2BIG; +err_out: + return -EFAULT; +} + +/** + * bpf_strnlen - Calculate the length of a length-limited string + * @s__ign: The string + * @count: The maximum number of characters to count + * + * Return: + * * >=0 - The length of @s__ign + * * %-EFAULT - Cannot read @s__ign + * * %-E2BIG - @s__ign is too large + * * %-ERANGE - @s__ign is outside of kernel address space + */ +__bpf_kfunc int bpf_strnlen(const char *s__ign, size_t count) +{ + char c; + int i; + + if (!copy_from_kernel_nofault_allowed(s__ign, 1)) + return -ERANGE; + + guard(pagefault)(); + for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&c, s__ign, char, err_out); + if (c == '\0') + return i; + s__ign++; + } + return i == XATTR_SIZE_MAX ? -E2BIG : i; +err_out: + return -EFAULT; +} + +/** + * bpf_strlen - Calculate the length of a string + * @s__ign: The string + * + * Return: + * * >=0 - The length of @s__ign + * * %-EFAULT - Cannot read @s__ign + * * %-E2BIG - @s__ign is too large + * * %-ERANGE - @s__ign is outside of kernel address space + */ +__bpf_kfunc int bpf_strlen(const char *s__ign) +{ + return bpf_strnlen(s__ign, XATTR_SIZE_MAX); +} + +/** + * bpf_strspn - Calculate the length of the initial substring of @s__ign which + * only contains letters in @accept__ign + * @s__ign: The string to be searched + * @accept__ign: The string to search for + * + * Return: + * * >=0 - The length of the initial substring of @s__ign which only + * contains letters from @accept__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strspn(const char *s__ign, const char *accept__ign) +{ + char cs, ca; + int i, j; + + if (!copy_from_kernel_nofault_allowed(s__ign, 1) || + !copy_from_kernel_nofault_allowed(accept__ign, 1)) { + return -ERANGE; + } + + guard(pagefault)(); + for (i = 0; i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&cs, s__ign, char, err_out); + if (cs == '\0') + return i; + for (j = 0; j < XATTR_SIZE_MAX; j++) { + __get_kernel_nofault(&ca, accept__ign + j, char, err_out); + if (cs == ca || ca == '\0') + break; + } + if (j == XATTR_SIZE_MAX) + return -E2BIG; + if (ca == '\0') + return i; + s__ign++; + } + return -E2BIG; +err_out: + return -EFAULT; +} + +/** + * bpf_strcspn - Calculate the length of the initial substring of @s__ign which + * does not contain letters in @reject__ign + * @s__ign: The string to be searched + * @reject__ign: The string to search for + * + * Return: + * * >=0 - The length of the initial substring of @s__ign which does not + * contain letters from @reject__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strcspn(const char *s__ign, const char *reject__ign) +{ + char cs, cr; + int i, j; + + if (!copy_from_kernel_nofault_allowed(s__ign, 1) || + !copy_from_kernel_nofault_allowed(reject__ign, 1)) { + return -ERANGE; + } + + guard(pagefault)(); + for (i = 0; i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&cs, s__ign, char, err_out); + if (cs == '\0') + return i; + for (j = 0; j < XATTR_SIZE_MAX; j++) { + __get_kernel_nofault(&cr, reject__ign + j, char, err_out); + if (cs == cr || cr == '\0') + break; + } + if (j == XATTR_SIZE_MAX) + return -E2BIG; + if (cr != '\0') + return i; + s__ign++; + } + return -E2BIG; +err_out: + return -EFAULT; +} + +/** + * bpf_strnstr - Find the first substring in a length-limited string + * @s1__ign: The string to be searched + * @s2__ign: The string to search for + * @len: the maximum number of characters to search + * + * Return: + * * >=0 - Index of the first character of the first occurrence of @s2__ign + * within the first @len characters of @s1__ign + * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len) +{ + char c1, c2; + int i, j; + + if (!copy_from_kernel_nofault_allowed(s1__ign, 1) || + !copy_from_kernel_nofault_allowed(s2__ign, 1)) { + return -ERANGE; + } + + guard(pagefault)(); + for (i = 0; i < XATTR_SIZE_MAX; i++) { + for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) { + __get_kernel_nofault(&c2, s2__ign + j, char, err_out); + if (c2 == '\0') + return i; + /* + * We allow reading an extra byte from s2 (note the + * `i + j <= len` above) to cover the case when s2 is + * a suffix of the first len chars of s1. + */ + if (i + j == len) + break; + __get_kernel_nofault(&c1, s1__ign + j, char, err_out); + if (c1 == '\0') + return -ENOENT; + if (c1 != c2) + break; + } + if (j == XATTR_SIZE_MAX) + return -E2BIG; + if (i + j == len) + return -ENOENT; + s1__ign++; + } + return -E2BIG; +err_out: + return -EFAULT; +} + +/** + * bpf_strstr - Find the first substring in a string + * @s1__ign: The string to be searched + * @s2__ign: The string to search for + * + * Return: + * * >=0 - Index of the first character of the first occurrence of @s2__ign + * within @s1__ign + * * %-ENOENT - @s2__ign is not a substring of @s1__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign) +{ + return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX); +} +#ifdef CONFIG_KEYS +/** + * bpf_lookup_user_key - lookup a key by its serial + * @serial: key handle serial number + * @flags: lookup-specific flags + * + * Search a key with a given *serial* and the provided *flags*. + * If found, increment the reference count of the key by one, and + * return it in the bpf_key structure. + * + * The bpf_key structure must be passed to bpf_key_put() when done + * with it, so that the key reference count is decremented and the + * bpf_key structure is freed. + * + * Permission checks are deferred to the time the key is used by + * one of the available key-specific kfuncs. + * + * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested + * special keyring (e.g. session keyring), if it doesn't yet exist. + * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting + * for the key construction, and to retrieve uninstantiated keys (keys + * without data attached to them). + * + * Return: a bpf_key pointer with a valid key pointer if the key is found, a + * NULL pointer otherwise. + */ +__bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags) +{ + key_ref_t key_ref; + struct bpf_key *bkey; + + if (flags & ~KEY_LOOKUP_ALL) + return NULL; + + /* + * Permission check is deferred until the key is used, as the + * intent of the caller is unknown here. + */ + key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK); + if (IS_ERR(key_ref)) + return NULL; + + bkey = kmalloc(sizeof(*bkey), GFP_KERNEL); + if (!bkey) { + key_put(key_ref_to_ptr(key_ref)); + return NULL; + } + + bkey->key = key_ref_to_ptr(key_ref); + bkey->has_ref = true; + + return bkey; +} + +/** + * bpf_lookup_system_key - lookup a key by a system-defined ID + * @id: key ID + * + * Obtain a bpf_key structure with a key pointer set to the passed key ID. + * The key pointer is marked as invalid, to prevent bpf_key_put() from + * attempting to decrement the key reference count on that pointer. The key + * pointer set in such way is currently understood only by + * verify_pkcs7_signature(). + * + * Set *id* to one of the values defined in include/linux/verification.h: + * 0 for the primary keyring (immutable keyring of system keys); + * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring + * (where keys can be added only if they are vouched for by existing keys + * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform + * keyring (primarily used by the integrity subsystem to verify a kexec'ed + * kerned image and, possibly, the initramfs signature). + * + * Return: a bpf_key pointer with an invalid key pointer set from the + * pre-determined ID on success, a NULL pointer otherwise + */ +__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id) +{ + struct bpf_key *bkey; + + if (system_keyring_id_check(id) < 0) + return NULL; + + bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC); + if (!bkey) + return NULL; + + bkey->key = (struct key *)(unsigned long)id; + bkey->has_ref = false; + + return bkey; +} + +/** + * bpf_key_put - decrement key reference count if key is valid and free bpf_key + * @bkey: bpf_key structure + * + * Decrement the reference count of the key inside *bkey*, if the pointer + * is valid, and free *bkey*. + */ +__bpf_kfunc void bpf_key_put(struct bpf_key *bkey) +{ + if (bkey->has_ref) + key_put(bkey->key); + + kfree(bkey); +} + +/** + * bpf_verify_pkcs7_signature - verify a PKCS#7 signature + * @data_p: data to verify + * @sig_p: signature of the data + * @trusted_keyring: keyring with keys trusted for signature verification + * + * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr* + * with keys in a keyring referenced by *trusted_keyring*. + * + * Return: 0 on success, a negative value on error. + */ +__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, + struct bpf_dynptr *sig_p, + struct bpf_key *trusted_keyring) +{ +#ifdef CONFIG_SYSTEM_DATA_VERIFICATION + struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; + struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; + const void *data, *sig; + u32 data_len, sig_len; + int ret; + + if (trusted_keyring->has_ref) { + /* + * Do the permission check deferred in bpf_lookup_user_key(). + * See bpf_lookup_user_key() for more details. + * + * A call to key_task_permission() here would be redundant, as + * it is already done by keyring_search() called by + * find_asymmetric_key(). + */ + ret = key_validate(trusted_keyring->key); + if (ret < 0) + return ret; + } + + data_len = __bpf_dynptr_size(data_ptr); + data = __bpf_dynptr_data(data_ptr, data_len); + sig_len = __bpf_dynptr_size(sig_ptr); + sig = __bpf_dynptr_data(sig_ptr, sig_len); + + return verify_pkcs7_signature(data, data_len, sig, sig_len, + trusted_keyring->key, + VERIFYING_BPF_SIGNATURE, NULL, + NULL); +#else + return -EOPNOTSUPP; +#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ +} +#endif /* CONFIG_KEYS */ + +typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value); + +enum bpf_task_work_state { + /* bpf_task_work is ready to be used */ + BPF_TW_STANDBY = 0, + /* irq work scheduling in progress */ + BPF_TW_PENDING, + /* task work scheduling in progress */ + BPF_TW_SCHEDULING, + /* task work is scheduled successfully */ + BPF_TW_SCHEDULED, + /* callback is running */ + BPF_TW_RUNNING, + /* associated BPF map value is deleted */ + BPF_TW_FREED, +}; + +struct bpf_task_work_ctx { + enum bpf_task_work_state state; + refcount_t refcnt; + struct callback_head work; + struct irq_work irq_work; + /* bpf_prog that schedules task work */ + struct bpf_prog *prog; + /* task for which callback is scheduled */ + struct task_struct *task; + /* the map and map value associated with this context */ + struct bpf_map *map; + void *map_val; + enum task_work_notify_mode mode; + bpf_task_work_callback_t callback_fn; + struct rcu_head rcu; +} __aligned(8); + +/* Actual type for struct bpf_task_work */ +struct bpf_task_work_kern { + struct bpf_task_work_ctx *ctx; +}; + +static void bpf_task_work_ctx_reset(struct bpf_task_work_ctx *ctx) +{ + if (ctx->prog) { + bpf_prog_put(ctx->prog); + ctx->prog = NULL; + } + if (ctx->task) { + bpf_task_release(ctx->task); + ctx->task = NULL; + } +} + +static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx) +{ + return refcount_inc_not_zero(&ctx->refcnt); +} + +static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx) +{ + if (!refcount_dec_and_test(&ctx->refcnt)) + return; + + bpf_task_work_ctx_reset(ctx); + + /* bpf_mem_free expects migration to be disabled */ + migrate_disable(); + bpf_mem_free(&bpf_global_ma, ctx); + migrate_enable(); +} + +static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx) +{ + /* + * Scheduled task_work callback holds ctx ref, so if we successfully + * cancelled, we put that ref on callback's behalf. If we couldn't + * cancel, callback will inevitably run or has already completed + * running, and it would have taken care of its ctx ref itself. + */ + if (task_work_cancel(ctx->task, &ctx->work)) + bpf_task_work_ctx_put(ctx); +} + +static void bpf_task_work_callback(struct callback_head *cb) +{ + struct bpf_task_work_ctx *ctx = container_of(cb, struct bpf_task_work_ctx, work); + enum bpf_task_work_state state; + u32 idx; + void *key; + + /* Read lock is needed to protect ctx and map key/value access */ + guard(rcu_tasks_trace)(); + /* + * This callback may start running before bpf_task_work_irq() switched to + * SCHEDULED state, so handle both transition variants SCHEDULING|SCHEDULED -> RUNNING. + */ + state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING); + if (state == BPF_TW_SCHEDULED) + state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING); + if (state == BPF_TW_FREED) { + bpf_task_work_ctx_put(ctx); + return; + } + + key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx); + + migrate_disable(); + ctx->callback_fn(ctx->map, key, ctx->map_val); + migrate_enable(); + + bpf_task_work_ctx_reset(ctx); + (void)cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY); + + bpf_task_work_ctx_put(ctx); +} + +static void bpf_task_work_irq(struct irq_work *irq_work) +{ + struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work); + enum bpf_task_work_state state; + int err; + + guard(rcu_tasks_trace)(); + + if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) { + bpf_task_work_ctx_put(ctx); + return; + } + + err = task_work_add(ctx->task, &ctx->work, ctx->mode); + if (err) { + bpf_task_work_ctx_reset(ctx); + /* + * try to switch back to STANDBY for another task_work reuse, but we might have + * gone to FREED already, which is fine as we already cleaned up after ourselves + */ + (void)cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY); + bpf_task_work_ctx_put(ctx); + return; + } + + /* + * It's technically possible for just scheduled task_work callback to + * complete running by now, going SCHEDULING -> RUNNING and then + * dropping its ctx refcount. Instead of capturing extra ref just to + * protected below ctx->state access, we rely on RCU protection to + * perform below SCHEDULING -> SCHEDULED attempt. + */ + state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED); + if (state == BPF_TW_FREED) + bpf_task_work_cancel(ctx); /* clean up if we switched into FREED state */ +} + +static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *tw, + struct bpf_map *map) +{ + struct bpf_task_work_kern *twk = (void *)tw; + struct bpf_task_work_ctx *ctx, *old_ctx; + + ctx = READ_ONCE(twk->ctx); + if (ctx) + return ctx; + + ctx = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_task_work_ctx)); + if (!ctx) + return ERR_PTR(-ENOMEM); + + memset(ctx, 0, sizeof(*ctx)); + refcount_set(&ctx->refcnt, 1); /* map's own ref */ + ctx->state = BPF_TW_STANDBY; + + old_ctx = cmpxchg(&twk->ctx, NULL, ctx); + if (old_ctx) { + /* + * tw->ctx is set by concurrent BPF program, release allocated + * memory and try to reuse already set context. + */ + bpf_mem_free(&bpf_global_ma, ctx); + return old_ctx; + } + + return ctx; /* Success */ +} + +static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work *tw, + struct bpf_map *map) +{ + struct bpf_task_work_ctx *ctx; + + ctx = bpf_task_work_fetch_ctx(tw, map); + if (IS_ERR(ctx)) + return ctx; + + /* try to get ref for task_work callback to hold */ + if (!bpf_task_work_ctx_tryget(ctx)) + return ERR_PTR(-EBUSY); + + if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) { + /* lost acquiring race or map_release_uref() stole it from us, put ref and bail */ + bpf_task_work_ctx_put(ctx); + return ERR_PTR(-EBUSY); + } + + /* + * If no process or bpffs is holding a reference to the map, no new callbacks should be + * scheduled. This does not address any race or correctness issue, but rather is a policy + * choice: dropping user references should stop everything. + */ + if (!atomic64_read(&map->usercnt)) { + /* drop ref we just got for task_work callback itself */ + bpf_task_work_ctx_put(ctx); + /* transfer map's ref into cancel_and_free() */ + bpf_task_work_cancel_and_free(tw); + return ERR_PTR(-EBUSY); + } + + return ctx; +} + +static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw, + struct bpf_map *map, bpf_task_work_callback_t callback_fn, + struct bpf_prog_aux *aux, enum task_work_notify_mode mode) +{ + struct bpf_prog *prog; + struct bpf_task_work_ctx *ctx; + int err; + + BTF_TYPE_EMIT(struct bpf_task_work); + + prog = bpf_prog_inc_not_zero(aux->prog); + if (IS_ERR(prog)) + return -EBADF; + task = bpf_task_acquire(task); + if (!task) { + err = -EBADF; + goto release_prog; + } + + ctx = bpf_task_work_acquire_ctx(tw, map); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto release_all; + } + + ctx->task = task; + ctx->callback_fn = callback_fn; + ctx->prog = prog; + ctx->mode = mode; + ctx->map = map; + ctx->map_val = (void *)tw - map->record->task_work_off; + init_task_work(&ctx->work, bpf_task_work_callback); + init_irq_work(&ctx->irq_work, bpf_task_work_irq); + + irq_work_queue(&ctx->irq_work); + return 0; + +release_all: + bpf_task_release(task); +release_prog: + bpf_prog_put(prog); + return err; +} + +/** + * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode + * @task: Task struct for which callback should be scheduled + * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping + * @map__map: bpf_map that embeds struct bpf_task_work in the values + * @callback: pointer to BPF subprogram to call + * @aux__prog: user should pass NULL + * + * Return: 0 if task work has been scheduled successfully, negative error code otherwise + */ +__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + void *aux__prog) +{ + return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL); +} + +/** + * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME mode + * @task: Task struct for which callback should be scheduled + * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping + * @map__map: bpf_map that embeds struct bpf_task_work in the values + * @callback: pointer to BPF subprogram to call + * @aux__prog: user should pass NULL + * + * Return: 0 if task work has been scheduled successfully, negative error code otherwise + */ +__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + void *aux__prog) +{ + return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME); +} + __bpf_kfunc_end_defs(); +static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work) +{ + struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work); + + bpf_task_work_cancel(ctx); /* this might put task_work callback's ref */ + bpf_task_work_ctx_put(ctx); /* and here we put map's own ref that was transferred to us */ +} + +void bpf_task_work_cancel_and_free(void *val) +{ + struct bpf_task_work_kern *twk = val; + struct bpf_task_work_ctx *ctx; + enum bpf_task_work_state state; + + ctx = xchg(&twk->ctx, NULL); + if (!ctx) + return; + + state = xchg(&ctx->state, BPF_TW_FREED); + if (state == BPF_TW_SCHEDULED) { + /* run in irq_work to avoid locks in NMI */ + init_irq_work(&ctx->irq_work, bpf_task_work_cancel_scheduled); + irq_work_queue(&ctx->irq_work); + return; + } + + bpf_task_work_ctx_put(ctx); /* put bpf map's ref */ +} + BTF_KFUNCS_START(generic_btf_ids) #ifdef CONFIG_CRASH_DUMP BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) @@ -3318,6 +4271,14 @@ BTF_ID_FLAGS(func, bpf_throw) #ifdef CONFIG_BPF_EVENTS BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS) #endif +#ifdef CONFIG_KEYS +BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) +#ifdef CONFIG_SYSTEM_DATA_VERIFICATION +BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) +#endif +#endif BTF_KFUNCS_END(generic_btf_ids) static const struct btf_kfunc_id_set generic_kfunc_set = { @@ -3364,6 +4325,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) BTF_ID_FLAGS(func, bpf_dynptr_copy) +BTF_ID_FLAGS(func, bpf_dynptr_memset) #ifdef CONFIG_NET BTF_ID_FLAGS(func, bpf_modify_return_test_tp) #endif @@ -3397,6 +4359,24 @@ BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPAB BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) #endif BTF_ID_FLAGS(func, __bpf_trap) +BTF_ID_FLAGS(func, bpf_strcmp); +BTF_ID_FLAGS(func, bpf_strcasecmp); +BTF_ID_FLAGS(func, bpf_strchr); +BTF_ID_FLAGS(func, bpf_strchrnul); +BTF_ID_FLAGS(func, bpf_strnchr); +BTF_ID_FLAGS(func, bpf_strrchr); +BTF_ID_FLAGS(func, bpf_strlen); +BTF_ID_FLAGS(func, bpf_strnlen); +BTF_ID_FLAGS(func, bpf_strspn); +BTF_ID_FLAGS(func, bpf_strcspn); +BTF_ID_FLAGS(func, bpf_strstr); +BTF_ID_FLAGS(func, bpf_strnstr); +#if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) +BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) +#endif +BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { |
