summaryrefslogtreecommitdiff
path: root/kernel/bpf/helpers.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/bpf/helpers.c')
-rw-r--r--kernel/bpf/helpers.c628
1 files changed, 582 insertions, 46 deletions
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 6b4877e85a68..c9fab9a356df 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -25,6 +25,9 @@
#include <linux/kasan.h>
#include <linux/bpf_verifier.h>
#include <linux/uaccess.h>
+#include <linux/verification.h>
+#include <linux/task_work.h>
+#include <linux/irq_work.h>
#include "../../lib/kstrtox.h"
@@ -774,11 +777,9 @@ int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs)
{
int nest_level;
- preempt_disable();
nest_level = this_cpu_inc_return(bpf_bprintf_nest_level);
if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) {
this_cpu_dec(bpf_bprintf_nest_level);
- preempt_enable();
return -EBUSY;
}
*bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]);
@@ -791,7 +792,6 @@ void bpf_put_buffers(void)
if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0))
return;
this_cpu_dec(bpf_bprintf_nest_level);
- preempt_enable();
}
void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
@@ -1084,6 +1084,17 @@ const struct bpf_func_proto bpf_snprintf_proto = {
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
+static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx)
+{
+ if (map->map_type == BPF_MAP_TYPE_ARRAY) {
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ *arr_idx = ((char *)value - array->value) / array->elem_size;
+ return arr_idx;
+ }
+ return (void *)value - round_up(map->key_size, 8);
+}
+
struct bpf_async_cb {
struct bpf_map *map;
struct bpf_prog *prog;
@@ -1166,15 +1177,8 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
* bpf_map_delete_elem() on the same timer.
*/
this_cpu_write(hrtimer_running, t);
- if (map->map_type == BPF_MAP_TYPE_ARRAY) {
- struct bpf_array *array = container_of(map, struct bpf_array, map);
- /* compute the key */
- idx = ((char *)value - array->value) / array->elem_size;
- key = &idx;
- } else { /* hash or lru */
- key = value - round_up(map->key_size, 8);
- }
+ key = map_key_from_value(map, value, &idx);
callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
/* The verifier checked that return value is zero. */
@@ -1200,15 +1204,7 @@ static void bpf_wq_work(struct work_struct *work)
if (!callback_fn)
return;
- if (map->map_type == BPF_MAP_TYPE_ARRAY) {
- struct bpf_array *array = container_of(map, struct bpf_array, map);
-
- /* compute the key */
- idx = ((char *)value - array->value) / array->elem_size;
- key = &idx;
- } else { /* hash or lru */
- key = value - round_up(map->key_size, 8);
- }
+ key = map_key_from_value(map, value, &idx);
rcu_read_lock_trace();
migrate_disable();
@@ -1274,8 +1270,11 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
goto out;
}
- /* allocate hrtimer via map_kmalloc to use memcg accounting */
- cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
+ /* Allocate via bpf_map_kmalloc_node() for memcg accounting. Until
+ * kmalloc_nolock() is available, avoid locking issues by using
+ * __GFP_HIGH (GFP_ATOMIC & ~__GFP_RECLAIM).
+ */
+ cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node);
if (!cb) {
ret = -ENOMEM;
goto out;
@@ -1597,7 +1596,7 @@ void bpf_timer_cancel_and_free(void *val)
* timer callback.
*/
if (this_cpu_read(hrtimer_running)) {
- queue_work(system_unbound_wq, &t->cb.delete_work);
+ queue_work(system_dfl_wq, &t->cb.delete_work);
return;
}
@@ -1610,7 +1609,7 @@ void bpf_timer_cancel_and_free(void *val)
if (hrtimer_try_to_cancel(&t->timer) >= 0)
kfree_rcu(t, cb.rcu);
else
- queue_work(system_unbound_wq, &t->cb.delete_work);
+ queue_work(system_dfl_wq, &t->cb.delete_work);
} else {
bpf_timer_delete_work(&t->cb.delete_work);
}
@@ -1780,6 +1779,9 @@ static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *s
return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
case BPF_DYNPTR_TYPE_XDP:
return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len);
+ case BPF_DYNPTR_TYPE_SKB_META:
+ memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len);
+ return 0;
default:
WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
return -EFAULT;
@@ -1836,6 +1838,11 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src,
if (flags)
return -EINVAL;
return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
+ case BPF_DYNPTR_TYPE_SKB_META:
+ if (flags)
+ return -EINVAL;
+ memmove(bpf_skb_meta_pointer(dst->data, dst->offset + offset), src, len);
+ return 0;
default:
WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
return -EFAULT;
@@ -1882,6 +1889,7 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3
return (unsigned long)(ptr->data + ptr->offset + offset);
case BPF_DYNPTR_TYPE_SKB:
case BPF_DYNPTR_TYPE_XDP:
+ case BPF_DYNPTR_TYPE_SKB_META:
/* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */
return 0;
default:
@@ -2537,7 +2545,7 @@ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid)
{
struct cgroup *cgrp;
- cgrp = cgroup_get_from_id(cgid);
+ cgrp = __cgroup_get_from_id(cgid);
if (IS_ERR(cgrp))
return NULL;
return cgrp;
@@ -2710,6 +2718,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset,
bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false);
return buffer__opt;
}
+ case BPF_DYNPTR_TYPE_SKB_META:
+ return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset);
default:
WARN_ONCE(true, "unknown dynptr type %d\n", type);
return NULL;
@@ -3341,39 +3351,30 @@ __bpf_kfunc void __bpf_trap(void)
* __get_kernel_nofault instead of plain dereference to make them safe.
*/
-/**
- * bpf_strcmp - Compare two strings
- * @s1__ign: One string
- * @s2__ign: Another string
- *
- * Return:
- * * %0 - Strings are equal
- * * %-1 - @s1__ign is smaller
- * * %1 - @s2__ign is smaller
- * * %-EFAULT - Cannot read one of the strings
- * * %-E2BIG - One of strings is too large
- * * %-ERANGE - One of strings is outside of kernel address space
- */
-__bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign)
+static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case)
{
char c1, c2;
int i;
- if (!copy_from_kernel_nofault_allowed(s1__ign, 1) ||
- !copy_from_kernel_nofault_allowed(s2__ign, 1)) {
+ if (!copy_from_kernel_nofault_allowed(s1, 1) ||
+ !copy_from_kernel_nofault_allowed(s2, 1)) {
return -ERANGE;
}
guard(pagefault)();
for (i = 0; i < XATTR_SIZE_MAX; i++) {
- __get_kernel_nofault(&c1, s1__ign, char, err_out);
- __get_kernel_nofault(&c2, s2__ign, char, err_out);
+ __get_kernel_nofault(&c1, s1, char, err_out);
+ __get_kernel_nofault(&c2, s2, char, err_out);
+ if (ignore_case) {
+ c1 = tolower(c1);
+ c2 = tolower(c2);
+ }
if (c1 != c2)
return c1 < c2 ? -1 : 1;
if (c1 == '\0')
return 0;
- s1__ign++;
- s2__ign++;
+ s1++;
+ s2++;
}
return -E2BIG;
err_out:
@@ -3381,6 +3382,42 @@ err_out:
}
/**
+ * bpf_strcmp - Compare two strings
+ * @s1__ign: One string
+ * @s2__ign: Another string
+ *
+ * Return:
+ * * %0 - Strings are equal
+ * * %-1 - @s1__ign is smaller
+ * * %1 - @s2__ign is smaller
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of strings is too large
+ * * %-ERANGE - One of strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign)
+{
+ return __bpf_strcasecmp(s1__ign, s2__ign, false);
+}
+
+/**
+ * bpf_strcasecmp - Compare two strings, ignoring the case of the characters
+ * @s1__ign: One string
+ * @s2__ign: Another string
+ *
+ * Return:
+ * * %0 - Strings are equal
+ * * %-1 - @s1__ign is smaller
+ * * %1 - @s2__ign is smaller
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of strings is too large
+ * * %-ERANGE - One of strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strcasecmp(const char *s1__ign, const char *s2__ign)
+{
+ return __bpf_strcasecmp(s1__ign, s2__ign, true);
+}
+
+/**
* bpf_strnchr - Find a character in a length limited string
* @s__ign: The string to be searched
* @count: The number of characters to be searched
@@ -3664,10 +3701,17 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len
guard(pagefault)();
for (i = 0; i < XATTR_SIZE_MAX; i++) {
- for (j = 0; i + j < len && j < XATTR_SIZE_MAX; j++) {
+ for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) {
__get_kernel_nofault(&c2, s2__ign + j, char, err_out);
if (c2 == '\0')
return i;
+ /*
+ * We allow reading an extra byte from s2 (note the
+ * `i + j <= len` above) to cover the case when s2 is
+ * a suffix of the first len chars of s1.
+ */
+ if (i + j == len)
+ break;
__get_kernel_nofault(&c1, s1__ign + j, char, err_out);
if (c1 == '\0')
return -ENOENT;
@@ -3702,9 +3746,490 @@ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
{
return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX);
}
+#ifdef CONFIG_KEYS
+/**
+ * bpf_lookup_user_key - lookup a key by its serial
+ * @serial: key handle serial number
+ * @flags: lookup-specific flags
+ *
+ * Search a key with a given *serial* and the provided *flags*.
+ * If found, increment the reference count of the key by one, and
+ * return it in the bpf_key structure.
+ *
+ * The bpf_key structure must be passed to bpf_key_put() when done
+ * with it, so that the key reference count is decremented and the
+ * bpf_key structure is freed.
+ *
+ * Permission checks are deferred to the time the key is used by
+ * one of the available key-specific kfuncs.
+ *
+ * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested
+ * special keyring (e.g. session keyring), if it doesn't yet exist.
+ * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting
+ * for the key construction, and to retrieve uninstantiated keys (keys
+ * without data attached to them).
+ *
+ * Return: a bpf_key pointer with a valid key pointer if the key is found, a
+ * NULL pointer otherwise.
+ */
+__bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags)
+{
+ key_ref_t key_ref;
+ struct bpf_key *bkey;
+
+ if (flags & ~KEY_LOOKUP_ALL)
+ return NULL;
+
+ /*
+ * Permission check is deferred until the key is used, as the
+ * intent of the caller is unknown here.
+ */
+ key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK);
+ if (IS_ERR(key_ref))
+ return NULL;
+
+ bkey = kmalloc(sizeof(*bkey), GFP_KERNEL);
+ if (!bkey) {
+ key_put(key_ref_to_ptr(key_ref));
+ return NULL;
+ }
+
+ bkey->key = key_ref_to_ptr(key_ref);
+ bkey->has_ref = true;
+
+ return bkey;
+}
+
+/**
+ * bpf_lookup_system_key - lookup a key by a system-defined ID
+ * @id: key ID
+ *
+ * Obtain a bpf_key structure with a key pointer set to the passed key ID.
+ * The key pointer is marked as invalid, to prevent bpf_key_put() from
+ * attempting to decrement the key reference count on that pointer. The key
+ * pointer set in such way is currently understood only by
+ * verify_pkcs7_signature().
+ *
+ * Set *id* to one of the values defined in include/linux/verification.h:
+ * 0 for the primary keyring (immutable keyring of system keys);
+ * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring
+ * (where keys can be added only if they are vouched for by existing keys
+ * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform
+ * keyring (primarily used by the integrity subsystem to verify a kexec'ed
+ * kerned image and, possibly, the initramfs signature).
+ *
+ * Return: a bpf_key pointer with an invalid key pointer set from the
+ * pre-determined ID on success, a NULL pointer otherwise
+ */
+__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id)
+{
+ struct bpf_key *bkey;
+
+ if (system_keyring_id_check(id) < 0)
+ return NULL;
+
+ bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC);
+ if (!bkey)
+ return NULL;
+
+ bkey->key = (struct key *)(unsigned long)id;
+ bkey->has_ref = false;
+
+ return bkey;
+}
+
+/**
+ * bpf_key_put - decrement key reference count if key is valid and free bpf_key
+ * @bkey: bpf_key structure
+ *
+ * Decrement the reference count of the key inside *bkey*, if the pointer
+ * is valid, and free *bkey*.
+ */
+__bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
+{
+ if (bkey->has_ref)
+ key_put(bkey->key);
+
+ kfree(bkey);
+}
+
+/**
+ * bpf_verify_pkcs7_signature - verify a PKCS#7 signature
+ * @data_p: data to verify
+ * @sig_p: signature of the data
+ * @trusted_keyring: keyring with keys trusted for signature verification
+ *
+ * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr*
+ * with keys in a keyring referenced by *trusted_keyring*.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p,
+ struct bpf_dynptr *sig_p,
+ struct bpf_key *trusted_keyring)
+{
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
+ struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p;
+ struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p;
+ const void *data, *sig;
+ u32 data_len, sig_len;
+ int ret;
+
+ if (trusted_keyring->has_ref) {
+ /*
+ * Do the permission check deferred in bpf_lookup_user_key().
+ * See bpf_lookup_user_key() for more details.
+ *
+ * A call to key_task_permission() here would be redundant, as
+ * it is already done by keyring_search() called by
+ * find_asymmetric_key().
+ */
+ ret = key_validate(trusted_keyring->key);
+ if (ret < 0)
+ return ret;
+ }
+
+ data_len = __bpf_dynptr_size(data_ptr);
+ data = __bpf_dynptr_data(data_ptr, data_len);
+ sig_len = __bpf_dynptr_size(sig_ptr);
+ sig = __bpf_dynptr_data(sig_ptr, sig_len);
+
+ return verify_pkcs7_signature(data, data_len, sig, sig_len,
+ trusted_keyring->key,
+ VERIFYING_BPF_SIGNATURE, NULL,
+ NULL);
+#else
+ return -EOPNOTSUPP;
+#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
+}
+#endif /* CONFIG_KEYS */
+
+typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value);
+
+enum bpf_task_work_state {
+ /* bpf_task_work is ready to be used */
+ BPF_TW_STANDBY = 0,
+ /* irq work scheduling in progress */
+ BPF_TW_PENDING,
+ /* task work scheduling in progress */
+ BPF_TW_SCHEDULING,
+ /* task work is scheduled successfully */
+ BPF_TW_SCHEDULED,
+ /* callback is running */
+ BPF_TW_RUNNING,
+ /* associated BPF map value is deleted */
+ BPF_TW_FREED,
+};
+
+struct bpf_task_work_ctx {
+ enum bpf_task_work_state state;
+ refcount_t refcnt;
+ struct callback_head work;
+ struct irq_work irq_work;
+ /* bpf_prog that schedules task work */
+ struct bpf_prog *prog;
+ /* task for which callback is scheduled */
+ struct task_struct *task;
+ /* the map and map value associated with this context */
+ struct bpf_map *map;
+ void *map_val;
+ enum task_work_notify_mode mode;
+ bpf_task_work_callback_t callback_fn;
+ struct rcu_head rcu;
+} __aligned(8);
+
+/* Actual type for struct bpf_task_work */
+struct bpf_task_work_kern {
+ struct bpf_task_work_ctx *ctx;
+};
+
+static void bpf_task_work_ctx_reset(struct bpf_task_work_ctx *ctx)
+{
+ if (ctx->prog) {
+ bpf_prog_put(ctx->prog);
+ ctx->prog = NULL;
+ }
+ if (ctx->task) {
+ bpf_task_release(ctx->task);
+ ctx->task = NULL;
+ }
+}
+
+static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx)
+{
+ return refcount_inc_not_zero(&ctx->refcnt);
+}
+
+static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx)
+{
+ if (!refcount_dec_and_test(&ctx->refcnt))
+ return;
+
+ bpf_task_work_ctx_reset(ctx);
+
+ /* bpf_mem_free expects migration to be disabled */
+ migrate_disable();
+ bpf_mem_free(&bpf_global_ma, ctx);
+ migrate_enable();
+}
+
+static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx)
+{
+ /*
+ * Scheduled task_work callback holds ctx ref, so if we successfully
+ * cancelled, we put that ref on callback's behalf. If we couldn't
+ * cancel, callback will inevitably run or has already completed
+ * running, and it would have taken care of its ctx ref itself.
+ */
+ if (task_work_cancel(ctx->task, &ctx->work))
+ bpf_task_work_ctx_put(ctx);
+}
+
+static void bpf_task_work_callback(struct callback_head *cb)
+{
+ struct bpf_task_work_ctx *ctx = container_of(cb, struct bpf_task_work_ctx, work);
+ enum bpf_task_work_state state;
+ u32 idx;
+ void *key;
+
+ /* Read lock is needed to protect ctx and map key/value access */
+ guard(rcu_tasks_trace)();
+ /*
+ * This callback may start running before bpf_task_work_irq() switched to
+ * SCHEDULED state, so handle both transition variants SCHEDULING|SCHEDULED -> RUNNING.
+ */
+ state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING);
+ if (state == BPF_TW_SCHEDULED)
+ state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING);
+ if (state == BPF_TW_FREED) {
+ bpf_task_work_ctx_put(ctx);
+ return;
+ }
+
+ key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx);
+
+ migrate_disable();
+ ctx->callback_fn(ctx->map, key, ctx->map_val);
+ migrate_enable();
+
+ bpf_task_work_ctx_reset(ctx);
+ (void)cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY);
+
+ bpf_task_work_ctx_put(ctx);
+}
+
+static void bpf_task_work_irq(struct irq_work *irq_work)
+{
+ struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
+ enum bpf_task_work_state state;
+ int err;
+
+ guard(rcu_tasks_trace)();
+
+ if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) {
+ bpf_task_work_ctx_put(ctx);
+ return;
+ }
+
+ err = task_work_add(ctx->task, &ctx->work, ctx->mode);
+ if (err) {
+ bpf_task_work_ctx_reset(ctx);
+ /*
+ * try to switch back to STANDBY for another task_work reuse, but we might have
+ * gone to FREED already, which is fine as we already cleaned up after ourselves
+ */
+ (void)cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY);
+ bpf_task_work_ctx_put(ctx);
+ return;
+ }
+
+ /*
+ * It's technically possible for just scheduled task_work callback to
+ * complete running by now, going SCHEDULING -> RUNNING and then
+ * dropping its ctx refcount. Instead of capturing extra ref just to
+ * protected below ctx->state access, we rely on RCU protection to
+ * perform below SCHEDULING -> SCHEDULED attempt.
+ */
+ state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED);
+ if (state == BPF_TW_FREED)
+ bpf_task_work_cancel(ctx); /* clean up if we switched into FREED state */
+}
+
+static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *tw,
+ struct bpf_map *map)
+{
+ struct bpf_task_work_kern *twk = (void *)tw;
+ struct bpf_task_work_ctx *ctx, *old_ctx;
+
+ ctx = READ_ONCE(twk->ctx);
+ if (ctx)
+ return ctx;
+
+ ctx = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_task_work_ctx));
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+
+ memset(ctx, 0, sizeof(*ctx));
+ refcount_set(&ctx->refcnt, 1); /* map's own ref */
+ ctx->state = BPF_TW_STANDBY;
+
+ old_ctx = cmpxchg(&twk->ctx, NULL, ctx);
+ if (old_ctx) {
+ /*
+ * tw->ctx is set by concurrent BPF program, release allocated
+ * memory and try to reuse already set context.
+ */
+ bpf_mem_free(&bpf_global_ma, ctx);
+ return old_ctx;
+ }
+
+ return ctx; /* Success */
+}
+
+static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work *tw,
+ struct bpf_map *map)
+{
+ struct bpf_task_work_ctx *ctx;
+
+ ctx = bpf_task_work_fetch_ctx(tw, map);
+ if (IS_ERR(ctx))
+ return ctx;
+
+ /* try to get ref for task_work callback to hold */
+ if (!bpf_task_work_ctx_tryget(ctx))
+ return ERR_PTR(-EBUSY);
+
+ if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) {
+ /* lost acquiring race or map_release_uref() stole it from us, put ref and bail */
+ bpf_task_work_ctx_put(ctx);
+ return ERR_PTR(-EBUSY);
+ }
+
+ /*
+ * If no process or bpffs is holding a reference to the map, no new callbacks should be
+ * scheduled. This does not address any race or correctness issue, but rather is a policy
+ * choice: dropping user references should stop everything.
+ */
+ if (!atomic64_read(&map->usercnt)) {
+ /* drop ref we just got for task_work callback itself */
+ bpf_task_work_ctx_put(ctx);
+ /* transfer map's ref into cancel_and_free() */
+ bpf_task_work_cancel_and_free(tw);
+ return ERR_PTR(-EBUSY);
+ }
+
+ return ctx;
+}
+
+static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw,
+ struct bpf_map *map, bpf_task_work_callback_t callback_fn,
+ struct bpf_prog_aux *aux, enum task_work_notify_mode mode)
+{
+ struct bpf_prog *prog;
+ struct bpf_task_work_ctx *ctx;
+ int err;
+
+ BTF_TYPE_EMIT(struct bpf_task_work);
+
+ prog = bpf_prog_inc_not_zero(aux->prog);
+ if (IS_ERR(prog))
+ return -EBADF;
+ task = bpf_task_acquire(task);
+ if (!task) {
+ err = -EBADF;
+ goto release_prog;
+ }
+
+ ctx = bpf_task_work_acquire_ctx(tw, map);
+ if (IS_ERR(ctx)) {
+ err = PTR_ERR(ctx);
+ goto release_all;
+ }
+
+ ctx->task = task;
+ ctx->callback_fn = callback_fn;
+ ctx->prog = prog;
+ ctx->mode = mode;
+ ctx->map = map;
+ ctx->map_val = (void *)tw - map->record->task_work_off;
+ init_task_work(&ctx->work, bpf_task_work_callback);
+ init_irq_work(&ctx->irq_work, bpf_task_work_irq);
+
+ irq_work_queue(&ctx->irq_work);
+ return 0;
+
+release_all:
+ bpf_task_release(task);
+release_prog:
+ bpf_prog_put(prog);
+ return err;
+}
+
+/**
+ * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode
+ * @task: Task struct for which callback should be scheduled
+ * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
+ * @map__map: bpf_map that embeds struct bpf_task_work in the values
+ * @callback: pointer to BPF subprogram to call
+ * @aux__prog: user should pass NULL
+ *
+ * Return: 0 if task work has been scheduled successfully, negative error code otherwise
+ */
+__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw,
+ void *map__map, bpf_task_work_callback_t callback,
+ void *aux__prog)
+{
+ return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL);
+}
+
+/**
+ * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME mode
+ * @task: Task struct for which callback should be scheduled
+ * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
+ * @map__map: bpf_map that embeds struct bpf_task_work in the values
+ * @callback: pointer to BPF subprogram to call
+ * @aux__prog: user should pass NULL
+ *
+ * Return: 0 if task work has been scheduled successfully, negative error code otherwise
+ */
+__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw,
+ void *map__map, bpf_task_work_callback_t callback,
+ void *aux__prog)
+{
+ return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME);
+}
__bpf_kfunc_end_defs();
+static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work)
+{
+ struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
+
+ bpf_task_work_cancel(ctx); /* this might put task_work callback's ref */
+ bpf_task_work_ctx_put(ctx); /* and here we put map's own ref that was transferred to us */
+}
+
+void bpf_task_work_cancel_and_free(void *val)
+{
+ struct bpf_task_work_kern *twk = val;
+ struct bpf_task_work_ctx *ctx;
+ enum bpf_task_work_state state;
+
+ ctx = xchg(&twk->ctx, NULL);
+ if (!ctx)
+ return;
+
+ state = xchg(&ctx->state, BPF_TW_FREED);
+ if (state == BPF_TW_SCHEDULED) {
+ /* run in irq_work to avoid locks in NMI */
+ init_irq_work(&ctx->irq_work, bpf_task_work_cancel_scheduled);
+ irq_work_queue(&ctx->irq_work);
+ return;
+ }
+
+ bpf_task_work_ctx_put(ctx); /* put bpf map's ref */
+}
+
BTF_KFUNCS_START(generic_btf_ids)
#ifdef CONFIG_CRASH_DUMP
BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
@@ -3743,6 +4268,14 @@ BTF_ID_FLAGS(func, bpf_throw)
#ifdef CONFIG_BPF_EVENTS
BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS)
#endif
+#ifdef CONFIG_KEYS
+BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
+BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE)
+#endif
+#endif
BTF_KFUNCS_END(generic_btf_ids)
static const struct btf_kfunc_id_set generic_kfunc_set = {
@@ -3824,6 +4357,7 @@ BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
#endif
BTF_ID_FLAGS(func, __bpf_trap)
BTF_ID_FLAGS(func, bpf_strcmp);
+BTF_ID_FLAGS(func, bpf_strcasecmp);
BTF_ID_FLAGS(func, bpf_strchr);
BTF_ID_FLAGS(func, bpf_strchrnul);
BTF_ID_FLAGS(func, bpf_strnchr);
@@ -3838,6 +4372,8 @@ BTF_ID_FLAGS(func, bpf_strnstr);
BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
#endif
BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {