Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/Makefile             |   12
-rw-r--r--  kernel/bpf/arena.c              |  405
-rw-r--r--  kernel/bpf/arraymap.c           |   29
-rw-r--r--  kernel/bpf/bpf_cgrp_storage.c   |   62
-rw-r--r--  kernel/bpf/bpf_inode_storage.c  |    6
-rw-r--r--  kernel/bpf/bpf_insn_array.c     |    4
-rw-r--r--  kernel/bpf/bpf_iter.c           |    2
-rw-r--r--  kernel/bpf/bpf_local_storage.c  |  408
-rw-r--r--  kernel/bpf/bpf_lsm.c            |    5
-rw-r--r--  kernel/bpf/bpf_lsm_proto.c      |   19
-rw-r--r--  kernel/bpf/bpf_struct_ops.c     |   88
-rw-r--r--  kernel/bpf/bpf_task_storage.c   |  154
-rw-r--r--  kernel/bpf/btf.c                |  228
-rw-r--r--  kernel/bpf/cgroup.c             |    6
-rw-r--r--  kernel/bpf/cgroup_iter.c        |   26
-rw-r--r--  kernel/bpf/core.c               |   15
-rw-r--r--  kernel/bpf/cpumap.c             |   21
-rw-r--r--  kernel/bpf/cpumask.c            |    2
-rw-r--r--  kernel/bpf/crypto.c             |   10
-rw-r--r--  kernel/bpf/hashtab.c            |  105
-rw-r--r--  kernel/bpf/helpers.c            |  698
-rw-r--r--  kernel/bpf/inode.c              |   42
-rw-r--r--  kernel/bpf/local_storage.c      |   27
-rw-r--r--  kernel/bpf/map_iter.c           |    2
-rw-r--r--  kernel/bpf/offload.c            |   12
-rw-r--r--  kernel/bpf/range_tree.c         |    5
-rw-r--r--  kernel/bpf/ringbuf.c            |    1
-rw-r--r--  kernel/bpf/rqspinlock.c         |    7
-rw-r--r--  kernel/bpf/stream.c             |   24
-rw-r--r--  kernel/bpf/syscall.c            |  173
-rw-r--r--  kernel/bpf/tnum.c               |   16
-rw-r--r--  kernel/bpf/token.c              |    1
-rw-r--r--  kernel/bpf/trampoline.c         |  320
-rw-r--r--  kernel/bpf/verifier.c           | 1471
-rw-r--r--  kernel/sched/ext.c              |    8
-rw-r--r--  kernel/trace/Kconfig            |    3
-rw-r--r--  kernel/trace/bpf_trace.c        |   84
-rw-r--r--  kernel/trace/ftrace.c           |  407
38 files changed, 3551 insertions, 1357 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 232cbc97434d..79cf22860a99 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -42,7 +42,17 @@ endif
ifeq ($(CONFIG_BPF_JIT),y)
obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
obj-$(CONFIG_BPF_SYSCALL) += cpumask.o
-obj-${CONFIG_BPF_LSM} += bpf_lsm.o
+# bpf_lsm_proto.o must precede bpf_lsm.o. The current pahole logic
+# deduplicates function prototypes within
+# btf_encoder__add_saved_func() by keeping the first instance seen. We
+# need the function prototype(s) in bpf_lsm_proto.o to take precedence
+# over those within bpf_lsm.o. Having bpf_lsm_proto.o precede
+# bpf_lsm.o ensures its DWARF CU is processed early, forcing the
+# generated BTF to contain the overrides.
+#
+# Notably, this is a temporary workaround whilst the deduplication
+# semantics within pahole are revisited accordingly.
+obj-${CONFIG_BPF_LSM} += bpf_lsm_proto.o bpf_lsm.o
endif
ifneq ($(CONFIG_CRYPTO),)
obj-$(CONFIG_BPF_SYSCALL) += crypto.o
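
The Makefile comment above relies on pahole keeping only the first prototype it encounters for a given function name, which is why bpf_lsm_proto.o must be listed before bpf_lsm.o. As a rough, standalone illustration of first-occurrence-wins deduplication (an analogy only, not pahole's actual btf_encoder code; all names below are made up):

```c
/*
 * Illustrative only: first-occurrence-wins dedup over function names,
 * analogous to the pahole behaviour the Makefile comment relies on.
 */
#include <stdio.h>
#include <string.h>

struct proto { const char *name; const char *src_object; };

static int seen(const struct proto *kept, int n, const char *name)
{
	for (int i = 0; i < n; i++)
		if (!strcmp(kept[i].name, name))
			return 1;
	return 0;
}

int main(void)
{
	/* objects listed in link order: the proto object comes first */
	struct proto discovered[] = {
		{ "bpf_lsm_mmap_file", "bpf_lsm_proto.o" }, /* override kept */
		{ "bpf_lsm_mmap_file", "bpf_lsm.o" },       /* duplicate, dropped */
		{ "bpf_lsm_file_open", "bpf_lsm.o" },
	};
	struct proto kept[8];
	int n = 0;

	for (size_t i = 0; i < sizeof(discovered) / sizeof(discovered[0]); i++)
		if (!seen(kept, n, discovered[i].name))
			kept[n++] = discovered[i];

	for (int i = 0; i < n; i++)
		printf("%s kept from %s\n", kept[i].name, kept[i].src_object);
	return 0;
}
```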
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 872dc0e41c65..42fae0a9f314 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -2,11 +2,15 @@
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#include <linux/bpf.h>
#include <linux/btf.h>
+#include <linux/cacheflush.h>
#include <linux/err.h>
+#include <linux/irq_work.h>
#include "linux/filter.h"
+#include <linux/llist.h>
#include <linux/btf_ids.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
+#include <asm/tlbflush.h>
#include "range_tree.h"
/*
@@ -42,14 +46,31 @@
#define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1)
#define KERN_VM_SZ (SZ_4G + GUARD_SZ)
+static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable);
+
struct bpf_arena {
struct bpf_map map;
u64 user_vm_start;
u64 user_vm_end;
struct vm_struct *kern_vm;
struct range_tree rt;
+ /* protects rt */
+ rqspinlock_t spinlock;
struct list_head vma_list;
+ /* protects vma_list */
struct mutex lock;
+ struct irq_work free_irq;
+ struct work_struct free_work;
+ struct llist_head free_spans;
+};
+
+static void arena_free_worker(struct work_struct *work);
+static void arena_free_irq(struct irq_work *iw);
+
+struct arena_free_span {
+ struct llist_node node;
+ unsigned long uaddr;
+ u32 page_cnt;
};
u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
@@ -92,6 +113,66 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr)
return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT;
}
+struct apply_range_data {
+ struct page **pages;
+ int i;
+};
+
+static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
+{
+ struct apply_range_data *d = data;
+ struct page *page;
+
+ if (!data)
+ return 0;
+ /* sanity check */
+ if (unlikely(!pte_none(ptep_get(pte))))
+ return -EBUSY;
+
+ page = d->pages[d->i];
+ /* paranoia, similar to vmap_pages_pte_range() */
+ if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page))))
+ return -EINVAL;
+
+ set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
+ d->i++;
+ return 0;
+}
+
+static void flush_vmap_cache(unsigned long start, unsigned long size)
+{
+ flush_cache_vmap(start, start + size);
+}
+
+static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages)
+{
+ pte_t old_pte;
+ struct page *page;
+
+ /* sanity check */
+ old_pte = ptep_get(pte);
+ if (pte_none(old_pte) || !pte_present(old_pte))
+ return 0; /* nothing to do */
+
+ page = pte_page(old_pte);
+ if (WARN_ON_ONCE(!page))
+ return -EINVAL;
+
+ pte_clear(&init_mm, addr, pte);
+
+ /* Add page to the list so it is freed later */
+ if (free_pages)
+ __llist_add(&page->pcp_llist, free_pages);
+
+ return 0;
+}
+
+static int populate_pgtable_except_pte(struct bpf_arena *arena)
+{
+ return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
+ KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL);
+}
+
static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
{
struct vm_struct *kern_vm;
@@ -136,6 +217,9 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
arena->user_vm_end = arena->user_vm_start + vm_range;
INIT_LIST_HEAD(&arena->vma_list);
+ init_llist_head(&arena->free_spans);
+ init_irq_work(&arena->free_irq, arena_free_irq);
+ INIT_WORK(&arena->free_work, arena_free_worker);
bpf_map_init_from_attr(&arena->map, attr);
range_tree_init(&arena->rt);
err = range_tree_set(&arena->rt, 0, attr->max_entries);
@@ -144,6 +228,13 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
goto err;
}
mutex_init(&arena->lock);
+ raw_res_spin_lock_init(&arena->spinlock);
+ err = populate_pgtable_except_pte(arena);
+ if (err) {
+ range_tree_destroy(&arena->rt);
+ bpf_map_area_free(arena);
+ goto err;
+ }
return &arena->map;
err:
@@ -184,6 +275,10 @@ static void arena_map_free(struct bpf_map *map)
if (WARN_ON_ONCE(!list_empty(&arena->vma_list)))
return;
+ /* Ensure no pending deferred frees */
+ irq_work_sync(&arena->free_irq);
+ flush_work(&arena->free_work);
+
/*
* free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area().
* It unmaps everything from vmalloc area and clears pgtables.
@@ -265,44 +360,59 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
{
struct bpf_map *map = vmf->vma->vm_file->private_data;
struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+ struct mem_cgroup *new_memcg, *old_memcg;
struct page *page;
long kbase, kaddr;
+ unsigned long flags;
int ret;
kbase = bpf_arena_get_kern_vm_start(arena);
kaddr = kbase + (u32)(vmf->address);
- guard(mutex)(&arena->lock);
+ if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
+ /* Make a reasonable effort to address impossible case */
+ return VM_FAULT_RETRY;
+
page = vmalloc_to_page((void *)kaddr);
if (page)
/* already have a page vmap-ed */
goto out;
+ bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
+
if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
/* User space requested to segfault when page is not allocated by bpf prog */
- return VM_FAULT_SIGSEGV;
+ goto out_unlock_sigsegv;
ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
if (ret)
- return VM_FAULT_SIGSEGV;
+ goto out_unlock_sigsegv;
+ struct apply_range_data data = { .pages = &page, .i = 0 };
/* Account into memcg of the process that created bpf_arena */
ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
if (ret) {
range_tree_set(&arena->rt, vmf->pgoff, 1);
- return VM_FAULT_SIGSEGV;
+ goto out_unlock_sigsegv;
}
- ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page);
+ ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
if (ret) {
range_tree_set(&arena->rt, vmf->pgoff, 1);
- __free_page(page);
- return VM_FAULT_SIGSEGV;
+ free_pages_nolock(page, 0);
+ goto out_unlock_sigsegv;
}
+ flush_vmap_cache(kaddr, PAGE_SIZE);
+ bpf_map_memcg_exit(old_memcg, new_memcg);
out:
page_ref_add(page, 1);
+ raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
vmf->page = page;
return 0;
+out_unlock_sigsegv:
+ bpf_map_memcg_exit(old_memcg, new_memcg);
+ raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
+ return VM_FAULT_SIGSEGV;
}
static const struct vm_operations_struct arena_vm_ops = {
@@ -423,12 +533,18 @@ static u64 clear_lo32(u64 val)
* Allocate pages and vmap them into kernel vmalloc area.
* Later the pages will be mmaped into user space vma.
*/
-static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id)
+static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id,
+ bool sleepable)
{
/* user_vm_end/start are fixed before bpf prog runs */
long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
- struct page **pages;
+ struct mem_cgroup *new_memcg, *old_memcg;
+ struct apply_range_data data;
+ struct page **pages = NULL;
+ long remaining, mapped = 0;
+ long alloc_pages;
+ unsigned long flags;
long pgoff = 0;
u32 uaddr32;
int ret, i;
@@ -445,17 +561,23 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
return 0;
}
- /* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */
- pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL);
- if (!pages)
+ bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
+ /* Cap allocation size to KMALLOC_MAX_CACHE_SIZE so kmalloc_nolock() can succeed. */
+ alloc_pages = min(page_cnt, KMALLOC_MAX_CACHE_SIZE / sizeof(struct page *));
+ pages = kmalloc_nolock(alloc_pages * sizeof(struct page *), __GFP_ACCOUNT, NUMA_NO_NODE);
+ if (!pages) {
+ bpf_map_memcg_exit(old_memcg, new_memcg);
return 0;
+ }
+ data.pages = pages;
- guard(mutex)(&arena->lock);
+ if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
+ goto out_free_pages;
if (uaddr) {
ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
if (ret)
- goto out_free_pages;
+ goto out_unlock_free_pages;
ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
} else {
ret = pgoff = range_tree_find(&arena->rt, page_cnt);
@@ -463,33 +585,62 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
}
if (ret)
- goto out_free_pages;
-
- ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
- if (ret)
- goto out;
+ goto out_unlock_free_pages;
+ remaining = page_cnt;
uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE);
- /* Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1
- * will not overflow 32-bit. Lower 32-bit need to represent
- * contiguous user address range.
- * Map these pages at kern_vm_start base.
- * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
- * lower 32-bit and it's ok.
- */
- ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32,
- kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages);
- if (ret) {
- for (i = 0; i < page_cnt; i++)
- __free_page(pages[i]);
- goto out;
+
+ while (remaining) {
+ long this_batch = min(remaining, alloc_pages);
+
+ /* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */
+ memset(pages, 0, this_batch * sizeof(struct page *));
+
+ ret = bpf_map_alloc_pages(&arena->map, node_id, this_batch, pages);
+ if (ret)
+ goto out;
+
+ /*
+ * Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1
+ * will not overflow 32-bit. Lower 32-bit need to represent
+ * contiguous user address range.
+ * Map these pages at kern_vm_start base.
+ * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
+ * lower 32-bit and it's ok.
+ */
+ data.i = 0;
+ ret = apply_to_page_range(&init_mm,
+ kern_vm_start + uaddr32 + (mapped << PAGE_SHIFT),
+ this_batch << PAGE_SHIFT, apply_range_set_cb, &data);
+ if (ret) {
+ /* data.i pages were mapped, account them and free the remaining */
+ mapped += data.i;
+ for (i = data.i; i < this_batch; i++)
+ free_pages_nolock(pages[i], 0);
+ goto out;
+ }
+
+ mapped += this_batch;
+ remaining -= this_batch;
}
- kvfree(pages);
+ flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT);
+ raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
+ kfree_nolock(pages);
+ bpf_map_memcg_exit(old_memcg, new_memcg);
return clear_lo32(arena->user_vm_start) + uaddr32;
out:
- range_tree_set(&arena->rt, pgoff, page_cnt);
+ range_tree_set(&arena->rt, pgoff + mapped, page_cnt - mapped);
+ raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
+ if (mapped) {
+ flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT);
+ arena_free_pages(arena, uaddr32, mapped, sleepable);
+ }
+ goto out_free_pages;
+out_unlock_free_pages:
+ raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
out_free_pages:
- kvfree(pages);
+ kfree_nolock(pages);
+ bpf_map_memcg_exit(old_memcg, new_memcg);
return 0;
}
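
The loop above caps the temporary page-pointer array at KMALLOC_MAX_CACHE_SIZE / sizeof(struct page *) so kmalloc_nolock() can satisfy it, then allocates and maps pages batch by batch. A minimal userspace sketch of just that batching arithmetic, with an assumed 2048-entry cap standing in for the real constant:

```c
/* Sketch of the batching arithmetic only; MAX_BATCH is an assumed stand-in
 * for KMALLOC_MAX_CACHE_SIZE / sizeof(struct page *). */
#include <stdio.h>

#define MAX_BATCH 2048UL

static void map_in_batches(unsigned long page_cnt)
{
	unsigned long batch = page_cnt < MAX_BATCH ? page_cnt : MAX_BATCH;
	unsigned long remaining = page_cnt, mapped = 0;

	while (remaining) {
		unsigned long this_batch = remaining < batch ? remaining : batch;

		/* allocate this_batch pages, then map them at base + mapped pages */
		printf("map %lu pages at page offset %lu\n", this_batch, mapped);
		mapped += this_batch;
		remaining -= this_batch;
	}
}

int main(void)
{
	map_in_batches(5000);	/* 2048 + 2048 + 904 */
	return 0;
}
```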
@@ -502,42 +653,66 @@ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
{
struct vma_list *vml;
+ guard(mutex)(&arena->lock);
+ /* iterate the linked list under the lock */
list_for_each_entry(vml, &arena->vma_list, head)
zap_page_range_single(vml->vma, uaddr,
PAGE_SIZE * page_cnt, NULL);
}
-static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
+static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable)
{
+ struct mem_cgroup *new_memcg, *old_memcg;
u64 full_uaddr, uaddr_end;
- long kaddr, pgoff, i;
+ long kaddr, pgoff;
struct page *page;
+ struct llist_head free_pages;
+ struct llist_node *pos, *t;
+ struct arena_free_span *s;
+ unsigned long flags;
+ int ret = 0;
/* only aligned lower 32-bit are relevant */
uaddr = (u32)uaddr;
uaddr &= PAGE_MASK;
+ kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr;
full_uaddr = clear_lo32(arena->user_vm_start) + uaddr;
uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT));
if (full_uaddr >= uaddr_end)
return;
page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT;
+ pgoff = compute_pgoff(arena, uaddr);
+ bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
- guard(mutex)(&arena->lock);
+ if (!sleepable)
+ goto defer;
+
+ ret = raw_res_spin_lock_irqsave(&arena->spinlock, flags);
+
+ /* Can't proceed without holding the spinlock so defer the free */
+ if (ret)
+ goto defer;
- pgoff = compute_pgoff(arena, uaddr);
- /* clear range */
range_tree_set(&arena->rt, pgoff, page_cnt);
+ init_llist_head(&free_pages);
+ /* clear ptes and collect struct pages */
+ apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
+ apply_range_clear_cb, &free_pages);
+
+ /* drop the lock to do the tlb flush and zap pages */
+ raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
+
+ /* ensure no stale TLB entries */
+ flush_tlb_kernel_range(kaddr, kaddr + (page_cnt * PAGE_SIZE));
+
if (page_cnt > 1)
/* bulk zap if multiple pages being freed */
zap_pages(arena, full_uaddr, page_cnt);
- kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr;
- for (i = 0; i < page_cnt; i++, kaddr += PAGE_SIZE, full_uaddr += PAGE_SIZE) {
- page = vmalloc_to_page((void *)kaddr);
- if (!page)
- continue;
+ llist_for_each_safe(pos, t, __llist_del_all(&free_pages)) {
+ page = llist_entry(pos, struct page, pcp_llist);
if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */
/* Optimization for the common case of page_cnt==1:
* If page wasn't mapped into some user vma there
@@ -545,9 +720,27 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
* page_cnt is big it's faster to do the batched zap.
*/
zap_pages(arena, full_uaddr, 1);
- vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE);
__free_page(page);
}
+ bpf_map_memcg_exit(old_memcg, new_memcg);
+
+ return;
+
+defer:
+ s = kmalloc_nolock(sizeof(struct arena_free_span), __GFP_ACCOUNT, -1);
+ bpf_map_memcg_exit(old_memcg, new_memcg);
+ if (!s)
+ /*
+ * If allocation fails in non-sleepable context, pages are intentionally left
+ * inaccessible (leaked) until the arena is destroyed. Cleanup or retries are not
+ * possible here, so we intentionally omit them for safety.
+ */
+ return;
+
+ s->page_cnt = page_cnt;
+ s->uaddr = uaddr;
+ llist_add(&s->node, &arena->free_spans);
+ irq_work_queue(&arena->free_irq);
}
/*
@@ -557,6 +750,8 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt)
{
long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
+ struct mem_cgroup *new_memcg, *old_memcg;
+ unsigned long flags;
long pgoff;
int ret;
@@ -567,15 +762,94 @@ static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt
if (pgoff + page_cnt > page_cnt_max)
return -EINVAL;
- guard(mutex)(&arena->lock);
+ if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
+ return -EBUSY;
/* Cannot guard already allocated pages. */
ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
- if (ret)
- return -EBUSY;
+ if (ret) {
+ ret = -EBUSY;
+ goto out;
+ }
/* "Allocate" the region to prevent it from being allocated. */
- return range_tree_clear(&arena->rt, pgoff, page_cnt);
+ bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
+ ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
+ bpf_map_memcg_exit(old_memcg, new_memcg);
+out:
+ raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
+ return ret;
+}
+
+static void arena_free_worker(struct work_struct *work)
+{
+ struct bpf_arena *arena = container_of(work, struct bpf_arena, free_work);
+ struct mem_cgroup *new_memcg, *old_memcg;
+ struct llist_node *list, *pos, *t;
+ struct arena_free_span *s;
+ u64 arena_vm_start, user_vm_start;
+ struct llist_head free_pages;
+ struct page *page;
+ unsigned long full_uaddr;
+ long kaddr, page_cnt, pgoff;
+ unsigned long flags;
+
+ if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) {
+ schedule_work(work);
+ return;
+ }
+
+ bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
+
+ init_llist_head(&free_pages);
+ arena_vm_start = bpf_arena_get_kern_vm_start(arena);
+ user_vm_start = bpf_arena_get_user_vm_start(arena);
+
+ list = llist_del_all(&arena->free_spans);
+ llist_for_each(pos, list) {
+ s = llist_entry(pos, struct arena_free_span, node);
+ page_cnt = s->page_cnt;
+ kaddr = arena_vm_start + s->uaddr;
+ pgoff = compute_pgoff(arena, s->uaddr);
+
+ /* clear ptes and collect pages in free_pages llist */
+ apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
+ apply_range_clear_cb, &free_pages);
+
+ range_tree_set(&arena->rt, pgoff, page_cnt);
+ }
+ raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
+
+ /* Iterate the list again without holding spinlock to do the tlb flush and zap_pages */
+ llist_for_each_safe(pos, t, list) {
+ s = llist_entry(pos, struct arena_free_span, node);
+ page_cnt = s->page_cnt;
+ full_uaddr = clear_lo32(user_vm_start) + s->uaddr;
+ kaddr = arena_vm_start + s->uaddr;
+
+ /* ensure no stale TLB entries */
+ flush_tlb_kernel_range(kaddr, kaddr + (page_cnt * PAGE_SIZE));
+
+ /* remove pages from user vmas */
+ zap_pages(arena, full_uaddr, page_cnt);
+
+ kfree_nolock(s);
+ }
+
+ /* free all pages collected by apply_to_existing_page_range() in the first loop */
+ llist_for_each_safe(pos, t, __llist_del_all(&free_pages)) {
+ page = llist_entry(pos, struct page, pcp_llist);
+ __free_page(page);
+ }
+
+ bpf_map_memcg_exit(old_memcg, new_memcg);
+}
+
+static void arena_free_irq(struct irq_work *iw)
+{
+ struct bpf_arena *arena = container_of(iw, struct bpf_arena, free_irq);
+
+ schedule_work(&arena->free_work);
}
__bpf_kfunc_start_defs();
@@ -589,9 +863,20 @@ __bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_
if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
return NULL;
- return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id);
+ return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
}
+void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
+ int node_id, u64 flags)
+{
+ struct bpf_map *map = p__map;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
+ return NULL;
+
+ return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false);
+}
__bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
{
struct bpf_map *map = p__map;
@@ -599,7 +884,17 @@ __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt
if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign)
return;
- arena_free_pages(arena, (long)ptr__ign, page_cnt);
+ arena_free_pages(arena, (long)ptr__ign, page_cnt, true);
+}
+
+void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt)
+{
+ struct bpf_map *map = p__map;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign)
+ return;
+ arena_free_pages(arena, (long)ptr__ign, page_cnt, false);
}
__bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_cnt)
@@ -618,9 +913,9 @@ __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_c
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(arena_kfuncs)
-BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_RET | KF_ARENA_ARG2)
-BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2)
-BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2)
+BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_ARENA_RET | KF_ARENA_ARG2)
+BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_ARENA_ARG2)
+BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_ARENA_ARG2)
BTF_KFUNCS_END(arena_kfuncs)
static const struct btf_kfunc_id_set common_kfunc_set = {
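
When arena_free_pages() cannot take the rqspinlock (non-sleepable context, or the lock attempt fails), it pushes an arena_free_span onto a lock-free llist and lets irq_work schedule a worker to drain it. The userspace sketch below shows only the push/drain shape of that pattern, using C11 atomics in place of the kernel's llist and irq_work APIs:

```c
/*
 * Minimal userspace sketch of the deferred-free pattern: producers push
 * spans onto a lock-free stack, a worker drains it later. Not the kernel
 * llist/irq_work API, just the shape of it.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct free_span {
	struct free_span *next;
	unsigned long uaddr;
	unsigned int page_cnt;
};

static _Atomic(struct free_span *) free_spans;

static void defer_free(unsigned long uaddr, unsigned int page_cnt)
{
	struct free_span *s = malloc(sizeof(*s));

	if (!s)
		return;	/* mirrors the "leak until arena destroy" fallback above */
	s->uaddr = uaddr;
	s->page_cnt = page_cnt;
	s->next = atomic_load(&free_spans);
	while (!atomic_compare_exchange_weak(&free_spans, &s->next, s))
		;
	/* in the kernel, this is where irq_work_queue() kicks the worker */
}

static void drain_worker(void)
{
	struct free_span *s = atomic_exchange(&free_spans, NULL);

	while (s) {
		struct free_span *next = s->next;

		printf("free %u pages at 0x%lx\n", s->page_cnt, s->uaddr);
		free(s);
		s = next;
	}
}

int main(void)
{
	defer_free(0x1000, 4);
	defer_free(0x9000, 1);
	drain_worker();
	return 0;
}
```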
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 1eeb31c5b317..67e9e811de3a 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -307,7 +307,7 @@ static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key,
return per_cpu_ptr(array->pptrs[index & array->index_mask], cpu);
}
-int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
+int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value, u64 map_flags)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
@@ -325,11 +325,18 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
size = array->elem_size;
rcu_read_lock();
pptr = array->pptrs[index & array->index_mask];
+ if (map_flags & BPF_F_CPU) {
+ cpu = map_flags >> 32;
+ copy_map_value(map, value, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, value);
+ goto unlock;
+ }
for_each_possible_cpu(cpu) {
copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
check_and_init_map_value(map, value + off);
off += size;
}
+unlock:
rcu_read_unlock();
return 0;
}
@@ -398,10 +405,11 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
void __percpu *pptr;
- int cpu, off = 0;
+ void *ptr, *val;
u32 size;
+ int cpu;
- if (unlikely(map_flags > BPF_EXIST))
+ if (unlikely((map_flags & BPF_F_LOCK) || (u32)map_flags > BPF_F_ALL_CPUS))
/* unknown flags */
return -EINVAL;
@@ -422,11 +430,20 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
size = array->elem_size;
rcu_read_lock();
pptr = array->pptrs[index & array->index_mask];
+ if (map_flags & BPF_F_CPU) {
+ cpu = map_flags >> 32;
+ ptr = per_cpu_ptr(pptr, cpu);
+ copy_map_value(map, ptr, value);
+ bpf_obj_free_fields(array->map.record, ptr);
+ goto unlock;
+ }
for_each_possible_cpu(cpu) {
- copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off);
- bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu));
- off += size;
+ ptr = per_cpu_ptr(pptr, cpu);
+ val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu;
+ copy_map_value(map, ptr, val);
+ bpf_obj_free_fields(array->map.record, ptr);
}
+unlock:
rcu_read_unlock();
return 0;
}
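
With BPF_F_CPU, bpf_percpu_array_update()/copy() read the target CPU from the upper 32 bits of map_flags (cpu = map_flags >> 32), while BPF_F_ALL_CPUS copies one value to every CPU. A sketch of how a caller might pack such a flags word; the flag bit values below are placeholders, not the real UAPI constants:

```c
/* Sketch of the flag packing implied by "cpu = map_flags >> 32" above.
 * The flag bits here are placeholders, not the UAPI BPF_F_CPU/BPF_F_ALL_CPUS. */
#include <stdint.h>
#include <stdio.h>

#define FAKE_BPF_F_CPU		(1ULL << 4)	/* placeholder value */
#define FAKE_BPF_F_ALL_CPUS	(1ULL << 5)	/* placeholder value */

static uint64_t pack_cpu_flags(uint32_t cpu)
{
	/* single-CPU update: flag in the low word, CPU id in the high word */
	return FAKE_BPF_F_CPU | ((uint64_t)cpu << 32);
}

int main(void)
{
	uint64_t flags = pack_cpu_flags(3);

	printf("flags=0x%llx cpu=%u\n",
	       (unsigned long long)flags, (uint32_t)(flags >> 32));
	return 0;
}
```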
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index 0687a760974a..c2a2ead1f466 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -11,29 +11,6 @@
DEFINE_BPF_STORAGE_CACHE(cgroup_cache);
-static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy);
-
-static void bpf_cgrp_storage_lock(void)
-{
- cant_migrate();
- this_cpu_inc(bpf_cgrp_storage_busy);
-}
-
-static void bpf_cgrp_storage_unlock(void)
-{
- this_cpu_dec(bpf_cgrp_storage_busy);
-}
-
-static bool bpf_cgrp_storage_trylock(void)
-{
- cant_migrate();
- if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) {
- this_cpu_dec(bpf_cgrp_storage_busy);
- return false;
- }
- return true;
-}
-
static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner)
{
struct cgroup *cg = owner;
@@ -45,16 +22,14 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup)
{
struct bpf_local_storage *local_storage;
- rcu_read_lock_dont_migrate();
+ rcu_read_lock();
local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
if (!local_storage)
goto out;
- bpf_cgrp_storage_lock();
bpf_local_storage_destroy(local_storage);
- bpf_cgrp_storage_unlock();
out:
- rcu_read_unlock_migrate();
+ rcu_read_unlock();
}
static struct bpf_local_storage_data *
@@ -83,9 +58,7 @@ static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key)
if (IS_ERR(cgroup))
return ERR_CAST(cgroup);
- bpf_cgrp_storage_lock();
sdata = cgroup_storage_lookup(cgroup, map, true);
- bpf_cgrp_storage_unlock();
cgroup_put(cgroup);
return sdata ? sdata->data : NULL;
}
@@ -102,10 +75,8 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
if (IS_ERR(cgroup))
return PTR_ERR(cgroup);
- bpf_cgrp_storage_lock();
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
value, map_flags, false, GFP_ATOMIC);
- bpf_cgrp_storage_unlock();
cgroup_put(cgroup);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -118,8 +89,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map)
if (!sdata)
return -ENOENT;
- bpf_selem_unlink(SELEM(sdata), false);
- return 0;
+ return bpf_selem_unlink(SELEM(sdata));
}
static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
@@ -132,9 +102,7 @@ static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
if (IS_ERR(cgroup))
return PTR_ERR(cgroup);
- bpf_cgrp_storage_lock();
err = cgroup_storage_delete(cgroup, map);
- bpf_cgrp_storage_unlock();
cgroup_put(cgroup);
return err;
}
@@ -151,7 +119,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
static void cgroup_storage_map_free(struct bpf_map *map)
{
- bpf_local_storage_map_free(map, &cgroup_cache, &bpf_cgrp_storage_busy);
+ bpf_local_storage_map_free(map, &cgroup_cache);
}
/* *gfp_flags* is a hidden argument provided by the verifier */
@@ -159,7 +127,6 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
void *, value, u64, flags, gfp_t, gfp_flags)
{
struct bpf_local_storage_data *sdata;
- bool nobusy;
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
@@ -168,38 +135,27 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
if (!cgroup)
return (unsigned long)NULL;
- nobusy = bpf_cgrp_storage_trylock();
-
- sdata = cgroup_storage_lookup(cgroup, map, nobusy);
+ sdata = cgroup_storage_lookup(cgroup, map, true);
if (sdata)
- goto unlock;
+ goto out;
/* only allocate new storage, when the cgroup is refcounted */
if (!percpu_ref_is_dying(&cgroup->self.refcnt) &&
- (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy)
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
value, BPF_NOEXIST, false, gfp_flags);
-unlock:
- if (nobusy)
- bpf_cgrp_storage_unlock();
+out:
return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
}
BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgroup)
{
- int ret;
-
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (!cgroup)
return -EINVAL;
- if (!bpf_cgrp_storage_trylock())
- return -EBUSY;
-
- ret = cgroup_storage_delete(cgroup, map);
- bpf_cgrp_storage_unlock();
- return ret;
+ return cgroup_storage_delete(cgroup, map);
}
const struct bpf_map_ops cgrp_storage_map_ops = {
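
The helpers deleted above implemented a per-CPU recursion guard: the trylock bumped a per-CPU counter and backed off with -EBUSY if the storage was already held on that CPU, a role now covered by the rqspinlock-based locking in bpf_local_storage.c. A small sketch of that counter-based trylock, with a thread-local int standing in for the per-CPU variable:

```c
/* Sketch of the recursion guard being removed; _Thread_local stands in for
 * the per-CPU counter. */
#include <stdbool.h>
#include <stdio.h>

static _Thread_local int storage_busy;

static bool storage_trylock(void)
{
	if (++storage_busy != 1) {
		--storage_busy;
		return false;	/* already inside: caller backs off (-EBUSY) */
	}
	return true;
}

static void storage_unlock(void)
{
	--storage_busy;
}

int main(void)
{
	if (storage_trylock()) {
		/* re-entry from the same context is rejected */
		printf("nested trylock: %s\n", storage_trylock() ? "ok" : "busy");
		storage_unlock();
	}
	return 0;
}
```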
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index e54cce2b9175..e86734609f3d 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -110,9 +110,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
if (!sdata)
return -ENOENT;
- bpf_selem_unlink(SELEM(sdata), false);
-
- return 0;
+ return bpf_selem_unlink(SELEM(sdata));
}
static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
@@ -186,7 +184,7 @@ static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
static void inode_storage_map_free(struct bpf_map *map)
{
- bpf_local_storage_map_free(map, &inode_cache, NULL);
+ bpf_local_storage_map_free(map, &inode_cache);
}
const struct bpf_map_ops inode_storage_map_ops = {
diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c
index c96630cb75bf..c0286f25ca3c 100644
--- a/kernel/bpf/bpf_insn_array.c
+++ b/kernel/bpf/bpf_insn_array.c
@@ -123,10 +123,10 @@ static int insn_array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
if ((off % sizeof(long)) != 0 ||
(off / sizeof(long)) >= map->max_entries)
- return -EINVAL;
+ return -EACCES;
/* from BPF's point of view, this map is a jump table */
- *imm = (unsigned long)insn_array->ips + off;
+ *imm = (unsigned long)insn_array->ips;
return 0;
}
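
insn_array_map_direct_value_addr() now rejects misaligned or out-of-range offsets with -EACCES and returns only the base of insn_array->ips, leaving the offset to be applied by the caller. A standalone sketch of that check (array size and names here are arbitrary):

```c
/* Sketch of the offset validation above; the 4-entry array is arbitrary. */
#include <errno.h>
#include <stdio.h>

static long ips[4];	/* stand-in for insn_array->ips */

static int direct_value_addr(unsigned int off, unsigned long *imm)
{
	unsigned int max_entries = 4;

	if ((off % sizeof(long)) != 0 || (off / sizeof(long)) >= max_entries)
		return -EACCES;

	*imm = (unsigned long)ips;	/* base only; offset applied by the caller */
	return 0;
}

int main(void)
{
	unsigned long imm;

	printf("off=8 -> %d\n", direct_value_addr(8, &imm));
	printf("off=3 -> %d\n", direct_value_addr(3, &imm));	/* misaligned */
	return 0;
}
```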
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index eec60b57bd3d..4b58d56ecab1 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -86,7 +86,7 @@ static bool bpf_iter_support_resched(struct seq_file *seq)
/* bpf_seq_read, a customized and simpler version for bpf iterator.
* The following are differences from seq_read():
- * . fixed buffer size (PAGE_SIZE)
+ * . fixed buffer size (PAGE_SIZE << 3)
* . assuming NULL ->llseek()
* . stop() may call bpf program, handling potential overflow there
*/
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index e2fe6c32822b..b28f07d3a0db 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -19,9 +19,9 @@
static struct bpf_local_storage_map_bucket *
select_bucket(struct bpf_local_storage_map *smap,
- struct bpf_local_storage_elem *selem)
+ struct bpf_local_storage *local_storage)
{
- return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
+ return &smap->buckets[hash_ptr(local_storage, smap->bucket_log)];
}
static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size)
@@ -61,11 +61,6 @@ static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)
return !hlist_unhashed(&selem->snode);
}
-static bool selem_linked_to_map_lockless(const struct bpf_local_storage_elem *selem)
-{
- return !hlist_unhashed_lockless(&selem->map_node);
-}
-
static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
{
return !hlist_unhashed(&selem->map_node);
@@ -90,6 +85,8 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
if (selem) {
RCU_INIT_POINTER(SDATA(selem)->smap, smap);
+ atomic_set(&selem->state, 0);
+ selem->use_kmalloc_nolock = smap->use_kmalloc_nolock;
if (value) {
/* No need to call check_and_init_map_value as memory is zero init */
@@ -198,9 +195,11 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
/* The bpf_local_storage_map_free will wait for rcu_barrier */
smap = rcu_dereference_check(SDATA(selem)->smap, 1);
- migrate_disable();
- bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
- migrate_enable();
+ if (smap) {
+ migrate_disable();
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+ migrate_enable();
+ }
kfree_nolock(selem);
}
@@ -219,13 +218,14 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
- if (!smap->use_kmalloc_nolock) {
+ if (!selem->use_kmalloc_nolock) {
/*
* No uptr will be unpin even when reuse_now == false since uptr
* is only supported in task local storage, where
* smap->use_kmalloc_nolock == true.
*/
- bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+ if (smap)
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
__bpf_selem_free(selem, reuse_now);
return;
}
@@ -256,6 +256,36 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
bpf_selem_free(selem, reuse_now);
}
+static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem,
+ struct bpf_local_storage_map *smap,
+ struct bpf_local_storage *local_storage,
+ bool free_local_storage, bool pin_owner)
+{
+ void *owner = local_storage->owner;
+ u32 uncharge = smap->elem_size;
+
+ if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) ==
+ SDATA(selem))
+ RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
+
+ if (pin_owner && !refcount_inc_not_zero(&local_storage->owner_refcnt))
+ return;
+
+ uncharge += free_local_storage ? sizeof(*local_storage) : 0;
+ mem_uncharge(smap, local_storage->owner, uncharge);
+ local_storage->mem_charge -= uncharge;
+
+ if (free_local_storage) {
+ local_storage->owner = NULL;
+
+ /* After this RCU_INIT, owner may be freed and cannot be used */
+ RCU_INIT_POINTER(*owner_storage(smap, owner), NULL);
+ }
+
+ if (pin_owner)
+ refcount_dec(&local_storage->owner_refcnt);
+}
+
/* local_storage->lock must be held and selem->local_storage == local_storage.
* The caller must ensure selem->smap is still valid to be
* dereferenced for its smap->elem_size and smap->cache_idx.
@@ -266,124 +296,219 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
{
struct bpf_local_storage_map *smap;
bool free_local_storage;
- void *owner;
smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
- owner = local_storage->owner;
-
- /* All uncharging on the owner must be done first.
- * The owner may be freed once the last selem is unlinked
- * from local_storage.
- */
- mem_uncharge(smap, owner, smap->elem_size);
free_local_storage = hlist_is_singular_node(&selem->snode,
&local_storage->list);
- if (free_local_storage) {
- mem_uncharge(smap, owner, sizeof(struct bpf_local_storage));
- local_storage->owner = NULL;
- /* After this RCU_INIT, owner may be freed and cannot be used */
- RCU_INIT_POINTER(*owner_storage(smap, owner), NULL);
+ bpf_selem_unlink_storage_nolock_misc(selem, smap, local_storage,
+ free_local_storage, false);
- /* local_storage is not freed now. local_storage->lock is
- * still held and raw_spin_unlock_bh(&local_storage->lock)
- * will be done by the caller.
- *
- * Although the unlock will be done under
- * rcu_read_lock(), it is more intuitive to
- * read if the freeing of the storage is done
- * after the raw_spin_unlock_bh(&local_storage->lock).
- *
- * Hence, a "bool free_local_storage" is returned
- * to the caller which then calls then frees the storage after
- * all the RCU grace periods have expired.
- */
- }
hlist_del_init_rcu(&selem->snode);
- if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) ==
- SDATA(selem))
- RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
hlist_add_head(&selem->free_node, free_selem_list);
- if (rcu_access_pointer(local_storage->smap) == smap)
- RCU_INIT_POINTER(local_storage->smap, NULL);
-
return free_local_storage;
}
-static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
- bool reuse_now)
+void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_elem *selem)
+{
+ struct bpf_local_storage_map *smap;
+
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+ local_storage->mem_charge += smap->elem_size;
+
+ RCU_INIT_POINTER(selem->local_storage, local_storage);
+ hlist_add_head_rcu(&selem->snode, &local_storage->list);
+}
+
+static int bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
+{
+ struct bpf_local_storage *local_storage;
+ struct bpf_local_storage_map *smap;
+ struct bpf_local_storage_map_bucket *b;
+ unsigned long flags;
+ int err;
+
+ local_storage = rcu_dereference_check(selem->local_storage,
+ bpf_rcu_lock_held());
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+ b = select_bucket(smap, local_storage);
+ err = raw_res_spin_lock_irqsave(&b->lock, flags);
+ if (err)
+ return err;
+
+ hlist_del_init_rcu(&selem->map_node);
+ raw_res_spin_unlock_irqrestore(&b->lock, flags);
+
+ return 0;
+}
+
+static void bpf_selem_unlink_map_nolock(struct bpf_local_storage_elem *selem)
+{
+ hlist_del_init_rcu(&selem->map_node);
+}
+
+int bpf_selem_link_map(struct bpf_local_storage_map *smap,
+ struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_elem *selem)
+{
+ struct bpf_local_storage_map_bucket *b;
+ unsigned long flags;
+ int err;
+
+ b = select_bucket(smap, local_storage);
+
+ err = raw_res_spin_lock_irqsave(&b->lock, flags);
+ if (err)
+ return err;
+
+ hlist_add_head_rcu(&selem->map_node, &b->list);
+ raw_res_spin_unlock_irqrestore(&b->lock, flags);
+
+ return 0;
+}
+
+static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b,
+ struct bpf_local_storage_elem *selem)
+{
+ hlist_add_head_rcu(&selem->map_node, &b->list);
+}
+
+/*
+ * Unlink an selem from map and local storage with lock held.
+ * This is the common path used by local storages to delete an selem.
+ */
+int bpf_selem_unlink(struct bpf_local_storage_elem *selem)
{
struct bpf_local_storage *local_storage;
bool free_local_storage = false;
HLIST_HEAD(selem_free_list);
unsigned long flags;
+ int err;
if (unlikely(!selem_linked_to_storage_lockless(selem)))
/* selem has already been unlinked from sk */
- return;
+ return 0;
local_storage = rcu_dereference_check(selem->local_storage,
bpf_rcu_lock_held());
- raw_spin_lock_irqsave(&local_storage->lock, flags);
- if (likely(selem_linked_to_storage(selem)))
+ err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
+ if (err)
+ return err;
+
+ if (likely(selem_linked_to_storage(selem))) {
+ /* Always unlink from map before unlinking from local_storage
+ * because selem will be freed after successfully unlinked from
+ * the local_storage.
+ */
+ err = bpf_selem_unlink_map(selem);
+ if (err)
+ goto out;
+
free_local_storage = bpf_selem_unlink_storage_nolock(
local_storage, selem, &selem_free_list);
- raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ }
+out:
+ raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
- bpf_selem_free_list(&selem_free_list, reuse_now);
+ bpf_selem_free_list(&selem_free_list, false);
if (free_local_storage)
- bpf_local_storage_free(local_storage, reuse_now);
-}
+ bpf_local_storage_free(local_storage, false);
-void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
- struct bpf_local_storage_elem *selem)
-{
- RCU_INIT_POINTER(selem->local_storage, local_storage);
- hlist_add_head_rcu(&selem->snode, &local_storage->list);
+ return err;
}
-static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
+/*
+ * Unlink an selem from map and local storage with lockless fallback if callers
+ * are racing or rqspinlock returns error. It should only be called by
+ * bpf_local_storage_destroy() or bpf_local_storage_map_free().
+ */
+static void bpf_selem_unlink_nofail(struct bpf_local_storage_elem *selem,
+ struct bpf_local_storage_map_bucket *b)
{
+ bool in_map_free = !!b, free_storage = false;
+ struct bpf_local_storage *local_storage;
struct bpf_local_storage_map *smap;
- struct bpf_local_storage_map_bucket *b;
unsigned long flags;
+ int err, unlink = 0;
- if (unlikely(!selem_linked_to_map_lockless(selem)))
- /* selem has already be unlinked from smap */
- return;
-
+ local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held());
smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
- b = select_bucket(smap, selem);
- raw_spin_lock_irqsave(&b->lock, flags);
- if (likely(selem_linked_to_map(selem)))
- hlist_del_init_rcu(&selem->map_node);
- raw_spin_unlock_irqrestore(&b->lock, flags);
-}
-void bpf_selem_link_map(struct bpf_local_storage_map *smap,
- struct bpf_local_storage_elem *selem)
-{
- struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem);
- unsigned long flags;
+ if (smap) {
+ b = b ? : select_bucket(smap, local_storage);
+ err = raw_res_spin_lock_irqsave(&b->lock, flags);
+ if (!err) {
+ /*
+ * Call bpf_obj_free_fields() under b->lock to make sure it is done
+ * exactly once for an selem. Safe to free special fields immediately
+ * as no BPF program should be referencing the selem.
+ */
+ if (likely(selem_linked_to_map(selem))) {
+ hlist_del_init_rcu(&selem->map_node);
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+ unlink++;
+ }
+ raw_res_spin_unlock_irqrestore(&b->lock, flags);
+ }
+ /*
+ * Highly unlikely scenario: resource leak
+ *
+ * When map_free(selem1), destroy(selem1) and destroy(selem2) are racing
+ * and both selems belong to the same bucket, if destroy(selem2) acquires
+ * b->lock and blocks for too long, neither map_free(selem1) nor
+ * destroy(selem1) will be able to free the special field associated
+ * with selem1 as raw_res_spin_lock_irqsave() returns -ETIMEDOUT.
+ */
+ WARN_ON_ONCE(err && in_map_free);
+ if (!err || in_map_free)
+ RCU_INIT_POINTER(SDATA(selem)->smap, NULL);
+ }
- raw_spin_lock_irqsave(&b->lock, flags);
- hlist_add_head_rcu(&selem->map_node, &b->list);
- raw_spin_unlock_irqrestore(&b->lock, flags);
-}
+ if (local_storage) {
+ err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
+ if (!err) {
+ if (likely(selem_linked_to_storage(selem))) {
+ free_storage = hlist_is_singular_node(&selem->snode,
+ &local_storage->list);
+ /*
+ * Okay to skip clearing owner_storage and storage->owner in
+ * destroy() since the owner is going away. No user or bpf
+ * programs should be able to reference it.
+ */
+ if (smap && in_map_free)
+ bpf_selem_unlink_storage_nolock_misc(
+ selem, smap, local_storage,
+ free_storage, true);
+ hlist_del_init_rcu(&selem->snode);
+ unlink++;
+ }
+ raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
+ }
+ if (!err || !in_map_free)
+ RCU_INIT_POINTER(selem->local_storage, NULL);
+ }
-void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
-{
- /* Always unlink from map before unlinking from local_storage
- * because selem will be freed after successfully unlinked from
- * the local_storage.
+ if (unlink != 2)
+ atomic_or(in_map_free ? SELEM_MAP_UNLINKED : SELEM_STORAGE_UNLINKED, &selem->state);
+
+ /*
+ * Normally, an selem can be unlinked under local_storage->lock and b->lock, and
+ * then freed after an RCU grace period. However, if destroy() and map_free() are
+ * racing or rqspinlock returns errors in unlikely situations (unlink != 2), free
+ * the selem only after both map_free() and destroy() see the selem.
*/
- bpf_selem_unlink_map(selem);
- bpf_selem_unlink_storage(selem, reuse_now);
+ if (unlink == 2 ||
+ atomic_cmpxchg(&selem->state, SELEM_UNLINKED, SELEM_TOFREE) == SELEM_UNLINKED)
+ bpf_selem_free(selem, true);
+
+ if (free_storage)
+ bpf_local_storage_free(local_storage, true);
}
void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
@@ -391,16 +516,20 @@ void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
struct bpf_local_storage_elem *selem)
{
unsigned long flags;
+ int err;
/* spinlock is needed to avoid racing with the
* parallel delete. Otherwise, publishing an already
* deleted sdata to the cache will become a use-after-free
* problem in the next bpf_local_storage_lookup().
*/
- raw_spin_lock_irqsave(&local_storage->lock, flags);
+ err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
+ if (err)
+ return;
+
if (selem_linked_to_storage(selem))
rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem));
- raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
}
static int check_flags(const struct bpf_local_storage_data *old_sdata,
@@ -424,6 +553,8 @@ int bpf_local_storage_alloc(void *owner,
{
struct bpf_local_storage *prev_storage, *storage;
struct bpf_local_storage **owner_storage_ptr;
+ struct bpf_local_storage_map_bucket *b;
+ unsigned long flags;
int err;
err = mem_charge(smap, owner, sizeof(*storage));
@@ -441,14 +572,21 @@ int bpf_local_storage_alloc(void *owner,
goto uncharge;
}
- RCU_INIT_POINTER(storage->smap, smap);
INIT_HLIST_HEAD(&storage->list);
- raw_spin_lock_init(&storage->lock);
+ raw_res_spin_lock_init(&storage->lock);
storage->owner = owner;
+ storage->mem_charge = sizeof(*storage);
storage->use_kmalloc_nolock = smap->use_kmalloc_nolock;
+ refcount_set(&storage->owner_refcnt, 1);
bpf_selem_link_storage_nolock(storage, first_selem);
- bpf_selem_link_map(smap, first_selem);
+
+ b = select_bucket(smap, storage);
+ err = raw_res_spin_lock_irqsave(&b->lock, flags);
+ if (err)
+ goto uncharge;
+
+ bpf_selem_link_map_nolock(b, first_selem);
owner_storage_ptr =
(struct bpf_local_storage **)owner_storage(smap, owner);
@@ -464,10 +602,12 @@ int bpf_local_storage_alloc(void *owner,
*/
prev_storage = cmpxchg(owner_storage_ptr, NULL, storage);
if (unlikely(prev_storage)) {
- bpf_selem_unlink_map(first_selem);
+ bpf_selem_unlink_map_nolock(first_selem);
+ raw_res_spin_unlock_irqrestore(&b->lock, flags);
err = -EAGAIN;
goto uncharge;
}
+ raw_res_spin_unlock_irqrestore(&b->lock, flags);
return 0;
@@ -489,8 +629,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
struct bpf_local_storage_data *old_sdata = NULL;
struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
struct bpf_local_storage *local_storage;
+ struct bpf_local_storage_map_bucket *b;
HLIST_HEAD(old_selem_free_list);
- unsigned long flags;
+ unsigned long flags, b_flags;
int err;
/* BPF_EXIST and BPF_NOEXIST cannot be both set */
@@ -549,7 +690,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (!alloc_selem)
return ERR_PTR(-ENOMEM);
- raw_spin_lock_irqsave(&local_storage->lock, flags);
+ err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
+ if (err)
+ goto free_selem;
/* Recheck local_storage->list under local_storage->lock */
if (unlikely(hlist_empty(&local_storage->list))) {
@@ -574,22 +717,30 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
goto unlock;
}
+ b = select_bucket(smap, local_storage);
+
+ err = raw_res_spin_lock_irqsave(&b->lock, b_flags);
+ if (err)
+ goto unlock;
+
alloc_selem = NULL;
/* First, link the new selem to the map */
- bpf_selem_link_map(smap, selem);
+ bpf_selem_link_map_nolock(b, selem);
/* Second, link (and publish) the new selem to local_storage */
bpf_selem_link_storage_nolock(local_storage, selem);
/* Third, remove old selem, SELEM(old_sdata) */
if (old_sdata) {
- bpf_selem_unlink_map(SELEM(old_sdata));
+ bpf_selem_unlink_map_nolock(SELEM(old_sdata));
bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
&old_selem_free_list);
}
+ raw_res_spin_unlock_irqrestore(&b->lock, b_flags);
unlock:
- raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
+free_selem:
bpf_selem_free_list(&old_selem_free_list, false);
if (alloc_selem) {
mem_uncharge(smap, owner, smap->elem_size);
@@ -657,13 +808,13 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
return 0;
}
-void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
+/*
+ * Destroy local storage when the owner is going away. Caller must uncharge memory
+ * if memory charging is used.
+ */
+u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
{
struct bpf_local_storage_elem *selem;
- bool free_storage = false;
- HLIST_HEAD(free_selem_list);
- struct hlist_node *n;
- unsigned long flags;
/* Neither the bpf_prog nor the bpf_map's syscall
* could be modifying the local_storage->list now.
@@ -674,27 +825,20 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
* when unlinking elem from the local_storage->list and
* the map's bucket->list.
*/
- raw_spin_lock_irqsave(&local_storage->lock, flags);
- hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
- /* Always unlink from map before unlinking from
- * local_storage.
- */
- bpf_selem_unlink_map(selem);
- /* If local_storage list has only one element, the
- * bpf_selem_unlink_storage_nolock() will return true.
- * Otherwise, it will return false. The current loop iteration
- * intends to remove all local storage. So the last iteration
- * of the loop will set the free_cgroup_storage to true.
+ hlist_for_each_entry_rcu(selem, &local_storage->list, snode)
+ bpf_selem_unlink_nofail(selem, NULL);
+
+ if (!refcount_dec_and_test(&local_storage->owner_refcnt)) {
+ while (refcount_read(&local_storage->owner_refcnt))
+ cpu_relax();
+ /*
+ * Paired with refcount_dec() in bpf_selem_unlink_nofail()
+ * to make sure destroy() sees the correct local_storage->mem_charge.
*/
- free_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, &free_selem_list);
+ smp_mb();
}
- raw_spin_unlock_irqrestore(&local_storage->lock, flags);
-
- bpf_selem_free_list(&free_selem_list, true);
- if (free_storage)
- bpf_local_storage_free(local_storage, true);
+ return local_storage->mem_charge;
}
u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
@@ -736,7 +880,7 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
for (i = 0; i < nbuckets; i++) {
INIT_HLIST_HEAD(&smap->buckets[i].list);
- raw_spin_lock_init(&smap->buckets[i].lock);
+ raw_res_spin_lock_init(&smap->buckets[i].lock);
}
smap->elem_size = offsetof(struct bpf_local_storage_elem,
@@ -758,8 +902,7 @@ free_smap:
}
void bpf_local_storage_map_free(struct bpf_map *map,
- struct bpf_local_storage_cache *cache,
- int __percpu *busy_counter)
+ struct bpf_local_storage_cache *cache)
{
struct bpf_local_storage_map_bucket *b;
struct bpf_local_storage_elem *selem;
@@ -789,15 +932,14 @@ void bpf_local_storage_map_free(struct bpf_map *map,
rcu_read_lock();
/* No one is adding to b->list now */
- while ((selem = hlist_entry_safe(
- rcu_dereference_raw(hlist_first_rcu(&b->list)),
- struct bpf_local_storage_elem, map_node))) {
- if (busy_counter)
- this_cpu_inc(*busy_counter);
- bpf_selem_unlink(selem, true);
- if (busy_counter)
- this_cpu_dec(*busy_counter);
- cond_resched_rcu();
+restart:
+ hlist_for_each_entry_rcu(selem, &b->list, map_node) {
+ bpf_selem_unlink_nofail(selem, b);
+
+ if (need_resched()) {
+ cond_resched_rcu();
+ goto restart;
+ }
}
rcu_read_unlock();
}
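
bpf_selem_unlink_nofail() lets the map_free() and destroy() paths race safely: each path ORs its own "unlinked" bit into selem->state, and only the path whose cmpxchg moves the state from SELEM_UNLINKED to SELEM_TOFREE frees the element. A userspace sketch of that handoff; the bit values are assumptions, only the shape of the cmpxchg handoff matters:

```c
/* Sketch of the two-path handoff guarded by selem->state above. The bit
 * values are assumed, not the kernel's SELEM_* constants. */
#include <stdatomic.h>
#include <stdio.h>

#define MAP_UNLINKED     0x1	/* assumed SELEM_MAP_UNLINKED */
#define STORAGE_UNLINKED 0x2	/* assumed SELEM_STORAGE_UNLINKED */
#define UNLINKED         (MAP_UNLINKED | STORAGE_UNLINKED)
#define TOFREE           0x4	/* assumed SELEM_TOFREE */

static void unlink_one_side(atomic_int *state, int my_bit)
{
	int expected = UNLINKED;

	atomic_fetch_or(state, my_bit);
	/* whichever path observes both bits set wins the cmpxchg and frees */
	if (atomic_compare_exchange_strong(state, &expected, TOFREE))
		printf("path 0x%x frees the element\n", my_bit);
	else
		printf("path 0x%x leaves the free to the other side\n", my_bit);
}

int main(void)
{
	atomic_int state = 0;

	unlink_one_side(&state, MAP_UNLINKED);	    /* map_free() path */
	unlink_one_side(&state, STORAGE_UNLINKED); /* destroy() path */
	return 0;
}
```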
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 7cb6e8d4282c..0c4a0c8e6f70 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -18,10 +18,11 @@
#include <linux/bpf-cgroup.h>
/* For every LSM hook that allows attachment of BPF programs, declare a nop
- * function where a BPF program can be attached.
+ * function where a BPF program can be attached. Notably, we qualify each with
+ * weak linkage such that strong overrides can be implemented if need be.
*/
#define LSM_HOOK(RET, DEFAULT, NAME, ...) \
-noinline RET bpf_lsm_##NAME(__VA_ARGS__) \
+__weak noinline RET bpf_lsm_##NAME(__VA_ARGS__) \
{ \
return DEFAULT; \
}
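
Marking the generated stubs __weak lets a strong definition elsewhere (as bpf_lsm_proto.c does below) replace them at link time. A tiny standalone illustration of weak vs. strong linkage on GCC/Clang ELF targets; hook_mmap_file() is a made-up name, not a kernel symbol:

```c
/* Illustration of the weak/strong override relied on above. Linking another
 * object that provides a non-weak hook_mmap_file() silently replaces this
 * default at link time; on its own, the weak default is used. */
#include <stdio.h>

__attribute__((weak)) int hook_mmap_file(void *file)
{
	return 0;	/* default stub, like the generated bpf_lsm_##NAME() */
}

int main(void)
{
	printf("hook returned %d\n", hook_mmap_file(NULL));
	return 0;
}
```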
diff --git a/kernel/bpf/bpf_lsm_proto.c b/kernel/bpf/bpf_lsm_proto.c
new file mode 100644
index 000000000000..44a54fd8045e
--- /dev/null
+++ b/kernel/bpf/bpf_lsm_proto.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2025 Google LLC.
+ */
+
+#include <linux/fs.h>
+#include <linux/bpf_lsm.h>
+
+/*
+ * Strong definition of the mmap_file() BPF LSM hook. The __nullable suffix on
+ * the struct file pointer parameter name marks it as PTR_MAYBE_NULL. This
+ * explicitly enforces that BPF LSM programs check for NULL before attempting to
+ * dereference it.
+ */
+int bpf_lsm_mmap_file(struct file *file__nullable, unsigned long reqprot,
+ unsigned long prot, unsigned long flags)
+{
+ return 0;
+}
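
The __nullable suffix makes the verifier treat the file argument as PTR_MAYBE_NULL, so any program attaching to this hook must check for NULL before dereferencing it. A hedged sketch of what that looks like on the BPF program side, written in the usual libbpf SEC()/BPF_PROG() style (this program is not part of the patch):

```c
/* Hedged sketch of an LSM program honouring the NULL check the __nullable
 * annotation forces; libbpf-style skeleton, names are illustrative. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

SEC("lsm/mmap_file")
int BPF_PROG(check_mmap, struct file *file, unsigned long reqprot,
	     unsigned long prot, unsigned long flags)
{
	/* file is PTR_MAYBE_NULL here; the verifier rejects a dereference
	 * that is not guarded by this check */
	if (!file)
		return 0;

	/* ... inspect *file here ... */
	return 0;
}
```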
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 278490683d28..c43346cb3d76 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -533,6 +533,17 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
}
}
+static void bpf_struct_ops_map_dissoc_progs(struct bpf_struct_ops_map *st_map)
+{
+ u32 i;
+
+ for (i = 0; i < st_map->funcs_cnt; i++) {
+ if (!st_map->links[i])
+ break;
+ bpf_prog_disassoc_struct_ops(st_map->links[i]->prog);
+ }
+}
+
static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map)
{
int i;
@@ -801,6 +812,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
goto reset_unlock;
}
+ /* On conflict, the pointer is poisoned instead of returning an error, for backward compatibility */
+ bpf_prog_assoc_struct_ops(prog, &st_map->map);
+
link = kzalloc(sizeof(*link), GFP_USER);
if (!link) {
bpf_prog_put(prog);
@@ -980,6 +994,8 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
if (btf_is_module(st_map->btf))
module_put(st_map->st_ops_desc->st_ops->owner);
+ bpf_struct_ops_map_dissoc_progs(st_map);
+
bpf_struct_ops_map_del_ksyms(st_map);
/* The struct_ops's function may switch to another struct_ops.
@@ -1396,6 +1412,78 @@ err_out:
return err;
}
+int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map)
+{
+ struct bpf_map *st_ops_assoc;
+
+ guard(mutex)(&prog->aux->st_ops_assoc_mutex);
+
+ st_ops_assoc = rcu_dereference_protected(prog->aux->st_ops_assoc,
+ lockdep_is_held(&prog->aux->st_ops_assoc_mutex));
+ if (st_ops_assoc && st_ops_assoc == map)
+ return 0;
+
+ if (st_ops_assoc) {
+ if (prog->type != BPF_PROG_TYPE_STRUCT_OPS)
+ return -EBUSY;
+
+ rcu_assign_pointer(prog->aux->st_ops_assoc, BPF_PTR_POISON);
+ } else {
+ /*
+ * struct_ops map does not track associated non-struct_ops programs.
+ * Bump the refcount to make sure st_ops_assoc is always valid.
+ */
+ if (prog->type != BPF_PROG_TYPE_STRUCT_OPS)
+ bpf_map_inc(map);
+
+ rcu_assign_pointer(prog->aux->st_ops_assoc, map);
+ }
+
+ return 0;
+}
+
+void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog)
+{
+ struct bpf_map *st_ops_assoc;
+
+ guard(mutex)(&prog->aux->st_ops_assoc_mutex);
+
+ st_ops_assoc = rcu_dereference_protected(prog->aux->st_ops_assoc,
+ lockdep_is_held(&prog->aux->st_ops_assoc_mutex));
+ if (!st_ops_assoc || st_ops_assoc == BPF_PTR_POISON)
+ return;
+
+ if (prog->type != BPF_PROG_TYPE_STRUCT_OPS)
+ bpf_map_put(st_ops_assoc);
+
+ RCU_INIT_POINTER(prog->aux->st_ops_assoc, NULL);
+}
+
+/*
+ * Get a reference to the struct_ops struct (i.e., kdata) associated with a
+ * program. Should only be called in BPF program context (e.g., in a kfunc).
+ *
+ * If the returned pointer is not NULL, it points to a valid struct_ops.
+ * The struct_ops map is not guaranteed to be initialized or attached.
+ * Kernel struct_ops implementers are responsible for tracking and checking
+ * the state of the struct_ops if the use case requires an initialized or
+ * attached struct_ops.
+ */
+void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux)
+{
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *st_ops_assoc;
+
+ st_ops_assoc = rcu_dereference_check(aux->st_ops_assoc, bpf_rcu_lock_held());
+ if (!st_ops_assoc || st_ops_assoc == BPF_PTR_POISON)
+ return NULL;
+
+ st_map = (struct bpf_struct_ops_map *)st_ops_assoc;
+
+ return &st_map->kvalue.data;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_get_assoc_struct_ops);
+
void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
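
prog->aux->st_ops_assoc moves from NULL to a map on first association, stays put when re-associated with the same map, and is poisoned with BPF_PTR_POISON when a struct_ops program is tied to a second map; readers treat the poison like NULL. A userspace sketch of that state machine, with stand-in types and a stand-in poison sentinel:

```c
/* Sketch of the association state machine above; POISON and the struct
 * names are stand-ins, not the kernel's definitions. */
#include <stdio.h>

struct map { const char *name; };

#define POISON ((struct map *)-1L)	/* stand-in for BPF_PTR_POISON */

struct prog {
	struct map *st_ops_assoc;
	int is_struct_ops_prog;
};

static int assoc(struct prog *prog, struct map *map)
{
	if (prog->st_ops_assoc && prog->st_ops_assoc == map)
		return 0;			/* same map: nothing to do */
	if (prog->st_ops_assoc) {
		if (!prog->is_struct_ops_prog)
			return -1;		/* -EBUSY for non-struct_ops progs */
		prog->st_ops_assoc = POISON;	/* ambiguous: poison, pick neither */
	} else {
		prog->st_ops_assoc = map;
	}
	return 0;
}

static struct map *get_assoc(struct prog *prog)
{
	if (!prog->st_ops_assoc || prog->st_ops_assoc == POISON)
		return NULL;			/* readers treat poison like NULL */
	return prog->st_ops_assoc;
}

int main(void)
{
	struct map a = { "ops_a" }, b = { "ops_b" };
	struct prog p = { .is_struct_ops_prog = 1 };

	assoc(&p, &a);
	printf("assoc: %s\n", get_assoc(&p) ? get_assoc(&p)->name : "(none)");
	assoc(&p, &b);				/* second map poisons the pointer */
	printf("after second assoc: %s\n",
	       get_assoc(&p) ? get_assoc(&p)->name : "(none)");
	return 0;
}
```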
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index a1dc1bf0848a..605506792b5b 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -20,29 +20,6 @@
DEFINE_BPF_STORAGE_CACHE(task_cache);
-static DEFINE_PER_CPU(int, bpf_task_storage_busy);
-
-static void bpf_task_storage_lock(void)
-{
- cant_migrate();
- this_cpu_inc(bpf_task_storage_busy);
-}
-
-static void bpf_task_storage_unlock(void)
-{
- this_cpu_dec(bpf_task_storage_busy);
-}
-
-static bool bpf_task_storage_trylock(void)
-{
- cant_migrate();
- if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
- this_cpu_dec(bpf_task_storage_busy);
- return false;
- }
- return true;
-}
-
static struct bpf_local_storage __rcu **task_storage_ptr(void *owner)
{
struct task_struct *task = owner;
@@ -70,17 +47,15 @@ void bpf_task_storage_free(struct task_struct *task)
{
struct bpf_local_storage *local_storage;
- rcu_read_lock_dont_migrate();
+ rcu_read_lock();
local_storage = rcu_dereference(task->bpf_storage);
if (!local_storage)
goto out;
- bpf_task_storage_lock();
bpf_local_storage_destroy(local_storage);
- bpf_task_storage_unlock();
out:
- rcu_read_unlock_migrate();
+ rcu_read_unlock();
}
static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
@@ -106,9 +81,7 @@ static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
goto out;
}
- bpf_task_storage_lock();
sdata = task_storage_lookup(task, map, true);
- bpf_task_storage_unlock();
put_pid(pid);
return sdata ? sdata->data : NULL;
out:
@@ -143,11 +116,9 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
goto out;
}
- bpf_task_storage_lock();
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value, map_flags,
true, GFP_ATOMIC);
- bpf_task_storage_unlock();
err = PTR_ERR_OR_ZERO(sdata);
out:
@@ -155,8 +126,7 @@ out:
return err;
}
-static int task_storage_delete(struct task_struct *task, struct bpf_map *map,
- bool nobusy)
+static int task_storage_delete(struct task_struct *task, struct bpf_map *map)
{
struct bpf_local_storage_data *sdata;
@@ -164,12 +134,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map,
if (!sdata)
return -ENOENT;
- if (!nobusy)
- return -EBUSY;
-
- bpf_selem_unlink(SELEM(sdata), false);
-
- return 0;
+ return bpf_selem_unlink(SELEM(sdata));
}
static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
@@ -194,111 +159,50 @@ static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
goto out;
}
- bpf_task_storage_lock();
- err = task_storage_delete(task, map, true);
- bpf_task_storage_unlock();
+ err = task_storage_delete(task, map);
out:
put_pid(pid);
return err;
}
-/* Called by bpf_task_storage_get*() helpers */
-static void *__bpf_task_storage_get(struct bpf_map *map,
- struct task_struct *task, void *value,
- u64 flags, gfp_t gfp_flags, bool nobusy)
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
+ task, void *, value, u64, flags, gfp_t, gfp_flags)
{
struct bpf_local_storage_data *sdata;
- sdata = task_storage_lookup(task, map, nobusy);
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task)
+ return (unsigned long)NULL;
+
+ sdata = task_storage_lookup(task, map, true);
if (sdata)
- return sdata->data;
+ return (unsigned long)sdata->data;
/* only allocate new storage, when the task is refcounted */
if (refcount_read(&task->usage) &&
- (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) {
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) {
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value,
BPF_NOEXIST, false, gfp_flags);
- return IS_ERR(sdata) ? NULL : sdata->data;
+ return IS_ERR(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
}
- return NULL;
-}
-
-/* *gfp_flags* is a hidden argument provided by the verifier */
-BPF_CALL_5(bpf_task_storage_get_recur, struct bpf_map *, map, struct task_struct *,
- task, void *, value, u64, flags, gfp_t, gfp_flags)
-{
- bool nobusy;
- void *data;
-
- WARN_ON_ONCE(!bpf_rcu_lock_held());
- if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task)
- return (unsigned long)NULL;
-
- nobusy = bpf_task_storage_trylock();
- data = __bpf_task_storage_get(map, task, value, flags,
- gfp_flags, nobusy);
- if (nobusy)
- bpf_task_storage_unlock();
- return (unsigned long)data;
-}
-
-/* *gfp_flags* is a hidden argument provided by the verifier */
-BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
- task, void *, value, u64, flags, gfp_t, gfp_flags)
-{
- void *data;
-
- WARN_ON_ONCE(!bpf_rcu_lock_held());
- if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task)
- return (unsigned long)NULL;
-
- bpf_task_storage_lock();
- data = __bpf_task_storage_get(map, task, value, flags,
- gfp_flags, true);
- bpf_task_storage_unlock();
- return (unsigned long)data;
-}
-
-BPF_CALL_2(bpf_task_storage_delete_recur, struct bpf_map *, map, struct task_struct *,
- task)
-{
- bool nobusy;
- int ret;
-
- WARN_ON_ONCE(!bpf_rcu_lock_held());
- if (!task)
- return -EINVAL;
-
- nobusy = bpf_task_storage_trylock();
- /* This helper must only be called from places where the lifetime of the task
- * is guaranteed. Either by being refcounted or by being protected
- * by an RCU read-side critical section.
- */
- ret = task_storage_delete(task, map, nobusy);
- if (nobusy)
- bpf_task_storage_unlock();
- return ret;
+ return (unsigned long)NULL;
}
BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
task)
{
- int ret;
-
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (!task)
return -EINVAL;
- bpf_task_storage_lock();
/* This helper must only be called from places where the lifetime of the task
* is guaranteed. Either by being refcounted or by being protected
* by an RCU read-side critical section.
*/
- ret = task_storage_delete(task, map, true);
- bpf_task_storage_unlock();
- return ret;
+ return task_storage_delete(task, map);
}
static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
@@ -313,7 +217,7 @@ static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
static void task_storage_map_free(struct bpf_map *map)
{
- bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy);
+ bpf_local_storage_map_free(map, &task_cache);
}
BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map)
@@ -332,17 +236,6 @@ const struct bpf_map_ops task_storage_map_ops = {
.map_owner_storage_ptr = task_storage_ptr,
};
-const struct bpf_func_proto bpf_task_storage_get_recur_proto = {
- .func = bpf_task_storage_get_recur,
- .gpl_only = false,
- .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
- .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
- .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
- .arg4_type = ARG_ANYTHING,
-};
-
const struct bpf_func_proto bpf_task_storage_get_proto = {
.func = bpf_task_storage_get,
.gpl_only = false,
@@ -354,15 +247,6 @@ const struct bpf_func_proto bpf_task_storage_get_proto = {
.arg4_type = ARG_ANYTHING,
};
-const struct bpf_func_proto bpf_task_storage_delete_recur_proto = {
- .func = bpf_task_storage_delete_recur,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
- .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
-};
-
const struct bpf_func_proto bpf_task_storage_delete_proto = {
.func = bpf_task_storage_delete,
.gpl_only = false,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 0de8fc8a0e0b..7708958e3fb8 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -25,6 +25,7 @@
#include <linux/perf_event.h>
#include <linux/bsearch.h>
#include <linux/kobject.h>
+#include <linux/string.h>
#include <linux/sysfs.h>
#include <linux/overflow.h>
@@ -259,6 +260,7 @@ struct btf {
void *nohdr_data;
struct btf_header hdr;
u32 nr_types; /* includes VOID for base BTF */
+ u32 named_start_id;
u32 types_size;
u32 data_size;
refcount_t refcnt;
@@ -494,6 +496,11 @@ static bool btf_type_is_modifier(const struct btf_type *t)
return false;
}
+static int btf_start_id(const struct btf *btf)
+{
+ return btf->start_id + (btf->base_btf ? 0 : 1);
+}
+
bool btf_type_is_void(const struct btf_type *t)
{
return t == &btf_void;
@@ -544,21 +551,125 @@ u32 btf_nr_types(const struct btf *btf)
return total;
}
+/*
+ * Note that vmlinux and kernel module BTFs are always sorted
+ * during the building phase.
+ */
+static void btf_check_sorted(struct btf *btf)
+{
+ u32 i, n, named_start_id = 0;
+
+ n = btf_nr_types(btf);
+ if (btf_is_vmlinux(btf)) {
+ for (i = btf_start_id(btf); i < n; i++) {
+ const struct btf_type *t = btf_type_by_id(btf, i);
+			const char *name = btf_name_by_offset(btf, t->name_off);
+
+			if (name[0] != '\0') {
+ btf->named_start_id = i;
+ return;
+ }
+ }
+ return;
+ }
+
+ for (i = btf_start_id(btf) + 1; i < n; i++) {
+ const struct btf_type *ta = btf_type_by_id(btf, i - 1);
+ const struct btf_type *tb = btf_type_by_id(btf, i);
+ const char *na = btf_name_by_offset(btf, ta->name_off);
+ const char *nb = btf_name_by_offset(btf, tb->name_off);
+
+ if (strcmp(na, nb) > 0)
+ return;
+
+ if (named_start_id == 0 && na[0] != '\0')
+ named_start_id = i - 1;
+ if (named_start_id == 0 && nb[0] != '\0')
+ named_start_id = i;
+ }
+
+ if (named_start_id)
+ btf->named_start_id = named_start_id;
+}
+
+/*
+ * btf_named_start_id - Get the named starting ID for the BTF
+ * @btf: Pointer to the target BTF object
+ * @own: Flag indicating whether to query only the current BTF (true = current BTF only,
+ * false = recursively traverse the base BTF chain)
+ *
+ * Return value rules:
+ * 1. For a sorted btf, return its named_start_id
+ * 2. Else for a split BTF, return its start_id
+ * 3. Else for a base BTF, return 1
+ */
+u32 btf_named_start_id(const struct btf *btf, bool own)
+{
+ const struct btf *base_btf = btf;
+
+ while (!own && base_btf->base_btf)
+ base_btf = base_btf->base_btf;
+
+ return base_btf->named_start_id ?: (base_btf->start_id ?: 1);
+}
+
+static s32 btf_find_by_name_kind_bsearch(const struct btf *btf, const char *name)
+{
+ const struct btf_type *t;
+ const char *tname;
+ s32 l, r, m;
+
+ l = btf_named_start_id(btf, true);
+ r = btf_nr_types(btf) - 1;
+ while (l <= r) {
+ m = l + (r - l) / 2;
+ t = btf_type_by_id(btf, m);
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (strcmp(tname, name) >= 0) {
+ if (l == r)
+ return r;
+ r = m;
+ } else {
+ l = m + 1;
+ }
+ }
+
+ return btf_nr_types(btf);
+}
+
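The helper above is a lower-bound search: it returns the smallest id whose name compares greater than or equal to the target, and btf_find_by_name_kind() below then walks forward across equal names until the kind matches. A minimal standalone illustration of the same invariant over a plain sorted string table (not kernel code, duplicate entries stand in for same-name BTF types of different kinds):

#include <stdio.h>
#include <string.h>

/* Return the first index whose entry compares >= name, or n if none does. */
static int lower_bound(const char * const *tab, int n, const char *name)
{
	int l = 0, r = n - 1, m;

	while (l <= r) {
		m = l + (r - l) / 2;
		if (strcmp(tab[m], name) >= 0) {
			if (l == r)
				return r;
			r = m;
		} else {
			l = m + 1;
		}
	}
	return n;
}

int main(void)
{
	const char *tab[] = { "alpha", "bpf_map", "bpf_map", "task_struct" };

	/* Prints 1: the first of the duplicate "bpf_map" entries */
	printf("%d\n", lower_bound(tab, 4, "bpf_map"));
	return 0;
}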
s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
{
+ const struct btf *base_btf = btf_base_btf(btf);
const struct btf_type *t;
const char *tname;
- u32 i, total;
+ s32 id, total;
- total = btf_nr_types(btf);
- for (i = 1; i < total; i++) {
- t = btf_type_by_id(btf, i);
- if (BTF_INFO_KIND(t->info) != kind)
- continue;
+ if (base_btf) {
+ id = btf_find_by_name_kind(base_btf, name, kind);
+ if (id > 0)
+ return id;
+ }
- tname = btf_name_by_offset(btf, t->name_off);
- if (!strcmp(tname, name))
- return i;
+ total = btf_nr_types(btf);
+ if (btf->named_start_id > 0 && name[0]) {
+ id = btf_find_by_name_kind_bsearch(btf, name);
+ for (; id < total; id++) {
+ t = btf_type_by_id(btf, id);
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (strcmp(tname, name) != 0)
+ return -ENOENT;
+ if (BTF_INFO_KIND(t->info) == kind)
+ return id;
+ }
+ } else {
+ for (id = btf_start_id(btf); id < total; id++) {
+ t = btf_type_by_id(btf, id);
+ if (BTF_INFO_KIND(t->info) != kind)
+ continue;
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (strcmp(tname, name) == 0)
+ return id;
+ }
}
return -ENOENT;
@@ -3424,7 +3535,8 @@ const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type
const struct btf_type *t;
int len, id;
- id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, 0);
+ id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key,
+ btf_named_start_id(btf, false) - 1);
if (id < 0)
return ERR_PTR(id);
@@ -5791,6 +5903,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
goto errout;
}
env->btf = btf;
+ btf->named_start_id = 0;
data = kvmalloc(attr->btf_size, GFP_KERNEL | __GFP_NOWARN);
if (!data) {
@@ -6107,6 +6220,7 @@ static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
case BPF_MODIFY_RETURN:
+ case BPF_TRACE_FSESSION:
/* allow u64* as ctx */
if (btf_is_int(t) && t->size == 8)
return 0;
@@ -6210,7 +6324,8 @@ static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name
btf->data = data;
btf->data_size = data_size;
btf->kernel_btf = true;
- snprintf(btf->name, sizeof(btf->name), "%s", name);
+ btf->named_start_id = 0;
+ strscpy(btf->name, name);
err = btf_parse_hdr(env);
if (err)
@@ -6230,6 +6345,7 @@ static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name
if (err)
goto errout;
+ btf_check_sorted(btf);
refcount_set(&btf->refcnt, 1);
return btf;
@@ -6327,7 +6443,8 @@ static struct btf *btf_parse_module(const char *module_name, const void *data,
btf->start_id = base_btf->nr_types;
btf->start_str_off = base_btf->hdr.str_len;
btf->kernel_btf = true;
- snprintf(btf->name, sizeof(btf->name), "%s", module_name);
+ btf->named_start_id = 0;
+ strscpy(btf->name, module_name);
btf->data = kvmemdup(data, data_size, GFP_KERNEL | __GFP_NOWARN);
if (!btf->data) {
@@ -6363,6 +6480,7 @@ static struct btf *btf_parse_module(const char *module_name, const void *data,
}
btf_verifier_env_free(env);
+ btf_check_sorted(btf);
refcount_set(&btf->refcnt, 1);
return btf;
@@ -6704,6 +6822,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
fallthrough;
case BPF_LSM_CGROUP:
case BPF_TRACE_FEXIT:
+ case BPF_TRACE_FSESSION:
/* When LSM programs are attached to void LSM hooks
* they use FEXIT trampolines and when attached to
* int LSM hooks, they use MODIFY_RETURN trampolines.
@@ -7729,12 +7848,13 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
tname);
return -EINVAL;
}
+
/* Convert BTF function arguments into verifier types.
* Only PTR_TO_CTX and SCALAR are supported atm.
*/
for (i = 0; i < nargs; i++) {
u32 tags = 0;
- int id = 0;
+ int id = btf_named_start_id(btf, false) - 1;
/* 'arg:<tag>' decl_tag takes precedence over derivation of
* register type from BTF type itself
@@ -8640,24 +8760,17 @@ end:
return ret;
}
-static u32 *__btf_kfunc_id_set_contains(const struct btf *btf,
- enum btf_kfunc_hook hook,
- u32 kfunc_btf_id,
- const struct bpf_prog *prog)
+static u32 *btf_kfunc_id_set_contains(const struct btf *btf,
+ enum btf_kfunc_hook hook,
+ u32 kfunc_btf_id)
{
- struct btf_kfunc_hook_filter *hook_filter;
struct btf_id_set8 *set;
- u32 *id, i;
+ u32 *id;
if (hook >= BTF_KFUNC_HOOK_MAX)
return NULL;
if (!btf->kfunc_set_tab)
return NULL;
- hook_filter = &btf->kfunc_set_tab->hook_filters[hook];
- for (i = 0; i < hook_filter->nr_filters; i++) {
- if (hook_filter->filters[i](prog, kfunc_btf_id))
- return NULL;
- }
set = btf->kfunc_set_tab->sets[hook];
if (!set)
return NULL;
@@ -8668,6 +8781,28 @@ static u32 *__btf_kfunc_id_set_contains(const struct btf *btf,
return id + 1;
}
+static bool __btf_kfunc_is_allowed(const struct btf *btf,
+ enum btf_kfunc_hook hook,
+ u32 kfunc_btf_id,
+ const struct bpf_prog *prog)
+{
+ struct btf_kfunc_hook_filter *hook_filter;
+ int i;
+
+ if (hook >= BTF_KFUNC_HOOK_MAX)
+ return false;
+ if (!btf->kfunc_set_tab)
+ return false;
+
+ hook_filter = &btf->kfunc_set_tab->hook_filters[hook];
+ for (i = 0; i < hook_filter->nr_filters; i++) {
+ if (hook_filter->filters[i](prog, kfunc_btf_id))
+ return false;
+ }
+
+ return true;
+}
+
static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
{
switch (prog_type) {
@@ -8681,6 +8816,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
return BTF_KFUNC_HOOK_STRUCT_OPS;
case BPF_PROG_TYPE_TRACING:
case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_LSM:
return BTF_KFUNC_HOOK_TRACING;
@@ -8714,6 +8850,26 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
}
}
+bool btf_kfunc_is_allowed(const struct btf *btf,
+ u32 kfunc_btf_id,
+ const struct bpf_prog *prog)
+{
+ enum bpf_prog_type prog_type = resolve_prog_type(prog);
+ enum btf_kfunc_hook hook;
+ u32 *kfunc_flags;
+
+ kfunc_flags = btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id);
+ if (kfunc_flags && __btf_kfunc_is_allowed(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog))
+ return true;
+
+ hook = bpf_prog_type_to_kfunc_hook(prog_type);
+ kfunc_flags = btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id);
+ if (kfunc_flags && __btf_kfunc_is_allowed(btf, hook, kfunc_btf_id, prog))
+ return true;
+
+ return false;
+}
+
/* Caution:
* Reference to the module (obtained using btf_try_get_module) corresponding to
* the struct btf *MUST* be held when calling this function from verifier
@@ -8721,26 +8877,27 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
* keeping the reference for the duration of the call provides the necessary
* protection for looking up a well-formed btf->kfunc_set_tab.
*/
-u32 *btf_kfunc_id_set_contains(const struct btf *btf,
- u32 kfunc_btf_id,
- const struct bpf_prog *prog)
+u32 *btf_kfunc_flags(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog)
{
enum bpf_prog_type prog_type = resolve_prog_type(prog);
enum btf_kfunc_hook hook;
u32 *kfunc_flags;
- kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog);
+ kfunc_flags = btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id);
if (kfunc_flags)
return kfunc_flags;
hook = bpf_prog_type_to_kfunc_hook(prog_type);
- return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id, prog);
+ return btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id);
}
u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id,
const struct bpf_prog *prog)
{
- return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id, prog);
+ if (!__btf_kfunc_is_allowed(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id, prog))
+ return NULL;
+
+ return btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id);
}
static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
@@ -8845,6 +9002,13 @@ static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc
*/
if (!t || !btf_type_is_ptr(t))
return -EINVAL;
+
+ if (IS_ENABLED(CONFIG_CFI_CLANG)) {
+ /* Ensure the destructor kfunc type matches btf_dtor_kfunc_t */
+ t = btf_type_by_id(btf, t->type);
+ if (!btf_type_is_void(t))
+ return -EINVAL;
+ }
}
return 0;
}
@@ -9215,7 +9379,7 @@ bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id)
}
/* Attempt to find target candidates in vmlinux BTF first */
- cands = bpf_core_add_cands(cands, main_btf, 1);
+ cands = bpf_core_add_cands(cands, main_btf, btf_named_start_id(main_btf, true));
if (IS_ERR(cands))
return ERR_CAST(cands);
@@ -9247,7 +9411,7 @@ check_modules:
*/
btf_get(mod_btf);
spin_unlock_bh(&btf_idr_lock);
- cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf));
+ cands = bpf_core_add_cands(cands, mod_btf, btf_named_start_id(mod_btf, true));
btf_put(mod_btf);
if (IS_ERR(cands))
return ERR_CAST(cands);
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 69988af44b37..b029f0369ecf 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1680,11 +1680,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
struct cgroup *cgrp;
int ret;
- /* Check socket family since not all sockets represent network
- * endpoint (e.g. AF_UNIX).
- */
- if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 &&
- sk->sk_family != AF_UNIX)
+ if (!sk_is_inet(sk) && !sk_is_unix(sk))
return 0;
if (!ctx.uaddr) {
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
index f04a468cf6a7..fd51fe3d92cc 100644
--- a/kernel/bpf/cgroup_iter.c
+++ b/kernel/bpf/cgroup_iter.c
@@ -8,12 +8,13 @@
#include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */
-/* cgroup_iter provides four modes of traversal to the cgroup hierarchy.
+/* cgroup_iter provides five modes of traversal to the cgroup hierarchy.
*
* 1. Walk the descendants of a cgroup in pre-order.
* 2. Walk the descendants of a cgroup in post-order.
* 3. Walk the ancestors of a cgroup.
* 4. Show the given cgroup only.
+ * 5. Walk the children of a given parent cgroup.
*
* For walking descendants, cgroup_iter can walk in either pre-order or
* post-order. For walking ancestors, the iter walks up from a cgroup to
@@ -78,6 +79,8 @@ static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
return css_next_descendant_pre(NULL, p->start_css);
else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
return css_next_descendant_post(NULL, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_CHILDREN)
+ return css_next_child(NULL, p->start_css);
else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */
return p->start_css;
}
@@ -113,6 +116,8 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
return css_next_descendant_post(curr, p->start_css);
else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP)
return curr->parent;
+ else if (p->order == BPF_CGROUP_ITER_CHILDREN)
+ return css_next_child(curr, p->start_css);
else /* BPF_CGROUP_ITER_SELF_ONLY */
return NULL;
}
@@ -200,11 +205,16 @@ static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
int order = linfo->cgroup.order;
struct cgroup *cgrp;
- if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE &&
- order != BPF_CGROUP_ITER_DESCENDANTS_POST &&
- order != BPF_CGROUP_ITER_ANCESTORS_UP &&
- order != BPF_CGROUP_ITER_SELF_ONLY)
+ switch (order) {
+ case BPF_CGROUP_ITER_DESCENDANTS_PRE:
+ case BPF_CGROUP_ITER_DESCENDANTS_POST:
+ case BPF_CGROUP_ITER_ANCESTORS_UP:
+ case BPF_CGROUP_ITER_SELF_ONLY:
+ case BPF_CGROUP_ITER_CHILDREN:
+ break;
+ default:
return -EINVAL;
+ }
if (fd && id)
return -EINVAL;
@@ -257,6 +267,8 @@ show_order:
seq_puts(seq, "order: descendants_post\n");
else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP)
seq_puts(seq, "order: ancestors_up\n");
+ else if (aux->cgroup.order == BPF_CGROUP_ITER_CHILDREN)
+ seq_puts(seq, "order: children\n");
else /* BPF_CGROUP_ITER_SELF_ONLY */
seq_puts(seq, "order: self_only\n");
}
@@ -320,6 +332,7 @@ __bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it,
case BPF_CGROUP_ITER_DESCENDANTS_PRE:
case BPF_CGROUP_ITER_DESCENDANTS_POST:
case BPF_CGROUP_ITER_ANCESTORS_UP:
+ case BPF_CGROUP_ITER_CHILDREN:
break;
default:
return -EINVAL;
@@ -345,6 +358,9 @@ __bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *i
case BPF_CGROUP_ITER_DESCENDANTS_POST:
kit->pos = css_next_descendant_post(kit->pos, kit->start);
break;
+ case BPF_CGROUP_ITER_CHILDREN:
+ kit->pos = css_next_child(kit->pos, kit->start);
+ break;
case BPF_CGROUP_ITER_ANCESTORS_UP:
kit->pos = kit->pos ? kit->pos->parent : kit->start;
}
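Assuming the new BPF_CGROUP_ITER_CHILDREN order is exposed to BPF programs once this lands, a BPF-side user of the open-coded css iterator could walk only the direct children of a css roughly as below. Sketch only: the kfunc extern declarations follow the usual __ksym convention, and the caller is assumed to satisfy the locking the iterator requires (e.g. RCU read lock).

extern int bpf_iter_css_new(struct bpf_iter_css *it,
			    struct cgroup_subsys_state *start,
			    unsigned int flags) __weak __ksym;
extern struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym;
extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym;

static int count_children(struct cgroup_subsys_state *parent)
{
	struct bpf_iter_css it;
	int n = 0;

	if (bpf_iter_css_new(&it, parent, BPF_CGROUP_ITER_CHILDREN))
		return -1;
	while (bpf_iter_css_next(&it))
		n++;	/* visits direct children only, via css_next_child() */
	bpf_iter_css_destroy(&it);
	return n;
}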
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 1b9b18e5b03c..dc906dfdff94 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -112,7 +112,8 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
vfree(fp);
return NULL;
}
- fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
+ fp->active = __alloc_percpu_gfp(sizeof(u8[BPF_NR_CONTEXTS]), 4,
+ bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
if (!fp->active) {
vfree(fp);
kfree(aux);
@@ -136,6 +137,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
mutex_init(&fp->aux->used_maps_mutex);
mutex_init(&fp->aux->ext_mutex);
mutex_init(&fp->aux->dst_mutex);
+ mutex_init(&fp->aux->st_ops_assoc_mutex);
#ifdef CONFIG_BPF_SYSCALL
bpf_prog_stream_init(fp);
@@ -286,6 +288,7 @@ void __bpf_prog_free(struct bpf_prog *fp)
if (fp->aux) {
mutex_destroy(&fp->aux->used_maps_mutex);
mutex_destroy(&fp->aux->dst_mutex);
+ mutex_destroy(&fp->aux->st_ops_assoc_mutex);
kfree(fp->aux->poke_tab);
kfree(fp->aux);
}
@@ -2398,6 +2401,7 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map,
map->owner->type = prog_type;
map->owner->jited = fp->jited;
map->owner->xdp_has_frags = aux->xdp_has_frags;
+ map->owner->sleepable = fp->sleepable;
map->owner->expected_attach_type = fp->expected_attach_type;
map->owner->attach_func_proto = aux->attach_func_proto;
for_each_cgroup_storage_type(i) {
@@ -2409,7 +2413,8 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map,
} else {
ret = map->owner->type == prog_type &&
map->owner->jited == fp->jited &&
- map->owner->xdp_has_frags == aux->xdp_has_frags;
+ map->owner->xdp_has_frags == aux->xdp_has_frags &&
+ map->owner->sleepable == fp->sleepable;
if (ret &&
map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
map->owner->expected_attach_type != fp->expected_attach_type)
@@ -2912,6 +2917,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
#endif
bpf_free_used_maps(aux);
bpf_free_used_btfs(aux);
+ bpf_prog_disassoc_struct_ops(aux->prog);
if (bpf_prog_is_dev_bound(aux))
bpf_prog_dev_bound_destroy(aux->prog);
#ifdef CONFIG_PERF_EVENTS
@@ -3138,6 +3144,11 @@ bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena)
return false;
}
+bool __weak bpf_jit_supports_fsession(void)
+{
+ return false;
+}
+
u64 __weak bpf_arch_uaddress_limit(void)
{
#if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE)
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 703e5df1f4ef..04171fbc39cb 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -430,7 +430,7 @@ static struct bpf_cpu_map_entry *
__cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
u32 cpu)
{
- int numa, err, i, fd = value->bpf_prog.fd;
+ int numa, err = -ENOMEM, i, fd = value->bpf_prog.fd;
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct bpf_cpu_map_entry *rcpu;
struct xdp_bulk_queue *bq;
@@ -440,7 +440,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
rcpu = bpf_map_kmalloc_node(map, sizeof(*rcpu), gfp | __GFP_ZERO, numa);
if (!rcpu)
- return NULL;
+ return ERR_PTR(err);
/* Alloc percpu bulkq */
rcpu->bulkq = bpf_map_alloc_percpu(map, sizeof(*rcpu->bulkq),
@@ -468,16 +468,21 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
rcpu->value.qsize = value->qsize;
gro_init(&rcpu->gro);
- if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd))
- goto free_ptr_ring;
+ if (fd > 0) {
+ err = __cpu_map_load_bpf_program(rcpu, map, fd);
+ if (err)
+ goto free_ptr_ring;
+ }
/* Setup kthread */
init_completion(&rcpu->kthread_running);
rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
"cpumap/%d/map:%d", cpu,
map->id);
- if (IS_ERR(rcpu->kthread))
+ if (IS_ERR(rcpu->kthread)) {
+ err = PTR_ERR(rcpu->kthread);
goto free_prog;
+ }
/* Make sure kthread runs on a single CPU */
kthread_bind(rcpu->kthread, cpu);
@@ -503,7 +508,7 @@ free_bulkq:
free_percpu(rcpu->bulkq);
free_rcu:
kfree(rcpu);
- return NULL;
+ return ERR_PTR(err);
}
static void __cpu_map_entry_free(struct work_struct *work)
@@ -596,8 +601,8 @@ static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
} else {
/* Updating qsize cause re-allocation of bpf_cpu_map_entry */
rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu);
- if (!rcpu)
- return -ENOMEM;
+ if (IS_ERR(rcpu))
+ return PTR_ERR(rcpu);
}
rcu_read_lock();
__cpu_map_entry_replace(cmap, key_cpu, rcpu);
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 9876c5fe6c2a..b8c805b4b06a 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -477,7 +477,7 @@ __bpf_kfunc_end_defs();
BTF_KFUNCS_START(cpumask_kfunc_btf_ids)
BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE)
BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_first_and, KF_RCU)
diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c
index 83c4d9943084..7e75a1936256 100644
--- a/kernel/bpf/crypto.c
+++ b/kernel/bpf/crypto.c
@@ -60,7 +60,7 @@ struct bpf_crypto_ctx {
int bpf_crypto_register_type(const struct bpf_crypto_type *type)
{
struct bpf_crypto_type_list *node;
- int err = -EEXIST;
+ int err = -EBUSY;
down_write(&bpf_crypto_types_sem);
list_for_each_entry(node, &bpf_crypto_types, list) {
@@ -261,6 +261,12 @@ __bpf_kfunc void bpf_crypto_ctx_release(struct bpf_crypto_ctx *ctx)
call_rcu(&ctx->rcu, crypto_free_cb);
}
+__bpf_kfunc void bpf_crypto_ctx_release_dtor(void *ctx)
+{
+ bpf_crypto_ctx_release(ctx);
+}
+CFI_NOSEAL(bpf_crypto_ctx_release_dtor);
+
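The release_dtor wrapper above exists because, with CONFIG_CFI_CLANG, destructor kfuncs are called indirectly through btf_dtor_kfunc_t (void (*)(void *)), so the registered dtor's prototype has to match exactly. Other kfuncs with typed release prototypes would follow the same pattern; a hypothetical sketch (the bpf_foo names are illustrative, not part of the patch):

__bpf_kfunc void bpf_foo_release(struct bpf_foo *foo);	/* typed release kfunc */

/* void (*)(void *) wrapper so the indirect call matches btf_dtor_kfunc_t */
__bpf_kfunc void bpf_foo_release_dtor(void *foo)
{
	bpf_foo_release(foo);
}
CFI_NOSEAL(bpf_foo_release_dtor);

BTF_ID_LIST(bpf_foo_dtor_ids)
BTF_ID(struct, bpf_foo)
BTF_ID(func, bpf_foo_release_dtor)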
static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx,
const struct bpf_dynptr_kern *src,
const struct bpf_dynptr_kern *dst,
@@ -368,7 +374,7 @@ static const struct btf_kfunc_id_set crypt_kfunc_set = {
BTF_ID_LIST(bpf_crypto_dtor_ids)
BTF_ID(struct, bpf_crypto_ctx)
-BTF_ID(func, bpf_crypto_ctx_release)
+BTF_ID(func, bpf_crypto_ctx_release_dtor)
static int __init crypto_kfunc_init(void)
{
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index c8a9b27f8663..3b9d297a53be 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -82,9 +82,6 @@ struct bucket {
rqspinlock_t raw_lock;
};
-#define HASHTAB_MAP_LOCK_COUNT 8
-#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)
-
struct bpf_htab {
struct bpf_map map;
struct bpf_mem_alloc ma;
@@ -932,7 +929,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
}
static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
- void *value, bool onallcpus)
+ void *value, bool onallcpus, u64 map_flags)
{
void *ptr;
@@ -943,19 +940,28 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
bpf_obj_free_fields(htab->map.record, ptr);
} else {
u32 size = round_up(htab->map.value_size, 8);
- int off = 0, cpu;
+ void *val;
+ int cpu;
+
+ if (map_flags & BPF_F_CPU) {
+ cpu = map_flags >> 32;
+ ptr = per_cpu_ptr(pptr, cpu);
+ copy_map_value(&htab->map, ptr, value);
+ bpf_obj_free_fields(htab->map.record, ptr);
+ return;
+ }
for_each_possible_cpu(cpu) {
ptr = per_cpu_ptr(pptr, cpu);
- copy_map_value_long(&htab->map, ptr, value + off);
+ val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu;
+ copy_map_value(&htab->map, ptr, val);
bpf_obj_free_fields(htab->map.record, ptr);
- off += size;
}
}
}
static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
- void *value, bool onallcpus)
+ void *value, bool onallcpus, u64 map_flags)
{
/* When not setting the initial value on all cpus, zero-fill element
* values for other cpus. Otherwise, bpf program has no way to ensure
@@ -973,7 +979,7 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));
}
} else {
- pcpu_copy_value(htab, pptr, value, onallcpus);
+ pcpu_copy_value(htab, pptr, value, onallcpus, map_flags);
}
}
@@ -985,7 +991,7 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
void *value, u32 key_size, u32 hash,
bool percpu, bool onallcpus,
- struct htab_elem *old_elem)
+ struct htab_elem *old_elem, u64 map_flags)
{
u32 size = htab->map.value_size;
bool prealloc = htab_is_prealloc(htab);
@@ -1043,7 +1049,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
pptr = *(void __percpu **)ptr;
}
- pcpu_init_value(htab, pptr, value, onallcpus);
+ pcpu_init_value(htab, pptr, value, onallcpus, map_flags);
if (!prealloc)
htab_elem_set_ptr(l_new, key_size, pptr);
@@ -1147,7 +1153,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
}
l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
- l_old);
+ l_old, map_flags);
if (IS_ERR(l_new)) {
/* all pre-allocated elements are in use or memory exhausted */
ret = PTR_ERR(l_new);
@@ -1249,6 +1255,15 @@ err_lock_bucket:
return ret;
}
+static int htab_map_check_update_flags(bool onallcpus, u64 map_flags)
+{
+ if (unlikely(!onallcpus && map_flags > BPF_EXIST))
+ return -EINVAL;
+ if (unlikely(onallcpus && ((map_flags & BPF_F_LOCK) || (u32)map_flags > BPF_F_ALL_CPUS)))
+ return -EINVAL;
+ return 0;
+}
+
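For reference, assuming the BPF_F_CPU and BPF_F_ALL_CPUS flags land in the UAPI as this series suggests, a userspace caller would select the target CPU by placing its index in the upper 32 bits of the flags word, e.g. via libbpf's low-level map API (sketch; flag names and encoding taken from the patch):

#include <bpf/bpf.h>
#include <linux/bpf.h>	/* assumed to provide BPF_F_CPU once merged */

/* Update only the chosen CPU's slot of a per-CPU hash map element. */
static int update_one_cpu(int map_fd, __u32 key, const void *val, __u32 cpu)
{
	__u64 flags = BPF_F_CPU | ((__u64)cpu << 32);

	return bpf_map_update_elem(map_fd, &key, val, flags);
}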
static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
void *value, u64 map_flags,
bool percpu, bool onallcpus)
@@ -1262,9 +1277,9 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
u32 key_size, hash;
int ret;
- if (unlikely(map_flags > BPF_EXIST))
- /* unknown flags */
- return -EINVAL;
+ ret = htab_map_check_update_flags(onallcpus, map_flags);
+ if (unlikely(ret))
+ return ret;
WARN_ON_ONCE(!bpf_rcu_lock_held());
@@ -1289,7 +1304,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
/* Update value in-place */
if (percpu) {
pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
- value, onallcpus);
+ value, onallcpus, map_flags);
} else {
void **inner_map_pptr = htab_elem_value(l_old, key_size);
@@ -1298,7 +1313,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
}
} else {
l_new = alloc_htab_elem(htab, key, value, key_size,
- hash, percpu, onallcpus, NULL);
+ hash, percpu, onallcpus, NULL, map_flags);
if (IS_ERR(l_new)) {
ret = PTR_ERR(l_new);
goto err;
@@ -1324,9 +1339,9 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
u32 key_size, hash;
int ret;
- if (unlikely(map_flags > BPF_EXIST))
- /* unknown flags */
- return -EINVAL;
+ ret = htab_map_check_update_flags(onallcpus, map_flags);
+ if (unlikely(ret))
+ return ret;
WARN_ON_ONCE(!bpf_rcu_lock_held());
@@ -1363,10 +1378,10 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
/* per-cpu hash map can update value in-place */
pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
- value, onallcpus);
+ value, onallcpus, map_flags);
} else {
pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
- value, onallcpus);
+ value, onallcpus, map_flags);
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
l_new = NULL;
}
@@ -1678,9 +1693,9 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
u32 batch, max_count, size, bucket_size, map_id;
+ u64 elem_map_flags, map_flags, allowed_flags;
u32 bucket_cnt, total, key_size, value_size;
struct htab_elem *node_to_free = NULL;
- u64 elem_map_flags, map_flags;
struct hlist_nulls_head *head;
struct hlist_nulls_node *n;
unsigned long flags = 0;
@@ -1690,9 +1705,12 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
int ret = 0;
elem_map_flags = attr->batch.elem_flags;
- if ((elem_map_flags & ~BPF_F_LOCK) ||
- ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
- return -EINVAL;
+ allowed_flags = BPF_F_LOCK;
+ if (!do_delete && is_percpu)
+ allowed_flags |= BPF_F_CPU;
+ ret = bpf_map_check_op_flags(map, elem_map_flags, allowed_flags);
+ if (ret)
+ return ret;
map_flags = attr->batch.flags;
if (map_flags)
@@ -1715,7 +1733,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
key_size = htab->map.key_size;
value_size = htab->map.value_size;
size = round_up(value_size, 8);
- if (is_percpu)
+ if (is_percpu && !(elem_map_flags & BPF_F_CPU))
value_size = size * num_possible_cpus();
total = 0;
/* while experimenting with hash tables with sizes ranging from 10 to
@@ -1798,10 +1816,17 @@ again_nocopy:
void __percpu *pptr;
pptr = htab_elem_get_ptr(l, map->key_size);
- for_each_possible_cpu(cpu) {
- copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu));
- check_and_init_map_value(&htab->map, dst_val + off);
- off += size;
+ if (elem_map_flags & BPF_F_CPU) {
+ cpu = elem_map_flags >> 32;
+ copy_map_value(&htab->map, dst_val, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(&htab->map, dst_val);
+ } else {
+ for_each_possible_cpu(cpu) {
+ copy_map_value_long(&htab->map, dst_val + off,
+ per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(&htab->map, dst_val + off);
+ off += size;
+ }
}
} else {
value = htab_elem_value(l, key_size);
@@ -2209,11 +2234,11 @@ static u64 htab_map_mem_usage(const struct bpf_map *map)
bool prealloc = htab_is_prealloc(htab);
bool percpu = htab_is_percpu(htab);
bool lru = htab_is_lru(htab);
- u64 num_entries;
- u64 usage = sizeof(struct bpf_htab);
+ u64 num_entries, usage;
+
+ usage = sizeof(struct bpf_htab) +
+ sizeof(struct bucket) * htab->n_buckets;
- usage += sizeof(struct bucket) * htab->n_buckets;
- usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT;
if (prealloc) {
num_entries = map->max_entries;
if (htab_has_extra_elems(htab))
@@ -2357,7 +2382,7 @@ static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *k
return NULL;
}
-int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
+int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value, u64 map_flags)
{
struct htab_elem *l;
void __percpu *pptr;
@@ -2374,16 +2399,22 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
l = __htab_map_lookup_elem(map, key);
if (!l)
goto out;
+ ret = 0;
/* We do not mark LRU map element here in order to not mess up
* eviction heuristics when user space does a map walk.
*/
pptr = htab_elem_get_ptr(l, map->key_size);
+ if (map_flags & BPF_F_CPU) {
+ cpu = map_flags >> 32;
+ copy_map_value(map, value, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, value);
+ goto out;
+ }
for_each_possible_cpu(cpu) {
copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
check_and_init_map_value(map, value + off);
off += size;
}
- ret = 0;
out:
rcu_read_unlock();
return ret;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index db72b96f9c8c..7ac32798eb04 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1077,7 +1077,7 @@ const struct bpf_func_proto bpf_snprintf_proto = {
.func = bpf_snprintf,
.gpl_only = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg1_type = ARG_PTR_TO_MEM_OR_NULL | MEM_WRITE,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_PTR_TO_CONST_STR,
.arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
@@ -1095,16 +1095,34 @@ static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx)
return (void *)value - round_up(map->key_size, 8);
}
+enum bpf_async_type {
+ BPF_ASYNC_TYPE_TIMER = 0,
+ BPF_ASYNC_TYPE_WQ,
+};
+
+enum bpf_async_op {
+ BPF_ASYNC_START,
+ BPF_ASYNC_CANCEL
+};
+
+struct bpf_async_cmd {
+ struct llist_node node;
+ u64 nsec;
+ u32 mode;
+ enum bpf_async_op op;
+};
+
struct bpf_async_cb {
struct bpf_map *map;
struct bpf_prog *prog;
void __rcu *callback_fn;
void *value;
- union {
- struct rcu_head rcu;
- struct work_struct delete_work;
- };
+ struct rcu_head rcu;
u64 flags;
+ struct irq_work worker;
+ refcount_t refcnt;
+ enum bpf_async_type type;
+ struct llist_head async_cmds;
};
/* BPF map elements can contain 'struct bpf_timer'.
@@ -1132,7 +1150,6 @@ struct bpf_hrtimer {
struct bpf_work {
struct bpf_async_cb cb;
struct work_struct work;
- struct work_struct delete_work;
};
/* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */
@@ -1142,20 +1159,12 @@ struct bpf_async_kern {
struct bpf_hrtimer *timer;
struct bpf_work *work;
};
- /* bpf_spin_lock is used here instead of spinlock_t to make
- * sure that it always fits into space reserved by struct bpf_timer
- * regardless of LOCKDEP and spinlock debug flags.
- */
- struct bpf_spin_lock lock;
} __attribute__((aligned(8)));
-enum bpf_async_type {
- BPF_ASYNC_TYPE_TIMER = 0,
- BPF_ASYNC_TYPE_WQ,
-};
-
static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
+static void bpf_async_refcount_put(struct bpf_async_cb *cb);
+
static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
{
struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
@@ -1219,45 +1228,85 @@ static void bpf_async_cb_rcu_free(struct rcu_head *rcu)
{
struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu);
+ /*
+ * Drop the last reference to prog only after RCU GP, as set_callback()
+ * may race with cancel_and_free()
+ */
+ if (cb->prog)
+ bpf_prog_put(cb->prog);
+
kfree_nolock(cb);
}
-static void bpf_wq_delete_work(struct work_struct *work)
+/* Callback from call_rcu_tasks_trace, chains to call_rcu for final free */
+static void bpf_async_cb_rcu_tasks_trace_free(struct rcu_head *rcu)
{
- struct bpf_work *w = container_of(work, struct bpf_work, delete_work);
+ struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu);
+ struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb);
+ struct bpf_work *w = container_of(cb, struct bpf_work, cb);
+ bool retry = false;
- cancel_work_sync(&w->work);
+ /*
+ * bpf_async_cancel_and_free() tried to cancel timer/wq, but it
+ * could have raced with timer/wq_start. Now refcnt is zero and
+ * srcu/rcu GP completed. Cancel timer/wq again.
+ */
+ switch (cb->type) {
+ case BPF_ASYNC_TYPE_TIMER:
+ if (hrtimer_try_to_cancel(&t->timer) < 0)
+ retry = true;
+ break;
+ case BPF_ASYNC_TYPE_WQ:
+ if (!cancel_work(&w->work) && work_busy(&w->work))
+ retry = true;
+ break;
+ }
+ if (retry) {
+ /*
+ * hrtimer or wq callback may still be running. It must be
+ * in rcu_tasks_trace or rcu CS, so wait for GP again.
+ * It won't retry forever, since refcnt zero prevents all
+ * operations on timer/wq.
+ */
+ call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free);
+ return;
+ }
- call_rcu(&w->cb.rcu, bpf_async_cb_rcu_free);
+ /* rcu_trace_implies_rcu_gp() is true and will remain so */
+ bpf_async_cb_rcu_free(rcu);
}
-static void bpf_timer_delete_work(struct work_struct *work)
+static void worker_for_call_rcu(struct irq_work *work)
{
- struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work);
+ struct bpf_async_cb *cb = container_of(work, struct bpf_async_cb, worker);
- /* Cancel the timer and wait for callback to complete if it was running.
- * If hrtimer_cancel() can be safely called it's safe to call
- * call_rcu() right after for both preallocated and non-preallocated
- * maps. The async->cb = NULL was already done and no code path can see
- * address 't' anymore. Timer if armed for existing bpf_hrtimer before
- * bpf_timer_cancel_and_free will have been cancelled.
- */
- hrtimer_cancel(&t->timer);
- call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free);
+ call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free);
}
+static void bpf_async_refcount_put(struct bpf_async_cb *cb)
+{
+ if (!refcount_dec_and_test(&cb->refcnt))
+ return;
+
+ if (irqs_disabled()) {
+ cb->worker = IRQ_WORK_INIT(worker_for_call_rcu);
+ irq_work_queue(&cb->worker);
+ } else {
+ call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free);
+ }
+}
+
+static void bpf_async_cancel_and_free(struct bpf_async_kern *async);
+static void bpf_async_irq_worker(struct irq_work *work);
+
static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
enum bpf_async_type type)
{
- struct bpf_async_cb *cb;
+ struct bpf_async_cb *cb, *old_cb;
struct bpf_hrtimer *t;
struct bpf_work *w;
clockid_t clockid;
size_t size;
- int ret = 0;
-
- if (in_nmi())
- return -EOPNOTSUPP;
switch (type) {
case BPF_ASYNC_TYPE_TIMER:
@@ -1270,18 +1319,13 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
return -EINVAL;
}
- __bpf_spin_lock_irqsave(&async->lock);
- t = async->timer;
- if (t) {
- ret = -EBUSY;
- goto out;
- }
+ old_cb = READ_ONCE(async->cb);
+ if (old_cb)
+ return -EBUSY;
cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node);
- if (!cb) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!cb)
+ return -ENOMEM;
switch (type) {
case BPF_ASYNC_TYPE_TIMER:
@@ -1289,7 +1333,6 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
t = (struct bpf_hrtimer *)cb;
atomic_set(&t->cancelling, 0);
- INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work);
hrtimer_setup(&t->timer, bpf_timer_cb, clockid, HRTIMER_MODE_REL_SOFT);
cb->value = (void *)async - map->record->timer_off;
break;
@@ -1297,16 +1340,24 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
w = (struct bpf_work *)cb;
INIT_WORK(&w->work, bpf_wq_work);
- INIT_WORK(&w->delete_work, bpf_wq_delete_work);
cb->value = (void *)async - map->record->wq_off;
break;
}
cb->map = map;
cb->prog = NULL;
cb->flags = flags;
+ cb->worker = IRQ_WORK_INIT(bpf_async_irq_worker);
+ init_llist_head(&cb->async_cmds);
+ refcount_set(&cb->refcnt, 1); /* map's reference */
+ cb->type = type;
rcu_assign_pointer(cb->callback_fn, NULL);
- WRITE_ONCE(async->cb, cb);
+ old_cb = cmpxchg(&async->cb, NULL, cb);
+ if (old_cb) {
+ /* Lost the race to initialize this bpf_async_kern, drop the allocated object */
+ kfree_nolock(cb);
+ return -EBUSY;
+ }
/* Guarantee the order between async->cb and map->usercnt. So
* when there are concurrent uref release and bpf timer init, either
* bpf_timer_cancel_and_free() called by uref release reads a no-NULL
@@ -1317,13 +1368,11 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
/* maps with timers must be either held by user space
* or pinned in bpffs.
*/
- WRITE_ONCE(async->cb, NULL);
- kfree_nolock(cb);
- ret = -EPERM;
+ bpf_async_cancel_and_free(async);
+ return -EPERM;
}
-out:
- __bpf_spin_unlock_irqrestore(&async->lock);
- return ret;
+
+ return 0;
}
BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map,
@@ -1354,56 +1403,90 @@ static const struct bpf_func_proto bpf_timer_init_proto = {
.arg3_type = ARG_ANYTHING,
};
-static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn,
- struct bpf_prog_aux *aux, unsigned int flags,
- enum bpf_async_type type)
+static int bpf_async_update_prog_callback(struct bpf_async_cb *cb,
+ struct bpf_prog *prog,
+ void *callback_fn)
{
- struct bpf_prog *prev, *prog = aux->prog;
- struct bpf_async_cb *cb;
- int ret = 0;
+ struct bpf_prog *prev;
- if (in_nmi())
- return -EOPNOTSUPP;
- __bpf_spin_lock_irqsave(&async->lock);
- cb = async->cb;
- if (!cb) {
- ret = -EINVAL;
- goto out;
- }
- if (!atomic64_read(&cb->map->usercnt)) {
- /* maps with timers must be either held by user space
- * or pinned in bpffs. Otherwise timer might still be
- * running even when bpf prog is detached and user space
- * is gone, since map_release_uref won't ever be called.
- */
- ret = -EPERM;
- goto out;
+ /* Acquire a guard reference on prog to prevent it from being freed during the loop */
+ if (prog) {
+ prog = bpf_prog_inc_not_zero(prog);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
}
- prev = cb->prog;
- if (prev != prog) {
- /* Bump prog refcnt once. Every bpf_timer_set_callback()
- * can pick different callback_fn-s within the same prog.
+
+ do {
+ if (prog)
+ prog = bpf_prog_inc_not_zero(prog);
+ prev = xchg(&cb->prog, prog);
+ rcu_assign_pointer(cb->callback_fn, callback_fn);
+
+ /*
+	 * Release the previous prog. If another CPU is contending to set
+	 * cb->prog, make sure references are not leaked: each iteration
+	 * acquires and releases exactly one reference.
*/
- prog = bpf_prog_inc_not_zero(prog);
- if (IS_ERR(prog)) {
- ret = PTR_ERR(prog);
- goto out;
- }
if (prev)
- /* Drop prev prog refcnt when swapping with new prog */
bpf_prog_put(prev);
- cb->prog = prog;
+
+ } while (READ_ONCE(cb->prog) != prog ||
+ (void __force *)READ_ONCE(cb->callback_fn) != callback_fn);
+
+ if (prog)
+ bpf_prog_put(prog);
+
+ return 0;
+}
+
+static DEFINE_PER_CPU(struct bpf_async_cb *, async_cb_running);
+
+static int bpf_async_schedule_op(struct bpf_async_cb *cb, enum bpf_async_op op,
+ u64 nsec, u32 timer_mode)
+{
+ /*
+	 * Do not schedule another operation on this cpu if it is already in the
+	 * irq_work callback that is processing the async_cmds queue. Otherwise the following
+ * loop is possible:
+ * bpf_timer_start() -> bpf_async_schedule_op() -> irq_work_queue().
+ * irqrestore -> bpf_async_irq_worker() -> tracepoint -> bpf_timer_start().
+ */
+ if (this_cpu_read(async_cb_running) == cb) {
+ bpf_async_refcount_put(cb);
+ return -EDEADLK;
}
- rcu_assign_pointer(cb->callback_fn, callback_fn);
-out:
- __bpf_spin_unlock_irqrestore(&async->lock);
- return ret;
+
+ struct bpf_async_cmd *cmd = kmalloc_nolock(sizeof(*cmd), 0, NUMA_NO_NODE);
+
+ if (!cmd) {
+ bpf_async_refcount_put(cb);
+ return -ENOMEM;
+ }
+ init_llist_node(&cmd->node);
+ cmd->nsec = nsec;
+ cmd->mode = timer_mode;
+ cmd->op = op;
+ if (llist_add(&cmd->node, &cb->async_cmds))
+ irq_work_queue(&cb->worker);
+ return 0;
+}
+
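The command queue above is an instance of a common defer pattern: producers that may run with IRQs disabled only push a node onto a lockless llist and kick irq_work, and the irq_work callback drains the queue in a safer context. A stripped-down, self-contained sketch of that pattern (names are illustrative, not from the patch; ctx->work is assumed to be set up with IRQ_WORK_INIT(defer_worker) and ctx->cmds with init_llist_head()):

#include <linux/irq_work.h>
#include <linux/llist.h>
#include <linux/slab.h>

struct defer_cmd {
	struct llist_node node;
	int op;
};

struct defer_ctx {
	struct llist_head cmds;
	struct irq_work work;
};

static void defer_worker(struct irq_work *work)
{
	struct defer_ctx *ctx = container_of(work, struct defer_ctx, work);
	struct llist_node *pos, *n, *list;

	/* Drain in submission order; llist_del_all() returns LIFO order */
	list = llist_reverse_order(llist_del_all(&ctx->cmds));
	llist_for_each_safe(pos, n, list) {
		struct defer_cmd *cmd = container_of(pos, struct defer_cmd, node);

		/* ... perform cmd->op in irq_work context ... */
		kfree(cmd);
	}
}

static int defer_submit(struct defer_ctx *ctx, int op)
{
	struct defer_cmd *cmd = kmalloc(sizeof(*cmd), GFP_ATOMIC);

	if (!cmd)
		return -ENOMEM;
	cmd->op = op;
	/* llist_add() returns true when the list was previously empty */
	if (llist_add(&cmd->node, &ctx->cmds))
		irq_work_queue(&ctx->work);
	return 0;
}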
+static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn,
+ struct bpf_prog *prog)
+{
+ struct bpf_async_cb *cb;
+
+ cb = READ_ONCE(async->cb);
+ if (!cb)
+ return -EINVAL;
+
+ return bpf_async_update_prog_callback(cb, prog, callback_fn);
}
BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn,
struct bpf_prog_aux *, aux)
{
- return __bpf_async_set_callback(timer, callback_fn, aux, 0, BPF_ASYNC_TYPE_TIMER);
+ return __bpf_async_set_callback(timer, callback_fn, aux->prog);
}
static const struct bpf_func_proto bpf_timer_set_callback_proto = {
@@ -1414,22 +1497,22 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = {
.arg2_type = ARG_PTR_TO_FUNC,
};
-BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags)
+static bool defer_timer_wq_op(void)
+{
+ return in_hardirq() || irqs_disabled();
+}
+
+BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, async, u64, nsecs, u64, flags)
{
struct bpf_hrtimer *t;
- int ret = 0;
- enum hrtimer_mode mode;
+ u32 mode;
- if (in_nmi())
- return -EOPNOTSUPP;
if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN))
return -EINVAL;
- __bpf_spin_lock_irqsave(&timer->lock);
- t = timer->timer;
- if (!t || !t->cb.prog) {
- ret = -EINVAL;
- goto out;
- }
+
+ t = READ_ONCE(async->timer);
+ if (!t || !READ_ONCE(t->cb.prog))
+ return -EINVAL;
if (flags & BPF_F_TIMER_ABS)
mode = HRTIMER_MODE_ABS_SOFT;
@@ -1439,10 +1522,20 @@ BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, fla
if (flags & BPF_F_TIMER_CPU_PIN)
mode |= HRTIMER_MODE_PINNED;
- hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
-out:
- __bpf_spin_unlock_irqrestore(&timer->lock);
- return ret;
+ /*
+ * bpf_async_cancel_and_free() could have dropped refcnt to zero. In
+ * such case BPF progs are not allowed to arm the timer to prevent UAF.
+ */
+ if (!refcount_inc_not_zero(&t->cb.refcnt))
+ return -ENOENT;
+
+ if (!defer_timer_wq_op()) {
+ hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
+ bpf_async_refcount_put(&t->cb);
+ return 0;
+ } else {
+ return bpf_async_schedule_op(&t->cb, BPF_ASYNC_START, nsecs, mode);
+ }
}
static const struct bpf_func_proto bpf_timer_start_proto = {
@@ -1454,32 +1547,18 @@ static const struct bpf_func_proto bpf_timer_start_proto = {
.arg3_type = ARG_ANYTHING,
};
-static void drop_prog_refcnt(struct bpf_async_cb *async)
-{
- struct bpf_prog *prog = async->prog;
-
- if (prog) {
- bpf_prog_put(prog);
- async->prog = NULL;
- rcu_assign_pointer(async->callback_fn, NULL);
- }
-}
-
-BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
+BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, async)
{
struct bpf_hrtimer *t, *cur_t;
bool inc = false;
int ret = 0;
- if (in_nmi())
+ if (defer_timer_wq_op())
return -EOPNOTSUPP;
- rcu_read_lock();
- __bpf_spin_lock_irqsave(&timer->lock);
- t = timer->timer;
- if (!t) {
- ret = -EINVAL;
- goto out;
- }
+
+ t = READ_ONCE(async->timer);
+ if (!t)
+ return -EINVAL;
cur_t = this_cpu_read(hrtimer_running);
if (cur_t == t) {
@@ -1487,8 +1566,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
* its own timer the hrtimer_cancel() will deadlock
* since it waits for callback_fn to finish.
*/
- ret = -EDEADLK;
- goto out;
+ return -EDEADLK;
}
/* Only account in-flight cancellations when invoked from a timer
@@ -1511,20 +1589,17 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
* cancelling and waiting for it synchronously, since it might
* do the same. Bail!
*/
- ret = -EDEADLK;
- goto out;
+ atomic_dec(&t->cancelling);
+ return -EDEADLK;
}
drop:
- drop_prog_refcnt(&t->cb);
-out:
- __bpf_spin_unlock_irqrestore(&timer->lock);
+ bpf_async_update_prog_callback(&t->cb, NULL, NULL);
/* Cancel the timer and wait for associated callback to finish
* if it was running.
*/
- ret = ret ?: hrtimer_cancel(&t->timer);
+ ret = hrtimer_cancel(&t->timer);
if (inc)
atomic_dec(&t->cancelling);
- rcu_read_unlock();
return ret;
}
@@ -1535,107 +1610,107 @@ static const struct bpf_func_proto bpf_timer_cancel_proto = {
.arg1_type = ARG_PTR_TO_TIMER,
};
-static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *async)
+static void bpf_async_process_op(struct bpf_async_cb *cb, u32 op,
+ u64 timer_nsec, u32 timer_mode)
{
- struct bpf_async_cb *cb;
+ switch (cb->type) {
+ case BPF_ASYNC_TYPE_TIMER: {
+ struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb);
- /* Performance optimization: read async->cb without lock first. */
- if (!READ_ONCE(async->cb))
- return NULL;
+ switch (op) {
+ case BPF_ASYNC_START:
+ hrtimer_start(&t->timer, ns_to_ktime(timer_nsec), timer_mode);
+ break;
+ case BPF_ASYNC_CANCEL:
+ hrtimer_try_to_cancel(&t->timer);
+ break;
+ }
+ break;
+ }
+ case BPF_ASYNC_TYPE_WQ: {
+ struct bpf_work *w = container_of(cb, struct bpf_work, cb);
+
+ switch (op) {
+ case BPF_ASYNC_START:
+ schedule_work(&w->work);
+ break;
+ case BPF_ASYNC_CANCEL:
+ cancel_work(&w->work);
+ break;
+ }
+ break;
+ }
+ }
+ bpf_async_refcount_put(cb);
+}
- __bpf_spin_lock_irqsave(&async->lock);
- /* re-read it under lock */
- cb = async->cb;
- if (!cb)
- goto out;
- drop_prog_refcnt(cb);
- /* The subsequent bpf_timer_start/cancel() helpers won't be able to use
- * this timer, since it won't be initialized.
- */
- WRITE_ONCE(async->cb, NULL);
-out:
- __bpf_spin_unlock_irqrestore(&async->lock);
- return cb;
+static void bpf_async_irq_worker(struct irq_work *work)
+{
+ struct bpf_async_cb *cb = container_of(work, struct bpf_async_cb, worker);
+ struct llist_node *pos, *n, *list;
+
+ list = llist_del_all(&cb->async_cmds);
+ if (!list)
+ return;
+
+ list = llist_reverse_order(list);
+ this_cpu_write(async_cb_running, cb);
+ llist_for_each_safe(pos, n, list) {
+ struct bpf_async_cmd *cmd;
+
+ cmd = container_of(pos, struct bpf_async_cmd, node);
+ bpf_async_process_op(cb, cmd->op, cmd->nsec, cmd->mode);
+ kfree_nolock(cmd);
+ }
+ this_cpu_write(async_cb_running, NULL);
}
-/* This function is called by map_delete/update_elem for individual element and
- * by ops->map_release_uref when the user space reference to a map reaches zero.
- */
-void bpf_timer_cancel_and_free(void *val)
+static void bpf_async_cancel_and_free(struct bpf_async_kern *async)
{
- struct bpf_hrtimer *t;
+ struct bpf_async_cb *cb;
- t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val);
+ if (!READ_ONCE(async->cb))
+ return;
- if (!t)
+ cb = xchg(&async->cb, NULL);
+ if (!cb)
return;
- /* We check that bpf_map_delete/update_elem() was called from timer
- * callback_fn. In such case we don't call hrtimer_cancel() (since it
- * will deadlock) and don't call hrtimer_try_to_cancel() (since it will
- * just return -1). Though callback_fn is still running on this cpu it's
- * safe to do kfree(t) because bpf_timer_cb() read everything it needed
- * from 't'. The bpf subprog callback_fn won't be able to access 't',
- * since async->cb = NULL was already done. The timer will be
- * effectively cancelled because bpf_timer_cb() will return
- * HRTIMER_NORESTART.
- *
- * However, it is possible the timer callback_fn calling us armed the
- * timer _before_ calling us, such that failing to cancel it here will
- * cause it to possibly use struct hrtimer after freeing bpf_hrtimer.
- * Therefore, we _need_ to cancel any outstanding timers before we do
- * call_rcu, even though no more timers can be armed.
- *
- * Moreover, we need to schedule work even if timer does not belong to
- * the calling callback_fn, as on two different CPUs, we can end up in a
- * situation where both sides run in parallel, try to cancel one
- * another, and we end up waiting on both sides in hrtimer_cancel
- * without making forward progress, since timer1 depends on time2
- * callback to finish, and vice versa.
- *
- * CPU 1 (timer1_cb) CPU 2 (timer2_cb)
- * bpf_timer_cancel_and_free(timer2) bpf_timer_cancel_and_free(timer1)
- *
- * To avoid these issues, punt to workqueue context when we are in a
- * timer callback.
+
+ bpf_async_update_prog_callback(cb, NULL, NULL);
+ /*
+	 * No refcount_inc_not_zero(&cb->refcnt) here: we are dropping the last
+	 * refcnt, either synchronously or asynchronously via irq_work.
*/
- if (this_cpu_read(hrtimer_running)) {
- queue_work(system_dfl_wq, &t->cb.delete_work);
- return;
- }
- if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
- /* If the timer is running on other CPU, also use a kworker to
- * wait for the completion of the timer instead of trying to
- * acquire a sleepable lock in hrtimer_cancel() to wait for its
- * completion.
- */
- if (hrtimer_try_to_cancel(&t->timer) >= 0)
- call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free);
- else
- queue_work(system_dfl_wq, &t->cb.delete_work);
+ if (!defer_timer_wq_op()) {
+ bpf_async_process_op(cb, BPF_ASYNC_CANCEL, 0, 0);
} else {
- bpf_timer_delete_work(&t->cb.delete_work);
+ (void)bpf_async_schedule_op(cb, BPF_ASYNC_CANCEL, 0, 0);
+ /*
+	 * bpf_async_schedule_op() either enqueues the allocated cmd into the
+	 * llist or fails with ENOMEM and drops the last refcnt. The failure
+	 * is unlikely, but safe, since the bpf_async_cb_rcu_tasks_trace_free()
+	 * callback will do an additional timer/wq cancel to handle races anyway.
+ */
}
}
-/* This function is called by map_delete/update_elem for individual element and
+/*
+ * This function is called by map_delete/update_elem for individual element and
* by ops->map_release_uref when the user space reference to a map reaches zero.
*/
-void bpf_wq_cancel_and_free(void *val)
+void bpf_timer_cancel_and_free(void *val)
{
- struct bpf_work *work;
-
- BTF_TYPE_EMIT(struct bpf_wq);
+ bpf_async_cancel_and_free(val);
+}
- work = (struct bpf_work *)__bpf_async_cancel_and_free(val);
- if (!work)
- return;
- /* Trigger cancel of the sleepable work, but *do not* wait for
- * it to finish if it was running as we might not be in a
- * sleepable context.
- * kfree will be called once the work has finished.
- */
- schedule_work(&work->delete_work);
+/*
+ * This function is called by map_delete/update_elem for individual element and
+ * by ops->map_release_uref when the user space reference to a map reaches zero.
+ */
+void bpf_wq_cancel_and_free(void *val)
+{
+ bpf_async_cancel_and_free(val);
}
BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr)
@@ -2092,12 +2167,8 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_cgroup_classid_curr_proto;
#endif
case BPF_FUNC_task_storage_get:
- if (bpf_prog_check_recur(prog))
- return &bpf_task_storage_get_recur_proto;
return &bpf_task_storage_get_proto;
case BPF_FUNC_task_storage_delete:
- if (bpf_prog_check_recur(prog))
- return &bpf_task_storage_delete_recur_proto;
return &bpf_task_storage_delete_proto;
default:
break;
@@ -2709,14 +2780,14 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid)
* bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data.
* @p: The dynptr whose data slice to retrieve
* @offset: Offset into the dynptr
- * @buffer__opt: User-provided buffer to copy contents into. May be NULL
+ * @buffer__nullable: User-provided buffer to copy contents into. May be NULL
* @buffer__szk: Size (in bytes) of the buffer if present. This is the
* length of the requested slice. This must be a constant.
*
* For non-skb and non-xdp type dynptrs, there is no difference between
* bpf_dynptr_slice and bpf_dynptr_data.
*
- * If buffer__opt is NULL, the call will fail if buffer_opt was needed.
+ * If buffer__nullable is NULL, the call will fail if the buffer was needed.
*
* If the intention is to write to the data slice, please use
* bpf_dynptr_slice_rdwr.
@@ -2734,7 +2805,7 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid)
* direct pointer)
*/
__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset,
- void *buffer__opt, u64 buffer__szk)
+ void *buffer__nullable, u64 buffer__szk)
{
const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
enum bpf_dynptr_type type;
@@ -2755,8 +2826,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset,
case BPF_DYNPTR_TYPE_RINGBUF:
return ptr->data + ptr->offset + offset;
case BPF_DYNPTR_TYPE_SKB:
- if (buffer__opt)
- return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__opt);
+ if (buffer__nullable)
+ return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__nullable);
else
return skb_pointer_if_linear(ptr->data, ptr->offset + offset, len);
case BPF_DYNPTR_TYPE_XDP:
@@ -2765,16 +2836,16 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset,
if (!IS_ERR_OR_NULL(xdp_ptr))
return xdp_ptr;
- if (!buffer__opt)
+ if (!buffer__nullable)
return NULL;
- bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false);
- return buffer__opt;
+ bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__nullable, len, false);
+ return buffer__nullable;
}
case BPF_DYNPTR_TYPE_SKB_META:
return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset);
case BPF_DYNPTR_TYPE_FILE:
- err = bpf_file_fetch_bytes(ptr->data, offset, buffer__opt, buffer__szk);
- return err ? NULL : buffer__opt;
+ err = bpf_file_fetch_bytes(ptr->data, offset, buffer__nullable, buffer__szk);
+ return err ? NULL : buffer__nullable;
default:
WARN_ONCE(true, "unknown dynptr type %d\n", type);
return NULL;
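For reference, a minimal BPF-side sketch of how the renamed buffer__nullable argument is typically used (the dynptr variable, the ethhdr-sized stack buffer and the return value are illustrative, and the dynptr is assumed to have been created earlier, e.g. from an skb; the buffer is only consulted when the requested bytes are not linear):

	struct ethhdr *eth;
	__u8 buf[sizeof(struct ethhdr)];

	eth = bpf_dynptr_slice(&ptr, 0, buf, sizeof(buf));
	if (!eth)	/* out of bounds or read failure */
		return 0;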
@@ -2785,14 +2856,14 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset,
* bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data.
* @p: The dynptr whose data slice to retrieve
* @offset: Offset into the dynptr
- * @buffer__opt: User-provided buffer to copy contents into. May be NULL
+ * @buffer__nullable: User-provided buffer to copy contents into. May be NULL
* @buffer__szk: Size (in bytes) of the buffer if present. This is the
* length of the requested slice. This must be a constant.
*
* For non-skb and non-xdp type dynptrs, there is no difference between
* bpf_dynptr_slice and bpf_dynptr_data.
*
- * If buffer__opt is NULL, the call will fail if buffer_opt was needed.
+ * If buffer__nullable is NULL, the call will fail if the buffer was needed.
*
* The returned pointer is writable and may point to either directly the dynptr
* data at the requested offset or to the buffer if unable to obtain a direct
@@ -2824,7 +2895,7 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset,
* direct pointer)
*/
__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset,
- void *buffer__opt, u64 buffer__szk)
+ void *buffer__nullable, u64 buffer__szk)
{
const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
@@ -2853,7 +2924,7 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset,
* will be copied out into the buffer and the user will need to call
* bpf_dynptr_write() to commit changes.
*/
- return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk);
+ return bpf_dynptr_slice(p, offset, buffer__nullable, buffer__szk);
}
__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end)
@@ -3108,30 +3179,36 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags)
struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
struct bpf_work *w;
- if (in_nmi())
- return -EOPNOTSUPP;
if (flags)
return -EINVAL;
+
w = READ_ONCE(async->work);
if (!w || !READ_ONCE(w->cb.prog))
return -EINVAL;
- schedule_work(&w->work);
- return 0;
+ if (!refcount_inc_not_zero(&w->cb.refcnt))
+ return -ENOENT;
+
+ if (!defer_timer_wq_op()) {
+ schedule_work(&w->work);
+ bpf_async_refcount_put(&w->cb);
+ return 0;
+ } else {
+ return bpf_async_schedule_op(&w->cb, BPF_ASYNC_START, 0, 0);
+ }
}
-__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq,
- int (callback_fn)(void *map, int *key, void *value),
- unsigned int flags,
- void *aux__prog)
+__bpf_kfunc int bpf_wq_set_callback(struct bpf_wq *wq,
+ int (callback_fn)(void *map, int *key, void *value),
+ unsigned int flags,
+ struct bpf_prog_aux *aux)
{
- struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__prog;
struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
if (flags)
return -EINVAL;
- return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ);
+ return __bpf_async_set_callback(async, callback_fn, aux->prog);
}
__bpf_kfunc void bpf_preempt_disable(void)
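A hedged BPF-side sketch of the workqueue API as it now looks to programs (struct elem, the_map, val and wq_cb are assumptions used only for illustration; note the callback-setting kfunc no longer takes an explicit aux__prog argument, since the verifier supplies it implicitly):

	struct elem {
		struct bpf_wq w;
	};

	static int wq_cb(void *map, int *key, void *value)
	{
		/* runs later from sleepable workqueue context */
		return 0;
	}

	/* with val pointing at a map value of type struct elem */
	if (bpf_wq_init(&val->w, &the_map, 0) ||
	    bpf_wq_set_callback(&val->w, wq_cb, 0) ||
	    bpf_wq_start(&val->w, 0))
		return 0;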
@@ -3406,7 +3483,7 @@ __bpf_kfunc void __bpf_trap(void)
* __get_kernel_nofault instead of plain dereference to make them safe.
*/
-static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case)
+static int __bpf_strncasecmp(const char *s1, const char *s2, bool ignore_case, size_t len)
{
char c1, c2;
int i;
@@ -3417,7 +3494,7 @@ static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case)
}
guard(pagefault)();
- for (i = 0; i < XATTR_SIZE_MAX; i++) {
+ for (i = 0; i < len && i < XATTR_SIZE_MAX; i++) {
__get_kernel_nofault(&c1, s1, char, err_out);
__get_kernel_nofault(&c2, s2, char, err_out);
if (ignore_case) {
@@ -3431,7 +3508,7 @@ static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case)
s1++;
s2++;
}
- return -E2BIG;
+ return i == XATTR_SIZE_MAX ? -E2BIG : 0;
err_out:
return -EFAULT;
}
@@ -3451,7 +3528,7 @@ err_out:
*/
__bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign)
{
- return __bpf_strcasecmp(s1__ign, s2__ign, false);
+ return __bpf_strncasecmp(s1__ign, s2__ign, false, XATTR_SIZE_MAX);
}
/**
@@ -3469,7 +3546,26 @@ __bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign)
*/
__bpf_kfunc int bpf_strcasecmp(const char *s1__ign, const char *s2__ign)
{
- return __bpf_strcasecmp(s1__ign, s2__ign, true);
+ return __bpf_strncasecmp(s1__ign, s2__ign, true, XATTR_SIZE_MAX);
+}
+
+/**
+ * bpf_strncasecmp - Compare two length-limited strings, ignoring case
+ * @s1__ign: One string
+ * @s2__ign: Another string
+ * @len: The maximum number of characters to compare
+ *
+ * Return:
+ * * %0 - Strings are equal
+ * * %-1 - @s1__ign is smaller
+ * * %1 - @s2__ign is smaller
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of the kernel address space
+ */
+__bpf_kfunc int bpf_strncasecmp(const char *s1__ign, const char *s2__ign, size_t len)
+{
+ return __bpf_strncasecmp(s1__ign, s2__ign, true, len);
}
/**
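A small hedged usage sketch for the new kfunc (the name pointer, the "TCP_" prefix, its length and the counter are illustrative placeholders; as with bpf_strcasecmp, both pointers are __ign arguments read with non-faulting accessors):

	/* case-insensitive match on at most the first 4 characters */
	if (bpf_strncasecmp(name, "TCP_", 4) == 0)
		matches++;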
@@ -4275,41 +4371,39 @@ release_prog:
}
/**
- * bpf_task_work_schedule_signal_impl - Schedule BPF callback using task_work_add with TWA_SIGNAL
+ * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL
* mode
* @task: Task struct for which callback should be scheduled
* @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
* @map__map: bpf_map that embeds struct bpf_task_work in the values
* @callback: pointer to BPF subprogram to call
- * @aux__prog: user should pass NULL
+ * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier
*
* Return: 0 if task work has been scheduled successfully, negative error code otherwise
*/
-__bpf_kfunc int bpf_task_work_schedule_signal_impl(struct task_struct *task,
- struct bpf_task_work *tw, void *map__map,
- bpf_task_work_callback_t callback,
- void *aux__prog)
+__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw,
+ void *map__map, bpf_task_work_callback_t callback,
+ struct bpf_prog_aux *aux)
{
- return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL);
+ return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_SIGNAL);
}
/**
- * bpf_task_work_schedule_resume_impl - Schedule BPF callback using task_work_add with TWA_RESUME
+ * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME
* mode
* @task: Task struct for which callback should be scheduled
* @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
* @map__map: bpf_map that embeds struct bpf_task_work in the values
* @callback: pointer to BPF subprogram to call
- * @aux__prog: user should pass NULL
+ * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier
*
* Return: 0 if task work has been scheduled successfully, negative error code otherwise
*/
-__bpf_kfunc int bpf_task_work_schedule_resume_impl(struct task_struct *task,
- struct bpf_task_work *tw, void *map__map,
- bpf_task_work_callback_t callback,
- void *aux__prog)
+__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw,
+ void *map__map, bpf_task_work_callback_t callback,
+ struct bpf_prog_aux *aux)
{
- return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME);
+ return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_RESUME);
}
static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep,
@@ -4360,6 +4454,53 @@ __bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr)
return 0;
}
+/**
+ * bpf_timer_cancel_async - try to deactivate a timer
+ * @timer: bpf_timer to stop
+ *
+ * Returns:
+ *
+ * * 0 when the timer was not active
+ * * 1 when the timer was active
+ * * -1 when the timer is currently executing the callback function and
+ * cannot be stopped
+ * * -ECANCELED when the timer will be cancelled asynchronously
+ * * -ENOMEM when out of memory
+ * * -EINVAL when the timer was not initialized
+ * * -ENOENT when this kfunc is racing with timer deletion
+ */
+__bpf_kfunc int bpf_timer_cancel_async(struct bpf_timer *timer)
+{
+ struct bpf_async_kern *async = (void *)timer;
+ struct bpf_async_cb *cb;
+ int ret;
+
+ cb = READ_ONCE(async->cb);
+ if (!cb)
+ return -EINVAL;
+
+ /*
+	 * Unlike hrtimer_start(), it's ok to call hrtimer_try_to_cancel()
+	 * synchronously when the refcnt has reached zero, but deferring to
+	 * irq_work is not, since the irq callback may execute after an RCU GP
+	 * and cb could be freed by then. Check for a zero refcnt anyway, for
+	 * consistency.
+ */
+ if (!refcount_inc_not_zero(&cb->refcnt))
+ return -ENOENT;
+
+ if (!defer_timer_wq_op()) {
+ struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb);
+
+ ret = hrtimer_try_to_cancel(&t->timer);
+ bpf_async_refcount_put(cb);
+ return ret;
+ } else {
+ ret = bpf_async_schedule_op(cb, BPF_ASYNC_CANCEL, 0, 0);
+ return ret ? ret : -ECANCELED;
+ }
+}
+
__bpf_kfunc_end_defs();
static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work)
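A hedged BPF-side sketch of using the new kfunc (val is assumed to point at a map value embedding a struct bpf_timer that was set up with bpf_timer_init()/bpf_timer_set_callback() earlier, and ECANCELED is assumed to be available to the program, e.g. via errno.h; the handling follows the return codes documented above):

	int ret = bpf_timer_cancel_async(&val->timer);

	if (ret == -ECANCELED) {
		/* cancellation was deferred to irq_work; the callback may
		 * still run, so don't release resources it uses yet */
	} else if (ret < 0) {
		/* -1 (callback running), -EINVAL, -ENOMEM or -ENOENT */
	}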
@@ -4427,7 +4568,7 @@ BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_throw)
#ifdef CONFIG_BPF_EVENTS
-BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_send_signal_task)
#endif
#ifdef CONFIG_KEYS
BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
@@ -4467,14 +4608,14 @@ BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU)
BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
#ifdef CONFIG_CGROUPS
-BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW)
BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
-BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_RCU_PROTECTED)
BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY)
#endif
-BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_RCU_PROTECTED)
BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
BTF_ID_FLAGS(func, bpf_dynptr_adjust)
@@ -4488,7 +4629,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_memset)
BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
#endif
BTF_ID_FLAGS(func, bpf_wq_init)
-BTF_ID_FLAGS(func, bpf_wq_set_callback_impl)
+BTF_ID_FLAGS(func, bpf_wq_set_callback, KF_IMPLICIT_ARGS)
BTF_ID_FLAGS(func, bpf_wq_start)
BTF_ID_FLAGS(func, bpf_preempt_disable)
BTF_ID_FLAGS(func, bpf_preempt_enable)
@@ -4510,8 +4651,8 @@ BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr)
BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr)
BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE)
-BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE)
#endif
#ifdef CONFIG_DMA_SHARED_BUFFER
BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE)
@@ -4521,6 +4662,7 @@ BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
BTF_ID_FLAGS(func, __bpf_trap)
BTF_ID_FLAGS(func, bpf_strcmp);
BTF_ID_FLAGS(func, bpf_strcasecmp);
+BTF_ID_FLAGS(func, bpf_strncasecmp);
BTF_ID_FLAGS(func, bpf_strchr);
BTF_ID_FLAGS(func, bpf_strchrnul);
BTF_ID_FLAGS(func, bpf_strnchr);
@@ -4536,11 +4678,13 @@ BTF_ID_FLAGS(func, bpf_strncasestr);
#if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS)
BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
#endif
-BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_dynptr_from_file, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, bpf_stream_print_stack, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, bpf_dynptr_from_file)
BTF_ID_FLAGS(func, bpf_dynptr_file_discard)
+BTF_ID_FLAGS(func, bpf_timer_cancel_async)
BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 9f866a010dad..005ea3a2cda7 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -600,10 +600,17 @@ struct bpffs_btf_enums {
static int find_bpffs_btf_enums(struct bpffs_btf_enums *info)
{
+ struct {
+ const struct btf_type **type;
+ const char *name;
+ } btf_enums[] = {
+ {&info->cmd_t, "bpf_cmd"},
+ {&info->map_t, "bpf_map_type"},
+ {&info->prog_t, "bpf_prog_type"},
+ {&info->attach_t, "bpf_attach_type"},
+ };
const struct btf *btf;
- const struct btf_type *t;
- const char *name;
- int i, n;
+ int i, id;
memset(info, 0, sizeof(*info));
@@ -615,31 +622,16 @@ static int find_bpffs_btf_enums(struct bpffs_btf_enums *info)
info->btf = btf;
- for (i = 1, n = btf_nr_types(btf); i < n; i++) {
- t = btf_type_by_id(btf, i);
- if (!btf_type_is_enum(t))
- continue;
-
- name = btf_name_by_offset(btf, t->name_off);
- if (!name)
- continue;
-
- if (strcmp(name, "bpf_cmd") == 0)
- info->cmd_t = t;
- else if (strcmp(name, "bpf_map_type") == 0)
- info->map_t = t;
- else if (strcmp(name, "bpf_prog_type") == 0)
- info->prog_t = t;
- else if (strcmp(name, "bpf_attach_type") == 0)
- info->attach_t = t;
- else
- continue;
+ for (i = 0; i < ARRAY_SIZE(btf_enums); i++) {
+ id = btf_find_by_name_kind(btf, btf_enums[i].name,
+ BTF_KIND_ENUM);
+ if (id < 0)
+ return -ESRCH;
- if (info->cmd_t && info->map_t && info->prog_t && info->attach_t)
- return 0;
+ *btf_enums[i].type = btf_type_by_id(btf, id);
}
- return -ESRCH;
+ return 0;
}
static bool find_btf_enum_const(const struct btf *btf, const struct btf_type *enum_t,
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index c93a756e035c..1ccbf28b2ad9 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -180,7 +180,7 @@ static long cgroup_storage_update_elem(struct bpf_map *map, void *key,
}
int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key,
- void *value)
+ void *value, u64 map_flags)
{
struct bpf_cgroup_storage_map *map = map_to_storage(_map);
struct bpf_cgroup_storage *storage;
@@ -198,12 +198,17 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key,
* access 'value_size' of them, so copying rounded areas
* will not leak any kernel data
*/
+ if (map_flags & BPF_F_CPU) {
+ cpu = map_flags >> 32;
+ copy_map_value(_map, value, per_cpu_ptr(storage->percpu_buf, cpu));
+ goto unlock;
+ }
size = round_up(_map->value_size, 8);
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(value + off,
- per_cpu_ptr(storage->percpu_buf, cpu), size);
+ copy_map_value_long(_map, value + off, per_cpu_ptr(storage->percpu_buf, cpu));
off += size;
}
+unlock:
rcu_read_unlock();
return 0;
}
@@ -213,10 +218,11 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key,
{
struct bpf_cgroup_storage_map *map = map_to_storage(_map);
struct bpf_cgroup_storage *storage;
- int cpu, off = 0;
+ void *val;
u32 size;
+ int cpu;
- if (map_flags != BPF_ANY && map_flags != BPF_EXIST)
+ if ((u32)map_flags & ~(BPF_ANY | BPF_EXIST | BPF_F_CPU | BPF_F_ALL_CPUS))
return -EINVAL;
rcu_read_lock();
@@ -232,12 +238,17 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key,
* returned or zeros which were zero-filled by percpu_alloc,
* so no kernel data leaks possible
*/
+ if (map_flags & BPF_F_CPU) {
+ cpu = map_flags >> 32;
+ copy_map_value(_map, per_cpu_ptr(storage->percpu_buf, cpu), value);
+ goto unlock;
+ }
size = round_up(_map->value_size, 8);
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu),
- value + off, size);
- off += size;
+ val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu;
+ copy_map_value(_map, per_cpu_ptr(storage->percpu_buf, cpu), val);
}
+unlock:
rcu_read_unlock();
return 0;
}
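A hedged user-space sketch of how the new flags are meant to be driven for this map (BPF_F_CPU/BPF_F_ALL_CPUS come from the updated UAPI header, map_fd/key/value are assumed to have been set up earlier, and libbpf's bpf_map_update_elem()/bpf_map_lookup_elem_flags() pass the flags straight through; the CPU number travels in the upper 32 bits, matching the 'map_flags >> 32' extraction above):

	int cpu = 3;
	__u64 flags = BPF_F_CPU | ((__u64)cpu << 32);

	/* update only that CPU's copy; value holds a single value_size buffer */
	bpf_map_update_elem(map_fd, &key, &value, flags);

	/* or replicate one value to every possible CPU */
	bpf_map_update_elem(map_fd, &key, &value, BPF_F_ALL_CPUS);

	/* read back just that CPU's copy */
	bpf_map_lookup_elem_flags(map_fd, &key, &value, flags);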
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index 9575314f40a6..261a03ea73d3 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -214,7 +214,7 @@ __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map)
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(bpf_map_iter_kfunc_ids)
-BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_map_sum_elem_count)
BTF_KFUNCS_END(bpf_map_iter_kfunc_ids)
static const struct btf_kfunc_id_set bpf_map_iter_kfunc_set = {
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 42ae8d595c2c..227f9b5f388b 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -1,16 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2017-2018 Netronome Systems, Inc.
- *
- * This software is licensed under the GNU General License Version 2,
- * June 1991 as shown in the file COPYING in the top-level directory of this
- * source tree.
- *
- * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS"
- * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
- * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE
- * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
- * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
*/
#include <linux/bpf.h>
diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c
index 99c63d982c5d..2f28886f3ff7 100644
--- a/kernel/bpf/range_tree.c
+++ b/kernel/bpf/range_tree.c
@@ -149,7 +149,8 @@ int range_tree_clear(struct range_tree *rt, u32 start, u32 len)
range_it_insert(rn, rt);
/* Add a range */
- new_rn = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE);
+ new_rn = kmalloc_nolock(sizeof(struct range_node), __GFP_ACCOUNT,
+ NUMA_NO_NODE);
if (!new_rn)
return -ENOMEM;
new_rn->rn_start = last + 1;
@@ -234,7 +235,7 @@ int range_tree_set(struct range_tree *rt, u32 start, u32 len)
right->rn_start = start;
range_it_insert(right, rt);
} else {
- left = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE);
+ left = kmalloc_nolock(sizeof(struct range_node), __GFP_ACCOUNT, NUMA_NO_NODE);
if (!left)
return -ENOMEM;
left->rn_start = start;
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index f6a075ffac63..35ae64ade36b 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
index f7d0c8d4644e..2fdfa828e3d3 100644
--- a/kernel/bpf/rqspinlock.c
+++ b/kernel/bpf/rqspinlock.c
@@ -265,10 +265,11 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
RES_INIT_TIMEOUT(ts);
/*
- * The fast path is not invoked for the TAS fallback, so we must grab
- * the deadlock detection entry here.
+	 * We are called either directly from res_spin_lock when queued
+	 * spinlocks are disabled, or from resilient_queued_spin_lock_slowpath.
+	 * Both callers grab the deadlock detection entry before calling us,
+	 * so there is no need to obtain it here.
*/
- grab_held_lock_entry(lock);
/*
* Since the waiting loop's time is dependent on the amount of
diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c
index 0b6bc3f30335..be9ce98e9469 100644
--- a/kernel/bpf/stream.c
+++ b/kernel/bpf/stream.c
@@ -212,14 +212,13 @@ __bpf_kfunc_start_defs();
* Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the
* enum in headers.
*/
-__bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args,
- u32 len__sz, void *aux__prog)
+__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args,
+ u32 len__sz, struct bpf_prog_aux *aux)
{
struct bpf_bprintf_data data = {
.get_bin_args = true,
.get_buf = true,
};
- struct bpf_prog_aux *aux = aux__prog;
u32 fmt_size = strlen(fmt__str) + 1;
struct bpf_stream *stream;
u32 data_len = len__sz;
@@ -246,6 +245,25 @@ __bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, con
return ret;
}
+/* Directly trigger a stack dump from the program. */
+__bpf_kfunc int bpf_stream_print_stack(int stream_id, struct bpf_prog_aux *aux)
+{
+ struct bpf_stream_stage ss;
+ struct bpf_prog *prog;
+
+ /* Make sure the stream ID is valid. */
+ if (!bpf_stream_get(stream_id, aux))
+ return -ENOENT;
+
+ prog = aux->main_prog_aux->prog;
+
+ bpf_stream_stage(ss, prog, stream_id, ({
+ bpf_stream_dump_stack(ss);
+ }));
+
+ return 0;
+}
+
__bpf_kfunc_end_defs();
/* Added kfunc to common_btf_ids */
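A hedged BPF-side sketch of the new kfunc (BPF_STDERR is assumed to be one of the existing bpf_stream_id values and unexpected_state is a placeholder; as with bpf_stream_vprintk, the aux argument is injected by the verifier, so the program passes only the stream id):

	if (unexpected_state)
		bpf_stream_print_stack(BPF_STDERR);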
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4ff82144f885..683c332dbafb 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -133,12 +133,14 @@ bool bpf_map_write_active(const struct bpf_map *map)
return atomic64_read(&map->writecnt) != 0;
}
-static u32 bpf_map_value_size(const struct bpf_map *map)
-{
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
- map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
+static u32 bpf_map_value_size(const struct bpf_map *map, u64 flags)
+{
+ if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS))
+ return map->value_size;
+ else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
return round_up(map->value_size, 8) * num_possible_cpus();
else if (IS_FD_MAP(map))
return sizeof(u32);
@@ -314,11 +316,11 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
bpf_disable_instrumentation();
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
- err = bpf_percpu_hash_copy(map, key, value);
+ err = bpf_percpu_hash_copy(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
- err = bpf_percpu_array_copy(map, key, value);
+ err = bpf_percpu_array_copy(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
- err = bpf_percpu_cgroup_storage_copy(map, key, value);
+ err = bpf_percpu_cgroup_storage_copy(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
err = bpf_stackmap_extract(map, key, value, false);
} else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
@@ -505,17 +507,29 @@ static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
return root_mem_cgroup;
}
+void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg,
+ struct mem_cgroup **new_memcg)
+{
+ *new_memcg = bpf_map_get_memcg(map);
+ *old_memcg = set_active_memcg(*new_memcg);
+}
+
+void bpf_map_memcg_exit(struct mem_cgroup *old_memcg,
+ struct mem_cgroup *new_memcg)
+{
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(new_memcg);
+}
+
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
int node)
{
struct mem_cgroup *memcg, *old_memcg;
void *ptr;
- memcg = bpf_map_get_memcg(map);
- old_memcg = set_active_memcg(memcg);
+ bpf_map_memcg_enter(map, &old_memcg, &memcg);
ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
- set_active_memcg(old_memcg);
- mem_cgroup_put(memcg);
+ bpf_map_memcg_exit(old_memcg, memcg);
return ptr;
}
@@ -526,11 +540,9 @@ void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags
struct mem_cgroup *memcg, *old_memcg;
void *ptr;
- memcg = bpf_map_get_memcg(map);
- old_memcg = set_active_memcg(memcg);
+ bpf_map_memcg_enter(map, &old_memcg, &memcg);
ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node);
- set_active_memcg(old_memcg);
- mem_cgroup_put(memcg);
+ bpf_map_memcg_exit(old_memcg, memcg);
return ptr;
}
@@ -540,11 +552,9 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
struct mem_cgroup *memcg, *old_memcg;
void *ptr;
- memcg = bpf_map_get_memcg(map);
- old_memcg = set_active_memcg(memcg);
+ bpf_map_memcg_enter(map, &old_memcg, &memcg);
ptr = kzalloc(size, flags | __GFP_ACCOUNT);
- set_active_memcg(old_memcg);
- mem_cgroup_put(memcg);
+ bpf_map_memcg_exit(old_memcg, memcg);
return ptr;
}
@@ -555,11 +565,9 @@ void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
struct mem_cgroup *memcg, *old_memcg;
void *ptr;
- memcg = bpf_map_get_memcg(map);
- old_memcg = set_active_memcg(memcg);
+ bpf_map_memcg_enter(map, &old_memcg, &memcg);
ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT);
- set_active_memcg(old_memcg);
- mem_cgroup_put(memcg);
+ bpf_map_memcg_exit(old_memcg, memcg);
return ptr;
}
@@ -570,11 +578,9 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
struct mem_cgroup *memcg, *old_memcg;
void __percpu *ptr;
- memcg = bpf_map_get_memcg(map);
- old_memcg = set_active_memcg(memcg);
+ bpf_map_memcg_enter(map, &old_memcg, &memcg);
ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
- set_active_memcg(old_memcg);
- mem_cgroup_put(memcg);
+ bpf_map_memcg_exit(old_memcg, memcg);
return ptr;
}
@@ -612,12 +618,7 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
unsigned long i, j;
struct page *pg;
int ret = 0;
-#ifdef CONFIG_MEMCG
- struct mem_cgroup *memcg, *old_memcg;
- memcg = bpf_map_get_memcg(map);
- old_memcg = set_active_memcg(memcg);
-#endif
for (i = 0; i < nr_pages; i++) {
pg = __bpf_alloc_page(nid);
@@ -631,10 +632,6 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
break;
}
-#ifdef CONFIG_MEMCG
- set_active_memcg(old_memcg);
- mem_cgroup_put(memcg);
-#endif
return ret;
}
@@ -1366,11 +1363,6 @@ free_map_tab:
return ret;
}
-static bool bpf_net_capable(void)
-{
- return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
-}
-
#define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size
/* called via syscall */
static int map_create(union bpf_attr *attr, bpfptr_t uattr)
@@ -1734,7 +1726,7 @@ static int map_lookup_elem(union bpf_attr *attr)
if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ))
return -EPERM;
- err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK);
+ err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK | BPF_F_CPU);
if (err)
return err;
@@ -1742,7 +1734,7 @@ static int map_lookup_elem(union bpf_attr *attr)
if (IS_ERR(key))
return PTR_ERR(key);
- value_size = bpf_map_value_size(map);
+ value_size = bpf_map_value_size(map, attr->flags);
err = -ENOMEM;
value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
@@ -1809,7 +1801,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
goto err_put;
}
- value_size = bpf_map_value_size(map);
+ value_size = bpf_map_value_size(map, attr->flags);
value = kvmemdup_bpfptr(uvalue, value_size);
if (IS_ERR(value)) {
err = PTR_ERR(value);
@@ -2005,11 +1997,12 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
void *key, *value;
int err = 0;
- err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK);
+ err = bpf_map_check_op_flags(map, attr->batch.elem_flags,
+ BPF_F_LOCK | BPF_F_CPU | BPF_F_ALL_CPUS);
if (err)
return err;
- value_size = bpf_map_value_size(map);
+ value_size = bpf_map_value_size(map, attr->batch.elem_flags);
max_count = attr->batch.count;
if (!max_count)
@@ -2064,11 +2057,11 @@ int generic_map_lookup_batch(struct bpf_map *map,
u32 value_size, cp, max_count;
int err;
- err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK);
+ err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK | BPF_F_CPU);
if (err)
return err;
- value_size = bpf_map_value_size(map);
+ value_size = bpf_map_value_size(map, attr->batch.elem_flags);
max_count = attr->batch.count;
if (!max_count)
@@ -2190,7 +2183,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
goto err_put;
}
- value_size = bpf_map_value_size(map);
+ value_size = bpf_map_value_size(map, 0);
err = -ENOMEM;
value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
@@ -2820,6 +2813,13 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr
void *sig;
int err = 0;
+ /*
+ * Don't attempt to use kmalloc_large or vmalloc for signatures.
+	 * Any practical signature for a BPF program should be below this limit.
+ */
+ if (attr->signature_size > KMALLOC_MAX_CACHE_SIZE)
+ return -EINVAL;
+
if (system_keyring_id_check(attr->keyring_id) == 0)
key = bpf_lookup_system_key(attr->keyring_id);
else
@@ -3579,6 +3579,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
case BPF_PROG_TYPE_TRACING:
if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
prog->expected_attach_type != BPF_TRACE_FEXIT &&
+ prog->expected_attach_type != BPF_TRACE_FSESSION &&
prog->expected_attach_type != BPF_MODIFY_RETURN) {
err = -EINVAL;
goto out_put_prog;
@@ -3628,7 +3629,21 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id);
}
- link = kzalloc(sizeof(*link), GFP_USER);
+ if (prog->expected_attach_type == BPF_TRACE_FSESSION) {
+ struct bpf_fsession_link *fslink;
+
+ fslink = kzalloc(sizeof(*fslink), GFP_USER);
+ if (fslink) {
+ bpf_link_init(&fslink->fexit.link, BPF_LINK_TYPE_TRACING,
+ &bpf_tracing_link_lops, prog, attach_type);
+ fslink->fexit.cookie = bpf_cookie;
+ link = &fslink->link;
+ } else {
+ link = NULL;
+ }
+ } else {
+ link = kzalloc(sizeof(*link), GFP_USER);
+ }
if (!link) {
err = -ENOMEM;
goto out_put_prog;
@@ -4352,6 +4367,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
case BPF_TRACE_RAW_TP:
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
+ case BPF_TRACE_FSESSION:
case BPF_MODIFY_RETURN:
return BPF_PROG_TYPE_TRACING;
case BPF_LSM_MAC:
@@ -4565,6 +4581,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
if (IS_ERR(prog))
return PTR_ERR(prog);
+ } else if (!bpf_mprog_detach_empty(ptype)) {
+ return -EPERM;
}
} else if (is_cgroup_prog_type(ptype, 0, false)) {
if (attr->attach_flags || attr->relative_fd)
@@ -5310,6 +5328,9 @@ static int bpf_map_get_info_by_fd(struct file *file,
if (info.hash_size != SHA256_DIGEST_SIZE)
return -EINVAL;
+ if (!READ_ONCE(map->frozen))
+ return -EPERM;
+
err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha);
if (err != 0)
return err;
@@ -6122,6 +6143,49 @@ static int prog_stream_read(union bpf_attr *attr)
return ret;
}
+#define BPF_PROG_ASSOC_STRUCT_OPS_LAST_FIELD prog_assoc_struct_ops.prog_fd
+
+static int prog_assoc_struct_ops(union bpf_attr *attr)
+{
+ struct bpf_prog *prog;
+ struct bpf_map *map;
+ int ret;
+
+ if (CHECK_ATTR(BPF_PROG_ASSOC_STRUCT_OPS))
+ return -EINVAL;
+
+ if (attr->prog_assoc_struct_ops.flags)
+ return -EINVAL;
+
+ prog = bpf_prog_get(attr->prog_assoc_struct_ops.prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) {
+ ret = -EINVAL;
+ goto put_prog;
+ }
+
+ map = bpf_map_get(attr->prog_assoc_struct_ops.map_fd);
+ if (IS_ERR(map)) {
+ ret = PTR_ERR(map);
+ goto put_prog;
+ }
+
+ if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) {
+ ret = -EINVAL;
+ goto put_map;
+ }
+
+ ret = bpf_prog_assoc_struct_ops(prog, map);
+
+put_map:
+ bpf_map_put(map);
+put_prog:
+ bpf_prog_put(prog);
+ return ret;
+}
+
static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
{
union bpf_attr attr;
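A hedged user-space sketch of exercising the new command through the raw bpf() syscall (the attr field names follow the handler above, prog_fd/st_ops_map_fd are assumed to be valid fds obtained earlier, and the union bpf_attr layout itself comes from the matching UAPI change elsewhere in the series):

	union bpf_attr attr = {};
	int err;

	attr.prog_assoc_struct_ops.prog_fd = prog_fd;       /* any non-struct_ops program */
	attr.prog_assoc_struct_ops.map_fd = st_ops_map_fd;   /* BPF_MAP_TYPE_STRUCT_OPS map */
	attr.prog_assoc_struct_ops.flags = 0;                 /* must currently be zero */

	err = syscall(__NR_bpf, BPF_PROG_ASSOC_STRUCT_OPS, &attr, sizeof(attr));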
@@ -6261,6 +6325,9 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
case BPF_PROG_STREAM_READ_BY_FD:
err = prog_stream_read(&attr);
break;
+ case BPF_PROG_ASSOC_STRUCT_OPS:
+ err = prog_assoc_struct_ops(&attr);
+ break;
default:
err = -EINVAL;
break;
@@ -6407,7 +6474,7 @@ static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
.func = bpf_kallsyms_lookup_name,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index f8e70e9c3998..26fbfbb01700 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -8,6 +8,7 @@
*/
#include <linux/kernel.h>
#include <linux/tnum.h>
+#include <linux/swab.h>
#define TNUM(_v, _m) (struct tnum){.value = _v, .mask = _m}
/* A completely unknown value */
@@ -253,3 +254,18 @@ struct tnum tnum_const_subreg(struct tnum a, u32 value)
{
return tnum_with_subreg(a, tnum_const(value));
}
+
+struct tnum tnum_bswap16(struct tnum a)
+{
+ return TNUM(swab16(a.value & 0xFFFF), swab16(a.mask & 0xFFFF));
+}
+
+struct tnum tnum_bswap32(struct tnum a)
+{
+ return TNUM(swab32(a.value & 0xFFFFFFFF), swab32(a.mask & 0xFFFFFFFF));
+}
+
+struct tnum tnum_bswap64(struct tnum a)
+{
+ return TNUM(swab64(a.value), swab64(a.mask));
+}
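A quick hedged illustration of the expected behaviour on a partially-known value (known bits move together with their byte; the concrete numbers are only an example, using the TNUM() macro defined above):

	struct tnum a = TNUM(0x0012, 0xff00);	/* low byte known to be 0x12, high byte unknown */
	struct tnum b = tnum_bswap16(a);	/* b.value == 0x1200, b.mask == 0x00ff */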
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
index feecd8f4dbf9..7e4aa1e44b50 100644
--- a/kernel/bpf/token.c
+++ b/kernel/bpf/token.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <linux/vmalloc.h>
#include <linux/file.h>
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 976d89011b15..952cd7932461 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -24,19 +24,49 @@ const struct bpf_prog_ops bpf_extension_prog_ops = {
#define TRAMPOLINE_HASH_BITS 10
#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS)
-static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
+static struct hlist_head trampoline_key_table[TRAMPOLINE_TABLE_SIZE];
+static struct hlist_head trampoline_ip_table[TRAMPOLINE_TABLE_SIZE];
-/* serializes access to trampoline_table */
+/* serializes access to trampoline tables */
static DEFINE_MUTEX(trampoline_mutex);
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex);
-static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd)
+#ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS
+static struct bpf_trampoline *direct_ops_ip_lookup(struct ftrace_ops *ops, unsigned long ip)
{
- struct bpf_trampoline *tr = ops->private;
+ struct hlist_head *head_ip;
+ struct bpf_trampoline *tr;
+
+ mutex_lock(&trampoline_mutex);
+ head_ip = &trampoline_ip_table[hash_64(ip, TRAMPOLINE_HASH_BITS)];
+ hlist_for_each_entry(tr, head_ip, hlist_ip) {
+ if (tr->ip == ip)
+ goto out;
+ }
+ tr = NULL;
+out:
+ mutex_unlock(&trampoline_mutex);
+ return tr;
+}
+#else
+static struct bpf_trampoline *direct_ops_ip_lookup(struct ftrace_ops *ops, unsigned long ip)
+{
+ return ops->private;
+}
+#endif /* CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS */
+
+static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip,
+ enum ftrace_ops_cmd cmd)
+{
+ struct bpf_trampoline *tr;
int ret = 0;
+ tr = direct_ops_ip_lookup(ops, ip);
+ if (!tr)
+ return -EINVAL;
+
if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) {
/* This is called inside register_ftrace_direct_multi(), so
* tr->mutex is already locked.
@@ -109,10 +139,17 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
enum bpf_attach_type eatype = prog->expected_attach_type;
enum bpf_prog_type ptype = prog->type;
- return (ptype == BPF_PROG_TYPE_TRACING &&
- (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
- eatype == BPF_MODIFY_RETURN)) ||
- (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
+ switch (ptype) {
+ case BPF_PROG_TYPE_TRACING:
+ if (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
+ eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION)
+ return true;
+ return false;
+ case BPF_PROG_TYPE_LSM:
+ return eatype == BPF_LSM_MAC;
+ default:
+ return false;
+ }
}
void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym)
@@ -135,15 +172,171 @@ void bpf_image_ksym_del(struct bpf_ksym *ksym)
PAGE_SIZE, true, ksym->name);
}
-static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+#ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS
+/*
+ * We have only a single direct_ops, which contains all the direct call
+ * sites and is the one global ftrace_ops shared by all trampolines.
+ *
+ * We use the update_ftrace_direct_* API for attachment.
+ */
+struct ftrace_ops direct_ops = {
+ .ops_func = bpf_tramp_ftrace_ops_func,
+};
+
+static int direct_ops_alloc(struct bpf_trampoline *tr)
+{
+ tr->fops = &direct_ops;
+ return 0;
+}
+
+static void direct_ops_free(struct bpf_trampoline *tr) { }
+
+static struct ftrace_hash *hash_from_ip(struct bpf_trampoline *tr, void *ptr)
+{
+ unsigned long ip, addr = (unsigned long) ptr;
+ struct ftrace_hash *hash;
+
+ ip = ftrace_location(tr->ip);
+ if (!ip)
+ return NULL;
+ hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
+ if (!hash)
+ return NULL;
+ if (bpf_trampoline_use_jmp(tr->flags))
+ addr = ftrace_jmp_set(addr);
+ if (!add_ftrace_hash_entry_direct(hash, ip, addr)) {
+ free_ftrace_hash(hash);
+ return NULL;
+ }
+ return hash;
+}
+
+static int direct_ops_add(struct bpf_trampoline *tr, void *addr)
+{
+ struct ftrace_hash *hash = hash_from_ip(tr, addr);
+ int err;
+
+ if (!hash)
+ return -ENOMEM;
+ err = update_ftrace_direct_add(tr->fops, hash);
+ free_ftrace_hash(hash);
+ return err;
+}
+
+static int direct_ops_del(struct bpf_trampoline *tr, void *addr)
+{
+ struct ftrace_hash *hash = hash_from_ip(tr, addr);
+ int err;
+
+ if (!hash)
+ return -ENOMEM;
+ err = update_ftrace_direct_del(tr->fops, hash);
+ free_ftrace_hash(hash);
+ return err;
+}
+
+static int direct_ops_mod(struct bpf_trampoline *tr, void *addr, bool lock_direct_mutex)
+{
+ struct ftrace_hash *hash = hash_from_ip(tr, addr);
+ int err;
+
+ if (!hash)
+ return -ENOMEM;
+ err = update_ftrace_direct_mod(tr->fops, hash, lock_direct_mutex);
+ free_ftrace_hash(hash);
+ return err;
+}
+#else
+/*
+ * We allocate a ftrace_ops object for each trampoline; it contains only
+ * the call site specific to that trampoline.
+ *
+ * We use the *_ftrace_direct API for attachment.
+ */
+static int direct_ops_alloc(struct bpf_trampoline *tr)
+{
+ tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL);
+ if (!tr->fops)
+ return -ENOMEM;
+ tr->fops->private = tr;
+ tr->fops->ops_func = bpf_tramp_ftrace_ops_func;
+ return 0;
+}
+
+static void direct_ops_free(struct bpf_trampoline *tr)
+{
+ if (!tr->fops)
+ return;
+ ftrace_free_filter(tr->fops);
+ kfree(tr->fops);
+}
+
+static int direct_ops_add(struct bpf_trampoline *tr, void *ptr)
+{
+ unsigned long addr = (unsigned long) ptr;
+ struct ftrace_ops *ops = tr->fops;
+ int ret;
+
+ if (bpf_trampoline_use_jmp(tr->flags))
+ addr = ftrace_jmp_set(addr);
+
+ ret = ftrace_set_filter_ip(ops, tr->ip, 0, 1);
+ if (ret)
+ return ret;
+ return register_ftrace_direct(ops, addr);
+}
+
+static int direct_ops_del(struct bpf_trampoline *tr, void *addr)
+{
+ return unregister_ftrace_direct(tr->fops, (long)addr, false);
+}
+
+static int direct_ops_mod(struct bpf_trampoline *tr, void *ptr, bool lock_direct_mutex)
+{
+ unsigned long addr = (unsigned long) ptr;
+ struct ftrace_ops *ops = tr->fops;
+
+ if (bpf_trampoline_use_jmp(tr->flags))
+ addr = ftrace_jmp_set(addr);
+ if (lock_direct_mutex)
+ return modify_ftrace_direct(ops, addr);
+ return modify_ftrace_direct_nolock(ops, addr);
+}
+#endif /* CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS */
+#else
+static void direct_ops_free(struct bpf_trampoline *tr) { }
+
+static int direct_ops_alloc(struct bpf_trampoline *tr)
+{
+ return 0;
+}
+
+static int direct_ops_add(struct bpf_trampoline *tr, void *addr)
+{
+ return -ENODEV;
+}
+
+static int direct_ops_del(struct bpf_trampoline *tr, void *addr)
+{
+ return -ENODEV;
+}
+
+static int direct_ops_mod(struct bpf_trampoline *tr, void *ptr, bool lock_direct_mutex)
+{
+ return -ENODEV;
+}
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
+static struct bpf_trampoline *bpf_trampoline_lookup(u64 key, unsigned long ip)
{
struct bpf_trampoline *tr;
struct hlist_head *head;
int i;
mutex_lock(&trampoline_mutex);
- head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)];
- hlist_for_each_entry(tr, head, hlist) {
+ head = &trampoline_key_table[hash_64(key, TRAMPOLINE_HASH_BITS)];
+ hlist_for_each_entry(tr, head, hlist_key) {
if (tr->key == key) {
refcount_inc(&tr->refcnt);
goto out;
@@ -152,20 +345,19 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
tr = kzalloc(sizeof(*tr), GFP_KERNEL);
if (!tr)
goto out;
-#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
- tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL);
- if (!tr->fops) {
+ if (direct_ops_alloc(tr)) {
kfree(tr);
tr = NULL;
goto out;
}
- tr->fops->private = tr;
- tr->fops->ops_func = bpf_tramp_ftrace_ops_func;
-#endif
tr->key = key;
- INIT_HLIST_NODE(&tr->hlist);
- hlist_add_head(&tr->hlist, head);
+ tr->ip = ftrace_location(ip);
+ INIT_HLIST_NODE(&tr->hlist_key);
+ INIT_HLIST_NODE(&tr->hlist_ip);
+ hlist_add_head(&tr->hlist_key, head);
+ head = &trampoline_ip_table[hash_64(tr->ip, TRAMPOLINE_HASH_BITS)];
+ hlist_add_head(&tr->hlist_ip, head);
refcount_set(&tr->refcnt, 1);
mutex_init(&tr->mutex);
for (i = 0; i < BPF_TRAMP_MAX; i++)
@@ -200,7 +392,7 @@ static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags,
int ret;
if (tr->func.ftrace_managed)
- ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false);
+ ret = direct_ops_del(tr, old_addr);
else
ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, NULL);
@@ -214,10 +406,7 @@ static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags,
int ret;
if (tr->func.ftrace_managed) {
- if (lock_direct_mutex)
- ret = modify_ftrace_direct(tr->fops, (long)new_addr);
- else
- ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr);
+ ret = direct_ops_mod(tr, new_addr, lock_direct_mutex);
} else {
ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr,
new_addr);
@@ -240,10 +429,7 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
}
if (tr->func.ftrace_managed) {
- ret = ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
- if (ret)
- return ret;
- ret = register_ftrace_direct(tr->fops, (long)new_addr);
+ ret = direct_ops_add(tr, new_addr);
} else {
ret = bpf_trampoline_update_fentry(tr, 0, NULL, new_addr);
}
@@ -499,13 +685,6 @@ again:
if (err)
goto out_free;
-#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP
- if (bpf_trampoline_use_jmp(tr->flags))
- tr->fops->flags |= FTRACE_OPS_FL_JMP;
- else
- tr->fops->flags &= ~FTRACE_OPS_FL_JMP;
-#endif
-
WARN_ON(tr->cur_image && total == 0);
if (tr->cur_image)
/* progs already running at this address */
@@ -533,15 +712,8 @@ again:
tr->cur_image = im;
out:
/* If any error happens, restore previous flags */
- if (err) {
+ if (err)
tr->flags = orig_flags;
-#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP
- if (bpf_trampoline_use_jmp(tr->flags))
- tr->fops->flags |= FTRACE_OPS_FL_JMP;
- else
- tr->fops->flags &= ~FTRACE_OPS_FL_JMP;
-#endif
- }
kfree(tlinks);
return err;
@@ -559,6 +731,8 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
return BPF_TRAMP_MODIFY_RETURN;
case BPF_TRACE_FEXIT:
return BPF_TRAMP_FEXIT;
+ case BPF_TRACE_FSESSION:
+ return BPF_TRAMP_FSESSION;
case BPF_LSM_MAC:
if (!prog->aux->attach_func_proto->type)
/* The function returns void, we cannot modify its
@@ -594,8 +768,10 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
struct bpf_trampoline *tr,
struct bpf_prog *tgt_prog)
{
+ struct bpf_fsession_link *fslink = NULL;
enum bpf_tramp_prog_type kind;
struct bpf_tramp_link *link_exiting;
+ struct hlist_head *prog_list;
int err = 0;
int cnt = 0, i;
@@ -621,24 +797,43 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
BPF_MOD_JUMP, NULL,
link->link.prog->bpf_func);
}
+ if (kind == BPF_TRAMP_FSESSION) {
+ prog_list = &tr->progs_hlist[BPF_TRAMP_FENTRY];
+ cnt++;
+ } else {
+ prog_list = &tr->progs_hlist[kind];
+ }
if (cnt >= BPF_MAX_TRAMP_LINKS)
return -E2BIG;
if (!hlist_unhashed(&link->tramp_hlist))
/* prog already linked */
return -EBUSY;
- hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) {
+ hlist_for_each_entry(link_exiting, prog_list, tramp_hlist) {
if (link_exiting->link.prog != link->link.prog)
continue;
/* prog already linked */
return -EBUSY;
}
- hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]);
- tr->progs_cnt[kind]++;
+ hlist_add_head(&link->tramp_hlist, prog_list);
+ if (kind == BPF_TRAMP_FSESSION) {
+ tr->progs_cnt[BPF_TRAMP_FENTRY]++;
+ fslink = container_of(link, struct bpf_fsession_link, link.link);
+ hlist_add_head(&fslink->fexit.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]);
+ tr->progs_cnt[BPF_TRAMP_FEXIT]++;
+ } else {
+ tr->progs_cnt[kind]++;
+ }
err = bpf_trampoline_update(tr, true /* lock_direct_mutex */);
if (err) {
hlist_del_init(&link->tramp_hlist);
- tr->progs_cnt[kind]--;
+ if (kind == BPF_TRAMP_FSESSION) {
+ tr->progs_cnt[BPF_TRAMP_FENTRY]--;
+ hlist_del_init(&fslink->fexit.tramp_hlist);
+ tr->progs_cnt[BPF_TRAMP_FEXIT]--;
+ } else {
+ tr->progs_cnt[kind]--;
+ }
}
return err;
}
@@ -672,6 +867,13 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
guard(mutex)(&tgt_prog->aux->ext_mutex);
tgt_prog->aux->is_extended = false;
return err;
+ } else if (kind == BPF_TRAMP_FSESSION) {
+ struct bpf_fsession_link *fslink =
+ container_of(link, struct bpf_fsession_link, link.link);
+
+ hlist_del_init(&fslink->fexit.tramp_hlist);
+ tr->progs_cnt[BPF_TRAMP_FEXIT]--;
+ kind = BPF_TRAMP_FENTRY;
}
hlist_del_init(&link->tramp_hlist);
tr->progs_cnt[kind]--;
@@ -850,7 +1052,7 @@ void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog)
prog->aux->attach_btf_id);
bpf_lsm_find_cgroup_shim(prog, &bpf_func);
- tr = bpf_trampoline_lookup(key);
+ tr = bpf_trampoline_lookup(key, 0);
if (WARN_ON_ONCE(!tr))
return;
@@ -870,7 +1072,7 @@ struct bpf_trampoline *bpf_trampoline_get(u64 key,
{
struct bpf_trampoline *tr;
- tr = bpf_trampoline_lookup(key);
+ tr = bpf_trampoline_lookup(key, tgt_info->tgt_addr);
if (!tr)
return NULL;
@@ -906,11 +1108,9 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
* fexit progs. The fentry-only trampoline will be freed via
* multiple rcu callbacks.
*/
- hlist_del(&tr->hlist);
- if (tr->fops) {
- ftrace_free_filter(tr->fops);
- kfree(tr->fops);
- }
+ hlist_del(&tr->hlist_key);
+ hlist_del(&tr->hlist_ip);
+ direct_ops_free(tr);
kfree(tr);
out:
mutex_unlock(&trampoline_mutex);
@@ -949,7 +1149,7 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram
run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
- if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ if (unlikely(!bpf_prog_get_recursion_context(prog))) {
bpf_prog_inc_misses_counter(prog);
if (prog->aux->recursion_detected)
prog->aux->recursion_detected(prog);
@@ -993,7 +1193,7 @@ static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start,
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
update_prog_stats(prog, start);
- this_cpu_dec(*(prog->active));
+ bpf_prog_put_recursion_context(prog);
rcu_read_unlock_migrate();
}
@@ -1029,7 +1229,7 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
- if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ if (unlikely(!bpf_prog_get_recursion_context(prog))) {
bpf_prog_inc_misses_counter(prog);
if (prog->aux->recursion_detected)
prog->aux->recursion_detected(prog);
@@ -1044,7 +1244,7 @@ void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
update_prog_stats(prog, start);
- this_cpu_dec(*(prog->active));
+ bpf_prog_put_recursion_context(prog);
migrate_enable();
rcu_read_unlock_trace();
}
@@ -1179,7 +1379,9 @@ static int __init init_trampolines(void)
int i;
for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++)
- INIT_HLIST_HEAD(&trampoline_table[i]);
+ INIT_HLIST_HEAD(&trampoline_key_table[i]);
+ for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++)
+ INIT_HLIST_HEAD(&trampoline_ip_table[i]);
return 0;
}
late_initcall(init_trampolines);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3135643d5695..edf5342b982f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -272,8 +272,13 @@ static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn)
insn->src_reg == BPF_PSEUDO_KFUNC_CALL;
}
+struct bpf_map_desc {
+ struct bpf_map *ptr;
+ int uid;
+};
+
struct bpf_call_arg_meta {
- struct bpf_map *map_ptr;
+ struct bpf_map_desc map;
bool raw_mode;
bool pkt_access;
u8 release_regno;
@@ -283,7 +288,6 @@ struct bpf_call_arg_meta {
u64 msize_max_value;
int ref_obj_id;
int dynptr_id;
- int map_uid;
int func_id;
struct btf *btf;
u32 btf_id;
@@ -294,6 +298,14 @@ struct bpf_call_arg_meta {
s64 const_map_key;
};
+struct bpf_kfunc_meta {
+ struct btf *btf;
+ const struct btf_type *proto;
+ const char *name;
+ const u32 *flags;
+ s32 id;
+};
+
struct bpf_kfunc_call_arg_meta {
/* In parameters */
struct btf *btf;
@@ -343,10 +355,7 @@ struct bpf_kfunc_call_arg_meta {
u8 spi;
u8 frameno;
} iter;
- struct {
- struct bpf_map *ptr;
- int uid;
- } map;
+ struct bpf_map_desc map;
u64 mem_size;
};
@@ -512,7 +521,7 @@ static bool is_async_callback_calling_kfunc(u32 btf_id);
static bool is_callback_calling_kfunc(u32 btf_id);
static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
-static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id);
+static bool is_bpf_wq_set_callback_kfunc(u32 btf_id);
static bool is_task_work_add_kfunc(u32 func_id);
static bool is_sync_callback_calling_function(enum bpf_func_id func_id)
@@ -554,7 +563,7 @@ static bool is_async_cb_sleepable(struct bpf_verifier_env *env, struct bpf_insn
/* bpf_wq and bpf_task_work callbacks are always sleepable. */
if (bpf_pseudo_kfunc_call(insn) && insn->off == 0 &&
- (is_bpf_wq_set_callback_impl_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm)))
+ (is_bpf_wq_set_callback_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm)))
return true;
verifier_bug(env, "unhandled async callback in is_async_cb_sleepable");
@@ -2341,6 +2350,18 @@ static void __mark_reg32_unbounded(struct bpf_reg_state *reg)
reg->u32_max_value = U32_MAX;
}
+static void reset_reg64_and_tnum(struct bpf_reg_state *reg)
+{
+ __mark_reg64_unbounded(reg);
+ reg->var_off = tnum_unknown;
+}
+
+static void reset_reg32_and_tnum(struct bpf_reg_state *reg)
+{
+ __mark_reg32_unbounded(reg);
+ reg->var_off = tnum_unknown;
+}
+
static void __update_reg32_bounds(struct bpf_reg_state *reg)
{
struct tnum var32_off = tnum_subreg(reg->var_off);
@@ -3263,16 +3284,105 @@ static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset)
return btf_vmlinux ?: ERR_PTR(-ENOENT);
}
-static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
+#define KF_IMPL_SUFFIX "_impl"
+
+static const struct btf_type *find_kfunc_impl_proto(struct bpf_verifier_env *env,
+ struct btf *btf,
+ const char *func_name)
+{
+ char *buf = env->tmp_str_buf;
+ const struct btf_type *func;
+ s32 impl_id;
+ int len;
+
+ len = snprintf(buf, TMP_STR_BUF_LEN, "%s%s", func_name, KF_IMPL_SUFFIX);
+ if (len < 0 || len >= TMP_STR_BUF_LEN) {
+ verbose(env, "function name %s%s is too long\n", func_name, KF_IMPL_SUFFIX);
+ return NULL;
+ }
+
+ impl_id = btf_find_by_name_kind(btf, buf, BTF_KIND_FUNC);
+ if (impl_id <= 0) {
+ verbose(env, "cannot find function %s in BTF\n", buf);
+ return NULL;
+ }
+
+ func = btf_type_by_id(btf, impl_id);
+
+ return btf_type_by_id(btf, func->type);
+}
+
+static int fetch_kfunc_meta(struct bpf_verifier_env *env,
+ s32 func_id,
+ s16 offset,
+ struct bpf_kfunc_meta *kfunc)
{
const struct btf_type *func, *func_proto;
+ const char *func_name;
+ u32 *kfunc_flags;
+ struct btf *btf;
+
+ if (func_id <= 0) {
+ verbose(env, "invalid kernel function btf_id %d\n", func_id);
+ return -EINVAL;
+ }
+
+ btf = find_kfunc_desc_btf(env, offset);
+ if (IS_ERR(btf)) {
+ verbose(env, "failed to find BTF for kernel function\n");
+ return PTR_ERR(btf);
+ }
+
+ /*
+ * Note that kfunc_flags may be NULL at this point, which
+ * means that we couldn't find func_id in any relevant
+ * kfunc_id_set. This most likely indicates an invalid kfunc
+ * call. However we don't fail with an error here,
+ * and let the caller decide what to do with NULL kfunc->flags.
+ */
+ kfunc_flags = btf_kfunc_flags(btf, func_id, env->prog);
+
+ func = btf_type_by_id(btf, func_id);
+ if (!func || !btf_type_is_func(func)) {
+ verbose(env, "kernel btf_id %d is not a function\n", func_id);
+ return -EINVAL;
+ }
+
+ func_name = btf_name_by_offset(btf, func->name_off);
+
+ /*
+ * The actual prototype of a kfunc with the KF_IMPLICIT_ARGS flag
+ * can be found through the counterpart _impl kfunc.
+ */
+ if (kfunc_flags && (*kfunc_flags & KF_IMPLICIT_ARGS))
+ func_proto = find_kfunc_impl_proto(env, btf, func_name);
+ else
+ func_proto = btf_type_by_id(btf, func->type);
+
+ if (!func_proto || !btf_type_is_func_proto(func_proto)) {
+ verbose(env, "kernel function btf_id %d does not have a valid func_proto\n",
+ func_id);
+ return -EINVAL;
+ }
+
+ memset(kfunc, 0, sizeof(*kfunc));
+ kfunc->btf = btf;
+ kfunc->id = func_id;
+ kfunc->name = func_name;
+ kfunc->proto = func_proto;
+ kfunc->flags = kfunc_flags;
+
+ return 0;
+}
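A minimal user-space sketch of the name mapping performed by find_kfunc_impl_proto() above: the verifier appends KF_IMPL_SUFFIX to the kfunc name and looks the result up in BTF to obtain the prototype that carries the implicit arguments. The buffer size and the sample kfunc name below are assumptions made only for illustration.

#include <stdio.h>

int main(void)
{
	char buf[128];
	const char *func_name = "bpf_wq_set_callback";	/* sample name only */
	int len = snprintf(buf, sizeof(buf), "%s%s", func_name, "_impl");

	if (len < 0 || len >= (int)sizeof(buf))
		return 1;	/* same "name too long" failure mode as above */
	printf("BTF FUNC to look up: %s\n", buf);	/* bpf_wq_set_callback_impl */
	return 0;
}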
+
+static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
+{
struct bpf_kfunc_btf_tab *btf_tab;
struct btf_func_model func_model;
struct bpf_kfunc_desc_tab *tab;
struct bpf_prog_aux *prog_aux;
+ struct bpf_kfunc_meta kfunc;
struct bpf_kfunc_desc *desc;
- const char *func_name;
- struct btf *desc_btf;
unsigned long addr;
int err;
@@ -3322,12 +3432,6 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
prog_aux->kfunc_btf_tab = btf_tab;
}
- desc_btf = find_kfunc_desc_btf(env, offset);
- if (IS_ERR(desc_btf)) {
- verbose(env, "failed to find BTF for kernel function\n");
- return PTR_ERR(desc_btf);
- }
-
if (find_kfunc_desc(env->prog, func_id, offset))
return 0;
@@ -3336,24 +3440,13 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
return -E2BIG;
}
- func = btf_type_by_id(desc_btf, func_id);
- if (!func || !btf_type_is_func(func)) {
- verbose(env, "kernel btf_id %u is not a function\n",
- func_id);
- return -EINVAL;
- }
- func_proto = btf_type_by_id(desc_btf, func->type);
- if (!func_proto || !btf_type_is_func_proto(func_proto)) {
- verbose(env, "kernel function btf_id %u does not have a valid func_proto\n",
- func_id);
- return -EINVAL;
- }
+ err = fetch_kfunc_meta(env, func_id, offset, &kfunc);
+ if (err)
+ return err;
- func_name = btf_name_by_offset(desc_btf, func->name_off);
- addr = kallsyms_lookup_name(func_name);
+ addr = kallsyms_lookup_name(kfunc.name);
if (!addr) {
- verbose(env, "cannot find address for kernel function %s\n",
- func_name);
+ verbose(env, "cannot find address for kernel function %s\n", kfunc.name);
return -EINVAL;
}
@@ -3363,9 +3456,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
return err;
}
- err = btf_distill_func_proto(&env->log, desc_btf,
- func_proto, func_name,
- &func_model);
+ err = btf_distill_func_proto(&env->log, kfunc.btf, kfunc.proto, kfunc.name, &func_model);
if (err)
return err;
@@ -5427,6 +5518,12 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
*/
s32 subreg_def = state->regs[dst_regno].subreg_def;
+ if (env->bpf_capable && size == 4 && spill_size == 4 &&
+ get_reg_width(reg) <= 32)
+ /* Ensure stack slot has an ID to build a relation
+ * with the destination register on fill.
+ */
+ assign_scalar_id_before_mov(env, reg);
copy_register_state(&state->regs[dst_regno], reg);
state->regs[dst_regno].subreg_def = subreg_def;
@@ -5472,6 +5569,11 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
}
} else if (dst_regno >= 0) {
/* restore register state from stack */
+ if (env->bpf_capable)
+ /* Ensure stack slot has an ID to build a relation
+ * with the destination register on fill.
+ */
+ assign_scalar_id_before_mov(env, reg);
copy_register_state(&state->regs[dst_regno], reg);
/* mark reg as written since spilled pointer state likely
* has its liveness marks cleared by is_state_visited()
@@ -5654,8 +5756,8 @@ static int check_stack_write(struct bpf_verifier_env *env,
static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
int off, int size, enum bpf_access_type type)
{
- struct bpf_reg_state *regs = cur_regs(env);
- struct bpf_map *map = regs[regno].map_ptr;
+ struct bpf_reg_state *reg = reg_state(env, regno);
+ struct bpf_map *map = reg->map_ptr;
u32 cap = bpf_map_flags_to_cap(map);
if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) {
@@ -6168,8 +6270,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
int size, bool zero_size_allowed)
{
- struct bpf_reg_state *regs = cur_regs(env);
- struct bpf_reg_state *reg = &regs[regno];
+ struct bpf_reg_state *reg = reg_state(env, regno);
int err;
/* We may have added a variable offset to the packet pointer; but any
@@ -6256,8 +6357,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
u32 regno, int off, int size,
enum bpf_access_type t)
{
- struct bpf_reg_state *regs = cur_regs(env);
- struct bpf_reg_state *reg = &regs[regno];
+ struct bpf_reg_state *reg = reg_state(env, regno);
struct bpf_insn_access_aux info = {};
bool valid;
@@ -7453,8 +7553,7 @@ static int check_stack_access_within_bounds(
int regno, int off, int access_size,
enum bpf_access_type type)
{
- struct bpf_reg_state *regs = cur_regs(env);
- struct bpf_reg_state *reg = regs + regno;
+ struct bpf_reg_state *reg = reg_state(env, regno);
struct bpf_func_state *state = func(env, reg);
s64 min_off, max_off;
int err;
@@ -8408,7 +8507,7 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags)
{
bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK;
const char *lock_str = is_res_lock ? "bpf_res_spin" : "bpf_spin";
- struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_reg_state *reg = reg_state(env, regno);
struct bpf_verifier_state *cur = env->cur_state;
bool is_const = tnum_is_const(reg->var_off);
bool is_irq = flags & PROCESS_LOCK_IRQ;
@@ -8522,9 +8621,10 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags)
/* Check if @regno is a pointer to a specific field in a map value */
static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno,
- enum btf_field_type field_type)
+ enum btf_field_type field_type,
+ struct bpf_map_desc *map_desc)
{
- struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_reg_state *reg = reg_state(env, regno);
bool is_const = tnum_is_const(reg->var_off);
struct bpf_map *map = reg->map_ptr;
u64 val = reg->var_off.value;
@@ -8565,78 +8665,41 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno,
val + reg->off, struct_name, field_off);
return -EINVAL;
}
+ if (map_desc->ptr) {
+ verifier_bug(env, "Two map pointers in a %s helper", struct_name);
+ return -EFAULT;
+ }
+ map_desc->uid = reg->map_uid;
+ map_desc->ptr = map;
return 0;
}
static int process_timer_func(struct bpf_verifier_env *env, int regno,
- struct bpf_call_arg_meta *meta)
+ struct bpf_map_desc *map)
{
- struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
- struct bpf_map *map = reg->map_ptr;
- int err;
-
- err = check_map_field_pointer(env, regno, BPF_TIMER);
- if (err)
- return err;
-
- if (meta->map_ptr) {
- verifier_bug(env, "Two map pointers in a timer helper");
- return -EFAULT;
- }
if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n");
return -EOPNOTSUPP;
}
- meta->map_uid = reg->map_uid;
- meta->map_ptr = map;
- return 0;
+ return check_map_field_pointer(env, regno, BPF_TIMER, map);
}
-static int process_wq_func(struct bpf_verifier_env *env, int regno,
- struct bpf_kfunc_call_arg_meta *meta)
+static int process_timer_helper(struct bpf_verifier_env *env, int regno,
+ struct bpf_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
- struct bpf_map *map = reg->map_ptr;
- int err;
-
- err = check_map_field_pointer(env, regno, BPF_WORKQUEUE);
- if (err)
- return err;
-
- if (meta->map.ptr) {
- verifier_bug(env, "Two map pointers in a bpf_wq helper");
- return -EFAULT;
- }
-
- meta->map.uid = reg->map_uid;
- meta->map.ptr = map;
- return 0;
+ return process_timer_func(env, regno, &meta->map);
}
-static int process_task_work_func(struct bpf_verifier_env *env, int regno,
- struct bpf_kfunc_call_arg_meta *meta)
+static int process_timer_kfunc(struct bpf_verifier_env *env, int regno,
+ struct bpf_kfunc_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
- struct bpf_map *map = reg->map_ptr;
- int err;
-
- err = check_map_field_pointer(env, regno, BPF_TASK_WORK);
- if (err)
- return err;
-
- if (meta->map.ptr) {
- verifier_bug(env, "Two map pointers in a bpf_task_work helper");
- return -EFAULT;
- }
- meta->map.uid = reg->map_uid;
- meta->map.ptr = map;
- return 0;
+ return process_timer_func(env, regno, &meta->map);
}
static int process_kptr_func(struct bpf_verifier_env *env, int regno,
struct bpf_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_reg_state *reg = reg_state(env, regno);
struct btf_field *kptr_field;
struct bpf_map *map_ptr;
struct btf_record *rec;
@@ -8652,7 +8715,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
return -EINVAL;
}
rec = map_ptr->record;
- meta->map_ptr = map_ptr;
+ meta->map.ptr = map_ptr;
}
if (!tnum_is_const(reg->var_off)) {
@@ -8709,7 +8772,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx,
enum bpf_arg_type arg_type, int clone_ref_obj_id)
{
- struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_reg_state *reg = reg_state(env, regno);
int err;
if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) {
@@ -8829,7 +8892,7 @@ static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg_idx,
static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx,
struct bpf_kfunc_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_reg_state *reg = reg_state(env, regno);
const struct btf_type *t;
int spi, err, i, nr_slots, btf_id;
@@ -8944,15 +9007,24 @@ static bool regs_exact(const struct bpf_reg_state *rold,
const struct bpf_reg_state *rcur,
struct bpf_idmap *idmap);
+/*
+ * Check if scalar registers are exact for the purpose of not widening.
+ * More lenient than regs_exact().
+ */
+static bool scalars_exact_for_widen(const struct bpf_reg_state *rold,
+ const struct bpf_reg_state *rcur)
+{
+ return !memcmp(rold, rcur, offsetof(struct bpf_reg_state, id));
+}
+
static void maybe_widen_reg(struct bpf_verifier_env *env,
- struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
- struct bpf_idmap *idmap)
+ struct bpf_reg_state *rold, struct bpf_reg_state *rcur)
{
if (rold->type != SCALAR_VALUE)
return;
if (rold->type != rcur->type)
return;
- if (rold->precise || rcur->precise || regs_exact(rold, rcur, idmap))
+ if (rold->precise || rcur->precise || scalars_exact_for_widen(rold, rcur))
return;
__mark_reg_unknown(env, rcur);
}
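A small stand-alone sketch of the idiom behind scalars_exact_for_widen() above: memcmp() over every field that precedes 'id', selected via offsetof(), so the identity-tracking fields are ignored. The register layout below is made up for illustration and is not the real struct bpf_reg_state.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct fake_reg {
	long long smin, smax;
	unsigned long long umin, umax;
	int id;			/* fields from here on are not compared */
	int ref_obj_id;
};

int main(void)
{
	struct fake_reg a = { .smin = 0, .smax = 10, .umax = 10, .id = 1 };
	struct fake_reg b = { .smin = 0, .smax = 10, .umax = 10, .id = 7 };

	/* identical bounds, differing ids: still "exact" for widening */
	printf("exact for widen: %d\n",
	       !memcmp(&a, &b, offsetof(struct fake_reg, id)));	/* prints 1 */
	return 0;
}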
@@ -8964,7 +9036,6 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env,
struct bpf_func_state *fold, *fcur;
int i, fr, num_slots;
- reset_idmap_scratch(env);
for (fr = old->curframe; fr >= 0; fr--) {
fold = old->frame[fr];
fcur = cur->frame[fr];
@@ -8972,8 +9043,7 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env,
for (i = 0; i < MAX_BPF_REG; i++)
maybe_widen_reg(env,
&fold->regs[i],
- &fcur->regs[i],
- &env->idmap_scratch);
+ &fcur->regs[i]);
num_slots = min(fold->allocated_stack / BPF_REG_SIZE,
fcur->allocated_stack / BPF_REG_SIZE);
@@ -8984,8 +9054,7 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env,
maybe_widen_reg(env,
&fold->stack[i].spilled_ptr,
- &fcur->stack[i].spilled_ptr,
- &env->idmap_scratch);
+ &fcur->stack[i].spilled_ptr);
}
}
return 0;
@@ -9159,13 +9228,13 @@ static int resolve_map_arg_type(struct bpf_verifier_env *env,
const struct bpf_call_arg_meta *meta,
enum bpf_arg_type *arg_type)
{
- if (!meta->map_ptr) {
+ if (!meta->map.ptr) {
/* kernel subsystem misconfigured verifier */
verifier_bug(env, "invalid map_ptr to access map->type");
return -EFAULT;
}
- switch (meta->map_ptr->map_type) {
+ switch (meta->map.ptr->map_type) {
case BPF_MAP_TYPE_SOCKMAP:
case BPF_MAP_TYPE_SOCKHASH:
if (*arg_type == ARG_PTR_TO_MAP_VALUE) {
@@ -9301,7 +9370,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
const u32 *arg_btf_id,
struct bpf_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_reg_state *reg = reg_state(env, regno);
enum bpf_reg_type expected, type = reg->type;
const struct bpf_reg_types *compatible;
int i, j;
@@ -9719,7 +9788,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
int insn_idx)
{
u32 regno = BPF_REG_1 + arg;
- struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_reg_state *reg = reg_state(env, regno);
enum bpf_arg_type arg_type = fn->arg_type[arg];
enum bpf_reg_type type = reg->type;
u32 *arg_btf_id = NULL;
@@ -9819,7 +9888,7 @@ skip_type_check:
switch (base_type(arg_type)) {
case ARG_CONST_MAP_PTR:
/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
- if (meta->map_ptr) {
+ if (meta->map.ptr) {
/* Use map_uid (which is unique id of inner map) to reject:
* inner_map1 = bpf_map_lookup_elem(outer_map, key1)
* inner_map2 = bpf_map_lookup_elem(outer_map, key2)
@@ -9832,23 +9901,23 @@ skip_type_check:
*
* Comparing map_ptr is enough to distinguish normal and outer maps.
*/
- if (meta->map_ptr != reg->map_ptr ||
- meta->map_uid != reg->map_uid) {
+ if (meta->map.ptr != reg->map_ptr ||
+ meta->map.uid != reg->map_uid) {
verbose(env,
"timer pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
- meta->map_uid, reg->map_uid);
+ meta->map.uid, reg->map_uid);
return -EINVAL;
}
}
- meta->map_ptr = reg->map_ptr;
- meta->map_uid = reg->map_uid;
+ meta->map.ptr = reg->map_ptr;
+ meta->map.uid = reg->map_uid;
break;
case ARG_PTR_TO_MAP_KEY:
/* bpf_map_xxx(..., map_ptr, ..., key) call:
* check that [key, key + map->key_size) are within
* stack limits and initialized
*/
- if (!meta->map_ptr) {
+ if (!meta->map.ptr) {
/* in function declaration map_ptr must come before
* map_key, so that it's verified and known before
* we have to check map_key here. Otherwise it means
@@ -9857,11 +9926,11 @@ skip_type_check:
verifier_bug(env, "invalid map_ptr to access map->key");
return -EFAULT;
}
- key_size = meta->map_ptr->key_size;
+ key_size = meta->map.ptr->key_size;
err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL);
if (err)
return err;
- if (can_elide_value_nullness(meta->map_ptr->map_type)) {
+ if (can_elide_value_nullness(meta->map.ptr->map_type)) {
err = get_constant_map_key(env, reg, key_size, &meta->const_map_key);
if (err < 0) {
meta->const_map_key = -1;
@@ -9879,13 +9948,13 @@ skip_type_check:
/* bpf_map_xxx(..., map_ptr, ..., value) call:
* check [value, value + map->value_size) validity
*/
- if (!meta->map_ptr) {
+ if (!meta->map.ptr) {
/* kernel subsystem misconfigured verifier */
verifier_bug(env, "invalid map_ptr to access map->value");
return -EFAULT;
}
meta->raw_mode = arg_type & MEM_UNINIT;
- err = check_helper_mem_access(env, regno, meta->map_ptr->value_size,
+ err = check_helper_mem_access(env, regno, meta->map.ptr->value_size,
arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ,
false, meta);
break;
@@ -9916,7 +9985,7 @@ skip_type_check:
}
break;
case ARG_PTR_TO_TIMER:
- err = process_timer_func(env, regno, meta);
+ err = process_timer_helper(env, regno, meta);
if (err)
return err;
break;
@@ -10354,10 +10423,27 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn)
return true;
}
-static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
+static bool check_mem_arg_rw_flag_ok(const struct bpf_func_proto *fn)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) {
+ enum bpf_arg_type arg_type = fn->arg_type[i];
+
+ if (base_type(arg_type) != ARG_PTR_TO_MEM)
+ continue;
+ if (!(arg_type & (MEM_WRITE | MEM_RDONLY)))
+ return false;
+ }
+
+ return true;
+}
+
+static int check_func_proto(const struct bpf_func_proto *fn)
{
return check_raw_mode_ok(fn) &&
check_arg_pair_ok(fn) &&
+ check_mem_arg_rw_flag_ok(fn) &&
check_btf_id_ok(fn) ? 0 : -EINVAL;
}
@@ -11206,7 +11292,7 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
int func_id, int insn_idx)
{
struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
- struct bpf_map *map = meta->map_ptr;
+ struct bpf_map *map = meta->map.ptr;
if (func_id != BPF_FUNC_tail_call &&
func_id != BPF_FUNC_map_lookup_elem &&
@@ -11239,11 +11325,11 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
}
if (!aux->map_ptr_state.map_ptr)
- bpf_map_ptr_store(aux, meta->map_ptr,
- !meta->map_ptr->bypass_spec_v1, false);
- else if (aux->map_ptr_state.map_ptr != meta->map_ptr)
- bpf_map_ptr_store(aux, meta->map_ptr,
- !meta->map_ptr->bypass_spec_v1, true);
+ bpf_map_ptr_store(aux, meta->map.ptr,
+ !meta->map.ptr->bypass_spec_v1, false);
+ else if (aux->map_ptr_state.map_ptr != meta->map.ptr)
+ bpf_map_ptr_store(aux, meta->map.ptr,
+ !meta->map.ptr->bypass_spec_v1, true);
return 0;
}
@@ -11252,8 +11338,8 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
int func_id, int insn_idx)
{
struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
- struct bpf_reg_state *regs = cur_regs(env), *reg;
- struct bpf_map *map = meta->map_ptr;
+ struct bpf_reg_state *reg;
+ struct bpf_map *map = meta->map.ptr;
u64 val, max;
int err;
@@ -11264,7 +11350,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
return -EINVAL;
}
- reg = &regs[BPF_REG_3];
+ reg = reg_state(env, BPF_REG_3);
val = reg->var_off.value;
max = map->max_entries;
@@ -11410,8 +11496,7 @@ static struct bpf_insn_aux_data *cur_aux(const struct bpf_verifier_env *env)
static bool loop_flag_is_zero(struct bpf_verifier_env *env)
{
- struct bpf_reg_state *regs = cur_regs(env);
- struct bpf_reg_state *reg = &regs[BPF_REG_4];
+ struct bpf_reg_state *reg = reg_state(env, BPF_REG_4);
bool reg_is_null = register_is_null(reg);
if (reg_is_null)
@@ -11471,6 +11556,7 @@ static inline bool in_sleepable_context(struct bpf_verifier_env *env)
{
return !env->cur_state->active_rcu_locks &&
!env->cur_state->active_preempt_locks &&
+ !env->cur_state->active_locks &&
!env->cur_state->active_irq_id &&
in_sleepable(env);
}
@@ -11529,7 +11615,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
memset(&meta, 0, sizeof(meta));
meta.pkt_access = fn->pkt_access;
- err = check_func_proto(fn, func_id);
+ err = check_func_proto(fn);
if (err) {
verifier_bug(env, "incorrect func proto %s#%d", func_id_name(func_id), func_id);
return err;
@@ -11809,22 +11895,22 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
* can check 'value_size' boundary of memory access
* to map element returned from bpf_map_lookup_elem()
*/
- if (meta.map_ptr == NULL) {
+ if (meta.map.ptr == NULL) {
verifier_bug(env, "unexpected null map_ptr");
return -EFAULT;
}
if (func_id == BPF_FUNC_map_lookup_elem &&
- can_elide_value_nullness(meta.map_ptr->map_type) &&
+ can_elide_value_nullness(meta.map.ptr->map_type) &&
meta.const_map_key >= 0 &&
- meta.const_map_key < meta.map_ptr->max_entries)
+ meta.const_map_key < meta.map.ptr->max_entries)
ret_flag &= ~PTR_MAYBE_NULL;
- regs[BPF_REG_0].map_ptr = meta.map_ptr;
- regs[BPF_REG_0].map_uid = meta.map_uid;
+ regs[BPF_REG_0].map_ptr = meta.map.ptr;
+ regs[BPF_REG_0].map_uid = meta.map.uid;
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
if (!type_may_be_null(ret_flag) &&
- btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) {
+ btf_record_has_field(meta.map.ptr->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) {
regs[BPF_REG_0].id = ++env->id_gen;
}
break;
@@ -11927,7 +12013,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (type_may_be_null(regs[BPF_REG_0].type))
regs[BPF_REG_0].id = ++env->id_gen;
- if (helper_multiple_ref_obj_use(func_id, meta.map_ptr)) {
+ if (helper_multiple_ref_obj_use(func_id, meta.map.ptr)) {
verifier_bug(env, "func %s#%d sets ref_obj_id more than once",
func_id_name(func_id), func_id);
return -EFAULT;
@@ -11939,7 +12025,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) {
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
- } else if (is_acquire_function(func_id, meta.map_ptr)) {
+ } else if (is_acquire_function(func_id, meta.map.ptr)) {
int id = acquire_reference(env, insn_idx);
if (id < 0)
@@ -11954,7 +12040,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (err)
return err;
- err = check_map_func_compatibility(env, meta.map_ptr, func_id);
+ err = check_map_func_compatibility(env, meta.map.ptr, func_id);
if (err)
return err;
@@ -12045,11 +12131,6 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta)
return meta->kfunc_flags & KF_RELEASE;
}
-static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta)
-{
- return (meta->kfunc_flags & KF_TRUSTED_ARGS) || is_kfunc_release(meta);
-}
-
static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta)
{
return meta->kfunc_flags & KF_SLEEPABLE;
@@ -12096,11 +12177,6 @@ static bool is_kfunc_arg_const_mem_size(const struct btf *btf,
return btf_param_match_suffix(btf, arg, "__szk");
}
-static bool is_kfunc_arg_optional(const struct btf *btf, const struct btf_param *arg)
-{
- return btf_param_match_suffix(btf, arg, "__opt");
-}
-
static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg)
{
return btf_param_match_suffix(btf, arg, "__k");
@@ -12146,11 +12222,6 @@ static bool is_kfunc_arg_irq_flag(const struct btf *btf, const struct btf_param
return btf_param_match_suffix(btf, arg, "__irq_flag");
}
-static bool is_kfunc_arg_prog(const struct btf *btf, const struct btf_param *arg)
-{
- return btf_param_match_suffix(btf, arg, "__prog");
-}
-
static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
const struct btf_param *arg,
const char *name)
@@ -12179,6 +12250,8 @@ enum {
KF_ARG_WORKQUEUE_ID,
KF_ARG_RES_SPIN_LOCK_ID,
KF_ARG_TASK_WORK_ID,
+ KF_ARG_PROG_AUX_ID,
+ KF_ARG_TIMER_ID
};
BTF_ID_LIST(kf_arg_btf_ids)
@@ -12190,6 +12263,8 @@ BTF_ID(struct, bpf_rb_node)
BTF_ID(struct, bpf_wq)
BTF_ID(struct, bpf_res_spin_lock)
BTF_ID(struct, bpf_task_work)
+BTF_ID(struct, bpf_prog_aux)
+BTF_ID(struct, bpf_timer)
static bool __is_kfunc_ptr_arg_type(const struct btf *btf,
const struct btf_param *arg, int type)
@@ -12233,6 +12308,11 @@ static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_par
return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID);
}
+static bool is_kfunc_arg_timer(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_TIMER_ID);
+}
+
static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg)
{
return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID);
@@ -12270,6 +12350,11 @@ static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf
return true;
}
+static bool is_kfunc_arg_prog_aux(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_PROG_AUX_ID);
+}
+
/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */
static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env,
const struct btf *btf,
@@ -12327,6 +12412,7 @@ enum kfunc_ptr_arg_type {
KF_ARG_PTR_TO_NULL,
KF_ARG_PTR_TO_CONST_STR,
KF_ARG_PTR_TO_MAP,
+ KF_ARG_PTR_TO_TIMER,
KF_ARG_PTR_TO_WORKQUEUE,
KF_ARG_PTR_TO_IRQ_FLAG,
KF_ARG_PTR_TO_RES_SPIN_LOCK,
@@ -12363,7 +12449,7 @@ enum special_kfunc_type {
KF_bpf_percpu_obj_new_impl,
KF_bpf_percpu_obj_drop_impl,
KF_bpf_throw,
- KF_bpf_wq_set_callback_impl,
+ KF_bpf_wq_set_callback,
KF_bpf_preempt_disable,
KF_bpf_preempt_enable,
KF_bpf_iter_css_task_new,
@@ -12383,8 +12469,14 @@ enum special_kfunc_type {
KF_bpf_dynptr_from_file,
KF_bpf_dynptr_file_discard,
KF___bpf_trap,
- KF_bpf_task_work_schedule_signal_impl,
- KF_bpf_task_work_schedule_resume_impl,
+ KF_bpf_task_work_schedule_signal,
+ KF_bpf_task_work_schedule_resume,
+ KF_bpf_arena_alloc_pages,
+ KF_bpf_arena_free_pages,
+ KF_bpf_arena_reserve_pages,
+ KF_bpf_session_is_return,
+ KF_bpf_stream_vprintk,
+ KF_bpf_stream_print_stack,
};
BTF_ID_LIST(special_kfunc_list)
@@ -12424,7 +12516,7 @@ BTF_ID(func, bpf_dynptr_clone)
BTF_ID(func, bpf_percpu_obj_new_impl)
BTF_ID(func, bpf_percpu_obj_drop_impl)
BTF_ID(func, bpf_throw)
-BTF_ID(func, bpf_wq_set_callback_impl)
+BTF_ID(func, bpf_wq_set_callback)
BTF_ID(func, bpf_preempt_disable)
BTF_ID(func, bpf_preempt_enable)
#ifdef CONFIG_CGROUPS
@@ -12457,13 +12549,19 @@ BTF_ID(func, bpf_res_spin_unlock_irqrestore)
BTF_ID(func, bpf_dynptr_from_file)
BTF_ID(func, bpf_dynptr_file_discard)
BTF_ID(func, __bpf_trap)
-BTF_ID(func, bpf_task_work_schedule_signal_impl)
-BTF_ID(func, bpf_task_work_schedule_resume_impl)
+BTF_ID(func, bpf_task_work_schedule_signal)
+BTF_ID(func, bpf_task_work_schedule_resume)
+BTF_ID(func, bpf_arena_alloc_pages)
+BTF_ID(func, bpf_arena_free_pages)
+BTF_ID(func, bpf_arena_reserve_pages)
+BTF_ID(func, bpf_session_is_return)
+BTF_ID(func, bpf_stream_vprintk)
+BTF_ID(func, bpf_stream_print_stack)
static bool is_task_work_add_kfunc(u32 func_id)
{
- return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal_impl] ||
- func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume_impl];
+ return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] ||
+ func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume];
}
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
@@ -12513,9 +12611,16 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
struct bpf_reg_state *reg = &regs[regno];
bool arg_mem_size = false;
- if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx])
+ if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
+ meta->func_id == special_kfunc_list[KF_bpf_session_is_return] ||
+ meta->func_id == special_kfunc_list[KF_bpf_session_cookie])
return KF_ARG_PTR_TO_CTX;
+ if (argno + 1 < nargs &&
+ (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
+ is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1])))
+ arg_mem_size = true;
+
/* In this function, we verify the kfunc's BTF as per the argument type,
* leaving the rest of the verification with respect to the register
* type to our caller. When a set of conditions hold in the BTF type of
@@ -12524,7 +12629,8 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
return KF_ARG_PTR_TO_CTX;
- if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg))
+ if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg) &&
+ !arg_mem_size)
return KF_ARG_PTR_TO_NULL;
if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno]))
@@ -12560,6 +12666,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_wq(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_WORKQUEUE;
+ if (is_kfunc_arg_timer(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_TIMER;
+
if (is_kfunc_arg_task_work(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_TASK_WORK;
@@ -12581,11 +12690,6 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
return KF_ARG_PTR_TO_CALLBACK;
- if (argno + 1 < nargs &&
- (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
- is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1])))
- arg_mem_size = true;
-
/* This is the catch all argument type of register types supported by
* check_helper_mem_access. However, we only allow when argument type is
* pointer to scalar, or struct composed (recursively) of scalars. When
@@ -12625,7 +12729,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
/* Enforce strict type matching for calls to kfuncs that are acquiring
* or releasing a reference, or are no-cast aliases. We do _not_
- * enforce strict matching for plain KF_TRUSTED_ARGS kfuncs by default,
+ * enforce strict matching for kfuncs by default,
* as we want to enable BPF programs to pass types that are bitwise
* equivalent without forcing them to explicitly cast with something
* like bpf_cast_to_kern_ctx().
@@ -12675,7 +12779,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
static int process_irq_flag(struct bpf_verifier_env *env, int regno,
struct bpf_kfunc_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_reg_state *reg = reg_state(env, regno);
int err, kfunc_class = IRQ_NATIVE_KFUNC;
bool irq_save;
@@ -12893,10 +12997,24 @@ static bool is_bpf_res_spin_lock_kfunc(u32 btf_id)
btf_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore];
}
+static bool is_bpf_arena_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_arena_alloc_pages] ||
+ btf_id == special_kfunc_list[KF_bpf_arena_free_pages] ||
+ btf_id == special_kfunc_list[KF_bpf_arena_reserve_pages];
+}
+
+static bool is_bpf_stream_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_stream_vprintk] ||
+ btf_id == special_kfunc_list[KF_bpf_stream_print_stack];
+}
+
static bool kfunc_spin_allowed(u32 btf_id)
{
return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id) ||
- is_bpf_res_spin_lock_kfunc(btf_id);
+ is_bpf_res_spin_lock_kfunc(btf_id) || is_bpf_arena_kfunc(btf_id) ||
+ is_bpf_stream_kfunc(btf_id);
}
static bool is_sync_callback_calling_kfunc(u32 btf_id)
@@ -12906,7 +13024,7 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id)
static bool is_async_callback_calling_kfunc(u32 btf_id)
{
- return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl] ||
+ return is_bpf_wq_set_callback_kfunc(btf_id) ||
is_task_work_add_kfunc(btf_id);
}
@@ -12916,9 +13034,9 @@ static bool is_bpf_throw_kfunc(struct bpf_insn *insn)
insn->imm == special_kfunc_list[KF_bpf_throw];
}
-static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id)
+static bool is_bpf_wq_set_callback_kfunc(u32 btf_id)
{
- return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl];
+ return btf_id == special_kfunc_list[KF_bpf_wq_set_callback];
}
static bool is_callback_calling_kfunc(u32 btf_id)
@@ -13192,8 +13310,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (is_kfunc_arg_ignore(btf, &args[i]))
continue;
- if (is_kfunc_arg_prog(btf, &args[i])) {
- /* Used to reject repeated use of __prog. */
+ if (is_kfunc_arg_prog_aux(btf, &args[i])) {
+ /* Reject repeated use of bpf_prog_aux */
if (meta->arg_prog) {
verifier_bug(env, "Only 1 prog->aux argument supported per-kfunc");
return -EFAULT;
@@ -13254,9 +13372,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return -EINVAL;
}
- if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) &&
- (register_is_null(reg) || type_may_be_null(reg->type)) &&
- !is_kfunc_arg_nullable(meta->btf, &args[i])) {
+ if ((register_is_null(reg) || type_may_be_null(reg->type)) &&
+ !is_kfunc_arg_nullable(meta->btf, &args[i])) {
verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
return -EACCES;
}
@@ -13321,9 +13438,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
fallthrough;
case KF_ARG_PTR_TO_ALLOC_BTF_ID:
case KF_ARG_PTR_TO_BTF_ID:
- if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
- break;
-
if (!is_trusted_reg(reg)) {
if (!is_kfunc_rcu(meta)) {
verbose(env, "R%d must be referenced or trusted\n", regno);
@@ -13348,6 +13462,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
case KF_ARG_PTR_TO_CONST_STR:
case KF_ARG_PTR_TO_WORKQUEUE:
+ case KF_ARG_PTR_TO_TIMER:
case KF_ARG_PTR_TO_TASK_WORK:
case KF_ARG_PTR_TO_IRQ_FLAG:
case KF_ARG_PTR_TO_RES_SPIN_LOCK:
@@ -13575,7 +13690,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
struct bpf_reg_state *size_reg = &regs[regno + 1];
const struct btf_param *size_arg = &args[i + 1];
- if (!register_is_null(buff_reg) || !is_kfunc_arg_optional(meta->btf, buff_arg)) {
+ if (!register_is_null(buff_reg) || !is_kfunc_arg_nullable(meta->btf, buff_arg)) {
ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);
if (ret < 0) {
verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
@@ -13643,7 +13758,16 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
verbose(env, "arg#%d doesn't point to a map value\n", i);
return -EINVAL;
}
- ret = process_wq_func(env, regno, meta);
+ ret = check_map_field_pointer(env, regno, BPF_WORKQUEUE, &meta->map);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_TIMER:
+ if (reg->type != PTR_TO_MAP_VALUE) {
+ verbose(env, "arg#%d doesn't point to a map value\n", i);
+ return -EINVAL;
+ }
+ ret = process_timer_kfunc(env, regno, meta);
if (ret < 0)
return ret;
break;
@@ -13652,7 +13776,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
verbose(env, "arg#%d doesn't point to a map value\n", i);
return -EINVAL;
}
- ret = process_task_work_func(env, regno, meta);
+ ret = check_map_field_pointer(env, regno, BPF_TASK_WORK, &meta->map);
if (ret < 0)
return ret;
break;
@@ -13699,44 +13823,28 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return 0;
}
-static int fetch_kfunc_meta(struct bpf_verifier_env *env,
- struct bpf_insn *insn,
- struct bpf_kfunc_call_arg_meta *meta,
- const char **kfunc_name)
+static int fetch_kfunc_arg_meta(struct bpf_verifier_env *env,
+ s32 func_id,
+ s16 offset,
+ struct bpf_kfunc_call_arg_meta *meta)
{
- const struct btf_type *func, *func_proto;
- u32 func_id, *kfunc_flags;
- const char *func_name;
- struct btf *desc_btf;
-
- if (kfunc_name)
- *kfunc_name = NULL;
+ struct bpf_kfunc_meta kfunc;
+ int err;
- if (!insn->imm)
- return -EINVAL;
+ err = fetch_kfunc_meta(env, func_id, offset, &kfunc);
+ if (err)
+ return err;
- desc_btf = find_kfunc_desc_btf(env, insn->off);
- if (IS_ERR(desc_btf))
- return PTR_ERR(desc_btf);
+ memset(meta, 0, sizeof(*meta));
+ meta->btf = kfunc.btf;
+ meta->func_id = kfunc.id;
+ meta->func_proto = kfunc.proto;
+ meta->func_name = kfunc.name;
- func_id = insn->imm;
- func = btf_type_by_id(desc_btf, func_id);
- func_name = btf_name_by_offset(desc_btf, func->name_off);
- if (kfunc_name)
- *kfunc_name = func_name;
- func_proto = btf_type_by_id(desc_btf, func->type);
-
- kfunc_flags = btf_kfunc_id_set_contains(desc_btf, func_id, env->prog);
- if (!kfunc_flags) {
+ if (!kfunc.flags || !btf_kfunc_is_allowed(kfunc.btf, kfunc.id, env->prog))
return -EACCES;
- }
- memset(meta, 0, sizeof(*meta));
- meta->btf = desc_btf;
- meta->func_id = func_id;
- meta->kfunc_flags = *kfunc_flags;
- meta->func_proto = func_proto;
- meta->func_name = func_name;
+ meta->kfunc_flags = *kfunc.flags;
return 0;
}
@@ -13941,12 +14049,13 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (!insn->imm)
return 0;
- err = fetch_kfunc_meta(env, insn, &meta, &func_name);
- if (err == -EACCES && func_name)
- verbose(env, "calling kernel function %s is not allowed\n", func_name);
+ err = fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta);
+ if (err == -EACCES && meta.func_name)
+ verbose(env, "calling kernel function %s is not allowed\n", meta.func_name);
if (err)
return err;
desc_btf = meta.btf;
+ func_name = meta.func_name;
insn_aux = &env->insn_aux_data[insn_idx];
insn_aux->is_iter_next = is_iter_next_kfunc(&meta);
@@ -14016,7 +14125,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
meta.r0_rdonly = false;
}
- if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) {
+ if (is_bpf_wq_set_callback_kfunc(meta.func_id)) {
err = push_callback_call(env, insn, insn_idx, meta.subprogno,
set_timer_callback_state);
if (err) {
@@ -14154,8 +14263,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}
- for (i = 0; i < CALLER_SAVED_REGS; i++)
- mark_reg_not_init(env, regs, caller_saved[i]);
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
+ u32 regno = caller_saved[i];
+
+ mark_reg_not_init(env, regs, regno);
+ regs[regno].subreg_def = DEF_NOT_SUBREG;
+ }
/* Check return type */
t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL);
@@ -14220,26 +14333,38 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (is_kfunc_rcu_protected(&meta))
regs[BPF_REG_0].type |= MEM_RCU;
} else {
- mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].btf = desc_btf;
- regs[BPF_REG_0].type = PTR_TO_BTF_ID;
- regs[BPF_REG_0].btf_id = ptr_type_id;
+ enum bpf_reg_type type = PTR_TO_BTF_ID;
if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache])
- regs[BPF_REG_0].type |= PTR_UNTRUSTED;
- else if (is_kfunc_rcu_protected(&meta))
- regs[BPF_REG_0].type |= MEM_RCU;
-
- if (is_iter_next_kfunc(&meta)) {
- struct bpf_reg_state *cur_iter;
-
- cur_iter = get_iter_from_state(env->cur_state, &meta);
-
- if (cur_iter->type & MEM_RCU) /* KF_RCU_PROTECTED */
- regs[BPF_REG_0].type |= MEM_RCU;
- else
- regs[BPF_REG_0].type |= PTR_TRUSTED;
+ type |= PTR_UNTRUSTED;
+ else if (is_kfunc_rcu_protected(&meta) ||
+ (is_iter_next_kfunc(&meta) &&
+ (get_iter_from_state(env->cur_state, &meta)
+ ->type & MEM_RCU))) {
+ /*
+ * If the iterator's constructor (the _new
+ * function, e.g. bpf_iter_task_new) has been
+ * annotated with the BPF kfunc flag
+ * KF_RCU_PROTECTED and was called within an RCU
+ * read-side critical section, also propagate
+ * the MEM_RCU flag to the pointer returned from
+ * the iterator's next function (e.g.,
+ * bpf_iter_task_next).
+ */
+ type |= MEM_RCU;
+ } else {
+ /*
+ * Any PTR_TO_BTF_ID that is returned from a BPF
+ * kfunc should by default be treated as
+ * implicitly trusted.
+ */
+ type |= PTR_TRUSTED;
}
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].type = type;
+ regs[BPF_REG_0].btf_id = ptr_type_id;
}
if (is_kfunc_ret_null(&meta)) {
@@ -14295,6 +14420,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return err;
}
+ if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie])
+ env->prog->call_session_cookie = true;
+
return 0;
}
@@ -15081,6 +15209,252 @@ static void scalar_min_max_mul(struct bpf_reg_state *dst_reg,
}
}
+static void scalar32_min_max_udiv(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ u32 *dst_umin = &dst_reg->u32_min_value;
+ u32 *dst_umax = &dst_reg->u32_max_value;
+ u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */
+
+ *dst_umin = *dst_umin / src_val;
+ *dst_umax = *dst_umax / src_val;
+
+ /* Reset other ranges/tnum to unbounded/unknown. */
+ dst_reg->s32_min_value = S32_MIN;
+ dst_reg->s32_max_value = S32_MAX;
+ reset_reg64_and_tnum(dst_reg);
+}
+
+static void scalar_min_max_udiv(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ u64 *dst_umin = &dst_reg->umin_value;
+ u64 *dst_umax = &dst_reg->umax_value;
+ u64 src_val = src_reg->umin_value; /* non-zero, const divisor */
+
+ *dst_umin = div64_u64(*dst_umin, src_val);
+ *dst_umax = div64_u64(*dst_umax, src_val);
+
+ /* Reset other ranges/tnum to unbounded/unknown. */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ reset_reg32_and_tnum(dst_reg);
+}
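A user-space sketch of why dividing both unsigned bounds by the constant divisor is sound, as done by scalar(32)_min_max_udiv() above: unsigned division by a fixed non-zero value is monotonic, so the endpoint results are the new endpoints. Sample values are made up.

#include <stdio.h>

int main(void)
{
	unsigned long long umin = 100, umax = 1000, c = 7;	/* samples */

	/* x / c never decreases as x grows, so endpoints stay endpoints */
	printf("[%llu, %llu] / %llu -> [%llu, %llu]\n",
	       umin, umax, c, umin / c, umax / c);	/* [14, 142] */
	return 0;
}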
+
+static void scalar32_min_max_sdiv(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ s32 *dst_smin = &dst_reg->s32_min_value;
+ s32 *dst_smax = &dst_reg->s32_max_value;
+ s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */
+ s32 res1, res2;
+
+ /* BPF div specification: S32_MIN / -1 = S32_MIN */
+ if (*dst_smin == S32_MIN && src_val == -1) {
+ /*
+ * If the dividend range contains more than just S32_MIN,
+ * we cannot precisely track the result, so it becomes unbounded.
+ * e.g., [S32_MIN, S32_MIN+10]/(-1),
+ * = {S32_MIN} U [-(S32_MIN+10), -(S32_MIN+1)]
+ * = {S32_MIN} U [S32_MAX-9, S32_MAX] = [S32_MIN, S32_MAX]
+ * Otherwise (if dividend is exactly S32_MIN), result remains S32_MIN.
+ */
+ if (*dst_smax != S32_MIN) {
+ *dst_smin = S32_MIN;
+ *dst_smax = S32_MAX;
+ }
+ goto reset;
+ }
+
+ res1 = *dst_smin / src_val;
+ res2 = *dst_smax / src_val;
+ *dst_smin = min(res1, res2);
+ *dst_smax = max(res1, res2);
+
+reset:
+ /* Reset other ranges/tnum to unbounded/unknown. */
+ dst_reg->u32_min_value = 0;
+ dst_reg->u32_max_value = U32_MAX;
+ reset_reg64_and_tnum(dst_reg);
+}
+
+static void scalar_min_max_sdiv(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ s64 *dst_smin = &dst_reg->smin_value;
+ s64 *dst_smax = &dst_reg->smax_value;
+ s64 src_val = src_reg->smin_value; /* non-zero, const divisor */
+ s64 res1, res2;
+
+ /* BPF div specification: S64_MIN / -1 = S64_MIN */
+ if (*dst_smin == S64_MIN && src_val == -1) {
+ /*
+ * If the dividend range contains more than just S64_MIN,
+ * we cannot precisely track the result, so it becomes unbounded.
+ * e.g., [S64_MIN, S64_MIN+10]/(-1),
+ * = {S64_MIN} U [-(S64_MIN+10), -(S64_MIN+1)]
+ * = {S64_MIN} U [S64_MAX-9, S64_MAX] = [S64_MIN, S64_MAX]
+ * Otherwise (if dividend is exactly S64_MIN), result remains S64_MIN.
+ */
+ if (*dst_smax != S64_MIN) {
+ *dst_smin = S64_MIN;
+ *dst_smax = S64_MAX;
+ }
+ goto reset;
+ }
+
+ res1 = div64_s64(*dst_smin, src_val);
+ res2 = div64_s64(*dst_smax, src_val);
+ *dst_smin = min(res1, res2);
+ *dst_smax = max(res1, res2);
+
+reset:
+ /* Reset other ranges/tnum to unbounded/unknown. */
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ reset_reg32_and_tnum(dst_reg);
+}
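A user-space sketch of the general signed case handled above (excluding the S32_MIN/S64_MIN divided by -1 special case, which is treated separately because the negation is not representable): dividing by a negative constant flips the ordering of the endpoint results, hence the min()/max() of the two. Sample values are made up.

#include <stdio.h>

int main(void)
{
	int smin = -20, smax = 50, c = -3;	/* sample range and divisor */
	int r1 = smin / c;	/* -20 / -3 =   6 (C truncates toward zero) */
	int r2 = smax / c;	/*  50 / -3 = -16 */

	printf("[%d, %d] / %d -> [%d, %d]\n", smin, smax, c,
	       r1 < r2 ? r1 : r2, r1 < r2 ? r2 : r1);	/* [-16, 6] */
	return 0;
}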
+
+static void scalar32_min_max_umod(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ u32 *dst_umin = &dst_reg->u32_min_value;
+ u32 *dst_umax = &dst_reg->u32_max_value;
+ u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */
+ u32 res_max = src_val - 1;
+
+ /*
+ * If dst_umax <= res_max, the result remains unchanged.
+ * e.g., [2, 5] % 10 = [2, 5].
+ */
+ if (*dst_umax <= res_max)
+ return;
+
+ *dst_umin = 0;
+ *dst_umax = min(*dst_umax, res_max);
+
+ /* Reset other ranges/tnum to unbounded/unknown. */
+ dst_reg->s32_min_value = S32_MIN;
+ dst_reg->s32_max_value = S32_MAX;
+ reset_reg64_and_tnum(dst_reg);
+}
+
+static void scalar_min_max_umod(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ u64 *dst_umin = &dst_reg->umin_value;
+ u64 *dst_umax = &dst_reg->umax_value;
+ u64 src_val = src_reg->umin_value; /* non-zero, const divisor */
+ u64 res_max = src_val - 1;
+
+ /*
+ * If dst_umax <= res_max, the result remains unchanged.
+ * e.g., [2, 5] % 10 = [2, 5].
+ */
+ if (*dst_umax <= res_max)
+ return;
+
+ *dst_umin = 0;
+ *dst_umax = min(*dst_umax, res_max);
+
+ /* Reset other ranges/tnum to unbounded/unknown. */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ reset_reg32_and_tnum(dst_reg);
+}
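A user-space sketch of the unsigned modulo rule implemented above: the remainder is bounded by divisor - 1, and a dividend range that already fits below the divisor is left unchanged. Sample values are made up.

#include <stdio.h>

int main(void)
{
	unsigned long long lo = 0, hi = 100, c = 7;	/* sample range */
	unsigned long long res_max = c - 1;

	if (hi <= res_max)
		/* e.g. [2, 5] % 10: every value is its own remainder */
		printf("[%llu, %llu] %% %llu -> unchanged\n", lo, hi, c);
	else
		/* remainder can wrap, so only [0, c - 1] is guaranteed */
		printf("[%llu, %llu] %% %llu -> [0, %llu]\n",
		       lo, hi, c, res_max);	/* [0, 6] */
	return 0;
}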
+
+static void scalar32_min_max_smod(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ s32 *dst_smin = &dst_reg->s32_min_value;
+ s32 *dst_smax = &dst_reg->s32_max_value;
+ s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */
+
+ /*
+ * Safe absolute value calculation:
+ * If src_val == S32_MIN (-2147483648), src_abs becomes 2147483648.
+ * Use an unsigned integer here to avoid overflow.
+ */
+ u32 src_abs = (src_val > 0) ? (u32)src_val : -(u32)src_val;
+
+ /*
+ * Calculate the maximum possible absolute value of the result.
+ * Even if src_abs is 2147483648 (S32_MIN), subtracting 1 gives
+ * 2147483647 (S32_MAX), which fits perfectly in s32.
+ */
+ s32 res_max_abs = src_abs - 1;
+
+ /*
+ * If the dividend is already within the result range,
+ * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5].
+ */
+ if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs)
+ return;
+
+ /* General case: result has the same sign as the dividend. */
+ if (*dst_smin >= 0) {
+ *dst_smin = 0;
+ *dst_smax = min(*dst_smax, res_max_abs);
+ } else if (*dst_smax <= 0) {
+ *dst_smax = 0;
+ *dst_smin = max(*dst_smin, -res_max_abs);
+ } else {
+ *dst_smin = -res_max_abs;
+ *dst_smax = res_max_abs;
+ }
+
+ /* Reset other ranges/tnum to unbounded/unknown. */
+ dst_reg->u32_min_value = 0;
+ dst_reg->u32_max_value = U32_MAX;
+ reset_reg64_and_tnum(dst_reg);
+}
+
+static void scalar_min_max_smod(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ s64 *dst_smin = &dst_reg->smin_value;
+ s64 *dst_smax = &dst_reg->smax_value;
+ s64 src_val = src_reg->smin_value; /* non-zero, const divisor */
+
+ /*
+ * Safe absolute value calculation:
+ * If src_val == S64_MIN (-2^63), src_abs becomes 2^63.
+ * Use an unsigned integer here to avoid overflow.
+ */
+ u64 src_abs = (src_val > 0) ? (u64)src_val : -(u64)src_val;
+
+ /*
+ * Calculate the maximum possible absolute value of the result.
+ * Even if src_abs is 2^63 (S64_MIN), subtracting 1 gives
+ * 2^63 - 1 (S64_MAX), which fits perfectly in s64.
+ */
+ s64 res_max_abs = src_abs - 1;
+
+ /*
+ * If the dividend is already within the result range,
+ * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5].
+ */
+ if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs)
+ return;
+
+ /* General case: result has the same sign as the dividend. */
+ if (*dst_smin >= 0) {
+ *dst_smin = 0;
+ *dst_smax = min(*dst_smax, res_max_abs);
+ } else if (*dst_smax <= 0) {
+ *dst_smax = 0;
+ *dst_smin = max(*dst_smin, -res_max_abs);
+ } else {
+ *dst_smin = -res_max_abs;
+ *dst_smax = res_max_abs;
+ }
+
+ /* Reset other ranges/tnum to unbounded/unknown. */
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ reset_reg32_and_tnum(dst_reg);
+}
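A user-space sketch of the signed modulo rule above: the result has the sign of the dividend and magnitude below |divisor|. The kernel code computes the absolute value in unsigned arithmetic to cope with S32_MIN/S64_MIN; the sample below simply uses llabs() on a small made-up divisor.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	long long smin = -100, smax = 30, c = 7;	/* sample range */
	long long bound = llabs(c) - 1;

	if (smin >= -bound && smax <= bound)
		printf("range already fits, unchanged\n");
	else if (smin >= 0)
		printf("-> [0, %lld]\n", smax < bound ? smax : bound);
	else if (smax <= 0)
		printf("-> [%lld, 0]\n", smin > -bound ? smin : -bound);
	else
		printf("-> [%lld, %lld]\n", -bound, bound);	/* [-6, 6] */
	return 0;
}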
+
static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
@@ -15305,21 +15679,17 @@ static void __scalar64_min_max_lsh(struct bpf_reg_state *dst_reg,
u64 umin_val, u64 umax_val)
{
/* Special case <<32 because it is a common compiler pattern to sign
- * extend subreg by doing <<32 s>>32. In this case if 32bit bounds are
- * positive we know this shift will also be positive so we can track
- * bounds correctly. Otherwise we lose all sign bit information except
- * what we can pick up from var_off. Perhaps we can generalize this
- * later to shifts of any length.
+ * extend subreg by doing <<32 s>>32. smin/smax assignments are correct
+ * because s32 bounds don't flip sign when shifting to the left by
+ * 32bits.
*/
- if (umin_val == 32 && umax_val == 32 && dst_reg->s32_max_value >= 0)
+ if (umin_val == 32 && umax_val == 32) {
dst_reg->smax_value = (s64)dst_reg->s32_max_value << 32;
- else
- dst_reg->smax_value = S64_MAX;
-
- if (umin_val == 32 && umax_val == 32 && dst_reg->s32_min_value >= 0)
dst_reg->smin_value = (s64)dst_reg->s32_min_value << 32;
- else
+ } else {
+ dst_reg->smax_value = S64_MAX;
dst_reg->smin_value = S64_MIN;
+ }
/* If we might shift our top bit out, then we know nothing */
if (dst_reg->umax_value > 1ULL << (63 - umax_val)) {
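A user-space sketch of the relaxed <<32 special case above: moving 32-bit signed bounds into the upper half (modelled here as a multiplication by 2^32 to stay within defined C behaviour) preserves their ordering whether or not they are negative. Sample bounds are made up.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int32_t s32_min = -5, s32_max = 10;	/* sample subreg bounds */
	int64_t lo = (int64_t)s32_min * (1LL << 32);
	int64_t hi = (int64_t)s32_max * (1LL << 32);

	/* ordering of the signed bounds survives the move to the upper half */
	printf("smin=%lld smax=%lld ordered=%d\n",
	       (long long)lo, (long long)hi, lo <= hi);
	return 0;
}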
@@ -15462,6 +15832,48 @@ static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg,
__update_reg_bounds(dst_reg);
}
+static void scalar_byte_swap(struct bpf_reg_state *dst_reg, struct bpf_insn *insn)
+{
+ /*
+ * Byte swap operation - update var_off using tnum_bswap.
+ * Three cases:
+ * 1. bswap(16|32|64): opcode=0xd7 (BPF_END | BPF_ALU64 | BPF_TO_LE)
+ * unconditional swap
+ * 2. to_le(16|32|64): opcode=0xd4 (BPF_END | BPF_ALU | BPF_TO_LE)
+ * swap on big-endian, truncation or no-op on little-endian
+ * 3. to_be(16|32|64): opcode=0xdc (BPF_END | BPF_ALU | BPF_TO_BE)
+ * swap on little-endian, truncation or no-op on big-endian
+ */
+
+ bool alu64 = BPF_CLASS(insn->code) == BPF_ALU64;
+ bool to_le = BPF_SRC(insn->code) == BPF_TO_LE;
+ bool is_big_endian;
+#ifdef CONFIG_CPU_BIG_ENDIAN
+ is_big_endian = true;
+#else
+ is_big_endian = false;
+#endif
+ /* Apply bswap for alu64, or when the requested endianness differs from the host's */
+ bool need_bswap = alu64 || (to_le == is_big_endian);
+
+ if (need_bswap) {
+ if (insn->imm == 16)
+ dst_reg->var_off = tnum_bswap16(dst_reg->var_off);
+ else if (insn->imm == 32)
+ dst_reg->var_off = tnum_bswap32(dst_reg->var_off);
+ else if (insn->imm == 64)
+ dst_reg->var_off = tnum_bswap64(dst_reg->var_off);
+ /*
+ * Byteswap scrambles the range, so we must reset bounds.
+ * Bounds will be re-derived from the new tnum later.
+ */
+ __mark_reg_unbounded(dst_reg);
+ }
+ /* For bswap16/32, truncate dst register to match the swapped size */
+ if (insn->imm == 16 || insn->imm == 32)
+ coerce_reg_to_size(dst_reg, insn->imm / 8);
+}
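A user-space sketch of the need_bswap decision above: the BPF_ALU64 bswap encoding always swaps, while the to_le/to_be encodings only swap when the requested endianness differs from the host's. The host endianness below is assumed little-endian purely for illustration.

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	bool host_big_endian = false;	/* assumed little-endian host */
	struct { bool alu64, to_le; const char *name; } ops[] = {
		{ true,  true,  "bswap16/32/64" },
		{ false, true,  "to_le16/32/64" },
		{ false, false, "to_be16/32/64" },
	};

	for (int i = 0; i < 3; i++) {
		bool need_bswap = ops[i].alu64 ||
				  (ops[i].to_le == host_big_endian);

		printf("%s: %s\n", ops[i].name,
		       need_bswap ? "swap bytes" : "truncate/no-op");
	}
	return 0;
}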
+
static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn,
const struct bpf_reg_state *src_reg)
{
@@ -15488,8 +15900,17 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn,
case BPF_XOR:
case BPF_OR:
case BPF_MUL:
+ case BPF_END:
return true;
+ /*
+ * The division and modulo range is only safe to compute when the
+ * divisor is a constant.
+ */
+ case BPF_DIV:
+ case BPF_MOD:
+ return src_is_const;
+
/* Shift operators range is only computable if shift dimension operand
* is a constant. Shifts greater than 31 or 63 are undefined. This
* includes shifts by a negative number.
@@ -15503,6 +15924,35 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn,
}
}
+static int maybe_fork_scalars(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ struct bpf_reg_state *dst_reg)
+{
+ struct bpf_verifier_state *branch;
+ struct bpf_reg_state *regs;
+ bool alu32;
+
+ if (dst_reg->smin_value == -1 && dst_reg->smax_value == 0)
+ alu32 = false;
+ else if (dst_reg->s32_min_value == -1 && dst_reg->s32_max_value == 0)
+ alu32 = true;
+ else
+ return 0;
+
+ branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false);
+ if (IS_ERR(branch))
+ return PTR_ERR(branch);
+
+ regs = branch->frame[branch->curframe]->regs;
+ if (alu32) {
+ __mark_reg32_known(&regs[insn->dst_reg], 0);
+ __mark_reg32_known(dst_reg, -1ull);
+ } else {
+ __mark_reg_known(&regs[insn->dst_reg], 0);
+ __mark_reg_known(dst_reg, -1ull);
+ }
+ return 0;
+}
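A user-space sketch of why forking pays off: when a register is known to be either -1 or 0 (all ones or all zeros), AND/OR with a constant has exactly two possible outcomes, so maybe_fork_scalars() above pins the register to 0 in the pushed branch and to -1 in the current one. The constant below is made up.

#include <stdio.h>

int main(void)
{
	long long vals[2] = { -1, 0 };	/* the only values the reg can hold */
	long long c = 0x1234;		/* sample constant operand */

	for (int i = 0; i < 2; i++)
		printf("x=%2lld: x & C = %lld, x | C = %lld\n",
		       vals[i], vals[i] & c, vals[i] | c);
	return 0;
}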
+
/* WARNING: This function does calculations on 64-bit values, but the actual
* execution may occur on 32-bit values. Therefore, things like bitshifts
* need extra checks in the 32-bit case.
@@ -15513,6 +15963,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
struct bpf_reg_state src_reg)
{
u8 opcode = BPF_OP(insn->code);
+ s16 off = insn->off;
bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64);
int ret;
@@ -15564,12 +16015,54 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
scalar32_min_max_mul(dst_reg, &src_reg);
scalar_min_max_mul(dst_reg, &src_reg);
break;
+ case BPF_DIV:
+ /* BPF div specification: x / 0 = 0 */
+ if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) {
+ ___mark_reg_known(dst_reg, 0);
+ break;
+ }
+ if (alu32)
+ if (off == 1)
+ scalar32_min_max_sdiv(dst_reg, &src_reg);
+ else
+ scalar32_min_max_udiv(dst_reg, &src_reg);
+ else
+ if (off == 1)
+ scalar_min_max_sdiv(dst_reg, &src_reg);
+ else
+ scalar_min_max_udiv(dst_reg, &src_reg);
+ break;
+ case BPF_MOD:
+ /* BPF mod specification: x % 0 = x */
+ if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0))
+ break;
+ if (alu32)
+ if (off == 1)
+ scalar32_min_max_smod(dst_reg, &src_reg);
+ else
+ scalar32_min_max_umod(dst_reg, &src_reg);
+ else
+ if (off == 1)
+ scalar_min_max_smod(dst_reg, &src_reg);
+ else
+ scalar_min_max_umod(dst_reg, &src_reg);
+ break;
case BPF_AND:
+ if (tnum_is_const(src_reg.var_off)) {
+ ret = maybe_fork_scalars(env, insn, dst_reg);
+ if (ret)
+ return ret;
+ }
dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off);
scalar32_min_max_and(dst_reg, &src_reg);
scalar_min_max_and(dst_reg, &src_reg);
break;
case BPF_OR:
+ if (tnum_is_const(src_reg.var_off)) {
+ ret = maybe_fork_scalars(env, insn, dst_reg);
+ if (ret)
+ return ret;
+ }
dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off);
scalar32_min_max_or(dst_reg, &src_reg);
scalar_min_max_or(dst_reg, &src_reg);
@@ -15597,12 +16090,23 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
else
scalar_min_max_arsh(dst_reg, &src_reg);
break;
+ case BPF_END:
+ scalar_byte_swap(dst_reg, insn);
+ break;
default:
break;
}
- /* ALU32 ops are zero extended into 64bit register */
- if (alu32)
+ /*
+ * ALU32 ops are zero extended into 64bit register.
+ *
+ * BPF_END is already handled inside the helper (truncation),
+ * so skip zext here to avoid unexpected zero extension.
+ * e.g., le64: opcode=(BPF_END|BPF_ALU|BPF_TO_LE), imm=0x40
+ * This is a 64bit byte swap operation with alu32==true,
+ * but we should not zero extend the result.
+ */
+ if (alu32 && opcode != BPF_END)
zext_32_to_64(dst_reg);
reg_bounds_sync(dst_reg);
return 0;
@@ -15705,6 +16209,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
verbose(env, "verifier internal error: no src_reg\n");
return -EFAULT;
}
+ /*
+ * For alu32 linked register tracking, we need to check dst_reg's
+ * umax_value before the ALU operation. After adjust_scalar_min_max_vals(),
+ * alu32 ops will have zero-extended the result, making umax_value <= U32_MAX.
+ */
+ u64 dst_umax = dst_reg->umax_value;
+
err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
if (err)
return err;
@@ -15714,26 +16225,44 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
* r1 += 0x1
* if r2 < 1000 goto ...
* use r1 in memory access
- * So for 64-bit alu remember constant delta between r2 and r1 and
- * update r1 after 'if' condition.
+ * So remember constant delta between r2 and r1 and update r1 after
+ * 'if' condition.
*/
if (env->bpf_capable &&
- BPF_OP(insn->code) == BPF_ADD && !alu32 &&
- dst_reg->id && is_reg_const(src_reg, false)) {
- u64 val = reg_const_value(src_reg, false);
+ (BPF_OP(insn->code) == BPF_ADD || BPF_OP(insn->code) == BPF_SUB) &&
+ dst_reg->id && is_reg_const(src_reg, alu32)) {
+ u64 val = reg_const_value(src_reg, alu32);
+ s32 off;
+
+ if (!alu32 && ((s64)val < S32_MIN || (s64)val > S32_MAX))
+ goto clear_id;
+
+ if (alu32 && (dst_umax > U32_MAX))
+ goto clear_id;
- if ((dst_reg->id & BPF_ADD_CONST) ||
- /* prevent overflow in sync_linked_regs() later */
- val > (u32)S32_MAX) {
+ off = (s32)val;
+
+ if (BPF_OP(insn->code) == BPF_SUB) {
+ /* Negating S32_MIN would overflow */
+ if (off == S32_MIN)
+ goto clear_id;
+ off = -off;
+ }
+
+ if (dst_reg->id & BPF_ADD_CONST) {
/*
* If the register already went through rX += val
* we cannot accumulate another val into rx->off.
*/
+clear_id:
dst_reg->off = 0;
dst_reg->id = 0;
} else {
- dst_reg->id |= BPF_ADD_CONST;
- dst_reg->off = val;
+ if (alu32)
+ dst_reg->id |= BPF_ADD_CONST32;
+ else
+ dst_reg->id |= BPF_ADD_CONST64;
+ dst_reg->off = off;
}
} else {
/*
@@ -15782,7 +16311,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check dest operand */
- if (opcode == BPF_NEG &&
+ if ((opcode == BPF_NEG || opcode == BPF_END) &&
regs[insn->dst_reg].type == SCALAR_VALUE) {
err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
err = err ?: adjust_scalar_min_max_vals(env, insn,
@@ -16802,8 +17331,8 @@ static void collect_linked_regs(struct bpf_verifier_state *vstate, u32 id,
/* For all R in linked_regs, copy known_reg range into R
* if R->id == known_reg->id.
*/
-static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_state *known_reg,
- struct linked_regs *linked_regs)
+static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_state *vstate,
+ struct bpf_reg_state *known_reg, struct linked_regs *linked_regs)
{
struct bpf_reg_state fake_reg;
struct bpf_reg_state *reg;
@@ -16827,23 +17356,32 @@ static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_s
} else {
s32 saved_subreg_def = reg->subreg_def;
s32 saved_off = reg->off;
+ u32 saved_id = reg->id;
fake_reg.type = SCALAR_VALUE;
- __mark_reg_known(&fake_reg, (s32)reg->off - (s32)known_reg->off);
+ __mark_reg_known(&fake_reg, (s64)reg->off - (s64)known_reg->off);
/* reg = known_reg; reg += delta */
copy_register_state(reg, known_reg);
/*
- * Must preserve off, id and add_const flag,
+ * Must preserve off, id and subreg_def,
* otherwise another sync_linked_regs() will be incorrect.
*/
reg->off = saved_off;
+ reg->id = saved_id;
reg->subreg_def = saved_subreg_def;
scalar32_min_max_add(reg, &fake_reg);
scalar_min_max_add(reg, &fake_reg);
reg->var_off = tnum_add(reg->var_off, fake_reg.var_off);
+ if (known_reg->id & BPF_ADD_CONST32)
+ zext_32_to_64(reg);
+ reg_bounds_sync(reg);
}
+ if (e->is_reg)
+ mark_reg_scratched(env, e->regno);
+ else
+ mark_stack_slot_scratched(env, e->spi);
}
}
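
A hypothetical alu32 analogue of the r1/r2 example given earlier in adjust_reg_min_max_vals (an editorial sketch, assuming the 32-bit mov links the register ids the same way the 64-bit one does):

	/* w1 = w2             ; r1 and r2 share an id
	 * w1 += 5             ; BPF_ADD_CONST32 set, delta 5 stored in r1->off
	 * if w2 < 1000 goto L ; sync_linked_regs() rebuilds r1 as r2 + 5 and,
	 *                     ; because the link is 32-bit, zero-extends r1
	 *                     ; before reg_bounds_sync()
	 */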
@@ -17030,13 +17568,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (BPF_SRC(insn->code) == BPF_X &&
src_reg->type == SCALAR_VALUE && src_reg->id &&
!WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
- sync_linked_regs(this_branch, src_reg, &linked_regs);
- sync_linked_regs(other_branch, &other_branch_regs[insn->src_reg], &linked_regs);
+ sync_linked_regs(env, this_branch, src_reg, &linked_regs);
+ sync_linked_regs(env, other_branch, &other_branch_regs[insn->src_reg],
+ &linked_regs);
}
if (dst_reg->type == SCALAR_VALUE && dst_reg->id &&
!WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) {
- sync_linked_regs(this_branch, dst_reg, &linked_regs);
- sync_linked_regs(other_branch, &other_branch_regs[insn->dst_reg], &linked_regs);
+ sync_linked_regs(env, this_branch, dst_reg, &linked_regs);
+ sync_linked_regs(env, other_branch, &other_branch_regs[insn->dst_reg],
+ &linked_regs);
}
/* if one pointer register is compared to another pointer
@@ -17411,6 +17951,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
switch (env->prog->expected_attach_type) {
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
+ case BPF_TRACE_FSESSION:
range = retval_range(0, 0);
break;
case BPF_TRACE_RAW_TP:
@@ -17693,6 +18234,10 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm)
switch (imm) {
#ifdef CONFIG_X86_64
case BPF_FUNC_get_smp_processor_id:
+#ifdef CONFIG_SMP
+ case BPF_FUNC_get_current_task_btf:
+ case BPF_FUNC_get_current_task:
+#endif
return env->prog->jit_requested && bpf_jit_supports_percpu_insn();
#endif
default:
@@ -17737,7 +18282,7 @@ static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call
if (bpf_pseudo_kfunc_call(call)) {
int err;
- err = fetch_kfunc_meta(env, call, &meta, NULL);
+ err = fetch_kfunc_arg_meta(env, call->imm, call->off, &meta);
if (err < 0)
/* error would be reported later */
return false;
@@ -18245,7 +18790,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
} else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
struct bpf_kfunc_call_arg_meta meta;
- ret = fetch_kfunc_meta(env, insn, &meta, NULL);
+ ret = fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta);
if (ret == 0 && is_iter_next_kfunc(&meta)) {
mark_prune_point(env, t);
/* Checking and saving state checkpoints at iter_next() call
@@ -18948,30 +19493,49 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
if (old_id == 0) /* cur_id == 0 as well */
return true;
- for (i = 0; i < BPF_ID_MAP_SIZE; i++) {
- if (!map[i].old) {
- /* Reached an empty slot; haven't seen this id before */
- map[i].old = old_id;
- map[i].cur = cur_id;
- return true;
- }
+ for (i = 0; i < idmap->cnt; i++) {
if (map[i].old == old_id)
return map[i].cur == cur_id;
if (map[i].cur == cur_id)
return false;
}
+
+ /* Reached the end of known mappings; haven't seen this id before */
+ if (idmap->cnt < BPF_ID_MAP_SIZE) {
+ map[idmap->cnt].old = old_id;
+ map[idmap->cnt].cur = cur_id;
+ idmap->cnt++;
+ return true;
+ }
+
/* We ran out of idmap slots, which should be impossible */
WARN_ON_ONCE(1);
return false;
}
-/* Similar to check_ids(), but allocate a unique temporary ID
- * for 'old_id' or 'cur_id' of zero.
- * This makes pairs like '0 vs unique ID', 'unique ID vs 0' valid.
+/*
+ * Compare scalar register IDs for state equivalence.
+ *
+ * When old_id == 0, the old register is independent - not linked to any
+ * other register. Any linking in the current state only adds constraints,
+ * making it more restrictive. Since the old state didn't rely on any ID
+ * relationships for this register, it's always safe to accept cur regardless
+ * of its ID. Hence, return true immediately.
+ *
+ * When old_id != 0 but cur_id == 0, we need to ensure that different
+ * independent registers in cur don't incorrectly satisfy the ID matching
+ * requirements of linked registers in old.
+ *
+ * Example: if old has r6.id=X and r7.id=X (linked), but cur has r6.id=0
+ * and r7.id=0 (both independent), without temp IDs both would map old_id=X
+ * to cur_id=0 and pass. With temp IDs: r6 maps X->temp1, r7 tries to map
+ * X->temp2, but X is already mapped to temp1, so the check fails correctly.
*/
static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
{
- old_id = old_id ? old_id : ++idmap->tmp_id_gen;
+ if (!old_id)
+ return true;
+
cur_id = cur_id ? cur_id : ++idmap->tmp_id_gen;
return check_ids(old_id, cur_id, idmap);
@@ -19045,6 +19609,72 @@ static void clean_verifier_state(struct bpf_verifier_env *env,
* doesn't meant that the states are DONE. The verifier has to compare
* the callsites
*/
+
+/* Find id in idset and increment its count, or add new entry */
+static void idset_cnt_inc(struct bpf_idset *idset, u32 id)
+{
+ u32 i;
+
+ for (i = 0; i < idset->num_ids; i++) {
+ if (idset->entries[i].id == id) {
+ idset->entries[i].cnt++;
+ return;
+ }
+ }
+ /* New id */
+ if (idset->num_ids < BPF_ID_MAP_SIZE) {
+ idset->entries[idset->num_ids].id = id;
+ idset->entries[idset->num_ids].cnt = 1;
+ idset->num_ids++;
+ }
+}
+
+/* Find id in idset and return its count, or 0 if not found */
+static u32 idset_cnt_get(struct bpf_idset *idset, u32 id)
+{
+ u32 i;
+
+ for (i = 0; i < idset->num_ids; i++) {
+ if (idset->entries[i].id == id)
+ return idset->entries[i].cnt;
+ }
+ return 0;
+}
+
+/*
+ * Clear singular scalar ids in a state.
+ * A register with a non-zero id is called singular if no other register shares
+ * the same base id. Such registers can be treated as independent (id=0).
+ */
+static void clear_singular_ids(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st)
+{
+ struct bpf_idset *idset = &env->idset_scratch;
+ struct bpf_func_state *func;
+ struct bpf_reg_state *reg;
+
+ idset->num_ids = 0;
+
+ bpf_for_each_reg_in_vstate(st, func, reg, ({
+ if (reg->type != SCALAR_VALUE)
+ continue;
+ if (!reg->id)
+ continue;
+ idset_cnt_inc(idset, reg->id & ~BPF_ADD_CONST);
+ }));
+
+ bpf_for_each_reg_in_vstate(st, func, reg, ({
+ if (reg->type != SCALAR_VALUE)
+ continue;
+ if (!reg->id)
+ continue;
+ if (idset_cnt_get(idset, reg->id & ~BPF_ADD_CONST) == 1) {
+ reg->id = 0;
+ reg->off = 0;
+ }
+ }));
+}
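
A small worked example of what clear_singular_ids() buys (editorial, not part of the patch):

	/* state about to be pushed:
	 *   r6: SCALAR id=5, r7: SCALAR id=0, no stack slot carries id 5
	 * id 5 is singular: no future comparison can transfer bounds through
	 * it, so resetting r6 to id=0/off=0 loses nothing while letting
	 * regsafe() skip the id mapping and prune more states.
	 */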
+
static void clean_live_states(struct bpf_verifier_env *env, int insn,
struct bpf_verifier_state *cur)
{
@@ -19091,11 +19721,9 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
if (exact == EXACT)
return regs_exact(rold, rcur, idmap);
- if (rold->type == NOT_INIT) {
- if (exact == NOT_EXACT || rcur->type == NOT_INIT)
- /* explored state can't have used this */
- return true;
- }
+ if (rold->type == NOT_INIT)
+ /* explored state can't have used this */
+ return true;
/* Enforce that register types have to match exactly, including their
* modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general
@@ -19132,11 +19760,21 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
}
if (!rold->precise && exact == NOT_EXACT)
return true;
- if ((rold->id & BPF_ADD_CONST) != (rcur->id & BPF_ADD_CONST))
- return false;
- if ((rold->id & BPF_ADD_CONST) && (rold->off != rcur->off))
- return false;
- /* Why check_ids() for scalar registers?
+ /*
+ * Linked register tracking uses rold->id to detect relationships.
+ * When rold->id == 0, the register is independent and any linking
+ * in rcur only adds constraints. When rold->id != 0, we must verify
+ * id mapping and (for BPF_ADD_CONST) offset consistency.
+ *
+ * +------------------+-----------+------------------+---------------+
+ * | | rold->id | rold + ADD_CONST | rold->id == 0 |
+ * |------------------+-----------+------------------+---------------|
+ * | rcur->id | range,ids | false | range |
+ * | rcur + ADD_CONST | false | range,ids,off | range |
+ * | rcur->id == 0 | range,ids | false | range |
+ * +------------------+-----------+------------------+---------------+
+ *
+ * Why check_ids() for scalar registers?
*
* Consider the following BPF code:
* 1: r6 = ... unbound scalar, ID=a ...
@@ -19160,9 +19798,22 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
* ---
* Also verify that new value satisfies old value range knowledge.
*/
- return range_within(rold, rcur) &&
- tnum_in(rold->var_off, rcur->var_off) &&
- check_scalar_ids(rold->id, rcur->id, idmap);
+
+ /* ADD_CONST mismatch: different linking semantics */
+ if ((rold->id & BPF_ADD_CONST) && !(rcur->id & BPF_ADD_CONST))
+ return false;
+
+ if (rold->id && !(rold->id & BPF_ADD_CONST) && (rcur->id & BPF_ADD_CONST))
+ return false;
+
+ /* Both have offset linkage: offsets must match */
+ if ((rold->id & BPF_ADD_CONST) && rold->off != rcur->off)
+ return false;
+
+ if (!check_scalar_ids(rold->id, rcur->id, idmap))
+ return false;
+
+ return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off);
case PTR_TO_MAP_KEY:
case PTR_TO_MAP_VALUE:
case PTR_TO_MEM:
@@ -19264,7 +19915,7 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
spi = i / BPF_REG_SIZE;
- if (exact != NOT_EXACT &&
+ if (exact == EXACT &&
(i >= cur->allocated_stack ||
old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
cur->stack[spi].slot_type[i % BPF_REG_SIZE]))
@@ -19470,8 +20121,10 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
static void reset_idmap_scratch(struct bpf_verifier_env *env)
{
- env->idmap_scratch.tmp_id_gen = env->id_gen;
- memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
+ struct bpf_idmap *idmap = &env->idmap_scratch;
+
+ idmap->tmp_id_gen = env->id_gen;
+ idmap->cnt = 0;
}
static bool states_equal(struct bpf_verifier_env *env,
@@ -19835,8 +20488,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
}
}
if (bpf_calls_callback(env, insn_idx)) {
- if (states_equal(env, &sl->state, cur, RANGE_WITHIN))
+ if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
+ loop = true;
goto hit;
+ }
goto skip_inf_loop_check;
}
/* attempt to detect infinite loop to avoid unnecessary doomed work */
@@ -20041,6 +20696,8 @@ miss:
if (env->bpf_capable)
mark_all_scalars_imprecise(env, cur);
+ clear_singular_ids(env, cur);
+
/* add new state to the head of linked list */
new = &new_sl->state;
err = copy_verifier_state(new, cur);
@@ -20611,17 +21268,19 @@ static int do_check(struct bpf_verifier_env *env)
* may skip a nospec patched-in after the jump. This can
* currently never happen because nospec_result is only
* used for the write-ops
- * `*(size*)(dst_reg+off)=src_reg|imm32` which must
- * never skip the following insn. Still, add a warning
- * to document this in case nospec_result is used
- * elsewhere in the future.
+ * `*(size*)(dst_reg+off)=src_reg|imm32` and helper
+ * calls. These must never skip the following insn
+ * (i.e., bpf_insn_successors()'s opcode_info.can_jump
+ * is false). Still, add a warning to document this in
+ * case nospec_result is used elsewhere in the future.
*
* All non-branch instructions have a single
* fall-through edge. For these, nospec_result should
* already work.
*/
- if (verifier_bug_if(BPF_CLASS(insn->code) == BPF_JMP ||
- BPF_CLASS(insn->code) == BPF_JMP32, env,
+ if (verifier_bug_if((BPF_CLASS(insn->code) == BPF_JMP ||
+ BPF_CLASS(insn->code) == BPF_JMP32) &&
+ BPF_OP(insn->code) != BPF_CALL, env,
"speculation barrier after jump instruction may not have the desired effect"))
return -EFAULT;
process_bpf_exit:
@@ -20660,12 +21319,7 @@ static int find_btf_percpu_datasec(struct btf *btf)
* types to look at only module's own BTF types.
*/
n = btf_nr_types(btf);
- if (btf_is_module(btf))
- i = btf_nr_types(btf_vmlinux);
- else
- i = 1;
-
- for(; i < n; i++) {
+ for (i = btf_named_start_id(btf, true); i < n; i++) {
t = btf_type_by_id(btf, i);
if (BTF_INFO_KIND(t->info) != BTF_KIND_DATASEC)
continue;
@@ -20890,20 +21544,6 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
}
- if (btf_record_has_field(map->record, BPF_TIMER)) {
- if (is_tracing_prog_type(prog_type)) {
- verbose(env, "tracing progs cannot use bpf_timer yet\n");
- return -EINVAL;
- }
- }
-
- if (btf_record_has_field(map->record, BPF_WORKQUEUE)) {
- if (is_tracing_prog_type(prog_type)) {
- verbose(env, "tracing progs cannot use bpf_wq yet\n");
- return -EINVAL;
- }
- }
-
if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) &&
!bpf_offload_prog_map_match(prog, map)) {
verbose(env, "offload device mismatch between prog and map\n");
@@ -20935,6 +21575,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_STACK:
case BPF_MAP_TYPE_ARENA:
case BPF_MAP_TYPE_INSN_ARRAY:
+ case BPF_MAP_TYPE_PROG_ARRAY:
break;
default:
verbose(env,
@@ -21141,11 +21782,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
} else {
u32 off = insn[1].imm;
- if (off >= BPF_MAX_VAR_OFF) {
- verbose(env, "direct value offset of %u is not allowed\n", off);
- return -EINVAL;
- }
-
if (!map->ops->map_direct_value_addr) {
verbose(env, "no direct value access support for this map type\n");
return -EINVAL;
@@ -22446,6 +23082,12 @@ static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc
} else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) {
if (!env->insn_aux_data[insn_idx].non_sleepable)
addr = (unsigned long)bpf_dynptr_from_file_sleepable;
+ } else if (func_id == special_kfunc_list[KF_bpf_arena_alloc_pages]) {
+ if (env->insn_aux_data[insn_idx].non_sleepable)
+ addr = (unsigned long)bpf_arena_alloc_pages_non_sleepable;
+ } else if (func_id == special_kfunc_list[KF_bpf_arena_free_pages]) {
+ if (env->insn_aux_data[insn_idx].non_sleepable)
+ addr = (unsigned long)bpf_arena_free_pages_non_sleepable;
}
desc->addr = addr;
return 0;
@@ -22498,8 +23140,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (!bpf_jit_supports_far_kfunc_call())
insn->imm = BPF_CALL_IMM(desc->addr);
- if (insn->off)
- return 0;
+
if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
@@ -22565,6 +23206,36 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
*cnt = 1;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_session_is_return] &&
+ env->prog->expected_attach_type == BPF_TRACE_FSESSION) {
+ /*
+ * inline the bpf_session_is_return() for fsession:
+ * bool bpf_session_is_return(void *ctx)
+ * {
+ * return (((u64 *)ctx)[-1] >> BPF_TRAMP_IS_RETURN_SHIFT) & 1;
+ * }
+ */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_IS_RETURN_SHIFT);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1);
+ *cnt = 3;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_session_cookie] &&
+ env->prog->expected_attach_type == BPF_TRACE_FSESSION) {
+ /*
+ * inline bpf_session_cookie() for fsession:
+ * __u64 *bpf_session_cookie(void *ctx)
+ * {
+ * u64 off = (((u64 *)ctx)[-1] >> BPF_TRAMP_COOKIE_INDEX_SHIFT) & 0xFF;
+ * return &((u64 *)ctx)[-off];
+ * }
+ */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_COOKIE_INDEX_SHIFT);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF);
+ insn_buf[3] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
+ insn_buf[4] = BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1);
+ insn_buf[5] = BPF_ALU64_IMM(BPF_NEG, BPF_REG_0, 0);
+ *cnt = 6;
}
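
A quick check of the cookie inline's arithmetic (editorial): after the shift and mask R0 holds the cookie index off, the BPF_LSH by 3 turns it into the byte offset off*8, R0 -= R1 then gives off*8 - ctx, and the final BPF_NEG flips that into ctx - off*8, i.e. &((u64 *)ctx)[-off], matching the C form in the comment.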
if (env->insn_aux_data[insn_idx].arg_prog) {
@@ -23278,21 +23949,48 @@ patch_map_ops_generic:
insn = new_prog->insnsi + i + delta;
goto next_insn;
}
+
+ /* Implement bpf_get_current_task() and bpf_get_current_task_btf() inline. */
+ if ((insn->imm == BPF_FUNC_get_current_task || insn->imm == BPF_FUNC_get_current_task_btf) &&
+ verifier_inlines_helper_call(env, insn->imm)) {
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&current_task);
+ insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
+ insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
+ cnt = 3;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
#endif
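
A rough C equivalent of the three-instruction inline above (an editorial sketch; it assumes the x86-64 per-cpu current_task variable whose address the patch loads):

	/* u64 inlined_get_current_task(void)
	 * {
	 *	return (u64)this_cpu_read(current_task);
	 * }
	 * i.e. materialize &current_task, rebase it to this CPU's per-cpu
	 * area with the percpu insn, then load the task_struct pointer.
	 */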
/* Implement bpf_get_func_arg inline. */
if (prog_type == BPF_PROG_TYPE_TRACING &&
insn->imm == BPF_FUNC_get_func_arg) {
- /* Load nr_args from ctx - 8 */
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
- insn_buf[1] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6);
- insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3);
- insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1);
- insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0);
- insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
- insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0);
- insn_buf[7] = BPF_JMP_A(1);
- insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
- cnt = 9;
+ if (eatype == BPF_TRACE_RAW_TP) {
+ int nr_args = btf_type_vlen(prog->aux->attach_func_proto);
+
+ /* skip 'void *__data' in btf_trace_##name() and load the remaining arg count into reg0 */
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1);
+ cnt = 1;
+ } else {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF);
+ cnt = 2;
+ }
+ insn_buf[cnt++] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6);
+ insn_buf[cnt++] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3);
+ insn_buf[cnt++] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1);
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0);
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
+ insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, 0);
+ insn_buf[cnt++] = BPF_JMP_A(1);
+ insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
@@ -23308,15 +24006,17 @@ patch_map_ops_generic:
if (prog_type == BPF_PROG_TYPE_TRACING &&
insn->imm == BPF_FUNC_get_func_ret) {
if (eatype == BPF_TRACE_FEXIT ||
+ eatype == BPF_TRACE_FSESSION ||
eatype == BPF_MODIFY_RETURN) {
/* Load nr_args from ctx - 8 */
insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
- insn_buf[1] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
- insn_buf[2] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
- insn_buf[3] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
- insn_buf[4] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0);
- insn_buf[5] = BPF_MOV64_IMM(BPF_REG_0, 0);
- cnt = 6;
+ insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
+ insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
+ insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
+ insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0);
+ insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0);
+ cnt = 7;
} else {
insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP);
cnt = 1;
@@ -23335,13 +24035,24 @@ patch_map_ops_generic:
/* Implement get_func_arg_cnt inline. */
if (prog_type == BPF_PROG_TYPE_TRACING &&
insn->imm == BPF_FUNC_get_func_arg_cnt) {
- /* Load nr_args from ctx - 8 */
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ if (eatype == BPF_TRACE_RAW_TP) {
+ int nr_args = btf_type_vlen(prog->aux->attach_func_proto);
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
+ /* skip 'void *__data' in btf_trace_##name() and load the remaining arg count into reg0 */
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1);
+ cnt = 1;
+ } else {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF);
+ cnt = 2;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
+ delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
goto next_insn;
@@ -24252,7 +24963,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
if (tgt_prog->type == BPF_PROG_TYPE_TRACING &&
prog_extension &&
(tgt_prog->expected_attach_type == BPF_TRACE_FENTRY ||
- tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) {
+ tgt_prog->expected_attach_type == BPF_TRACE_FEXIT ||
+ tgt_prog->expected_attach_type == BPF_TRACE_FSESSION)) {
/* Program extensions can extend all program types
* except fentry/fexit. The reason is the following.
* The fentry/fexit programs are used for performance
@@ -24267,7 +24979,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
* beyond reasonable stack size. Hence extending fentry
* is not allowed.
*/
- bpf_log(log, "Cannot extend fentry/fexit\n");
+ bpf_log(log, "Cannot extend fentry/fexit/fsession\n");
return -EINVAL;
}
} else {
@@ -24351,6 +25063,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
case BPF_LSM_CGROUP:
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
+ case BPF_TRACE_FSESSION:
+ if (prog->expected_attach_type == BPF_TRACE_FSESSION &&
+ !bpf_jit_supports_fsession()) {
+ bpf_log(log, "JIT does not support fsession\n");
+ return -EOPNOTSUPP;
+ }
if (!btf_type_is_func(t)) {
bpf_log(log, "attach_btf_id %u is not a function\n",
btf_id);
@@ -24517,6 +25235,7 @@ static bool can_be_sleepable(struct bpf_prog *prog)
case BPF_TRACE_FEXIT:
case BPF_MODIFY_RETURN:
case BPF_TRACE_ITER:
+ case BPF_TRACE_FSESSION:
return true;
default:
return false;
@@ -24598,9 +25317,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
tgt_info.tgt_name);
return -EINVAL;
} else if ((prog->expected_attach_type == BPF_TRACE_FEXIT ||
+ prog->expected_attach_type == BPF_TRACE_FSESSION ||
prog->expected_attach_type == BPF_MODIFY_RETURN) &&
btf_id_set_contains(&noreturn_deny, btf_id)) {
- verbose(env, "Attaching fexit/fmod_ret to __noreturn function '%s' is rejected.\n",
+ verbose(env, "Attaching fexit/fsession/fmod_ret to __noreturn function '%s' is rejected.\n",
tgt_info.tgt_name);
return -EINVAL;
}
@@ -24809,6 +25529,12 @@ static void compute_insn_live_regs(struct bpf_verifier_env *env,
case BPF_JMP32:
switch (code) {
case BPF_JA:
+ def = 0;
+ if (BPF_SRC(insn->code) == BPF_X)
+ use = dst;
+ else
+ use = 0;
+ break;
case BPF_JCOND:
def = 0;
use = 0;
@@ -25076,15 +25802,18 @@ dfs_continue:
}
/*
* Assign SCC number only if component has two or more elements,
- * or if component has a self reference.
+ * or if component has a self reference, or if instruction is a
+ * callback calling function (implicit loop).
*/
- assign_scc = stack[stack_sz - 1] != w;
- for (j = 0; j < succ->cnt; ++j) {
+ assign_scc = stack[stack_sz - 1] != w; /* two or more elements? */
+ for (j = 0; j < succ->cnt; ++j) { /* self reference? */
if (succ->items[j] == w) {
assign_scc = true;
break;
}
}
+ if (bpf_calls_callback(env, w)) /* implicit loop? */
+ assign_scc = true;
/* Pop component elements from stack */
do {
t = stack[--stack_sz];
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 0bb8fa927e9e..7ccd84c17792 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -7275,9 +7275,9 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED)
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
-BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_exit_bstr)
+BTF_ID_FLAGS(func, scx_bpf_error_bstr)
+BTF_ID_FLAGS(func, scx_bpf_dump_bstr)
BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
@@ -7296,7 +7296,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED)
BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
#endif
BTF_ID_FLAGS(func, scx_bpf_now)
-BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_events)
BTF_KFUNCS_END(scx_kfunc_ids_any)
static const struct btf_kfunc_id_set scx_kfunc_set_any = {
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index bfa2ec46e075..d7042a09fe46 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -50,6 +50,9 @@ config HAVE_DYNAMIC_FTRACE_WITH_REGS
config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
bool
+config HAVE_SINGLE_FTRACE_DIRECT_OPS
+ bool
+
config HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
bool
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index fe28d86f7c35..f7baeb8278ca 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -830,7 +830,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struc
info.si_code = SI_KERNEL;
info.si_pid = 0;
info.si_uid = 0;
- info.si_value.sival_ptr = (void *)(unsigned long)value;
+ info.si_value.sival_ptr = (void __user __force *)(unsigned long)value;
siginfo = &info;
}
@@ -1022,7 +1022,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = {
.func = bpf_snprintf_btf,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_WRITE,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE,
@@ -1194,7 +1194,7 @@ const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
BPF_CALL_3(get_func_arg, void *, ctx, u32, n, u64 *, value)
{
/* This helper call is inlined by verifier. */
- u64 nr_args = ((u64 *)ctx)[-1];
+ u64 nr_args = ((u64 *)ctx)[-1] & 0xFF;
if ((u64) n >= nr_args)
return -EINVAL;
@@ -1214,7 +1214,7 @@ static const struct bpf_func_proto bpf_get_func_arg_proto = {
BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value)
{
/* This helper call is inlined by verifier. */
- u64 nr_args = ((u64 *)ctx)[-1];
+ u64 nr_args = ((u64 *)ctx)[-1] & 0xFF;
*value = ((u64 *)ctx)[nr_args];
return 0;
@@ -1231,7 +1231,7 @@ static const struct bpf_func_proto bpf_get_func_ret_proto = {
BPF_CALL_1(get_func_arg_cnt, void *, ctx)
{
/* This helper call is inlined by verifier. */
- return ((u64 *)ctx)[-1];
+ return ((u64 *)ctx)[-1] & 0xFF;
}
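
A short note on the new masks (editorial): ctx[-1] no longer holds just the argument count. The fsession changes pack extra metadata into the upper bits of the same word (the is-return bit at BPF_TRAMP_IS_RETURN_SHIFT and the cookie index at BPF_TRAMP_COOKIE_INDEX_SHIFT, as used by the inlined session kfuncs), so this helper and the two above mask with 0xFF to keep only the low byte as the arg count.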
static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = {
@@ -1286,7 +1286,8 @@ static bool is_kprobe_multi(const struct bpf_prog *prog)
static inline bool is_kprobe_session(const struct bpf_prog *prog)
{
- return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
+ return prog->type == BPF_PROG_TYPE_KPROBE &&
+ prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
}
static inline bool is_uprobe_multi(const struct bpf_prog *prog)
@@ -1297,7 +1298,14 @@ static inline bool is_uprobe_multi(const struct bpf_prog *prog)
static inline bool is_uprobe_session(const struct bpf_prog *prog)
{
- return prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION;
+ return prog->type == BPF_PROG_TYPE_KPROBE &&
+ prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION;
+}
+
+static inline bool is_trace_fsession(const struct bpf_prog *prog)
+{
+ return prog->type == BPF_PROG_TYPE_TRACING &&
+ prog->expected_attach_type == BPF_TRACE_FSESSION;
}
static const struct bpf_func_proto *
@@ -1526,7 +1534,7 @@ static const struct bpf_func_proto bpf_read_branch_records_proto = {
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg2_type = ARG_PTR_TO_MEM_OR_NULL | MEM_WRITE,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
};
@@ -1661,7 +1669,7 @@ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
};
@@ -1734,11 +1742,17 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_d_path:
return &bpf_d_path_proto;
case BPF_FUNC_get_func_arg:
- return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_proto : NULL;
+ if (bpf_prog_has_trampoline(prog) ||
+ prog->expected_attach_type == BPF_TRACE_RAW_TP)
+ return &bpf_get_func_arg_proto;
+ return NULL;
case BPF_FUNC_get_func_ret:
return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL;
case BPF_FUNC_get_func_arg_cnt:
- return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL;
+ if (bpf_prog_has_trampoline(prog) ||
+ prog->expected_attach_type == BPF_TRACE_RAW_TP)
+ return &bpf_get_func_arg_cnt_proto;
+ return NULL;
case BPF_FUNC_get_attach_cookie:
if (prog->type == BPF_PROG_TYPE_TRACING &&
prog->expected_attach_type == BPF_TRACE_RAW_TP)
@@ -2063,7 +2077,7 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
struct bpf_trace_run_ctx run_ctx;
cant_sleep();
- if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ if (unlikely(!bpf_prog_get_recursion_context(prog))) {
bpf_prog_inc_misses_counter(prog);
goto out;
}
@@ -2077,7 +2091,7 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
bpf_reset_run_ctx(old_run_ctx);
out:
- this_cpu_dec(*(prog->active));
+ bpf_prog_put_recursion_context(prog);
}
#define UNPACK(...) __VA_ARGS__
@@ -2564,6 +2578,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx);
err = bpf_prog_run(link->link.prog, regs);
bpf_reset_run_ctx(old_run_ctx);
+ ftrace_partial_regs_update(fregs, bpf_kprobe_multi_pt_regs_ptr());
rcu_read_unlock();
out:
@@ -3316,7 +3331,7 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
__bpf_kfunc_start_defs();
-__bpf_kfunc bool bpf_session_is_return(void)
+__bpf_kfunc bool bpf_session_is_return(void *ctx)
{
struct bpf_session_run_ctx *session_ctx;
@@ -3324,7 +3339,7 @@ __bpf_kfunc bool bpf_session_is_return(void)
return session_ctx->is_return;
}
-__bpf_kfunc __u64 *bpf_session_cookie(void)
+__bpf_kfunc __u64 *bpf_session_cookie(void *ctx)
{
struct bpf_session_run_ctx *session_ctx;
@@ -3334,34 +3349,39 @@ __bpf_kfunc __u64 *bpf_session_cookie(void)
__bpf_kfunc_end_defs();
-BTF_KFUNCS_START(kprobe_multi_kfunc_set_ids)
+BTF_KFUNCS_START(session_kfunc_set_ids)
BTF_ID_FLAGS(func, bpf_session_is_return)
BTF_ID_FLAGS(func, bpf_session_cookie)
-BTF_KFUNCS_END(kprobe_multi_kfunc_set_ids)
+BTF_KFUNCS_END(session_kfunc_set_ids)
-static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id)
+static int bpf_session_filter(const struct bpf_prog *prog, u32 kfunc_id)
{
- if (!btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id))
+ if (!btf_id_set8_contains(&session_kfunc_set_ids, kfunc_id))
return 0;
- if (!is_kprobe_session(prog) && !is_uprobe_session(prog))
+ if (!is_kprobe_session(prog) && !is_uprobe_session(prog) && !is_trace_fsession(prog))
return -EACCES;
return 0;
}
-static const struct btf_kfunc_id_set bpf_kprobe_multi_kfunc_set = {
+static const struct btf_kfunc_id_set bpf_session_kfunc_set = {
.owner = THIS_MODULE,
- .set = &kprobe_multi_kfunc_set_ids,
- .filter = bpf_kprobe_multi_filter,
+ .set = &session_kfunc_set_ids,
+ .filter = bpf_session_filter,
};
-static int __init bpf_kprobe_multi_kfuncs_init(void)
+static int __init bpf_trace_kfuncs_init(void)
{
- return register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kprobe_multi_kfunc_set);
+ int err = 0;
+
+ err = err ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_session_kfunc_set);
+ err = err ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_session_kfunc_set);
+
+ return err;
}
-late_initcall(bpf_kprobe_multi_kfuncs_init);
+late_initcall(bpf_trace_kfuncs_init);
typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struct *tsk);
@@ -3517,7 +3537,7 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid
__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign)
{
- return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
+ return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign,
copy_user_data_nofault, NULL);
}
@@ -3531,7 +3551,7 @@ __bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off,
__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign)
{
- return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
+ return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign,
copy_user_str_nofault, NULL);
}
@@ -3545,14 +3565,14 @@ __bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 of
__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign)
{
- return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
+ return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign,
copy_user_data_sleepable, NULL);
}
__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign)
{
- return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
+ return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign,
copy_user_str_sleepable, NULL);
}
@@ -3560,7 +3580,7 @@ __bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off,
u64 size, const void __user *unsafe_ptr__ign,
struct task_struct *tsk)
{
- return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
+ return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign,
copy_user_data_sleepable, tsk);
}
@@ -3568,7 +3588,7 @@ __bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64
u64 size, const void __user *unsafe_ptr__ign,
struct task_struct *tsk)
{
- return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
+ return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign,
copy_user_str_sleepable, tsk);
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index aa758efc3731..f9b10c633bdd 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -68,7 +68,6 @@
})
/* hash bits for specific function selection */
-#define FTRACE_HASH_DEFAULT_BITS 10
#define FTRACE_HASH_MAX_BITS 12
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -1210,8 +1209,8 @@ static void __add_hash_entry(struct ftrace_hash *hash,
hash->count++;
}
-static struct ftrace_func_entry *
-add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
+struct ftrace_func_entry *
+add_ftrace_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigned long direct)
{
struct ftrace_func_entry *entry;
@@ -1220,11 +1219,18 @@ add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
return NULL;
entry->ip = ip;
+ entry->direct = direct;
__add_hash_entry(hash, entry);
return entry;
}
+static struct ftrace_func_entry *
+add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
+{
+ return add_ftrace_hash_entry_direct(hash, ip, 0);
+}
+
static void
free_hash_entry(struct ftrace_hash *hash,
struct ftrace_func_entry *entry)
@@ -1283,7 +1289,7 @@ static void clear_ftrace_mod_list(struct list_head *head)
mutex_unlock(&ftrace_lock);
}
-static void free_ftrace_hash(struct ftrace_hash *hash)
+void free_ftrace_hash(struct ftrace_hash *hash)
{
if (!hash || hash == EMPTY_HASH)
return;
@@ -1323,7 +1329,7 @@ void ftrace_free_filter(struct ftrace_ops *ops)
}
EXPORT_SYMBOL_GPL(ftrace_free_filter);
-static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
+struct ftrace_hash *alloc_ftrace_hash(int size_bits)
{
struct ftrace_hash *hash;
int size;
@@ -1397,7 +1403,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
- if (add_hash_entry(new_hash, entry->ip) == NULL)
+ if (add_ftrace_hash_entry_direct(new_hash, entry->ip, entry->direct) == NULL)
goto free_hash;
}
}
@@ -2068,7 +2074,7 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
*/
if (!ops->ops_func)
return -EBUSY;
- ret = ops->ops_func(ops, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF);
+ ret = ops->ops_func(ops, rec->ip, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF);
if (ret)
return ret;
} else if (is_ipmodify) {
@@ -2624,8 +2630,13 @@ unsigned long ftrace_find_rec_direct(unsigned long ip)
static void call_direct_funcs(unsigned long ip, unsigned long pip,
struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
- unsigned long addr = READ_ONCE(ops->direct_call);
+ unsigned long addr;
+#ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS
+ addr = ftrace_find_rec_direct(ip);
+#else
+ addr = READ_ONCE(ops->direct_call);
+#endif
if (!addr)
return;
@@ -6049,15 +6060,8 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
if (ftrace_hash_empty(hash))
return -EINVAL;
- /* This is a "raw" address, and this should never happen. */
- if (WARN_ON_ONCE(ftrace_is_jmp(addr)))
- return -EINVAL;
-
mutex_lock(&direct_mutex);
- if (ops->flags & FTRACE_OPS_FL_JMP)
- addr = ftrace_jmp_set(addr);
-
/* Make sure requested entries are not already registered.. */
size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
@@ -6178,13 +6182,6 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
lockdep_assert_held_once(&direct_mutex);
- /* This is a "raw" address, and this should never happen. */
- if (WARN_ON_ONCE(ftrace_is_jmp(addr)))
- return -EINVAL;
-
- if (ops->flags & FTRACE_OPS_FL_JMP)
- addr = ftrace_jmp_set(addr);
-
/* Enable the tmp_ops to have the same functions as the direct ops */
ftrace_ops_init(&tmp_ops);
tmp_ops.func_hash = ops->func_hash;
@@ -6289,6 +6286,368 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
return err;
}
EXPORT_SYMBOL_GPL(modify_ftrace_direct);
+
+static unsigned long hash_count(struct ftrace_hash *hash)
+{
+ return hash ? hash->count : 0;
+}
+
+/**
+ * hash_add - adds the entries of two struct ftrace_hash objects and returns the result
+ * @a: struct ftrace_hash object
+ * @b: struct ftrace_hash object
+ *
+ * Returns: a new struct ftrace_hash object on success, NULL on error.
+ */
+static struct ftrace_hash *hash_add(struct ftrace_hash *a, struct ftrace_hash *b)
+{
+ struct ftrace_func_entry *entry;
+ struct ftrace_hash *add;
+ int size;
+
+ size = hash_count(a) + hash_count(b);
+ if (size > 32)
+ size = 32;
+
+ add = alloc_and_copy_ftrace_hash(fls(size), a);
+ if (!add)
+ return NULL;
+
+ size = 1 << b->size_bits;
+ for (int i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &b->buckets[i], hlist) {
+ if (add_ftrace_hash_entry_direct(add, entry->ip, entry->direct) == NULL) {
+ free_ftrace_hash(add);
+ return NULL;
+ }
+ }
+ }
+ return add;
+}
+
+/**
+ * update_ftrace_direct_add - Updates @ops by adding direct
+ * callers provided in @hash
+ * @ops: The address of the struct ftrace_ops object
+ * @hash: The address of the struct ftrace_hash object
+ *
+ * This is used to add custom direct callers (ip -> addr) to @ops,
+ * specified in @hash. The @ops will be either registered or updated.
+ *
+ * Returns: zero on success. Non zero on error, which includes:
+ * -EINVAL - The @hash is empty
+ */
+int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash)
+{
+ struct ftrace_hash *old_direct_functions = NULL;
+ struct ftrace_hash *new_direct_functions;
+ struct ftrace_hash *old_filter_hash;
+ struct ftrace_hash *new_filter_hash = NULL;
+ struct ftrace_func_entry *entry;
+ int err = -EINVAL;
+ int size;
+ bool reg;
+
+ if (!hash_count(hash))
+ return -EINVAL;
+
+ mutex_lock(&direct_mutex);
+
+ /* Make sure requested entries are not already registered. */
+ size = 1 << hash->size_bits;
+ for (int i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ if (__ftrace_lookup_ip(direct_functions, entry->ip))
+ goto out_unlock;
+ }
+ }
+
+ old_filter_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL;
+
+ /* If there's nothing in filter_hash we need to register the ops. */
+ reg = hash_count(old_filter_hash) == 0;
+ if (reg) {
+ if (ops->func || ops->trampoline)
+ goto out_unlock;
+ if (ops->flags & FTRACE_OPS_FL_ENABLED)
+ goto out_unlock;
+ }
+
+ err = -ENOMEM;
+ new_filter_hash = hash_add(old_filter_hash, hash);
+ if (!new_filter_hash)
+ goto out_unlock;
+
+ new_direct_functions = hash_add(direct_functions, hash);
+ if (!new_direct_functions)
+ goto out_unlock;
+
+ old_direct_functions = direct_functions;
+ rcu_assign_pointer(direct_functions, new_direct_functions);
+
+ if (reg) {
+ ops->func = call_direct_funcs;
+ ops->flags |= MULTI_FLAGS;
+ ops->trampoline = FTRACE_REGS_ADDR;
+ ops->local_hash.filter_hash = new_filter_hash;
+
+ err = register_ftrace_function_nolock(ops);
+ if (err) {
+ /* restore old filter on error */
+ ops->local_hash.filter_hash = old_filter_hash;
+
+ /* clean up for a possible later register call */
+ ops->func = NULL;
+ ops->trampoline = 0;
+ } else {
+ new_filter_hash = old_filter_hash;
+ }
+ } else {
+ err = ftrace_update_ops(ops, new_filter_hash, EMPTY_HASH);
+ /*
+ * new_filter_hash is dup-ed, so we need to release it anyway,
+ * old_filter_hash either stays on error or is already released
+ */
+ }
+
+ if (err) {
+ /* reset direct_functions and free the new one */
+ rcu_assign_pointer(direct_functions, old_direct_functions);
+ old_direct_functions = new_direct_functions;
+ }
+
+ out_unlock:
+ mutex_unlock(&direct_mutex);
+
+ if (old_direct_functions && old_direct_functions != EMPTY_HASH)
+ call_rcu_tasks(&old_direct_functions->rcu, register_ftrace_direct_cb);
+ free_ftrace_hash(new_filter_hash);
+
+ return err;
+}
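
A minimal usage sketch for the new API (editorial, not part of the patch; the ip and direct address are placeholders, and it relies on the hash being only read by the helper, as the hash_add() copies above suggest):

	static int example_direct_attach(struct ftrace_ops *ops,
					 unsigned long ip, unsigned long addr)
	{
		struct ftrace_hash *hash;
		int err;

		hash = alloc_ftrace_hash(4);	/* size_bits chosen arbitrarily */
		if (!hash)
			return -ENOMEM;
		if (!add_ftrace_hash_entry_direct(hash, ip, addr)) {
			free_ftrace_hash(hash);
			return -ENOMEM;
		}
		err = update_ftrace_direct_add(ops, hash);
		free_ftrace_hash(hash);		/* entries were copied */
		return err;
	}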
+
+/**
+ * hash_sub - subtracts the entries of @b from @a and returns the result
+ * @a: struct ftrace_hash object
+ * @b: struct ftrace_hash object
+ *
+ * Returns: a new struct ftrace_hash object on success, NULL on error.
+ */
+static struct ftrace_hash *hash_sub(struct ftrace_hash *a, struct ftrace_hash *b)
+{
+ struct ftrace_func_entry *entry, *del;
+ struct ftrace_hash *sub;
+ int size;
+
+ sub = alloc_and_copy_ftrace_hash(a->size_bits, a);
+ if (!sub)
+ return NULL;
+
+ size = 1 << b->size_bits;
+ for (int i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &b->buckets[i], hlist) {
+ del = __ftrace_lookup_ip(sub, entry->ip);
+ if (WARN_ON_ONCE(!del)) {
+ free_ftrace_hash(sub);
+ return NULL;
+ }
+ remove_hash_entry(sub, del);
+ kfree(del);
+ }
+ }
+ return sub;
+}
+
+/**
+ * update_ftrace_direct_del - Updates @ops by removing its direct
+ * callers provided in @hash
+ * @ops: The address of the struct ftrace_ops object
+ * @hash: The address of the struct ftrace_hash object
+ *
+ * This is used to delete custom direct callers (ip -> addr) in
+ * @ops specified via @hash. The @ops will be either unregistered
+ * or updated.
+ *
+ * Returns: zero on success. Non zero on error, which includes:
+ * -EINVAL - The @hash is empty
+ * -EINVAL - The @ops is not registered
+ */
+int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash)
+{
+ struct ftrace_hash *old_direct_functions = NULL;
+ struct ftrace_hash *new_direct_functions;
+ struct ftrace_hash *new_filter_hash = NULL;
+ struct ftrace_hash *old_filter_hash;
+ struct ftrace_func_entry *entry;
+ struct ftrace_func_entry *del;
+ unsigned long size;
+ int err = -EINVAL;
+
+ if (!hash_count(hash))
+ return -EINVAL;
+ if (check_direct_multi(ops))
+ return -EINVAL;
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EINVAL;
+ if (direct_functions == EMPTY_HASH)
+ return -EINVAL;
+
+ mutex_lock(&direct_mutex);
+
+ old_filter_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL;
+
+ if (!hash_count(old_filter_hash))
+ goto out_unlock;
+
+ /* Make sure requested entries are already registered. */
+ size = 1 << hash->size_bits;
+ for (int i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ del = __ftrace_lookup_ip(direct_functions, entry->ip);
+ if (!del || del->direct != entry->direct)
+ goto out_unlock;
+ }
+ }
+
+ err = -ENOMEM;
+ new_filter_hash = hash_sub(old_filter_hash, hash);
+ if (!new_filter_hash)
+ goto out_unlock;
+
+ new_direct_functions = hash_sub(direct_functions, hash);
+ if (!new_direct_functions)
+ goto out_unlock;
+
+ /* If there's nothing left, we need to unregister the ops. */
+ if (ftrace_hash_empty(new_filter_hash)) {
+ err = unregister_ftrace_function(ops);
+ if (!err) {
+ /* clean up for a possible later register call */
+ ops->func = NULL;
+ ops->trampoline = 0;
+ ftrace_free_filter(ops);
+ ops->func_hash->filter_hash = NULL;
+ }
+ } else {
+ err = ftrace_update_ops(ops, new_filter_hash, EMPTY_HASH);
+ /*
+ * new_filter_hash is dup-ed, so we need to release it anyway,
+ * old_filter_hash either stays on error or is already released
+ */
+ }
+
+ if (err) {
+ /* free the new_direct_functions */
+ old_direct_functions = new_direct_functions;
+ } else {
+ old_direct_functions = direct_functions;
+ rcu_assign_pointer(direct_functions, new_direct_functions);
+ }
+
+ out_unlock:
+ mutex_unlock(&direct_mutex);
+
+ if (old_direct_functions && old_direct_functions != EMPTY_HASH)
+ call_rcu_tasks(&old_direct_functions->rcu, register_ftrace_direct_cb);
+ free_ftrace_hash(new_filter_hash);
+
+ return err;
+}
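
The matching detach path (same editorial caveats as the attach sketch above); note the hash entry must carry the same direct address, since update_ftrace_direct_del() rejects entries whose ->direct does not match what is registered:

	static int example_direct_detach(struct ftrace_ops *ops,
					 unsigned long ip, unsigned long addr)
	{
		struct ftrace_hash *hash;
		int err;

		hash = alloc_ftrace_hash(4);
		if (!hash)
			return -ENOMEM;
		if (!add_ftrace_hash_entry_direct(hash, ip, addr)) {
			free_ftrace_hash(hash);
			return -ENOMEM;
		}
		err = update_ftrace_direct_del(ops, hash);
		free_ftrace_hash(hash);
		return err;
	}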
+
+/**
+ * update_ftrace_direct_mod - Updates @ops by modifying its direct
+ * callers provided in @hash
+ * @ops: The address of the struct ftrace_ops object
+ * @hash: The address of the struct ftrace_hash object
+ * @do_direct_lock: If true, lock the direct_mutex
+ *
+ * This is used to modify custom direct callers (ip -> addr) in
+ * @ops specified via @hash.
+ *
+ * This can be called from within ftrace ops_func callback with
+ * direct_mutex already locked, in which case @do_direct_lock
+ * needs to be false.
+ *
+ * Returns: zero on success. Non zero on error, which includes:
+ * -EINVAL - The @hash is empty
+ * -EINVAL - The @ops is not registered
+ */
+int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, bool do_direct_lock)
+{
+ struct ftrace_func_entry *entry, *tmp;
+ static struct ftrace_ops tmp_ops = {
+ .func = ftrace_stub,
+ .flags = FTRACE_OPS_FL_STUB,
+ };
+ struct ftrace_hash *orig_hash;
+ unsigned long size, i;
+ int err = -EINVAL;
+
+ if (!hash_count(hash))
+ return -EINVAL;
+ if (check_direct_multi(ops))
+ return -EINVAL;
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EINVAL;
+ if (direct_functions == EMPTY_HASH)
+ return -EINVAL;
+
+ /*
+ * We can be called from within ops_func callback with direct_mutex
+ * already taken.
+ */
+ if (do_direct_lock)
+ mutex_lock(&direct_mutex);
+
+ orig_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL;
+ if (!orig_hash)
+ goto unlock;
+
+ /* Enable the tmp_ops to have the same functions as the direct ops */
+ ftrace_ops_init(&tmp_ops);
+ tmp_ops.func_hash = ops->func_hash;
+
+ err = register_ftrace_function_nolock(&tmp_ops);
+ if (err)
+ goto unlock;
+
+ /*
+ * Call __ftrace_hash_update_ipmodify() here, so that we can call
+ * ops->ops_func for the ops. This is needed because the above
+ * register_ftrace_function_nolock() worked on tmp_ops.
+ */
+ err = __ftrace_hash_update_ipmodify(ops, orig_hash, orig_hash, true);
+ if (err)
+ goto out;
+
+ /*
+ * Now the ftrace_ops_list_func() is called to do the direct callers.
+ * We can safely change the direct functions attached to each entry.
+ */
+ mutex_lock(&ftrace_lock);
+
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ tmp = __ftrace_lookup_ip(direct_functions, entry->ip);
+ if (!tmp)
+ continue;
+ tmp->direct = entry->direct;
+ }
+ }
+
+ mutex_unlock(&ftrace_lock);
+
+out:
+ /* Removing the tmp_ops will add the updated direct callers to the functions */
+ unregister_ftrace_function(&tmp_ops);
+
+unlock:
+ if (do_direct_lock)
+ mutex_unlock(&direct_mutex);
+ return err;
+}
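
And the live re-point case (editorial sketch; new_addr is a placeholder): build a hash whose entries map already-registered ips to the new direct address and let update_ftrace_direct_mod() swap them under the temporary stub ops:

	/* hash = alloc_ftrace_hash(4);
	 * add_ftrace_hash_entry_direct(hash, ip, new_addr);
	 * err = update_ftrace_direct_mod(ops, hash, true);
	 * free_ftrace_hash(hash);
	 */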
+
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
/**
@@ -8709,7 +9068,7 @@ static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops)
if (!op->ops_func)
return -EBUSY;
- ret = op->ops_func(op, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER);
+ ret = op->ops_func(op, ip, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER);
if (ret)
return ret;
}
@@ -8756,7 +9115,7 @@ static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops)
/* The cleanup is optional, ignore any errors */
if (found_op && op->ops_func)
- op->ops_func(op, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER);
+ op->ops_func(op, ip, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER);
}
}
mutex_unlock(&direct_mutex);