Merge branch 'close-race-in-freeing-special-fields-and-map-value'

Kumar Kartikeya Dwivedi says: ==================== Close race in freeing special fields and map value There exists a race across various map types where the freeing of special fields (tw, timer, wq, kptr, etc.) can be done eagerly when a logical delete operation is done on a map value, such that the program which continues to have access to such a map value can recreate the fields and cause them to leak. The set contains fixes for this case. It is a continuation of Mykyta's previous attempt in [0], but applies to all fields. A test is included which reproduces the bug reliably in absence of the fixes. Local Storage Benchmarks ------------------------ Evaluation Setup: Benchmarked on a dual-socket Intel Xeon Gold 6348 (Ice Lake) @ 2.60GHz (56 cores / 112 threads), with the CPU governor set to performance. Bench was pinned to a single NUMA node throughout the test. Benchmark comes from [1] using the following command: ./bench -p 1 local-storage-create --storage-type <socket,task> --batch-size <16,32,64> Before the test, 10 runs of all cases ([socket|task] x 3 batch sizes x 7 iterations per batch size) are done to warm up and prime the machine. Then, 3 runs of all cases are done (with and without the patch, across reboots). For each comparison, we have 21 samples, i.e. per batch size (e.g. socket 16) of a given local storage, we have 3 runs x 7 iterations. The statistics (mean, median, stddev) and t-test is done for each scenario (local storage and batch size pair) individually (21 samples for either case). All values are for local storage creations in thousand creations / sec (k/s). Baseline (without patch) With patch Delta Case Median Mean Std. Dev. Median Mean Std. Dev. Median % --------------------------------------------------------------------------------------------------- socket 16 432.026 431.941 1.047 431.347 431.953 1.635 -0.679 -0.16% socket 32 432.641 432.818 1.535 432.488 432.302 1.508 -0.153 -0.04% socket 64 431.504 431.996 1.337 429.145 430.326 2.469 -2.359 -0.55% task 16 38.816 39.382 1.456 39.657 39.337 1.831 +0.841 +2.17% task 32 38.815 39.644 2.690 38.721 39.122 1.636 -0.094 -0.24% task 64 37.562 38.080 1.701 39.554 38.563 1.689 +1.992 +5.30% The cases for socket are within the range of noise, and improvements in task local storage are due to high variance (CV ~4%-6% across batch sizes). The only statistically significant case worth mentioning is socket with batch size 64 with p-value from t-test < 0.05, but the absolute difference is small (~2k/s). TL;DR there doesn't appear to be any significant regression or improvement. [0]: https://lore.kernel.org/bpf/20260216131341.1285427-1-mykyta.yatsenko5@gmail.com [1]: https://lore.kernel.org/bpf/20260205222916.1788211-1-ameryhung@gmail.com Changelog: ---------- v2 -> v3 v2: https://lore.kernel.org/bpf/20260227052031.3988575-1-memxor@gmail.com * Add syzbot Tested-by. * Add Amery's Reviewed-by. * Fix missing rcu_dereference_check() in __bpf_selem_free_rcu. (BPF CI Bot) * Remove migrate_disable() in bpf_selem_free_rcu. (Alexei) v1 -> v2 v1: https://lore.kernel.org/bpf/20260225185121.2057388-1-memxor@gmail.com * Add Paul's Reviewed-by. * Fix use-after-free in accessing bpf_mem_alloc embedded in map. (syzbot CI) * Add benchmark numbers for local storage. * Add extra test case for per-cpu hashmap coverage with up to 16 refcount leaks. * Target bpf tree. ==================== Link: https://patch.msgid.link/20260227224806.646888-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
author: Alexei Starovoitov <ast@kernel.org> 2026-02-27 15:39:01 -0800
committer: Alexei Starovoitov <ast@kernel.org> 2026-02-27 15:39:01 -0800
commit: 5263e30fffbcc7934671f0421eafb87b690a01d2 (patch)
tree: 9d89fe95ffee4efc17eef99084b39ba515d3ae80 /kernel/bpf/memalloc.c
parent: 6881af27f9ea0f5ca8f606f573ef5cc25ca31fe4 (diff)
parent: 2939d7b3b0e5f35359ce8f69dbbad0bfc4e920b6 (diff)
1 files changed, 47 insertions, 11 deletions
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index bd45dda9dc35..682a9f34214b 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -102,6 +102,8 @@ struct bpf_mem_cache {
 	int percpu_size;
 	bool draining;
 	struct bpf_mem_cache *tgt;
+	void (*dtor)(void *obj, void *ctx);
+	void *dtor_ctx;
 
 	/* list of objects to be freed after RCU GP */
 	struct llist_head free_by_rcu;
@@ -260,12 +262,14 @@ static void free_one(void *obj, bool percpu)
 	kfree(obj);
 }
 
-static int free_all(struct llist_node *llnode, bool percpu)
+static int free_all(struct bpf_mem_cache *c, struct llist_node *llnode, bool percpu)
 {
 	struct llist_node *pos, *t;
 	int cnt = 0;
 
 	llist_for_each_safe(pos, t, llnode) {
+		if (c->dtor)
+			c->dtor((void *)pos + LLIST_NODE_SZ, c->dtor_ctx);
 		free_one(pos, percpu);
 		cnt++;
 	}
@@ -276,7 +280,7 @@ static void __free_rcu(struct rcu_head *head)
 {
 	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace);
 
-	free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
+	free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
 	atomic_set(&c->call_rcu_ttrace_in_progress, 0);
 }
 
@@ -308,7 +312,7 @@ static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
 	if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
 		if (unlikely(READ_ONCE(c->draining))) {
 			llnode = llist_del_all(&c->free_by_rcu_ttrace);
-			free_all(llnode, !!c->percpu_size);
+			free_all(c, llnode, !!c->percpu_size);
 		}
 		return;
 	}
@@ -417,7 +421,7 @@ static void check_free_by_rcu(struct bpf_mem_cache *c)
 	dec_active(c, &flags);
 
 	if (unlikely(READ_ONCE(c->draining))) {
-		free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
+		free_all(c, llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
 		atomic_set(&c->call_rcu_in_progress, 0);
 	} else {
 		call_rcu_hurry(&c->rcu, __free_by_rcu);
@@ -635,13 +639,13 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
 	 * Except for waiting_for_gp_ttrace list, there are no concurrent operations
 	 * on these lists, so it is safe to use __llist_del_all().
 	 */
-	free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu);
-	free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
-	free_all(__llist_del_all(&c->free_llist), percpu);
-	free_all(__llist_del_all(&c->free_llist_extra), percpu);
-	free_all(__llist_del_all(&c->free_by_rcu), percpu);
-	free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
-	free_all(llist_del_all(&c->waiting_for_gp), percpu);
+	free_all(c, llist_del_all(&c->free_by_rcu_ttrace), percpu);
+	free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), percpu);
+	free_all(c, __llist_del_all(&c->free_llist), percpu);
+	free_all(c, __llist_del_all(&c->free_llist_extra), percpu);
+	free_all(c, __llist_del_all(&c->free_by_rcu), percpu);
+	free_all(c, __llist_del_all(&c->free_llist_extra_rcu), percpu);
+	free_all(c, llist_del_all(&c->waiting_for_gp), percpu);
 }
 
 static void check_mem_cache(struct bpf_mem_cache *c)
@@ -680,6 +684,9 @@ static void check_leaked_objs(struct bpf_mem_alloc *ma)
 
 static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
 {
+	/* We can free dtor ctx only once all callbacks are done using it. */
+	if (ma->dtor_ctx_free)
+		ma->dtor_ctx_free(ma->dtor_ctx);
 	check_leaked_objs(ma);
 	free_percpu(ma->cache);
 	free_percpu(ma->caches);
@@ -1014,3 +1021,32 @@ int bpf_mem_alloc_check_size(bool percpu, size_t size)
 
 	return 0;
 }
+
+void bpf_mem_alloc_set_dtor(struct bpf_mem_alloc *ma, void (*dtor)(void *obj, void *ctx),
+			    void (*dtor_ctx_free)(void *ctx), void *ctx)
+{
+	struct bpf_mem_caches *cc;
+	struct bpf_mem_cache *c;
+	int cpu, i;
+
+	ma->dtor_ctx_free = dtor_ctx_free;
+	ma->dtor_ctx = ctx;
+
+	if (ma->cache) {
+		for_each_possible_cpu(cpu) {
+			c = per_cpu_ptr(ma->cache, cpu);
+			c->dtor = dtor;
+			c->dtor_ctx = ctx;
+		}
+	}
+	if (ma->caches) {
+		for_each_possible_cpu(cpu) {
+			cc = per_cpu_ptr(ma->caches, cpu);
+			for (i = 0; i < NUM_CACHES; i++) {
+				c = &cc->cache[i];
+				c->dtor = dtor;
+				c->dtor_ctx = ctx;
+			}
+		}
+	}
+}
author	Alexei Starovoitov <ast@kernel.org>	2026-02-27 15:39:01 -0800
committer	Alexei Starovoitov <ast@kernel.org>	2026-02-27 15:39:01 -0800
commit	5263e30fffbcc7934671f0421eafb87b690a01d2 (patch)
tree	9d89fe95ffee4efc17eef99084b39ba515d3ae80 /kernel/bpf/memalloc.c
parent	6881af27f9ea0f5ca8f606f573ef5cc25ca31fe4 (diff)
parent	2939d7b3b0e5f35359ce8f69dbbad0bfc4e920b6 (diff)