summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorAlexei Starovoitov <ast@kernel.org>2026-02-27 16:06:40 -0800
committerAlexei Starovoitov <ast@kernel.org>2026-02-27 16:08:34 -0800
commit8c0d9e178d4a91ae596b2780d9b0a77f9d4186ad (patch)
tree2c0b45c5ab04caab0f464eb2aee1dd379dcd1fe1 /kernel
parent5263e30fffbcc7934671f0421eafb87b690a01d2 (diff)
parent1872e75375c40add4a35990de3be77b5741c252c (diff)
Merge branch 'bpf-cpumap-devmap-fix-per-cpu-bulk-queue-races-on-preempt_rt'
Jiayuan Chen says: ==================== bpf: Fix per-CPU bulk queue races on PREEMPT_RT On PREEMPT_RT kernels, local_bh_disable() only calls migrate_disable() (when PREEMPT_RT_NEEDS_BH_LOCK is not set) and does not disable preemption. This means CFS scheduling can preempt a task inside the per-CPU bulk queue (bq) operations in cpumap and devmap, allowing another task on the same CPU to concurrently access the same bq, leading to use-after-free, list corruption, and kernel panics. Patch 1 fixes the cpumap race in bq_flush_to_queue(), originally reported by syzbot [1]. Patch 2 fixes the same class of race in devmap's bq_xmit_all(), identified by code inspection after Sebastian Andrzej Siewior pointed out that devmap has the same per-CPU bulk queue pattern [2]. Both patches use local_lock_nested_bh() to serialize access to the per-CPU bq. On non-RT this is a pure lockdep annotation with no overhead; on PREEMPT_RT it provides a per-CPU sleeping lock. To reproduce the devmap race, insert an mdelay(100) in bq_xmit_all() after "cnt = bq->count" and before the actual transmit loop. Then pin two threads to the same CPU, each running BPF_PROG_TEST_RUN with an XDP program that redirects to a DEVMAP entry (e.g. a veth pair). CFS timeslicing during the mdelay window causes interleaving. Without the fix, KASAN reports null-ptr-deref due to operating on freed frames: BUG: KASAN: null-ptr-deref in __build_skb_around+0x22d/0x340 Write of size 32 at addr 0000000000000d50 by task devmap_race_rep/449 CPU: 0 UID: 0 PID: 449 Comm: devmap_race_rep Not tainted 6.19.0+ #31 PREEMPT_RT Call Trace: <TASK> __build_skb_around+0x22d/0x340 build_skb_around+0x25/0x260 __xdp_build_skb_from_frame+0x103/0x860 veth_xdp_rcv_bulk_skb.isra.0+0x162/0x320 veth_xdp_rcv.constprop.0+0x61e/0xbb0 veth_poll+0x280/0xb50 __napi_poll.constprop.0+0xa5/0x590 net_rx_action+0x4b0/0xea0 handle_softirqs.isra.0+0x1b3/0x780 __local_bh_enable_ip+0x12a/0x240 xdp_test_run_batch.constprop.0+0xedd/0x1f60 bpf_test_run_xdp_live+0x304/0x640 bpf_prog_test_run_xdp+0xd24/0x1b70 __sys_bpf+0x61c/0x3e00 </TASK> Kernel panic - not syncing: Fatal exception in interrupt [1] https://lore.kernel.org/all/69369331.a70a0220.38f243.009d.GAE@google.com/T/ [2] https://lore.kernel.org/bpf/20260212023634.366343-1-jiayuan.chen@linux.dev/ v3 -> v4: https://lore.kernel.org/all/20260213034018.284146-1-jiayuan.chen@linux.dev/ - Move panic trace to cover letter. (Sebastian Andrzej Siewior) - Add Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> to both patches from cover letter. v2 -> v3: https://lore.kernel.org/bpf/20260212023634.366343-1-jiayuan.chen@linux.dev/ - Fix commit message: remove incorrect "spin_lock() becomes rt_mutex" claim, the per-CPU bq has no spin_lock at all. (Sebastian Andrzej Siewior) - Fix commit message: accurately describe local_lock_nested_bh() behavior instead of referencing local_lock(). (Sebastian Andrzej Siewior) - Remove incomplete discussion of snapshot alternative. (Sebastian Andrzej Siewior) - Remove panic trace from commit message. (Sebastian Andrzej Siewior) - Add patch 2/2 for devmap, same race pattern. (Sebastian Andrzej Siewior) v1 -> v2: https://lore.kernel.org/bpf/20260211064417.196401-1-jiayuan.chen@linux.dev/ - Use local_lock_nested_bh()/local_unlock_nested_bh() instead of local_lock()/local_unlock(), since these paths already run under local_bh_disable(). (Sebastian Andrzej Siewior) - Replace "Caller must hold bq->bq_lock" comment with lockdep_assert_held() in bq_flush_to_queue(). (Sebastian Andrzej Siewior) - Fix Fixes tag to 3253cb49cbad ("softirq: Allow to drop the softirq-BKL lock on PREEMPT_RT") which is the actual commit that makes the race possible. (Sebastian Andrzej Siewior) ==================== Link: https://patch.msgid.link/20260225121459.183121-1-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/cpumap.c17
-rw-r--r--kernel/bpf/devmap.c25
2 files changed, 36 insertions, 6 deletions
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 04171fbc39cb..32b43cb9061b 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -29,6 +29,7 @@
#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
+#include <linux/local_lock.h>
#include <linux/completion.h>
#include <trace/events/xdp.h>
#include <linux/btf_ids.h>
@@ -52,6 +53,7 @@ struct xdp_bulk_queue {
struct list_head flush_node;
struct bpf_cpu_map_entry *obj;
unsigned int count;
+ local_lock_t bq_lock;
};
/* Struct for every remote "destination" CPU in map */
@@ -451,6 +453,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
for_each_possible_cpu(i) {
bq = per_cpu_ptr(rcpu->bulkq, i);
bq->obj = rcpu;
+ local_lock_init(&bq->bq_lock);
}
/* Alloc queue */
@@ -722,6 +725,8 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
struct ptr_ring *q;
int i;
+ lockdep_assert_held(&bq->bq_lock);
+
if (unlikely(!bq->count))
return;
@@ -749,11 +754,15 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
}
/* Runs under RCU-read-side, plus in softirq under NAPI protection.
- * Thus, safe percpu variable access.
+ * Thus, safe percpu variable access. PREEMPT_RT relies on
+ * local_lock_nested_bh() to serialise access to the per-CPU bq.
*/
static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
- struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
+ struct xdp_bulk_queue *bq;
+
+ local_lock_nested_bh(&rcpu->bulkq->bq_lock);
+ bq = this_cpu_ptr(rcpu->bulkq);
if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
bq_flush_to_queue(bq);
@@ -774,6 +783,8 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
list_add(&bq->flush_node, flush_list);
}
+
+ local_unlock_nested_bh(&rcpu->bulkq->bq_lock);
}
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
@@ -810,7 +821,9 @@ void __cpu_map_flush(struct list_head *flush_list)
struct xdp_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
+ local_lock_nested_bh(&bq->obj->bulkq->bq_lock);
bq_flush_to_queue(bq);
+ local_unlock_nested_bh(&bq->obj->bulkq->bq_lock);
/* If already running, costs spin_lock_irqsave + smb_mb */
wake_up_process(bq->obj->kthread);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 2984e938f94d..3d619d01088e 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -45,6 +45,7 @@
* types of devmap; only the lookup and insertion is different.
*/
#include <linux/bpf.h>
+#include <linux/local_lock.h>
#include <net/xdp.h>
#include <linux/filter.h>
#include <trace/events/xdp.h>
@@ -60,6 +61,7 @@ struct xdp_dev_bulk_queue {
struct net_device *dev_rx;
struct bpf_prog *xdp_prog;
unsigned int count;
+ local_lock_t bq_lock;
};
struct bpf_dtab_netdev {
@@ -381,6 +383,8 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
int to_send = cnt;
int i;
+ lockdep_assert_held(&bq->bq_lock);
+
if (unlikely(!cnt))
return;
@@ -425,10 +429,12 @@ void __dev_flush(struct list_head *flush_list)
struct xdp_dev_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
+ local_lock_nested_bh(&bq->dev->xdp_bulkq->bq_lock);
bq_xmit_all(bq, XDP_XMIT_FLUSH);
bq->dev_rx = NULL;
bq->xdp_prog = NULL;
__list_del_clearprev(&bq->flush_node);
+ local_unlock_nested_bh(&bq->dev->xdp_bulkq->bq_lock);
}
}
@@ -451,12 +457,16 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
/* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu
* variable access, and map elements stick around. See comment above
- * xdp_do_flush() in filter.c.
+ * xdp_do_flush() in filter.c. PREEMPT_RT relies on local_lock_nested_bh()
+ * to serialise access to the per-CPU bq.
*/
static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx, struct bpf_prog *xdp_prog)
{
- struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
+ struct xdp_dev_bulk_queue *bq;
+
+ local_lock_nested_bh(&dev->xdp_bulkq->bq_lock);
+ bq = this_cpu_ptr(dev->xdp_bulkq);
if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
bq_xmit_all(bq, 0);
@@ -477,6 +487,8 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
}
bq->q[bq->count++] = xdpf;
+
+ local_unlock_nested_bh(&dev->xdp_bulkq->bq_lock);
}
static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
@@ -1127,8 +1139,13 @@ static int dev_map_notification(struct notifier_block *notifier,
if (!netdev->xdp_bulkq)
return NOTIFY_BAD;
- for_each_possible_cpu(cpu)
- per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev;
+ for_each_possible_cpu(cpu) {
+ struct xdp_dev_bulk_queue *bq;
+
+ bq = per_cpu_ptr(netdev->xdp_bulkq, cpu);
+ bq->dev = netdev;
+ local_lock_init(&bq->bq_lock);
+ }
break;
case NETDEV_UNREGISTER:
/* This rcu_read_lock/unlock pair is needed because