From 486ff5ad49bc50315bcaf6d45f04a33ef0a45ced Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 2 Jun 2025 21:51:05 -0700 Subject: perf/core: Fix invalid wait context in ctx_sched_in() Lockdep found a bug in the event scheduling when a pinned event was failed and wakes up the threads in the ring buffer like below. It seems it should not grab a wait-queue lock under perf-context lock. Let's do it with irq_work. [ 39.913691] ============================= [ 39.914157] [ BUG: Invalid wait context ] [ 39.914623] 6.15.0-next-20250530-next-2025053 #1 Not tainted [ 39.915271] ----------------------------- [ 39.915731] repro/837 is trying to lock: [ 39.916191] ffff88801acfabd8 (&event->waitq){....}-{3:3}, at: __wake_up+0x26/0x60 [ 39.917182] other info that might help us debug this: [ 39.917761] context-{5:5} [ 39.918079] 4 locks held by repro/837: [ 39.918530] #0: ffffffff8725cd00 (rcu_read_lock){....}-{1:3}, at: __perf_event_task_sched_in+0xd1/0xbc0 [ 39.919612] #1: ffff88806ca3c6f8 (&cpuctx_lock){....}-{2:2}, at: __perf_event_task_sched_in+0x1a7/0xbc0 [ 39.920748] #2: ffff88800d91fc18 (&ctx->lock){....}-{2:2}, at: __perf_event_task_sched_in+0x1f9/0xbc0 [ 39.921819] #3: ffffffff8725cd00 (rcu_read_lock){....}-{1:3}, at: perf_event_wakeup+0x6c/0x470 Fixes: f4b07fd62d4d ("perf/core: Use POLLHUP for a pinned event in error") Closes: https://lore.kernel.org/lkml/aD2w50VDvGIH95Pf@ly-workstation Reported-by: "Lai, Yi" Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Tested-by: "Lai, Yi" Link: https://patch.msgid.link/20250603045105.1731451-1-namhyung@kernel.org --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index ac70d68217b6..4f86d22f281a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4138,7 +4138,8 @@ static int merge_sched_in(struct perf_event *event, void *data) if (*perf_event_fasync(event)) event->pending_kill = POLL_ERR; - perf_event_wakeup(event); + event->pending_wakeup = 1; + irq_work_queue(&event->pending_irq); } else { struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu); -- cgit v1.2.3 From 6a8a48644c4b804123e59dbfc5d6cd29a0194046 Mon Sep 17 00:00:00 2001 From: Zide Chen Date: Mon, 9 Feb 2026 16:52:25 -0800 Subject: perf/x86/intel/uncore: Add per-scheduler IMC CAS count events IMC on SPR and EMR does not support sub-channels. In contrast, CPUs that use gnr_uncores[] (e.g. Granite Rapids and Sierra Forest) implement two command schedulers (SCH0/SCH1) per memory channel, providing logically independent command and data paths. Do not reuse the spr_uncore_imc[] configuration for these CPUs. Instead, introduce a dedicated gnr_uncore_imc[] with per-scheduler events, so userspace can monitor SCH0 and SCH1 independently. On these CPUs, replace cas_count_{read,write} with cas_count_{read,write}_sch{0,1}. This may break existing userspace that relies on cas_count_{read,write}, prompting it to switch to the per-scheduler events, as the legacy event reports only partial traffic (SCH0). Fixes: 632c4bf6d007 ("perf/x86/intel/uncore: Support Granite Rapids") Fixes: cb4a6ccf3583 ("perf/x86/intel/uncore: Support Sierra Forest and Grand Ridge") Reported-by: Reinette Chatre Signed-off-by: Zide Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260210005225.20311-1-zide.chen@intel.com --- arch/x86/events/intel/uncore_snbep.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 5ed6e0b7e715..0a1d08136cc1 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -6497,6 +6497,32 @@ static struct intel_uncore_type gnr_uncore_ubox = { .attr_update = uncore_alias_groups, }; +static struct uncore_event_desc gnr_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x01,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(cas_count_read_sch0, "event=0x05,umask=0xcf"), + INTEL_UNCORE_EVENT_DESC(cas_count_read_sch0.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_read_sch0.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(cas_count_read_sch1, "event=0x06,umask=0xcf"), + INTEL_UNCORE_EVENT_DESC(cas_count_read_sch1.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_read_sch1.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(cas_count_write_sch0, "event=0x05,umask=0xf0"), + INTEL_UNCORE_EVENT_DESC(cas_count_write_sch0.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_write_sch0.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(cas_count_write_sch1, "event=0x06,umask=0xf0"), + INTEL_UNCORE_EVENT_DESC(cas_count_write_sch1.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_write_sch1.unit, "MiB"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_type gnr_uncore_imc = { + SPR_UNCORE_MMIO_COMMON_FORMAT(), + .name = "imc", + .fixed_ctr_bits = 48, + .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR, + .fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL, + .event_descs = gnr_uncore_imc_events, +}; + static struct intel_uncore_type gnr_uncore_pciex8 = { SPR_UNCORE_PCI_COMMON_FORMAT(), .name = "pciex8", @@ -6544,7 +6570,7 @@ static struct intel_uncore_type *gnr_uncores[UNCORE_GNR_NUM_UNCORE_TYPES] = { NULL, &spr_uncore_pcu, &gnr_uncore_ubox, - &spr_uncore_imc, + &gnr_uncore_imc, NULL, &gnr_uncore_upi, NULL, -- cgit v1.2.3 From 77de62ad3de3967818c3dbe656b7336ebee461d2 Mon Sep 17 00:00:00 2001 From: Haocheng Yu Date: Tue, 3 Feb 2026 00:20:56 +0800 Subject: perf/core: Fix refcount bug and potential UAF in perf_mmap Syzkaller reported a refcount_t: addition on 0; use-after-free warning in perf_mmap. The issue is caused by a race condition between a failing mmap() setup and a concurrent mmap() on a dependent event (e.g., using output redirection). In perf_mmap(), the ring_buffer (rb) is allocated and assigned to event->rb with the mmap_mutex held. The mutex is then released to perform map_range(). If map_range() fails, perf_mmap_close() is called to clean up. However, since the mutex was dropped, another thread attaching to this event (via inherited events or output redirection) can acquire the mutex, observe the valid event->rb pointer, and attempt to increment its reference count. If the cleanup path has already dropped the reference count to zero, this results in a use-after-free or refcount saturation warning. Fix this by extending the scope of mmap_mutex to cover the map_range() call. This ensures that the ring buffer initialization and mapping (or cleanup on failure) happens atomically effectively, preventing other threads from accessing a half-initialized or dying ring buffer. Closes: https://lore.kernel.org/oe-kbuild-all/202602020208.m7KIjdzW-lkp@intel.com/ Reported-by: kernel test robot Signed-off-by: Haocheng Yu Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260202162057.7237-1-yuhaocheng035@gmail.com --- kernel/events/core.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 4f86d22f281a..22a0f405585b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7465,28 +7465,28 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) ret = perf_mmap_aux(vma, event, nr_pages); if (ret) return ret; - } - /* - * Since pinned accounting is per vm we cannot allow fork() to copy our - * vma. - */ - vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); - vma->vm_ops = &perf_mmap_vmops; + /* + * Since pinned accounting is per vm we cannot allow fork() to copy our + * vma. + */ + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); + vma->vm_ops = &perf_mmap_vmops; - mapped = get_mapped(event, event_mapped); - if (mapped) - mapped(event, vma->vm_mm); + mapped = get_mapped(event, event_mapped); + if (mapped) + mapped(event, vma->vm_mm); - /* - * Try to map it into the page table. On fail, invoke - * perf_mmap_close() to undo the above, as the callsite expects - * full cleanup in this case and therefore does not invoke - * vmops::close(). - */ - ret = map_range(event->rb, vma); - if (ret) - perf_mmap_close(vma); + /* + * Try to map it into the page table. On fail, invoke + * perf_mmap_close() to undo the above, as the callsite expects + * full cleanup in this case and therefore does not invoke + * vmops::close(). + */ + ret = map_range(event->rb, vma); + if (ret) + perf_mmap_close(vma); + } return ret; } -- cgit v1.2.3 From c9bc1753b3cc41d0e01fbca7f035258b5f4db0ae Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Feb 2026 13:29:09 +0100 Subject: perf: Fix __perf_event_overflow() vs perf_remove_from_context() race Make sure that __perf_event_overflow() runs with IRQs disabled for all possible callchains. Specifically the software events can end up running it with only preemption disabled. This opens up a race vs perf_event_exit_event() and friends that will go and free various things the overflow path expects to be present, like the BPF program. Fixes: 592903cdcbf6 ("perf_counter: add an event_list") Reported-by: Simond Hu Signed-off-by: Peter Zijlstra (Intel) Tested-by: Simond Hu Link: https://patch.msgid.link/20260224122909.GV1395416@noisy.programming.kicks-ass.net --- kernel/events/core.c | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 22a0f405585b..1f5699b339ec 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10777,6 +10777,13 @@ int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { + /* + * Entry point from hardware PMI, interrupts should be disabled here. + * This serializes us against perf_event_remove_from_context() in + * things like perf_event_release_kernel(). + */ + lockdep_assert_irqs_disabled(); + return __perf_event_overflow(event, 1, data, regs); } @@ -10853,6 +10860,19 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, { struct hw_perf_event *hwc = &event->hw; + /* + * This is: + * - software preempt + * - tracepoint preempt + * - tp_target_task irq (ctx->lock) + * - uprobes preempt/irq + * - kprobes preempt/irq + * - hw_breakpoint irq + * + * Any of these are sufficient to hold off RCU and thus ensure @event + * exists. + */ + lockdep_assert_preemption_disabled(); local64_add(nr, &event->count); if (!regs) @@ -10861,6 +10881,16 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, if (!is_sampling_event(event)) return; + /* + * Serialize against event_function_call() IPIs like normal overflow + * event handling. Specifically, must not allow + * perf_event_release_kernel() -> perf_remove_from_context() to make + * progress and 'release' the event from under us. + */ + guard(irqsave)(); + if (event->state != PERF_EVENT_STATE_ACTIVE) + return; + if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { data->period = nr; return perf_swevent_overflow(event, 1, data, regs); @@ -11359,6 +11389,11 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct perf_sample_data data; struct perf_event *event; + /* + * Per being a tracepoint, this runs with preemption disabled. + */ + lockdep_assert_preemption_disabled(); + struct perf_raw_record raw = { .frag = { .size = entry_size, @@ -11691,6 +11726,11 @@ void perf_bp_event(struct perf_event *bp, void *data) struct perf_sample_data sample; struct pt_regs *regs = data; + /* + * Exception context, will have interrupts disabled. + */ + lockdep_assert_irqs_disabled(); + perf_sample_data_init(&sample, bp->attr.bp_addr, 0); if (!bp->hw.state && !perf_exclude_event(bp, regs)) @@ -12155,7 +12195,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) if (regs && !perf_exclude_event(event, regs)) { if (!(event->attr.exclude_idle && is_idle_task(current))) - if (__perf_event_overflow(event, 1, &data, regs)) + if (perf_event_overflow(event, &data, regs)) ret = HRTIMER_NORESTART; } -- cgit v1.2.3