From ec5fd50aeff9c9156304853c6d75eda852d4a2c8 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 17 Feb 2025 08:43:35 +0100 Subject: uprobes: Don't use %pK through printk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restricted pointers ("%pK") are not meant to be used through printk(). It can unintentionally expose security sensitive, raw pointer values. Use regular pointer formatting instead. For more background, see: https://lore.kernel.org/lkml/20250113171731-dc10e3c1-da64-4af0-b767-7c7070468023@linutronix.de/ Signed-off-by: Thomas Weißschuh Signed-off-by: Ingo Molnar Cc: Masami Hiramatsu Cc: Oleg Nesterov Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250217-restricted-pointers-uprobes-v1-1-e8cbe5bb22a7@linutronix.de --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2ca797cbe465..bf2a87a0a378 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -417,7 +417,7 @@ static void update_ref_ctr_warn(struct uprobe *uprobe, struct mm_struct *mm, short d) { pr_warn("ref_ctr %s failed for inode: 0x%lx offset: " - "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n", + "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n", d > 0 ? "increment" : "decrement", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, (unsigned long long) uprobe->ref_ctr_offset, mm); -- cgit v1.2.3 From 0fe8813baf4b2e865d3b2c735ce1a15b86002c74 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 17 Jan 2025 06:41:07 -0800 Subject: perf/core: Add RCU read lock protection to perf_iterate_ctx() The perf_iterate_ctx() function performs RCU list traversal but currently lacks RCU read lock protection. This causes lockdep warnings when running perf probe with unshare(1) under CONFIG_PROVE_RCU_LIST=y: WARNING: suspicious RCU usage kernel/events/core.c:8168 RCU-list traversed in non-reader section!! Call Trace: lockdep_rcu_suspicious ? perf_event_addr_filters_apply perf_iterate_ctx perf_event_exec begin_new_exec ? load_elf_phdrs load_elf_binary ? lock_acquire ? find_held_lock ? bprm_execve bprm_execve do_execveat_common.isra.0 __x64_sys_execve do_syscall_64 entry_SYSCALL_64_after_hwframe This protection was previously present but was removed in commit bd2756811766 ("perf: Rewrite core context handling"). Add back the necessary rcu_read_lock()/rcu_read_unlock() pair around perf_iterate_ctx() call in perf_event_exec(). [ mingo: Use scoped_guard() as suggested by Peter ] Fixes: bd2756811766 ("perf: Rewrite core context handling") Signed-off-by: Breno Leitao Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250117-fix_perf_rcu-v1-1-13cb9210fc6a@debian.org --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index bcb09e011e9e..7dabbcaf825a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8321,7 +8321,8 @@ void perf_event_exec(void) perf_event_enable_on_exec(ctx); perf_event_remove_on_exec(ctx); - perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); + scoped_guard(rcu) + perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); perf_unpin_context(ctx); put_ctx(ctx); -- cgit v1.2.3 From 2016066c66192a99d9e0ebf433789c490a6785a2 Mon Sep 17 00:00:00 2001 From: Luo Gengkun Date: Wed, 22 Jan 2025 07:33:56 +0000 Subject: perf/core: Order the PMU list to fix warning about unordered pmu_ctx_list Syskaller triggers a warning due to prev_epc->pmu != next_epc->pmu in perf_event_swap_task_ctx_data(). vmcore shows that two lists have the same perf_event_pmu_context, but not in the same order. The problem is that the order of pmu_ctx_list for the parent is impacted by the time when an event/PMU is added. While the order for a child is impacted by the event order in the pinned_groups and flexible_groups. So the order of pmu_ctx_list in the parent and child may be different. To fix this problem, insert the perf_event_pmu_context to its proper place after iteration of the pmu_ctx_list. The follow testcase can trigger above warning: # perf record -e cycles --call-graph lbr -- taskset -c 3 ./a.out & # perf stat -e cpu-clock,cs -p xxx // xxx is the pid of a.out test.c void main() { int count = 0; pid_t pid; printf("%d running\n", getpid()); sleep(30); printf("running\n"); pid = fork(); if (pid == -1) { printf("fork error\n"); return; } if (pid == 0) { while (1) { count++; } } else { while (1) { count++; } } } The testcase first opens an LBR event, so it will allocate task_ctx_data, and then open tracepoint and software events, so the parent context will have 3 different perf_event_pmu_contexts. On inheritance, child ctx will insert the perf_event_pmu_context in another order and the warning will trigger. [ mingo: Tidied up the changelog. ] Fixes: bd2756811766 ("perf: Rewrite core context handling") Signed-off-by: Luo Gengkun Signed-off-by: Ingo Molnar Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20250122073356.1824736-1-luogengkun@huaweicloud.com --- kernel/events/core.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 7dabbcaf825a..086d46d09696 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4950,7 +4950,7 @@ static struct perf_event_pmu_context * find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, struct perf_event *event) { - struct perf_event_pmu_context *new = NULL, *epc; + struct perf_event_pmu_context *new = NULL, *pos = NULL, *epc; void *task_ctx_data = NULL; if (!ctx->task) { @@ -5007,12 +5007,19 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, atomic_inc(&epc->refcount); goto found_epc; } + /* Make sure the pmu_ctx_list is sorted by PMU type: */ + if (!pos && epc->pmu->type > pmu->type) + pos = epc; } epc = new; new = NULL; - list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); + if (!pos) + list_add_tail(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); + else + list_add(&epc->pmu_ctx_entry, pos->pmu_ctx_entry.prev); + epc->ctx = ctx; found_epc: -- cgit v1.2.3 From bddf10d26e6e5114e7415a0e442ec6f51a559468 Mon Sep 17 00:00:00 2001 From: Tong Tiangen Date: Mon, 24 Feb 2025 11:11:49 +0800 Subject: uprobes: Reject the shared zeropage in uprobe_write_opcode() We triggered the following crash in syzkaller tests: BUG: Bad page state in process syz.7.38 pfn:1eff3 page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1eff3 flags: 0x3fffff00004004(referenced|reserved|node=0|zone=1|lastcpupid=0x1fffff) raw: 003fffff00004004 ffffe6c6c07bfcc8 ffffe6c6c07bfcc8 0000000000000000 raw: 0000000000000000 0000000000000000 00000000fffffffe 0000000000000000 page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Call Trace: dump_stack_lvl+0x32/0x50 bad_page+0x69/0xf0 free_unref_page_prepare+0x401/0x500 free_unref_page+0x6d/0x1b0 uprobe_write_opcode+0x460/0x8e0 install_breakpoint.part.0+0x51/0x80 register_for_each_vma+0x1d9/0x2b0 __uprobe_register+0x245/0x300 bpf_uprobe_multi_link_attach+0x29b/0x4f0 link_create+0x1e2/0x280 __sys_bpf+0x75f/0xac0 __x64_sys_bpf+0x1a/0x30 do_syscall_64+0x56/0x100 entry_SYSCALL_64_after_hwframe+0x78/0xe2 BUG: Bad rss-counter state mm:00000000452453e0 type:MM_FILEPAGES val:-1 The following syzkaller test case can be used to reproduce: r2 = creat(&(0x7f0000000000)='./file0\x00', 0x8) write$nbd(r2, &(0x7f0000000580)=ANY=[], 0x10) r4 = openat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x42, 0x0) mmap$IORING_OFF_SQ_RING(&(0x7f0000ffd000/0x3000)=nil, 0x3000, 0x0, 0x12, r4, 0x0) r5 = userfaultfd(0x80801) ioctl$UFFDIO_API(r5, 0xc018aa3f, &(0x7f0000000040)={0xaa, 0x20}) r6 = userfaultfd(0x80801) ioctl$UFFDIO_API(r6, 0xc018aa3f, &(0x7f0000000140)) ioctl$UFFDIO_REGISTER(r6, 0xc020aa00, &(0x7f0000000100)={{&(0x7f0000ffc000/0x4000)=nil, 0x4000}, 0x2}) ioctl$UFFDIO_ZEROPAGE(r5, 0xc020aa04, &(0x7f0000000000)={{&(0x7f0000ffd000/0x1000)=nil, 0x1000}}) r7 = bpf$PROG_LOAD(0x5, &(0x7f0000000140)={0x2, 0x3, &(0x7f0000000200)=ANY=[@ANYBLOB="1800000000120000000000000000000095"], &(0x7f0000000000)='GPL\x00', 0x7, 0x0, 0x0, 0x0, 0x0, '\x00', 0x0, @fallback=0x30, 0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x10, 0x0, @void, @value}, 0x94) bpf$BPF_LINK_CREATE_XDP(0x1c, &(0x7f0000000040)={r7, 0x0, 0x30, 0x1e, @val=@uprobe_multi={&(0x7f0000000080)='./file0\x00', &(0x7f0000000100)=[0x2], 0x0, 0x0, 0x1}}, 0x40) The cause is that zero pfn is set to the PTE without increasing the RSS count in mfill_atomic_pte_zeropage() and the refcount of zero folio does not increase accordingly. Then, the operation on the same pfn is performed in uprobe_write_opcode()->__replace_page() to unconditional decrease the RSS count and old_folio's refcount. Therefore, two bugs are introduced: 1. The RSS count is incorrect, when process exit, the check_mm() report error "Bad rss-count". 2. The reserved folio (zero folio) is freed when folio->refcount is zero, then free_pages_prepare->free_page_is_bad() report error "Bad page state". There is more, the following warning could also theoretically be triggered: __replace_page() -> ... -> folio_remove_rmap_pte() -> VM_WARN_ON_FOLIO(is_zero_folio(folio), folio) Considering that uprobe hit on the zero folio is a very rare case, just reject zero old folio immediately after get_user_page_vma_remote(). [ mingo: Cleaned up the changelog ] Fixes: 7396fa818d62 ("uprobes/core: Make background page replacement logic account for rss_stat counters") Fixes: 2b1444983508 ("uprobes, mm, x86: Add the ability to install and remove uprobes breakpoints") Signed-off-by: Tong Tiangen Signed-off-by: Ingo Molnar Reviewed-by: David Hildenbrand Reviewed-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Masami Hiramatsu Link: https://lore.kernel.org/r/20250224031149.1598949-1-tongtiangen@huawei.com --- kernel/events/uprobes.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index bf2a87a0a378..af53fbd2d12c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -495,6 +495,11 @@ retry: if (ret <= 0) goto put_old; + if (is_zero_page(old_page)) { + ret = -EINVAL; + goto put_old; + } + if (WARN(!is_register && PageCompound(old_page), "uprobe unregister should never work on compound page\n")) { ret = -EINVAL; -- cgit v1.2.3 From 0d39844150546fa1415127c5fbae26db64070dd3 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 17 Jan 2025 07:19:12 -0800 Subject: perf/core: Fix low freq setting via IOC_PERIOD A low attr::freq value cannot be set via IOC_PERIOD on some platforms. The perf_event_check_period() introduced in: 81ec3f3c4c4d ("perf/x86: Add check_period PMU callback") was intended to check the period, rather than the frequency. A low frequency may be mistakenly rejected by limit_period(). Fix it. Fixes: 81ec3f3c4c4d ("perf/x86: Add check_period PMU callback") Signed-off-by: Kan Liang Signed-off-by: Ingo Molnar Reviewed-by: Ravi Bangoria Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250117151913.3043942-2-kan.liang@linux.intel.com Closes: https://lore.kernel.org/lkml/20250115154949.3147-1-ravi.bangoria@amd.com/ --- kernel/events/core.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 086d46d09696..6364319e2f88 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5969,14 +5969,15 @@ static int _perf_event_period(struct perf_event *event, u64 value) if (!value) return -EINVAL; - if (event->attr.freq && value > sysctl_perf_event_sample_rate) - return -EINVAL; - - if (perf_event_check_period(event, value)) - return -EINVAL; - - if (!event->attr.freq && (value & (1ULL << 63))) - return -EINVAL; + if (event->attr.freq) { + if (value > sysctl_perf_event_sample_rate) + return -EINVAL; + } else { + if (perf_event_check_period(event, value)) + return -EINVAL; + if (value & (1ULL << 63)) + return -EINVAL; + } event_function_call(event, __perf_event_period, &value); -- cgit v1.2.3 From f8c857238a392f21d5726d07966f6061007c8d4f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 25 Feb 2025 14:32:14 -0800 Subject: uprobes: Remove too strict lockdep_assert() condition in hprobe_expire() hprobe_expire() is used to atomically switch pending uretprobe instance (struct return_instance) from being SRCU protected to be refcounted. This can be done from background timer thread, or synchronously within current thread when task is forked. In the former case, return_instance has to be protected through RCU read lock, and that's what hprobe_expire() used to check with lockdep_assert(rcu_read_lock_held()). But in the latter case (hprobe_expire() called from dup_utask()) there is no RCU lock being held, and it's both unnecessary and incovenient. Inconvenient due to the intervening memory allocations inside dup_return_instance()'s loop. Unnecessary because dup_utask() is called synchronously in current thread, and no uretprobe can run at that point, so return_instance can't be freed either. So drop rcu_read_lock_held() condition, and expand corresponding comment to explain necessary lifetime guarantees. lockdep_assert()-detected issue is a false positive. Fixes: dd1a7567784e ("uprobes: SRCU-protect uretprobe lifetime (with timeout)") Reported-by: Breno Leitao Signed-off-by: Andrii Nakryiko Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250225223214.2970740-1-andrii@kernel.org --- kernel/events/uprobes.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index af53fbd2d12c..b4ca8898fe17 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -767,10 +767,14 @@ static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get) enum hprobe_state hstate; /* - * return_instance's hprobe is protected by RCU. - * Underlying uprobe is itself protected from reuse by SRCU. + * Caller should guarantee that return_instance is not going to be + * freed from under us. This can be achieved either through holding + * rcu_read_lock() or by owning return_instance in the first place. + * + * Underlying uprobe is itself protected from reuse by SRCU, so ensure + * SRCU lock is held properly. */ - lockdep_assert(rcu_read_lock_held() && srcu_read_lock_held(&uretprobes_srcu)); + lockdep_assert(srcu_read_lock_held(&uretprobes_srcu)); hstate = READ_ONCE(hprobe->state); switch (hstate) { -- cgit v1.2.3