From 4dd595c34c4bb22c16a76206a18c13e4e194335d Mon Sep 17 00:00:00 2001
From: Sohil Mehta <sohil.mehta@intel.com>
Date: Wed, 21 Jun 2023 22:36:00 +0000
Subject: syscalls: Remove file path comments from headers

Source file locations for syscall definitions can change over a period
of time. File paths in comments get stale and are hard to maintain long
term. Also, their usefulness is questionable since it would be easier to
locate a syscall definition using the SYSCALL_DEFINEx() macro.

Remove all source file path comments from the syscall headers. Also,
equalize the uneven line spacing (some of which is introduced due to the
deletions).

Signed-off-by: Sohil Mehta <sohil.mehta@intel.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 kernel/sys_ni.c | 110 +-------------------------------------------------------
 1 file changed, 1 insertion(+), 109 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 860b2dcf3ac4..f207236002e9 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -51,99 +51,35 @@ COND_SYSCALL_COMPAT(io_pgetevents);
 COND_SYSCALL(io_uring_setup);
 COND_SYSCALL(io_uring_enter);
 COND_SYSCALL(io_uring_register);
-
-/* fs/xattr.c */
-
-/* fs/dcache.c */
-
-/* fs/cookies.c */
 COND_SYSCALL(lookup_dcookie);
 COND_SYSCALL_COMPAT(lookup_dcookie);
-
-/* fs/eventfd.c */
 COND_SYSCALL(eventfd2);
-
-/* fs/eventfd.c */
 COND_SYSCALL(epoll_create1);
 COND_SYSCALL(epoll_ctl);
 COND_SYSCALL(epoll_pwait);
 COND_SYSCALL_COMPAT(epoll_pwait);
 COND_SYSCALL(epoll_pwait2);
 COND_SYSCALL_COMPAT(epoll_pwait2);
-
-/* fs/fcntl.c */
-
-/* fs/inotify_user.c */
 COND_SYSCALL(inotify_init1);
 COND_SYSCALL(inotify_add_watch);
 COND_SYSCALL(inotify_rm_watch);
-
-/* fs/ioctl.c */
-
-/* fs/ioprio.c */
 COND_SYSCALL(ioprio_set);
 COND_SYSCALL(ioprio_get);
-
-/* fs/locks.c */
 COND_SYSCALL(flock);
-
-/* fs/namei.c */
-
-/* fs/namespace.c */
-
-/* fs/nfsctl.c */
-
-/* fs/open.c */
-
-/* fs/pipe.c */
-
-/* fs/quota.c */
 COND_SYSCALL(quotactl);
 COND_SYSCALL(quotactl_fd);
-
-/* fs/readdir.c */
-
-/* fs/read_write.c */
-
-/* fs/sendfile.c */
-
-/* fs/select.c */
-
-/* fs/signalfd.c */
 COND_SYSCALL(signalfd4);
 COND_SYSCALL_COMPAT(signalfd4);
-
-/* fs/splice.c */
-
-/* fs/stat.c */
-
-/* fs/sync.c */
-
-/* fs/timerfd.c */
 COND_SYSCALL(timerfd_create);
 COND_SYSCALL(timerfd_settime);
 COND_SYSCALL(timerfd_settime32);
 COND_SYSCALL(timerfd_gettime);
 COND_SYSCALL(timerfd_gettime32);
-
-/* fs/utimes.c */
-
-/* kernel/acct.c */
 COND_SYSCALL(acct);
-
-/* kernel/capability.c */
 COND_SYSCALL(capget);
 COND_SYSCALL(capset);
-
-/* kernel/exec_domain.c */
-
-/* kernel/exit.c */
-
-/* kernel/fork.c */
 /* __ARCH_WANT_SYS_CLONE3 */
 COND_SYSCALL(clone3);
-
-/* kernel/futex/syscalls.c */
 COND_SYSCALL(futex);
 COND_SYSCALL(futex_time32);
 COND_SYSCALL(set_robust_list);
@@ -151,29 +87,11 @@ COND_SYSCALL_COMPAT(set_robust_list);
 COND_SYSCALL(get_robust_list);
 COND_SYSCALL_COMPAT(get_robust_list);
 COND_SYSCALL(futex_waitv);
-
-/* kernel/hrtimer.c */
-
-/* kernel/itimer.c */
-
-/* kernel/kexec.c */
 COND_SYSCALL(kexec_load);
 COND_SYSCALL_COMPAT(kexec_load);
-
-/* kernel/module.c */
 COND_SYSCALL(init_module);
 COND_SYSCALL(delete_module);
-
-/* kernel/posix-timers.c */
-
-/* kernel/printk.c */
 COND_SYSCALL(syslog);
-
-/* kernel/ptrace.c */
-
-/* kernel/sched/core.c */
-
-/* kernel/sys.c */
 COND_SYSCALL(setregid);
 COND_SYSCALL(setgid);
 COND_SYSCALL(setreuid);
@@ -186,12 +104,6 @@ COND_SYSCALL(setfsuid);
 COND_SYSCALL(setfsgid);
 COND_SYSCALL(setgroups);
 COND_SYSCALL(getgroups);
-
-/* kernel/time.c */
-
-/* kernel/timer.c */
-
-/* ipc/mqueue.c */
 COND_SYSCALL(mq_open);
 COND_SYSCALL_COMPAT(mq_open);
 COND_SYSCALL(mq_unlink);
@@ -203,8 +115,6 @@ COND_SYSCALL(mq_notify);
 COND_SYSCALL_COMPAT(mq_notify);
 COND_SYSCALL(mq_getsetattr);
 COND_SYSCALL_COMPAT(mq_getsetattr);
-
-/* ipc/msg.c */
 COND_SYSCALL(msgget);
 COND_SYSCALL(old_msgctl);
 COND_SYSCALL(msgctl);
@@ -214,8 +124,6 @@ COND_SYSCALL(msgrcv);
 COND_SYSCALL_COMPAT(msgrcv);
 COND_SYSCALL(msgsnd);
 COND_SYSCALL_COMPAT(msgsnd);
-
-/* ipc/sem.c */
 COND_SYSCALL(semget);
 COND_SYSCALL(old_semctl);
 COND_SYSCALL(semctl);
@@ -224,8 +132,6 @@ COND_SYSCALL_COMPAT(old_semctl);
 COND_SYSCALL(semtimedop);
 COND_SYSCALL(semtimedop_time32);
 COND_SYSCALL(semop);
-
-/* ipc/shm.c */
 COND_SYSCALL(shmget);
 COND_SYSCALL(old_shmctl);
 COND_SYSCALL(shmctl);
@@ -234,8 +140,6 @@ COND_SYSCALL_COMPAT(old_shmctl);
 COND_SYSCALL(shmat);
 COND_SYSCALL_COMPAT(shmat);
 COND_SYSCALL(shmdt);
-
-/* net/socket.c */
 COND_SYSCALL(socket);
 COND_SYSCALL(socketpair);
 COND_SYSCALL(bind);
@@ -256,30 +160,18 @@ COND_SYSCALL(sendmsg);
 COND_SYSCALL_COMPAT(sendmsg);
 COND_SYSCALL(recvmsg);
 COND_SYSCALL_COMPAT(recvmsg);
-
-/* mm/filemap.c */
-
-/* mm/nommu.c, also with MMU */
 COND_SYSCALL(mremap);
-
-/* security/keys/keyctl.c */
 COND_SYSCALL(add_key);
 COND_SYSCALL(request_key);
 COND_SYSCALL(keyctl);
 COND_SYSCALL_COMPAT(keyctl);
-
-/* security/landlock/syscalls.c */
 COND_SYSCALL(landlock_create_ruleset);
 COND_SYSCALL(landlock_add_rule);
 COND_SYSCALL(landlock_restrict_self);
-
-/* arch/example/kernel/sys_example.c */
-
-/* mm/fadvise.c */
 COND_SYSCALL(fadvise64_64);
 COND_SYSCALL_COMPAT(fadvise64_64);
 
-/* mm/, CONFIG_MMU only */
+/* CONFIG_MMU only */
 COND_SYSCALL(swapon);
 COND_SYSCALL(swapoff);
 COND_SYSCALL(mprotect);
-- 
cgit v1.2.3


From 5f81018753dfd4989e33ece1f0cb6b8aae498b82 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Thu, 15 Jun 2023 13:52:36 +0200
Subject: fprobe: Release rethook after the ftrace_ops is unregistered

While running bpf selftests it's possible to get following fault:

  general protection fault, probably for non-canonical address \
  0x6b6b6b6b6b6b6b6b: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC NOPTI
  ...
  Call Trace:
   <TASK>
   fprobe_handler+0xc1/0x270
   ? __pfx_bpf_testmod_init+0x10/0x10
   ? __pfx_bpf_testmod_init+0x10/0x10
   ? bpf_fentry_test1+0x5/0x10
   ? bpf_fentry_test1+0x5/0x10
   ? bpf_testmod_init+0x22/0x80
   ? do_one_initcall+0x63/0x2e0
   ? rcu_is_watching+0xd/0x40
   ? kmalloc_trace+0xaf/0xc0
   ? do_init_module+0x60/0x250
   ? __do_sys_finit_module+0xac/0x120
   ? do_syscall_64+0x37/0x90
   ? entry_SYSCALL_64_after_hwframe+0x72/0xdc
   </TASK>

In unregister_fprobe function we can't release fp->rethook while it's
possible there are some of its users still running on another cpu.

Moving rethook_free call after fp->ops is unregistered with
unregister_ftrace_function call.

Link: https://lore.kernel.org/all/20230615115236.3476617-1-jolsa@kernel.org/

Fixes: 5b0ab78998e3 ("fprobe: Add exit_handler support")
Cc: stable@vger.kernel.org
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 kernel/trace/fprobe.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 18d36842faf5..0121e8c0d54e 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -364,19 +364,13 @@ int unregister_fprobe(struct fprobe *fp)
 		    fp->ops.saved_func != fprobe_kprobe_handler))
 		return -EINVAL;
 
-	/*
-	 * rethook_free() starts disabling the rethook, but the rethook handlers
-	 * may be running on other processors at this point. To make sure that all
-	 * current running handlers are finished, call unregister_ftrace_function()
-	 * after this.
-	 */
-	if (fp->rethook)
-		rethook_free(fp->rethook);
-
 	ret = unregister_ftrace_function(&fp->ops);
 	if (ret < 0)
 		return ret;
 
+	if (fp->rethook)
+		rethook_free(fp->rethook);
+
 	ftrace_free_filter(&fp->ops);
 
 	return ret;
-- 
cgit v1.2.3


From aabd12609f91155f26584508b01f548215cc3c0c Mon Sep 17 00:00:00 2001
From: Petr Tesarik <petr.tesarik.ext@huawei.com>
Date: Mon, 26 Jun 2023 15:01:03 +0200
Subject: swiotlb: always set the number of areas before allocating the pool

The number of areas defaults to the number of possible CPUs. However, the
total number of slots may have to be increased after adjusting the number
of areas. Consequently, the number of areas must be determined before
allocating the memory pool. This is even explained with a comment in
swiotlb_init_remap(), but swiotlb_init_late() adjusts the number of areas
after slots are already allocated. The areas may end up being smaller than
IO_TLB_SEGSIZE, which breaks per-area locking.

While fixing swiotlb_init_late(), move all relevant comments before the
definition of swiotlb_adjust_nareas() and convert them to kernel-doc.

Fixes: 20347fca71a3 ("swiotlb: split up the global swiotlb lock")
Signed-off-by: Petr Tesarik <petr.tesarik.ext@huawei.com>
Reviewed-by: Roberto Sassu <roberto.sassu@huawei.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/swiotlb.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 775f7bb10ab1..89db590f931f 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -115,9 +115,16 @@ static bool round_up_default_nslabs(void)
 	return true;
 }
 
+/**
+ * swiotlb_adjust_nareas() - adjust the number of areas and slots
+ * @nareas:	Desired number of areas. Zero is treated as 1.
+ *
+ * Adjust the default number of areas in a memory pool.
+ * The default size of the memory pool may also change to meet minimum area
+ * size requirements.
+ */
 static void swiotlb_adjust_nareas(unsigned int nareas)
 {
-	/* use a single area when non is specified */
 	if (!nareas)
 		nareas = 1;
 	else if (!is_power_of_2(nareas))
@@ -298,10 +305,6 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
 	if (swiotlb_force_disable)
 		return;
 
-	/*
-	 * default_nslabs maybe changed when adjust area number.
-	 * So allocate bounce buffer after adjusting area number.
-	 */
 	if (!default_nareas)
 		swiotlb_adjust_nareas(num_possible_cpus());
 
@@ -363,6 +366,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
 	if (swiotlb_force_disable)
 		return 0;
 
+	if (!default_nareas)
+		swiotlb_adjust_nareas(num_possible_cpus());
+
 retry:
 	order = get_order(nslabs << IO_TLB_SHIFT);
 	nslabs = SLABS_PER_PAGE << order;
@@ -397,9 +403,6 @@ retry:
 			(PAGE_SIZE << order) >> 20);
 	}
 
-	if (!default_nareas)
-		swiotlb_adjust_nareas(num_possible_cpus());
-
 	area_order = get_order(array_size(sizeof(*mem->areas),
 		default_nareas));
 	mem->areas = (struct io_tlb_area *)
-- 
cgit v1.2.3


From 8ac04063354a01a484d2e55d20ed1958aa0d3392 Mon Sep 17 00:00:00 2001
From: Petr Tesarik <petr.tesarik.ext@huawei.com>
Date: Mon, 26 Jun 2023 15:01:04 +0200
Subject: swiotlb: reduce the number of areas to match actual memory pool size

Although the desired size of the SWIOTLB memory pool is increased in
swiotlb_adjust_nareas() to match the number of areas, the actual allocation
may be smaller, which may require reducing the number of areas.

For example, Xen uses swiotlb_init_late(), which in turn uses the page
allocator. On x86, page size is 4 KiB and MAX_ORDER is 10 (1024 pages),
resulting in a maximum memory pool size of 4 MiB. This corresponds to 2048
slots of 2 KiB each. The minimum area size is 128 (IO_TLB_SEGSIZE),
allowing at most 2048 / 128 = 16 areas.

If num_possible_cpus() is greater than the maximum number of areas, areas
are smaller than IO_TLB_SEGSIZE and contiguous groups of free slots will
span multiple areas. When allocating and freeing slots, only one area will
be properly locked, causing race conditions on the unlocked slots and
ultimately data corruption, kernel hangs and crashes.

Fixes: 20347fca71a3 ("swiotlb: split up the global swiotlb lock")
Signed-off-by: Petr Tesarik <petr.tesarik.ext@huawei.com>
Reviewed-by: Roberto Sassu <roberto.sassu@huawei.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/swiotlb.c | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 89db590f931f..2b83e3ad9dca 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -138,6 +138,23 @@ static void swiotlb_adjust_nareas(unsigned int nareas)
 			(default_nslabs << IO_TLB_SHIFT) >> 20);
 }
 
+/**
+ * limit_nareas() - get the maximum number of areas for a given memory pool size
+ * @nareas:	Desired number of areas.
+ * @nslots:	Total number of slots in the memory pool.
+ *
+ * Limit the number of areas to the maximum possible number of areas in
+ * a memory pool of the given size.
+ *
+ * Return: Maximum possible number of areas.
+ */
+static unsigned int limit_nareas(unsigned int nareas, unsigned long nslots)
+{
+	if (nslots < nareas * IO_TLB_SEGSIZE)
+		return nslots / IO_TLB_SEGSIZE;
+	return nareas;
+}
+
 static int __init
 setup_io_tlb_npages(char *str)
 {
@@ -297,6 +314,7 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
 {
 	struct io_tlb_mem *mem = &io_tlb_default_mem;
 	unsigned long nslabs;
+	unsigned int nareas;
 	size_t alloc_size;
 	void *tlb;
 
@@ -309,10 +327,12 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
 		swiotlb_adjust_nareas(num_possible_cpus());
 
 	nslabs = default_nslabs;
+	nareas = limit_nareas(default_nareas, nslabs);
 	while ((tlb = swiotlb_memblock_alloc(nslabs, flags, remap)) == NULL) {
 		if (nslabs <= IO_TLB_MIN_SLABS)
 			return;
 		nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
+		nareas = limit_nareas(nareas, nslabs);
 	}
 
 	if (default_nslabs != nslabs) {
@@ -358,6 +378,7 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
 {
 	struct io_tlb_mem *mem = &io_tlb_default_mem;
 	unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
+	unsigned int nareas;
 	unsigned char *vstart = NULL;
 	unsigned int order, area_order;
 	bool retried = false;
@@ -403,8 +424,8 @@ retry:
 			(PAGE_SIZE << order) >> 20);
 	}
 
-	area_order = get_order(array_size(sizeof(*mem->areas),
-		default_nareas));
+	nareas = limit_nareas(default_nareas, nslabs);
+	area_order = get_order(array_size(sizeof(*mem->areas), nareas));
 	mem->areas = (struct io_tlb_area *)
 		__get_free_pages(GFP_KERNEL | __GFP_ZERO, area_order);
 	if (!mem->areas)
@@ -418,7 +439,7 @@ retry:
 	set_memory_decrypted((unsigned long)vstart,
 			     (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT);
 	swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, 0, true,
-				default_nareas);
+				nareas);
 
 	swiotlb_print_info();
 	return 0;
-- 
cgit v1.2.3


From 67a4e1a3bf7c68ed3fbefc4213648165d912cabb Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 28 Jun 2023 18:02:51 +0300
Subject: irqdomain: Use return value of strreplace()

Since strreplace() returns the pointer to the string itself, use it
directly.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20230628150251.17832-1-andriy.shevchenko@linux.intel.com
---
 kernel/irq/irqdomain.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 5bd01624e447..0bdef4fe925b 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -182,9 +182,7 @@ static struct irq_domain *__irq_domain_create(struct fwnode_handle *fwnode,
 			return NULL;
 		}
 
-		strreplace(name, '/', ':');
-
-		domain->name = name;
+		domain->name = strreplace(name, '/', ':');
 		domain->fwnode = fwnode;
 		domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
 	}
-- 
cgit v1.2.3


From 02b0095e2fbbc060560c1065f86a211d91e27b26 Mon Sep 17 00:00:00 2001
From: Mateusz Stachyra <m.stachyra@samsung.com>
Date: Tue, 4 Jul 2023 12:27:06 +0200
Subject: tracing: Fix null pointer dereference in tracing_err_log_open()

Fix an issue in function 'tracing_err_log_open'.
The function doesn't call 'seq_open' if the file is opened only with
write permissions, which results in 'file->private_data' being left as null.
If we then use 'lseek' on that opened file, 'seq_lseek' dereferences
'file->private_data' in 'mutex_lock(&m->lock)', resulting in a kernel panic.
Writing to this node requires root privileges, therefore this bug
has very little security impact.

Tracefs node: /sys/kernel/tracing/error_log

Example Kernel panic:

Unable to handle kernel NULL pointer dereference at virtual address 0000000000000038
Call trace:
 mutex_lock+0x30/0x110
 seq_lseek+0x34/0xb8
 __arm64_sys_lseek+0x6c/0xb8
 invoke_syscall+0x58/0x13c
 el0_svc_common+0xc4/0x10c
 do_el0_svc+0x24/0x98
 el0_svc+0x24/0x88
 el0t_64_sync_handler+0x84/0xe4
 el0t_64_sync+0x1b4/0x1b8
Code: d503201f aa0803e0 aa1f03e1 aa0103e9 (c8e97d02)
---[ end trace 561d1b49c12cf8a5 ]---
Kernel panic - not syncing: Oops: Fatal exception

Link: https://lore.kernel.org/linux-trace-kernel/20230703155237eucms1p4dfb6a19caa14c79eb6c823d127b39024@eucms1p4
Link: https://lore.kernel.org/linux-trace-kernel/20230704102706eucms1p30d7ecdcc287f46ad67679fc8491b2e0f@eucms1p3

Cc: stable@vger.kernel.org
Fixes: 8a062902be725 ("tracing: Add tracing error log")
Signed-off-by: Mateusz Stachyra <m.stachyra@samsung.com>
Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 64a4dde073ef..3d34e6fea6b2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8135,7 +8135,7 @@ static const struct file_operations tracing_err_log_fops = {
 	.open           = tracing_err_log_open,
 	.write		= tracing_err_log_write,
 	.read           = seq_read,
-	.llseek         = seq_lseek,
+	.llseek         = tracing_lseek,
 	.release        = tracing_err_log_release,
 };
 
-- 
cgit v1.2.3


From fddca7db4a4c17f7333793dfb5308d80c76d2896 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 4 Jul 2023 10:08:07 -0400
Subject: tracing/boot: Test strscpy() against less than zero for error

Instead of checking for -E2BIG, it is better to just check for less than
zero of strscpy() for error. Testing for -E2BIG is not very robust, and
the calling code does not really care about the error code, just that
there was an error.

One of the updates to convert strlcpy() to strscpy() had a v2 version
that changed the test from testing against -E2BIG to less than zero, but I
took the v1 version that still tested for -E2BIG.

Link: https://lore.kernel.org/linux-trace-kernel/20230615180420.400769-1-azeemshaikh38@gmail.com/
Link: https://lore.kernel.org/linux-trace-kernel/20230704100807.707d1605@rorschach.local.home

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Azeem Shaikh <azeemshaikh38@gmail.com>
Cc: Kees Cook <keescook@chromium.org>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_boot.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 5fe525f1b8cc..7ccc7a8e155b 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -31,7 +31,7 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node)
 
 	/* Common ftrace options */
 	xbc_node_for_each_array_value(node, "options", anode, p) {
-		if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) {
+		if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0) {
 			pr_err("String is too long: %s\n", p);
 			continue;
 		}
@@ -87,7 +87,7 @@ trace_boot_enable_events(struct trace_array *tr, struct xbc_node *node)
 	const char *p;
 
 	xbc_node_for_each_array_value(node, "events", anode, p) {
-		if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG) {
+		if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0) {
 			pr_err("String is too long: %s\n", p);
 			continue;
 		}
@@ -486,7 +486,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode,
 
 	p = xbc_node_find_value(enode, "filter", NULL);
 	if (p && *p != '\0') {
-		if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG)
+		if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0)
 			pr_err("filter string is too long: %s\n", p);
 		else if (apply_event_filter(file, buf) < 0)
 			pr_err("Failed to apply filter: %s\n", buf);
@@ -494,7 +494,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode,
 
 	if (IS_ENABLED(CONFIG_HIST_TRIGGERS)) {
 		xbc_node_for_each_array_value(enode, "actions", anode, p) {
-			if (strscpy(buf, p, ARRAY_SIZE(buf)) == -E2BIG)
+			if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0)
 				pr_err("action string is too long: %s\n", p);
 			else if (trigger_process_regex(file, buf) < 0)
 				pr_err("Failed to apply an action: %s\n", p);
-- 
cgit v1.2.3


From 5415ccd50a8620c8cbaa32d6f18c946c453566f5 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 5 Jul 2023 20:17:29 +0530
Subject: bpf: Fix max stack depth check for async callbacks

The check_max_stack_depth pass happens after the verifier's symbolic
execution, and attempts to walk the call graph of the BPF program,
ensuring that the stack usage stays within bounds for all possible call
chains. There are two cases to consider: bpf_pseudo_func and
bpf_pseudo_call. In the former case, the callback pointer is loaded into
a register, and is assumed that it is passed to some helper later which
calls it (however there is no way to be sure), but the check remains
conservative and accounts the stack usage anyway. For this particular
case, asynchronous callbacks are skipped as they execute asynchronously
when their corresponding event fires.

The case of bpf_pseudo_call is simpler and we know that the call is
definitely made, hence the stack depth of the subprog is accounted for.

However, the current check still skips an asynchronous callback even if
a bpf_pseudo_call was made for it. This is erroneous, as it will miss
accounting for the stack usage of the asynchronous callback, which can
be used to breach the maximum stack depth limit.

Fix this by only skipping asynchronous callbacks when the instruction is
not a pseudo call to the subprog.

Fixes: 7ddc80a476c2 ("bpf: Teach stack depth check about async callbacks.")
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230705144730.235802-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 11e54dd8b6dd..930b5555cfd3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5642,8 +5642,9 @@ continue_func:
 				verbose(env, "verifier bug. subprog has tail_call and async cb\n");
 				return -EFAULT;
 			}
-			 /* async callbacks don't increase bpf prog stack size */
-			continue;
+			/* async callbacks don't increase bpf prog stack size unless called directly */
+			if (!bpf_pseudo_call(insn + i))
+				continue;
 		}
 		i = next_insn;
 
-- 
cgit v1.2.3


From fb49c455323ff8319a123dd312be9082c49a23a5 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Sat, 8 Jul 2023 12:12:12 -0700
Subject: fork: lock VMAs of the parent process when forking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When forking a child process, the parent write-protects anonymous pages
and COW-shares them with the child being forked using copy_present_pte().

We must not take any concurrent page faults on the source vma's as they
are being processed, as we expect both the vma and the pte's behind it
to be stable.  For example, the anon_vma_fork() expects the parents
vma->anon_vma to not change during the vma copy.

A concurrent page fault on a page newly marked read-only by the page
copy might trigger wp_page_copy() and a anon_vma_prepare(vma) on the
source vma, defeating the anon_vma_clone() that wasn't done because the
parent vma originally didn't have an anon_vma, but we now might end up
copying a pte entry for a page that has one.

Before the per-vma lock based changes, the mmap_lock guaranteed
exclusion with concurrent page faults.  But now we need to do a
vma_start_write() to make sure no concurrent faults happen on this vma
while it is being processed.

This fix can potentially regress some fork-heavy workloads.  Kernel
build time did not show noticeable regression on a 56-core machine while
a stress test mapping 10000 VMAs and forking 5000 times in a tight loop
shows ~5% regression.  If such fork time regression is unacceptable,
disabling CONFIG_PER_VMA_LOCK should restore its performance.  Further
optimizations are possible if this regression proves to be problematic.

Suggested-by: David Hildenbrand <david@redhat.com>
Reported-by: Jiri Slaby <jirislaby@kernel.org>
Closes: https://lore.kernel.org/all/dbdef34c-3a07-5951-e1ae-e9c6e3cdf51b@kernel.org/
Reported-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Closes: https://lore.kernel.org/all/b198d649-f4bf-b971-31d0-e8433ec2a34c@applied-asynchrony.com/
Reported-by: Jacob Young <jacobly.alt@gmail.com>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217624
Fixes: 0bff0aaea03e ("x86/mm: try VMA lock-based page fault handling first")
Cc: stable@vger.kernel.org
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index b85814e614a5..d2e12b6d2b18 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -686,6 +686,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 	for_each_vma(old_vmi, mpnt) {
 		struct file *file;
 
+		vma_start_write(mpnt);
 		if (mpnt->vm_flags & VM_DONTCOPY) {
 			vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
 			continue;
-- 
cgit v1.2.3


From cf0a624dc706c306294c14e6b3e7694702f25191 Mon Sep 17 00:00:00 2001
From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
Date: Mon, 3 Jul 2023 07:28:53 +0300
Subject: kernel/trace: Fix cleanup logic of enable_trace_eprobe

The enable_trace_eprobe() function enables all event probes, attached
to given trace probe. If an error occurs in enabling one of the event
probes, all others should be roll backed. There is a bug in that roll
back logic - instead of all event probes, only the failed one is
disabled.

Link: https://lore.kernel.org/all/20230703042853.1427493-1-tz.stoyanov@gmail.com/

Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
Fixes: 7491e2c44278 ("tracing: Add a probe that attaches to trace events")
Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 kernel/trace/trace_eprobe.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 67e854979d53..3f04f0ffe0d7 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -675,6 +675,7 @@ static int enable_trace_eprobe(struct trace_event_call *call,
 	struct trace_eprobe *ep;
 	bool enabled;
 	int ret = 0;
+	int cnt = 0;
 
 	tp = trace_probe_primary_from_call(call);
 	if (WARN_ON_ONCE(!tp))
@@ -698,12 +699,25 @@ static int enable_trace_eprobe(struct trace_event_call *call,
 		if (ret)
 			break;
 		enabled = true;
+		cnt++;
 	}
 
 	if (ret) {
 		/* Failed to enable one of them. Roll back all */
-		if (enabled)
-			disable_eprobe(ep, file->tr);
+		if (enabled) {
+			/*
+			 * It's a bug if one failed for something other than memory
+			 * not being available but another eprobe succeeded.
+			 */
+			WARN_ON_ONCE(ret != -ENOMEM);
+
+			list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
+				ep = container_of(pos, struct trace_eprobe, tp);
+				disable_eprobe(ep, file->tr);
+				if (!--cnt)
+					break;
+			}
+		}
 		if (file)
 			trace_probe_remove_file(tp, file);
 		else
-- 
cgit v1.2.3


From 5f0c584daf7464f04114c65dd07269ee2bfedc13 Mon Sep 17 00:00:00 2001
From: Ze Gao <zegao2021@gmail.com>
Date: Mon, 3 Jul 2023 17:23:36 +0800
Subject: fprobe: add unlock to match a succeeded ftrace_test_recursion_trylock

Unlock ftrace recursion lock when fprobe_kprobe_handler() is failed
because of some running kprobe.

Link: https://lore.kernel.org/all/20230703092336.268371-1-zegao@tencent.com/

Fixes: 3cc4e2c5fbae ("fprobe: make fprobe_kprobe_handler recursion free")
Reported-by: Yafang <laoar.shao@gmail.com>
Closes: https://lore.kernel.org/linux-trace-kernel/CALOAHbC6UpfFOOibdDiC7xFc5YFUgZnk3MZ=3Ny6we=AcrNbew@mail.gmail.com/
Signed-off-by: Ze Gao <zegao@tencent.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Acked-by: Yafang Shao <laoar.shao@gmail.com>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 kernel/trace/fprobe.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 0121e8c0d54e..58e4b3607aef 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -102,12 +102,14 @@ static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
 
 	if (unlikely(kprobe_running())) {
 		fp->nmissed++;
-		return;
+		goto recursion_unlock;
 	}
 
 	kprobe_busy_begin();
 	__fprobe_handler(ip, parent_ip, ops, fregs);
 	kprobe_busy_end();
+
+recursion_unlock:
 	ftrace_test_recursion_unlock(bit);
 }
 
-- 
cgit v1.2.3


From e1164787f22b51010cfdc6a5e41b744435836b79 Mon Sep 17 00:00:00 2001
From: Li zeming <zeming@nfschina.com>
Date: Wed, 5 Jul 2023 03:43:59 +0800
Subject: kprobes: Remove unnecessary ‘NULL’ values from correct_ret_addr
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 'correct_ret_addr' pointer is always set in the later code, no need
to initialize it at definition time.

Link: https://lore.kernel.org/all/20230704194359.3124-1-zeming@nfschina.com/

Signed-off-by: Li zeming <zeming@nfschina.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 kernel/kprobes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 00e177de91cc..fd713ab439c2 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2007,9 +2007,9 @@ void __weak arch_kretprobe_fixup_return(struct pt_regs *regs,
 unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs,
 					     void *frame_pointer)
 {
-	kprobe_opcode_t *correct_ret_addr = NULL;
 	struct kretprobe_instance *ri = NULL;
 	struct llist_node *first, *node = NULL;
+	kprobe_opcode_t *correct_ret_addr;
 	struct kretprobe *rp;
 
 	/* Find correct address and all nodes for this frame. */
-- 
cgit v1.2.3


From ed9492dfef8738ca68879f5690dda5a04f1897dc Mon Sep 17 00:00:00 2001
From: Li zeming <zeming@nfschina.com>
Date: Wed, 12 Jul 2023 02:53:53 +0800
Subject: kernel: kprobes: Remove unnecessary ‘0’ values
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

it is assigned first, so it does not need to initialize the assignment.

Link: https://lore.kernel.org/all/20230711185353.3218-1-zeming@nfschina.com/

Signed-off-by: Li zeming <zeming@nfschina.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 kernel/kprobes.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index fd713ab439c2..a009c8ccd8ea 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1072,7 +1072,7 @@ static int kprobe_ftrace_enabled;
 static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
 			       int *cnt)
 {
-	int ret = 0;
+	int ret;
 
 	lockdep_assert_held(&kprobe_mutex);
 
@@ -1110,7 +1110,7 @@ static int arm_kprobe_ftrace(struct kprobe *p)
 static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
 				  int *cnt)
 {
-	int ret = 0;
+	int ret;
 
 	lockdep_assert_held(&kprobe_mutex);
 
@@ -2692,7 +2692,7 @@ void kprobe_free_init_mem(void)
 
 static int __init init_kprobes(void)
 {
-	int i, err = 0;
+	int i, err;
 
 	/* FIXME allocate the probe table, currently defined statically */
 	/* initialize all list heads */
-- 
cgit v1.2.3


From 195b9cb5b288fec1c871ef89f78cc9a7461aad3a Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Fri, 7 Jul 2023 23:03:19 +0900
Subject: fprobe: Ensure running fprobe_exit_handler() finished before calling
 rethook_free()

Ensure running fprobe_exit_handler() has finished before
calling rethook_free() in the unregister_fprobe() so that caller can free
the fprobe right after unregister_fprobe().

unregister_fprobe() ensured that all running fprobe_entry/exit_handler()
have finished by calling unregister_ftrace_function() which synchronizes
RCU. But commit 5f81018753df ("fprobe: Release rethook after the ftrace_ops
is unregistered") changed to call rethook_free() after
unregister_ftrace_function(). So call rethook_stop() to make rethook
disabled before unregister_ftrace_function() and ensure it again.

Here is the possible code flow that can call the exit handler after
unregister_fprobe().

------
 CPU1                              CPU2
 call unregister_fprobe(fp)
 ...
                                   __fprobe_handler()
                                   rethook_hook() on probed function
 unregister_ftrace_function()
                                   return from probed function
                                   rethook hooks
                                   find rh->handler == fprobe_exit_handler
                                   call fprobe_exit_handler()
 rethook_free():
   set rh->handler = NULL;
 return from unreigster_fprobe;
                                   call fp->exit_handler() <- (*)
------

(*) At this point, the exit handler is called after returning from
unregister_fprobe().

This fixes it as following;
------
 CPU1                              CPU2
 call unregister_fprobe()
 ...
 rethook_stop():
   set rh->handler = NULL;
                                   __fprobe_handler()
                                   rethook_hook() on probed function
 unregister_ftrace_function()
                                   return from probed function
                                   rethook hooks
                                   find rh->handler == NULL
                                   return from rethook
 rethook_free()
 return from unreigster_fprobe;
------

Link: https://lore.kernel.org/all/168873859949.156157.13039240432299335849.stgit@devnote2/

Fixes: 5f81018753df ("fprobe: Release rethook after the ftrace_ops is unregistered")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/rethook.h |  1 +
 kernel/trace/fprobe.c   |  3 +++
 kernel/trace/rethook.c  | 13 +++++++++++++
 3 files changed, 17 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/rethook.h b/include/linux/rethook.h
index c8ac1e5afcd1..bdbe6717f45a 100644
--- a/include/linux/rethook.h
+++ b/include/linux/rethook.h
@@ -59,6 +59,7 @@ struct rethook_node {
 };
 
 struct rethook *rethook_alloc(void *data, rethook_handler_t handler);
+void rethook_stop(struct rethook *rh);
 void rethook_free(struct rethook *rh);
 void rethook_add_node(struct rethook *rh, struct rethook_node *node);
 struct rethook_node *rethook_try_get(struct rethook *rh);
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 58e4b3607aef..2571f7f3d5f2 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -366,6 +366,9 @@ int unregister_fprobe(struct fprobe *fp)
 		    fp->ops.saved_func != fprobe_kprobe_handler))
 		return -EINVAL;
 
+	if (fp->rethook)
+		rethook_stop(fp->rethook);
+
 	ret = unregister_ftrace_function(&fp->ops);
 	if (ret < 0)
 		return ret;
diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
index 60f6cb2b486b..468006cce7ca 100644
--- a/kernel/trace/rethook.c
+++ b/kernel/trace/rethook.c
@@ -53,6 +53,19 @@ static void rethook_free_rcu(struct rcu_head *head)
 		kfree(rh);
 }
 
+/**
+ * rethook_stop() - Stop using a rethook.
+ * @rh: the struct rethook to stop.
+ *
+ * Stop using a rethook to prepare for freeing it. If you want to wait for
+ * all running rethook handler before calling rethook_free(), you need to
+ * call this first and wait RCU, and call rethook_free().
+ */
+void rethook_stop(struct rethook *rh)
+{
+	WRITE_ONCE(rh->handler, NULL);
+}
+
 /**
  * rethook_free() - Free struct rethook.
  * @rh: the struct rethook to be freed.
-- 
cgit v1.2.3


From d0a3022f30629a208e5944022caeca3568add9e7 Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Thu, 29 Jun 2023 23:50:48 +0000
Subject: tracing/user_events: Fix struct arg size match check

When users register an event the name of the event and it's argument are
checked to ensure they match if the event already exists. Normally all
arguments are in the form of "type name", except for when the type
starts with "struct ". In those cases, the size of the struct is passed
in addition to the name, IE: "struct my_struct a 20" for an argument
that is of type "struct my_struct" with a field name of "a" and has the
size of 20 bytes.

The current code does not honor the above case properly when comparing
a match. This causes the event register to fail even when the same
string was used for events that contain a struct argument within them.
The example above "struct my_struct a 20" generates a match string of
"struct my_struct a" omitting the size field.

Add the struct size of the existing field when generating a comparison
string for a struct field to ensure proper match checking.

Link: https://lkml.kernel.org/r/20230629235049.581-2-beaub@linux.microsoft.com

Cc: stable@vger.kernel.org
Fixes: e6f89a149872 ("tracing/user_events: Ensure user provided strings are safely formatted")
Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_user.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 4f5e74bbdab2..33cb6af31f39 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -1317,6 +1317,9 @@ static int user_field_set_string(struct ftrace_event_field *field,
 	pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
 	pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->name);
 
+	if (str_has_prefix(field->type, "struct "))
+		pos += snprintf(buf + pos, LEN_OR_ZERO, " %d", field->size);
+
 	if (colon)
 		pos += snprintf(buf + pos, LEN_OR_ZERO, ";");
 
-- 
cgit v1.2.3


From 4369016497319a9635702da010d02af1ebb1849d Mon Sep 17 00:00:00 2001
From: Pu Lehui <pulehui@huawei.com>
Date: Tue, 11 Jul 2023 19:58:48 +0800
Subject: bpf: cpumap: Fix memory leak in cpu_map_update_elem

Syzkaller reported a memory leak as follows:

BUG: memory leak
unreferenced object 0xff110001198ef748 (size 192):
  comm "syz-executor.3", pid 17672, jiffies 4298118891 (age 9.906s)
  hex dump (first 32 bytes):
    00 00 00 00 4a 19 00 00 80 ad e3 e4 fe ff c0 00  ....J...........
    00 b2 d3 0c 01 00 11 ff 28 f5 8e 19 01 00 11 ff  ........(.......
  backtrace:
    [<ffffffffadd28087>] __cpu_map_entry_alloc+0xf7/0xb00
    [<ffffffffadd28d8e>] cpu_map_update_elem+0x2fe/0x3d0
    [<ffffffffadc6d0fd>] bpf_map_update_value.isra.0+0x2bd/0x520
    [<ffffffffadc7349b>] map_update_elem+0x4cb/0x720
    [<ffffffffadc7d983>] __se_sys_bpf+0x8c3/0xb90
    [<ffffffffb029cc80>] do_syscall_64+0x30/0x40
    [<ffffffffb0400099>] entry_SYSCALL_64_after_hwframe+0x61/0xc6

BUG: memory leak
unreferenced object 0xff110001198ef528 (size 192):
  comm "syz-executor.3", pid 17672, jiffies 4298118891 (age 9.906s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  backtrace:
    [<ffffffffadd281f0>] __cpu_map_entry_alloc+0x260/0xb00
    [<ffffffffadd28d8e>] cpu_map_update_elem+0x2fe/0x3d0
    [<ffffffffadc6d0fd>] bpf_map_update_value.isra.0+0x2bd/0x520
    [<ffffffffadc7349b>] map_update_elem+0x4cb/0x720
    [<ffffffffadc7d983>] __se_sys_bpf+0x8c3/0xb90
    [<ffffffffb029cc80>] do_syscall_64+0x30/0x40
    [<ffffffffb0400099>] entry_SYSCALL_64_after_hwframe+0x61/0xc6

BUG: memory leak
unreferenced object 0xff1100010fd93d68 (size 8):
  comm "syz-executor.3", pid 17672, jiffies 4298118891 (age 9.906s)
  hex dump (first 8 bytes):
    00 00 00 00 00 00 00 00                          ........
  backtrace:
    [<ffffffffade5db3e>] kvmalloc_node+0x11e/0x170
    [<ffffffffadd28280>] __cpu_map_entry_alloc+0x2f0/0xb00
    [<ffffffffadd28d8e>] cpu_map_update_elem+0x2fe/0x3d0
    [<ffffffffadc6d0fd>] bpf_map_update_value.isra.0+0x2bd/0x520
    [<ffffffffadc7349b>] map_update_elem+0x4cb/0x720
    [<ffffffffadc7d983>] __se_sys_bpf+0x8c3/0xb90
    [<ffffffffb029cc80>] do_syscall_64+0x30/0x40
    [<ffffffffb0400099>] entry_SYSCALL_64_after_hwframe+0x61/0xc6

In the cpu_map_update_elem flow, when kthread_stop is called before
calling the threadfn of rcpu->kthread, since the KTHREAD_SHOULD_STOP bit
of kthread has been set by kthread_stop, the threadfn of rcpu->kthread
will never be executed, and rcpu->refcnt will never be 0, which will
lead to the allocated rcpu, rcpu->queue and rcpu->queue->queue cannot be
released.

Calling kthread_stop before executing kthread's threadfn will return
-EINTR. We can complete the release of memory resources in this state.

Fixes: 6710e1126934 ("bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP")
Signed-off-by: Pu Lehui <pulehui@huawei.com>
Acked-by: Jesper Dangaard Brouer <hawk@kernel.org>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20230711115848.2701559-1-pulehui@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/cpumap.c | 40 ++++++++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 8a33e8747a0e..6ae02be7a48e 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -122,22 +122,6 @@ static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
 	atomic_inc(&rcpu->refcnt);
 }
 
-/* called from workqueue, to workaround syscall using preempt_disable */
-static void cpu_map_kthread_stop(struct work_struct *work)
-{
-	struct bpf_cpu_map_entry *rcpu;
-
-	rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
-
-	/* Wait for flush in __cpu_map_entry_free(), via full RCU barrier,
-	 * as it waits until all in-flight call_rcu() callbacks complete.
-	 */
-	rcu_barrier();
-
-	/* kthread_stop will wake_up_process and wait for it to complete */
-	kthread_stop(rcpu->kthread);
-}
-
 static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
 {
 	/* The tear-down procedure should have made sure that queue is
@@ -165,6 +149,30 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
 	}
 }
 
+/* called from workqueue, to workaround syscall using preempt_disable */
+static void cpu_map_kthread_stop(struct work_struct *work)
+{
+	struct bpf_cpu_map_entry *rcpu;
+	int err;
+
+	rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
+
+	/* Wait for flush in __cpu_map_entry_free(), via full RCU barrier,
+	 * as it waits until all in-flight call_rcu() callbacks complete.
+	 */
+	rcu_barrier();
+
+	/* kthread_stop will wake_up_process and wait for it to complete */
+	err = kthread_stop(rcpu->kthread);
+	if (err) {
+		/* kthread_stop may be called before cpu_map_kthread_run
+		 * is executed, so we need to release the memory related
+		 * to rcpu.
+		 */
+		put_cpu_map_entry(rcpu);
+	}
+}
+
 static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
 				     struct list_head *listp,
 				     struct xdp_cpumap_stats *stats)
-- 
cgit v1.2.3


From 7d8b31b73c79835572611ed1eed649e4d2e14245 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 17 May 2023 14:51:48 +0200
Subject: tracing: arm64: Avoid missing-prototype warnings

These are all tracing W=1 warnings in arm64 allmodconfig about missing
prototypes:

kernel/trace/trace_kprobe_selftest.c:7:5: error: no previous prototype for 'kprobe_trace_selftest_target' [-Werror=missing-pro
totypes]
kernel/trace/ftrace.c:329:5: error: no previous prototype for '__register_ftrace_function' [-Werror=missing-prototypes]
kernel/trace/ftrace.c:372:5: error: no previous prototype for '__unregister_ftrace_function' [-Werror=missing-prototypes]
kernel/trace/ftrace.c:4130:15: error: no previous prototype for 'arch_ftrace_match_adjust' [-Werror=missing-prototypes]
kernel/trace/fgraph.c:243:15: error: no previous prototype for 'ftrace_return_to_handler' [-Werror=missing-prototypes]
kernel/trace/fgraph.c:358:6: error: no previous prototype for 'ftrace_graph_sleep_time_control' [-Werror=missing-prototypes]
arch/arm64/kernel/ftrace.c:460:6: error: no previous prototype for 'prepare_ftrace_return' [-Werror=missing-prototypes]
arch/arm64/kernel/ptrace.c:2172:5: error: no previous prototype for 'syscall_trace_enter' [-Werror=missing-prototypes]
arch/arm64/kernel/ptrace.c:2195:6: error: no previous prototype for 'syscall_trace_exit' [-Werror=missing-prototypes]

Move the declarations to an appropriate header where they can be seen
by the caller and callee, and make sure the headers are included where
needed.

Link: https://lore.kernel.org/linux-trace-kernel/20230517125215.930689-1-arnd@kernel.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Florent Revest <revest@chromium.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
[ Fixed ftrace_return_to_handler() to handle CONFIG_HAVE_FUNCTION_GRAPH_RETVAL case ]
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 arch/arm64/include/asm/ftrace.h      | 4 ++++
 arch/arm64/include/asm/syscall.h     | 3 +++
 arch/arm64/kernel/syscall.c          | 3 ---
 include/linux/ftrace.h               | 9 +++++++++
 kernel/trace/fgraph.c                | 1 +
 kernel/trace/ftrace_internal.h       | 5 +++--
 kernel/trace/trace_kprobe_selftest.c | 3 +++
 7 files changed, 23 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h
index 21ac1c5c71d3..ab158196480c 100644
--- a/arch/arm64/include/asm/ftrace.h
+++ b/arch/arm64/include/asm/ftrace.h
@@ -211,6 +211,10 @@ static inline unsigned long fgraph_ret_regs_frame_pointer(struct fgraph_ret_regs
 {
 	return ret_regs->fp;
 }
+
+void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
+			   unsigned long frame_pointer);
+
 #endif /* ifdef CONFIG_FUNCTION_GRAPH_TRACER  */
 #endif
 
diff --git a/arch/arm64/include/asm/syscall.h b/arch/arm64/include/asm/syscall.h
index 4cfe9b49709b..ab8e14b96f68 100644
--- a/arch/arm64/include/asm/syscall.h
+++ b/arch/arm64/include/asm/syscall.h
@@ -85,4 +85,7 @@ static inline int syscall_get_arch(struct task_struct *task)
 	return AUDIT_ARCH_AARCH64;
 }
 
+int syscall_trace_enter(struct pt_regs *regs);
+void syscall_trace_exit(struct pt_regs *regs);
+
 #endif	/* __ASM_SYSCALL_H */
diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c
index 5a668d7f3c1f..b1ae2f2eaf77 100644
--- a/arch/arm64/kernel/syscall.c
+++ b/arch/arm64/kernel/syscall.c
@@ -75,9 +75,6 @@ static inline bool has_syscall_work(unsigned long flags)
 	return unlikely(flags & _TIF_SYSCALL_WORK);
 }
 
-int syscall_trace_enter(struct pt_regs *regs);
-void syscall_trace_exit(struct pt_regs *regs);
-
 static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr,
 			   const syscall_fn_t syscall_table[])
 {
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 8e59bd954153..ce156c7704ee 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -41,6 +41,15 @@ struct ftrace_ops;
 struct ftrace_regs;
 struct dyn_ftrace;
 
+char *arch_ftrace_match_adjust(char *str, const char *search);
+
+#ifdef CONFIG_HAVE_FUNCTION_GRAPH_RETVAL
+struct fgraph_ret_regs;
+unsigned long ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs);
+#else
+unsigned long ftrace_return_to_handler(unsigned long frame_pointer);
+#endif
+
 #ifdef CONFIG_FUNCTION_TRACER
 /*
  * If the arch's mcount caller does not support all of ftrace's
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index cd2c35b1dd8f..c83c005e654e 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -15,6 +15,7 @@
 #include <trace/events/sched.h>
 
 #include "ftrace_internal.h"
+#include "trace.h"
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 #define ASSIGN_OPS_HASH(opsname, val) \
diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h
index 382775edf690..5012c04f92c0 100644
--- a/kernel/trace/ftrace_internal.h
+++ b/kernel/trace/ftrace_internal.h
@@ -2,6 +2,9 @@
 #ifndef _LINUX_KERNEL_FTRACE_INTERNAL_H
 #define  _LINUX_KERNEL_FTRACE_INTERNAL_H
 
+int __register_ftrace_function(struct ftrace_ops *ops);
+int __unregister_ftrace_function(struct ftrace_ops *ops);
+
 #ifdef CONFIG_FUNCTION_TRACER
 
 extern struct mutex ftrace_lock;
@@ -15,8 +18,6 @@ int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs);
 
 #else /* !CONFIG_DYNAMIC_FTRACE */
 
-int __register_ftrace_function(struct ftrace_ops *ops);
-int __unregister_ftrace_function(struct ftrace_ops *ops);
 /* Keep as macros so we do not need to define the commands */
 # define ftrace_startup(ops, command)					\
 	({								\
diff --git a/kernel/trace/trace_kprobe_selftest.c b/kernel/trace/trace_kprobe_selftest.c
index 16548ee4c8c6..3851cd1e6a62 100644
--- a/kernel/trace/trace_kprobe_selftest.c
+++ b/kernel/trace/trace_kprobe_selftest.c
@@ -1,4 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
+
+#include "trace_kprobe_selftest.h"
+
 /*
  * Function used during the kprobe self test. This function is in a separate
  * compile unit so it can be compile with CC_FLAGS_FTRACE to ensure that it
-- 
cgit v1.2.3


From 7e42907f3a7b4ce3a2d1757f6d78336984daf8f5 Mon Sep 17 00:00:00 2001
From: Zheng Yejian <zhengyejian1@huawei.com>
Date: Sun, 9 Jul 2023 06:51:44 +0800
Subject: ring-buffer: Fix deadloop issue on reading trace_pipe

Soft lockup occurs when reading file 'trace_pipe':

  watchdog: BUG: soft lockup - CPU#6 stuck for 22s! [cat:4488]
  [...]
  RIP: 0010:ring_buffer_empty_cpu+0xed/0x170
  RSP: 0018:ffff88810dd6fc48 EFLAGS: 00000246
  RAX: 0000000000000000 RBX: 0000000000000246 RCX: ffffffff93d1aaeb
  RDX: ffff88810a280040 RSI: 0000000000000008 RDI: ffff88811164b218
  RBP: ffff88811164b218 R08: 0000000000000000 R09: ffff88815156600f
  R10: ffffed102a2acc01 R11: 0000000000000001 R12: 0000000051651901
  R13: 0000000000000000 R14: ffff888115e49500 R15: 0000000000000000
  [...]
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007f8d853c2000 CR3: 000000010dcd8000 CR4: 00000000000006e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
   __find_next_entry+0x1a8/0x4b0
   ? peek_next_entry+0x250/0x250
   ? down_write+0xa5/0x120
   ? down_write_killable+0x130/0x130
   trace_find_next_entry_inc+0x3b/0x1d0
   tracing_read_pipe+0x423/0xae0
   ? tracing_splice_read_pipe+0xcb0/0xcb0
   vfs_read+0x16b/0x490
   ksys_read+0x105/0x210
   ? __ia32_sys_pwrite64+0x200/0x200
   ? switch_fpu_return+0x108/0x220
   do_syscall_64+0x33/0x40
   entry_SYSCALL_64_after_hwframe+0x61/0xc6

Through the vmcore, I found it's because in tracing_read_pipe(),
ring_buffer_empty_cpu() found some buffer is not empty but then it
cannot read anything due to "rb_num_of_entries() == 0" always true,
Then it infinitely loop the procedure due to user buffer not been
filled, see following code path:

  tracing_read_pipe() {
    ... ...
    waitagain:
      tracing_wait_pipe() // 1. find non-empty buffer here
      trace_find_next_entry_inc()  // 2. loop here try to find an entry
        __find_next_entry()
          ring_buffer_empty_cpu();  // 3. find non-empty buffer
          peek_next_entry()  // 4. but peek always return NULL
            ring_buffer_peek()
              rb_buffer_peek()
                rb_get_reader_page()
                  // 5. because rb_num_of_entries() == 0 always true here
                  //    then return NULL
      // 6. user buffer not been filled so goto 'waitgain'
      //    and eventually leads to an deadloop in kernel!!!
  }

By some analyzing, I found that when resetting ringbuffer, the 'entries'
of its pages are not all cleared (see rb_reset_cpu()). Then when reducing
the ringbuffer, and if some reduced pages exist dirty 'entries' data, they
will be added into 'cpu_buffer->overrun' (see rb_remove_pages()), which
cause wrong 'overrun' count and eventually cause the deadloop issue.

To fix it, we need to clear every pages in rb_reset_cpu().

Link: https://lore.kernel.org/linux-trace-kernel/20230708225144.3785600-1-zhengyejian1@huawei.com

Cc: stable@vger.kernel.org
Fixes: a5fb833172eca ("ring-buffer: Fix uninitialized read_stamp")
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 834b361a4a66..14d8001140c8 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5242,28 +5242,34 @@ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_size);
 
+static void rb_clear_buffer_page(struct buffer_page *page)
+{
+	local_set(&page->write, 0);
+	local_set(&page->entries, 0);
+	rb_init_page(page->page);
+	page->read = 0;
+}
+
 static void
 rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 {
+	struct buffer_page *page;
+
 	rb_head_page_deactivate(cpu_buffer);
 
 	cpu_buffer->head_page
 		= list_entry(cpu_buffer->pages, struct buffer_page, list);
-	local_set(&cpu_buffer->head_page->write, 0);
-	local_set(&cpu_buffer->head_page->entries, 0);
-	local_set(&cpu_buffer->head_page->page->commit, 0);
-
-	cpu_buffer->head_page->read = 0;
+	rb_clear_buffer_page(cpu_buffer->head_page);
+	list_for_each_entry(page, cpu_buffer->pages, list) {
+		rb_clear_buffer_page(page);
+	}
 
 	cpu_buffer->tail_page = cpu_buffer->head_page;
 	cpu_buffer->commit_page = cpu_buffer->head_page;
 
 	INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
 	INIT_LIST_HEAD(&cpu_buffer->new_pages);
-	local_set(&cpu_buffer->reader_page->write, 0);
-	local_set(&cpu_buffer->reader_page->entries, 0);
-	local_set(&cpu_buffer->reader_page->page->commit, 0);
-	cpu_buffer->reader_page->read = 0;
+	rb_clear_buffer_page(cpu_buffer->reader_page);
 
 	local_set(&cpu_buffer->entries_bytes, 0);
 	local_set(&cpu_buffer->overrun, 0);
-- 
cgit v1.2.3


From 26efd79c4624294e553aeaa3439c646729bad084 Mon Sep 17 00:00:00 2001
From: Zheng Yejian <zhengyejian1@huawei.com>
Date: Wed, 12 Jul 2023 14:04:52 +0800
Subject: ftrace: Fix possible warning on checking all pages used in
 ftrace_process_locs()

As comments in ftrace_process_locs(), there may be NULL pointers in
mcount_loc section:
 > Some architecture linkers will pad between
 > the different mcount_loc sections of different
 > object files to satisfy alignments.
 > Skip any NULL pointers.

After commit 20e5227e9f55 ("ftrace: allow NULL pointers in mcount_loc"),
NULL pointers will be accounted when allocating ftrace pages but skipped
before adding into ftrace pages, this may result in some pages not being
used. Then after commit 706c81f87f84 ("ftrace: Remove extra helper
functions"), warning may occur at:
  WARN_ON(pg->next);

To fix it, only warn for case that no pointers skipped but pages not used
up, then free those unused pages after releasing ftrace_lock.

Link: https://lore.kernel.org/linux-trace-kernel/20230712060452.3175675-1-zhengyejian1@huawei.com

Cc: stable@vger.kernel.org
Fixes: 706c81f87f84 ("ftrace: Remove extra helper functions")
Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 45 +++++++++++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3740aca79fe7..05c0024815bf 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3305,6 +3305,22 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count)
 	return cnt;
 }
 
+static void ftrace_free_pages(struct ftrace_page *pages)
+{
+	struct ftrace_page *pg = pages;
+
+	while (pg) {
+		if (pg->records) {
+			free_pages((unsigned long)pg->records, pg->order);
+			ftrace_number_of_pages -= 1 << pg->order;
+		}
+		pages = pg->next;
+		kfree(pg);
+		pg = pages;
+		ftrace_number_of_groups--;
+	}
+}
+
 static struct ftrace_page *
 ftrace_allocate_pages(unsigned long num_to_init)
 {
@@ -3343,17 +3359,7 @@ ftrace_allocate_pages(unsigned long num_to_init)
 	return start_pg;
 
  free_pages:
-	pg = start_pg;
-	while (pg) {
-		if (pg->records) {
-			free_pages((unsigned long)pg->records, pg->order);
-			ftrace_number_of_pages -= 1 << pg->order;
-		}
-		start_pg = pg->next;
-		kfree(pg);
-		pg = start_pg;
-		ftrace_number_of_groups--;
-	}
+	ftrace_free_pages(start_pg);
 	pr_info("ftrace: FAILED to allocate memory for functions\n");
 	return NULL;
 }
@@ -6471,9 +6477,11 @@ static int ftrace_process_locs(struct module *mod,
 			       unsigned long *start,
 			       unsigned long *end)
 {
+	struct ftrace_page *pg_unuse = NULL;
 	struct ftrace_page *start_pg;
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec;
+	unsigned long skipped = 0;
 	unsigned long count;
 	unsigned long *p;
 	unsigned long addr;
@@ -6536,8 +6544,10 @@ static int ftrace_process_locs(struct module *mod,
 		 * object files to satisfy alignments.
 		 * Skip any NULL pointers.
 		 */
-		if (!addr)
+		if (!addr) {
+			skipped++;
 			continue;
+		}
 
 		end_offset = (pg->index+1) * sizeof(pg->records[0]);
 		if (end_offset > PAGE_SIZE << pg->order) {
@@ -6551,8 +6561,10 @@ static int ftrace_process_locs(struct module *mod,
 		rec->ip = addr;
 	}
 
-	/* We should have used all pages */
-	WARN_ON(pg->next);
+	if (pg->next) {
+		pg_unuse = pg->next;
+		pg->next = NULL;
+	}
 
 	/* Assign the last page to ftrace_pages */
 	ftrace_pages = pg;
@@ -6574,6 +6586,11 @@ static int ftrace_process_locs(struct module *mod,
  out:
 	mutex_unlock(&ftrace_lock);
 
+	/* We should have used all pages unless we skipped some */
+	if (pg_unuse) {
+		WARN_ON(!skipped);
+		ftrace_free_pages(pg_unuse);
+	}
 	return ret;
 }
 
-- 
cgit v1.2.3


From bec3c25c247c4f88a33d79675a09e1644c9a3114 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 12 Jul 2023 10:52:35 -0400
Subject: tracing: Stop FORTIFY_SOURCE complaining about stack trace caller

The stack_trace event is an event created by the tracing subsystem to
store stack traces. It originally just contained a hard coded array of 8
words to hold the stack, and a "size" to know how many entries are there.
This is exported to user space as:

name: kernel_stack
ID: 4
format:
	field:unsigned short common_type;	offset:0;	size:2;	signed:0;
	field:unsigned char common_flags;	offset:2;	size:1;	signed:0;
	field:unsigned char common_preempt_count;	offset:3;	size:1;	signed:0;
	field:int common_pid;	offset:4;	size:4;	signed:1;

	field:int size;	offset:8;	size:4;	signed:1;
	field:unsigned long caller[8];	offset:16;	size:64;	signed:0;

print fmt: "\t=> %ps\n\t=> %ps\n\t=> %ps\n" "\t=> %ps\n\t=> %ps\n\t=> %ps\n" "\t=> %ps\n\t=> %ps\n",i
 (void *)REC->caller[0], (void *)REC->caller[1], (void *)REC->caller[2],
 (void *)REC->caller[3], (void *)REC->caller[4], (void *)REC->caller[5],
 (void *)REC->caller[6], (void *)REC->caller[7]

Where the user space tracers could parse the stack. The library was
updated for this specific event to only look at the size, and not the
array. But some older users still look at the array (note, the older code
still checks to make sure the array fits inside the event that it read.
That is, if only 4 words were saved, the parser would not read the fifth
word because it will see that it was outside of the event size).

This event was changed a while ago to be more dynamic, and would save a
full stack even if it was greater than 8 words. It does this by simply
allocating more ring buffer to hold the extra words. Then it copies in the
stack via:

	memcpy(&entry->caller, fstack->calls, size);

As the entry is struct stack_entry, that is created by a macro to both
create the structure and export this to user space, it still had the caller
field of entry defined as: unsigned long caller[8].

When the stack is greater than 8, the FORTIFY_SOURCE code notices that the
amount being copied is greater than the source array and complains about
it. It has no idea that the source is pointing to the ring buffer with the
required allocation.

To hide this from the FORTIFY_SOURCE logic, pointer arithmetic is used:

	ptr = ring_buffer_event_data(event);
	entry = ptr;
	ptr += offsetof(typeof(*entry), caller);
	memcpy(ptr, fstack->calls, size);

Link: https://lore.kernel.org/all/20230612160748.4082850-1-svens@linux.ibm.com/
Link: https://lore.kernel.org/linux-trace-kernel/20230712105235.5fc441aa@gandalf.local.home

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Reported-by: Sven Schnelle <svens@linux.ibm.com>
Tested-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4529e264cb86..20122eeccf97 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3118,6 +3118,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
 	struct ftrace_stack *fstack;
 	struct stack_entry *entry;
 	int stackidx;
+	void *ptr;
 
 	/*
 	 * Add one, for this function and the call to save_stack_trace()
@@ -3161,9 +3162,25 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
 				    trace_ctx);
 	if (!event)
 		goto out;
-	entry = ring_buffer_event_data(event);
+	ptr = ring_buffer_event_data(event);
+	entry = ptr;
+
+	/*
+	 * For backward compatibility reasons, the entry->caller is an
+	 * array of 8 slots to store the stack. This is also exported
+	 * to user space. The amount allocated on the ring buffer actually
+	 * holds enough for the stack specified by nr_entries. This will
+	 * go into the location of entry->caller. Due to string fortifiers
+	 * checking the size of the destination of memcpy() it triggers
+	 * when it detects that size is greater than 8. To hide this from
+	 * the fortifiers, we use "ptr" and pointer arithmetic to assign caller.
+	 *
+	 * The below is really just:
+	 *   memcpy(&entry->caller, fstack->calls, size);
+	 */
+	ptr += offsetof(typeof(*entry), caller);
+	memcpy(ptr, fstack->calls, size);
 
-	memcpy(&entry->caller, fstack->calls, size);
 	entry->size = nr_entries;
 
 	if (!call_filter_check_discard(call, entry, buffer, event))
-- 
cgit v1.2.3


From 6018b585e8c6fa7d85d4b38d9ce49a5b67be7078 Mon Sep 17 00:00:00 2001
From: Mohamed Khalfella <mkhalfella@purestorage.com>
Date: Wed, 12 Jul 2023 22:30:21 +0000
Subject: tracing/histograms: Add histograms to hist_vars if they have
 referenced variables

Hist triggers can have referenced variables without having direct
variables fields. This can be the case if referenced variables are added
for trigger actions. In this case the newly added references will not
have field variables. Not taking such referenced variables into
consideration can result in a bug where it would be possible to remove
hist trigger with variables being refenced. This will result in a bug
that is easily reproducable like so

$ cd /sys/kernel/tracing
$ echo 'synthetic_sys_enter char[] comm; long id' >> synthetic_events
$ echo 'hist:keys=common_pid.execname,id.syscall:vals=hitcount:comm=common_pid.execname' >> events/raw_syscalls/sys_enter/trigger
$ echo 'hist:keys=common_pid.execname,id.syscall:onmatch(raw_syscalls.sys_enter).synthetic_sys_enter($comm, id)' >> events/raw_syscalls/sys_enter/trigger
$ echo '!hist:keys=common_pid.execname,id.syscall:vals=hitcount:comm=common_pid.execname' >> events/raw_syscalls/sys_enter/trigger

[  100.263533] ==================================================================
[  100.264634] BUG: KASAN: slab-use-after-free in resolve_var_refs+0xc7/0x180
[  100.265520] Read of size 8 at addr ffff88810375d0f0 by task bash/439
[  100.266320]
[  100.266533] CPU: 2 PID: 439 Comm: bash Not tainted 6.5.0-rc1 #4
[  100.267277] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-20220807_005459-localhost 04/01/2014
[  100.268561] Call Trace:
[  100.268902]  <TASK>
[  100.269189]  dump_stack_lvl+0x4c/0x70
[  100.269680]  print_report+0xc5/0x600
[  100.270165]  ? resolve_var_refs+0xc7/0x180
[  100.270697]  ? kasan_complete_mode_report_info+0x80/0x1f0
[  100.271389]  ? resolve_var_refs+0xc7/0x180
[  100.271913]  kasan_report+0xbd/0x100
[  100.272380]  ? resolve_var_refs+0xc7/0x180
[  100.272920]  __asan_load8+0x71/0xa0
[  100.273377]  resolve_var_refs+0xc7/0x180
[  100.273888]  event_hist_trigger+0x749/0x860
[  100.274505]  ? kasan_save_stack+0x2a/0x50
[  100.275024]  ? kasan_set_track+0x29/0x40
[  100.275536]  ? __pfx_event_hist_trigger+0x10/0x10
[  100.276138]  ? ksys_write+0xd1/0x170
[  100.276607]  ? do_syscall_64+0x3c/0x90
[  100.277099]  ? entry_SYSCALL_64_after_hwframe+0x6e/0xd8
[  100.277771]  ? destroy_hist_data+0x446/0x470
[  100.278324]  ? event_hist_trigger_parse+0xa6c/0x3860
[  100.278962]  ? __pfx_event_hist_trigger_parse+0x10/0x10
[  100.279627]  ? __kasan_check_write+0x18/0x20
[  100.280177]  ? mutex_unlock+0x85/0xd0
[  100.280660]  ? __pfx_mutex_unlock+0x10/0x10
[  100.281200]  ? kfree+0x7b/0x120
[  100.281619]  ? ____kasan_slab_free+0x15d/0x1d0
[  100.282197]  ? event_trigger_write+0xac/0x100
[  100.282764]  ? __kasan_slab_free+0x16/0x20
[  100.283293]  ? __kmem_cache_free+0x153/0x2f0
[  100.283844]  ? sched_mm_cid_remote_clear+0xb1/0x250
[  100.284550]  ? __pfx_sched_mm_cid_remote_clear+0x10/0x10
[  100.285221]  ? event_trigger_write+0xbc/0x100
[  100.285781]  ? __kasan_check_read+0x15/0x20
[  100.286321]  ? __bitmap_weight+0x66/0xa0
[  100.286833]  ? _find_next_bit+0x46/0xe0
[  100.287334]  ? task_mm_cid_work+0x37f/0x450
[  100.287872]  event_triggers_call+0x84/0x150
[  100.288408]  trace_event_buffer_commit+0x339/0x430
[  100.289073]  ? ring_buffer_event_data+0x3f/0x60
[  100.292189]  trace_event_raw_event_sys_enter+0x8b/0xe0
[  100.295434]  syscall_trace_enter.constprop.0+0x18f/0x1b0
[  100.298653]  syscall_enter_from_user_mode+0x32/0x40
[  100.301808]  do_syscall_64+0x1a/0x90
[  100.304748]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
[  100.307775] RIP: 0033:0x7f686c75c1cb
[  100.310617] Code: 73 01 c3 48 8b 0d 65 3c 10 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa b8 21 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 35 3c 10 00 f7 d8 64 89 01 48
[  100.317847] RSP: 002b:00007ffc60137a38 EFLAGS: 00000246 ORIG_RAX: 0000000000000021
[  100.321200] RAX: ffffffffffffffda RBX: 000055f566469ea0 RCX: 00007f686c75c1cb
[  100.324631] RDX: 0000000000000001 RSI: 0000000000000001 RDI: 000000000000000a
[  100.328104] RBP: 00007ffc60137ac0 R08: 00007f686c818460 R09: 000000000000000a
[  100.331509] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000009
[  100.334992] R13: 0000000000000007 R14: 000000000000000a R15: 0000000000000007
[  100.338381]  </TASK>

We hit the bug because when second hist trigger has was created
has_hist_vars() returned false because hist trigger did not have
variables. As a result of that save_hist_vars() was not called to add
the trigger to trace_array->hist_vars. Later on when we attempted to
remove the first histogram find_any_var_ref() failed to detect it is
being used because it did not find the second trigger in hist_vars list.

With this change we wait until trigger actions are created so we can take
into consideration if hist trigger has variable references. Also, now we
check the return value of save_hist_vars() and fail trigger creation if
save_hist_vars() fails.

Link: https://lore.kernel.org/linux-trace-kernel/20230712223021.636335-1-mkhalfella@purestorage.com

Cc: stable@vger.kernel.org
Fixes: 067fe038e70f6 ("tracing: Add variable reference handling to hist triggers")
Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index b97d3ad832f1..c8c61381eba4 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -6663,13 +6663,15 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
 	if (get_named_trigger_data(trigger_data))
 		goto enable;
 
-	if (has_hist_vars(hist_data))
-		save_hist_vars(hist_data);
-
 	ret = create_actions(hist_data);
 	if (ret)
 		goto out_unreg;
 
+	if (has_hist_vars(hist_data) || hist_data->n_var_refs) {
+		if (save_hist_vars(hist_data))
+			goto out_unreg;
+	}
+
 	ret = tracing_map_init(hist_data->map);
 	if (ret)
 		goto out_unreg;
-- 
cgit v1.2.3


From d5a821896360cc8b93a15bd888fabc858c038dc0 Mon Sep 17 00:00:00 2001
From: Zheng Yejian <zhengyejian1@huawei.com>
Date: Thu, 13 Jul 2023 22:14:35 +0800
Subject: tracing: Fix memory leak of iter->temp when reading trace_pipe

kmemleak reports:
  unreferenced object 0xffff88814d14e200 (size 256):
    comm "cat", pid 336, jiffies 4294871818 (age 779.490s)
    hex dump (first 32 bytes):
      04 00 01 03 00 00 00 00 08 00 00 00 00 00 00 00  ................
      0c d8 c8 9b ff ff ff ff 04 5a ca 9b ff ff ff ff  .........Z......
    backtrace:
      [<ffffffff9bdff18f>] __kmalloc+0x4f/0x140
      [<ffffffff9bc9238b>] trace_find_next_entry+0xbb/0x1d0
      [<ffffffff9bc9caef>] trace_print_lat_context+0xaf/0x4e0
      [<ffffffff9bc94490>] print_trace_line+0x3e0/0x950
      [<ffffffff9bc95499>] tracing_read_pipe+0x2d9/0x5a0
      [<ffffffff9bf03a43>] vfs_read+0x143/0x520
      [<ffffffff9bf04c2d>] ksys_read+0xbd/0x160
      [<ffffffff9d0f0edf>] do_syscall_64+0x3f/0x90
      [<ffffffff9d2000aa>] entry_SYSCALL_64_after_hwframe+0x6e/0xd8

when reading file 'trace_pipe', 'iter->temp' is allocated or relocated
in trace_find_next_entry() but not freed before 'trace_pipe' is closed.

To fix it, free 'iter->temp' in tracing_release_pipe().

Link: https://lore.kernel.org/linux-trace-kernel/20230713141435.1133021-1-zhengyejian1@huawei.com

Cc: stable@vger.kernel.org
Fixes: ff895103a84ab ("tracing: Save off entry when peeking at next entry")
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 20122eeccf97..be847d45d81c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6781,6 +6781,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
 
 	free_cpumask_var(iter->started);
 	kfree(iter->fmt);
+	kfree(iter->temp);
 	mutex_destroy(&iter->mutex);
 	kfree(iter);
 
-- 
cgit v1.2.3