Diffstat (limited to 'kernel')
-rw-r--r--  kernel/.gitignore | 2
-rw-r--r--  kernel/Makefile | 48
-rw-r--r--  kernel/bpf/core.c | 50
-rw-r--r--  kernel/bpf/syscall.c | 19
-rw-r--r--  kernel/bpf/verifier.c | 2
-rw-r--r--  kernel/cfi.c | 15
-rw-r--r--  kernel/cgroup/cgroup-v1.c | 14
-rw-r--r--  kernel/cgroup/cpuset.c | 2
-rw-r--r--  kernel/cgroup/rstat.c | 195
-rw-r--r--  kernel/crash_core.c | 15
-rw-r--r--  kernel/crash_reserve.c | 68
-rw-r--r--  kernel/events/core.c | 36
-rw-r--r--  kernel/events/uprobes.c | 9
-rw-r--r--  kernel/exit.c | 9
-rw-r--r--  kernel/fork.c | 108
-rwxr-xr-x  kernel/gen_kheaders.sh | 99
-rw-r--r--  kernel/hung_task.c | 29
-rw-r--r--  kernel/irq/chip.c | 8
-rw-r--r--  kernel/kcov.c | 2
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kexec_core.c | 100
-rw-r--r--  kernel/kexec_file.c | 51
-rw-r--r--  kernel/kexec_handover.c | 4
-rw-r--r--  kernel/kexec_internal.h | 2
-rw-r--r--  kernel/kthread.c | 11
-rw-r--r--  kernel/locking/rwsem.c | 31
-rw-r--r--  kernel/module/internal.h | 7
-rw-r--r--  kernel/module/main.c | 25
-rw-r--r--  kernel/padata.c | 154
-rw-r--r--  kernel/panic.c | 71
-rw-r--r--  kernel/printk/internal.h | 2
-rw-r--r--  kernel/printk/nbcon.c | 89
-rw-r--r--  kernel/printk/printk.c | 20
-rw-r--r--  kernel/rcu/tree_stall.h | 7
-rw-r--r--  kernel/relay.c | 69
-rw-r--r--  kernel/sched/core.c | 29
-rw-r--r--  kernel/sched/ext.c | 250
-rw-r--r--  kernel/sched/ext.h | 20
-rw-r--r--  kernel/sched/ext_idle.c | 45
-rw-r--r--  kernel/sched/ext_idle.h | 12
-rw-r--r--  kernel/sched/psi.c | 6
-rw-r--r--  kernel/sched/sched.h | 9
-rw-r--r--  kernel/sys.c | 50
-rw-r--r--  kernel/time/clocksource.c | 5
-rw-r--r--  kernel/trace/Kconfig | 27
-rw-r--r--  kernel/trace/Makefile | 2
-rw-r--r--  kernel/trace/blktrace.c | 22
-rw-r--r--  kernel/trace/preemptirq_delay_test.c | 13
-rw-r--r--  kernel/trace/ring_buffer.c | 16
-rw-r--r--  kernel/trace/rv/rv.c | 6
-rw-r--r--  kernel/trace/trace.c | 338
-rw-r--r--  kernel/trace/trace.h | 4
-rw-r--r--  kernel/trace/trace_events.c | 154
-rw-r--r--  kernel/trace/trace_events_filter.c | 28
-rw-r--r--  kernel/trace/trace_events_synth.c | 6
-rw-r--r--  kernel/trace/trace_hwlat.c | 5
-rw-r--r--  kernel/trace/trace_output.c | 8
-rw-r--r--  kernel/ucount.c | 16
-rw-r--r--  kernel/unwind/Makefile | 1
-rw-r--r--  kernel/unwind/deferred.c | 362
-rw-r--r--  kernel/unwind/user.c | 128
-rw-r--r--  kernel/vhost_task.c | 2
-rw-r--r--  kernel/watchdog_buddy.c | 5
-rw-r--r--  kernel/workqueue.c | 74
64 files changed, 1873 insertions, 1145 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index c6b299a6b786..a501bfc80694 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
/config_data
/kheaders.md5
+/kheaders-objlist
+/kheaders-srclist
diff --git a/kernel/Makefile b/kernel/Makefile
index a89602a2a0ab..c60623448235 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -54,6 +54,7 @@ obj-y += rcu/
obj-y += livepatch/
obj-y += dma/
obj-y += entry/
+obj-y += unwind/
obj-$(CONFIG_MODULES) += module/
obj-$(CONFIG_KCMP) += kcmp.o
@@ -158,11 +159,48 @@ filechk_cat = cat $<
$(obj)/config_data: $(KCONFIG_CONFIG) FORCE
$(call filechk,cat)
+# kheaders_data.tar.xz
$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz
-quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz
- cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@
-$(obj)/kheaders_data.tar.xz: FORCE
- $(call cmd,genikh)
+quiet_cmd_kheaders_data = GEN $@
+ cmd_kheaders_data = "$<" "$@" "$(obj)/kheaders-srclist" "$(obj)/kheaders-objlist" "$(KBUILD_BUILD_TIMESTAMP)"
+ cmd_kheaders_data_dep = cat $(depfile) >> $(dot-target).cmd; rm -f $(depfile)
-clean-files := kheaders_data.tar.xz kheaders.md5
+define rule_kheaders_data
+ $(call cmd_and_savecmd,kheaders_data)
+ $(call cmd,kheaders_data_dep)
+endef
+
+targets += kheaders_data.tar.xz
+$(obj)/kheaders_data.tar.xz: $(src)/gen_kheaders.sh $(obj)/kheaders-srclist $(obj)/kheaders-objlist $(obj)/kheaders.md5 FORCE
+ $(call if_changed_rule,kheaders_data)
+
+# generated headers in objtree
+#
+# include/generated/utsversion.h is ignored because it is generated
+# after gen_kheaders.sh is executed. (utsversion.h is unneeded for kheaders)
+filechk_kheaders_objlist = \
+ for d in include "arch/$(SRCARCH)/include"; do \
+ find "$${d}/generated" ! -path "include/generated/utsversion.h" -a -name "*.h" -print; \
+ done
+
+$(obj)/kheaders-objlist: FORCE
+ $(call filechk,kheaders_objlist)
+
+# non-generated headers in srctree
+filechk_kheaders_srclist = \
+ for d in include "arch/$(SRCARCH)/include"; do \
+ find "$(srctree)/$${d}" -path "$(srctree)/$${d}/generated" -prune -o -name "*.h" -print; \
+ done
+
+$(obj)/kheaders-srclist: FORCE
+ $(call filechk,kheaders_srclist)
+
+# Some files are symlinks. If symlinks are changed, kheaders_data.tar.xz should
+# be rebuilt.
+filechk_kheaders_md5sum = xargs -r -a $< stat -c %N | md5sum
+
+$(obj)/kheaders.md5: $(obj)/kheaders-srclist FORCE
+ $(call filechk,kheaders_md5sum)
+
+clean-files := kheaders.md5 kheaders-srclist kheaders-objlist
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 09dde5b00d0c..5d1650af899d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2377,28 +2377,44 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map,
const struct bpf_prog *fp)
{
enum bpf_prog_type prog_type = resolve_prog_type(fp);
- bool ret;
struct bpf_prog_aux *aux = fp->aux;
+ enum bpf_cgroup_storage_type i;
+ bool ret = false;
+ u64 cookie;
if (fp->kprobe_override)
- return false;
+ return ret;
- spin_lock(&map->owner.lock);
- if (!map->owner.type) {
- /* There's no owner yet where we could check for
- * compatibility.
- */
- map->owner.type = prog_type;
- map->owner.jited = fp->jited;
- map->owner.xdp_has_frags = aux->xdp_has_frags;
- map->owner.attach_func_proto = aux->attach_func_proto;
+ spin_lock(&map->owner_lock);
+ /* There's no owner yet where we could check for compatibility. */
+ if (!map->owner) {
+ map->owner = bpf_map_owner_alloc(map);
+ if (!map->owner)
+ goto err;
+ map->owner->type = prog_type;
+ map->owner->jited = fp->jited;
+ map->owner->xdp_has_frags = aux->xdp_has_frags;
+ map->owner->attach_func_proto = aux->attach_func_proto;
+ for_each_cgroup_storage_type(i) {
+ map->owner->storage_cookie[i] =
+ aux->cgroup_storage[i] ?
+ aux->cgroup_storage[i]->cookie : 0;
+ }
ret = true;
} else {
- ret = map->owner.type == prog_type &&
- map->owner.jited == fp->jited &&
- map->owner.xdp_has_frags == aux->xdp_has_frags;
+ ret = map->owner->type == prog_type &&
+ map->owner->jited == fp->jited &&
+ map->owner->xdp_has_frags == aux->xdp_has_frags;
+ for_each_cgroup_storage_type(i) {
+ if (!ret)
+ break;
+ cookie = aux->cgroup_storage[i] ?
+ aux->cgroup_storage[i]->cookie : 0;
+ ret = map->owner->storage_cookie[i] == cookie ||
+ !cookie;
+ }
if (ret &&
- map->owner.attach_func_proto != aux->attach_func_proto) {
+ map->owner->attach_func_proto != aux->attach_func_proto) {
switch (prog_type) {
case BPF_PROG_TYPE_TRACING:
case BPF_PROG_TYPE_LSM:
@@ -2411,8 +2427,8 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map,
}
}
}
- spin_unlock(&map->owner.lock);
-
+err:
+ spin_unlock(&map->owner_lock);
return ret;
}
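
The hunk above replaces the embedded owner fields with a lazily allocated map->owner guarded by map->owner_lock: the first program to use the map records its "shape", later programs must match it. A reduced user-space sketch of that check-or-record pattern (simplified types and a pthread mutex standing in for the kernel spinlock, not the kernel API):

/* Reduced sketch of the lazy owner allocation above; types and fields
 * are simplified, not the kernel ones. owner_lock must be initialised
 * with PTHREAD_MUTEX_INITIALIZER or pthread_mutex_init(). */
#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

struct owner { int type; bool jited; };

struct map {
	pthread_mutex_t owner_lock;
	struct owner *owner;		/* allocated on first attach */
};

static bool map_compatible(struct map *map, int type, bool jited)
{
	bool ret = false;

	pthread_mutex_lock(&map->owner_lock);
	if (!map->owner) {
		/* First program to use the map records the owner shape. */
		map->owner = calloc(1, sizeof(*map->owner));
		if (!map->owner)
			goto out;	/* allocation failure -> incompatible */
		map->owner->type = type;
		map->owner->jited = jited;
		ret = true;
	} else {
		/* Later programs must match the recorded shape. */
		ret = map->owner->type == type && map->owner->jited == jited;
	}
out:
	pthread_mutex_unlock(&map->owner_lock);
	return ret;
}
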
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e63039817af3..0fbfa8532c39 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -37,6 +37,7 @@
#include <linux/trace_events.h>
#include <linux/tracepoint.h>
#include <linux/overflow.h>
+#include <linux/cookie.h>
#include <net/netfilter/nf_bpf_link.h>
#include <net/netkit.h>
@@ -53,6 +54,7 @@
#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY)
DEFINE_PER_CPU(int, bpf_prog_active);
+DEFINE_COOKIE(bpf_map_cookie);
static DEFINE_IDR(prog_idr);
static DEFINE_SPINLOCK(prog_idr_lock);
static DEFINE_IDR(map_idr);
@@ -885,6 +887,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
security_bpf_map_free(map);
bpf_map_release_memcg(map);
+ bpf_map_owner_free(map);
bpf_map_free(map);
}
@@ -979,12 +982,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
struct bpf_map *map = filp->private_data;
u32 type = 0, jited = 0;
- if (map_type_contains_progs(map)) {
- spin_lock(&map->owner.lock);
- type = map->owner.type;
- jited = map->owner.jited;
- spin_unlock(&map->owner.lock);
+ spin_lock(&map->owner_lock);
+ if (map->owner) {
+ type = map->owner->type;
+ jited = map->owner->jited;
}
+ spin_unlock(&map->owner_lock);
seq_printf(m,
"map_type:\t%u\n"
@@ -1487,10 +1490,14 @@ static int map_create(union bpf_attr *attr, bool kernel)
if (err < 0)
goto free_map;
+ preempt_disable();
+ map->cookie = gen_cookie_next(&bpf_map_cookie);
+ preempt_enable();
+
atomic64_set(&map->refcnt, 1);
atomic64_set(&map->usercnt, 1);
mutex_init(&map->freeze_mutex);
- spin_lock_init(&map->owner.lock);
+ spin_lock_init(&map->owner_lock);
if (attr->btf_key_type_id || attr->btf_value_type_id ||
/* Even the map's value is a kernel's struct,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 399f03e62508..0806295945e4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -21445,7 +21445,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
&target_size);
if (cnt == 0 || cnt >= INSN_BUF_SIZE ||
(ctx_field_size && !target_size)) {
- verifier_bug(env, "error during ctx access conversion");
+ verifier_bug(env, "error during ctx access conversion (%d)", cnt);
return -EFAULT;
}
diff --git a/kernel/cfi.c b/kernel/cfi.c
index 422fa4f958ae..4dad04ead06c 100644
--- a/kernel/cfi.c
+++ b/kernel/cfi.c
@@ -5,6 +5,8 @@
* Copyright (C) 2022 Google LLC
*/
+#include <linux/bpf.h>
+#include <linux/cfi_types.h>
#include <linux/cfi.h>
bool cfi_warn __ro_after_init = IS_ENABLED(CONFIG_CFI_PERMISSIVE);
@@ -27,6 +29,19 @@ enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr,
return BUG_TRAP_TYPE_BUG;
}
+/*
+ * Declare two non-existent functions with types that match bpf_func_t and
+ * bpf_callback_t pointers, and use DEFINE_CFI_TYPE to define type hash
+ * variables for each function type. The cfi_bpf_* variables are used by
+ * arch-specific BPF JIT implementations to ensure indirectly callable JIT
+ * code has matching CFI type hashes.
+ */
+extern typeof(*(bpf_func_t)0) __bpf_prog_runX;
+DEFINE_CFI_TYPE(cfi_bpf_hash, __bpf_prog_runX);
+
+extern typeof(*(bpf_callback_t)0) __bpf_callback_fn;
+DEFINE_CFI_TYPE(cfi_bpf_subprog_hash, __bpf_callback_fn);
+
#ifdef CONFIG_ARCH_USES_CFI_TRAPS
static inline unsigned long trap_address(s32 *p)
{
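
The DEFINE_CFI_TYPE() additions above declare non-existent functions through their pointer types so the resulting CFI hashes match indirectly callable JIT code. A stand-alone GNU C sketch of that typeof() construct, with hypothetical names:

/* Stand-alone illustration of the typeof-based declaration trick used
 * above (GNU C extension; the names here are hypothetical). */
typedef unsigned int (*cb_t)(const void *ctx, const void *insn);

/* The dereference of a null function pointer inside typeof() is never
 * evaluated; it only yields the pointed-to function type, so this
 * declares: unsigned int dummy_cb(const void *ctx, const void *insn); */
extern typeof(*(cb_t)0) dummy_cb;

/* Equivalent explicit declaration, for comparison: */
extern unsigned int dummy_cb(const void *ctx, const void *insn);
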
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index fa24c032ed6f..2a4a387f867a 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -32,6 +32,9 @@ static u16 cgroup_no_v1_mask;
/* disable named v1 mounts */
static bool cgroup_no_v1_named;
+/* Show unavailable controllers in /proc/cgroups */
+static bool proc_show_all;
+
/*
* pidlist destructions need to be flushed on cgroup destruction. Use a
* separate workqueue as flush domain.
@@ -683,10 +686,11 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
*/
for_each_subsys(ss, i) {
- if (cgroup1_subsys_absent(ss))
- continue;
cgrp_v1_visible |= ss->root != &cgrp_dfl_root;
+ if (!proc_show_all && cgroup1_subsys_absent(ss))
+ continue;
+
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->legacy_name, ss->root->hierarchy_id,
atomic_read(&ss->root->nr_cgrps),
@@ -1359,3 +1363,9 @@ static int __init cgroup_no_v1(char *str)
return 1;
}
__setup("cgroup_no_v1=", cgroup_no_v1);
+
+static int __init cgroup_v1_proc(char *str)
+{
+ return (kstrtobool(str, &proc_show_all) == 0);
+}
+__setup("cgroup_v1_proc=", cgroup_v1_proc);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 3bc4301466f3..f74d04429a29 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4051,7 +4051,7 @@ void __init cpuset_init_smp(void)
cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
top_cpuset.effective_mems = node_states[N_MEMORY];
- hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
+ hotplug_node_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
BUG_ON(!cpuset_migrate_mm_wq);
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index cbeaa499a96a..981e2f77ad4e 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -10,7 +10,7 @@
#include <trace/events/cgroup.h>
static DEFINE_SPINLOCK(rstat_base_lock);
-static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock);
+static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
@@ -45,84 +45,11 @@ static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
return &rstat_base_lock;
}
-static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu)
+static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
{
- if (ss) {
- /*
- * Depending on config, the subsystem per-cpu lock type may be an
- * empty struct. In enviromnents where this is the case, allocation
- * of this field is not performed in ss_rstat_init(). Avoid a
- * cpu-based offset relative to NULL by returning early. When the
- * lock type is zero in size, the corresponding lock functions are
- * no-ops so passing them NULL is acceptable.
- */
- if (sizeof(*ss->rstat_ss_cpu_lock) == 0)
- return NULL;
-
- return per_cpu_ptr(ss->rstat_ss_cpu_lock, cpu);
- }
-
- return per_cpu_ptr(&rstat_base_cpu_lock, cpu);
-}
-
-/*
- * Helper functions for rstat per CPU locks.
- *
- * This makes it easier to diagnose locking issues and contention in
- * production environments. The parameter @fast_path determine the
- * tracepoints being added, allowing us to diagnose "flush" related
- * operations without handling high-frequency fast-path "update" events.
- */
-static __always_inline
-unsigned long _css_rstat_cpu_lock(struct cgroup_subsys_state *css, int cpu,
- const bool fast_path)
-{
- struct cgroup *cgrp = css->cgroup;
- raw_spinlock_t *cpu_lock;
- unsigned long flags;
- bool contended;
-
- /*
- * The _irqsave() is needed because the locks used for flushing are
- * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock
- * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT
- * kernel. The raw_spinlock_t below disables interrupts on both
- * configurations. The _irqsave() ensures that interrupts are always
- * disabled and later restored.
- */
- cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
- contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
- if (contended) {
- if (fast_path)
- trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
- else
- trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
-
- raw_spin_lock_irqsave(cpu_lock, flags);
- }
-
- if (fast_path)
- trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
- else
- trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
-
- return flags;
-}
-
-static __always_inline
-void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu,
- unsigned long flags, const bool fast_path)
-{
- struct cgroup *cgrp = css->cgroup;
- raw_spinlock_t *cpu_lock;
-
- if (fast_path)
- trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
- else
- trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
-
- cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
- raw_spin_unlock_irqrestore(cpu_lock, flags);
+ if (ss)
+ return per_cpu_ptr(ss->lhead, cpu);
+ return per_cpu_ptr(&rstat_backlog_list, cpu);
}
/**
@@ -130,13 +57,22 @@ void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu,
* @css: target cgroup subsystem state
* @cpu: cpu on which rstat_cpu was updated
*
- * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching
- * rstat_cpu->updated_children list. See the comment on top of
- * css_rstat_cpu definition for details.
+ * Atomically inserts the css in the ss's llist for the given cpu. This is
+ * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist
+ * will be processed at the flush time to create the update tree.
+ *
+ * NOTE: if the user needs the guarantee that the updater either add itself in
+ * the lockless list or the concurrent flusher flushes its updated stats, a
+ * memory barrier is needed before the call to css_rstat_updated() i.e. a
+ * barrier after updating the per-cpu stats and before calling
+ * css_rstat_updated().
*/
__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
{
- unsigned long flags;
+ struct llist_head *lhead;
+ struct css_rstat_cpu *rstatc;
+ struct css_rstat_cpu __percpu *rstatc_pcpu;
+ struct llist_node *self;
/*
* Since bpf programs can call this function, prevent access to
@@ -145,19 +81,49 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
if (!css_uses_rstat(css))
return;
+ lockdep_assert_preemption_disabled();
+
+ /*
+	 * For archs without nmi safe cmpxchg or percpu ops support, ignore
+ * the requests from nmi context.
+ */
+ if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) ||
+ !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi())
+ return;
+
+ rstatc = css_rstat_cpu(css, cpu);
/*
- * Speculative already-on-list test. This may race leading to
- * temporary inaccuracies, which is fine.
+ * If already on list return. This check is racy and smp_mb() is needed
+ * to pair it with the smp_mb() in css_process_update_tree() if the
+ * guarantee that the updated stats are visible to concurrent flusher is
+ * needed.
+ */
+ if (llist_on_list(&rstatc->lnode))
+ return;
+
+ /*
+	 * This function can be re-entered by irqs and nmis for the same cgroup
+ * and may try to insert the same per-cpu lnode into the llist. Note
+ * that llist_add() does not protect against such scenarios.
*
- * Because @parent's updated_children is terminated with @parent
- * instead of NULL, we can tell whether @css is on the list by
- * testing the next pointer for NULL.
+ * To protect against such stacked contexts of irqs/nmis, we use the
+ * fact that lnode points to itself when not on a list and then use
+ * this_cpu_cmpxchg() to atomically set to NULL to select the winner
+ * which will call llist_add(). The losers can assume the insertion is
+ * successful and the winner will eventually add the per-cpu lnode to
+ * the llist.
*/
- if (data_race(css_rstat_cpu(css, cpu)->updated_next))
+ self = &rstatc->lnode;
+ rstatc_pcpu = css->rstat_cpu;
+ if (this_cpu_cmpxchg(rstatc_pcpu->lnode.next, self, NULL) != self)
return;
- flags = _css_rstat_cpu_lock(css, cpu, true);
+ lhead = ss_lhead_cpu(css->ss, cpu);
+ llist_add(&rstatc->lnode, lhead);
+}
+static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu)
+{
/* put @css and all ancestors on the corresponding updated lists */
while (true) {
struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
@@ -183,8 +149,34 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
css = parent;
}
+}
+
+static void css_process_update_tree(struct cgroup_subsys *ss, int cpu)
+{
+ struct llist_head *lhead = ss_lhead_cpu(ss, cpu);
+ struct llist_node *lnode;
+
+ while ((lnode = llist_del_first_init(lhead))) {
+ struct css_rstat_cpu *rstatc;
+
+ /*
+ * smp_mb() is needed here (more specifically in between
+ * init_llist_node() and per-cpu stats flushing) if the
+		 * guarantee is required by a rstat user where either the
+ * updater should add itself on the lockless list or the
+ * flusher flush the stats updated by the updater who have
+ * observed that they are already on the list. The
+ * corresponding barrier pair for this one should be before
+ * css_rstat_updated() by the user.
+ *
+ * For now, there aren't any such user, so not adding the
+ * barrier here but if such a use-case arise, please add
+ * smp_mb() here.
+ */
- _css_rstat_cpu_unlock(css, cpu, flags, true);
+ rstatc = container_of(lnode, struct css_rstat_cpu, lnode);
+ __css_process_update_tree(rstatc->owner, cpu);
+ }
}
/**
@@ -288,13 +280,12 @@ static struct cgroup_subsys_state *css_rstat_updated_list(
{
struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu);
struct cgroup_subsys_state *head = NULL, *parent, *child;
- unsigned long flags;
- flags = _css_rstat_cpu_lock(root, cpu, false);
+ css_process_update_tree(root->ss, cpu);
/* Return NULL if this subtree is not on-list */
if (!rstatc->updated_next)
- goto unlock_ret;
+ return NULL;
/*
* Unlink @root from its parent. As the updated_children list is
@@ -326,8 +317,7 @@ static struct cgroup_subsys_state *css_rstat_updated_list(
rstatc->updated_children = root;
if (child != root)
head = css_rstat_push_children(head, child, cpu);
-unlock_ret:
- _css_rstat_cpu_unlock(root, cpu, flags, false);
+
return head;
}
@@ -468,7 +458,8 @@ int css_rstat_init(struct cgroup_subsys_state *css)
for_each_possible_cpu(cpu) {
struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
- rstatc->updated_children = css;
+ rstatc->owner = rstatc->updated_children = css;
+ init_llist_node(&rstatc->lnode);
if (is_self) {
struct cgroup_rstat_base_cpu *rstatbc;
@@ -522,19 +513,15 @@ int __init ss_rstat_init(struct cgroup_subsys *ss)
{
int cpu;
- /*
- * Depending on config, the subsystem per-cpu lock type may be an empty
- * struct. Avoid allocating a size of zero in this case.
- */
- if (ss && sizeof(*ss->rstat_ss_cpu_lock)) {
- ss->rstat_ss_cpu_lock = alloc_percpu(raw_spinlock_t);
- if (!ss->rstat_ss_cpu_lock)
+ if (ss) {
+ ss->lhead = alloc_percpu(struct llist_head);
+ if (!ss->lhead)
return -ENOMEM;
}
spin_lock_init(ss_rstat_lock(ss));
for_each_possible_cpu(cpu)
- raw_spin_lock_init(ss_rstat_cpu_lock(ss, cpu));
+ init_llist_head(ss_lhead_cpu(ss, cpu));
return 0;
}
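
The rewritten update path keeps an off-list lnode pointing to itself and uses this_cpu_cmpxchg() to elect a single winner that performs the llist_add(), as the comments above describe. A reduced user-space sketch of the same idea in C11 atomics (hypothetical names; a simple lock-free push stands in for llist_add()):

/* Sketch only: one node per source, self-pointing while off-list. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct lnode {
	_Atomic(struct lnode *) next;	/* == self while not queued */
};

struct lhead {
	_Atomic(struct lnode *) first;
};

static void lnode_init(struct lnode *n)
{
	atomic_store(&n->next, n);	/* self-pointing means "not queued" */
}

/* Returns true only for the single caller that must complete the push,
 * even if claim attempts stack up (irq/nmi-style reentry). */
static bool lnode_claim(struct lnode *n)
{
	struct lnode *self = n;

	return atomic_compare_exchange_strong(&n->next, &self, NULL);
}

/* Lock-free push; the consumer would splice the list off and call
 * lnode_init() again on each node before processing it. */
static void lhead_push(struct lhead *h, struct lnode *n)
{
	struct lnode *first = atomic_load(&h->first);

	do {
		atomic_store(&n->next, first);
	} while (!atomic_compare_exchange_weak(&h->first, &first, n));
}
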
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 335b8425dd4b..a4ef79591eb2 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -21,6 +21,7 @@
#include <linux/reboot.h>
#include <linux/btf.h>
#include <linux/objtool.h>
+#include <linux/delay.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -33,6 +34,11 @@
/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;
+/* time to wait for possible DMA to finish before starting the kdump kernel
+ * when a CMA reservation is used
+ */
+#define CMA_DMA_TIMEOUT_SEC 10
+
#ifdef CONFIG_CRASH_DUMP
int kimage_crash_copy_vmcoreinfo(struct kimage *image)
@@ -97,6 +103,14 @@ int kexec_crash_loaded(void)
}
EXPORT_SYMBOL_GPL(kexec_crash_loaded);
+static void crash_cma_clear_pending_dma(void)
+{
+ if (!crashk_cma_cnt)
+ return;
+
+ mdelay(CMA_DMA_TIMEOUT_SEC * 1000);
+}
+
/*
* No panic_cpu check version of crash_kexec(). This function is called
* only when panic_cpu holds the current CPU number; this is the only CPU
@@ -119,6 +133,7 @@ void __noclone __crash_kexec(struct pt_regs *regs)
crash_setup_regs(&fixed_regs, regs);
crash_save_vmcoreinfo();
machine_crash_shutdown(&fixed_regs);
+ crash_cma_clear_pending_dma();
machine_kexec(kexec_crash_image);
}
kexec_unlock();
diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c
index acb6bf42e30d..87bf4d41eabb 100644
--- a/kernel/crash_reserve.c
+++ b/kernel/crash_reserve.c
@@ -14,6 +14,8 @@
#include <linux/cpuhotplug.h>
#include <linux/memblock.h>
#include <linux/kmemleak.h>
+#include <linux/cma.h>
+#include <linux/crash_reserve.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -172,17 +174,19 @@ static int __init parse_crashkernel_simple(char *cmdline,
#define SUFFIX_HIGH 0
#define SUFFIX_LOW 1
-#define SUFFIX_NULL 2
+#define SUFFIX_CMA 2
+#define SUFFIX_NULL 3
static __initdata char *suffix_tbl[] = {
[SUFFIX_HIGH] = ",high",
[SUFFIX_LOW] = ",low",
+ [SUFFIX_CMA] = ",cma",
[SUFFIX_NULL] = NULL,
};
/*
* That function parses "suffix" crashkernel command lines like
*
- * crashkernel=size,[high|low]
+ * crashkernel=size,[high|low|cma]
*
* It returns 0 on success and -EINVAL on failure.
*/
@@ -298,9 +302,11 @@ int __init parse_crashkernel(char *cmdline,
unsigned long long *crash_size,
unsigned long long *crash_base,
unsigned long long *low_size,
+ unsigned long long *cma_size,
bool *high)
{
int ret;
+ unsigned long long __always_unused cma_base;
/* crashkernel=X[@offset] */
ret = __parse_crashkernel(cmdline, system_ram, crash_size,
@@ -331,6 +337,14 @@ int __init parse_crashkernel(char *cmdline,
*high = true;
}
+
+ /*
+ * optional CMA reservation
+ * cma_base is ignored
+ */
+ if (cma_size)
+ __parse_crashkernel(cmdline, 0, cma_size,
+ &cma_base, suffix_tbl[SUFFIX_CMA]);
#endif
if (!*crash_size)
ret = -EINVAL;
@@ -457,6 +471,56 @@ retry:
#endif
}
+struct range crashk_cma_ranges[CRASHKERNEL_CMA_RANGES_MAX];
+#ifdef CRASHKERNEL_CMA
+int crashk_cma_cnt;
+void __init reserve_crashkernel_cma(unsigned long long cma_size)
+{
+ unsigned long long request_size = roundup(cma_size, PAGE_SIZE);
+ unsigned long long reserved_size = 0;
+
+ if (!cma_size)
+ return;
+
+ while (cma_size > reserved_size &&
+ crashk_cma_cnt < CRASHKERNEL_CMA_RANGES_MAX) {
+
+ struct cma *res;
+
+ if (cma_declare_contiguous(0, request_size, 0, 0, 0, false,
+ "crashkernel", &res)) {
+ /* reservation failed, try half-sized blocks */
+ if (request_size <= PAGE_SIZE)
+ break;
+
+ request_size = roundup(request_size / 2, PAGE_SIZE);
+ continue;
+ }
+
+ crashk_cma_ranges[crashk_cma_cnt].start = cma_get_base(res);
+ crashk_cma_ranges[crashk_cma_cnt].end =
+ crashk_cma_ranges[crashk_cma_cnt].start +
+ cma_get_size(res) - 1;
+ ++crashk_cma_cnt;
+ reserved_size += request_size;
+ }
+
+ if (cma_size > reserved_size)
+ pr_warn("crashkernel CMA reservation failed: %lld MB requested, %lld MB reserved in %d ranges\n",
+ cma_size >> 20, reserved_size >> 20, crashk_cma_cnt);
+ else
+ pr_info("crashkernel CMA reserved: %lld MB in %d ranges\n",
+ reserved_size >> 20, crashk_cma_cnt);
+}
+
+#else /* CRASHKERNEL_CMA */
+void __init reserve_crashkernel_cma(unsigned long long cma_size)
+{
+ if (cma_size)
+ pr_warn("crashkernel CMA reservation not supported\n");
+}
+#endif
+
#ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
static __init int insert_crashkernel_resources(void)
{
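
reserve_crashkernel_cma() above retries with progressively halved, page-aligned block sizes until the request is covered or the block would drop below a page. A condensed user-space sketch of that strategy, with a hypothetical try_reserve() standing in for cma_declare_contiguous():

/* Condensed sketch of the halving retry loop above; try_reserve() is a
 * hypothetical stand-in that only satisfies blocks of at most 64 MiB. */
#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096ULL
#define MAX_RANGES 4

static int try_reserve(unsigned long long size)
{
	return size > (64ULL << 20) ? -1 : 0;
}

static unsigned long long reserve_total(unsigned long long want)
{
	unsigned long long request = want, reserved = 0;
	int ranges = 0;

	while (reserved < want && ranges < MAX_RANGES) {
		if (try_reserve(request)) {
			/* Reservation failed: fall back to half-sized blocks. */
			if (request <= SKETCH_PAGE_SIZE)
				break;
			request = ((request / 2) + SKETCH_PAGE_SIZE - 1) &
				  ~(SKETCH_PAGE_SIZE - 1);
			continue;
		}
		reserved += request;
		ranges++;
	}
	return reserved;
}

int main(void)
{
	/* Asking for 200 MiB ends up covered by several <= 64 MiB ranges. */
	printf("reserved %llu MiB\n", reserve_total(200ULL << 20) >> 20);
	return 0;
}
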
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 22fdf0c187cd..8060c2857bb2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6842,10 +6842,20 @@ static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf)
return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS;
}
+static int perf_mmap_may_split(struct vm_area_struct *vma, unsigned long addr)
+{
+ /*
+ * Forbid splitting perf mappings to prevent refcount leaks due to
+ * the resulting non-matching offsets and sizes. See open()/close().
+ */
+ return -EINVAL;
+}
+
static const struct vm_operations_struct perf_mmap_vmops = {
.open = perf_mmap_open,
.close = perf_mmap_close, /* non mergeable */
.pfn_mkwrite = perf_mmap_pfn_mkwrite,
+ .may_split = perf_mmap_may_split,
};
static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
@@ -7051,8 +7061,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
ret = 0;
goto unlock;
}
-
- atomic_set(&rb->aux_mmap_count, 1);
}
user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
@@ -7115,15 +7123,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
perf_event_update_time(event);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
+ ret = 0;
} else {
ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
event->attr.aux_watermark, flags);
- if (!ret)
+ if (!ret) {
+ atomic_set(&rb->aux_mmap_count, 1);
rb->aux_mmap_locked = extra;
+ }
}
- ret = 0;
-
unlock:
if (!ret) {
atomic_long_add(user_extra, &user->locked_vm);
@@ -7131,6 +7140,7 @@ unlock:
atomic_inc(&event->mmap_count);
} else if (rb) {
+ /* AUX allocation failed */
atomic_dec(&rb->mmap_count);
}
aux_unlock:
@@ -7138,6 +7148,9 @@ aux_unlock:
mutex_unlock(aux_mutex);
mutex_unlock(&event->mmap_mutex);
+ if (ret)
+ return ret;
+
/*
* Since pinned accounting is per vm we cannot allow fork() to copy our
* vma.
@@ -7145,13 +7158,20 @@ aux_unlock:
vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
vma->vm_ops = &perf_mmap_vmops;
- if (!ret)
- ret = map_range(rb, vma);
-
mapped = get_mapped(event, event_mapped);
if (mapped)
mapped(event, vma->vm_mm);
+ /*
+ * Try to map it into the page table. On fail, invoke
+ * perf_mmap_close() to undo the above, as the callsite expects
+ * full cleanup in this case and therefore does not invoke
+ * vmops::close().
+ */
+ ret = map_range(rb, vma);
+ if (ret)
+ perf_mmap_close(vma);
+
return ret;
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 4c965ba77f9f..7ca1940607bd 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -436,8 +436,7 @@ static int __uprobe_write_opcode(struct vm_area_struct *vma,
* there are no unexpected folio references ...
*/
if (is_register || userfaultfd_missing(vma) ||
- (folio_ref_count(folio) != folio_mapcount(folio) + 1 +
- folio_test_swapcache(folio) * folio_nr_pages(folio)))
+ (folio_ref_count(folio) != folio_expected_ref_count(folio) + 1))
goto remap;
/*
@@ -540,7 +539,7 @@ retry:
}
ret = 0;
- if (unlikely(!folio_test_anon(folio))) {
+ if (unlikely(!folio_test_anon(folio) || folio_is_zone_device(folio))) {
VM_WARN_ON_ONCE(is_register);
folio_put(folio);
goto out;
@@ -581,8 +580,8 @@ retry:
out:
/* Revert back reference counter if instruction update failed. */
- if (ret < 0 && is_register && ref_ctr_updated)
- update_ref_ctr(uprobe, mm, -1);
+ if (ret < 0 && ref_ctr_updated)
+ update_ref_ctr(uprobe, mm, is_register ? -1 : 1);
/* try collapse pmd for compound page */
if (ret > 0)
diff --git a/kernel/exit.c b/kernel/exit.c
index bb184a67ac73..343eb97543d5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -68,6 +68,7 @@
#include <linux/rethook.h>
#include <linux/sysfs.h>
#include <linux/user_events.h>
+#include <linux/unwind_deferred.h>
#include <linux/uaccess.h>
#include <linux/pidfs.h>
@@ -692,12 +693,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
}
/*
- * This does two things:
- *
- * A. Make init inherit all the child processes
- * B. Check to see if any process groups have become orphaned
- * as a result of our exiting, and if they have any stopped
- * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
+ * Make init inherit all the child processes
*/
static void forget_original_parent(struct task_struct *father,
struct list_head *dead)
@@ -938,6 +934,7 @@ void __noreturn do_exit(long code)
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
+ unwind_deferred_task_exit(tsk);
trace_sched_process_exit(tsk, group_dead);
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 52901fe4a3c2..c4ada32598bd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -105,6 +105,7 @@
#include <uapi/linux/pidfd.h>
#include <linux/pidfs.h>
#include <linux/tick.h>
+#include <linux/unwind_deferred.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -188,33 +189,33 @@ static inline void free_task_struct(struct task_struct *tsk)
kmem_cache_free(task_struct_cachep, tsk);
}
-/*
- * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
- * kmemcache based allocator.
- */
-# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
-
-# ifdef CONFIG_VMAP_STACK
+#ifdef CONFIG_VMAP_STACK
/*
* vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
* flush. Try to minimize the number of calls by caching stacks.
*/
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
+/*
+ * Allocated stacks are cached and later reused by new threads, so memcg
+ * accounting is performed by the code assigning/releasing stacks to tasks.
+ * We need a zeroed memory without __GFP_ACCOUNT.
+ */
+#define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO)
struct vm_stack {
struct rcu_head rcu;
struct vm_struct *stack_vm_area;
};
-static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
+static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area)
{
unsigned int i;
for (i = 0; i < NR_CACHED_STACKS; i++) {
struct vm_struct *tmp = NULL;
- if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm))
+ if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area))
return true;
}
return false;
@@ -223,11 +224,12 @@ static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
static void thread_stack_free_rcu(struct rcu_head *rh)
{
struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);
+ struct vm_struct *vm_area = vm_stack->stack_vm_area;
if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
return;
- vfree(vm_stack);
+ vfree(vm_area->addr);
}
static void thread_stack_delayed_free(struct task_struct *tsk)
@@ -240,32 +242,32 @@ static void thread_stack_delayed_free(struct task_struct *tsk)
static int free_vm_stack_cache(unsigned int cpu)
{
- struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
+ struct vm_struct **cached_vm_stack_areas = per_cpu_ptr(cached_stacks, cpu);
int i;
for (i = 0; i < NR_CACHED_STACKS; i++) {
- struct vm_struct *vm_stack = cached_vm_stacks[i];
+ struct vm_struct *vm_area = cached_vm_stack_areas[i];
- if (!vm_stack)
+ if (!vm_area)
continue;
- vfree(vm_stack->addr);
- cached_vm_stacks[i] = NULL;
+ vfree(vm_area->addr);
+ cached_vm_stack_areas[i] = NULL;
}
return 0;
}
-static int memcg_charge_kernel_stack(struct vm_struct *vm)
+static int memcg_charge_kernel_stack(struct vm_struct *vm_area)
{
int i;
int ret;
int nr_charged = 0;
- BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+ BUG_ON(vm_area->nr_pages != THREAD_SIZE / PAGE_SIZE);
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
- ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0);
+ ret = memcg_kmem_charge_page(vm_area->pages[i], GFP_KERNEL, 0);
if (ret)
goto err;
nr_charged++;
@@ -273,55 +275,47 @@ static int memcg_charge_kernel_stack(struct vm_struct *vm)
return 0;
err:
for (i = 0; i < nr_charged; i++)
- memcg_kmem_uncharge_page(vm->pages[i], 0);
+ memcg_kmem_uncharge_page(vm_area->pages[i], 0);
return ret;
}
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
- struct vm_struct *vm;
+ struct vm_struct *vm_area;
void *stack;
int i;
for (i = 0; i < NR_CACHED_STACKS; i++) {
- struct vm_struct *s;
-
- s = this_cpu_xchg(cached_stacks[i], NULL);
-
- if (!s)
+ vm_area = this_cpu_xchg(cached_stacks[i], NULL);
+ if (!vm_area)
continue;
/* Reset stack metadata. */
- kasan_unpoison_range(s->addr, THREAD_SIZE);
+ kasan_unpoison_range(vm_area->addr, THREAD_SIZE);
- stack = kasan_reset_tag(s->addr);
+ stack = kasan_reset_tag(vm_area->addr);
/* Clear stale pointers from reused stack. */
memset(stack, 0, THREAD_SIZE);
- if (memcg_charge_kernel_stack(s)) {
- vfree(s->addr);
+ if (memcg_charge_kernel_stack(vm_area)) {
+ vfree(vm_area->addr);
return -ENOMEM;
}
- tsk->stack_vm_area = s;
+ tsk->stack_vm_area = vm_area;
tsk->stack = stack;
return 0;
}
- /*
- * Allocated stacks are cached and later reused by new threads,
- * so memcg accounting is performed manually on assigning/releasing
- * stacks to tasks. Drop __GFP_ACCOUNT.
- */
stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN,
- THREADINFO_GFP & ~__GFP_ACCOUNT,
+ GFP_VMAP_STACK,
node, __builtin_return_address(0));
if (!stack)
return -ENOMEM;
- vm = find_vm_area(stack);
- if (memcg_charge_kernel_stack(vm)) {
+ vm_area = find_vm_area(stack);
+ if (memcg_charge_kernel_stack(vm_area)) {
vfree(stack);
return -ENOMEM;
}
@@ -330,7 +324,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
* free_thread_stack() can be called in interrupt context,
* so cache the vm_struct.
*/
- tsk->stack_vm_area = vm;
+ tsk->stack_vm_area = vm_area;
stack = kasan_reset_tag(stack);
tsk->stack = stack;
return 0;
@@ -345,7 +339,13 @@ static void free_thread_stack(struct task_struct *tsk)
tsk->stack_vm_area = NULL;
}
-# else /* !CONFIG_VMAP_STACK */
+#else /* !CONFIG_VMAP_STACK */
+
+/*
+ * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
+ * kmemcache based allocator.
+ */
+#if THREAD_SIZE >= PAGE_SIZE
static void thread_stack_free_rcu(struct rcu_head *rh)
{
@@ -377,8 +377,7 @@ static void free_thread_stack(struct task_struct *tsk)
tsk->stack = NULL;
}
-# endif /* CONFIG_VMAP_STACK */
-# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */
+#else /* !(THREAD_SIZE >= PAGE_SIZE) */
static struct kmem_cache *thread_stack_cache;
@@ -417,7 +416,8 @@ void thread_stack_cache_init(void)
BUG_ON(thread_stack_cache == NULL);
}
-# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
+#endif /* THREAD_SIZE >= PAGE_SIZE */
+#endif /* CONFIG_VMAP_STACK */
/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;
@@ -437,11 +437,11 @@ static struct kmem_cache *mm_cachep;
static void account_kernel_stack(struct task_struct *tsk, int account)
{
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
- struct vm_struct *vm = task_stack_vm_area(tsk);
+ struct vm_struct *vm_area = task_stack_vm_area(tsk);
int i;
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
- mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB,
+ mod_lruvec_page_state(vm_area->pages[i], NR_KERNEL_STACK_KB,
account * (PAGE_SIZE / 1024));
} else {
void *stack = task_stack_page(tsk);
@@ -457,12 +457,12 @@ void exit_task_stack_account(struct task_struct *tsk)
account_kernel_stack(tsk, -1);
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
- struct vm_struct *vm;
+ struct vm_struct *vm_area;
int i;
- vm = task_stack_vm_area(tsk);
+ vm_area = task_stack_vm_area(tsk);
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
- memcg_kmem_uncharge_page(vm->pages[i], 0);
+ memcg_kmem_uncharge_page(vm_area->pages[i], 0);
}
}
@@ -585,9 +585,12 @@ static void check_mm(struct mm_struct *mm)
for (i = 0; i < NR_MM_COUNTERS; i++) {
long x = percpu_counter_sum(&mm->rss_stat[i]);
- if (unlikely(x))
- pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
- mm, resident_page_types[i], x);
+ if (unlikely(x)) {
+ pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n",
+ mm, resident_page_types[i], x,
+ current->comm,
+ task_pid_nr(current));
+ }
}
if (mm_pgtables_bytes(mm))
@@ -732,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(refcount_read(&tsk->usage));
WARN_ON(tsk == current);
+ unwind_task_free(tsk);
sched_ext_free(tsk);
io_uring_free(tsk);
cgroup_free(tsk);
@@ -2135,6 +2139,8 @@ __latent_entropy struct task_struct *copy_process(
p->bpf_ctx = NULL;
#endif
+ unwind_task_init(p);
+
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
if (retval)
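
The fork.c changes above mostly rename the VMAP stack-cache variables; the cache itself hands stacks out with this_cpu_xchg() and parks them back with this_cpu_try_cmpxchg(). A reduced single-"CPU" sketch of that claim/park pattern in C11 atomics (the buffer pointers here are hypothetical):

/* Sketch of a small slot cache claimed and refilled with atomic
 * exchanges; the kernel keys the slots per CPU. */
#include <stdatomic.h>
#include <stddef.h>

#define NR_CACHED 2
static _Atomic(void *) cached[NR_CACHED];

/* Grab a previously released buffer, if any slot holds one. */
static void *cache_get(void)
{
	for (int i = 0; i < NR_CACHED; i++) {
		void *buf = atomic_exchange(&cached[i], NULL);
		if (buf)
			return buf;
	}
	return NULL;	/* caller falls back to a fresh allocation */
}

/* Park a released buffer in an empty slot; 0 means the caller frees it. */
static int cache_put(void *buf)
{
	for (int i = 0; i < NR_CACHED; i++) {
		void *expected = NULL;

		if (atomic_compare_exchange_strong(&cached[i], &expected, buf))
			return 1;
	}
	return 0;
}
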
diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
index c9e5dc068e85..896a503dfb29 100755
--- a/kernel/gen_kheaders.sh
+++ b/kernel/gen_kheaders.sh
@@ -4,79 +4,34 @@
# This script generates an archive consisting of kernel headers
# for CONFIG_IKHEADERS.
set -e
-sfile="$(readlink -f "$0")"
-outdir="$(pwd)"
tarfile=$1
-tmpdir=$outdir/${tarfile%/*}/.tmp_dir
-
-dir_list="
-include/
-arch/$SRCARCH/include/
-"
-
-# Support incremental builds by skipping archive generation
-# if timestamps of files being archived are not changed.
-
-# This block is useful for debugging the incremental builds.
-# Uncomment it for debugging.
-# if [ ! -f /tmp/iter ]; then iter=1; echo 1 > /tmp/iter;
-# else iter=$(($(cat /tmp/iter) + 1)); echo $iter > /tmp/iter; fi
-# find $all_dirs -name "*.h" | xargs ls -l > /tmp/ls-$iter
-
-all_dirs=
-if [ "$building_out_of_srctree" ]; then
- for d in $dir_list; do
- all_dirs="$all_dirs $srctree/$d"
- done
-fi
-all_dirs="$all_dirs $dir_list"
-
-# include/generated/utsversion.h is ignored because it is generated after this
-# script is executed. (utsversion.h is unneeded for kheaders)
-#
-# When Kconfig regenerates include/generated/autoconf.h, its timestamp is
-# updated, but the contents might be still the same. When any CONFIG option is
-# changed, Kconfig touches the corresponding timestamp file include/config/*.
-# Hence, the md5sum detects the configuration change anyway. We do not need to
-# check include/generated/autoconf.h explicitly.
-#
-# Ignore them for md5 calculation to avoid pointless regeneration.
-headers_md5="$(find $all_dirs -name "*.h" -a \
- ! -path include/generated/utsversion.h -a \
- ! -path include/generated/autoconf.h |
- xargs ls -l | md5sum | cut -d ' ' -f1)"
-
-# Any changes to this script will also cause a rebuild of the archive.
-this_file_md5="$(ls -l $sfile | md5sum | cut -d ' ' -f1)"
-if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi
-if [ -f kernel/kheaders.md5 ] &&
- [ "$(head -n 1 kernel/kheaders.md5)" = "$headers_md5" ] &&
- [ "$(head -n 2 kernel/kheaders.md5 | tail -n 1)" = "$this_file_md5" ] &&
- [ "$(tail -n 1 kernel/kheaders.md5)" = "$tarfile_md5" ]; then
- exit
-fi
-
-echo " GEN $tarfile"
+srclist=$2
+objlist=$3
+timestamp=$4
+
+dir=$(dirname "${tarfile}")
+tmpdir=${dir}/.tmp_dir
+depfile=${dir}/.$(basename "${tarfile}").d
+
+# generate dependency list.
+{
+ echo
+ echo "deps_${tarfile} := \\"
+ sed 's:\(.*\): \1 \\:' "${srclist}"
+ sed -n '/^include\/generated\/autoconf\.h$/!s:\(.*\): \1 \\:p' "${objlist}"
+ echo
+ echo "${tarfile}: \$(deps_${tarfile})"
+ echo
+ echo "\$(deps_${tarfile}):"
+
+} > "${depfile}"
rm -rf "${tmpdir}"
mkdir "${tmpdir}"
-if [ "$building_out_of_srctree" ]; then
- (
- cd $srctree
- for f in $dir_list
- do find "$f" -name "*.h";
- done | tar -c -f - -T - | tar -xf - -C "${tmpdir}"
- )
-fi
-
-for f in $dir_list;
- do find "$f" -name "*.h";
-done | tar -c -f - -T - | tar -xf - -C "${tmpdir}"
-
-# Always exclude include/generated/utsversion.h
-# Otherwise, the contents of the tarball may vary depending on the build steps.
-rm -f "${tmpdir}/include/generated/utsversion.h"
+# shellcheck disable=SC2154 # srctree is passed as an env variable
+sed "s:^${srctree}/::" "${srclist}" | ${TAR} -c -f - -C "${srctree}" -T - | ${TAR} -xf - -C "${tmpdir}"
+${TAR} -c -f - -T "${objlist}" | ${TAR} -xf - -C "${tmpdir}"
# Remove comments except SDPX lines
# Use a temporary file to store directory contents to prevent find/xargs from
@@ -88,12 +43,8 @@ xargs -0 -P8 -n1 \
rm -f "${tmpdir}.contents.txt"
# Create archive and try to normalize metadata for reproducibility.
-tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \
+${TAR} "${timestamp:+--mtime=$timestamp}" \
--owner=0 --group=0 --sort=name --numeric-owner --mode=u=rw,go=r,a+X \
- -I $XZ -cf $tarfile -C "${tmpdir}/" . > /dev/null
-
-echo $headers_md5 > kernel/kheaders.md5
-echo "$this_file_md5" >> kernel/kheaders.md5
-echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5
+ -I "${XZ}" -cf "${tarfile}" -C "${tmpdir}/" . > /dev/null
rm -rf "${tmpdir}"
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index d2432df2b905..8708a1205f82 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -23,6 +23,7 @@
#include <linux/sched/debug.h>
#include <linux/sched/sysctl.h>
#include <linux/hung_task.h>
+#include <linux/rwsem.h>
#include <trace/events/sched.h>
@@ -100,6 +101,7 @@ static void debug_show_blocker(struct task_struct *task)
{
struct task_struct *g, *t;
unsigned long owner, blocker, blocker_type;
+ const char *rwsem_blocked_by, *rwsem_blocked_as;
RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held");
@@ -111,12 +113,20 @@ static void debug_show_blocker(struct task_struct *task)
switch (blocker_type) {
case BLOCKER_TYPE_MUTEX:
- owner = mutex_get_owner(
- (struct mutex *)hung_task_blocker_to_lock(blocker));
+ owner = mutex_get_owner(hung_task_blocker_to_lock(blocker));
break;
case BLOCKER_TYPE_SEM:
- owner = sem_last_holder(
- (struct semaphore *)hung_task_blocker_to_lock(blocker));
+ owner = sem_last_holder(hung_task_blocker_to_lock(blocker));
+ break;
+ case BLOCKER_TYPE_RWSEM_READER:
+ case BLOCKER_TYPE_RWSEM_WRITER:
+ owner = (unsigned long)rwsem_owner(
+ hung_task_blocker_to_lock(blocker));
+ rwsem_blocked_as = (blocker_type == BLOCKER_TYPE_RWSEM_READER) ?
+ "reader" : "writer";
+ rwsem_blocked_by = is_rwsem_reader_owned(
+ hung_task_blocker_to_lock(blocker)) ?
+ "reader" : "writer";
break;
default:
WARN_ON_ONCE(1);
@@ -134,6 +144,11 @@ static void debug_show_blocker(struct task_struct *task)
pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n",
task->comm, task->pid);
break;
+ case BLOCKER_TYPE_RWSEM_READER:
+ case BLOCKER_TYPE_RWSEM_WRITER:
+ pr_err("INFO: task %s:%d is blocked on an rw-semaphore, but the owner is not found.\n",
+ task->comm, task->pid);
+ break;
}
return;
}
@@ -152,6 +167,12 @@ static void debug_show_blocker(struct task_struct *task)
pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n",
task->comm, task->pid, t->comm, t->pid);
break;
+ case BLOCKER_TYPE_RWSEM_READER:
+ case BLOCKER_TYPE_RWSEM_WRITER:
+ pr_err("INFO: task %s:%d <%s> blocked on an rw-semaphore likely owned by task %s:%d <%s>\n",
+ task->comm, task->pid, rwsem_blocked_as, t->comm,
+ t->pid, rwsem_blocked_by);
+ break;
}
sched_show_task(t);
return;
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 624106e886ad..0d0276378c70 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -611,7 +611,13 @@ void handle_simple_irq(struct irq_desc *desc)
{
guard(raw_spinlock)(&desc->lock);
- if (!irq_can_handle(desc))
+ if (!irq_can_handle_pm(desc)) {
+ if (irqd_needs_resend_when_in_progress(&desc->irq_data))
+ desc->istate |= IRQS_PENDING;
+ return;
+ }
+
+ if (!irq_can_handle_actions(desc))
return;
kstat_incr_irqs_this_cpu(desc);
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 187ba1b80bda..1d85597057e1 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -552,7 +552,7 @@ static int kcov_get_mode(unsigned long arg)
/*
* Fault in a lazily-faulted vmalloc area before it can be used by
- * __santizer_cov_trace_pc(), to avoid recursion issues if any code on the
+ * __sanitizer_cov_trace_pc(), to avoid recursion issues if any code on the
* vmalloc fault handling path is instrumented.
*/
static void kcov_fault_in_area(struct kcov *kcov)
diff --git a/kernel/kexec.c b/kernel/kexec.c
index a6b3f96bb50c..28008e3d462e 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -152,7 +152,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
goto out;
for (i = 0; i < nr_segments; i++) {
- ret = kimage_load_segment(image, &image->segment[i]);
+ ret = kimage_load_segment(image, i);
if (ret)
goto out;
}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 351cd7d76dfa..31203f0bacaf 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -40,6 +40,7 @@
#include <linux/hugetlb.h>
#include <linux/objtool.h>
#include <linux/kmsg_dump.h>
+#include <linux/dma-map-ops.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -553,6 +554,24 @@ static void kimage_free_entry(kimage_entry_t entry)
kimage_free_pages(page);
}
+static void kimage_free_cma(struct kimage *image)
+{
+ unsigned long i;
+
+ for (i = 0; i < image->nr_segments; i++) {
+ struct page *cma = image->segment_cma[i];
+ u32 nr_pages = image->segment[i].memsz >> PAGE_SHIFT;
+
+ if (!cma)
+ continue;
+
+ arch_kexec_pre_free_pages(page_address(cma), nr_pages);
+ dma_release_from_contiguous(NULL, cma, nr_pages);
+ image->segment_cma[i] = NULL;
+ }
+
+}
+
void kimage_free(struct kimage *image)
{
kimage_entry_t *ptr, entry;
@@ -591,6 +610,9 @@ void kimage_free(struct kimage *image)
/* Free the kexec control pages... */
kimage_free_page_list(&image->control_pages);
+ /* Free CMA allocations */
+ kimage_free_cma(image);
+
/*
* Free up any temporary buffers allocated. This might hit if
* error occurred much later after buffer allocation.
@@ -716,9 +738,69 @@ static struct page *kimage_alloc_page(struct kimage *image,
return page;
}
-static int kimage_load_normal_segment(struct kimage *image,
- struct kexec_segment *segment)
+static int kimage_load_cma_segment(struct kimage *image, int idx)
+{
+ struct kexec_segment *segment = &image->segment[idx];
+ struct page *cma = image->segment_cma[idx];
+ char *ptr = page_address(cma);
+ unsigned long maddr;
+ size_t ubytes, mbytes;
+ int result = 0;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
+
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
+ ubytes = segment->bufsz;
+ mbytes = segment->memsz;
+ maddr = segment->mem;
+
+ /* Then copy from source buffer to the CMA one */
+ while (mbytes) {
+ size_t uchunk, mchunk;
+
+ ptr += maddr & ~PAGE_MASK;
+ mchunk = min_t(size_t, mbytes,
+ PAGE_SIZE - (maddr & ~PAGE_MASK));
+ uchunk = min(ubytes, mchunk);
+
+ if (uchunk) {
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
+ ubytes -= uchunk;
+ if (image->file_mode)
+ kbuf += uchunk;
+ else
+ buf += uchunk;
+ }
+
+ if (result) {
+ result = -EFAULT;
+ goto out;
+ }
+
+ ptr += mchunk;
+ maddr += mchunk;
+ mbytes -= mchunk;
+
+ cond_resched();
+ }
+
+ /* Clear any remainder */
+ memset(ptr, 0, mbytes);
+
+out:
+ return result;
+}
+
+static int kimage_load_normal_segment(struct kimage *image, int idx)
{
+ struct kexec_segment *segment = &image->segment[idx];
unsigned long maddr;
size_t ubytes, mbytes;
int result;
@@ -733,6 +815,9 @@ static int kimage_load_normal_segment(struct kimage *image,
mbytes = segment->memsz;
maddr = segment->mem;
+ if (image->segment_cma[idx])
+ return kimage_load_cma_segment(image, idx);
+
result = kimage_set_destination(image, maddr);
if (result < 0)
goto out;
@@ -787,13 +872,13 @@ out:
}
#ifdef CONFIG_CRASH_DUMP
-static int kimage_load_crash_segment(struct kimage *image,
- struct kexec_segment *segment)
+static int kimage_load_crash_segment(struct kimage *image, int idx)
{
/* For crash dumps kernels we simply copy the data from
* user space to it's destination.
* We do things a page at a time for the sake of kmap.
*/
+ struct kexec_segment *segment = &image->segment[idx];
unsigned long maddr;
size_t ubytes, mbytes;
int result;
@@ -858,18 +943,17 @@ out:
}
#endif
-int kimage_load_segment(struct kimage *image,
- struct kexec_segment *segment)
+int kimage_load_segment(struct kimage *image, int idx)
{
int result = -ENOMEM;
switch (image->type) {
case KEXEC_TYPE_DEFAULT:
- result = kimage_load_normal_segment(image, segment);
+ result = kimage_load_normal_segment(image, idx);
break;
#ifdef CONFIG_CRASH_DUMP
case KEXEC_TYPE_CRASH:
- result = kimage_load_crash_segment(image, segment);
+ result = kimage_load_crash_segment(image, idx);
break;
#endif
}
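
kimage_load_cma_segment() above copies the source buffer into the contiguous CMA block one page-bounded chunk at a time and zero-fills whatever the source does not cover. A minimal user-space sketch of that chunking arithmetic (plain memcpy over a contiguous buffer; the zeroing is folded into each chunk here for brevity):

#include <stddef.h>
#include <string.h>

#define SKETCH_PAGE_SIZE 4096UL

/*
 * Fill mbytes of destination (mapped at dst, "physical" address maddr)
 * with ubytes of source data, stepping one page-bounded chunk at a time
 * and zeroing whatever the source does not cover.
 */
static void load_window(char *dst, unsigned long maddr,
			const char *src, size_t ubytes, size_t mbytes)
{
	while (mbytes) {
		size_t off = maddr & (SKETCH_PAGE_SIZE - 1);
		size_t room = SKETCH_PAGE_SIZE - off;
		size_t mchunk = mbytes < room ? mbytes : room;
		size_t uchunk = ubytes < mchunk ? ubytes : mchunk;

		memcpy(dst, src, uchunk);
		memset(dst + uchunk, 0, mchunk - uchunk);

		src += uchunk;
		ubytes -= uchunk;
		dst += mchunk;
		maddr += mchunk;
		mbytes -= mchunk;
	}
}
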
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index b835033c65eb..91d46502a817 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -26,6 +26,7 @@
#include <linux/kernel_read_file.h>
#include <linux/syscalls.h>
#include <linux/vmalloc.h>
+#include <linux/dma-map-ops.h>
#include "kexec_internal.h"
#ifdef CONFIG_KEXEC_SIG
@@ -253,6 +254,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
ret = 0;
}
+ image->no_cma = !!(flags & KEXEC_FILE_NO_CMA);
+
if (cmdline_len) {
image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len);
if (IS_ERR(image->cmdline_buf)) {
@@ -434,7 +437,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
i, ksegment->buf, ksegment->bufsz, ksegment->mem,
ksegment->memsz);
- ret = kimage_load_segment(image, &image->segment[i]);
+ ret = kimage_load_segment(image, i);
if (ret)
goto out;
}
@@ -663,6 +666,43 @@ static int kexec_walk_resources(struct kexec_buf *kbuf,
return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
}
+static int kexec_alloc_contig(struct kexec_buf *kbuf)
+{
+ size_t nr_pages = kbuf->memsz >> PAGE_SHIFT;
+ unsigned long mem;
+ struct page *p;
+
+ /* User space disabled CMA allocations, bail out. */
+ if (kbuf->image->no_cma)
+ return -EPERM;
+
+ /* Skip CMA logic for crash kernel */
+ if (kbuf->image->type == KEXEC_TYPE_CRASH)
+ return -EPERM;
+
+ p = dma_alloc_from_contiguous(NULL, nr_pages, get_order(kbuf->buf_align), true);
+ if (!p)
+ return -ENOMEM;
+
+ pr_debug("allocated %zu DMA pages at 0x%lx", nr_pages, page_to_boot_pfn(p));
+
+ mem = page_to_boot_pfn(p) << PAGE_SHIFT;
+
+ if (kimage_is_destination_range(kbuf->image, mem, mem + kbuf->memsz)) {
+ /* Our region is already in use by a statically defined one. Bail out. */
+ pr_debug("CMA overlaps existing mem: 0x%lx+0x%lx\n", mem, kbuf->memsz);
+ dma_release_from_contiguous(NULL, p, nr_pages);
+ return -EBUSY;
+ }
+
+ kbuf->mem = page_to_boot_pfn(p) << PAGE_SHIFT;
+ kbuf->cma = p;
+
+ arch_kexec_post_alloc_pages(page_address(p), (int)nr_pages, 0);
+
+ return 0;
+}
+
/**
* kexec_locate_mem_hole - find free memory for the purgatory or the next kernel
* @kbuf: Parameters for the memory search.
@@ -687,6 +727,13 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
if (ret <= 0)
return ret;
+ /*
+ * Try to find a free physically contiguous block of memory first. With that, we
+ * can avoid any copying at kexec time.
+ */
+ if (!kexec_alloc_contig(kbuf))
+ return 0;
+
if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
ret = kexec_walk_resources(kbuf, locate_mem_hole_callback);
else
@@ -732,6 +779,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
/* Ensure minimum alignment needed for segments. */
kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE);
kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE);
+ kbuf->cma = NULL;
/* Walk the RAM ranges and allocate a suitable range for the buffer */
ret = arch_kexec_locate_mem_hole(kbuf);
@@ -744,6 +792,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
ksegment->bufsz = kbuf->bufsz;
ksegment->mem = kbuf->mem;
ksegment->memsz = kbuf->memsz;
+ kbuf->image->segment_cma[kbuf->image->nr_segments] = kbuf->cma;
kbuf->image->nr_segments++;
return 0;
}
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index d3b13a909913..e49743ae52c5 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -1100,8 +1100,8 @@ static void __init kho_release_scratch(void)
ulong pfn;
for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages)
- set_pageblock_migratetype(pfn_to_page(pfn),
- MIGRATE_CMA);
+ init_pageblock_migratetype(pfn_to_page(pfn),
+ MIGRATE_CMA, false);
}
}
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 30a733a55a67..228bb88c018b 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -10,7 +10,7 @@ struct kimage *do_kimage_alloc_init(void);
int sanity_check_segment_list(struct kimage *image);
void kimage_free_page_list(struct list_head *list);
void kimage_free(struct kimage *image);
-int kimage_load_segment(struct kimage *image, struct kexec_segment *segment);
+int kimage_load_segment(struct kimage *image, int idx);
void kimage_terminate(struct kimage *image);
int kimage_is_destination_range(struct kimage *image,
unsigned long start, unsigned long end);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 85fc068f0083..0e98b228a8ef 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -88,13 +88,12 @@ static inline struct kthread *to_kthread(struct task_struct *k)
/*
* Variant of to_kthread() that doesn't assume @p is a kthread.
*
- * Per construction; when:
+ * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will
+ * always remain a kthread. For kthreads p->worker_private always
+ * points to a struct kthread. For tasks that are not kthreads
+ * p->worker_private is used to point to other things.
*
- * (p->flags & PF_KTHREAD) && p->worker_private
- *
- * the task is both a kthread and struct kthread is persistent. However
- * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and
- * begin_new_exec()).
+ * Return NULL for any task that is not a kthread.
*/
static inline struct kthread *__to_kthread(struct task_struct *p)
{
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 8572dba95af4..24df4d98f7d2 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -27,6 +27,7 @@
#include <linux/export.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
+#include <linux/hung_task.h>
#include <trace/events/lock.h>
#ifndef CONFIG_PREEMPT_RT
@@ -181,11 +182,11 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
__rwsem_set_reader_owned(sem, current);
}
-#ifdef CONFIG_DEBUG_RWSEMS
+#if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER)
/*
* Return just the real task structure pointer of the owner
*/
-static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem)
+struct task_struct *rwsem_owner(struct rw_semaphore *sem)
{
return (struct task_struct *)
(atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
@@ -194,7 +195,7 @@ static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem)
/*
* Return true if the rwsem is owned by a reader.
*/
-static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
+bool is_rwsem_reader_owned(struct rw_semaphore *sem)
{
/*
* Check the count to see if it is write-locked.
@@ -207,10 +208,10 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
}
/*
- * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there
- * is a task pointer in owner of a reader-owned rwsem, it will be the
- * real owner or one of the real owners. The only exception is when the
- * unlock is done by up_read_non_owner().
+ * With CONFIG_DEBUG_RWSEMS or CONFIG_DETECT_HUNG_TASK_BLOCKER configured,
+ * it will make sure that the owner field of a reader-owned rwsem either
+ * points to a real reader-owner(s) or gets cleared. The only exception is
+ * when the unlock is done by up_read_non_owner().
*/
static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
{
@@ -1063,10 +1064,13 @@ queue:
wake_up_q(&wake_q);
trace_contention_begin(sem, LCB_F_READ);
+ set_current_state(state);
+
+ if (state == TASK_UNINTERRUPTIBLE)
+ hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_READER);
/* wait to be given the lock */
for (;;) {
- set_current_state(state);
if (!smp_load_acquire(&waiter.task)) {
/* Matches rwsem_mark_wake()'s smp_store_release(). */
break;
@@ -1081,8 +1085,12 @@ queue:
}
schedule_preempt_disabled();
lockevent_inc(rwsem_sleep_reader);
+ set_current_state(state);
}
+ if (state == TASK_UNINTERRUPTIBLE)
+ hung_task_clear_blocker();
+
__set_current_state(TASK_RUNNING);
lockevent_inc(rwsem_rlock);
trace_contention_end(sem, 0);
@@ -1144,6 +1152,9 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
set_current_state(state);
trace_contention_begin(sem, LCB_F_WRITE);
+ if (state == TASK_UNINTERRUPTIBLE)
+ hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_WRITER);
+
for (;;) {
if (rwsem_try_write_lock(sem, &waiter)) {
/* rwsem_try_write_lock() implies ACQUIRE on success */
@@ -1177,6 +1188,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
trylock_again:
raw_spin_lock_irq(&sem->wait_lock);
}
+
+ if (state == TASK_UNINTERRUPTIBLE)
+ hung_task_clear_blocker();
+
__set_current_state(TASK_RUNNING);
raw_spin_unlock_irq(&sem->wait_lock);
lockevent_inc(rwsem_wlock);
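
The pattern added to both slowpaths above can be summarized with a sketch; the wrapper below is illustrative only, while hung_task_set_blocker(), hung_task_clear_blocker() and the BLOCKER_TYPE_RWSEM_* constants are the interfaces this change relies on:

static void rwsem_wait_annotated_sketch(struct rw_semaphore *sem, int state, bool reader)
{
	set_current_state(state);

	/* Only D-state waiters are of interest to the hung-task detector. */
	if (state == TASK_UNINTERRUPTIBLE)
		hung_task_set_blocker(sem, reader ? BLOCKER_TYPE_RWSEM_READER
						  : BLOCKER_TYPE_RWSEM_WRITER);

	schedule();	/* sleep until the lock holder wakes us */

	if (state == TASK_UNINTERRUPTIBLE)
		hung_task_clear_blocker();

	__set_current_state(TASK_RUNNING);
}

With the blocker recorded, the detector can use the now non-static rwsem_owner() and is_rwsem_reader_owned() to name the task holding the semaphore in its report.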
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 51ddd8866ef3..618202578b42 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -112,6 +112,13 @@ struct find_symbol_arg {
enum mod_license license;
};
+/* modules using other modules */
+struct module_use {
+ struct list_head source_list;
+ struct list_head target_list;
+ struct module *source, *target;
+};
+
int mod_verify_sig(const void *mod, struct load_info *info);
int try_to_force_load(struct module *mod, const char *reason);
bool find_symbol(struct find_symbol_arg *fsa);
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 81f9df8859dc..c66b26184936 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -608,7 +608,7 @@ MODINFO_ATTR(version);
MODINFO_ATTR(srcversion);
static struct {
- char name[MODULE_NAME_LEN + 1];
+ char name[MODULE_NAME_LEN];
char taints[MODULE_FLAGS_BUF_SIZE];
} last_unloaded_module;
@@ -779,14 +779,16 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
struct module *mod;
char name[MODULE_NAME_LEN];
char buf[MODULE_FLAGS_BUF_SIZE];
- int ret, forced = 0;
+ int ret, len, forced = 0;
if (!capable(CAP_SYS_MODULE) || modules_disabled)
return -EPERM;
- if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
- return -EFAULT;
- name[MODULE_NAME_LEN-1] = '\0';
+ len = strncpy_from_user(name, name_user, MODULE_NAME_LEN);
+ if (len == 0 || len == MODULE_NAME_LEN)
+ return -ENOENT;
+ if (len < 0)
+ return len;
audit_log_kern_module(name);
@@ -1320,20 +1322,11 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
else
execmem_type = EXECMEM_MODULE_TEXT;
- ptr = execmem_alloc(execmem_type, size);
+ ptr = execmem_alloc_rw(execmem_type, size);
if (!ptr)
return -ENOMEM;
- if (execmem_is_rox(execmem_type)) {
- int err = execmem_make_temp_rw(ptr, size);
-
- if (err) {
- execmem_free(ptr);
- return -ENOMEM;
- }
-
- mod->mem[type].is_rox = true;
- }
+ mod->mem[type].is_rox = execmem_is_rox(execmem_type);
/*
* The pointer to these blocks of memory are stored on the module
diff --git a/kernel/padata.c b/kernel/padata.c
index 7eee94166357..f85f8bd788d0 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -63,17 +63,6 @@ static inline void padata_put_pd(struct parallel_data *pd)
padata_put_pd_cnt(pd, 1);
}
-static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
-{
- int cpu, target_cpu;
-
- target_cpu = cpumask_first(pd->cpumask.pcpu);
- for (cpu = 0; cpu < cpu_index; cpu++)
- target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu);
-
- return target_cpu;
-}
-
static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr)
{
/*
@@ -82,7 +71,7 @@ static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr)
*/
int cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu);
- return padata_index_to_cpu(pd, cpu_index);
+ return cpumask_nth(cpu_index, pd->cpumask.pcpu);
}
static struct padata_work *padata_work_alloc(void)
@@ -192,9 +181,9 @@ int padata_do_parallel(struct padata_shell *ps,
struct padata_priv *padata, int *cb_cpu)
{
struct padata_instance *pinst = ps->pinst;
- int i, cpu, cpu_index, err;
struct parallel_data *pd;
struct padata_work *pw;
+ int cpu_index, err;
rcu_read_lock_bh();
@@ -210,12 +199,7 @@ int padata_do_parallel(struct padata_shell *ps,
/* Select an alternate fallback CPU and notify the caller. */
cpu_index = *cb_cpu % cpumask_weight(pd->cpumask.cbcpu);
-
- cpu = cpumask_first(pd->cpumask.cbcpu);
- for (i = 0; i < cpu_index; i++)
- cpu = cpumask_next(cpu, pd->cpumask.cbcpu);
-
- *cb_cpu = cpu;
+ *cb_cpu = cpumask_nth(cpu_index, pd->cpumask.cbcpu);
}
err = -EBUSY;
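
Both hunks above replace the same open-coded walk with cpumask_nth(). As an equivalence sketch (illustrative only), the removed loop computes the same CPU for any index below the mask weight:

static int nth_cpu_by_loop(const struct cpumask *mask, int n)
{
	int i, cpu = cpumask_first(mask);

	for (i = 0; i < n; i++)
		cpu = cpumask_next(cpu, mask);

	/* Matches cpumask_nth(n, mask) for n < cpumask_weight(mask). */
	return cpu;
}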
@@ -261,20 +245,17 @@ EXPORT_SYMBOL(padata_do_parallel);
* be parallel processed by another cpu and is not yet present in
* the cpu's reorder queue.
*/
-static struct padata_priv *padata_find_next(struct parallel_data *pd,
- bool remove_object)
+static struct padata_priv *padata_find_next(struct parallel_data *pd, int cpu,
+ unsigned int processed)
{
struct padata_priv *padata;
struct padata_list *reorder;
- int cpu = pd->cpu;
reorder = per_cpu_ptr(pd->reorder_list, cpu);
spin_lock(&reorder->lock);
- if (list_empty(&reorder->list)) {
- spin_unlock(&reorder->lock);
- return NULL;
- }
+ if (list_empty(&reorder->list))
+ goto notfound;
padata = list_entry(reorder->list.next, struct padata_priv, list);
@@ -282,97 +263,52 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd,
* Checks the rare case where two or more parallel jobs have hashed to
* the same CPU and one of the later ones finishes first.
*/
- if (padata->seq_nr != pd->processed) {
- spin_unlock(&reorder->lock);
- return NULL;
- }
-
- if (remove_object) {
- list_del_init(&padata->list);
- ++pd->processed;
- pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu);
- }
+ if (padata->seq_nr != processed)
+ goto notfound;
+ list_del_init(&padata->list);
spin_unlock(&reorder->lock);
return padata;
+
+notfound:
+ pd->processed = processed;
+ pd->cpu = cpu;
+ spin_unlock(&reorder->lock);
+ return NULL;
}
-static void padata_reorder(struct parallel_data *pd)
+static void padata_reorder(struct padata_priv *padata)
{
+ struct parallel_data *pd = padata->pd;
struct padata_instance *pinst = pd->ps->pinst;
- int cb_cpu;
- struct padata_priv *padata;
- struct padata_serial_queue *squeue;
- struct padata_list *reorder;
+ unsigned int processed;
+ int cpu;
- /*
- * We need to ensure that only one cpu can work on dequeueing of
- * the reorder queue the time. Calculating in which percpu reorder
- * queue the next object will arrive takes some time. A spinlock
- * would be highly contended. Also it is not clear in which order
- * the objects arrive to the reorder queues. So a cpu could wait to
- * get the lock just to notice that there is nothing to do at the
- * moment. Therefore we use a trylock and let the holder of the lock
- * care for all the objects enqueued during the holdtime of the lock.
- */
- if (!spin_trylock_bh(&pd->lock))
- return;
+ processed = pd->processed;
+ cpu = pd->cpu;
- while (1) {
- padata = padata_find_next(pd, true);
+ do {
+ struct padata_serial_queue *squeue;
+ int cb_cpu;
- /*
- * If the next object that needs serialization is parallel
- * processed by another cpu and is still on it's way to the
- * cpu's reorder queue, nothing to do for now.
- */
- if (!padata)
- break;
+ cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu);
+ processed++;
cb_cpu = padata->cb_cpu;
squeue = per_cpu_ptr(pd->squeue, cb_cpu);
spin_lock(&squeue->serial.lock);
list_add_tail(&padata->list, &squeue->serial.list);
- spin_unlock(&squeue->serial.lock);
-
queue_work_on(cb_cpu, pinst->serial_wq, &squeue->work);
- }
-
- spin_unlock_bh(&pd->lock);
-
- /*
- * The next object that needs serialization might have arrived to
- * the reorder queues in the meantime.
- *
- * Ensure reorder queue is read after pd->lock is dropped so we see
- * new objects from another task in padata_do_serial. Pairs with
- * smp_mb in padata_do_serial.
- */
- smp_mb();
- reorder = per_cpu_ptr(pd->reorder_list, pd->cpu);
- if (!list_empty(&reorder->list) && padata_find_next(pd, false)) {
/*
- * Other context(eg. the padata_serial_worker) can finish the request.
- * To avoid UAF issue, add pd ref here, and put pd ref after reorder_work finish.
+ * If the next object that needs serialization is parallel
+	 * processed by another cpu and is still on its way to the
+ * cpu's reorder queue, end the loop.
*/
- padata_get_pd(pd);
- if (!queue_work(pinst->serial_wq, &pd->reorder_work))
- padata_put_pd(pd);
- }
-}
-
-static void invoke_padata_reorder(struct work_struct *work)
-{
- struct parallel_data *pd;
-
- local_bh_disable();
- pd = container_of(work, struct parallel_data, reorder_work);
- padata_reorder(pd);
- local_bh_enable();
- /* Pairs with putting the reorder_work in the serial_wq */
- padata_put_pd(pd);
+ padata = padata_find_next(pd, cpu, processed);
+ spin_unlock(&squeue->serial.lock);
+ } while (padata);
}
static void padata_serial_worker(struct work_struct *serial_work)
@@ -423,6 +359,7 @@ void padata_do_serial(struct padata_priv *padata)
struct padata_list *reorder = per_cpu_ptr(pd->reorder_list, hashed_cpu);
struct padata_priv *cur;
struct list_head *pos;
+ bool gotit = true;
spin_lock(&reorder->lock);
/* Sort in ascending order of sequence number. */
@@ -432,17 +369,14 @@ void padata_do_serial(struct padata_priv *padata)
if ((signed int)(cur->seq_nr - padata->seq_nr) < 0)
break;
}
- list_add(&padata->list, pos);
+ if (padata->seq_nr != pd->processed) {
+ gotit = false;
+ list_add(&padata->list, pos);
+ }
spin_unlock(&reorder->lock);
- /*
- * Ensure the addition to the reorder list is ordered correctly
- * with the trylock of pd->lock in padata_reorder. Pairs with smp_mb
- * in padata_reorder.
- */
- smp_mb();
-
- padata_reorder(pd);
+ if (gotit)
+ padata_reorder(padata);
}
EXPORT_SYMBOL(padata_do_serial);
@@ -632,9 +566,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
padata_init_squeues(pd);
pd->seq_nr = -1;
refcount_set(&pd->refcnt, 1);
- spin_lock_init(&pd->lock);
pd->cpu = cpumask_first(pd->cpumask.pcpu);
- INIT_WORK(&pd->reorder_work, invoke_padata_reorder);
return pd;
@@ -1144,12 +1076,6 @@ void padata_free_shell(struct padata_shell *ps)
if (!ps)
return;
- /*
- * Wait for all _do_serial calls to finish to avoid touching
- * freed pd's and ps's.
- */
- synchronize_rcu();
-
mutex_lock(&ps->pinst->lock);
list_del(&ps->list);
pd = rcu_dereference_protected(ps->pd, 1);
diff --git a/kernel/panic.c b/kernel/panic.c
index 43817111c979..72fcbb5a071b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -36,6 +36,7 @@
#include <linux/sysfs.h>
#include <linux/context_tracking.h>
#include <linux/seq_buf.h>
+#include <linux/sys_info.h>
#include <trace/events/error_report.h>
#include <asm/sections.h>
@@ -63,20 +64,13 @@ int panic_on_warn __read_mostly;
unsigned long panic_on_taint;
bool panic_on_taint_nousertaint = false;
static unsigned int warn_limit __read_mostly;
+static bool panic_console_replay;
bool panic_triggering_all_cpu_backtrace;
int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);
-#define PANIC_PRINT_TASK_INFO 0x00000001
-#define PANIC_PRINT_MEM_INFO 0x00000002
-#define PANIC_PRINT_TIMER_INFO 0x00000004
-#define PANIC_PRINT_LOCK_INFO 0x00000008
-#define PANIC_PRINT_FTRACE_INFO 0x00000010
-#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020
-#define PANIC_PRINT_ALL_CPU_BT 0x00000040
-#define PANIC_PRINT_BLOCKED_TASKS 0x00000080
unsigned long panic_print;
ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
@@ -128,6 +122,13 @@ static int proc_taint(const struct ctl_table *table, int write,
return err;
}
+static int sysctl_panic_print_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ pr_info_once("Kernel: 'panic_print' sysctl interface will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n");
+ return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+}
+
static const struct ctl_table kern_panic_table[] = {
#ifdef CONFIG_SMP
{
@@ -165,7 +166,7 @@ static const struct ctl_table kern_panic_table[] = {
.data = &panic_print,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
+ .proc_handler = sysctl_panic_print_handler,
},
{
.procname = "panic_on_warn",
@@ -193,6 +194,13 @@ static const struct ctl_table kern_panic_table[] = {
.proc_handler = proc_dointvec,
},
#endif
+ {
+ .procname = "panic_sys_info",
+ .data = &panic_print,
+ .maxlen = sizeof(panic_print),
+ .mode = 0644,
+ .proc_handler = sysctl_sys_info_handler,
+ },
};
static __init int kernel_panic_sysctls_init(void)
@@ -203,6 +211,15 @@ static __init int kernel_panic_sysctls_init(void)
late_initcall(kernel_panic_sysctls_init);
#endif
+/* The format is "panic_sys_info=tasks,mem,locks,ftrace,..." */
+static int __init setup_panic_sys_info(char *buf)
+{
+ /* There is no risk of race in kernel boot phase */
+ panic_print = sys_info_parse_param(buf);
+ return 1;
+}
+__setup("panic_sys_info=", setup_panic_sys_info);
+
static atomic_t warn_count = ATOMIC_INIT(0);
#ifdef CONFIG_SYSFS
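
A hedged sketch of the resulting flow: the boot parameter string is parsed once into the panic_print bitmask, and that mask is later consumed by sys_info() from vpanic(). The "tasks,mem" value below is taken from the comment's example list; the accepted tokens are defined by sys_info_parse_param(), which is outside this patch:

static void panic_sys_info_flow_sketch(void)
{
	char arg[] = "tasks,mem";

	/* Boot: the parameter string becomes a bitmask, as in setup_panic_sys_info(). */
	unsigned long mask = sys_info_parse_param(arg);

	/* Panic: the selected reports are emitted through the shared helper. */
	sys_info(mask);
}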
@@ -298,33 +315,6 @@ void nmi_panic(struct pt_regs *regs, const char *msg)
}
EXPORT_SYMBOL(nmi_panic);
-static void panic_print_sys_info(bool console_flush)
-{
- if (console_flush) {
- if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
- console_flush_on_panic(CONSOLE_REPLAY_ALL);
- return;
- }
-
- if (panic_print & PANIC_PRINT_TASK_INFO)
- show_state();
-
- if (panic_print & PANIC_PRINT_MEM_INFO)
- show_mem();
-
- if (panic_print & PANIC_PRINT_TIMER_INFO)
- sysrq_timer_list_show();
-
- if (panic_print & PANIC_PRINT_LOCK_INFO)
- debug_show_all_locks();
-
- if (panic_print & PANIC_PRINT_FTRACE_INFO)
- ftrace_dump(DUMP_ALL);
-
- if (panic_print & PANIC_PRINT_BLOCKED_TASKS)
- show_state_filter(TASK_UNINTERRUPTIBLE);
-}
-
void check_panic_on_warn(const char *origin)
{
unsigned int limit;
@@ -345,7 +335,7 @@ void check_panic_on_warn(const char *origin)
*/
static void panic_other_cpus_shutdown(bool crash_kexec)
{
- if (panic_print & PANIC_PRINT_ALL_CPU_BT) {
+ if (panic_print & SYS_INFO_ALL_CPU_BT) {
/* Temporary allow non-panic CPUs to write their backtraces. */
panic_triggering_all_cpu_backtrace = true;
trigger_all_cpu_backtrace();
@@ -468,7 +458,7 @@ void vpanic(const char *fmt, va_list args)
*/
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
- panic_print_sys_info(false);
+ sys_info(panic_print);
kmsg_dump_desc(KMSG_DUMP_PANIC, buf);
@@ -497,7 +487,9 @@ void vpanic(const char *fmt, va_list args)
debug_locks_off();
console_flush_on_panic(CONSOLE_FLUSH_PENDING);
- panic_print_sys_info(true);
+ if ((panic_print & SYS_INFO_PANIC_CONSOLE_REPLAY) ||
+ panic_console_replay)
+ console_flush_on_panic(CONSOLE_REPLAY_ALL);
if (!panic_blink)
panic_blink = no_blink;
@@ -949,6 +941,7 @@ core_param(panic_print, panic_print, ulong, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
core_param(panic_on_warn, panic_on_warn, int, 0644);
core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644);
+core_param(panic_console_replay, panic_console_replay, bool, 0644);
static int __init oops_setup(char *s)
{
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index bbed41ad29cf..ef282001f200 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -64,6 +64,7 @@ struct dev_printk_info;
extern struct printk_ringbuffer *prb;
extern bool printk_kthreads_running;
+extern bool printk_kthreads_ready;
extern bool debug_non_panic_cpus;
__printf(4, 0)
@@ -179,6 +180,7 @@ static inline void nbcon_kthread_wake(struct console *con)
#define PRINTKRB_RECORD_MAX 0
#define printk_kthreads_running (false)
+#define printk_kthreads_ready (false)
/*
* In !PRINTK builds we still export console_sem
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index fd12efcc4aed..646801813415 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -214,8 +214,9 @@ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq)
/**
* nbcon_context_try_acquire_direct - Try to acquire directly
- * @ctxt: The context of the caller
- * @cur: The current console state
+ * @ctxt: The context of the caller
+ * @cur: The current console state
+ * @is_reacquire: This acquire is a reacquire
*
* Acquire the console when it is released. Also acquire the console when
* the current owner has a lower priority and the console is in a safe state.
@@ -225,17 +226,17 @@ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq)
*
* Errors:
*
- * -EPERM: A panic is in progress and this is not the panic CPU.
- * Or the current owner or waiter has the same or higher
- * priority. No acquire method can be successful in
- * this case.
+ * -EPERM: A panic is in progress and this is neither the panic
+ * CPU nor is this a reacquire. Or the current owner or
+ * waiter has the same or higher priority. No acquire
+ * method can be successful in these cases.
*
* -EBUSY: The current owner has a lower priority but the console
* in an unsafe state. The caller should try using
* the handover acquire method.
*/
static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt,
- struct nbcon_state *cur)
+ struct nbcon_state *cur, bool is_reacquire)
{
unsigned int cpu = smp_processor_id();
struct console *con = ctxt->console;
@@ -243,14 +244,20 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt,
do {
/*
- * Panic does not imply that the console is owned. However, it
- * is critical that non-panic CPUs during panic are unable to
- * acquire ownership in order to satisfy the assumptions of
- * nbcon_waiter_matches(). In particular, the assumption that
- * lower priorities are ignored during panic.
+ * Panic does not imply that the console is owned. However,
+ * since all non-panic CPUs are stopped during panic(), it
+ * is safer to have them avoid gaining console ownership.
+ *
+ * If this acquire is a reacquire (and an unsafe takeover
+ * has not previously occurred) then it is allowed to attempt
+ * a direct acquire in panic. This gives console drivers an
+ * opportunity to perform any necessary cleanup if they were
+ * interrupted by the panic CPU while printing.
*/
- if (other_cpu_in_panic())
+ if (other_cpu_in_panic() &&
+ (!is_reacquire || cur->unsafe_takeover)) {
return -EPERM;
+ }
if (ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio)
return -EPERM;
@@ -301,8 +308,9 @@ static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio)
* Event #1 implies this context is EMERGENCY.
* Event #2 implies the new context is PANIC.
* Event #3 occurs when panic() has flushed the console.
- * Events #4 and #5 are not possible due to the other_cpu_in_panic()
- * check in nbcon_context_try_acquire_direct().
+ * Event #4 occurs when a non-panic CPU reacquires.
+ * Event #5 is not possible due to the other_cpu_in_panic() check
+ * in nbcon_context_try_acquire_handover().
*/
return (cur->req_prio == expected_prio);
@@ -431,6 +439,16 @@ static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt,
WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio);
WARN_ON_ONCE(!cur->unsafe);
+ /*
+ * Panic does not imply that the console is owned. However, it
+ * is critical that non-panic CPUs during panic are unable to
+ * wait for a handover in order to satisfy the assumptions of
+ * nbcon_waiter_matches(). In particular, the assumption that
+ * lower priorities are ignored during panic.
+ */
+ if (other_cpu_in_panic())
+ return -EPERM;
+
/* Handover is not possible on the same CPU. */
if (cur->cpu == cpu)
return -EBUSY;
@@ -558,7 +576,8 @@ static struct printk_buffers panic_nbcon_pbufs;
/**
* nbcon_context_try_acquire - Try to acquire nbcon console
- * @ctxt: The context of the caller
+ * @ctxt: The context of the caller
+ * @is_reacquire: This acquire is a reacquire
*
* Context: Under @ctxt->con->device_lock() or local_irq_save().
* Return: True if the console was acquired. False otherwise.
@@ -568,7 +587,7 @@ static struct printk_buffers panic_nbcon_pbufs;
* in an unsafe state. Otherwise, on success the caller may assume
* the console is not in an unsafe state.
*/
-static bool nbcon_context_try_acquire(struct nbcon_context *ctxt)
+static bool nbcon_context_try_acquire(struct nbcon_context *ctxt, bool is_reacquire)
{
unsigned int cpu = smp_processor_id();
struct console *con = ctxt->console;
@@ -577,7 +596,7 @@ static bool nbcon_context_try_acquire(struct nbcon_context *ctxt)
nbcon_state_read(con, &cur);
try_again:
- err = nbcon_context_try_acquire_direct(ctxt, &cur);
+ err = nbcon_context_try_acquire_direct(ctxt, &cur, is_reacquire);
if (err != -EBUSY)
goto out;
@@ -913,7 +932,7 @@ void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt)
{
struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
- while (!nbcon_context_try_acquire(ctxt))
+ while (!nbcon_context_try_acquire(ctxt, true))
cpu_relax();
nbcon_write_context_set_buf(wctxt, NULL, 0);
@@ -1101,7 +1120,7 @@ static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic)
cant_migrate();
}
- if (!nbcon_context_try_acquire(ctxt))
+ if (!nbcon_context_try_acquire(ctxt, false))
goto out;
/*
@@ -1486,7 +1505,7 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq,
ctxt->prio = nbcon_get_default_prio();
ctxt->allow_unsafe_takeover = allow_unsafe_takeover;
- if (!nbcon_context_try_acquire(ctxt))
+ if (!nbcon_context_try_acquire(ctxt, false))
return -EPERM;
while (nbcon_seq_read(con) < stop_seq) {
@@ -1671,6 +1690,9 @@ bool nbcon_alloc(struct console *con)
{
struct nbcon_state state = { };
+ /* Synchronize the kthread start. */
+ lockdep_assert_console_list_lock_held();
+
/* The write_thread() callback is mandatory. */
if (WARN_ON(!con->write_thread))
return false;
@@ -1701,12 +1723,15 @@ bool nbcon_alloc(struct console *con)
return false;
}
- if (printk_kthreads_running) {
+ if (printk_kthreads_ready && !have_boot_console) {
if (!nbcon_kthread_create(con)) {
kfree(con->pbufs);
con->pbufs = NULL;
return false;
}
+
+ /* Might be the first kthread. */
+ printk_kthreads_running = true;
}
}
@@ -1716,14 +1741,30 @@ bool nbcon_alloc(struct console *con)
/**
* nbcon_free - Free and cleanup the nbcon console specific data
* @con: Console to free/cleanup nbcon data
+ *
+ * Important: @have_nbcon_console must be updated before calling
+ * this function. In particular, it can be set only when there
+ * is still another nbcon console registered.
*/
void nbcon_free(struct console *con)
{
struct nbcon_state state = { };
- if (printk_kthreads_running)
+ /* Synchronize the kthread stop. */
+ lockdep_assert_console_list_lock_held();
+
+ if (printk_kthreads_running) {
nbcon_kthread_stop(con);
+ /* Might be the last nbcon console.
+ *
+ * Do not rely on printk_kthreads_check_locked(). It is not
+ * called in some code paths, see nbcon_free() callers.
+ */
+ if (!have_nbcon_console)
+ printk_kthreads_running = false;
+ }
+
nbcon_state_set(con, &state);
/* Boot consoles share global printk buffers. */
@@ -1762,7 +1803,7 @@ bool nbcon_device_try_acquire(struct console *con)
ctxt->console = con;
ctxt->prio = NBCON_PRIO_NORMAL;
- if (!nbcon_context_try_acquire(ctxt))
+ if (!nbcon_context_try_acquire(ctxt, false))
return false;
if (!nbcon_context_enter_unsafe(ctxt))
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 1eea80d0648e..0efbcdda9aab 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3574,7 +3574,7 @@ EXPORT_SYMBOL(console_resume);
static int unregister_console_locked(struct console *console);
/* True when system boot is far enough to create printer threads. */
-static bool printk_kthreads_ready __ro_after_init;
+bool printk_kthreads_ready __ro_after_init;
static struct task_struct *printk_legacy_kthread;
@@ -3713,6 +3713,7 @@ static void printk_kthreads_check_locked(void)
if (!printk_kthreads_ready)
return;
+ /* Start or stop the legacy kthread when needed. */
if (have_legacy_console || have_boot_console) {
if (!printk_legacy_kthread &&
force_legacy_kthread() &&
@@ -4204,14 +4205,6 @@ static int unregister_console_locked(struct console *console)
*/
synchronize_srcu(&console_srcu);
- if (console->flags & CON_NBCON)
- nbcon_free(console);
-
- console_sysfs_notify();
-
- if (console->exit)
- res = console->exit(console);
-
/*
* With this console gone, the global flags tracking registered
* console types may have changed. Update them.
@@ -4232,6 +4225,15 @@ static int unregister_console_locked(struct console *console)
if (!found_nbcon_con)
have_nbcon_console = found_nbcon_con;
+ /* @have_nbcon_console must be updated before calling nbcon_free(). */
+ if (console->flags & CON_NBCON)
+ nbcon_free(console);
+
+ console_sysfs_notify();
+
+ if (console->exit)
+ res = console->exit(console);
+
/* Changed console list, may require printer threads to start/stop. */
printk_kthreads_check_locked();
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 2a5a5329322d..d16afeb11506 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -163,6 +163,13 @@ static void panic_on_rcu_stall(void)
{
static int cpu_stall;
+ /*
+ * Attempt to kick out the BPF scheduler if it's installed and defer
+ * the panic to give the system a chance to recover.
+ */
+ if (scx_rcu_cpu_stall())
+ return;
+
if (++cpu_stall < sysctl_max_rcu_stall_to_panic)
return;
diff --git a/kernel/relay.c b/kernel/relay.c
index c0c93a04d4ce..8d915fe98198 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -118,7 +118,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
return NULL;
for (i = 0; i < n_pages; i++) {
- buf->page_array[i] = alloc_page(GFP_KERNEL);
+ buf->page_array[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (unlikely(!buf->page_array[i]))
goto depopulate;
set_page_private(buf->page_array[i], (unsigned long)buf);
@@ -127,7 +127,6 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
if (!mem)
goto depopulate;
- memset(mem, 0, *size);
buf->page_count = n_pages;
return mem;
@@ -250,13 +249,18 @@ EXPORT_SYMBOL_GPL(relay_buf_full);
*/
static int relay_subbuf_start(struct rchan_buf *buf, void *subbuf,
- void *prev_subbuf, size_t prev_padding)
+ void *prev_subbuf)
{
+ int full = relay_buf_full(buf);
+
+ if (full)
+ buf->stats.full_count++;
+
if (!buf->chan->cb->subbuf_start)
- return !relay_buf_full(buf);
+ return !full;
return buf->chan->cb->subbuf_start(buf, subbuf,
- prev_subbuf, prev_padding);
+ prev_subbuf);
}
/**
@@ -298,11 +302,13 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
buf->finalized = 0;
buf->data = buf->start;
buf->offset = 0;
+ buf->stats.full_count = 0;
+ buf->stats.big_count = 0;
for (i = 0; i < buf->chan->n_subbufs; i++)
buf->padding[i] = 0;
- relay_subbuf_start(buf, buf->data, NULL, 0);
+ relay_subbuf_start(buf, buf->data, NULL);
}
/**
@@ -555,9 +561,11 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
goto toobig;
if (buf->offset != buf->chan->subbuf_size + 1) {
- buf->prev_padding = buf->chan->subbuf_size - buf->offset;
+ size_t prev_padding;
+
+ prev_padding = buf->chan->subbuf_size - buf->offset;
old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
- buf->padding[old_subbuf] = buf->prev_padding;
+ buf->padding[old_subbuf] = prev_padding;
buf->subbufs_produced++;
if (buf->dentry)
d_inode(buf->dentry)->i_size +=
@@ -582,7 +590,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
new = buf->start + new_subbuf * buf->chan->subbuf_size;
buf->offset = 0;
- if (!relay_subbuf_start(buf, new, old, buf->prev_padding)) {
+ if (!relay_subbuf_start(buf, new, old)) {
buf->offset = buf->chan->subbuf_size + 1;
return 0;
}
@@ -595,7 +603,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
return length;
toobig:
- buf->chan->last_toobig = length;
+ buf->stats.big_count++;
return 0;
}
EXPORT_SYMBOL_GPL(relay_switch_subbuf);
@@ -655,11 +663,6 @@ void relay_close(struct rchan *chan)
if ((buf = *per_cpu_ptr(chan->buf, i)))
relay_close_buf(buf);
- if (chan->last_toobig)
- printk(KERN_WARNING "relay: one or more items not logged "
- "[item size (%zd) > sub-buffer size (%zd)]\n",
- chan->last_toobig, chan->subbuf_size);
-
list_del(&chan->list);
kref_put(&chan->kref, relay_destroy_channel);
mutex_unlock(&relay_channels_mutex);
@@ -694,6 +697,42 @@ void relay_flush(struct rchan *chan)
EXPORT_SYMBOL_GPL(relay_flush);
/**
+ * relay_stats - get channel buffer statistics
+ * @chan: the channel
+ * @flags: select particular information to get
+ *
+ * Returns the count of the field that the caller specifies.
+ */
+size_t relay_stats(struct rchan *chan, int flags)
+{
+ unsigned int i, count = 0;
+ struct rchan_buf *rbuf;
+
+ if (!chan || flags > RELAY_STATS_LAST)
+ return 0;
+
+ if (chan->is_global) {
+ rbuf = *per_cpu_ptr(chan->buf, 0);
+ if (flags & RELAY_STATS_BUF_FULL)
+ count = rbuf->stats.full_count;
+ else if (flags & RELAY_STATS_WRT_BIG)
+ count = rbuf->stats.big_count;
+ } else {
+ for_each_online_cpu(i) {
+ rbuf = *per_cpu_ptr(chan->buf, i);
+ if (rbuf) {
+ if (flags & RELAY_STATS_BUF_FULL)
+ count += rbuf->stats.full_count;
+ else if (flags & RELAY_STATS_WRT_BIG)
+ count += rbuf->stats.big_count;
+ }
+ }
+ }
+
+ return count;
+}
+
+/**
* relay_file_open - open file op for relay files
* @inode: the inode
* @filp: the file
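
A usage sketch for the new helper; the reporting function is illustrative, while relay_stats(), RELAY_STATS_BUF_FULL and RELAY_STATS_WRT_BIG are the interfaces added by this change:

static void report_relay_pressure(struct rchan *chan)
{
	/* Times a sub-buffer switch found the buffer full. */
	size_t full = relay_stats(chan, RELAY_STATS_BUF_FULL);
	/* Times a write exceeded the sub-buffer size. */
	size_t big = relay_stats(chan, RELAY_STATS_WRT_BIG);

	pr_info("relay: %zu buffer-full events, %zu oversized writes\n", full, big);
}

The oversized-write count takes the place of the one-shot last_toobig warning removed from relay_close().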
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 66d93a872968..be00629f0ba4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9815,7 +9815,9 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
return 0;
}
+#endif /* CONFIG_CFS_BANDWIDTH */
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
const u64 max_bw_quota_period_us = 1 * USEC_PER_SEC; /* 1s */
static const u64 min_bw_quota_period_us = 1 * USEC_PER_MSEC; /* 1ms */
/* More than 203 days if BW_SHIFT equals 20. */
@@ -9824,12 +9826,21 @@ static const u64 max_bw_runtime_us = MAX_BW;
static void tg_bandwidth(struct task_group *tg,
u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p)
{
+#ifdef CONFIG_CFS_BANDWIDTH
if (period_us_p)
*period_us_p = tg_get_cfs_period(tg);
if (quota_us_p)
*quota_us_p = tg_get_cfs_quota(tg);
if (burst_us_p)
*burst_us_p = tg_get_cfs_burst(tg);
+#else /* !CONFIG_CFS_BANDWIDTH */
+ if (period_us_p)
+ *period_us_p = tg->scx.bw_period_us;
+ if (quota_us_p)
+ *quota_us_p = tg->scx.bw_quota_us;
+ if (burst_us_p)
+ *burst_us_p = tg->scx.bw_burst_us;
+#endif /* CONFIG_CFS_BANDWIDTH */
}
static u64 cpu_period_read_u64(struct cgroup_subsys_state *css,
@@ -9845,6 +9856,7 @@ static int tg_set_bandwidth(struct task_group *tg,
u64 period_us, u64 quota_us, u64 burst_us)
{
const u64 max_usec = U64_MAX / NSEC_PER_USEC;
+ int ret = 0;
if (tg == &root_task_group)
return -EINVAL;
@@ -9882,7 +9894,12 @@ static int tg_set_bandwidth(struct task_group *tg,
burst_us + quota_us > max_bw_runtime_us))
return -EINVAL;
- return tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us);
+#ifdef CONFIG_CFS_BANDWIDTH
+ ret = tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us);
+#endif /* CONFIG_CFS_BANDWIDTH */
+ if (!ret)
+ scx_group_set_bandwidth(tg, period_us, quota_us, burst_us);
+ return ret;
}
static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css,
@@ -9935,7 +9952,7 @@ static int cpu_burst_write_u64(struct cgroup_subsys_state *css,
tg_bandwidth(tg, &period_us, &quota_us, NULL);
return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
}
-#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */
#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
@@ -9995,7 +10012,7 @@ static struct cftype cpu_legacy_files[] = {
.write_s64 = cpu_idle_write_s64,
},
#endif
-#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
{
.name = "cfs_period_us",
.read_u64 = cpu_period_read_u64,
@@ -10011,6 +10028,8 @@ static struct cftype cpu_legacy_files[] = {
.read_u64 = cpu_burst_read_u64,
.write_u64 = cpu_burst_write_u64,
},
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
{
.name = "stat",
.seq_show = cpu_cfs_stat_show,
@@ -10224,7 +10243,7 @@ static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *period_us_p,
return 0;
}
-#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
static int cpu_max_show(struct seq_file *sf, void *v)
{
struct task_group *tg = css_tg(seq_css(sf));
@@ -10271,7 +10290,7 @@ static struct cftype cpu_files[] = {
.write_s64 = cpu_idle_write_s64,
},
#endif
-#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
{
.name = "max",
.flags = CFTYPE_NOT_ON_ROOT,
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7dd5cbcb7a06..7dedc9a16281 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -203,6 +203,11 @@ struct scx_exit_task_args {
struct scx_cgroup_init_args {
/* the weight of the cgroup [1..10000] */
u32 weight;
+
+ /* bandwidth control parameters from cpu.max and cpu.max.burst */
+ u64 bw_period_us;
+ u64 bw_quota_us;
+ u64 bw_burst_us;
};
enum scx_cpu_preempt_reason {
@@ -664,9 +669,31 @@ struct sched_ext_ops {
* @cgrp: cgroup whose weight is being updated
* @weight: new weight [1..10000]
*
- * Update @tg's weight to @weight.
+ * Update @cgrp's weight to @weight.
*/
void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
+
+ /**
+ * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed
+ * @cgrp: cgroup whose bandwidth is being updated
+ * @period_us: bandwidth control period
+ * @quota_us: bandwidth control quota
+ * @burst_us: bandwidth control burst
+ *
+ * Update @cgrp's bandwidth control parameters. This is from the cpu.max
+ * cgroup interface.
+ *
+ * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled
+	 * to. For example, if @period_us is 1_000_000 and @quota_us is
+	 * 2_500_000, @cgrp is entitled to 2.5 CPUs. @burst_us can be
+	 * interpreted in the same fashion and specifies how much @cgrp can
+	 * burst temporarily. The specific control mechanism and thus the
+	 * interpretation of @period_us and burstiness is up to the BPF
+ * scheduler.
+ */
+ void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
+ u64 period_us, u64 quota_us, u64 burst_us);
+
#endif /* CONFIG_EXT_GROUP_SCHED */
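
As a worked illustration of the documented semantics, a BPF scheduler could reduce the three parameters to an entitled-CPU figure. The helper below is a sketch under that assumption; only the parameter meanings and RUNTIME_INF (the unlimited-quota default set in scx_tg_init()) come from this patch:

static u64 bw_milli_cpus_sketch(u64 period_us, u64 quota_us)
{
	/* RUNTIME_INF quota means no bandwidth limit at all. */
	if (quota_us == RUNTIME_INF || !period_us)
		return U64_MAX;

	/* quota 2_500_000 over period 1_000_000 -> 2500, i.e. 2.5 CPUs. */
	return div64_u64(quota_us * 1000, period_us);
}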
/*
@@ -884,7 +911,7 @@ enum scx_enq_flags {
/*
* The task being enqueued was previously enqueued on the current CPU's
* %SCX_DSQ_LOCAL, but was removed from it in a call to the
- * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was
+ * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was
* invoked in a ->cpu_release() callback, and the task is again
* dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the
* task will not be scheduled on the CPU until at least the next invocation
@@ -1247,7 +1274,7 @@ static void scx_kf_disallow(u32 mask)
* This allows kfuncs to safely operate on rq from any scx ops callback,
* knowing which rq is already locked.
*/
-static DEFINE_PER_CPU(struct rq *, locked_rq);
+DEFINE_PER_CPU(struct rq *, scx_locked_rq_state);
static inline void update_locked_rq(struct rq *rq)
{
@@ -1258,16 +1285,7 @@ static inline void update_locked_rq(struct rq *rq)
*/
if (rq)
lockdep_assert_rq_held(rq);
- __this_cpu_write(locked_rq, rq);
-}
-
-/*
- * Return the rq currently locked from an scx callback, or NULL if no rq is
- * locked.
- */
-static inline struct rq *scx_locked_rq(void)
-{
- return __this_cpu_read(locked_rq);
+ __this_cpu_write(scx_locked_rq_state, rq);
}
#define SCX_CALL_OP(sch, mask, op, rq, args...) \
@@ -1641,7 +1659,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
* scx_add_event - Increase an event counter for 'name' by 'cnt'
* @sch: scx_sched to account events for
* @name: an event name defined in struct scx_event_stats
- * @cnt: the number of the event occured
+ * @cnt: the number of times the event occurred
*
* This can be used when preemption is not disabled.
*/
@@ -1654,7 +1672,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
* __scx_add_event - Increase an event counter for 'name' by 'cnt'
* @sch: scx_sched to account events for
* @name: an event name defined in struct scx_event_stats
- * @cnt: the number of the event occured
+ * @cnt: the number of times the event occurred
*
* This should be used only when preemption is disabled.
*/
@@ -1705,11 +1723,6 @@ static bool scx_tryset_enable_state(enum scx_enable_state to,
return atomic_try_cmpxchg(&scx_enable_state_var, &from_v, to);
}
-static bool scx_rq_bypassing(struct rq *rq)
-{
- return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
-}
-
/**
* wait_ops_state - Busy-wait the specified ops state to end
* @p: target task
@@ -1796,12 +1809,10 @@ static void run_deferred(struct rq *rq)
process_ddsp_deferred_locals(rq);
}
-#ifdef CONFIG_SMP
static void deferred_bal_cb_workfn(struct rq *rq)
{
run_deferred(rq);
}
-#endif
static void deferred_irq_workfn(struct irq_work *irq_work)
{
@@ -1824,7 +1835,6 @@ static void schedule_deferred(struct rq *rq)
{
lockdep_assert_rq_held(rq);
-#ifdef CONFIG_SMP
/*
* If in the middle of waking up a task, task_woken_scx() will be called
* afterwards which will then run the deferred actions, no need to
@@ -1842,7 +1852,7 @@ static void schedule_deferred(struct rq *rq)
deferred_bal_cb_workfn);
return;
}
-#endif
+
/*
* No scheduler hooks available. Queue an irq work. They are executed on
* IRQ re-enable which may take a bit longer than the scheduler hooks.
@@ -2546,7 +2556,6 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
p->scx.dsq = dst_dsq;
}
-#ifdef CONFIG_SMP
/**
* move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ
* @p: task to move
@@ -2713,11 +2722,6 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p,
return false;
}
}
-#else /* CONFIG_SMP */
-static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); }
-static inline bool task_can_run_on_remote_rq(struct scx_sched *sch, struct task_struct *p, struct rq *rq, bool enforce) { return false; }
-static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; }
-#endif /* CONFIG_SMP */
/**
* move_task_between_dsqs() - Move a task from one DSQ to another
@@ -2890,9 +2894,7 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
{
struct rq *src_rq = task_rq(p);
struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
-#ifdef CONFIG_SMP
struct rq *locked_rq = rq;
-#endif
/*
* We're synchronized against dequeue through DISPATCHING. As @p can't
@@ -2906,7 +2908,6 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
return;
}
-#ifdef CONFIG_SMP
if (src_rq != dst_rq &&
unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
dispatch_enqueue(sch, find_global_dsq(p), p,
@@ -2966,9 +2967,6 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
raw_spin_rq_unlock(locked_rq);
raw_spin_rq_lock(rq);
}
-#else /* CONFIG_SMP */
- BUG(); /* control can not reach here on UP */
-#endif /* CONFIG_SMP */
}
/**
@@ -3292,10 +3290,8 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
static enum scx_cpu_preempt_reason
preempt_reason_from_class(const struct sched_class *class)
{
-#ifdef CONFIG_SMP
if (class == &stop_sched_class)
return SCX_CPU_PREEMPT_STOP;
-#endif
if (class == &dl_sched_class)
return SCX_CPU_PREEMPT_DL;
if (class == &rt_sched_class)
@@ -3308,14 +3304,12 @@ static void switch_class(struct rq *rq, struct task_struct *next)
struct scx_sched *sch = scx_root;
const struct sched_class *next_class = next->sched_class;
-#ifdef CONFIG_SMP
/*
* Pairs with the smp_load_acquire() issued by a CPU in
* kick_cpus_irq_workfn() who is waiting for this CPU to perform a
* resched.
*/
smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
-#endif
if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT))
return;
@@ -3512,8 +3506,6 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
}
#endif /* CONFIG_SCHED_CORE */
-#ifdef CONFIG_SMP
-
static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
{
struct scx_sched *sch = scx_root;
@@ -3643,7 +3635,6 @@ static void rq_offline_scx(struct rq *rq)
rq->scx.flags &= ~SCX_RQ_ONLINE;
}
-#endif /* CONFIG_SMP */
static bool check_rq_for_timeouts(struct rq *rq)
{
@@ -4098,7 +4089,9 @@ static bool scx_cgroup_enabled;
void scx_tg_init(struct task_group *tg)
{
- tg->scx_weight = CGROUP_WEIGHT_DFL;
+ tg->scx.weight = CGROUP_WEIGHT_DFL;
+ tg->scx.bw_period_us = default_bw_period_us();
+ tg->scx.bw_quota_us = RUNTIME_INF;
}
int scx_tg_online(struct task_group *tg)
@@ -4106,14 +4099,17 @@ int scx_tg_online(struct task_group *tg)
struct scx_sched *sch = scx_root;
int ret = 0;
- WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED));
+ WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED));
percpu_down_read(&scx_cgroup_rwsem);
if (scx_cgroup_enabled) {
if (SCX_HAS_OP(sch, cgroup_init)) {
struct scx_cgroup_init_args args =
- { .weight = tg->scx_weight };
+ { .weight = tg->scx.weight,
+ .bw_period_us = tg->scx.bw_period_us,
+ .bw_quota_us = tg->scx.bw_quota_us,
+ .bw_burst_us = tg->scx.bw_burst_us };
ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init,
NULL, tg->css.cgroup, &args);
@@ -4121,9 +4117,9 @@ int scx_tg_online(struct task_group *tg)
ret = ops_sanitize_err(sch, "cgroup_init", ret);
}
if (ret == 0)
- tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED;
+ tg->scx.flags |= SCX_TG_ONLINE | SCX_TG_INITED;
} else {
- tg->scx_flags |= SCX_TG_ONLINE;
+ tg->scx.flags |= SCX_TG_ONLINE;
}
percpu_up_read(&scx_cgroup_rwsem);
@@ -4134,15 +4130,15 @@ void scx_tg_offline(struct task_group *tg)
{
struct scx_sched *sch = scx_root;
- WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE));
+ WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE));
percpu_down_read(&scx_cgroup_rwsem);
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) &&
- (tg->scx_flags & SCX_TG_INITED))
+ (tg->scx.flags & SCX_TG_INITED))
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
tg->css.cgroup);
- tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
+ tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
percpu_up_read(&scx_cgroup_rwsem);
}
@@ -4251,11 +4247,11 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
percpu_down_read(&scx_cgroup_rwsem);
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) &&
- tg->scx_weight != weight)
+ tg->scx.weight != weight)
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL,
tg_cgrp(tg), weight);
- tg->scx_weight = weight;
+ tg->scx.weight = weight;
percpu_up_read(&scx_cgroup_rwsem);
}
@@ -4265,6 +4261,27 @@ void scx_group_set_idle(struct task_group *tg, bool idle)
/* TODO: Implement ops->cgroup_set_idle() */
}
+void scx_group_set_bandwidth(struct task_group *tg,
+ u64 period_us, u64 quota_us, u64 burst_us)
+{
+ struct scx_sched *sch = scx_root;
+
+ percpu_down_read(&scx_cgroup_rwsem);
+
+ if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) &&
+ (tg->scx.bw_period_us != period_us ||
+ tg->scx.bw_quota_us != quota_us ||
+ tg->scx.bw_burst_us != burst_us))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_bandwidth, NULL,
+ tg_cgrp(tg), period_us, quota_us, burst_us);
+
+ tg->scx.bw_period_us = period_us;
+ tg->scx.bw_quota_us = quota_us;
+ tg->scx.bw_burst_us = burst_us;
+
+ percpu_up_read(&scx_cgroup_rwsem);
+}
+
static void scx_cgroup_lock(void)
{
percpu_down_write(&scx_cgroup_rwsem);
@@ -4308,14 +4325,12 @@ DEFINE_SCHED_CLASS(ext) = {
.put_prev_task = put_prev_task_scx,
.set_next_task = set_next_task_scx,
-#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_scx,
.task_woken = task_woken_scx,
.set_cpus_allowed = set_cpus_allowed_scx,
.rq_online = rq_online_scx,
.rq_offline = rq_offline_scx,
-#endif
.task_tick = task_tick_scx,
@@ -4408,9 +4423,9 @@ static void scx_cgroup_exit(struct scx_sched *sch)
css_for_each_descendant_post(css, &root_task_group.css) {
struct task_group *tg = css_tg(css);
- if (!(tg->scx_flags & SCX_TG_INITED))
+ if (!(tg->scx.flags & SCX_TG_INITED))
continue;
- tg->scx_flags &= ~SCX_TG_INITED;
+ tg->scx.flags &= ~SCX_TG_INITED;
if (!sch->ops.cgroup_exit)
continue;
@@ -4442,14 +4457,19 @@ static int scx_cgroup_init(struct scx_sched *sch)
rcu_read_lock();
css_for_each_descendant_pre(css, &root_task_group.css) {
struct task_group *tg = css_tg(css);
- struct scx_cgroup_init_args args = { .weight = tg->scx_weight };
+ struct scx_cgroup_init_args args = {
+ .weight = tg->scx.weight,
+ .bw_period_us = tg->scx.bw_period_us,
+ .bw_quota_us = tg->scx.bw_quota_us,
+ .bw_burst_us = tg->scx.bw_burst_us,
+ };
- if ((tg->scx_flags &
+ if ((tg->scx.flags &
(SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
continue;
if (!sch->ops.cgroup_init) {
- tg->scx_flags |= SCX_TG_INITED;
+ tg->scx.flags |= SCX_TG_INITED;
continue;
}
@@ -4464,7 +4484,7 @@ static int scx_cgroup_init(struct scx_sched *sch)
scx_error(sch, "ops.cgroup_init() failed (%d)", ret);
return ret;
}
- tg->scx_flags |= SCX_TG_INITED;
+ tg->scx.flags |= SCX_TG_INITED;
rcu_read_lock();
css_put(css);
@@ -4657,6 +4677,41 @@ bool scx_allow_ttwu_queue(const struct task_struct *p)
}
/**
+ * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler
+ *
+ * While RCU CPU stalls can occur for reasons that have nothing to do with
+ * the current BPF scheduler, try kicking out the current scheduler in an
+ * attempt to recover the system to a good state before issuing a panic.
+ */
+bool scx_rcu_cpu_stall(void)
+{
+ struct scx_sched *sch;
+
+ rcu_read_lock();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch)) {
+ rcu_read_unlock();
+ return false;
+ }
+
+ switch (scx_enable_state()) {
+ case SCX_ENABLING:
+ case SCX_ENABLED:
+ break;
+ default:
+ rcu_read_unlock();
+ return false;
+ }
+
+ scx_error(sch, "RCU CPU stall detected!");
+ rcu_read_unlock();
+
+ return true;
+}
+
+/**
* scx_softlockup - sched_ext softlockup handler
* @dur_s: number of seconds of CPU stuck due to soft lockup
*
@@ -5944,6 +5999,7 @@ static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup
static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
+static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {}
#endif
static void sched_ext_ops__cpu_online(s32 cpu) {}
static void sched_ext_ops__cpu_offline(s32 cpu) {}
@@ -5981,6 +6037,7 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
.cgroup_move = sched_ext_ops__cgroup_move,
.cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move,
.cgroup_set_weight = sched_ext_ops__cgroup_set_weight,
+ .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth,
#endif
.cpu_online = sched_ext_ops__cpu_online,
.cpu_offline = sched_ext_ops__cpu_offline,
@@ -6338,7 +6395,8 @@ __bpf_kfunc_start_defs();
* When called from ops.dispatch(), there are no restrictions on @p or @dsq_id
* and this function can be called upto ops.dispatch_max_batch times to insert
* multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the
- * remaining slots. scx_bpf_consume() flushes the batch and resets the counter.
+ * remaining slots. scx_bpf_dsq_move_to_local() flushes the batch and resets the
+ * counter.
*
* This function doesn't have any locking restrictions and may be called under
* BPF locks (in the future when BPF introduces more flexible locking).
@@ -6362,14 +6420,6 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice
scx_dsq_insert_commit(p, dsq_id, enq_flags);
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
- u64 enq_flags)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()");
- scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags);
-}
-
/**
* scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ
* @p: task_struct to insert
@@ -6407,21 +6457,11 @@ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id,
- u64 slice, u64 vtime, u64 enq_flags)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()");
- scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags);
-}
-
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch)
BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch)
static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
@@ -6594,13 +6634,6 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
}
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc bool scx_bpf_consume(u64 dsq_id)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()");
- return scx_bpf_dsq_move_to_local(dsq_id);
-}
-
/**
* scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs
* @it__iter: DSQ iterator in progress
@@ -6619,14 +6652,6 @@ __bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter,
kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE;
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice(
- struct bpf_iter_scx_dsq *it__iter, u64 slice)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()");
- scx_bpf_dsq_move_set_slice(it__iter, slice);
-}
-
/**
* scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs
* @it__iter: DSQ iterator in progress
@@ -6646,14 +6671,6 @@ __bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter,
kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME;
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
- struct bpf_iter_scx_dsq *it__iter, u64 vtime)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()");
- scx_bpf_dsq_move_set_vtime(it__iter, vtime);
-}
-
/**
* scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ
* @it__iter: DSQ iterator in progress
@@ -6686,15 +6703,6 @@ __bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter,
p, dsq_id, enq_flags);
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter,
- struct task_struct *p, u64 dsq_id,
- u64 enq_flags)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()");
- return scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags);
-}
-
/**
* scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ
* @it__iter: DSQ iterator in progress
@@ -6720,30 +6728,16 @@ __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter,
p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter,
- struct task_struct *p, u64 dsq_id,
- u64 enq_flags)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_vtime() renamed to scx_bpf_dsq_move_vtime()");
- return scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags);
-}
-
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local)
-BTF_ID_FLAGS(func, scx_bpf_consume)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
@@ -6874,10 +6868,6 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_unlocked)
static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = {
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index a75835c23f15..292bb41a242e 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -13,8 +13,24 @@ static inline bool scx_kf_allowed_if_unlocked(void)
return !current->scx.kf_mask;
}
+static inline bool scx_rq_bypassing(struct rq *rq)
+{
+ return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
+}
+
DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
+DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
+
+/*
+ * Return the rq currently locked from an scx callback, or NULL if no rq is
+ * locked.
+ */
+static inline struct rq *scx_locked_rq(void)
+{
+ return __this_cpu_read(scx_locked_rq_state);
+}
+
void scx_tick(struct rq *rq);
void init_scx_entity(struct sched_ext_entity *scx);
void scx_pre_fork(struct task_struct *p);
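
A trivial usage sketch of the helper now visible from this header; the wrapper name is hypothetical:

static inline bool scx_rq_locked_here_sketch(struct rq *rq)
{
	/* True when the current scx callback already holds @rq. */
	return scx_locked_rq() == rq;
}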
@@ -65,7 +81,7 @@ static inline void init_sched_ext_class(void) {}
#endif /* CONFIG_SCHED_CLASS_EXT */
-#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP)
+#ifdef CONFIG_SCHED_CLASS_EXT
void __scx_update_idle(struct rq *rq, bool idle, bool do_notify);
static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify)
@@ -88,6 +104,7 @@ void scx_cgroup_finish_attach(void);
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
void scx_group_set_idle(struct task_group *tg, bool idle);
+void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us);
#else /* CONFIG_EXT_GROUP_SCHED */
static inline void scx_tg_init(struct task_group *tg) {}
static inline int scx_tg_online(struct task_group *tg) { return 0; }
@@ -98,5 +115,6 @@ static inline void scx_cgroup_finish_attach(void) {}
static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
static inline void scx_group_set_idle(struct task_group *tg, bool idle) {}
+static inline void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us) {}
#endif /* CONFIG_EXT_GROUP_SCHED */
#endif /* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 001fb88a8481..7174e1c1a392 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -17,7 +17,6 @@ static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
/* Enable/disable per-node idle cpumasks */
static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node);
-#ifdef CONFIG_SMP
/* Enable/disable LLC aware optimizations */
static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
@@ -75,7 +74,7 @@ static int scx_cpu_node_if_enabled(int cpu)
return cpu_to_node(cpu);
}
-bool scx_idle_test_and_clear_cpu(int cpu)
+static bool scx_idle_test_and_clear_cpu(int cpu)
{
int node = scx_cpu_node_if_enabled(cpu);
struct cpumask *idle_cpus = idle_cpumask(node)->cpu;
@@ -198,7 +197,7 @@ pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u6
/*
* Find an idle CPU in the system, starting from @node.
*/
-s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags)
+static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags)
{
s32 cpu;
@@ -250,7 +249,7 @@ static struct cpumask *llc_span(s32 cpu)
sd = rcu_dereference(per_cpu(sd_llc, cpu));
if (!sd)
- return 0;
+ return NULL;
return sched_domain_span(sd);
}
@@ -794,7 +793,6 @@ static void reset_idle_masks(struct sched_ext_ops *ops)
cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask);
}
}
-#endif /* CONFIG_SMP */
void scx_idle_enable(struct sched_ext_ops *ops)
{
@@ -808,9 +806,7 @@ void scx_idle_enable(struct sched_ext_ops *ops)
else
static_branch_disable_cpuslocked(&scx_builtin_idle_per_node);
-#ifdef CONFIG_SMP
reset_idle_masks(ops);
-#endif
}
void scx_idle_disable(void)
@@ -860,8 +856,8 @@ static bool check_builtin_idle_enabled(void)
return false;
}
-s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
- const struct cpumask *allowed, u64 flags)
+static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *allowed, u64 flags)
{
struct rq *rq;
struct rq_flags rf;
@@ -896,7 +892,6 @@ s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
if (!rq)
lockdep_assert_held(&p->pi_lock);
-#ifdef CONFIG_SMP
/*
* This may also be called from ops.enqueue(), so we need to handle
* per-CPU tasks as well. For these tasks, we can skip all idle CPU
@@ -913,9 +908,7 @@ s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags,
allowed ?: p->cpus_ptr, flags);
}
-#else
- cpu = -EBUSY;
-#endif
+
if (scx_kf_allowed_if_unlocked())
task_rq_unlock(rq, p, &rf);
@@ -929,14 +922,10 @@ s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
*/
__bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
{
-#ifdef CONFIG_NUMA
if (!kf_cpu_valid(cpu, NULL))
return NUMA_NO_NODE;
return cpu_to_node(cpu);
-#else
- return 0;
-#endif
}
/**
@@ -1010,11 +999,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
if (node < 0)
return cpu_none_mask;
-#ifdef CONFIG_SMP
return idle_cpumask(node)->cpu;
-#else
- return cpu_none_mask;
-#endif
}
/**
@@ -1034,11 +1019,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
if (!check_builtin_idle_enabled())
return cpu_none_mask;
-#ifdef CONFIG_SMP
return idle_cpumask(NUMA_NO_NODE)->cpu;
-#else
- return cpu_none_mask;
-#endif
}
/**
@@ -1057,14 +1038,10 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
if (node < 0)
return cpu_none_mask;
-#ifdef CONFIG_SMP
if (sched_smt_active())
return idle_cpumask(node)->smt;
else
return idle_cpumask(node)->cpu;
-#else
- return cpu_none_mask;
-#endif
}
/**
@@ -1085,14 +1062,10 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
if (!check_builtin_idle_enabled())
return cpu_none_mask;
-#ifdef CONFIG_SMP
if (sched_smt_active())
return idle_cpumask(NUMA_NO_NODE)->smt;
else
return idle_cpumask(NUMA_NO_NODE)->cpu;
-#else
- return cpu_none_mask;
-#endif
}
/**
@@ -1125,10 +1098,10 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
if (!check_builtin_idle_enabled())
return false;
- if (kf_cpu_valid(cpu, NULL))
- return scx_idle_test_and_clear_cpu(cpu);
- else
+ if (!kf_cpu_valid(cpu, NULL))
return false;
+
+ return scx_idle_test_and_clear_cpu(cpu);
}
/**
diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h
index 37be78a7502b..fa583f141f35 100644
--- a/kernel/sched/ext_idle.h
+++ b/kernel/sched/ext_idle.h
@@ -12,20 +12,8 @@
struct sched_ext_ops;
-#ifdef CONFIG_SMP
void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops);
void scx_idle_init_masks(void);
-bool scx_idle_test_and_clear_cpu(int cpu);
-s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags);
-#else /* !CONFIG_SMP */
-static inline void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) {}
-static inline void scx_idle_init_masks(void) {}
-static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; }
-static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags)
-{
- return -EBUSY;
-}
-#endif /* CONFIG_SMP */
s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
const struct cpumask *cpus_allowed, u64 flags);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 2024c1d36402..59fdb7ebbf22 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -176,7 +176,7 @@ struct psi_group psi_system = {
.pcpu = &system_group_pcpu,
};
-static DEFINE_PER_CPU(seqcount_t, psi_seq);
+static DEFINE_PER_CPU(seqcount_t, psi_seq) = SEQCNT_ZERO(psi_seq);
static inline void psi_write_begin(int cpu)
{
@@ -204,11 +204,7 @@ static void poll_timer_fn(struct timer_list *t);
static void group_init(struct psi_group *group)
{
- int cpu;
-
group->enabled = true;
- for_each_possible_cpu(cpu)
- seqcount_init(per_cpu_ptr(&psi_seq, cpu));
group->avg_last_update = sched_clock();
group->avg_next_update = group->avg_last_update + psi_period;
mutex_init(&group->avgs_lock);
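For reference, a minimal sketch (not from the patch) of the statically initialized per-CPU seqcount pattern that lets group_init() drop its for_each_possible_cpu() loop; example_seq and example_write() are hypothetical names.

#include <linux/percpu.h>
#include <linux/seqlock.h>

/* Initialized at compile time, so no runtime seqcount_init() loop is needed. */
static DEFINE_PER_CPU(seqcount_t, example_seq) = SEQCNT_ZERO(example_seq);

static void example_write(int cpu)
{
        seqcount_t *s = per_cpu_ptr(&example_seq, cpu);

        write_seqcount_begin(s);
        /* ... update the per-CPU state protected by the seqcount ... */
        write_seqcount_end(s);
}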
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d3f33d10c58c..be9745d104f7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -403,7 +403,7 @@ static inline bool dl_server_active(struct sched_dl_entity *dl_se)
extern struct list_head task_groups;
-#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
extern const u64 max_bw_quota_period_us;
/*
@@ -414,7 +414,7 @@ static inline u64 default_bw_period_us(void)
{
return 100000ULL;
}
-#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */
struct cfs_bandwidth {
#ifdef CONFIG_CFS_BANDWIDTH
@@ -472,10 +472,7 @@ struct task_group {
struct rt_bandwidth rt_bandwidth;
#endif
-#ifdef CONFIG_EXT_GROUP_SCHED
- u32 scx_flags; /* SCX_TG_* */
- u32 scx_weight;
-#endif
+ struct scx_task_group scx;
struct rcu_head rcu;
struct list_head list;
diff --git a/kernel/sys.c b/kernel/sys.c
index 18a037cc6f61..1e28b40053ce 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2372,54 +2372,14 @@ int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long st
#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)
-#ifdef CONFIG_ANON_VMA_NAME
-
-#define ANON_VMA_NAME_MAX_LEN 80
-#define ANON_VMA_NAME_INVALID_CHARS "\\`$[]"
-
-static inline bool is_valid_name_char(char ch)
-{
- /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */
- return ch > 0x1f && ch < 0x7f &&
- !strchr(ANON_VMA_NAME_INVALID_CHARS, ch);
-}
-
static int prctl_set_vma(unsigned long opt, unsigned long addr,
unsigned long size, unsigned long arg)
{
- struct mm_struct *mm = current->mm;
- const char __user *uname;
- struct anon_vma_name *anon_name = NULL;
int error;
switch (opt) {
case PR_SET_VMA_ANON_NAME:
- uname = (const char __user *)arg;
- if (uname) {
- char *name, *pch;
-
- name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
- if (IS_ERR(name))
- return PTR_ERR(name);
-
- for (pch = name; *pch != '\0'; pch++) {
- if (!is_valid_name_char(*pch)) {
- kfree(name);
- return -EINVAL;
- }
- }
- /* anon_vma has its own copy */
- anon_name = anon_vma_name_alloc(name);
- kfree(name);
- if (!anon_name)
- return -ENOMEM;
-
- }
-
- mmap_write_lock(mm);
- error = madvise_set_anon_name(mm, addr, size, anon_name);
- mmap_write_unlock(mm);
- anon_vma_name_put(anon_name);
+ error = set_anon_vma_name(addr, size, (const char __user *)arg);
break;
default:
error = -EINVAL;
@@ -2428,14 +2388,6 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr,
return error;
}
-#else /* CONFIG_ANON_VMA_NAME */
-static int prctl_set_vma(unsigned long opt, unsigned long start,
- unsigned long size, unsigned long arg)
-{
- return -EINVAL;
-}
-#endif /* CONFIG_ANON_VMA_NAME */
-
static inline unsigned long get_current_mdwe(void)
{
unsigned long ret = 0;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index e400fe150f9d..0aef0e349e49 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -340,10 +340,7 @@ static void clocksource_verify_choose_cpus(void)
* CPUs that are currently online.
*/
for (i = 1; i < n; i++) {
- cpu = get_random_u32_below(nr_cpu_ids);
- cpu = cpumask_next(cpu - 1, cpu_online_mask);
- if (cpu >= nr_cpu_ids)
- cpu = cpumask_first(cpu_online_mask);
+ cpu = cpumask_random(cpu_online_mask);
if (!WARN_ON_ONCE(cpu >= nr_cpu_ids))
cpumask_set_cpu(cpu, &cpus_chosen);
}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f135d04e1860..d2c79da81e4f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -200,6 +200,19 @@ menuconfig FTRACE
if FTRACE
+config TRACEFS_AUTOMOUNT_DEPRECATED
+ bool "Automount tracefs on debugfs [DEPRECATED]"
+ depends on TRACING
+ default y
+ help
+ The tracing interface was moved from /sys/kernel/debug/tracing
+ to /sys/kernel/tracing in 2015, but the tracing file system
+ was still automounted in /sys/kernel/debug for backward
+ compatibility with tooling.
+
+ The new interface has been around for more than 10 years and
+ the old debug mount will soon be removed.
+
config BOOTTIME_TRACING
bool "Boot-time Tracing support"
depends on TRACING
@@ -780,6 +793,20 @@ config UPROBE_EVENTS
This option is required if you plan to use perf-probe subcommand
of perf tools on user space applications.
+config EPROBE_EVENTS
+ bool "Enable event-based dynamic events"
+ depends on TRACING
+ depends on HAVE_REGS_AND_STACK_ACCESS_API
+ select PROBE_EVENTS
+ select DYNAMIC_EVENTS
+ default y
+ help
+ Eprobes are dynamic events that can be placed on other existing
+ events. It can be used to limit what fields are recorded in
+ an event or even dereference a field of an event. It can
+ convert the type of an event field. For example, turn an
+ address into a string.
+
config BPF_EVENTS
depends on BPF_SYSCALL
depends on (KPROBE_EVENTS || UPROBE_EVENTS) && PERF_EVENTS
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 057cd975d014..dcb4e02afc5f 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -82,7 +82,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
-obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o
+obj-$(CONFIG_EPROBE_EVENTS) += trace_eprobe.o
obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o
obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o
obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 47168d2afbf1..6941145b5058 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -415,9 +415,10 @@ static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
size_t count, loff_t *ppos)
{
struct blk_trace *bt = filp->private_data;
+ size_t dropped = relay_stats(bt->rchan, RELAY_STATS_BUF_FULL);
char buf[16];
- snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
+ snprintf(buf, sizeof(buf), "%zu\n", dropped);
return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
}
@@ -456,23 +457,6 @@ static const struct file_operations blk_msg_fops = {
.llseek = noop_llseek,
};
-/*
- * Keep track of how many times we encountered a full subbuffer, to aid
- * the user space app in telling how many lost events there were.
- */
-static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
- void *prev_subbuf, size_t prev_padding)
-{
- struct blk_trace *bt;
-
- if (!relay_buf_full(buf))
- return 1;
-
- bt = buf->chan->private_data;
- atomic_inc(&bt->dropped);
- return 0;
-}
-
static int blk_remove_buf_file_callback(struct dentry *dentry)
{
debugfs_remove(dentry);
@@ -491,7 +475,6 @@ static struct dentry *blk_create_buf_file_callback(const char *filename,
}
static const struct rchan_callbacks blk_relay_callbacks = {
- .subbuf_start = blk_subbuf_start_callback,
.create_buf_file = blk_create_buf_file_callback,
.remove_buf_file = blk_remove_buf_file_callback,
};
@@ -580,7 +563,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
}
bt->dev = dev;
- atomic_set(&bt->dropped, 0);
INIT_LIST_HEAD(&bt->running_list);
ret = -EIO;
diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c
index 314ffc143039..acb0c971a408 100644
--- a/kernel/trace/preemptirq_delay_test.c
+++ b/kernel/trace/preemptirq_delay_test.c
@@ -117,12 +117,15 @@ static int preemptirq_delay_run(void *data)
{
int i;
int s = MIN(burst_size, NR_TEST_FUNCS);
- struct cpumask cpu_mask;
+ cpumask_var_t cpu_mask;
+
+ if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
if (cpu_affinity > -1) {
- cpumask_clear(&cpu_mask);
- cpumask_set_cpu(cpu_affinity, &cpu_mask);
- if (set_cpus_allowed_ptr(current, &cpu_mask))
+ cpumask_clear(cpu_mask);
+ cpumask_set_cpu(cpu_affinity, cpu_mask);
+ if (set_cpus_allowed_ptr(current, cpu_mask))
pr_err("cpu_affinity:%d, failed\n", cpu_affinity);
}
@@ -139,6 +142,8 @@ static int preemptirq_delay_run(void *data)
__set_current_state(TASK_RUNNING);
+ free_cpumask_var(cpu_mask);
+
return 0;
}
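The hunk above replaces a potentially large on-stack struct cpumask with a cpumask_var_t. A minimal sketch of that allocation pattern; example_pin_to_cpu() is a hypothetical helper, while the cpumask APIs are the ones used in the hunk.

#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/sched.h>

static int example_pin_to_cpu(int cpu)
{
        cpumask_var_t mask;
        int ret;

        /* Allocates only when CONFIG_CPUMASK_OFFSTACK=y; otherwise stack-backed. */
        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
                return -ENOMEM;

        cpumask_clear(mask);
        cpumask_set_cpu(cpu, mask);
        ret = set_cpus_allowed_ptr(current, mask);

        free_cpumask_var(mask);
        return ret;
}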
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5176e0270f07..bb71a0dc9d69 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4812,26 +4812,26 @@ int ring_buffer_write(struct trace_buffer *buffer,
int ret = -EBUSY;
int cpu;
- preempt_disable_notrace();
+ guard(preempt_notrace)();
if (atomic_read(&buffer->record_disabled))
- goto out;
+ return -EBUSY;
cpu = raw_smp_processor_id();
if (!cpumask_test_cpu(cpu, buffer->cpumask))
- goto out;
+ return -EBUSY;
cpu_buffer = buffer->buffers[cpu];
if (atomic_read(&cpu_buffer->record_disabled))
- goto out;
+ return -EBUSY;
if (length > buffer->max_data_size)
- goto out;
+ return -EBUSY;
if (unlikely(trace_recursive_lock(cpu_buffer)))
- goto out;
+ return -EBUSY;
event = rb_reserve_next_event(buffer, cpu_buffer, length);
if (!event)
@@ -4849,10 +4849,6 @@ int ring_buffer_write(struct trace_buffer *buffer,
out_unlock:
trace_recursive_unlock(cpu_buffer);
-
- out:
- preempt_enable_notrace();
-
return ret;
}
EXPORT_SYMBOL_GPL(ring_buffer_write);
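The ring_buffer_write() conversion above leans on the scope-based guards from <linux/cleanup.h>, which let each error path return directly instead of jumping to an out label. A minimal sketch of the same pattern; the function and its check are hypothetical.

#include <linux/cleanup.h>
#include <linux/preempt.h>

static int example_record(bool record_disabled)
{
        /* Preemption is re-enabled automatically on every return path. */
        guard(preempt_notrace)();

        if (record_disabled)
                return -EBUSY;

        /* ... work that must run with preemption disabled ... */
        return 0;
}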
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index bd7d56dbf6c2..1482e91c39f4 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -674,8 +674,6 @@ static bool __read_mostly monitoring_on;
*/
bool rv_monitoring_on(void)
{
- /* Ensures that concurrent monitors read consistent monitoring_on */
- smp_rmb();
return READ_ONCE(monitoring_on);
}
@@ -695,8 +693,6 @@ static ssize_t monitoring_on_read_data(struct file *filp, char __user *user_buf,
static void turn_monitoring_off(void)
{
WRITE_ONCE(monitoring_on, false);
- /* Ensures that concurrent monitors read consistent monitoring_on */
- smp_wmb();
}
static void reset_all_monitors(void)
@@ -712,8 +708,6 @@ static void reset_all_monitors(void)
static void turn_monitoring_on(void)
{
WRITE_ONCE(monitoring_on, true);
- /* Ensures that concurrent monitors read consistent monitoring_on */
- smp_wmb();
}
static void turn_monitoring_on_with_reset(void)
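The rv.c hunks drop the smp_rmb()/smp_wmb() pairs: a standalone flag that is only read and written with READ_ONCE()/WRITE_ONCE() does not need the extra barriers. A sketch of the remaining pattern with a hypothetical example_flag:

#include <linux/compiler.h>
#include <linux/types.h>

static bool example_flag;

static void example_set(bool on)
{
        WRITE_ONCE(example_flag, on);   /* store is visible to concurrent readers */
}

static bool example_get(void)
{
        return READ_ONCE(example_flag);
}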
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7996f26c3f46..4283ed4e8f59 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -432,15 +432,13 @@ static void ftrace_exports(struct ring_buffer_event *event, int flag)
{
struct trace_export *export;
- preempt_disable_notrace();
+ guard(preempt_notrace)();
export = rcu_dereference_raw_check(ftrace_exports_list);
while (export) {
trace_process_export(export, event, flag);
export = rcu_dereference_raw_check(export->next);
}
-
- preempt_enable_notrace();
}
static inline void
@@ -497,27 +495,18 @@ int register_ftrace_export(struct trace_export *export)
if (WARN_ON_ONCE(!export->write))
return -1;
- mutex_lock(&ftrace_export_lock);
+ guard(mutex)(&ftrace_export_lock);
add_ftrace_export(&ftrace_exports_list, export);
- mutex_unlock(&ftrace_export_lock);
-
return 0;
}
EXPORT_SYMBOL_GPL(register_ftrace_export);
int unregister_ftrace_export(struct trace_export *export)
{
- int ret;
-
- mutex_lock(&ftrace_export_lock);
-
- ret = rm_ftrace_export(&ftrace_exports_list, export);
-
- mutex_unlock(&ftrace_export_lock);
-
- return ret;
+ guard(mutex)(&ftrace_export_lock);
+ return rm_ftrace_export(&ftrace_exports_list, export);
}
EXPORT_SYMBOL_GPL(unregister_ftrace_export);
@@ -640,9 +629,8 @@ void trace_array_put(struct trace_array *this_tr)
if (!this_tr)
return;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
__trace_array_put(this_tr);
- mutex_unlock(&trace_types_lock);
}
EXPORT_SYMBOL_GPL(trace_array_put);
@@ -936,7 +924,6 @@ int tracing_is_enabled(void)
* return the mirror variable of the state of the ring buffer.
* It's a little racy, but we don't really care.
*/
- smp_rmb();
return !global_trace.buffer_disabled;
}
@@ -1107,8 +1094,6 @@ void tracer_tracing_on(struct trace_array *tr)
* important to be fast than accurate.
*/
tr->buffer_disabled = 0;
- /* Make the flag seen by readers */
- smp_wmb();
}
/**
@@ -1163,13 +1148,11 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip,
trace_ctx = tracing_gen_ctx();
buffer = tr->array_buffer.buffer;
- ring_buffer_nest_start(buffer);
+ guard(ring_buffer_nest)(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
trace_ctx);
- if (!event) {
- size = 0;
- goto out;
- }
+ if (!event)
+ return 0;
entry = ring_buffer_event_data(event);
entry->ip = ip;
@@ -1185,8 +1168,6 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip,
__buffer_unlock_commit(buffer, event);
ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL);
- out:
- ring_buffer_nest_end(buffer);
return size;
}
EXPORT_SYMBOL_GPL(__trace_array_puts);
@@ -1216,7 +1197,6 @@ int __trace_bputs(unsigned long ip, const char *str)
struct bputs_entry *entry;
unsigned int trace_ctx;
int size = sizeof(struct bputs_entry);
- int ret = 0;
if (!printk_binsafe(tr))
return __trace_puts(ip, str, strlen(str));
@@ -1230,11 +1210,11 @@ int __trace_bputs(unsigned long ip, const char *str)
trace_ctx = tracing_gen_ctx();
buffer = tr->array_buffer.buffer;
- ring_buffer_nest_start(buffer);
+ guard(ring_buffer_nest)(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
trace_ctx);
if (!event)
- goto out;
+ return 0;
entry = ring_buffer_event_data(event);
entry->ip = ip;
@@ -1243,10 +1223,7 @@ int __trace_bputs(unsigned long ip, const char *str)
__buffer_unlock_commit(buffer, event);
ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL);
- ret = 1;
- out:
- ring_buffer_nest_end(buffer);
- return ret;
+ return 1;
}
EXPORT_SYMBOL_GPL(__trace_bputs);
@@ -1435,13 +1412,8 @@ static int tracing_arm_snapshot_locked(struct trace_array *tr)
int tracing_arm_snapshot(struct trace_array *tr)
{
- int ret;
-
- mutex_lock(&trace_types_lock);
- ret = tracing_arm_snapshot_locked(tr);
- mutex_unlock(&trace_types_lock);
-
- return ret;
+ guard(mutex)(&trace_types_lock);
+ return tracing_arm_snapshot_locked(tr);
}
void tracing_disarm_snapshot(struct trace_array *tr)
@@ -1640,8 +1612,6 @@ void tracer_tracing_off(struct trace_array *tr)
* important to be fast than accurate.
*/
tr->buffer_disabled = 1;
- /* Make the flag seen by readers */
- smp_wmb();
}
/**
@@ -1846,7 +1816,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
ret = get_user(ch, ubuf++);
if (ret)
- goto out;
+ return ret;
read++;
cnt--;
@@ -1860,7 +1830,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
while (cnt && isspace(ch)) {
ret = get_user(ch, ubuf++);
if (ret)
- goto out;
+ return ret;
read++;
cnt--;
}
@@ -1870,8 +1840,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
/* only spaces were written */
if (isspace(ch) || !ch) {
*ppos += read;
- ret = read;
- goto out;
+ return read;
}
}
@@ -1879,13 +1848,12 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
while (cnt && !isspace(ch) && ch) {
if (parser->idx < parser->size - 1)
parser->buffer[parser->idx++] = ch;
- else {
- ret = -EINVAL;
- goto out;
- }
+ else
+ return -EINVAL;
+
ret = get_user(ch, ubuf++);
if (ret)
- goto out;
+ return ret;
read++;
cnt--;
}
@@ -1900,15 +1868,11 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
/* Make sure the parsed string always terminates with '\0'. */
parser->buffer[parser->idx] = 0;
} else {
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
*ppos += read;
- ret = read;
-
-out:
- return ret;
+ return read;
}
/* TODO add a seq_buf_to_buffer() */
@@ -2410,10 +2374,10 @@ int __init register_tracer(struct tracer *type)
mutex_unlock(&trace_types_lock);
if (ret || !default_bootup_tracer)
- goto out_unlock;
+ return ret;
if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE))
- goto out_unlock;
+ return 0;
printk(KERN_INFO "Starting tracer '%s'\n", type->name);
/* Do we want this tracer to start on bootup? */
@@ -2425,8 +2389,7 @@ int __init register_tracer(struct tracer *type)
/* disable other selftests, since this will break it. */
disable_tracing_selftest("running a tracer");
- out_unlock:
- return ret;
+ return 0;
}
static void tracing_reset_cpu(struct array_buffer *buf, int cpu)
@@ -2503,9 +2466,8 @@ void tracing_reset_all_online_cpus_unlocked(void)
void tracing_reset_all_online_cpus(void)
{
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
tracing_reset_all_online_cpus_unlocked();
- mutex_unlock(&trace_types_lock);
}
int is_tracing_stopped(void)
@@ -2516,18 +2478,17 @@ int is_tracing_stopped(void)
static void tracing_start_tr(struct trace_array *tr)
{
struct trace_buffer *buffer;
- unsigned long flags;
if (tracing_disabled)
return;
- raw_spin_lock_irqsave(&tr->start_lock, flags);
+ guard(raw_spinlock_irqsave)(&tr->start_lock);
if (--tr->stop_count) {
if (WARN_ON_ONCE(tr->stop_count < 0)) {
/* Someone screwed up their debugging */
tr->stop_count = 0;
}
- goto out;
+ return;
}
/* Prevent the buffers from switching */
@@ -2544,9 +2505,6 @@ static void tracing_start_tr(struct trace_array *tr)
#endif
arch_spin_unlock(&tr->max_lock);
-
- out:
- raw_spin_unlock_irqrestore(&tr->start_lock, flags);
}
/**
@@ -2564,11 +2522,10 @@ void tracing_start(void)
static void tracing_stop_tr(struct trace_array *tr)
{
struct trace_buffer *buffer;
- unsigned long flags;
- raw_spin_lock_irqsave(&tr->start_lock, flags);
+ guard(raw_spinlock_irqsave)(&tr->start_lock);
if (tr->stop_count++)
- goto out;
+ return;
/* Prevent the buffers from switching */
arch_spin_lock(&tr->max_lock);
@@ -2584,9 +2541,6 @@ static void tracing_stop_tr(struct trace_array *tr)
#endif
arch_spin_unlock(&tr->max_lock);
-
- out:
- raw_spin_unlock_irqrestore(&tr->start_lock, flags);
}
/**
@@ -2699,19 +2653,17 @@ void trace_buffered_event_enable(void)
per_cpu(trace_buffered_event, cpu) = event;
- preempt_disable();
- if (cpu == smp_processor_id() &&
- __this_cpu_read(trace_buffered_event) !=
- per_cpu(trace_buffered_event, cpu))
- WARN_ON_ONCE(1);
- preempt_enable();
+ scoped_guard(preempt,) {
+ if (cpu == smp_processor_id() &&
+ __this_cpu_read(trace_buffered_event) !=
+ per_cpu(trace_buffered_event, cpu))
+ WARN_ON_ONCE(1);
+ }
}
}
static void enable_trace_buffered_event(void *data)
{
- /* Probably not needed, but do it anyway */
- smp_rmb();
this_cpu_dec(trace_buffered_event_cnt);
}
@@ -3051,7 +3003,7 @@ static void __ftrace_trace_stack(struct trace_array *tr,
skip++;
#endif
- preempt_disable_notrace();
+ guard(preempt_notrace)();
stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1;
@@ -3109,8 +3061,6 @@ static void __ftrace_trace_stack(struct trace_array *tr,
/* Again, don't let gcc optimize things here */
barrier();
__this_cpu_dec(ftrace_stack_reserve);
- preempt_enable_notrace();
-
}
static inline void ftrace_trace_stack(struct trace_array *tr,
@@ -3193,9 +3143,9 @@ ftrace_trace_userstack(struct trace_array *tr,
* prevent recursion, since the user stack tracing may
* trigger other kernel events.
*/
- preempt_disable();
+ guard(preempt)();
if (__this_cpu_read(user_stack_count))
- goto out;
+ return;
__this_cpu_inc(user_stack_count);
@@ -3213,8 +3163,6 @@ ftrace_trace_userstack(struct trace_array *tr,
out_drop_count:
__this_cpu_dec(user_stack_count);
- out:
- preempt_enable();
}
#else /* CONFIG_USER_STACKTRACE_SUPPORT */
static void ftrace_trace_userstack(struct trace_array *tr,
@@ -3396,7 +3344,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
pause_graph_tracing();
trace_ctx = tracing_gen_ctx();
- preempt_disable_notrace();
+ guard(preempt_notrace)();
tbuffer = get_trace_buf();
if (!tbuffer) {
@@ -3411,26 +3359,23 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
size = sizeof(*entry) + sizeof(u32) * len;
buffer = tr->array_buffer.buffer;
- ring_buffer_nest_start(buffer);
- event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
- trace_ctx);
- if (!event)
- goto out;
- entry = ring_buffer_event_data(event);
- entry->ip = ip;
- entry->fmt = fmt;
-
- memcpy(entry->buf, tbuffer, sizeof(u32) * len);
- __buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL);
+ scoped_guard(ring_buffer_nest, buffer) {
+ event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
+ trace_ctx);
+ if (!event)
+ goto out_put;
+ entry = ring_buffer_event_data(event);
+ entry->ip = ip;
+ entry->fmt = fmt;
-out:
- ring_buffer_nest_end(buffer);
+ memcpy(entry->buf, tbuffer, sizeof(u32) * len);
+ __buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL);
+ }
out_put:
put_trace_buf();
out_nobuffer:
- preempt_enable_notrace();
unpause_graph_tracing();
return len;
@@ -3454,7 +3399,7 @@ int __trace_array_vprintk(struct trace_buffer *buffer,
pause_graph_tracing();
trace_ctx = tracing_gen_ctx();
- preempt_disable_notrace();
+ guard(preempt_notrace)();
tbuffer = get_trace_buf();
@@ -3466,24 +3411,22 @@ int __trace_array_vprintk(struct trace_buffer *buffer,
len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
size = sizeof(*entry) + len + 1;
- ring_buffer_nest_start(buffer);
- event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
- trace_ctx);
- if (!event)
- goto out;
- entry = ring_buffer_event_data(event);
- entry->ip = ip;
-
- memcpy(&entry->buf, tbuffer, len + 1);
- __buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL);
+ scoped_guard(ring_buffer_nest, buffer) {
+ event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
+ trace_ctx);
+ if (!event)
+ goto out;
+ entry = ring_buffer_event_data(event);
+ entry->ip = ip;
+ memcpy(&entry->buf, tbuffer, len + 1);
+ __buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL);
+ }
out:
- ring_buffer_nest_end(buffer);
put_trace_buf();
out_nobuffer:
- preempt_enable_notrace();
unpause_graph_tracing();
return len;
@@ -4807,20 +4750,16 @@ int tracing_open_file_tr(struct inode *inode, struct file *filp)
if (ret)
return ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
/* Fail if the file is marked for removal */
if (file->flags & EVENT_FILE_FL_FREED) {
trace_array_put(file->tr);
- ret = -ENODEV;
+ return -ENODEV;
} else {
event_file_get(file);
}
- mutex_unlock(&event_mutex);
- if (ret)
- return ret;
-
filp->private_data = inode->i_private;
return 0;
@@ -5097,7 +5036,7 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf,
size_t count, loff_t *ppos)
{
struct trace_array *tr = file_inode(filp)->i_private;
- char *mask_str;
+ char *mask_str __free(kfree) = NULL;
int len;
len = snprintf(NULL, 0, "%*pb\n",
@@ -5108,16 +5047,10 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf,
len = snprintf(mask_str, len, "%*pb\n",
cpumask_pr_args(tr->tracing_cpumask));
- if (len >= count) {
- count = -EINVAL;
- goto out_err;
- }
- count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
-
-out_err:
- kfree(mask_str);
+ if (len >= count)
+ return -EINVAL;
- return count;
+ return simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
}
int tracing_set_cpumask(struct trace_array *tr,
@@ -5931,17 +5864,27 @@ static inline void trace_insert_eval_map_file(struct module *mod,
struct trace_eval_map **start, int len) { }
#endif /* !CONFIG_TRACE_EVAL_MAP_FILE */
-static void trace_insert_eval_map(struct module *mod,
- struct trace_eval_map **start, int len)
+static void
+trace_event_update_with_eval_map(struct module *mod,
+ struct trace_eval_map **start,
+ int len)
{
struct trace_eval_map **map;
- if (len <= 0)
- return;
+ /* Always run sanitizer only if btf_type_tag attr exists. */
+ if (len <= 0) {
+ if (!(IS_ENABLED(CONFIG_DEBUG_INFO_BTF) &&
+ IS_ENABLED(CONFIG_PAHOLE_HAS_BTF_TAG) &&
+ __has_attribute(btf_type_tag)))
+ return;
+ }
map = start;
- trace_event_eval_update(map, len);
+ trace_event_update_all(map, len);
+
+ if (len <= 0)
+ return;
trace_insert_eval_map_file(mod, start, len);
}
@@ -5954,9 +5897,9 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
char buf[MAX_TRACER_SIZE+2];
int r;
- mutex_lock(&trace_types_lock);
- r = sprintf(buf, "%s\n", tr->current_trace->name);
- mutex_unlock(&trace_types_lock);
+ scoped_guard(mutex, &trace_types_lock) {
+ r = sprintf(buf, "%s\n", tr->current_trace->name);
+ }
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
@@ -6258,15 +6201,13 @@ int tracing_update_buffers(struct trace_array *tr)
{
int ret = 0;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
update_last_data(tr);
if (!tr->ring_buffer_expanded)
ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
- mutex_unlock(&trace_types_lock);
-
return ret;
}
@@ -6297,7 +6238,7 @@ static bool tracer_options_updated;
static void add_tracer_options(struct trace_array *tr, struct tracer *t)
{
/* Only enable if the directory has been created already. */
- if (!tr->dir)
+ if (!tr->dir && !(tr->flags & TRACE_ARRAY_FL_GLOBAL))
return;
/* Only create trace option files after update_tracer_options finish */
@@ -6563,7 +6504,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
if (ret)
return ret;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
cpu = tracing_get_cpu(inode);
ret = open_pipe_on_cpu(tr, cpu);
if (ret)
@@ -6607,7 +6548,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
tr->trace_ref++;
- mutex_unlock(&trace_types_lock);
return ret;
fail:
@@ -6616,7 +6556,6 @@ fail_alloc_iter:
close_pipe_on_cpu(tr, cpu);
fail_pipe_on_cpu:
__trace_array_put(tr);
- mutex_unlock(&trace_types_lock);
return ret;
}
@@ -6625,14 +6564,13 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
struct trace_iterator *iter = file->private_data;
struct trace_array *tr = inode->i_private;
- mutex_lock(&trace_types_lock);
-
- tr->trace_ref--;
+ scoped_guard(mutex, &trace_types_lock) {
+ tr->trace_ref--;
- if (iter->trace->pipe_close)
- iter->trace->pipe_close(iter);
- close_pipe_on_cpu(tr, iter->cpu_file);
- mutex_unlock(&trace_types_lock);
+ if (iter->trace->pipe_close)
+ iter->trace->pipe_close(iter);
+ close_pipe_on_cpu(tr, iter->cpu_file);
+ }
free_trace_iter_content(iter);
kfree(iter);
@@ -7435,7 +7373,7 @@ int tracing_set_clock(struct trace_array *tr, const char *clockstr)
if (i == ARRAY_SIZE(trace_clocks))
return -EINVAL;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
tr->clock_id = i;
@@ -7459,8 +7397,6 @@ int tracing_set_clock(struct trace_array *tr, const char *clockstr)
tscratch->clock_id = i;
}
- mutex_unlock(&trace_types_lock);
-
return 0;
}
@@ -7512,15 +7448,13 @@ static int tracing_time_stamp_mode_show(struct seq_file *m, void *v)
{
struct trace_array *tr = m->private;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
if (ring_buffer_time_stamp_abs(tr->array_buffer.buffer))
seq_puts(m, "delta [absolute]\n");
else
seq_puts(m, "[delta] absolute\n");
- mutex_unlock(&trace_types_lock);
-
return 0;
}
@@ -8108,14 +8042,14 @@ static void clear_tracing_err_log(struct trace_array *tr)
{
struct tracing_log_err *err, *next;
- mutex_lock(&tracing_err_log_lock);
+ guard(mutex)(&tracing_err_log_lock);
+
list_for_each_entry_safe(err, next, &tr->err_log, list) {
list_del(&err->list);
free_tracing_log_err(err);
}
tr->n_err_log_entries = 0;
- mutex_unlock(&tracing_err_log_lock);
}
static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos)
@@ -8386,7 +8320,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
struct ftrace_buffer_info *info = file->private_data;
struct trace_iterator *iter = &info->iter;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
iter->tr->trace_ref--;
@@ -8397,8 +8331,6 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
info->spare_cpu, info->spare);
kvfree(info);
- mutex_unlock(&trace_types_lock);
-
return 0;
}
@@ -8606,14 +8538,13 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned
* An ioctl call with cmd 0 to the ring buffer file will wake up all
* waiters
*/
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
/* Make sure the waiters see the new wait_index */
(void)atomic_fetch_inc_release(&iter->wait_index);
ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
- mutex_unlock(&trace_types_lock);
return 0;
}
@@ -8954,12 +8885,12 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
out_reg:
ret = tracing_arm_snapshot(tr);
if (ret < 0)
- goto out;
+ return ret;
ret = register_ftrace_function_probe(glob, tr, ops, count);
if (ret < 0)
tracing_disarm_snapshot(tr);
- out:
+
return ret < 0 ? ret : 0;
}
@@ -8978,13 +8909,13 @@ static inline __init int register_snapshot_cmd(void) { return 0; }
static struct dentry *tracing_get_dentry(struct trace_array *tr)
{
- if (WARN_ON(!tr->dir))
- return ERR_PTR(-ENODEV);
-
/* Top directory uses NULL as the parent */
if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
return NULL;
+ if (WARN_ON(!tr->dir))
+ return ERR_PTR(-ENODEV);
+
/* All sub buffers have a descriptor */
return tr->dir;
}
@@ -9103,10 +9034,9 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
return -EINVAL;
if (!!(topt->flags->val & topt->opt->bit) != val) {
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
ret = __set_tracer_option(topt->tr, topt->flags,
topt->opt, !val);
- mutex_unlock(&trace_types_lock);
if (ret)
return ret;
}
@@ -9415,7 +9345,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
return ret;
if (buffer) {
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
if (!!val == tracer_tracing_is_on(tr)) {
val = 0; /* do nothing */
} else if (val) {
@@ -9429,7 +9359,6 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
/* Wake up any waiters */
ring_buffer_wake_waiters(buffer, RING_BUFFER_ALL_CPUS);
}
- mutex_unlock(&trace_types_lock);
}
(*ppos)++;
@@ -9813,10 +9742,9 @@ static void __update_tracer_options(struct trace_array *tr)
static void update_tracer_options(struct trace_array *tr)
{
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
tracer_options_updated = true;
__update_tracer_options(tr);
- mutex_unlock(&trace_types_lock);
}
/* Must have trace_types_lock held */
@@ -9838,11 +9766,10 @@ struct trace_array *trace_array_find_get(const char *instance)
{
struct trace_array *tr;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
tr = trace_array_find(instance);
if (tr)
tr->ref++;
- mutex_unlock(&trace_types_lock);
return tr;
}
@@ -10250,6 +10177,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
ftrace_init_tracefs(tr, d_tracer);
}
+#ifdef CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED
static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
{
struct vfsmount *mnt;
@@ -10271,6 +10199,8 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
if (IS_ERR(fc))
return ERR_CAST(fc);
+ pr_warn("NOTICE: Automounting of tracing to debugfs is deprecated and will be removed in 2030\n");
+
ret = vfs_parse_fs_string(fc, "source",
"tracefs", strlen("tracefs"));
if (!ret)
@@ -10281,6 +10211,7 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
put_fs_context(fc);
return mnt;
}
+#endif
/**
* tracing_init_dentry - initialize top level trace array
@@ -10305,6 +10236,7 @@ int tracing_init_dentry(void)
if (WARN_ON(!tracefs_initialized()))
return -ENODEV;
+#ifdef CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED
/*
* As there may still be users that expect the tracing
* files to exist in debugfs/tracing, we must automount
@@ -10313,6 +10245,7 @@ int tracing_init_dentry(void)
*/
tr->dir = debugfs_create_automount("tracing", NULL,
trace_automount, NULL);
+#endif
return 0;
}
@@ -10329,7 +10262,7 @@ static void __init eval_map_work_func(struct work_struct *work)
int len;
len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps;
- trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len);
+ trace_event_update_with_eval_map(NULL, __start_ftrace_eval_maps, len);
}
static int __init trace_eval_init(void)
@@ -10367,7 +10300,7 @@ bool module_exists(const char *module)
{
/* All modules have the symbol __this_module */
static const char this_mod[] = "__this_module";
- char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2];
+ char modname[MODULE_NAME_LEN + sizeof(this_mod) + 2];
unsigned long val;
int n;
@@ -10382,9 +10315,6 @@ bool module_exists(const char *module)
static void trace_module_add_evals(struct module *mod)
{
- if (!mod->num_trace_evals)
- return;
-
/*
* Modules with bad taint do not have events created, do
* not bother with enums either.
@@ -10392,7 +10322,8 @@ static void trace_module_add_evals(struct module *mod)
if (trace_module_has_bad_taint(mod))
return;
- trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals);
+ /* Even if there are no trace_evals, this still needs to sanitize field types. */
+ trace_event_update_with_eval_map(mod, mod->trace_evals, mod->num_trace_evals);
}
#ifdef CONFIG_TRACE_EVAL_MAP_FILE
@@ -10796,7 +10727,8 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos,
int (*createfn)(const char *))
{
- char *kbuf, *buf, *tmp;
+ char *kbuf __free(kfree) = NULL;
+ char *buf, *tmp;
int ret = 0;
size_t done = 0;
size_t size;
@@ -10811,10 +10743,9 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
if (size >= WRITE_BUFSIZE)
size = WRITE_BUFSIZE - 1;
- if (copy_from_user(kbuf, buffer + done, size)) {
- ret = -EFAULT;
- goto out;
- }
+ if (copy_from_user(kbuf, buffer + done, size))
+ return -EFAULT;
+
kbuf[size] = '\0';
buf = kbuf;
do {
@@ -10830,8 +10761,7 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
/* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
pr_warn("Line length is too long: Should be less than %d\n",
WRITE_BUFSIZE - 2);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
}
done += size;
@@ -10844,17 +10774,12 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
ret = createfn(buf);
if (ret)
- goto out;
+ return ret;
buf += size;
} while (done < count);
}
- ret = done;
-
-out:
- kfree(kbuf);
-
- return ret;
+ return done;
}
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -11057,7 +10982,7 @@ __init static int tracer_alloc_buffers(void)
BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE);
if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
- goto out;
+ return -ENOMEM;
if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL))
goto out_free_buffer_mask;
@@ -11175,7 +11100,6 @@ out_free_cpumask:
free_cpumask_var(global_trace.tracing_cpumask);
out_free_buffer_mask:
free_cpumask_var(tracing_buffer_mask);
-out:
return ret;
}
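Many of the trace.c hunks above convert open-coded lock/unlock and kmalloc/kfree pairs to guard(mutex), scoped_guard() and __free(kfree) from <linux/cleanup.h>. A minimal sketch of the __free(kfree) part; example_copy() and its length check are hypothetical.

#include <linux/cleanup.h>
#include <linux/slab.h>
#include <linux/string.h>

static int example_copy(const char *src)
{
        char *buf __free(kfree) = kstrdup(src, GFP_KERNEL);

        if (!buf)
                return -ENOMEM;

        if (strlen(buf) > 64)
                return -EINVAL;         /* buf is kfree()d automatically here */

        /* ... use buf ... */
        return 0;                       /* ... and on this path as well */
}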
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index bd084953a98b..1dbf1d3cf2f1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -2125,13 +2125,13 @@ static inline const char *get_syscall_name(int syscall)
#ifdef CONFIG_EVENT_TRACING
void trace_event_init(void);
-void trace_event_eval_update(struct trace_eval_map **map, int len);
+void trace_event_update_all(struct trace_eval_map **map, int len);
/* Used from boot time tracer */
extern int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set);
extern int trigger_process_regex(struct trace_event_file *file, char *buff);
#else
static inline void __init trace_event_init(void) { }
-static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { }
+static inline void trace_event_update_all(struct trace_eval_map **map, int len) { }
#endif
#ifdef CONFIG_TRACER_SNAPSHOT
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index d01e5c910ce1..9f3e9537417d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -768,6 +768,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
{
struct trace_event_call *call = file->event_call;
struct trace_array *tr = file->tr;
+ bool soft_mode = atomic_read(&file->sm_ref) != 0;
int ret = 0;
int disable;
@@ -782,7 +783,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
* is set we do not want the event to be enabled before we
* clear the bit.
*
- * When soft_disable is not set but the SOFT_MODE flag is,
+ * When soft_disable is not set but the soft_mode is,
* we do nothing. Do not disable the tracepoint, otherwise
* "soft enable"s (clearing the SOFT_DISABLED bit) wont work.
*/
@@ -790,11 +791,11 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
if (atomic_dec_return(&file->sm_ref) > 0)
break;
disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED;
- clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
+ soft_mode = false;
/* Disable use of trace_buffered_event */
trace_buffered_event_disable();
} else
- disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE);
+ disable = !soft_mode;
if (disable && (file->flags & EVENT_FILE_FL_ENABLED)) {
clear_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags);
@@ -812,8 +813,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
WARN_ON_ONCE(ret);
}
- /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
- if (file->flags & EVENT_FILE_FL_SOFT_MODE)
+ /* If in soft mode, just set the SOFT_DISABLE_BIT, else clear it */
+ if (soft_mode)
set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
else
clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
@@ -823,7 +824,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
* When soft_disable is set and enable is set, we want to
* register the tracepoint for the event, but leave the event
* as is. That means, if the event was already enabled, we do
- * nothing (but set SOFT_MODE). If the event is disabled, we
+ * nothing (but set soft_mode). If the event is disabled, we
* set SOFT_DISABLED before enabling the event tracepoint, so
* it still seems to be disabled.
*/
@@ -832,7 +833,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
else {
if (atomic_inc_return(&file->sm_ref) > 1)
break;
- set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
+ soft_mode = true;
/* Enable use of trace_buffered_event */
trace_buffered_event_enable();
}
@@ -840,7 +841,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
if (!(file->flags & EVENT_FILE_FL_ENABLED)) {
bool cmd = false, tgid = false;
- /* Keep the event disabled, when going to SOFT_MODE. */
+ /* Keep the event disabled, when going to soft mode. */
if (soft_disable)
set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
@@ -1792,8 +1793,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
!(flags & EVENT_FILE_FL_SOFT_DISABLED))
strcpy(buf, "1");
- if (flags & EVENT_FILE_FL_SOFT_DISABLED ||
- flags & EVENT_FILE_FL_SOFT_MODE)
+ if (atomic_read(&file->sm_ref) != 0)
strcat(buf, "*");
strcat(buf, "\n");
@@ -3267,43 +3267,120 @@ static void add_str_to_module(struct module *module, char *str)
list_add(&modstr->next, &module_strings);
}
+#define ATTRIBUTE_STR "__attribute__("
+#define ATTRIBUTE_STR_LEN (sizeof(ATTRIBUTE_STR) - 1)
+
+/* Remove all __attribute__() from @type. Return allocated string or @type. */
+static char *sanitize_field_type(const char *type)
+{
+ char *attr, *tmp, *next, *ret = (char *)type;
+ int depth;
+
+ next = (char *)type;
+ while ((attr = strstr(next, ATTRIBUTE_STR))) {
+ /* Retry if "__attribute__(" is a part of another word. */
+ if (attr != next && !isspace(attr[-1])) {
+ next = attr + ATTRIBUTE_STR_LEN;
+ continue;
+ }
+
+ if (ret == type) {
+ ret = kstrdup(type, GFP_KERNEL);
+ if (WARN_ON_ONCE(!ret))
+ return NULL;
+ attr = ret + (attr - type);
+ }
+
+ /* the ATTRIBUTE_STR already has the first '(' */
+ depth = 1;
+ next = attr + ATTRIBUTE_STR_LEN;
+ do {
+ tmp = strpbrk(next, "()");
+ /* There are unbalanced parentheses */
+ if (WARN_ON_ONCE(!tmp)) {
+ kfree(ret);
+ return (char *)type;
+ }
+
+ if (*tmp == '(')
+ depth++;
+ else
+ depth--;
+ next = tmp + 1;
+ } while (depth > 0);
+ next = skip_spaces(next);
+ strcpy(attr, next);
+ next = attr;
+ }
+ return ret;
+}
+
+static char *find_replacable_eval(const char *type, const char *eval_string,
+ int len)
+{
+ char *ptr;
+
+ if (!eval_string)
+ return NULL;
+
+ ptr = strchr(type, '[');
+ if (!ptr)
+ return NULL;
+ ptr++;
+
+ if (!isalpha(*ptr) && *ptr != '_')
+ return NULL;
+
+ if (strncmp(eval_string, ptr, len) != 0)
+ return NULL;
+
+ return ptr;
+}
+
static void update_event_fields(struct trace_event_call *call,
struct trace_eval_map *map)
{
struct ftrace_event_field *field;
+ const char *eval_string = NULL;
struct list_head *head;
+ int len = 0;
char *ptr;
char *str;
- int len = strlen(map->eval_string);
/* Dynamic events should never have field maps */
- if (WARN_ON_ONCE(call->flags & TRACE_EVENT_FL_DYNAMIC))
+ if (call->flags & TRACE_EVENT_FL_DYNAMIC)
return;
+ if (map) {
+ eval_string = map->eval_string;
+ len = strlen(map->eval_string);
+ }
+
head = trace_get_fields(call);
list_for_each_entry(field, head, link) {
- ptr = strchr(field->type, '[');
- if (!ptr)
- continue;
- ptr++;
-
- if (!isalpha(*ptr) && *ptr != '_')
- continue;
+ str = sanitize_field_type(field->type);
+ if (!str)
+ return;
- if (strncmp(map->eval_string, ptr, len) != 0)
- continue;
+ ptr = find_replacable_eval(str, eval_string, len);
+ if (ptr) {
+ if (str == field->type) {
+ str = kstrdup(field->type, GFP_KERNEL);
+ if (WARN_ON_ONCE(!str))
+ return;
+ ptr = str + (ptr - field->type);
+ }
- str = kstrdup(field->type, GFP_KERNEL);
- if (WARN_ON_ONCE(!str))
- return;
- ptr = str + (ptr - field->type);
- ptr = eval_replace(ptr, map, len);
- /* enum/sizeof string smaller than value */
- if (WARN_ON_ONCE(!ptr)) {
- kfree(str);
- continue;
+ ptr = eval_replace(ptr, map, len);
+ /* enum/sizeof string smaller than value */
+ if (WARN_ON_ONCE(!ptr)) {
+ kfree(str);
+ continue;
+ }
}
+ if (str == field->type)
+ continue;
/*
* If the event is part of a module, then we need to free the string
* when the module is removed. Otherwise, it will stay allocated
@@ -3313,14 +3390,18 @@ static void update_event_fields(struct trace_event_call *call,
add_str_to_module(call->module, str);
field->type = str;
+ if (field->filter_type == FILTER_OTHER)
+ field->filter_type = filter_assign_type(field->type);
}
}
-void trace_event_eval_update(struct trace_eval_map **map, int len)
+/* Update all events: replace eval maps and sanitize field types */
+void trace_event_update_all(struct trace_eval_map **map, int len)
{
struct trace_event_call *call, *p;
const char *last_system = NULL;
bool first = false;
+ bool updated;
int last_i;
int i;
@@ -3333,6 +3414,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
last_system = call->class->system;
}
+ updated = false;
/*
* Since calls are grouped by systems, the likelihood that the
* next call in the iteration belongs to the same system as the
@@ -3352,8 +3434,12 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
}
update_event_printk(call, map[i]);
update_event_fields(call, map[i]);
+ updated = true;
}
}
+ /* If not updated yet, update field for sanitizing. */
+ if (!updated)
+ update_event_fields(call, NULL);
cond_resched();
}
up_write(&trace_event_sem);
@@ -3587,7 +3673,7 @@ static int probe_remove_event_call(struct trace_event_call *call)
continue;
/*
* We can't rely on ftrace_event_enable_disable(enable => 0)
- * we are going to do, EVENT_FILE_FL_SOFT_MODE can suppress
+ * we are going to do, soft mode can suppress
* TRACE_REG_UNREGISTER.
*/
if (file->flags & EVENT_FILE_FL_ENABLED)
@@ -3698,7 +3784,7 @@ static void trace_module_remove_events(struct module *mod)
if (call->module == mod)
__trace_remove_event_call(call);
}
- /* Check for any strings allocade for this module */
+ /* Check for any strings allocated for this module */
list_for_each_entry_safe(modstr, m, &module_strings, next) {
if (modstr->module != mod)
continue;
@@ -4002,7 +4088,7 @@ static int free_probe_data(void *data)
edata->ref--;
if (!edata->ref) {
- /* Remove the SOFT_MODE flag */
+ /* Remove soft mode */
__ftrace_event_enable_disable(edata->file, 0, 1);
trace_event_put_ref(edata->file->event_call);
kfree(edata);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e4581e10782b..54226b48b2d1 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1344,13 +1344,14 @@ struct filter_list {
struct filter_head {
struct list_head list;
- struct rcu_head rcu;
+ union {
+ struct rcu_head rcu;
+ struct rcu_work rwork;
+ };
};
-
-static void free_filter_list(struct rcu_head *rhp)
+static void free_filter_list(struct filter_head *filter_list)
{
- struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu);
struct filter_list *filter_item, *tmp;
list_for_each_entry_safe(filter_item, tmp, &filter_list->list, list) {
@@ -1361,9 +1362,20 @@ static void free_filter_list(struct rcu_head *rhp)
kfree(filter_list);
}
+static void free_filter_list_work(struct work_struct *work)
+{
+ struct filter_head *filter_list;
+
+ filter_list = container_of(to_rcu_work(work), struct filter_head, rwork);
+ free_filter_list(filter_list);
+}
+
static void free_filter_list_tasks(struct rcu_head *rhp)
{
- call_rcu(rhp, free_filter_list);
+ struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu);
+
+ INIT_RCU_WORK(&filter_list->rwork, free_filter_list_work);
+ queue_rcu_work(system_wq, &filter_list->rwork);
}
/*
@@ -1460,7 +1472,7 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
tracepoint_synchronize_unregister();
if (head)
- free_filter_list(&head->rcu);
+ free_filter_list(head);
list_for_each_entry(file, &tr->events, list) {
if (file->system != dir || !file->filter)
@@ -2305,7 +2317,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
return 0;
fail:
/* No call succeeded */
- free_filter_list(&filter_list->rcu);
+ free_filter_list(filter_list);
parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
return -EINVAL;
fail_mem:
@@ -2315,7 +2327,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
if (!fail)
delay_free_filter(filter_list);
else
- free_filter_list(&filter_list->rcu);
+ free_filter_list(filter_list);
return -ENOMEM;
}
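The filter hunks above switch the deferred free from a chained call_rcu() to an rcu_work, so the freeing runs from workqueue (process) context after an RCU grace period rather than in the RCU callback itself. A minimal sketch of that pattern with a hypothetical struct example_obj:

#include <linux/container_of.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct example_obj {
        struct rcu_work rwork;
        /* ... payload ... */
};

static void example_free_work(struct work_struct *work)
{
        struct example_obj *obj =
                container_of(to_rcu_work(work), struct example_obj, rwork);

        kfree(obj);             /* runs in workqueue context, may sleep */
}

static void example_free_after_gp(struct example_obj *obj)
{
        INIT_RCU_WORK(&obj->rwork, example_free_work);
        /* The work is queued on system_wq only after a grace period elapses. */
        queue_rcu_work(system_wq, &obj->rwork);
}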
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 33cfbd4ed76d..f24ee61f8884 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -536,12 +536,12 @@ static notrace void trace_event_raw_event_synth(void *__data,
* is being performed within another event.
*/
buffer = trace_file->tr->array_buffer.buffer;
- ring_buffer_nest_start(buffer);
+ guard(ring_buffer_nest)(buffer);
entry = trace_event_buffer_reserve(&fbuffer, trace_file,
sizeof(*entry) + fields_size);
if (!entry)
- goto out;
+ return;
for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
val_idx = var_ref_idx[i];
@@ -584,8 +584,6 @@ static notrace void trace_event_raw_event_synth(void *__data,
}
trace_event_buffer_commit(&fbuffer);
-out:
- ring_buffer_nest_end(buffer);
}
static void free_synth_event_print_fmt(struct trace_event_call *call)
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index b65353ec2837..2f7b94e98317 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -325,12 +325,9 @@ static void move_to_next_cpu(void)
cpus_read_lock();
cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
- next_cpu = cpumask_next(raw_smp_processor_id(), current_mask);
+ next_cpu = cpumask_next_wrap(raw_smp_processor_id(), current_mask);
cpus_read_unlock();
- if (next_cpu >= nr_cpu_ids)
- next_cpu = cpumask_first(current_mask);
-
if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */
goto change_mode;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 0b3db02030a7..97db0b0ccf3e 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -701,6 +701,7 @@ void print_function_args(struct trace_seq *s, unsigned long *args,
struct btf *btf;
s32 tid, nr = 0;
int a, p, x;
+ u16 encode;
trace_seq_printf(s, "(");
@@ -744,7 +745,12 @@ void print_function_args(struct trace_seq *s, unsigned long *args,
trace_seq_printf(s, "0x%lx", arg);
break;
case BTF_KIND_INT:
- trace_seq_printf(s, "%ld", arg);
+ encode = btf_int_encoding(t);
+ /* Print unsigned ints as hex */
+ if (encode & BTF_INT_SIGNED)
+ trace_seq_printf(s, "%ld", arg);
+ else
+ trace_seq_printf(s, "0x%lx", arg);
break;
case BTF_KIND_ENUM:
trace_seq_printf(s, "%ld", arg);
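The print_function_args() change above picks the output format from the BTF integer encoding. A minimal sketch of that check; example_int_fmt() is a hypothetical helper built on btf_int_encoding() and BTF_INT_SIGNED as used in the hunk.

#include <linux/btf.h>

static const char *example_int_fmt(const struct btf_type *t)
{
        /* Signed BTF integers print as decimal, unsigned ones as hex. */
        return (btf_int_encoding(t) & BTF_INT_SIGNED) ? "%ld" : "0x%lx";
}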
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 8686e329b8f2..586af49fc03e 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -199,18 +199,16 @@ void put_ucounts(struct ucounts *ucounts)
}
}
-static inline bool atomic_long_inc_below(atomic_long_t *v, int u)
+static inline bool atomic_long_inc_below(atomic_long_t *v, long u)
{
- long c, old;
- c = atomic_long_read(v);
- for (;;) {
+ long c = atomic_long_read(v);
+
+ do {
if (unlikely(c >= u))
return false;
- old = atomic_long_cmpxchg(v, c, c+1);
- if (likely(old == c))
- return true;
- c = old;
- }
+ } while (!atomic_long_try_cmpxchg(v, &c, c+1));
+
+ return true;
}
struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
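The atomic_long_inc_below() rewrite above uses the try_cmpxchg() loop idiom, where the expected value is refreshed automatically when the compare-and-exchange fails. The same idiom in isolation; example_inc_below() is a hypothetical name.

#include <linux/atomic.h>

static bool example_inc_below(atomic_long_t *v, long limit)
{
        long c = atomic_long_read(v);

        do {
                if (c >= limit)
                        return false;
                /* On failure, c is reloaded with the current value and we retry. */
        } while (!atomic_long_try_cmpxchg(v, &c, c + 1));

        return true;
}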
diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile
new file mode 100644
index 000000000000..eae37bea54fd
--- /dev/null
+++ b/kernel/unwind/Makefile
@@ -0,0 +1 @@
+ obj-$(CONFIG_UNWIND_USER) += user.o deferred.o
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
new file mode 100644
index 000000000000..dc6040aae3ee
--- /dev/null
+++ b/kernel/unwind/deferred.c
@@ -0,0 +1,362 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Deferred user space unwinding
+ */
+#include <linux/sched/task_stack.h>
+#include <linux/unwind_deferred.h>
+#include <linux/sched/clock.h>
+#include <linux/task_work.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+/*
+ * For requesting a deferred user space stack trace from NMI context
+ * the architecture must support a safe cmpxchg in NMI context.
+ * Architectures that do not support a safe NMI cmpxchg cannot ask
+ * for a deferred user space stack trace from an NMI context; if one
+ * does, the request fails with -EINVAL.
+ */
+#if defined(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG)
+# define CAN_USE_IN_NMI 1
+static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
+{
+ u32 old = 0;
+
+ return try_cmpxchg(&info->id.cnt, &old, cnt);
+}
+#else
+# define CAN_USE_IN_NMI 0
+/* When NMIs are not allowed, this always succeeds */
+static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
+{
+ info->id.cnt = cnt;
+ return true;
+}
+#endif
+
+/* Make the cache fit in a 4K page */
+#define UNWIND_MAX_ENTRIES \
+ ((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long))
+
+/* Guards adding to or removing from the list of callbacks */
+static DEFINE_MUTEX(callback_mutex);
+static LIST_HEAD(callbacks);
+
+#define RESERVED_BITS (UNWIND_PENDING | UNWIND_USED)
+
+/* Zero'd bits are available for assigning callback users */
+static unsigned long unwind_mask = RESERVED_BITS;
+DEFINE_STATIC_SRCU(unwind_srcu);
+
+static inline bool unwind_pending(struct unwind_task_info *info)
+{
+ return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask);
+}
+
+/*
+ * This is a unique percpu identifier for a given task entry context.
+ * Conceptually, it's incremented every time the CPU enters the kernel from
+ * user space, so that each "entry context" on the CPU gets a unique ID. In
+ * reality, as an optimization, it's only incremented on demand for the first
+ * deferred unwind request after a given entry-from-user.
+ *
+ * It's combined with the CPU id to make a systemwide-unique "context cookie".
+ */
+static DEFINE_PER_CPU(u32, unwind_ctx_ctr);
+
+/*
+ * The context cookie is a unique identifier that is assigned to a user
+ * space stacktrace. As the user space stacktrace remains the same while
+ * the task is in the kernel, the cookie is an identifier for the stacktrace.
+ * The stacktrace can, however, be assigned a new cookie if another
+ * request is made after the cookie was cleared and before the task
+ * reenters user space.
+ */
+static u64 get_cookie(struct unwind_task_info *info)
+{
+ u32 cnt = 1;
+
+ if (info->id.cpu)
+ return info->id.id;
+
+ /* LSB is always set to ensure 0 is an invalid value */
+ cnt |= __this_cpu_read(unwind_ctx_ctr) + 2;
+ if (try_assign_cnt(info, cnt)) {
+ /* Update the per cpu counter */
+ __this_cpu_write(unwind_ctx_ctr, cnt);
+ }
+ /* Interrupts are disabled, so the CPU will always be the same */
+ info->id.cpu = smp_processor_id() + 1; /* Must be non-zero */
+
+ return info->id.id;
+}
+
+/**
+ * unwind_user_faultable - Produce a user stacktrace in faultable context
+ * @trace: The descriptor that will store the user stacktrace
+ *
+ * This must be called in a known faultable context (usually when entering
+ * or exiting user space). Depending on the available implementations
+ * the @trace will be loaded with the addresses of the user space stacktrace
+ * if it can be found.
+ *
+ * Return: 0 on success and negative on error
+ * On success @trace will contain the user space stacktrace
+ */
+int unwind_user_faultable(struct unwind_stacktrace *trace)
+{
+ struct unwind_task_info *info = &current->unwind_info;
+ struct unwind_cache *cache;
+
+ /* Should always be called from faultable context */
+ might_fault();
+
+ if (!current->mm)
+ return -EINVAL;
+
+ if (!info->cache) {
+ info->cache = kzalloc(struct_size(cache, entries, UNWIND_MAX_ENTRIES),
+ GFP_KERNEL);
+ if (!info->cache)
+ return -ENOMEM;
+ }
+
+ cache = info->cache;
+ trace->entries = cache->entries;
+
+ if (cache->nr_entries) {
+ /*
+ * The user stack has already been unwound in this entry
+ * context. Skip the unwind and use the cache.
+ */
+ trace->nr = cache->nr_entries;
+ return 0;
+ }
+
+ trace->nr = 0;
+ unwind_user(trace, UNWIND_MAX_ENTRIES);
+
+ cache->nr_entries = trace->nr;
+
+ /* Clear nr_entries on the way back to user space */
+ set_bit(UNWIND_USED_BIT, &info->unwind_mask);
+
+ return 0;
+}
+
+static void process_unwind_deferred(struct task_struct *task)
+{
+ struct unwind_task_info *info = &task->unwind_info;
+ struct unwind_stacktrace trace;
+ struct unwind_work *work;
+ unsigned long bits;
+ u64 cookie;
+
+ if (WARN_ON_ONCE(!unwind_pending(info)))
+ return;
+
+ /* Clear pending bit but make sure to have the current bits */
+ bits = atomic_long_fetch_andnot(UNWIND_PENDING,
+ (atomic_long_t *)&info->unwind_mask);
+ /*
+ * From here on out, the callback must always be called, even if it's
+ * just an empty trace.
+ */
+ trace.nr = 0;
+ trace.entries = NULL;
+
+ unwind_user_faultable(&trace);
+
+ if (info->cache)
+ bits &= ~(info->cache->unwind_completed);
+
+ cookie = info->id.id;
+
+ guard(srcu)(&unwind_srcu);
+ list_for_each_entry_srcu(work, &callbacks, list,
+ srcu_read_lock_held(&unwind_srcu)) {
+ if (test_bit(work->bit, &bits)) {
+ work->func(work, &trace, cookie);
+ if (info->cache)
+ info->cache->unwind_completed |= BIT(work->bit);
+ }
+ }
+}
+
+static void unwind_deferred_task_work(struct callback_head *head)
+{
+ process_unwind_deferred(current);
+}
+
+void unwind_deferred_task_exit(struct task_struct *task)
+{
+ struct unwind_task_info *info = &current->unwind_info;
+
+ if (!unwind_pending(info))
+ return;
+
+ process_unwind_deferred(task);
+
+ task_work_cancel(task, &info->work);
+}
+
+/**
+ * unwind_deferred_request - Request a user stacktrace on task kernel exit
+ * @work: Unwind descriptor requesting the trace
+ * @cookie: The cookie of the first request made for this task
+ *
+ * Schedule a user space unwind to be done in task work before exiting the
+ * kernel.
+ *
+ * The returned @cookie output is the generated cookie of the very first
+ * request for a user space stacktrace for this task since it entered the
+ * kernel. It can be from a request by any caller of this infrastructure.
+ * Its value will also be passed to the callback function. It can be
+ * used to stitch kernel and user stack traces together in post-processing.
+ *
+ * It's valid to call this function multiple times for the same @work within
+ * the same task entry context. Each call will return the same cookie
+ * while the task hasn't left the kernel. If the callback has already been
+ * queued or has already run for this entry context, the request returns 1
+ * and the callback is not queued again.
+ *
+ * Return: 0 if the callback was successfully queued.
+ * 1 if the callback is pending or was already executed.
+ * Negative if there's an error.
+ * @cookie holds the cookie of the first request by any user
+ */
+int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
+{
+ struct unwind_task_info *info = &current->unwind_info;
+ unsigned long old, bits;
+ unsigned long bit;
+ int ret;
+
+ *cookie = 0;
+
+ if ((current->flags & (PF_KTHREAD | PF_EXITING)) ||
+ !user_mode(task_pt_regs(current)))
+ return -EINVAL;
+
+ /*
+ * Requests from NMI context require NMI-safe cmpxchg operations.
+ * Trigger a warning to make it obvious when an architecture uses
+ * this from NMI without that support.
+ */
+ if (WARN_ON_ONCE(!CAN_USE_IN_NMI && in_nmi()))
+ return -EINVAL;
+
+ /* Do not allow cancelled works to request again */
+ bit = READ_ONCE(work->bit);
+ if (WARN_ON_ONCE(bit < 0))
+ return -EINVAL;
+
+ /* Only need the mask now */
+ bit = BIT(bit);
+
+ guard(irqsave)();
+
+ *cookie = get_cookie(info);
+
+ old = READ_ONCE(info->unwind_mask);
+
+ /* Is this already queued or executed */
+ if (old & bit)
+ return 1;
+
+ /*
+ * This work's bit hasn't been set yet. Now set it with the PENDING
+ * bit and fetch the current value of unwind_mask. If either the
+ * work's bit or PENDING was already set, then a callback is already
+ * queued.
+ */
+ bits = UNWIND_PENDING | bit;
+ old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask);
+ if (old & bits) {
+ /*
+ * If the work's bit was set, whatever set it had better
+ * have also set pending and queued a callback.
+ */
+ WARN_ON_ONCE(!(old & UNWIND_PENDING));
+ return old & bit;
+ }
+
+ /* The work has been claimed, now schedule it. */
+ ret = task_work_add(current, &info->work, TWA_RESUME);
+
+ if (WARN_ON_ONCE(ret))
+ WRITE_ONCE(info->unwind_mask, 0);
+
+ return ret;
+}
+
+void unwind_deferred_cancel(struct unwind_work *work)
+{
+ struct task_struct *g, *t;
+ int bit;
+
+ if (!work)
+ return;
+
+ bit = work->bit;
+
+ /* No work should be using a reserved bit */
+ if (WARN_ON_ONCE(BIT(bit) & RESERVED_BITS))
+ return;
+
+ guard(mutex)(&callback_mutex);
+ list_del_rcu(&work->list);
+
+ /* Do not allow any more requests and prevent callbacks */
+ work->bit = -1;
+
+ __clear_bit(bit, &unwind_mask);
+
+ synchronize_srcu(&unwind_srcu);
+
+ guard(rcu)();
+ /* Clear this bit from all threads */
+ for_each_process_thread(g, t) {
+ clear_bit(bit, &t->unwind_info.unwind_mask);
+ if (t->unwind_info.cache)
+ clear_bit(bit, &t->unwind_info.cache->unwind_completed);
+ }
+}
+
+int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
+{
+ memset(work, 0, sizeof(*work));
+
+ guard(mutex)(&callback_mutex);
+
+ /* See if there's a bit in the mask available */
+ if (unwind_mask == ~0UL)
+ return -EBUSY;
+
+ work->bit = ffz(unwind_mask);
+ __set_bit(work->bit, &unwind_mask);
+
+ list_add_rcu(&work->list, &callbacks);
+ work->func = func;
+ return 0;
+}
+
+void unwind_task_init(struct task_struct *task)
+{
+ struct unwind_task_info *info = &task->unwind_info;
+
+ memset(info, 0, sizeof(*info));
+ init_task_work(&info->work, unwind_deferred_task_work);
+ info->unwind_mask = 0;
+}
+
+void unwind_task_free(struct task_struct *task)
+{
+ struct unwind_task_info *info = &task->unwind_info;
+
+ kfree(info->cache);
+ task_work_cancel(task, &info->work);
+}
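
As a rough illustration of how the API added above is meant to be consumed: a tracer registers an unwind_work once, requests a deferred trace from its sampling hot path, and receives the user stack in task context just before the task returns to user space. Everything here other than the unwind_* calls (tracer names, pr_debug usage, how the sample is recorded) is a hypothetical sketch, not code from this series:

#include <linux/unwind_deferred.h>
#include <linux/printk.h>

static struct unwind_work my_unwind_work;

/* Runs in task context shortly before the task returns to user space. */
static void my_unwind_callback(struct unwind_work *work,
			       struct unwind_stacktrace *trace, u64 cookie)
{
	/*
	 * trace->entries[0 .. trace->nr - 1] holds the user stack. The cookie
	 * matches the one handed out when the request was made, so an earlier
	 * kernel-side sample can be stitched to this user trace.
	 */
	pr_debug("cookie %llu: %u user frames\n",
		 (unsigned long long)cookie, (unsigned int)trace->nr);
}

/* Tracer init: claims one bit in unwind_mask (-EBUSY when all are taken). */
static int my_tracer_init(void)
{
	return unwind_deferred_init(&my_unwind_work, my_unwind_callback);
}

/* Sampling hot path (e.g. IRQ, or NMI on arches with NMI-safe cmpxchg). */
static void my_tracer_sample(void)
{
	u64 cookie;
	int ret;

	/*
	 * 0: callback queued; 1: already queued/executed for this entry
	 * context; <0: kthread, exiting task, or unsupported NMI use.
	 */
	ret = unwind_deferred_request(&my_unwind_work, &cookie);
	if (ret < 0)
		return;

	/* Record @cookie alongside the kernel-side sample here. */
}

Teardown would call unwind_deferred_cancel(&my_unwind_work) to release the claimed bit; the cookie recorded at sample time and the one delivered to the callback are the same, which is what allows kernel and user traces to be merged in post-processing.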
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
new file mode 100644
index 000000000000..97a8415e3216
--- /dev/null
+++ b/kernel/unwind/user.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generic interfaces for unwinding user space
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/unwind_user.h>
+#include <linux/uaccess.h>
+
+static const struct unwind_user_frame fp_frame = {
+ ARCH_INIT_USER_FP_FRAME
+};
+
+#define for_each_user_frame(state) \
+ for (unwind_user_start(state); !(state)->done; unwind_user_next(state))
+
+static int unwind_user_next_fp(struct unwind_user_state *state)
+{
+ const struct unwind_user_frame *frame = &fp_frame;
+ unsigned long cfa, fp, ra;
+ unsigned int shift;
+
+ if (frame->use_fp) {
+ if (state->fp < state->sp)
+ return -EINVAL;
+ cfa = state->fp;
+ } else {
+ cfa = state->sp;
+ }
+
+ /* Get the Canonical Frame Address (CFA) */
+ cfa += frame->cfa_off;
+
+ /* stack going in wrong direction? */
+ if (cfa <= state->sp)
+ return -EINVAL;
+
+ /* Make sure that the address is word aligned */
+ shift = sizeof(long) == 4 ? 2 : 3;
+ if (cfa & ((1 << shift) - 1))
+ return -EINVAL;
+
+ /* Find the Return Address (RA) */
+ if (get_user(ra, (unsigned long __user *)(cfa + frame->ra_off)))
+ return -EINVAL;
+
+ if (frame->fp_off && get_user(fp, (unsigned long __user *)(cfa + frame->fp_off)))
+ return -EINVAL;
+
+ state->ip = ra;
+ state->sp = cfa;
+ if (frame->fp_off)
+ state->fp = fp;
+ return 0;
+}
+
+static int unwind_user_next(struct unwind_user_state *state)
+{
+ unsigned long iter_mask = state->available_types;
+ unsigned int bit;
+
+ if (state->done)
+ return -EINVAL;
+
+ for_each_set_bit(bit, &iter_mask, NR_UNWIND_USER_TYPE_BITS) {
+ enum unwind_user_type type = BIT(bit);
+
+ state->current_type = type;
+ switch (type) {
+ case UNWIND_USER_TYPE_FP:
+ if (!unwind_user_next_fp(state))
+ return 0;
+ continue;
+ default:
+ WARN_ONCE(1, "Undefined unwind bit %d", bit);
+ break;
+ }
+ break;
+ }
+
+ /* No successful unwind method. */
+ state->current_type = UNWIND_USER_TYPE_NONE;
+ state->done = true;
+ return -EINVAL;
+}
+
+static int unwind_user_start(struct unwind_user_state *state)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+
+ memset(state, 0, sizeof(*state));
+
+ if ((current->flags & PF_KTHREAD) || !user_mode(regs)) {
+ state->done = true;
+ return -EINVAL;
+ }
+
+ if (IS_ENABLED(CONFIG_HAVE_UNWIND_USER_FP))
+ state->available_types |= UNWIND_USER_TYPE_FP;
+
+ state->ip = instruction_pointer(regs);
+ state->sp = user_stack_pointer(regs);
+ state->fp = frame_pointer(regs);
+
+ return 0;
+}
+
+int unwind_user(struct unwind_stacktrace *trace, unsigned int max_entries)
+{
+ struct unwind_user_state state;
+
+ trace->nr = 0;
+
+ if (!max_entries)
+ return -EINVAL;
+
+ if (current->flags & PF_KTHREAD)
+ return 0;
+
+ for_each_user_frame(&state) {
+ trace->entries[trace->nr++] = state.ip;
+ if (trace->nr >= max_entries)
+ break;
+ }
+
+ return 0;
+}
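
The frame-pointer unwinder above is driven entirely by the architecture's ARCH_INIT_USER_FP_FRAME description. A hypothetical definition for an architecture with the classic "push return address, push old frame pointer, point FP at the saved-FP slot" layout could look like the sketch below; the offsets are illustrative and not copied from any in-tree port:

/*
 * Hypothetical ARCH_INIT_USER_FP_FRAME for a classic frame layout:
 *
 *   CFA      = FP + 2 words
 *   RA       at CFA - 1 word
 *   saved FP at CFA - 2 words
 */
#define ARCH_INIT_USER_FP_FRAME			\
	.cfa_off =  2 * (int)sizeof(long),	\
	.ra_off  = -1 * (int)sizeof(long),	\
	.fp_off  = -2 * (int)sizeof(long),	\
	.use_fp  = true,

With such a description, unwind_user_next_fp() computes cfa = fp + cfa_off, reads the return address at cfa + ra_off and the caller's frame pointer at cfa + fp_off; the cfa <= sp sanity check assumes a downward-growing stack, where each older frame sits at a higher address.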
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
index 2f844c279a3e..bc738fa90c1d 100644
--- a/kernel/vhost_task.c
+++ b/kernel/vhost_task.c
@@ -145,7 +145,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *),
tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args);
if (IS_ERR(tsk)) {
kfree(vtsk);
- return ERR_PTR(PTR_ERR(tsk));
+ return ERR_CAST(tsk);
}
vtsk->task = tsk;
diff --git a/kernel/watchdog_buddy.c b/kernel/watchdog_buddy.c
index 34dbfe091f4b..ee754d767c21 100644
--- a/kernel/watchdog_buddy.c
+++ b/kernel/watchdog_buddy.c
@@ -12,10 +12,7 @@ static unsigned int watchdog_next_cpu(unsigned int cpu)
{
unsigned int next_cpu;
- next_cpu = cpumask_next(cpu, &watchdog_cpus);
- if (next_cpu >= nr_cpu_ids)
- next_cpu = cpumask_first(&watchdog_cpus);
-
+ next_cpu = cpumask_next_wrap(cpu, &watchdog_cpus);
if (next_cpu == cpu)
return nr_cpu_ids;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9f9148075828..c6b79b3675c3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -505,12 +505,16 @@ static struct kthread_worker *pwq_release_worker __ro_after_init;
struct workqueue_struct *system_wq __ro_after_init;
EXPORT_SYMBOL(system_wq);
+struct workqueue_struct *system_percpu_wq __ro_after_init;
+EXPORT_SYMBOL(system_percpu_wq);
struct workqueue_struct *system_highpri_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_highpri_wq);
struct workqueue_struct *system_long_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_long_wq);
struct workqueue_struct *system_unbound_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_unbound_wq);
+struct workqueue_struct *system_dfl_wq __ro_after_init;
+EXPORT_SYMBOL_GPL(system_dfl_wq);
struct workqueue_struct *system_freezable_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_wq);
struct workqueue_struct *system_power_efficient_wq __ro_after_init;
@@ -1686,17 +1690,14 @@ static void __pwq_activate_work(struct pool_workqueue *pwq,
static bool tryinc_node_nr_active(struct wq_node_nr_active *nna)
{
int max = READ_ONCE(nna->max);
+ int old = atomic_read(&nna->nr);
- while (true) {
- int old, tmp;
-
- old = atomic_read(&nna->nr);
+ do {
if (old >= max)
return false;
- tmp = atomic_cmpxchg_relaxed(&nna->nr, old, old + 1);
- if (tmp == old)
- return true;
- }
+ } while (!atomic_try_cmpxchg_relaxed(&nna->nr, &old, old + 1));
+
+ return true;
}
/**
@@ -2221,12 +2222,9 @@ static int wq_select_unbound_cpu(int cpu)
}
new_cpu = __this_cpu_read(wq_rr_cpu_last);
- new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
- if (unlikely(new_cpu >= nr_cpu_ids)) {
- new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask);
- if (unlikely(new_cpu >= nr_cpu_ids))
- return cpu;
- }
+ new_cpu = cpumask_next_and_wrap(new_cpu, wq_unbound_cpumask, cpu_online_mask);
+ if (unlikely(new_cpu >= nr_cpu_ids))
+ return cpu;
__this_cpu_write(wq_rr_cpu_last, new_cpu);
return new_cpu;
@@ -4629,7 +4627,7 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs)
*
* Return: The allocated new workqueue_attr on success. %NULL on failure.
*/
-struct workqueue_attrs *alloc_workqueue_attrs(void)
+struct workqueue_attrs *alloc_workqueue_attrs_noprof(void)
{
struct workqueue_attrs *attrs;
@@ -5682,12 +5680,12 @@ static struct workqueue_struct *__alloc_workqueue(const char *fmt,
else
wq_size = sizeof(*wq);
- wq = kzalloc(wq_size, GFP_KERNEL);
+ wq = kzalloc_noprof(wq_size, GFP_KERNEL);
if (!wq)
return NULL;
if (flags & WQ_UNBOUND) {
- wq->unbound_attrs = alloc_workqueue_attrs();
+ wq->unbound_attrs = alloc_workqueue_attrs_noprof();
if (!wq->unbound_attrs)
goto err_free_wq;
}
@@ -5777,9 +5775,9 @@ err_destroy:
}
__printf(1, 4)
-struct workqueue_struct *alloc_workqueue(const char *fmt,
- unsigned int flags,
- int max_active, ...)
+struct workqueue_struct *alloc_workqueue_noprof(const char *fmt,
+ unsigned int flags,
+ int max_active, ...)
{
struct workqueue_struct *wq;
va_list args;
@@ -5794,7 +5792,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
return wq;
}
-EXPORT_SYMBOL_GPL(alloc_workqueue);
+EXPORT_SYMBOL_GPL(alloc_workqueue_noprof);
#ifdef CONFIG_LOCKDEP
__printf(1, 5)
@@ -6770,31 +6768,6 @@ long work_on_cpu_key(int cpu, long (*fn)(void *),
return wfc.ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu_key);
-
-/**
- * work_on_cpu_safe_key - run a function in thread context on a particular cpu
- * @cpu: the cpu to run on
- * @fn: the function to run
- * @arg: the function argument
- * @key: The lock class key for lock debugging purposes
- *
- * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
- * any locks which would prevent @fn from completing.
- *
- * Return: The value @fn returns.
- */
-long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
- void *arg, struct lock_class_key *key)
-{
- long ret = -ENODEV;
-
- cpus_read_lock();
- if (cpu_online(cpu))
- ret = work_on_cpu_key(cpu, fn, arg, key);
- cpus_read_unlock();
- return ret;
-}
-EXPORT_SYMBOL_GPL(work_on_cpu_safe_key);
#endif /* CONFIG_SMP */
#ifdef CONFIG_FREEZER
@@ -7830,10 +7803,11 @@ void __init workqueue_init_early(void)
}
system_wq = alloc_workqueue("events", 0, 0);
+ system_percpu_wq = alloc_workqueue("events", 0, 0);
system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
system_long_wq = alloc_workqueue("events_long", 0, 0);
- system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
- WQ_MAX_ACTIVE);
+ system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
+ system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
system_freezable_wq = alloc_workqueue("events_freezable",
WQ_FREEZABLE, 0);
system_power_efficient_wq = alloc_workqueue("events_power_efficient",
@@ -7844,8 +7818,8 @@ void __init workqueue_init_early(void)
system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0);
system_bh_highpri_wq = alloc_workqueue("events_bh_highpri",
WQ_BH | WQ_HIGHPRI, 0);
- BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
- !system_unbound_wq || !system_freezable_wq ||
+ BUG_ON(!system_wq || !system_percpu_wq || !system_highpri_wq || !system_long_wq ||
+ !system_unbound_wq || !system_freezable_wq || !system_dfl_wq ||
!system_power_efficient_wq ||
!system_freezable_power_efficient_wq ||
!system_bh_wq || !system_bh_highpri_wq);
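
In this hunk, system_percpu_wq is allocated with the same parameters as system_wq and system_dfl_wq with the same parameters as system_unbound_wq, which suggests the new names are meant as the per-CPU and default (unbound) system workqueues going forward. A minimal sketch of queueing work onto them; the handlers and work items are illustrative:

#include <linux/workqueue.h>

static void my_pcpu_fn(struct work_struct *work)
{
	/* Runs on the CPU that queued it, like work on the legacy system_wq. */
}

static void my_unbound_fn(struct work_struct *work)
{
	/* Runs wherever the scheduler places it, like system_unbound_wq. */
}

static DECLARE_WORK(my_pcpu_work, my_pcpu_fn);
static DECLARE_WORK(my_unbound_work, my_unbound_fn);

static void my_submit(void)
{
	queue_work(system_percpu_wq, &my_pcpu_work);
	queue_work(system_dfl_wq, &my_unbound_work);
}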