Diffstat (limited to 'kernel')
64 files changed, 1873 insertions, 1145 deletions
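The largest algorithmic change in this series is the cgroup rstat rework in kernel/cgroup/rstat.c below, which drops the per-CPU raw spinlocks and instead elects a single "winner" per update with this_cpu_cmpxchg() before calling llist_add(). The following is only a minimal userspace model of that election idea, with hypothetical names and C11 atomics standing in for the kernel's per-cpu and llist primitives; it is a sketch, not the kernel implementation.

/*
 * Userspace model (hypothetical names) of the rstat update election:
 * a node points to itself while off-list; the first context to cmpxchg
 * that pointer to NULL wins and is the only one allowed to push the
 * node onto the lockless list, so nested irq/nmi re-entry cannot
 * double-insert it.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct lnode {
	_Atomic(struct lnode *) next;	/* points to itself while off-list */
};

struct lhead {
	_Atomic(struct lnode *) first;
};

static void lnode_init(struct lnode *n)
{
	atomic_store(&n->next, n);		/* "not on any list" marker */
}

/* Returns true only for the single context that wins the election. */
static bool lnode_try_claim(struct lnode *n)
{
	struct lnode *self = n;

	return atomic_compare_exchange_strong(&n->next, &self, NULL);
}

/* Lock-free push, standing in for the kernel's llist_add(). */
static void lhead_push(struct lhead *h, struct lnode *n)
{
	struct lnode *first = atomic_load(&h->first);

	do {
		atomic_store(&n->next, first);
	} while (!atomic_compare_exchange_weak(&h->first, &first, n));
}

int main(void)
{
	struct lhead head = { .first = NULL };
	struct lnode node;

	lnode_init(&node);

	bool winner = lnode_try_claim(&node);	/* first updater */
	bool nested = lnode_try_claim(&node);	/* re-entrant caller loses */

	if (winner)
		lhead_push(&head, &node);	/* only the winner inserts */

	printf("winner=%d nested=%d on_list=%d\n",
	       winner, nested, atomic_load(&head.first) == &node);
	return 0;
}

The kernel version in the hunk below additionally uses llist_on_list() as a cheap racy pre-check and ignores NMI-context updates on architectures without NMI-safe cmpxchg or per-cpu ops, as its added comments describe.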
diff --git a/kernel/.gitignore b/kernel/.gitignore index c6b299a6b786..a501bfc80694 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -1,3 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only /config_data /kheaders.md5 +/kheaders-objlist +/kheaders-srclist diff --git a/kernel/Makefile b/kernel/Makefile index a89602a2a0ab..c60623448235 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -54,6 +54,7 @@ obj-y += rcu/ obj-y += livepatch/ obj-y += dma/ obj-y += entry/ +obj-y += unwind/ obj-$(CONFIG_MODULES) += module/ obj-$(CONFIG_KCMP) += kcmp.o @@ -158,11 +159,48 @@ filechk_cat = cat $< $(obj)/config_data: $(KCONFIG_CONFIG) FORCE $(call filechk,cat) +# kheaders_data.tar.xz $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz -quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz - cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@ -$(obj)/kheaders_data.tar.xz: FORCE - $(call cmd,genikh) +quiet_cmd_kheaders_data = GEN $@ + cmd_kheaders_data = "$<" "$@" "$(obj)/kheaders-srclist" "$(obj)/kheaders-objlist" "$(KBUILD_BUILD_TIMESTAMP)" + cmd_kheaders_data_dep = cat $(depfile) >> $(dot-target).cmd; rm -f $(depfile) -clean-files := kheaders_data.tar.xz kheaders.md5 +define rule_kheaders_data + $(call cmd_and_savecmd,kheaders_data) + $(call cmd,kheaders_data_dep) +endef + +targets += kheaders_data.tar.xz +$(obj)/kheaders_data.tar.xz: $(src)/gen_kheaders.sh $(obj)/kheaders-srclist $(obj)/kheaders-objlist $(obj)/kheaders.md5 FORCE + $(call if_changed_rule,kheaders_data) + +# generated headers in objtree +# +# include/generated/utsversion.h is ignored because it is generated +# after gen_kheaders.sh is executed. (utsversion.h is unneeded for kheaders) +filechk_kheaders_objlist = \ + for d in include "arch/$(SRCARCH)/include"; do \ + find "$${d}/generated" ! -path "include/generated/utsversion.h" -a -name "*.h" -print; \ + done + +$(obj)/kheaders-objlist: FORCE + $(call filechk,kheaders_objlist) + +# non-generated headers in srctree +filechk_kheaders_srclist = \ + for d in include "arch/$(SRCARCH)/include"; do \ + find "$(srctree)/$${d}" -path "$(srctree)/$${d}/generated" -prune -o -name "*.h" -print; \ + done + +$(obj)/kheaders-srclist: FORCE + $(call filechk,kheaders_srclist) + +# Some files are symlinks. If symlinks are changed, kheaders_data.tar.xz should +# be rebuilt. +filechk_kheaders_md5sum = xargs -r -a $< stat -c %N | md5sum + +$(obj)/kheaders.md5: $(obj)/kheaders-srclist FORCE + $(call filechk,kheaders_md5sum) + +clean-files := kheaders.md5 kheaders-srclist kheaders-objlist diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 09dde5b00d0c..5d1650af899d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2377,28 +2377,44 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp) { enum bpf_prog_type prog_type = resolve_prog_type(fp); - bool ret; struct bpf_prog_aux *aux = fp->aux; + enum bpf_cgroup_storage_type i; + bool ret = false; + u64 cookie; if (fp->kprobe_override) - return false; + return ret; - spin_lock(&map->owner.lock); - if (!map->owner.type) { - /* There's no owner yet where we could check for - * compatibility. - */ - map->owner.type = prog_type; - map->owner.jited = fp->jited; - map->owner.xdp_has_frags = aux->xdp_has_frags; - map->owner.attach_func_proto = aux->attach_func_proto; + spin_lock(&map->owner_lock); + /* There's no owner yet where we could check for compatibility. 
*/ + if (!map->owner) { + map->owner = bpf_map_owner_alloc(map); + if (!map->owner) + goto err; + map->owner->type = prog_type; + map->owner->jited = fp->jited; + map->owner->xdp_has_frags = aux->xdp_has_frags; + map->owner->attach_func_proto = aux->attach_func_proto; + for_each_cgroup_storage_type(i) { + map->owner->storage_cookie[i] = + aux->cgroup_storage[i] ? + aux->cgroup_storage[i]->cookie : 0; + } ret = true; } else { - ret = map->owner.type == prog_type && - map->owner.jited == fp->jited && - map->owner.xdp_has_frags == aux->xdp_has_frags; + ret = map->owner->type == prog_type && + map->owner->jited == fp->jited && + map->owner->xdp_has_frags == aux->xdp_has_frags; + for_each_cgroup_storage_type(i) { + if (!ret) + break; + cookie = aux->cgroup_storage[i] ? + aux->cgroup_storage[i]->cookie : 0; + ret = map->owner->storage_cookie[i] == cookie || + !cookie; + } if (ret && - map->owner.attach_func_proto != aux->attach_func_proto) { + map->owner->attach_func_proto != aux->attach_func_proto) { switch (prog_type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_LSM: @@ -2411,8 +2427,8 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, } } } - spin_unlock(&map->owner.lock); - +err: + spin_unlock(&map->owner_lock); return ret; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e63039817af3..0fbfa8532c39 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -37,6 +37,7 @@ #include <linux/trace_events.h> #include <linux/tracepoint.h> #include <linux/overflow.h> +#include <linux/cookie.h> #include <net/netfilter/nf_bpf_link.h> #include <net/netkit.h> @@ -53,6 +54,7 @@ #define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) DEFINE_PER_CPU(int, bpf_prog_active); +DEFINE_COOKIE(bpf_map_cookie); static DEFINE_IDR(prog_idr); static DEFINE_SPINLOCK(prog_idr_lock); static DEFINE_IDR(map_idr); @@ -885,6 +887,7 @@ static void bpf_map_free_deferred(struct work_struct *work) security_bpf_map_free(map); bpf_map_release_memcg(map); + bpf_map_owner_free(map); bpf_map_free(map); } @@ -979,12 +982,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) struct bpf_map *map = filp->private_data; u32 type = 0, jited = 0; - if (map_type_contains_progs(map)) { - spin_lock(&map->owner.lock); - type = map->owner.type; - jited = map->owner.jited; - spin_unlock(&map->owner.lock); + spin_lock(&map->owner_lock); + if (map->owner) { + type = map->owner->type; + jited = map->owner->jited; } + spin_unlock(&map->owner_lock); seq_printf(m, "map_type:\t%u\n" @@ -1487,10 +1490,14 @@ static int map_create(union bpf_attr *attr, bool kernel) if (err < 0) goto free_map; + preempt_disable(); + map->cookie = gen_cookie_next(&bpf_map_cookie); + preempt_enable(); + atomic64_set(&map->refcnt, 1); atomic64_set(&map->usercnt, 1); mutex_init(&map->freeze_mutex); - spin_lock_init(&map->owner.lock); + spin_lock_init(&map->owner_lock); if (attr->btf_key_type_id || attr->btf_value_type_id || /* Even the map's value is a kernel's struct, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 399f03e62508..0806295945e4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21445,7 +21445,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) &target_size); if (cnt == 0 || cnt >= INSN_BUF_SIZE || (ctx_field_size && !target_size)) { - verifier_bug(env, "error during ctx access conversion"); + verifier_bug(env, "error during ctx access conversion (%d)", cnt); return -EFAULT; } diff --git a/kernel/cfi.c b/kernel/cfi.c index 422fa4f958ae..4dad04ead06c 100644 
--- a/kernel/cfi.c +++ b/kernel/cfi.c @@ -5,6 +5,8 @@ * Copyright (C) 2022 Google LLC */ +#include <linux/bpf.h> +#include <linux/cfi_types.h> #include <linux/cfi.h> bool cfi_warn __ro_after_init = IS_ENABLED(CONFIG_CFI_PERMISSIVE); @@ -27,6 +29,19 @@ enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, return BUG_TRAP_TYPE_BUG; } +/* + * Declare two non-existent functions with types that match bpf_func_t and + * bpf_callback_t pointers, and use DEFINE_CFI_TYPE to define type hash + * variables for each function type. The cfi_bpf_* variables are used by + * arch-specific BPF JIT implementations to ensure indirectly callable JIT + * code has matching CFI type hashes. + */ +extern typeof(*(bpf_func_t)0) __bpf_prog_runX; +DEFINE_CFI_TYPE(cfi_bpf_hash, __bpf_prog_runX); + +extern typeof(*(bpf_callback_t)0) __bpf_callback_fn; +DEFINE_CFI_TYPE(cfi_bpf_subprog_hash, __bpf_callback_fn); + #ifdef CONFIG_ARCH_USES_CFI_TRAPS static inline unsigned long trap_address(s32 *p) { diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index fa24c032ed6f..2a4a387f867a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -32,6 +32,9 @@ static u16 cgroup_no_v1_mask; /* disable named v1 mounts */ static bool cgroup_no_v1_named; +/* Show unavailable controllers in /proc/cgroups */ +static bool proc_show_all; + /* * pidlist destructions need to be flushed on cgroup destruction. Use a * separate workqueue as flush domain. @@ -683,10 +686,11 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) */ for_each_subsys(ss, i) { - if (cgroup1_subsys_absent(ss)) - continue; cgrp_v1_visible |= ss->root != &cgrp_dfl_root; + if (!proc_show_all && cgroup1_subsys_absent(ss)) + continue; + seq_printf(m, "%s\t%d\t%d\t%d\n", ss->legacy_name, ss->root->hierarchy_id, atomic_read(&ss->root->nr_cgrps), @@ -1359,3 +1363,9 @@ static int __init cgroup_no_v1(char *str) return 1; } __setup("cgroup_no_v1=", cgroup_no_v1); + +static int __init cgroup_v1_proc(char *str) +{ + return (kstrtobool(str, &proc_show_all) == 0); +} +__setup("cgroup_v1_proc=", cgroup_v1_proc); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 3bc4301466f3..f74d04429a29 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -4051,7 +4051,7 @@ void __init cpuset_init_smp(void) cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); top_cpuset.effective_mems = node_states[N_MEMORY]; - hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI); + hotplug_node_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI); cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); BUG_ON(!cpuset_migrate_mm_wq); diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index cbeaa499a96a..981e2f77ad4e 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -10,7 +10,7 @@ #include <trace/events/cgroup.h> static DEFINE_SPINLOCK(rstat_base_lock); -static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock); +static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list); static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); @@ -45,84 +45,11 @@ static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss) return &rstat_base_lock; } -static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu) +static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu) { - if (ss) { - /* - * Depending on config, the subsystem per-cpu lock type may be an - * empty struct. 
In enviromnents where this is the case, allocation - * of this field is not performed in ss_rstat_init(). Avoid a - * cpu-based offset relative to NULL by returning early. When the - * lock type is zero in size, the corresponding lock functions are - * no-ops so passing them NULL is acceptable. - */ - if (sizeof(*ss->rstat_ss_cpu_lock) == 0) - return NULL; - - return per_cpu_ptr(ss->rstat_ss_cpu_lock, cpu); - } - - return per_cpu_ptr(&rstat_base_cpu_lock, cpu); -} - -/* - * Helper functions for rstat per CPU locks. - * - * This makes it easier to diagnose locking issues and contention in - * production environments. The parameter @fast_path determine the - * tracepoints being added, allowing us to diagnose "flush" related - * operations without handling high-frequency fast-path "update" events. - */ -static __always_inline -unsigned long _css_rstat_cpu_lock(struct cgroup_subsys_state *css, int cpu, - const bool fast_path) -{ - struct cgroup *cgrp = css->cgroup; - raw_spinlock_t *cpu_lock; - unsigned long flags; - bool contended; - - /* - * The _irqsave() is needed because the locks used for flushing are - * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock - * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT - * kernel. The raw_spinlock_t below disables interrupts on both - * configurations. The _irqsave() ensures that interrupts are always - * disabled and later restored. - */ - cpu_lock = ss_rstat_cpu_lock(css->ss, cpu); - contended = !raw_spin_trylock_irqsave(cpu_lock, flags); - if (contended) { - if (fast_path) - trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended); - else - trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended); - - raw_spin_lock_irqsave(cpu_lock, flags); - } - - if (fast_path) - trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended); - else - trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended); - - return flags; -} - -static __always_inline -void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu, - unsigned long flags, const bool fast_path) -{ - struct cgroup *cgrp = css->cgroup; - raw_spinlock_t *cpu_lock; - - if (fast_path) - trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false); - else - trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false); - - cpu_lock = ss_rstat_cpu_lock(css->ss, cpu); - raw_spin_unlock_irqrestore(cpu_lock, flags); + if (ss) + return per_cpu_ptr(ss->lhead, cpu); + return per_cpu_ptr(&rstat_backlog_list, cpu); } /** @@ -130,13 +57,22 @@ void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu, * @css: target cgroup subsystem state * @cpu: cpu on which rstat_cpu was updated * - * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching - * rstat_cpu->updated_children list. See the comment on top of - * css_rstat_cpu definition for details. + * Atomically inserts the css in the ss's llist for the given cpu. This is + * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist + * will be processed at the flush time to create the update tree. + * + * NOTE: if the user needs the guarantee that the updater either add itself in + * the lockless list or the concurrent flusher flushes its updated stats, a + * memory barrier is needed before the call to css_rstat_updated() i.e. a + * barrier after updating the per-cpu stats and before calling + * css_rstat_updated(). 
*/ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) { - unsigned long flags; + struct llist_head *lhead; + struct css_rstat_cpu *rstatc; + struct css_rstat_cpu __percpu *rstatc_pcpu; + struct llist_node *self; /* * Since bpf programs can call this function, prevent access to @@ -145,19 +81,49 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) if (!css_uses_rstat(css)) return; + lockdep_assert_preemption_disabled(); + + /* + * For archs withnot nmi safe cmpxchg or percpu ops support, ignore + * the requests from nmi context. + */ + if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) || + !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi()) + return; + + rstatc = css_rstat_cpu(css, cpu); /* - * Speculative already-on-list test. This may race leading to - * temporary inaccuracies, which is fine. + * If already on list return. This check is racy and smp_mb() is needed + * to pair it with the smp_mb() in css_process_update_tree() if the + * guarantee that the updated stats are visible to concurrent flusher is + * needed. + */ + if (llist_on_list(&rstatc->lnode)) + return; + + /* + * This function can be renentered by irqs and nmis for the same cgroup + * and may try to insert the same per-cpu lnode into the llist. Note + * that llist_add() does not protect against such scenarios. * - * Because @parent's updated_children is terminated with @parent - * instead of NULL, we can tell whether @css is on the list by - * testing the next pointer for NULL. + * To protect against such stacked contexts of irqs/nmis, we use the + * fact that lnode points to itself when not on a list and then use + * this_cpu_cmpxchg() to atomically set to NULL to select the winner + * which will call llist_add(). The losers can assume the insertion is + * successful and the winner will eventually add the per-cpu lnode to + * the llist. */ - if (data_race(css_rstat_cpu(css, cpu)->updated_next)) + self = &rstatc->lnode; + rstatc_pcpu = css->rstat_cpu; + if (this_cpu_cmpxchg(rstatc_pcpu->lnode.next, self, NULL) != self) return; - flags = _css_rstat_cpu_lock(css, cpu, true); + lhead = ss_lhead_cpu(css->ss, cpu); + llist_add(&rstatc->lnode, lhead); +} +static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu) +{ /* put @css and all ancestors on the corresponding updated lists */ while (true) { struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); @@ -183,8 +149,34 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) css = parent; } +} + +static void css_process_update_tree(struct cgroup_subsys *ss, int cpu) +{ + struct llist_head *lhead = ss_lhead_cpu(ss, cpu); + struct llist_node *lnode; + + while ((lnode = llist_del_first_init(lhead))) { + struct css_rstat_cpu *rstatc; + + /* + * smp_mb() is needed here (more specifically in between + * init_llist_node() and per-cpu stats flushing) if the + * guarantee is required by a rstat user where etiher the + * updater should add itself on the lockless list or the + * flusher flush the stats updated by the updater who have + * observed that they are already on the list. The + * corresponding barrier pair for this one should be before + * css_rstat_updated() by the user. + * + * For now, there aren't any such user, so not adding the + * barrier here but if such a use-case arise, please add + * smp_mb() here. 
+ */ - _css_rstat_cpu_unlock(css, cpu, flags, true); + rstatc = container_of(lnode, struct css_rstat_cpu, lnode); + __css_process_update_tree(rstatc->owner, cpu); + } } /** @@ -288,13 +280,12 @@ static struct cgroup_subsys_state *css_rstat_updated_list( { struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu); struct cgroup_subsys_state *head = NULL, *parent, *child; - unsigned long flags; - flags = _css_rstat_cpu_lock(root, cpu, false); + css_process_update_tree(root->ss, cpu); /* Return NULL if this subtree is not on-list */ if (!rstatc->updated_next) - goto unlock_ret; + return NULL; /* * Unlink @root from its parent. As the updated_children list is @@ -326,8 +317,7 @@ static struct cgroup_subsys_state *css_rstat_updated_list( rstatc->updated_children = root; if (child != root) head = css_rstat_push_children(head, child, cpu); -unlock_ret: - _css_rstat_cpu_unlock(root, cpu, flags, false); + return head; } @@ -468,7 +458,8 @@ int css_rstat_init(struct cgroup_subsys_state *css) for_each_possible_cpu(cpu) { struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); - rstatc->updated_children = css; + rstatc->owner = rstatc->updated_children = css; + init_llist_node(&rstatc->lnode); if (is_self) { struct cgroup_rstat_base_cpu *rstatbc; @@ -522,19 +513,15 @@ int __init ss_rstat_init(struct cgroup_subsys *ss) { int cpu; - /* - * Depending on config, the subsystem per-cpu lock type may be an empty - * struct. Avoid allocating a size of zero in this case. - */ - if (ss && sizeof(*ss->rstat_ss_cpu_lock)) { - ss->rstat_ss_cpu_lock = alloc_percpu(raw_spinlock_t); - if (!ss->rstat_ss_cpu_lock) + if (ss) { + ss->lhead = alloc_percpu(struct llist_head); + if (!ss->lhead) return -ENOMEM; } spin_lock_init(ss_rstat_lock(ss)); for_each_possible_cpu(cpu) - raw_spin_lock_init(ss_rstat_cpu_lock(ss, cpu)); + init_llist_head(ss_lhead_cpu(ss, cpu)); return 0; } diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 335b8425dd4b..a4ef79591eb2 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -21,6 +21,7 @@ #include <linux/reboot.h> #include <linux/btf.h> #include <linux/objtool.h> +#include <linux/delay.h> #include <asm/page.h> #include <asm/sections.h> @@ -33,6 +34,11 @@ /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; +/* time to wait for possible DMA to finish before starting the kdump kernel + * when a CMA reservation is used + */ +#define CMA_DMA_TIMEOUT_SEC 10 + #ifdef CONFIG_CRASH_DUMP int kimage_crash_copy_vmcoreinfo(struct kimage *image) @@ -97,6 +103,14 @@ int kexec_crash_loaded(void) } EXPORT_SYMBOL_GPL(kexec_crash_loaded); +static void crash_cma_clear_pending_dma(void) +{ + if (!crashk_cma_cnt) + return; + + mdelay(CMA_DMA_TIMEOUT_SEC * 1000); +} + /* * No panic_cpu check version of crash_kexec(). 
This function is called * only when panic_cpu holds the current CPU number; this is the only CPU @@ -119,6 +133,7 @@ void __noclone __crash_kexec(struct pt_regs *regs) crash_setup_regs(&fixed_regs, regs); crash_save_vmcoreinfo(); machine_crash_shutdown(&fixed_regs); + crash_cma_clear_pending_dma(); machine_kexec(kexec_crash_image); } kexec_unlock(); diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index acb6bf42e30d..87bf4d41eabb 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -14,6 +14,8 @@ #include <linux/cpuhotplug.h> #include <linux/memblock.h> #include <linux/kmemleak.h> +#include <linux/cma.h> +#include <linux/crash_reserve.h> #include <asm/page.h> #include <asm/sections.h> @@ -172,17 +174,19 @@ static int __init parse_crashkernel_simple(char *cmdline, #define SUFFIX_HIGH 0 #define SUFFIX_LOW 1 -#define SUFFIX_NULL 2 +#define SUFFIX_CMA 2 +#define SUFFIX_NULL 3 static __initdata char *suffix_tbl[] = { [SUFFIX_HIGH] = ",high", [SUFFIX_LOW] = ",low", + [SUFFIX_CMA] = ",cma", [SUFFIX_NULL] = NULL, }; /* * That function parses "suffix" crashkernel command lines like * - * crashkernel=size,[high|low] + * crashkernel=size,[high|low|cma] * * It returns 0 on success and -EINVAL on failure. */ @@ -298,9 +302,11 @@ int __init parse_crashkernel(char *cmdline, unsigned long long *crash_size, unsigned long long *crash_base, unsigned long long *low_size, + unsigned long long *cma_size, bool *high) { int ret; + unsigned long long __always_unused cma_base; /* crashkernel=X[@offset] */ ret = __parse_crashkernel(cmdline, system_ram, crash_size, @@ -331,6 +337,14 @@ int __init parse_crashkernel(char *cmdline, *high = true; } + + /* + * optional CMA reservation + * cma_base is ignored + */ + if (cma_size) + __parse_crashkernel(cmdline, 0, cma_size, + &cma_base, suffix_tbl[SUFFIX_CMA]); #endif if (!*crash_size) ret = -EINVAL; @@ -457,6 +471,56 @@ retry: #endif } +struct range crashk_cma_ranges[CRASHKERNEL_CMA_RANGES_MAX]; +#ifdef CRASHKERNEL_CMA +int crashk_cma_cnt; +void __init reserve_crashkernel_cma(unsigned long long cma_size) +{ + unsigned long long request_size = roundup(cma_size, PAGE_SIZE); + unsigned long long reserved_size = 0; + + if (!cma_size) + return; + + while (cma_size > reserved_size && + crashk_cma_cnt < CRASHKERNEL_CMA_RANGES_MAX) { + + struct cma *res; + + if (cma_declare_contiguous(0, request_size, 0, 0, 0, false, + "crashkernel", &res)) { + /* reservation failed, try half-sized blocks */ + if (request_size <= PAGE_SIZE) + break; + + request_size = roundup(request_size / 2, PAGE_SIZE); + continue; + } + + crashk_cma_ranges[crashk_cma_cnt].start = cma_get_base(res); + crashk_cma_ranges[crashk_cma_cnt].end = + crashk_cma_ranges[crashk_cma_cnt].start + + cma_get_size(res) - 1; + ++crashk_cma_cnt; + reserved_size += request_size; + } + + if (cma_size > reserved_size) + pr_warn("crashkernel CMA reservation failed: %lld MB requested, %lld MB reserved in %d ranges\n", + cma_size >> 20, reserved_size >> 20, crashk_cma_cnt); + else + pr_info("crashkernel CMA reserved: %lld MB in %d ranges\n", + reserved_size >> 20, crashk_cma_cnt); +} + +#else /* CRASHKERNEL_CMA */ +void __init reserve_crashkernel_cma(unsigned long long cma_size) +{ + if (cma_size) + pr_warn("crashkernel CMA reservation not supported\n"); +} +#endif + #ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY static __init int insert_crashkernel_resources(void) { diff --git a/kernel/events/core.c b/kernel/events/core.c index 22fdf0c187cd..8060c2857bb2 100644 --- a/kernel/events/core.c +++ 
b/kernel/events/core.c @@ -6842,10 +6842,20 @@ static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf) return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS; } +static int perf_mmap_may_split(struct vm_area_struct *vma, unsigned long addr) +{ + /* + * Forbid splitting perf mappings to prevent refcount leaks due to + * the resulting non-matching offsets and sizes. See open()/close(). + */ + return -EINVAL; +} + static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, .close = perf_mmap_close, /* non mergeable */ .pfn_mkwrite = perf_mmap_pfn_mkwrite, + .may_split = perf_mmap_may_split, }; static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) @@ -7051,8 +7061,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) ret = 0; goto unlock; } - - atomic_set(&rb->aux_mmap_count, 1); } user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); @@ -7115,15 +7123,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) perf_event_update_time(event); perf_event_init_userpage(event); perf_event_update_userpage(event); + ret = 0; } else { ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, event->attr.aux_watermark, flags); - if (!ret) + if (!ret) { + atomic_set(&rb->aux_mmap_count, 1); rb->aux_mmap_locked = extra; + } } - ret = 0; - unlock: if (!ret) { atomic_long_add(user_extra, &user->locked_vm); @@ -7131,6 +7140,7 @@ unlock: atomic_inc(&event->mmap_count); } else if (rb) { + /* AUX allocation failed */ atomic_dec(&rb->mmap_count); } aux_unlock: @@ -7138,6 +7148,9 @@ aux_unlock: mutex_unlock(aux_mutex); mutex_unlock(&event->mmap_mutex); + if (ret) + return ret; + /* * Since pinned accounting is per vm we cannot allow fork() to copy our * vma. @@ -7145,13 +7158,20 @@ aux_unlock: vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &perf_mmap_vmops; - if (!ret) - ret = map_range(rb, vma); - mapped = get_mapped(event, event_mapped); if (mapped) mapped(event, vma->vm_mm); + /* + * Try to map it into the page table. On fail, invoke + * perf_mmap_close() to undo the above, as the callsite expects + * full cleanup in this case and therefore does not invoke + * vmops::close(). + */ + ret = map_range(rb, vma); + if (ret) + perf_mmap_close(vma); + return ret; } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4c965ba77f9f..7ca1940607bd 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -436,8 +436,7 @@ static int __uprobe_write_opcode(struct vm_area_struct *vma, * there are no unexpected folio references ... */ if (is_register || userfaultfd_missing(vma) || - (folio_ref_count(folio) != folio_mapcount(folio) + 1 + - folio_test_swapcache(folio) * folio_nr_pages(folio))) + (folio_ref_count(folio) != folio_expected_ref_count(folio) + 1)) goto remap; /* @@ -540,7 +539,7 @@ retry: } ret = 0; - if (unlikely(!folio_test_anon(folio))) { + if (unlikely(!folio_test_anon(folio) || folio_is_zone_device(folio))) { VM_WARN_ON_ONCE(is_register); folio_put(folio); goto out; @@ -581,8 +580,8 @@ retry: out: /* Revert back reference counter if instruction update failed. */ - if (ret < 0 && is_register && ref_ctr_updated) - update_ref_ctr(uprobe, mm, -1); + if (ret < 0 && ref_ctr_updated) + update_ref_ctr(uprobe, mm, is_register ? 
-1 : 1); /* try collapse pmd for compound page */ if (ret > 0) diff --git a/kernel/exit.c b/kernel/exit.c index bb184a67ac73..343eb97543d5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -68,6 +68,7 @@ #include <linux/rethook.h> #include <linux/sysfs.h> #include <linux/user_events.h> +#include <linux/unwind_deferred.h> #include <linux/uaccess.h> #include <linux/pidfs.h> @@ -692,12 +693,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, } /* - * This does two things: - * - * A. Make init inherit all the child processes - * B. Check to see if any process groups have become orphaned - * as a result of our exiting, and if they have any stopped - * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) + * Make init inherit all the child processes */ static void forget_original_parent(struct task_struct *father, struct list_head *dead) @@ -938,6 +934,7 @@ void __noreturn do_exit(long code) tsk->exit_code = code; taskstats_exit(tsk, group_dead); + unwind_deferred_task_exit(tsk); trace_sched_process_exit(tsk, group_dead); /* diff --git a/kernel/fork.c b/kernel/fork.c index 52901fe4a3c2..c4ada32598bd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -105,6 +105,7 @@ #include <uapi/linux/pidfd.h> #include <linux/pidfs.h> #include <linux/tick.h> +#include <linux/unwind_deferred.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -188,33 +189,33 @@ static inline void free_task_struct(struct task_struct *tsk) kmem_cache_free(task_struct_cachep, tsk); } -/* - * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a - * kmemcache based allocator. - */ -# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) - -# ifdef CONFIG_VMAP_STACK +#ifdef CONFIG_VMAP_STACK /* * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB * flush. Try to minimize the number of calls by caching stacks. */ #define NR_CACHED_STACKS 2 static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); +/* + * Allocated stacks are cached and later reused by new threads, so memcg + * accounting is performed by the code assigning/releasing stacks to tasks. + * We need a zeroed memory without __GFP_ACCOUNT. 
+ */ +#define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO) struct vm_stack { struct rcu_head rcu; struct vm_struct *stack_vm_area; }; -static bool try_release_thread_stack_to_cache(struct vm_struct *vm) +static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area) { unsigned int i; for (i = 0; i < NR_CACHED_STACKS; i++) { struct vm_struct *tmp = NULL; - if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm)) + if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area)) return true; } return false; @@ -223,11 +224,12 @@ static bool try_release_thread_stack_to_cache(struct vm_struct *vm) static void thread_stack_free_rcu(struct rcu_head *rh) { struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu); + struct vm_struct *vm_area = vm_stack->stack_vm_area; if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area)) return; - vfree(vm_stack); + vfree(vm_area->addr); } static void thread_stack_delayed_free(struct task_struct *tsk) @@ -240,32 +242,32 @@ static void thread_stack_delayed_free(struct task_struct *tsk) static int free_vm_stack_cache(unsigned int cpu) { - struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu); + struct vm_struct **cached_vm_stack_areas = per_cpu_ptr(cached_stacks, cpu); int i; for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *vm_stack = cached_vm_stacks[i]; + struct vm_struct *vm_area = cached_vm_stack_areas[i]; - if (!vm_stack) + if (!vm_area) continue; - vfree(vm_stack->addr); - cached_vm_stacks[i] = NULL; + vfree(vm_area->addr); + cached_vm_stack_areas[i] = NULL; } return 0; } -static int memcg_charge_kernel_stack(struct vm_struct *vm) +static int memcg_charge_kernel_stack(struct vm_struct *vm_area) { int i; int ret; int nr_charged = 0; - BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); + BUG_ON(vm_area->nr_pages != THREAD_SIZE / PAGE_SIZE); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { - ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); + ret = memcg_kmem_charge_page(vm_area->pages[i], GFP_KERNEL, 0); if (ret) goto err; nr_charged++; @@ -273,55 +275,47 @@ static int memcg_charge_kernel_stack(struct vm_struct *vm) return 0; err: for (i = 0; i < nr_charged; i++) - memcg_kmem_uncharge_page(vm->pages[i], 0); + memcg_kmem_uncharge_page(vm_area->pages[i], 0); return ret; } static int alloc_thread_stack_node(struct task_struct *tsk, int node) { - struct vm_struct *vm; + struct vm_struct *vm_area; void *stack; int i; for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *s; - - s = this_cpu_xchg(cached_stacks[i], NULL); - - if (!s) + vm_area = this_cpu_xchg(cached_stacks[i], NULL); + if (!vm_area) continue; /* Reset stack metadata. */ - kasan_unpoison_range(s->addr, THREAD_SIZE); + kasan_unpoison_range(vm_area->addr, THREAD_SIZE); - stack = kasan_reset_tag(s->addr); + stack = kasan_reset_tag(vm_area->addr); /* Clear stale pointers from reused stack. */ memset(stack, 0, THREAD_SIZE); - if (memcg_charge_kernel_stack(s)) { - vfree(s->addr); + if (memcg_charge_kernel_stack(vm_area)) { + vfree(vm_area->addr); return -ENOMEM; } - tsk->stack_vm_area = s; + tsk->stack_vm_area = vm_area; tsk->stack = stack; return 0; } - /* - * Allocated stacks are cached and later reused by new threads, - * so memcg accounting is performed manually on assigning/releasing - * stacks to tasks. Drop __GFP_ACCOUNT. 
- */ stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, - THREADINFO_GFP & ~__GFP_ACCOUNT, + GFP_VMAP_STACK, node, __builtin_return_address(0)); if (!stack) return -ENOMEM; - vm = find_vm_area(stack); - if (memcg_charge_kernel_stack(vm)) { + vm_area = find_vm_area(stack); + if (memcg_charge_kernel_stack(vm_area)) { vfree(stack); return -ENOMEM; } @@ -330,7 +324,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) * free_thread_stack() can be called in interrupt context, * so cache the vm_struct. */ - tsk->stack_vm_area = vm; + tsk->stack_vm_area = vm_area; stack = kasan_reset_tag(stack); tsk->stack = stack; return 0; @@ -345,7 +339,13 @@ static void free_thread_stack(struct task_struct *tsk) tsk->stack_vm_area = NULL; } -# else /* !CONFIG_VMAP_STACK */ +#else /* !CONFIG_VMAP_STACK */ + +/* + * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a + * kmemcache based allocator. + */ +#if THREAD_SIZE >= PAGE_SIZE static void thread_stack_free_rcu(struct rcu_head *rh) { @@ -377,8 +377,7 @@ static void free_thread_stack(struct task_struct *tsk) tsk->stack = NULL; } -# endif /* CONFIG_VMAP_STACK */ -# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */ +#else /* !(THREAD_SIZE >= PAGE_SIZE) */ static struct kmem_cache *thread_stack_cache; @@ -417,7 +416,8 @@ void thread_stack_cache_init(void) BUG_ON(thread_stack_cache == NULL); } -# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */ +#endif /* THREAD_SIZE >= PAGE_SIZE */ +#endif /* CONFIG_VMAP_STACK */ /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; @@ -437,11 +437,11 @@ static struct kmem_cache *mm_cachep; static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { - struct vm_struct *vm = task_stack_vm_area(tsk); + struct vm_struct *vm_area = task_stack_vm_area(tsk); int i; for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) - mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB, + mod_lruvec_page_state(vm_area->pages[i], NR_KERNEL_STACK_KB, account * (PAGE_SIZE / 1024)); } else { void *stack = task_stack_page(tsk); @@ -457,12 +457,12 @@ void exit_task_stack_account(struct task_struct *tsk) account_kernel_stack(tsk, -1); if (IS_ENABLED(CONFIG_VMAP_STACK)) { - struct vm_struct *vm; + struct vm_struct *vm_area; int i; - vm = task_stack_vm_area(tsk); + vm_area = task_stack_vm_area(tsk); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) - memcg_kmem_uncharge_page(vm->pages[i], 0); + memcg_kmem_uncharge_page(vm_area->pages[i], 0); } } @@ -585,9 +585,12 @@ static void check_mm(struct mm_struct *mm) for (i = 0; i < NR_MM_COUNTERS; i++) { long x = percpu_counter_sum(&mm->rss_stat[i]); - if (unlikely(x)) - pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", - mm, resident_page_types[i], x); + if (unlikely(x)) { + pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n", + mm, resident_page_types[i], x, + current->comm, + task_pid_nr(current)); + } } if (mm_pgtables_bytes(mm)) @@ -732,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); + unwind_task_free(tsk); sched_ext_free(tsk); io_uring_free(tsk); cgroup_free(tsk); @@ -2135,6 +2139,8 @@ __latent_entropy struct task_struct *copy_process( p->bpf_ctx = NULL; #endif + unwind_task_init(p); + /* Perform scheduler related setup. Assign this task to a CPU. 
*/ retval = sched_fork(clone_flags, p); if (retval) diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index c9e5dc068e85..896a503dfb29 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -4,79 +4,34 @@ # This script generates an archive consisting of kernel headers # for CONFIG_IKHEADERS. set -e -sfile="$(readlink -f "$0")" -outdir="$(pwd)" tarfile=$1 -tmpdir=$outdir/${tarfile%/*}/.tmp_dir - -dir_list=" -include/ -arch/$SRCARCH/include/ -" - -# Support incremental builds by skipping archive generation -# if timestamps of files being archived are not changed. - -# This block is useful for debugging the incremental builds. -# Uncomment it for debugging. -# if [ ! -f /tmp/iter ]; then iter=1; echo 1 > /tmp/iter; -# else iter=$(($(cat /tmp/iter) + 1)); echo $iter > /tmp/iter; fi -# find $all_dirs -name "*.h" | xargs ls -l > /tmp/ls-$iter - -all_dirs= -if [ "$building_out_of_srctree" ]; then - for d in $dir_list; do - all_dirs="$all_dirs $srctree/$d" - done -fi -all_dirs="$all_dirs $dir_list" - -# include/generated/utsversion.h is ignored because it is generated after this -# script is executed. (utsversion.h is unneeded for kheaders) -# -# When Kconfig regenerates include/generated/autoconf.h, its timestamp is -# updated, but the contents might be still the same. When any CONFIG option is -# changed, Kconfig touches the corresponding timestamp file include/config/*. -# Hence, the md5sum detects the configuration change anyway. We do not need to -# check include/generated/autoconf.h explicitly. -# -# Ignore them for md5 calculation to avoid pointless regeneration. -headers_md5="$(find $all_dirs -name "*.h" -a \ - ! -path include/generated/utsversion.h -a \ - ! -path include/generated/autoconf.h | - xargs ls -l | md5sum | cut -d ' ' -f1)" - -# Any changes to this script will also cause a rebuild of the archive. -this_file_md5="$(ls -l $sfile | md5sum | cut -d ' ' -f1)" -if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi -if [ -f kernel/kheaders.md5 ] && - [ "$(head -n 1 kernel/kheaders.md5)" = "$headers_md5" ] && - [ "$(head -n 2 kernel/kheaders.md5 | tail -n 1)" = "$this_file_md5" ] && - [ "$(tail -n 1 kernel/kheaders.md5)" = "$tarfile_md5" ]; then - exit -fi - -echo " GEN $tarfile" +srclist=$2 +objlist=$3 +timestamp=$4 + +dir=$(dirname "${tarfile}") +tmpdir=${dir}/.tmp_dir +depfile=${dir}/.$(basename "${tarfile}").d + +# generate dependency list. +{ + echo + echo "deps_${tarfile} := \\" + sed 's:\(.*\): \1 \\:' "${srclist}" + sed -n '/^include\/generated\/autoconf\.h$/!s:\(.*\): \1 \\:p' "${objlist}" + echo + echo "${tarfile}: \$(deps_${tarfile})" + echo + echo "\$(deps_${tarfile}):" + +} > "${depfile}" rm -rf "${tmpdir}" mkdir "${tmpdir}" -if [ "$building_out_of_srctree" ]; then - ( - cd $srctree - for f in $dir_list - do find "$f" -name "*.h"; - done | tar -c -f - -T - | tar -xf - -C "${tmpdir}" - ) -fi - -for f in $dir_list; - do find "$f" -name "*.h"; -done | tar -c -f - -T - | tar -xf - -C "${tmpdir}" - -# Always exclude include/generated/utsversion.h -# Otherwise, the contents of the tarball may vary depending on the build steps. 
-rm -f "${tmpdir}/include/generated/utsversion.h" +# shellcheck disable=SC2154 # srctree is passed as an env variable +sed "s:^${srctree}/::" "${srclist}" | ${TAR} -c -f - -C "${srctree}" -T - | ${TAR} -xf - -C "${tmpdir}" +${TAR} -c -f - -T "${objlist}" | ${TAR} -xf - -C "${tmpdir}" # Remove comments except SDPX lines # Use a temporary file to store directory contents to prevent find/xargs from @@ -88,12 +43,8 @@ xargs -0 -P8 -n1 \ rm -f "${tmpdir}.contents.txt" # Create archive and try to normalize metadata for reproducibility. -tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ +${TAR} "${timestamp:+--mtime=$timestamp}" \ --owner=0 --group=0 --sort=name --numeric-owner --mode=u=rw,go=r,a+X \ - -I $XZ -cf $tarfile -C "${tmpdir}/" . > /dev/null - -echo $headers_md5 > kernel/kheaders.md5 -echo "$this_file_md5" >> kernel/kheaders.md5 -echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 + -I "${XZ}" -cf "${tarfile}" -C "${tmpdir}/" . > /dev/null rm -rf "${tmpdir}" diff --git a/kernel/hung_task.c b/kernel/hung_task.c index d2432df2b905..8708a1205f82 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -23,6 +23,7 @@ #include <linux/sched/debug.h> #include <linux/sched/sysctl.h> #include <linux/hung_task.h> +#include <linux/rwsem.h> #include <trace/events/sched.h> @@ -100,6 +101,7 @@ static void debug_show_blocker(struct task_struct *task) { struct task_struct *g, *t; unsigned long owner, blocker, blocker_type; + const char *rwsem_blocked_by, *rwsem_blocked_as; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held"); @@ -111,12 +113,20 @@ static void debug_show_blocker(struct task_struct *task) switch (blocker_type) { case BLOCKER_TYPE_MUTEX: - owner = mutex_get_owner( - (struct mutex *)hung_task_blocker_to_lock(blocker)); + owner = mutex_get_owner(hung_task_blocker_to_lock(blocker)); break; case BLOCKER_TYPE_SEM: - owner = sem_last_holder( - (struct semaphore *)hung_task_blocker_to_lock(blocker)); + owner = sem_last_holder(hung_task_blocker_to_lock(blocker)); + break; + case BLOCKER_TYPE_RWSEM_READER: + case BLOCKER_TYPE_RWSEM_WRITER: + owner = (unsigned long)rwsem_owner( + hung_task_blocker_to_lock(blocker)); + rwsem_blocked_as = (blocker_type == BLOCKER_TYPE_RWSEM_READER) ? + "reader" : "writer"; + rwsem_blocked_by = is_rwsem_reader_owned( + hung_task_blocker_to_lock(blocker)) ? 
+ "reader" : "writer"; break; default: WARN_ON_ONCE(1); @@ -134,6 +144,11 @@ static void debug_show_blocker(struct task_struct *task) pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n", task->comm, task->pid); break; + case BLOCKER_TYPE_RWSEM_READER: + case BLOCKER_TYPE_RWSEM_WRITER: + pr_err("INFO: task %s:%d is blocked on an rw-semaphore, but the owner is not found.\n", + task->comm, task->pid); + break; } return; } @@ -152,6 +167,12 @@ static void debug_show_blocker(struct task_struct *task) pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n", task->comm, task->pid, t->comm, t->pid); break; + case BLOCKER_TYPE_RWSEM_READER: + case BLOCKER_TYPE_RWSEM_WRITER: + pr_err("INFO: task %s:%d <%s> blocked on an rw-semaphore likely owned by task %s:%d <%s>\n", + task->comm, task->pid, rwsem_blocked_as, t->comm, + t->pid, rwsem_blocked_by); + break; } sched_show_task(t); return; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 624106e886ad..0d0276378c70 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -611,7 +611,13 @@ void handle_simple_irq(struct irq_desc *desc) { guard(raw_spinlock)(&desc->lock); - if (!irq_can_handle(desc)) + if (!irq_can_handle_pm(desc)) { + if (irqd_needs_resend_when_in_progress(&desc->irq_data)) + desc->istate |= IRQS_PENDING; + return; + } + + if (!irq_can_handle_actions(desc)) return; kstat_incr_irqs_this_cpu(desc); diff --git a/kernel/kcov.c b/kernel/kcov.c index 187ba1b80bda..1d85597057e1 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -552,7 +552,7 @@ static int kcov_get_mode(unsigned long arg) /* * Fault in a lazily-faulted vmalloc area before it can be used by - * __santizer_cov_trace_pc(), to avoid recursion issues if any code on the + * __sanitizer_cov_trace_pc(), to avoid recursion issues if any code on the * vmalloc fault handling path is instrumented. */ static void kcov_fault_in_area(struct kcov *kcov) diff --git a/kernel/kexec.c b/kernel/kexec.c index a6b3f96bb50c..28008e3d462e 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -152,7 +152,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, goto out; for (i = 0; i < nr_segments; i++) { - ret = kimage_load_segment(image, &image->segment[i]); + ret = kimage_load_segment(image, i); if (ret) goto out; } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 351cd7d76dfa..31203f0bacaf 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -40,6 +40,7 @@ #include <linux/hugetlb.h> #include <linux/objtool.h> #include <linux/kmsg_dump.h> +#include <linux/dma-map-ops.h> #include <asm/page.h> #include <asm/sections.h> @@ -553,6 +554,24 @@ static void kimage_free_entry(kimage_entry_t entry) kimage_free_pages(page); } +static void kimage_free_cma(struct kimage *image) +{ + unsigned long i; + + for (i = 0; i < image->nr_segments; i++) { + struct page *cma = image->segment_cma[i]; + u32 nr_pages = image->segment[i].memsz >> PAGE_SHIFT; + + if (!cma) + continue; + + arch_kexec_pre_free_pages(page_address(cma), nr_pages); + dma_release_from_contiguous(NULL, cma, nr_pages); + image->segment_cma[i] = NULL; + } + +} + void kimage_free(struct kimage *image) { kimage_entry_t *ptr, entry; @@ -591,6 +610,9 @@ void kimage_free(struct kimage *image) /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); + /* Free CMA allocations */ + kimage_free_cma(image); + /* * Free up any temporary buffers allocated. This might hit if * error occurred much later after buffer allocation. 
@@ -716,9 +738,69 @@ static struct page *kimage_alloc_page(struct kimage *image, return page; } -static int kimage_load_normal_segment(struct kimage *image, - struct kexec_segment *segment) +static int kimage_load_cma_segment(struct kimage *image, int idx) +{ + struct kexec_segment *segment = &image->segment[idx]; + struct page *cma = image->segment_cma[idx]; + char *ptr = page_address(cma); + unsigned long maddr; + size_t ubytes, mbytes; + int result = 0; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; + + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; + ubytes = segment->bufsz; + mbytes = segment->memsz; + maddr = segment->mem; + + /* Then copy from source buffer to the CMA one */ + while (mbytes) { + size_t uchunk, mchunk; + + ptr += maddr & ~PAGE_MASK; + mchunk = min_t(size_t, mbytes, + PAGE_SIZE - (maddr & ~PAGE_MASK)); + uchunk = min(ubytes, mchunk); + + if (uchunk) { + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); + ubytes -= uchunk; + if (image->file_mode) + kbuf += uchunk; + else + buf += uchunk; + } + + if (result) { + result = -EFAULT; + goto out; + } + + ptr += mchunk; + maddr += mchunk; + mbytes -= mchunk; + + cond_resched(); + } + + /* Clear any remainder */ + memset(ptr, 0, mbytes); + +out: + return result; +} + +static int kimage_load_normal_segment(struct kimage *image, int idx) { + struct kexec_segment *segment = &image->segment[idx]; unsigned long maddr; size_t ubytes, mbytes; int result; @@ -733,6 +815,9 @@ static int kimage_load_normal_segment(struct kimage *image, mbytes = segment->memsz; maddr = segment->mem; + if (image->segment_cma[idx]) + return kimage_load_cma_segment(image, idx); + result = kimage_set_destination(image, maddr); if (result < 0) goto out; @@ -787,13 +872,13 @@ out: } #ifdef CONFIG_CRASH_DUMP -static int kimage_load_crash_segment(struct kimage *image, - struct kexec_segment *segment) +static int kimage_load_crash_segment(struct kimage *image, int idx) { /* For crash dumps kernels we simply copy the data from * user space to it's destination. * We do things a page at a time for the sake of kmap. 
*/ + struct kexec_segment *segment = &image->segment[idx]; unsigned long maddr; size_t ubytes, mbytes; int result; @@ -858,18 +943,17 @@ out: } #endif -int kimage_load_segment(struct kimage *image, - struct kexec_segment *segment) +int kimage_load_segment(struct kimage *image, int idx) { int result = -ENOMEM; switch (image->type) { case KEXEC_TYPE_DEFAULT: - result = kimage_load_normal_segment(image, segment); + result = kimage_load_normal_segment(image, idx); break; #ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: - result = kimage_load_crash_segment(image, segment); + result = kimage_load_crash_segment(image, idx); break; #endif } diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b835033c65eb..91d46502a817 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -26,6 +26,7 @@ #include <linux/kernel_read_file.h> #include <linux/syscalls.h> #include <linux/vmalloc.h> +#include <linux/dma-map-ops.h> #include "kexec_internal.h" #ifdef CONFIG_KEXEC_SIG @@ -253,6 +254,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, ret = 0; } + image->no_cma = !!(flags & KEXEC_FILE_NO_CMA); + if (cmdline_len) { image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len); if (IS_ERR(image->cmdline_buf)) { @@ -434,7 +437,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, i, ksegment->buf, ksegment->bufsz, ksegment->mem, ksegment->memsz); - ret = kimage_load_segment(image, &image->segment[i]); + ret = kimage_load_segment(image, i); if (ret) goto out; } @@ -663,6 +666,43 @@ static int kexec_walk_resources(struct kexec_buf *kbuf, return walk_system_ram_res(0, ULONG_MAX, kbuf, func); } +static int kexec_alloc_contig(struct kexec_buf *kbuf) +{ + size_t nr_pages = kbuf->memsz >> PAGE_SHIFT; + unsigned long mem; + struct page *p; + + /* User space disabled CMA allocations, bail out. */ + if (kbuf->image->no_cma) + return -EPERM; + + /* Skip CMA logic for crash kernel */ + if (kbuf->image->type == KEXEC_TYPE_CRASH) + return -EPERM; + + p = dma_alloc_from_contiguous(NULL, nr_pages, get_order(kbuf->buf_align), true); + if (!p) + return -ENOMEM; + + pr_debug("allocated %zu DMA pages at 0x%lx", nr_pages, page_to_boot_pfn(p)); + + mem = page_to_boot_pfn(p) << PAGE_SHIFT; + + if (kimage_is_destination_range(kbuf->image, mem, mem + kbuf->memsz)) { + /* Our region is already in use by a statically defined one. Bail out. */ + pr_debug("CMA overlaps existing mem: 0x%lx+0x%lx\n", mem, kbuf->memsz); + dma_release_from_contiguous(NULL, p, nr_pages); + return -EBUSY; + } + + kbuf->mem = page_to_boot_pfn(p) << PAGE_SHIFT; + kbuf->cma = p; + + arch_kexec_post_alloc_pages(page_address(p), (int)nr_pages, 0); + + return 0; +} + /** * kexec_locate_mem_hole - find free memory for the purgatory or the next kernel * @kbuf: Parameters for the memory search. @@ -687,6 +727,13 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) if (ret <= 0) return ret; + /* + * Try to find a free physically contiguous block of memory first. With that, we + * can avoid any copying at kexec time. + */ + if (!kexec_alloc_contig(kbuf)) + return 0; + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); else @@ -732,6 +779,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf) /* Ensure minimum alignment needed for segments. 
*/ kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE); kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE); + kbuf->cma = NULL; /* Walk the RAM ranges and allocate a suitable range for the buffer */ ret = arch_kexec_locate_mem_hole(kbuf); @@ -744,6 +792,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf) ksegment->bufsz = kbuf->bufsz; ksegment->mem = kbuf->mem; ksegment->memsz = kbuf->memsz; + kbuf->image->segment_cma[kbuf->image->nr_segments] = kbuf->cma; kbuf->image->nr_segments++; return 0; } diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index d3b13a909913..e49743ae52c5 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -1100,8 +1100,8 @@ static void __init kho_release_scratch(void) ulong pfn; for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) - set_pageblock_migratetype(pfn_to_page(pfn), - MIGRATE_CMA); + init_pageblock_migratetype(pfn_to_page(pfn), + MIGRATE_CMA, false); } } diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 30a733a55a67..228bb88c018b 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -10,7 +10,7 @@ struct kimage *do_kimage_alloc_init(void); int sanity_check_segment_list(struct kimage *image); void kimage_free_page_list(struct list_head *list); void kimage_free(struct kimage *image); -int kimage_load_segment(struct kimage *image, struct kexec_segment *segment); +int kimage_load_segment(struct kimage *image, int idx); void kimage_terminate(struct kimage *image); int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end); diff --git a/kernel/kthread.c b/kernel/kthread.c index 85fc068f0083..0e98b228a8ef 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -88,13 +88,12 @@ static inline struct kthread *to_kthread(struct task_struct *k) /* * Variant of to_kthread() that doesn't assume @p is a kthread. * - * Per construction; when: + * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will + * always remain a kthread. For kthreads p->worker_private always + * points to a struct kthread. For tasks that are not kthreads + * p->worker_private is used to point to other things. * - * (p->flags & PF_KTHREAD) && p->worker_private - * - * the task is both a kthread and struct kthread is persistent. However - * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and - * begin_new_exec()). + * Return NULL for any task that is not a kthread. */ static inline struct kthread *__to_kthread(struct task_struct *p) { diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 8572dba95af4..24df4d98f7d2 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -27,6 +27,7 @@ #include <linux/export.h> #include <linux/rwsem.h> #include <linux/atomic.h> +#include <linux/hung_task.h> #include <trace/events/lock.h> #ifndef CONFIG_PREEMPT_RT @@ -181,11 +182,11 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) __rwsem_set_reader_owned(sem, current); } -#ifdef CONFIG_DEBUG_RWSEMS +#if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER) /* * Return just the real task structure pointer of the owner */ -static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) +struct task_struct *rwsem_owner(struct rw_semaphore *sem) { return (struct task_struct *) (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK); @@ -194,7 +195,7 @@ static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) /* * Return true if the rwsem is owned by a reader. 
*/ -static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) +bool is_rwsem_reader_owned(struct rw_semaphore *sem) { /* * Check the count to see if it is write-locked. @@ -207,10 +208,10 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) } /* - * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there - * is a task pointer in owner of a reader-owned rwsem, it will be the - * real owner or one of the real owners. The only exception is when the - * unlock is done by up_read_non_owner(). + * With CONFIG_DEBUG_RWSEMS or CONFIG_DETECT_HUNG_TASK_BLOCKER configured, + * it will make sure that the owner field of a reader-owned rwsem either + * points to a real reader-owner(s) or gets cleared. The only exception is + * when the unlock is done by up_read_non_owner(). */ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { @@ -1063,10 +1064,13 @@ queue: wake_up_q(&wake_q); trace_contention_begin(sem, LCB_F_READ); + set_current_state(state); + + if (state == TASK_UNINTERRUPTIBLE) + hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_READER); /* wait to be given the lock */ for (;;) { - set_current_state(state); if (!smp_load_acquire(&waiter.task)) { /* Matches rwsem_mark_wake()'s smp_store_release(). */ break; @@ -1081,8 +1085,12 @@ queue: } schedule_preempt_disabled(); lockevent_inc(rwsem_sleep_reader); + set_current_state(state); } + if (state == TASK_UNINTERRUPTIBLE) + hung_task_clear_blocker(); + __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock); trace_contention_end(sem, 0); @@ -1144,6 +1152,9 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) set_current_state(state); trace_contention_begin(sem, LCB_F_WRITE); + if (state == TASK_UNINTERRUPTIBLE) + hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_WRITER); + for (;;) { if (rwsem_try_write_lock(sem, &waiter)) { /* rwsem_try_write_lock() implies ACQUIRE on success */ @@ -1177,6 +1188,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) trylock_again: raw_spin_lock_irq(&sem->wait_lock); } + + if (state == TASK_UNINTERRUPTIBLE) + hung_task_clear_blocker(); + __set_current_state(TASK_RUNNING); raw_spin_unlock_irq(&sem->wait_lock); lockevent_inc(rwsem_wlock); diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 51ddd8866ef3..618202578b42 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -112,6 +112,13 @@ struct find_symbol_arg { enum mod_license license; }; +/* modules using other modules */ +struct module_use { + struct list_head source_list; + struct list_head target_list; + struct module *source, *target; +}; + int mod_verify_sig(const void *mod, struct load_info *info); int try_to_force_load(struct module *mod, const char *reason); bool find_symbol(struct find_symbol_arg *fsa); diff --git a/kernel/module/main.c b/kernel/module/main.c index 81f9df8859dc..c66b26184936 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -608,7 +608,7 @@ MODINFO_ATTR(version); MODINFO_ATTR(srcversion); static struct { - char name[MODULE_NAME_LEN + 1]; + char name[MODULE_NAME_LEN]; char taints[MODULE_FLAGS_BUF_SIZE]; } last_unloaded_module; @@ -779,14 +779,16 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, struct module *mod; char name[MODULE_NAME_LEN]; char buf[MODULE_FLAGS_BUF_SIZE]; - int ret, forced = 0; + int ret, len, forced = 0; if (!capable(CAP_SYS_MODULE) || modules_disabled) return -EPERM; - if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0) - return -EFAULT; - name[MODULE_NAME_LEN-1] = 
'\0'; + len = strncpy_from_user(name, name_user, MODULE_NAME_LEN); + if (len == 0 || len == MODULE_NAME_LEN) + return -ENOENT; + if (len < 0) + return len; audit_log_kern_module(name); @@ -1320,20 +1322,11 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type) else execmem_type = EXECMEM_MODULE_TEXT; - ptr = execmem_alloc(execmem_type, size); + ptr = execmem_alloc_rw(execmem_type, size); if (!ptr) return -ENOMEM; - if (execmem_is_rox(execmem_type)) { - int err = execmem_make_temp_rw(ptr, size); - - if (err) { - execmem_free(ptr); - return -ENOMEM; - } - - mod->mem[type].is_rox = true; - } + mod->mem[type].is_rox = execmem_is_rox(execmem_type); /* * The pointer to these blocks of memory are stored on the module diff --git a/kernel/padata.c b/kernel/padata.c index 7eee94166357..f85f8bd788d0 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -63,17 +63,6 @@ static inline void padata_put_pd(struct parallel_data *pd) padata_put_pd_cnt(pd, 1); } -static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) -{ - int cpu, target_cpu; - - target_cpu = cpumask_first(pd->cpumask.pcpu); - for (cpu = 0; cpu < cpu_index; cpu++) - target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu); - - return target_cpu; -} - static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr) { /* @@ -82,7 +71,7 @@ static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr) */ int cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu); - return padata_index_to_cpu(pd, cpu_index); + return cpumask_nth(cpu_index, pd->cpumask.pcpu); } static struct padata_work *padata_work_alloc(void) @@ -192,9 +181,9 @@ int padata_do_parallel(struct padata_shell *ps, struct padata_priv *padata, int *cb_cpu) { struct padata_instance *pinst = ps->pinst; - int i, cpu, cpu_index, err; struct parallel_data *pd; struct padata_work *pw; + int cpu_index, err; rcu_read_lock_bh(); @@ -210,12 +199,7 @@ int padata_do_parallel(struct padata_shell *ps, /* Select an alternate fallback CPU and notify the caller. */ cpu_index = *cb_cpu % cpumask_weight(pd->cpumask.cbcpu); - - cpu = cpumask_first(pd->cpumask.cbcpu); - for (i = 0; i < cpu_index; i++) - cpu = cpumask_next(cpu, pd->cpumask.cbcpu); - - *cb_cpu = cpu; + *cb_cpu = cpumask_nth(cpu_index, pd->cpumask.cbcpu); } err = -EBUSY; @@ -261,20 +245,17 @@ EXPORT_SYMBOL(padata_do_parallel); * be parallel processed by another cpu and is not yet present in * the cpu's reorder queue. */ -static struct padata_priv *padata_find_next(struct parallel_data *pd, - bool remove_object) +static struct padata_priv *padata_find_next(struct parallel_data *pd, int cpu, + unsigned int processed) { struct padata_priv *padata; struct padata_list *reorder; - int cpu = pd->cpu; reorder = per_cpu_ptr(pd->reorder_list, cpu); spin_lock(&reorder->lock); - if (list_empty(&reorder->list)) { - spin_unlock(&reorder->lock); - return NULL; - } + if (list_empty(&reorder->list)) + goto notfound; padata = list_entry(reorder->list.next, struct padata_priv, list); @@ -282,97 +263,52 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd, * Checks the rare case where two or more parallel jobs have hashed to * the same CPU and one of the later ones finishes first. 
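For orientation only (an illustrative client, not part of the patch): padata users embed a struct padata_priv in their own request, submit it with padata_do_parallel(), and hand each finished object to padata_do_serial(), which feeds the reorder path being reworked in the hunks around here. The names my_request/my_parallel/my_serial/my_submit are made up; the padata calls and callback signatures are the ones visible in this file.

#include <linux/padata.h>
#include <linux/slab.h>

/* Hypothetical job wrapper: embed padata_priv as users like pcrypt do. */
struct my_request {
	struct padata_priv padata;
	/* ... job payload ... */
};

/* Runs in parallel on some CPU of the pcpu mask. */
static void my_parallel(struct padata_priv *padata)
{
	/* ... heavy work ... */
	padata_do_serial(padata);	/* hand the result to serialization */
}

/* Runs on the callback CPU, strictly in submission (seq_nr) order. */
static void my_serial(struct padata_priv *padata)
{
	struct my_request *req = container_of(padata, struct my_request, padata);

	kfree(req);
}

static int my_submit(struct padata_shell *ps, struct my_request *req, int *cb_cpu)
{
	req->padata.parallel = my_parallel;
	req->padata.serial = my_serial;
	return padata_do_parallel(ps, &req->padata, cb_cpu);
}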
*/ - if (padata->seq_nr != pd->processed) { - spin_unlock(&reorder->lock); - return NULL; - } - - if (remove_object) { - list_del_init(&padata->list); - ++pd->processed; - pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu); - } + if (padata->seq_nr != processed) + goto notfound; + list_del_init(&padata->list); spin_unlock(&reorder->lock); return padata; + +notfound: + pd->processed = processed; + pd->cpu = cpu; + spin_unlock(&reorder->lock); + return NULL; } -static void padata_reorder(struct parallel_data *pd) +static void padata_reorder(struct padata_priv *padata) { + struct parallel_data *pd = padata->pd; struct padata_instance *pinst = pd->ps->pinst; - int cb_cpu; - struct padata_priv *padata; - struct padata_serial_queue *squeue; - struct padata_list *reorder; + unsigned int processed; + int cpu; - /* - * We need to ensure that only one cpu can work on dequeueing of - * the reorder queue the time. Calculating in which percpu reorder - * queue the next object will arrive takes some time. A spinlock - * would be highly contended. Also it is not clear in which order - * the objects arrive to the reorder queues. So a cpu could wait to - * get the lock just to notice that there is nothing to do at the - * moment. Therefore we use a trylock and let the holder of the lock - * care for all the objects enqueued during the holdtime of the lock. - */ - if (!spin_trylock_bh(&pd->lock)) - return; + processed = pd->processed; + cpu = pd->cpu; - while (1) { - padata = padata_find_next(pd, true); + do { + struct padata_serial_queue *squeue; + int cb_cpu; - /* - * If the next object that needs serialization is parallel - * processed by another cpu and is still on it's way to the - * cpu's reorder queue, nothing to do for now. - */ - if (!padata) - break; + cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu); + processed++; cb_cpu = padata->cb_cpu; squeue = per_cpu_ptr(pd->squeue, cb_cpu); spin_lock(&squeue->serial.lock); list_add_tail(&padata->list, &squeue->serial.list); - spin_unlock(&squeue->serial.lock); - queue_work_on(cb_cpu, pinst->serial_wq, &squeue->work); - } - - spin_unlock_bh(&pd->lock); - - /* - * The next object that needs serialization might have arrived to - * the reorder queues in the meantime. - * - * Ensure reorder queue is read after pd->lock is dropped so we see - * new objects from another task in padata_do_serial. Pairs with - * smp_mb in padata_do_serial. - */ - smp_mb(); - reorder = per_cpu_ptr(pd->reorder_list, pd->cpu); - if (!list_empty(&reorder->list) && padata_find_next(pd, false)) { /* - * Other context(eg. the padata_serial_worker) can finish the request. - * To avoid UAF issue, add pd ref here, and put pd ref after reorder_work finish. + * If the next object that needs serialization is parallel + * processed by another cpu and is still on it's way to the + * cpu's reorder queue, end the loop. 
*/ - padata_get_pd(pd); - if (!queue_work(pinst->serial_wq, &pd->reorder_work)) - padata_put_pd(pd); - } -} - -static void invoke_padata_reorder(struct work_struct *work) -{ - struct parallel_data *pd; - - local_bh_disable(); - pd = container_of(work, struct parallel_data, reorder_work); - padata_reorder(pd); - local_bh_enable(); - /* Pairs with putting the reorder_work in the serial_wq */ - padata_put_pd(pd); + padata = padata_find_next(pd, cpu, processed); + spin_unlock(&squeue->serial.lock); + } while (padata); } static void padata_serial_worker(struct work_struct *serial_work) @@ -423,6 +359,7 @@ void padata_do_serial(struct padata_priv *padata) struct padata_list *reorder = per_cpu_ptr(pd->reorder_list, hashed_cpu); struct padata_priv *cur; struct list_head *pos; + bool gotit = true; spin_lock(&reorder->lock); /* Sort in ascending order of sequence number. */ @@ -432,17 +369,14 @@ void padata_do_serial(struct padata_priv *padata) if ((signed int)(cur->seq_nr - padata->seq_nr) < 0) break; } - list_add(&padata->list, pos); + if (padata->seq_nr != pd->processed) { + gotit = false; + list_add(&padata->list, pos); + } spin_unlock(&reorder->lock); - /* - * Ensure the addition to the reorder list is ordered correctly - * with the trylock of pd->lock in padata_reorder. Pairs with smp_mb - * in padata_reorder. - */ - smp_mb(); - - padata_reorder(pd); + if (gotit) + padata_reorder(padata); } EXPORT_SYMBOL(padata_do_serial); @@ -632,9 +566,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) padata_init_squeues(pd); pd->seq_nr = -1; refcount_set(&pd->refcnt, 1); - spin_lock_init(&pd->lock); pd->cpu = cpumask_first(pd->cpumask.pcpu); - INIT_WORK(&pd->reorder_work, invoke_padata_reorder); return pd; @@ -1144,12 +1076,6 @@ void padata_free_shell(struct padata_shell *ps) if (!ps) return; - /* - * Wait for all _do_serial calls to finish to avoid touching - * freed pd's and ps's. 
- */ - synchronize_rcu(); - mutex_lock(&ps->pinst->lock); list_del(&ps->list); pd = rcu_dereference_protected(ps->pd, 1); diff --git a/kernel/panic.c b/kernel/panic.c index 43817111c979..72fcbb5a071b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -36,6 +36,7 @@ #include <linux/sysfs.h> #include <linux/context_tracking.h> #include <linux/seq_buf.h> +#include <linux/sys_info.h> #include <trace/events/error_report.h> #include <asm/sections.h> @@ -63,20 +64,13 @@ int panic_on_warn __read_mostly; unsigned long panic_on_taint; bool panic_on_taint_nousertaint = false; static unsigned int warn_limit __read_mostly; +static bool panic_console_replay; bool panic_triggering_all_cpu_backtrace; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); -#define PANIC_PRINT_TASK_INFO 0x00000001 -#define PANIC_PRINT_MEM_INFO 0x00000002 -#define PANIC_PRINT_TIMER_INFO 0x00000004 -#define PANIC_PRINT_LOCK_INFO 0x00000008 -#define PANIC_PRINT_FTRACE_INFO 0x00000010 -#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020 -#define PANIC_PRINT_ALL_CPU_BT 0x00000040 -#define PANIC_PRINT_BLOCKED_TASKS 0x00000080 unsigned long panic_print; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -128,6 +122,13 @@ static int proc_taint(const struct ctl_table *table, int write, return err; } +static int sysctl_panic_print_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + pr_info_once("Kernel: 'panic_print' sysctl interface will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n"); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); +} + static const struct ctl_table kern_panic_table[] = { #ifdef CONFIG_SMP { @@ -165,7 +166,7 @@ static const struct ctl_table kern_panic_table[] = { .data = &panic_print, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_doulongvec_minmax, + .proc_handler = sysctl_panic_print_handler, }, { .procname = "panic_on_warn", @@ -193,6 +194,13 @@ static const struct ctl_table kern_panic_table[] = { .proc_handler = proc_dointvec, }, #endif + { + .procname = "panic_sys_info", + .data = &panic_print, + .maxlen = sizeof(panic_print), + .mode = 0644, + .proc_handler = sysctl_sys_info_handler, + }, }; static __init int kernel_panic_sysctls_init(void) @@ -203,6 +211,15 @@ static __init int kernel_panic_sysctls_init(void) late_initcall(kernel_panic_sysctls_init); #endif +/* The format is "panic_sys_info=tasks,mem,locks,ftrace,..." 
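For illustration only, not part of the patch: the parameter value is a comma-separated list of info-dump names. A toy userspace parser in the spirit of sys_info_parse_param(), restricted to the names shown in the comment above and using made-up bit positions (the real flags live in <linux/sys_info.h>):

#include <stdio.h>
#include <string.h>

static unsigned long parse_sys_info(char *buf)
{
	static const char * const names[] = { "tasks", "mem", "locks", "ftrace" };
	unsigned long mask = 0;
	char *tok;

	for (tok = strtok(buf, ","); tok; tok = strtok(NULL, ","))
		for (unsigned int i = 0; i < sizeof(names) / sizeof(names[0]); i++)
			if (!strcmp(tok, names[i]))
				mask |= 1UL << i;
	return mask;
}

int main(void)
{
	char arg[] = "tasks,mem";

	printf("panic_sys_info=%s -> mask 0x%lx\n", arg, parse_sys_info(arg));
	return 0;
}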
*/ +static int __init setup_panic_sys_info(char *buf) +{ + /* There is no risk of race in kernel boot phase */ + panic_print = sys_info_parse_param(buf); + return 1; +} +__setup("panic_sys_info=", setup_panic_sys_info); + static atomic_t warn_count = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS @@ -298,33 +315,6 @@ void nmi_panic(struct pt_regs *regs, const char *msg) } EXPORT_SYMBOL(nmi_panic); -static void panic_print_sys_info(bool console_flush) -{ - if (console_flush) { - if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG) - console_flush_on_panic(CONSOLE_REPLAY_ALL); - return; - } - - if (panic_print & PANIC_PRINT_TASK_INFO) - show_state(); - - if (panic_print & PANIC_PRINT_MEM_INFO) - show_mem(); - - if (panic_print & PANIC_PRINT_TIMER_INFO) - sysrq_timer_list_show(); - - if (panic_print & PANIC_PRINT_LOCK_INFO) - debug_show_all_locks(); - - if (panic_print & PANIC_PRINT_FTRACE_INFO) - ftrace_dump(DUMP_ALL); - - if (panic_print & PANIC_PRINT_BLOCKED_TASKS) - show_state_filter(TASK_UNINTERRUPTIBLE); -} - void check_panic_on_warn(const char *origin) { unsigned int limit; @@ -345,7 +335,7 @@ void check_panic_on_warn(const char *origin) */ static void panic_other_cpus_shutdown(bool crash_kexec) { - if (panic_print & PANIC_PRINT_ALL_CPU_BT) { + if (panic_print & SYS_INFO_ALL_CPU_BT) { /* Temporary allow non-panic CPUs to write their backtraces. */ panic_triggering_all_cpu_backtrace = true; trigger_all_cpu_backtrace(); @@ -468,7 +458,7 @@ void vpanic(const char *fmt, va_list args) */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); - panic_print_sys_info(false); + sys_info(panic_print); kmsg_dump_desc(KMSG_DUMP_PANIC, buf); @@ -497,7 +487,9 @@ void vpanic(const char *fmt, va_list args) debug_locks_off(); console_flush_on_panic(CONSOLE_FLUSH_PENDING); - panic_print_sys_info(true); + if ((panic_print & SYS_INFO_PANIC_CONSOLE_REPLAY) || + panic_console_replay) + console_flush_on_panic(CONSOLE_REPLAY_ALL); if (!panic_blink) panic_blink = no_blink; @@ -949,6 +941,7 @@ core_param(panic_print, panic_print, ulong, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644); +core_param(panic_console_replay, panic_console_replay, bool, 0644); static int __init oops_setup(char *s) { diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index bbed41ad29cf..ef282001f200 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -64,6 +64,7 @@ struct dev_printk_info; extern struct printk_ringbuffer *prb; extern bool printk_kthreads_running; +extern bool printk_kthreads_ready; extern bool debug_non_panic_cpus; __printf(4, 0) @@ -179,6 +180,7 @@ static inline void nbcon_kthread_wake(struct console *con) #define PRINTKRB_RECORD_MAX 0 #define printk_kthreads_running (false) +#define printk_kthreads_ready (false) /* * In !PRINTK builds we still export console_sem diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index fd12efcc4aed..646801813415 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -214,8 +214,9 @@ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq) /** * nbcon_context_try_acquire_direct - Try to acquire directly - * @ctxt: The context of the caller - * @cur: The current console state + * @ctxt: The context of the caller + * @cur: The current console state + * @is_reacquire: This acquire is a reacquire * * Acquire the console when it is released. 
Also acquire the console when * the current owner has a lower priority and the console is in a safe state. @@ -225,17 +226,17 @@ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq) * * Errors: * - * -EPERM: A panic is in progress and this is not the panic CPU. - * Or the current owner or waiter has the same or higher - * priority. No acquire method can be successful in - * this case. + * -EPERM: A panic is in progress and this is neither the panic + * CPU nor is this a reacquire. Or the current owner or + * waiter has the same or higher priority. No acquire + * method can be successful in these cases. * * -EBUSY: The current owner has a lower priority but the console * in an unsafe state. The caller should try using * the handover acquire method. */ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, - struct nbcon_state *cur) + struct nbcon_state *cur, bool is_reacquire) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; @@ -243,14 +244,20 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, do { /* - * Panic does not imply that the console is owned. However, it - * is critical that non-panic CPUs during panic are unable to - * acquire ownership in order to satisfy the assumptions of - * nbcon_waiter_matches(). In particular, the assumption that - * lower priorities are ignored during panic. + * Panic does not imply that the console is owned. However, + * since all non-panic CPUs are stopped during panic(), it + * is safer to have them avoid gaining console ownership. + * + * If this acquire is a reacquire (and an unsafe takeover + * has not previously occurred) then it is allowed to attempt + * a direct acquire in panic. This gives console drivers an + * opportunity to perform any necessary cleanup if they were + * interrupted by the panic CPU while printing. */ - if (other_cpu_in_panic()) + if (other_cpu_in_panic() && + (!is_reacquire || cur->unsafe_takeover)) { return -EPERM; + } if (ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio) return -EPERM; @@ -301,8 +308,9 @@ static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio) * Event #1 implies this context is EMERGENCY. * Event #2 implies the new context is PANIC. * Event #3 occurs when panic() has flushed the console. - * Events #4 and #5 are not possible due to the other_cpu_in_panic() - * check in nbcon_context_try_acquire_direct(). + * Event #4 occurs when a non-panic CPU reacquires. + * Event #5 is not possible due to the other_cpu_in_panic() check + * in nbcon_context_try_acquire_handover(). */ return (cur->req_prio == expected_prio); @@ -431,6 +439,16 @@ static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt, WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio); WARN_ON_ONCE(!cur->unsafe); + /* + * Panic does not imply that the console is owned. However, it + * is critical that non-panic CPUs during panic are unable to + * wait for a handover in order to satisfy the assumptions of + * nbcon_waiter_matches(). In particular, the assumption that + * lower priorities are ignored during panic. + */ + if (other_cpu_in_panic()) + return -EPERM; + /* Handover is not possible on the same CPU. 
*/ if (cur->cpu == cpu) return -EBUSY; @@ -558,7 +576,8 @@ static struct printk_buffers panic_nbcon_pbufs; /** * nbcon_context_try_acquire - Try to acquire nbcon console - * @ctxt: The context of the caller + * @ctxt: The context of the caller + * @is_reacquire: This acquire is a reacquire * * Context: Under @ctxt->con->device_lock() or local_irq_save(). * Return: True if the console was acquired. False otherwise. @@ -568,7 +587,7 @@ static struct printk_buffers panic_nbcon_pbufs; * in an unsafe state. Otherwise, on success the caller may assume * the console is not in an unsafe state. */ -static bool nbcon_context_try_acquire(struct nbcon_context *ctxt) +static bool nbcon_context_try_acquire(struct nbcon_context *ctxt, bool is_reacquire) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; @@ -577,7 +596,7 @@ static bool nbcon_context_try_acquire(struct nbcon_context *ctxt) nbcon_state_read(con, &cur); try_again: - err = nbcon_context_try_acquire_direct(ctxt, &cur); + err = nbcon_context_try_acquire_direct(ctxt, &cur, is_reacquire); if (err != -EBUSY) goto out; @@ -913,7 +932,7 @@ void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); - while (!nbcon_context_try_acquire(ctxt)) + while (!nbcon_context_try_acquire(ctxt, true)) cpu_relax(); nbcon_write_context_set_buf(wctxt, NULL, 0); @@ -1101,7 +1120,7 @@ static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic) cant_migrate(); } - if (!nbcon_context_try_acquire(ctxt)) + if (!nbcon_context_try_acquire(ctxt, false)) goto out; /* @@ -1486,7 +1505,7 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, ctxt->prio = nbcon_get_default_prio(); ctxt->allow_unsafe_takeover = allow_unsafe_takeover; - if (!nbcon_context_try_acquire(ctxt)) + if (!nbcon_context_try_acquire(ctxt, false)) return -EPERM; while (nbcon_seq_read(con) < stop_seq) { @@ -1671,6 +1690,9 @@ bool nbcon_alloc(struct console *con) { struct nbcon_state state = { }; + /* Synchronize the kthread start. */ + lockdep_assert_console_list_lock_held(); + /* The write_thread() callback is mandatory. */ if (WARN_ON(!con->write_thread)) return false; @@ -1701,12 +1723,15 @@ bool nbcon_alloc(struct console *con) return false; } - if (printk_kthreads_running) { + if (printk_kthreads_ready && !have_boot_console) { if (!nbcon_kthread_create(con)) { kfree(con->pbufs); con->pbufs = NULL; return false; } + + /* Might be the first kthread. */ + printk_kthreads_running = true; } } @@ -1716,14 +1741,30 @@ bool nbcon_alloc(struct console *con) /** * nbcon_free - Free and cleanup the nbcon console specific data * @con: Console to free/cleanup nbcon data + * + * Important: @have_nbcon_console must be updated before calling + * this function. In particular, it can be set only when there + * is still another nbcon console registered. */ void nbcon_free(struct console *con) { struct nbcon_state state = { }; - if (printk_kthreads_running) + /* Synchronize the kthread stop. */ + lockdep_assert_console_list_lock_held(); + + if (printk_kthreads_running) { nbcon_kthread_stop(con); + /* Might be the last nbcon console. + * + * Do not rely on printk_kthreads_check_locked(). It is not + * called in some code paths, see nbcon_free() callers. + */ + if (!have_nbcon_console) + printk_kthreads_running = false; + } + nbcon_state_set(con, &state); /* Boot consoles share global printk buffers. 
*/ @@ -1762,7 +1803,7 @@ bool nbcon_device_try_acquire(struct console *con) ctxt->console = con; ctxt->prio = NBCON_PRIO_NORMAL; - if (!nbcon_context_try_acquire(ctxt)) + if (!nbcon_context_try_acquire(ctxt, false)) return false; if (!nbcon_context_enter_unsafe(ctxt)) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1eea80d0648e..0efbcdda9aab 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3574,7 +3574,7 @@ EXPORT_SYMBOL(console_resume); static int unregister_console_locked(struct console *console); /* True when system boot is far enough to create printer threads. */ -static bool printk_kthreads_ready __ro_after_init; +bool printk_kthreads_ready __ro_after_init; static struct task_struct *printk_legacy_kthread; @@ -3713,6 +3713,7 @@ static void printk_kthreads_check_locked(void) if (!printk_kthreads_ready) return; + /* Start or stop the legacy kthread when needed. */ if (have_legacy_console || have_boot_console) { if (!printk_legacy_kthread && force_legacy_kthread() && @@ -4204,14 +4205,6 @@ static int unregister_console_locked(struct console *console) */ synchronize_srcu(&console_srcu); - if (console->flags & CON_NBCON) - nbcon_free(console); - - console_sysfs_notify(); - - if (console->exit) - res = console->exit(console); - /* * With this console gone, the global flags tracking registered * console types may have changed. Update them. @@ -4232,6 +4225,15 @@ static int unregister_console_locked(struct console *console) if (!found_nbcon_con) have_nbcon_console = found_nbcon_con; + /* @have_nbcon_console must be updated before calling nbcon_free(). */ + if (console->flags & CON_NBCON) + nbcon_free(console); + + console_sysfs_notify(); + + if (console->exit) + res = console->exit(console); + /* Changed console list, may require printer threads to start/stop. */ printk_kthreads_check_locked(); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 2a5a5329322d..d16afeb11506 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -163,6 +163,13 @@ static void panic_on_rcu_stall(void) { static int cpu_stall; + /* + * Attempt to kick out the BPF scheduler if it's installed and defer + * the panic to give the system a chance to recover. 
+ */ + if (scx_rcu_cpu_stall()) + return; + if (++cpu_stall < sysctl_max_rcu_stall_to_panic) return; diff --git a/kernel/relay.c b/kernel/relay.c index c0c93a04d4ce..8d915fe98198 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -118,7 +118,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) return NULL; for (i = 0; i < n_pages; i++) { - buf->page_array[i] = alloc_page(GFP_KERNEL); + buf->page_array[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); if (unlikely(!buf->page_array[i])) goto depopulate; set_page_private(buf->page_array[i], (unsigned long)buf); @@ -127,7 +127,6 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) if (!mem) goto depopulate; - memset(mem, 0, *size); buf->page_count = n_pages; return mem; @@ -250,13 +249,18 @@ EXPORT_SYMBOL_GPL(relay_buf_full); */ static int relay_subbuf_start(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) + void *prev_subbuf) { + int full = relay_buf_full(buf); + + if (full) + buf->stats.full_count++; + if (!buf->chan->cb->subbuf_start) - return !relay_buf_full(buf); + return !full; return buf->chan->cb->subbuf_start(buf, subbuf, - prev_subbuf, prev_padding); + prev_subbuf); } /** @@ -298,11 +302,13 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) buf->finalized = 0; buf->data = buf->start; buf->offset = 0; + buf->stats.full_count = 0; + buf->stats.big_count = 0; for (i = 0; i < buf->chan->n_subbufs; i++) buf->padding[i] = 0; - relay_subbuf_start(buf, buf->data, NULL, 0); + relay_subbuf_start(buf, buf->data, NULL); } /** @@ -555,9 +561,11 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) goto toobig; if (buf->offset != buf->chan->subbuf_size + 1) { - buf->prev_padding = buf->chan->subbuf_size - buf->offset; + size_t prev_padding; + + prev_padding = buf->chan->subbuf_size - buf->offset; old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; - buf->padding[old_subbuf] = buf->prev_padding; + buf->padding[old_subbuf] = prev_padding; buf->subbufs_produced++; if (buf->dentry) d_inode(buf->dentry)->i_size += @@ -582,7 +590,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; new = buf->start + new_subbuf * buf->chan->subbuf_size; buf->offset = 0; - if (!relay_subbuf_start(buf, new, old, buf->prev_padding)) { + if (!relay_subbuf_start(buf, new, old)) { buf->offset = buf->chan->subbuf_size + 1; return 0; } @@ -595,7 +603,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) return length; toobig: - buf->chan->last_toobig = length; + buf->stats.big_count++; return 0; } EXPORT_SYMBOL_GPL(relay_switch_subbuf); @@ -655,11 +663,6 @@ void relay_close(struct rchan *chan) if ((buf = *per_cpu_ptr(chan->buf, i))) relay_close_buf(buf); - if (chan->last_toobig) - printk(KERN_WARNING "relay: one or more items not logged " - "[item size (%zd) > sub-buffer size (%zd)]\n", - chan->last_toobig, chan->subbuf_size); - list_del(&chan->list); kref_put(&chan->kref, relay_destroy_channel); mutex_unlock(&relay_channels_mutex); @@ -694,6 +697,42 @@ void relay_flush(struct rchan *chan) EXPORT_SYMBOL_GPL(relay_flush); /** + * relay_stats - get channel buffer statistics + * @chan: the channel + * @flags: select particular information to get + * + * Returns the count of certain field that caller specifies. 
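Client-side illustration, not part of the patch: with this change a relay client's subbuf_start callback loses the prev_padding argument, and dropped or oversized writes are queried via relay_stats() instead of being reported by the old last_toobig warning in relay_close(). A minimal sketch, assuming the matching linux/relay.h updates from this series; my_subbuf_start/my_client_close are hypothetical names:

#include <linux/relay.h>
#include <linux/printk.h>

/* Hypothetical callback: keep the default policy of dropping writes while
 * all sub-buffers are full.
 */
static int my_subbuf_start(struct rchan_buf *buf, void *subbuf, void *prev_subbuf)
{
	return !relay_buf_full(buf);
}

static const struct rchan_callbacks my_relay_cb = {
	.subbuf_start	= my_subbuf_start,
	/* .create_buf_file / .remove_buf_file omitted from this sketch */
};

/* Hypothetical teardown: report what was lost before closing the channel. */
static void my_client_close(struct rchan *chan)
{
	size_t full = relay_stats(chan, RELAY_STATS_BUF_FULL);
	size_t big = relay_stats(chan, RELAY_STATS_WRT_BIG);

	if (full || big)
		pr_warn("relay client: %zu buffer-full drops, %zu oversized writes\n",
			full, big);
	relay_close(chan);
}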
+ */ +size_t relay_stats(struct rchan *chan, int flags) +{ + unsigned int i, count = 0; + struct rchan_buf *rbuf; + + if (!chan || flags > RELAY_STATS_LAST) + return 0; + + if (chan->is_global) { + rbuf = *per_cpu_ptr(chan->buf, 0); + if (flags & RELAY_STATS_BUF_FULL) + count = rbuf->stats.full_count; + else if (flags & RELAY_STATS_WRT_BIG) + count = rbuf->stats.big_count; + } else { + for_each_online_cpu(i) { + rbuf = *per_cpu_ptr(chan->buf, i); + if (rbuf) { + if (flags & RELAY_STATS_BUF_FULL) + count += rbuf->stats.full_count; + else if (flags & RELAY_STATS_WRT_BIG) + count += rbuf->stats.big_count; + } + } + } + + return count; +} + +/** * relay_file_open - open file op for relay files * @inode: the inode * @filp: the file diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 66d93a872968..be00629f0ba4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9815,7 +9815,9 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) return 0; } +#endif /* CONFIG_CFS_BANDWIDTH */ +#ifdef CONFIG_GROUP_SCHED_BANDWIDTH const u64 max_bw_quota_period_us = 1 * USEC_PER_SEC; /* 1s */ static const u64 min_bw_quota_period_us = 1 * USEC_PER_MSEC; /* 1ms */ /* More than 203 days if BW_SHIFT equals 20. */ @@ -9824,12 +9826,21 @@ static const u64 max_bw_runtime_us = MAX_BW; static void tg_bandwidth(struct task_group *tg, u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p) { +#ifdef CONFIG_CFS_BANDWIDTH if (period_us_p) *period_us_p = tg_get_cfs_period(tg); if (quota_us_p) *quota_us_p = tg_get_cfs_quota(tg); if (burst_us_p) *burst_us_p = tg_get_cfs_burst(tg); +#else /* !CONFIG_CFS_BANDWIDTH */ + if (period_us_p) + *period_us_p = tg->scx.bw_period_us; + if (quota_us_p) + *quota_us_p = tg->scx.bw_quota_us; + if (burst_us_p) + *burst_us_p = tg->scx.bw_burst_us; +#endif /* CONFIG_CFS_BANDWIDTH */ } static u64 cpu_period_read_u64(struct cgroup_subsys_state *css, @@ -9845,6 +9856,7 @@ static int tg_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us) { const u64 max_usec = U64_MAX / NSEC_PER_USEC; + int ret = 0; if (tg == &root_task_group) return -EINVAL; @@ -9882,7 +9894,12 @@ static int tg_set_bandwidth(struct task_group *tg, burst_us + quota_us > max_bw_runtime_us)) return -EINVAL; - return tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us); +#ifdef CONFIG_CFS_BANDWIDTH + ret = tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us); +#endif /* CONFIG_CFS_BANDWIDTH */ + if (!ret) + scx_group_set_bandwidth(tg, period_us, quota_us, burst_us); + return ret; } static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css, @@ -9935,7 +9952,7 @@ static int cpu_burst_write_u64(struct cgroup_subsys_state *css, tg_bandwidth(tg, &period_us, "a_us, NULL); return tg_set_bandwidth(tg, period_us, quota_us, burst_us); } -#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */ #ifdef CONFIG_RT_GROUP_SCHED static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, @@ -9995,7 +10012,7 @@ static struct cftype cpu_legacy_files[] = { .write_s64 = cpu_idle_write_s64, }, #endif -#ifdef CONFIG_CFS_BANDWIDTH +#ifdef CONFIG_GROUP_SCHED_BANDWIDTH { .name = "cfs_period_us", .read_u64 = cpu_period_read_u64, @@ -10011,6 +10028,8 @@ static struct cftype cpu_legacy_files[] = { .read_u64 = cpu_burst_read_u64, .write_u64 = cpu_burst_write_u64, }, +#endif +#ifdef CONFIG_CFS_BANDWIDTH { .name = "stat", .seq_show = cpu_cfs_stat_show, @@ -10224,7 +10243,7 @@ static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *period_us_p, return 0; } -#ifdef 
CONFIG_CFS_BANDWIDTH +#ifdef CONFIG_GROUP_SCHED_BANDWIDTH static int cpu_max_show(struct seq_file *sf, void *v) { struct task_group *tg = css_tg(seq_css(sf)); @@ -10271,7 +10290,7 @@ static struct cftype cpu_files[] = { .write_s64 = cpu_idle_write_s64, }, #endif -#ifdef CONFIG_CFS_BANDWIDTH +#ifdef CONFIG_GROUP_SCHED_BANDWIDTH { .name = "max", .flags = CFTYPE_NOT_ON_ROOT, diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7dd5cbcb7a06..7dedc9a16281 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -203,6 +203,11 @@ struct scx_exit_task_args { struct scx_cgroup_init_args { /* the weight of the cgroup [1..10000] */ u32 weight; + + /* bandwidth control parameters from cpu.max and cpu.max.burst */ + u64 bw_period_us; + u64 bw_quota_us; + u64 bw_burst_us; }; enum scx_cpu_preempt_reason { @@ -664,9 +669,31 @@ struct sched_ext_ops { * @cgrp: cgroup whose weight is being updated * @weight: new weight [1..10000] * - * Update @tg's weight to @weight. + * Update @cgrp's weight to @weight. */ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); + + /** + * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed + * @cgrp: cgroup whose bandwidth is being updated + * @period_us: bandwidth control period + * @quota_us: bandwidth control quota + * @burst_us: bandwidth control burst + * + * Update @cgrp's bandwidth control parameters. This is from the cpu.max + * cgroup interface. + * + * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled + * to. For example, if @period_us is 1_000_000 and @quota_us is + * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be + * interpreted in the same fashion and specifies how much @cgrp can + * burst temporarily. The specific control mechanism and thus the + * interpretation of @period_us and burstiness is upto to the BPF + * scheduler. + */ + void (*cgroup_set_bandwidth)(struct cgroup *cgrp, + u64 period_us, u64 quota_us, u64 burst_us); + #endif /* CONFIG_EXT_GROUP_SCHED */ /* @@ -884,7 +911,7 @@ enum scx_enq_flags { /* * The task being enqueued was previously enqueued on the current CPU's * %SCX_DSQ_LOCAL, but was removed from it in a call to the - * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was + * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was * invoked in a ->cpu_release() callback, and the task is again * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the * task will not be scheduled on the CPU until at least the next invocation @@ -1247,7 +1274,7 @@ static void scx_kf_disallow(u32 mask) * This allows kfuncs to safely operate on rq from any scx ops callback, * knowing which rq is already locked. */ -static DEFINE_PER_CPU(struct rq *, locked_rq); +DEFINE_PER_CPU(struct rq *, scx_locked_rq_state); static inline void update_locked_rq(struct rq *rq) { @@ -1258,16 +1285,7 @@ static inline void update_locked_rq(struct rq *rq) */ if (rq) lockdep_assert_rq_held(rq); - __this_cpu_write(locked_rq, rq); -} - -/* - * Return the rq currently locked from an scx callback, or NULL if no rq is - * locked. - */ -static inline struct rq *scx_locked_rq(void) -{ - return __this_cpu_read(locked_rq); + __this_cpu_write(scx_locked_rq_state, rq); } #define SCX_CALL_OP(sch, mask, op, rq, args...) 
\ @@ -1641,7 +1659,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) * scx_add_event - Increase an event counter for 'name' by 'cnt' * @sch: scx_sched to account events for * @name: an event name defined in struct scx_event_stats - * @cnt: the number of the event occured + * @cnt: the number of the event occurred * * This can be used when preemption is not disabled. */ @@ -1654,7 +1672,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) * __scx_add_event - Increase an event counter for 'name' by 'cnt' * @sch: scx_sched to account events for * @name: an event name defined in struct scx_event_stats - * @cnt: the number of the event occured + * @cnt: the number of the event occurred * * This should be used only when preemption is disabled. */ @@ -1705,11 +1723,6 @@ static bool scx_tryset_enable_state(enum scx_enable_state to, return atomic_try_cmpxchg(&scx_enable_state_var, &from_v, to); } -static bool scx_rq_bypassing(struct rq *rq) -{ - return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); -} - /** * wait_ops_state - Busy-wait the specified ops state to end * @p: target task @@ -1796,12 +1809,10 @@ static void run_deferred(struct rq *rq) process_ddsp_deferred_locals(rq); } -#ifdef CONFIG_SMP static void deferred_bal_cb_workfn(struct rq *rq) { run_deferred(rq); } -#endif static void deferred_irq_workfn(struct irq_work *irq_work) { @@ -1824,7 +1835,6 @@ static void schedule_deferred(struct rq *rq) { lockdep_assert_rq_held(rq); -#ifdef CONFIG_SMP /* * If in the middle of waking up a task, task_woken_scx() will be called * afterwards which will then run the deferred actions, no need to @@ -1842,7 +1852,7 @@ static void schedule_deferred(struct rq *rq) deferred_bal_cb_workfn); return; } -#endif + /* * No scheduler hooks available. Queue an irq work. They are executed on * IRQ re-enable which may take a bit longer than the scheduler hooks. @@ -2546,7 +2556,6 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, p->scx.dsq = dst_dsq; } -#ifdef CONFIG_SMP /** * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ * @p: task to move @@ -2713,11 +2722,6 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p, return false; } } -#else /* CONFIG_SMP */ -static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); } -static inline bool task_can_run_on_remote_rq(struct scx_sched *sch, struct task_struct *p, struct rq *rq, bool enforce) { return false; } -static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } -#endif /* CONFIG_SMP */ /** * move_task_between_dsqs() - Move a task from one DSQ to another @@ -2890,9 +2894,7 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq, { struct rq *src_rq = task_rq(p); struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); -#ifdef CONFIG_SMP struct rq *locked_rq = rq; -#endif /* * We're synchronized against dequeue through DISPATCHING. 
As @p can't @@ -2906,7 +2908,6 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq, return; } -#ifdef CONFIG_SMP if (src_rq != dst_rq && unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { dispatch_enqueue(sch, find_global_dsq(p), p, @@ -2966,9 +2967,6 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq, raw_spin_rq_unlock(locked_rq); raw_spin_rq_lock(rq); } -#else /* CONFIG_SMP */ - BUG(); /* control can not reach here on UP */ -#endif /* CONFIG_SMP */ } /** @@ -3292,10 +3290,8 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) static enum scx_cpu_preempt_reason preempt_reason_from_class(const struct sched_class *class) { -#ifdef CONFIG_SMP if (class == &stop_sched_class) return SCX_CPU_PREEMPT_STOP; -#endif if (class == &dl_sched_class) return SCX_CPU_PREEMPT_DL; if (class == &rt_sched_class) @@ -3308,14 +3304,12 @@ static void switch_class(struct rq *rq, struct task_struct *next) struct scx_sched *sch = scx_root; const struct sched_class *next_class = next->sched_class; -#ifdef CONFIG_SMP /* * Pairs with the smp_load_acquire() issued by a CPU in * kick_cpus_irq_workfn() who is waiting for this CPU to perform a * resched. */ smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); -#endif if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT)) return; @@ -3512,8 +3506,6 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, } #endif /* CONFIG_SCHED_CORE */ -#ifdef CONFIG_SMP - static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) { struct scx_sched *sch = scx_root; @@ -3643,7 +3635,6 @@ static void rq_offline_scx(struct rq *rq) rq->scx.flags &= ~SCX_RQ_ONLINE; } -#endif /* CONFIG_SMP */ static bool check_rq_for_timeouts(struct rq *rq) { @@ -4098,7 +4089,9 @@ static bool scx_cgroup_enabled; void scx_tg_init(struct task_group *tg) { - tg->scx_weight = CGROUP_WEIGHT_DFL; + tg->scx.weight = CGROUP_WEIGHT_DFL; + tg->scx.bw_period_us = default_bw_period_us(); + tg->scx.bw_quota_us = RUNTIME_INF; } int scx_tg_online(struct task_group *tg) @@ -4106,14 +4099,17 @@ int scx_tg_online(struct task_group *tg) struct scx_sched *sch = scx_root; int ret = 0; - WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); + WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED)); percpu_down_read(&scx_cgroup_rwsem); if (scx_cgroup_enabled) { if (SCX_HAS_OP(sch, cgroup_init)) { struct scx_cgroup_init_args args = - { .weight = tg->scx_weight }; + { .weight = tg->scx.weight, + .bw_period_us = tg->scx.bw_period_us, + .bw_quota_us = tg->scx.bw_quota_us, + .bw_burst_us = tg->scx.bw_burst_us }; ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL, tg->css.cgroup, &args); @@ -4121,9 +4117,9 @@ int scx_tg_online(struct task_group *tg) ret = ops_sanitize_err(sch, "cgroup_init", ret); } if (ret == 0) - tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; + tg->scx.flags |= SCX_TG_ONLINE | SCX_TG_INITED; } else { - tg->scx_flags |= SCX_TG_ONLINE; + tg->scx.flags |= SCX_TG_ONLINE; } percpu_up_read(&scx_cgroup_rwsem); @@ -4134,15 +4130,15 @@ void scx_tg_offline(struct task_group *tg) { struct scx_sched *sch = scx_root; - WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); + WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE)); percpu_down_read(&scx_cgroup_rwsem); if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) && - (tg->scx_flags & SCX_TG_INITED)) + (tg->scx.flags & SCX_TG_INITED)) SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, tg->css.cgroup); - tg->scx_flags &= 
~(SCX_TG_ONLINE | SCX_TG_INITED); + tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); percpu_up_read(&scx_cgroup_rwsem); } @@ -4251,11 +4247,11 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) percpu_down_read(&scx_cgroup_rwsem); if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && - tg->scx_weight != weight) + tg->scx.weight != weight) SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL, tg_cgrp(tg), weight); - tg->scx_weight = weight; + tg->scx.weight = weight; percpu_up_read(&scx_cgroup_rwsem); } @@ -4265,6 +4261,27 @@ void scx_group_set_idle(struct task_group *tg, bool idle) /* TODO: Implement ops->cgroup_set_idle() */ } +void scx_group_set_bandwidth(struct task_group *tg, + u64 period_us, u64 quota_us, u64 burst_us) +{ + struct scx_sched *sch = scx_root; + + percpu_down_read(&scx_cgroup_rwsem); + + if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) && + (tg->scx.bw_period_us != period_us || + tg->scx.bw_quota_us != quota_us || + tg->scx.bw_burst_us != burst_us)) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_bandwidth, NULL, + tg_cgrp(tg), period_us, quota_us, burst_us); + + tg->scx.bw_period_us = period_us; + tg->scx.bw_quota_us = quota_us; + tg->scx.bw_burst_us = burst_us; + + percpu_up_read(&scx_cgroup_rwsem); +} + static void scx_cgroup_lock(void) { percpu_down_write(&scx_cgroup_rwsem); @@ -4308,14 +4325,12 @@ DEFINE_SCHED_CLASS(ext) = { .put_prev_task = put_prev_task_scx, .set_next_task = set_next_task_scx, -#ifdef CONFIG_SMP .select_task_rq = select_task_rq_scx, .task_woken = task_woken_scx, .set_cpus_allowed = set_cpus_allowed_scx, .rq_online = rq_online_scx, .rq_offline = rq_offline_scx, -#endif .task_tick = task_tick_scx, @@ -4408,9 +4423,9 @@ static void scx_cgroup_exit(struct scx_sched *sch) css_for_each_descendant_post(css, &root_task_group.css) { struct task_group *tg = css_tg(css); - if (!(tg->scx_flags & SCX_TG_INITED)) + if (!(tg->scx.flags & SCX_TG_INITED)) continue; - tg->scx_flags &= ~SCX_TG_INITED; + tg->scx.flags &= ~SCX_TG_INITED; if (!sch->ops.cgroup_exit) continue; @@ -4442,14 +4457,19 @@ static int scx_cgroup_init(struct scx_sched *sch) rcu_read_lock(); css_for_each_descendant_pre(css, &root_task_group.css) { struct task_group *tg = css_tg(css); - struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; + struct scx_cgroup_init_args args = { + .weight = tg->scx.weight, + .bw_period_us = tg->scx.bw_period_us, + .bw_quota_us = tg->scx.bw_quota_us, + .bw_burst_us = tg->scx.bw_burst_us, + }; - if ((tg->scx_flags & + if ((tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) continue; if (!sch->ops.cgroup_init) { - tg->scx_flags |= SCX_TG_INITED; + tg->scx.flags |= SCX_TG_INITED; continue; } @@ -4464,7 +4484,7 @@ static int scx_cgroup_init(struct scx_sched *sch) scx_error(sch, "ops.cgroup_init() failed (%d)", ret); return ret; } - tg->scx_flags |= SCX_TG_INITED; + tg->scx.flags |= SCX_TG_INITED; rcu_read_lock(); css_put(css); @@ -4657,6 +4677,41 @@ bool scx_allow_ttwu_queue(const struct task_struct *p) } /** + * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler + * + * While there are various reasons why RCU CPU stalls can occur on a system + * that may not be caused by the current BPF scheduler, try kicking out the + * current scheduler in an attempt to recover the system to a good state before + * issuing panics. 
+ */ +bool scx_rcu_cpu_stall(void) +{ + struct scx_sched *sch; + + rcu_read_lock(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) { + rcu_read_unlock(); + return false; + } + + switch (scx_enable_state()) { + case SCX_ENABLING: + case SCX_ENABLED: + break; + default: + rcu_read_unlock(); + return false; + } + + scx_error(sch, "RCU CPU stall detected!"); + rcu_read_unlock(); + + return true; +} + +/** * scx_softlockup - sched_ext softlockup handler * @dur_s: number of seconds of CPU stuck due to soft lockup * @@ -5944,6 +5999,7 @@ static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} +static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {} #endif static void sched_ext_ops__cpu_online(s32 cpu) {} static void sched_ext_ops__cpu_offline(s32 cpu) {} @@ -5981,6 +6037,7 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = { .cgroup_move = sched_ext_ops__cgroup_move, .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move, .cgroup_set_weight = sched_ext_ops__cgroup_set_weight, + .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth, #endif .cpu_online = sched_ext_ops__cpu_online, .cpu_offline = sched_ext_ops__cpu_offline, @@ -6338,7 +6395,8 @@ __bpf_kfunc_start_defs(); * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id * and this function can be called upto ops.dispatch_max_batch times to insert * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the - * remaining slots. scx_bpf_consume() flushes the batch and resets the counter. + * remaining slots. scx_bpf_dsq_move_to_local() flushes the batch and resets the + * counter. * * This function doesn't have any locking restrictions and may be called under * BPF locks (in the future when BPF introduces more flexible locking). 
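For illustration only, not from this diff: a minimal BPF-side ops.enqueue() using the kfunc documented above. The scx/common.bpf.h header, the BPF_STRUCT_OPS()/SCX_OPS_DEFINE() helpers and the SCX_DSQ_GLOBAL/SCX_SLICE_DFL constants are assumed from the in-tree sched_ext BPF tooling (tools/sched_ext), and "minimal" is a made-up scheduler name:

#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

/* Hand every runnable task to the global DSQ with the default slice; the
 * core picks from the local DSQ first and falls back to the global DSQ
 * when the local one runs dry.
 */
void BPF_STRUCT_OPS(minimal_enqueue, struct task_struct *p, u64 enq_flags)
{
	scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}

SCX_OPS_DEFINE(minimal_ops,
	       .enqueue	= (void *)minimal_enqueue,
	       .name	= "minimal");

A scheduler that batches insertions in ops.dispatch() would instead rely on scx_bpf_dispatch_nr_slots() and, as renamed by this series, scx_bpf_dsq_move_to_local() to flush the batch.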
@@ -6362,14 +6420,6 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice scx_dsq_insert_commit(p, dsq_id, enq_flags); } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, - u64 enq_flags) -{ - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()"); - scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags); -} - /** * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ * @p: task_struct to insert @@ -6407,21 +6457,11 @@ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, - u64 slice, u64 vtime, u64 enq_flags) -{ - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()"); - scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags); -} - __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { @@ -6594,13 +6634,6 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) } } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc bool scx_bpf_consume(u64 dsq_id) -{ - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()"); - return scx_bpf_dsq_move_to_local(dsq_id); -} - /** * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs * @it__iter: DSQ iterator in progress @@ -6619,14 +6652,6 @@ __bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE; } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice( - struct bpf_iter_scx_dsq *it__iter, u64 slice) -{ - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()"); - scx_bpf_dsq_move_set_slice(it__iter, slice); -} - /** * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs * @it__iter: DSQ iterator in progress @@ -6646,14 +6671,6 @@ __bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME; } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime( - struct bpf_iter_scx_dsq *it__iter, u64 vtime) -{ - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()"); - scx_bpf_dsq_move_set_vtime(it__iter, vtime); -} - /** * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ * @it__iter: DSQ iterator in progress @@ -6686,15 +6703,6 @@ __bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, p, dsq_id, enq_flags); } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, - struct task_struct *p, u64 dsq_id, - u64 enq_flags) -{ - printk_deferred_once(KERN_WARNING "sched_ext: 
scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()"); - return scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags); -} - /** * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ * @it__iter: DSQ iterator in progress @@ -6720,30 +6728,16 @@ __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, - struct task_struct *p, u64 dsq_id, - u64 enq_flags) -{ - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_vtime() renamed to scx_bpf_dsq_move_vtime()"); - return scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags); -} - __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_dispatch) BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local) -BTF_ID_FLAGS(func, scx_bpf_consume) BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice) BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime) BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice) -BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime) -BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_dispatch) static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { @@ -6874,10 +6868,6 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice) BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime) BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice) -BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime) -BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_unlocked) static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index a75835c23f15..292bb41a242e 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -13,8 +13,24 @@ static inline bool scx_kf_allowed_if_unlocked(void) return !current->scx.kf_mask; } +static inline bool scx_rq_bypassing(struct rq *rq) +{ + return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); +} + DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); +DECLARE_PER_CPU(struct rq *, scx_locked_rq_state); + +/* + * Return the rq currently locked from an scx callback, or NULL if no rq is + * locked. 
+ */ +static inline struct rq *scx_locked_rq(void) +{ + return __this_cpu_read(scx_locked_rq_state); +} + void scx_tick(struct rq *rq); void init_scx_entity(struct sched_ext_entity *scx); void scx_pre_fork(struct task_struct *p); @@ -65,7 +81,7 @@ static inline void init_sched_ext_class(void) {} #endif /* CONFIG_SCHED_CLASS_EXT */ -#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP) +#ifdef CONFIG_SCHED_CLASS_EXT void __scx_update_idle(struct rq *rq, bool idle, bool do_notify); static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) @@ -88,6 +104,7 @@ void scx_cgroup_finish_attach(void); void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); void scx_group_set_idle(struct task_group *tg, bool idle); +void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us); #else /* CONFIG_EXT_GROUP_SCHED */ static inline void scx_tg_init(struct task_group *tg) {} static inline int scx_tg_online(struct task_group *tg) { return 0; } @@ -98,5 +115,6 @@ static inline void scx_cgroup_finish_attach(void) {} static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} static inline void scx_group_set_idle(struct task_group *tg, bool idle) {} +static inline void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us) {} #endif /* CONFIG_EXT_GROUP_SCHED */ #endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 001fb88a8481..7174e1c1a392 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -17,7 +17,6 @@ static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); /* Enable/disable per-node idle cpumasks */ static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node); -#ifdef CONFIG_SMP /* Enable/disable LLC aware optimizations */ static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); @@ -75,7 +74,7 @@ static int scx_cpu_node_if_enabled(int cpu) return cpu_to_node(cpu); } -bool scx_idle_test_and_clear_cpu(int cpu) +static bool scx_idle_test_and_clear_cpu(int cpu) { int node = scx_cpu_node_if_enabled(cpu); struct cpumask *idle_cpus = idle_cpumask(node)->cpu; @@ -198,7 +197,7 @@ pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u6 /* * Find an idle CPU in the system, starting from @node. 
*/ -s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) +static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) { s32 cpu; @@ -250,7 +249,7 @@ static struct cpumask *llc_span(s32 cpu) sd = rcu_dereference(per_cpu(sd_llc, cpu)); if (!sd) - return 0; + return NULL; return sched_domain_span(sd); } @@ -794,7 +793,6 @@ static void reset_idle_masks(struct sched_ext_ops *ops) cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask); } } -#endif /* CONFIG_SMP */ void scx_idle_enable(struct sched_ext_ops *ops) { @@ -808,9 +806,7 @@ void scx_idle_enable(struct sched_ext_ops *ops) else static_branch_disable_cpuslocked(&scx_builtin_idle_per_node); -#ifdef CONFIG_SMP reset_idle_masks(ops); -#endif } void scx_idle_disable(void) @@ -860,8 +856,8 @@ static bool check_builtin_idle_enabled(void) return false; } -s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags, - const struct cpumask *allowed, u64 flags) +static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + const struct cpumask *allowed, u64 flags) { struct rq *rq; struct rq_flags rf; @@ -896,7 +892,6 @@ s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags, if (!rq) lockdep_assert_held(&p->pi_lock); -#ifdef CONFIG_SMP /* * This may also be called from ops.enqueue(), so we need to handle * per-CPU tasks as well. For these tasks, we can skip all idle CPU @@ -913,9 +908,7 @@ s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags, cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, allowed ?: p->cpus_ptr, flags); } -#else - cpu = -EBUSY; -#endif + if (scx_kf_allowed_if_unlocked()) task_rq_unlock(rq, p, &rf); @@ -929,14 +922,10 @@ s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags, */ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu) { -#ifdef CONFIG_NUMA if (!kf_cpu_valid(cpu, NULL)) return NUMA_NO_NODE; return cpu_to_node(cpu); -#else - return 0; -#endif } /** @@ -1010,11 +999,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) if (node < 0) return cpu_none_mask; -#ifdef CONFIG_SMP return idle_cpumask(node)->cpu; -#else - return cpu_none_mask; -#endif } /** @@ -1034,11 +1019,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) if (!check_builtin_idle_enabled()) return cpu_none_mask; -#ifdef CONFIG_SMP return idle_cpumask(NUMA_NO_NODE)->cpu; -#else - return cpu_none_mask; -#endif } /** @@ -1057,14 +1038,10 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) if (node < 0) return cpu_none_mask; -#ifdef CONFIG_SMP if (sched_smt_active()) return idle_cpumask(node)->smt; else return idle_cpumask(node)->cpu; -#else - return cpu_none_mask; -#endif } /** @@ -1085,14 +1062,10 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) if (!check_builtin_idle_enabled()) return cpu_none_mask; -#ifdef CONFIG_SMP if (sched_smt_active()) return idle_cpumask(NUMA_NO_NODE)->smt; else return idle_cpumask(NUMA_NO_NODE)->cpu; -#else - return cpu_none_mask; -#endif } /** @@ -1125,10 +1098,10 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) if (!check_builtin_idle_enabled()) return false; - if (kf_cpu_valid(cpu, NULL)) - return scx_idle_test_and_clear_cpu(cpu); - else + if (!kf_cpu_valid(cpu, NULL)) return false; + + return scx_idle_test_and_clear_cpu(cpu); } /** diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h index 37be78a7502b..fa583f141f35 100644 --- a/kernel/sched/ext_idle.h +++ 
b/kernel/sched/ext_idle.h @@ -12,20 +12,8 @@ struct sched_ext_ops; -#ifdef CONFIG_SMP void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops); void scx_idle_init_masks(void); -bool scx_idle_test_and_clear_cpu(int cpu); -s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags); -#else /* !CONFIG_SMP */ -static inline void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) {} -static inline void scx_idle_init_masks(void) {} -static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; } -static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) -{ - return -EBUSY; -} -#endif /* CONFIG_SMP */ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, const struct cpumask *cpus_allowed, u64 flags); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 2024c1d36402..59fdb7ebbf22 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -176,7 +176,7 @@ struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; -static DEFINE_PER_CPU(seqcount_t, psi_seq); +static DEFINE_PER_CPU(seqcount_t, psi_seq) = SEQCNT_ZERO(psi_seq); static inline void psi_write_begin(int cpu) { @@ -204,11 +204,7 @@ static void poll_timer_fn(struct timer_list *t); static void group_init(struct psi_group *group) { - int cpu; - group->enabled = true; - for_each_possible_cpu(cpu) - seqcount_init(per_cpu_ptr(&psi_seq, cpu)); group->avg_last_update = sched_clock(); group->avg_next_update = group->avg_last_update + psi_period; mutex_init(&group->avgs_lock); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d3f33d10c58c..be9745d104f7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -403,7 +403,7 @@ static inline bool dl_server_active(struct sched_dl_entity *dl_se) extern struct list_head task_groups; -#ifdef CONFIG_CFS_BANDWIDTH +#ifdef CONFIG_GROUP_SCHED_BANDWIDTH extern const u64 max_bw_quota_period_us; /* @@ -414,7 +414,7 @@ static inline u64 default_bw_period_us(void) { return 100000ULL; } -#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */ struct cfs_bandwidth { #ifdef CONFIG_CFS_BANDWIDTH @@ -472,10 +472,7 @@ struct task_group { struct rt_bandwidth rt_bandwidth; #endif -#ifdef CONFIG_EXT_GROUP_SCHED - u32 scx_flags; /* SCX_TG_* */ - u32 scx_weight; -#endif + struct scx_task_group scx; struct rcu_head rcu; struct list_head list; diff --git a/kernel/sys.c b/kernel/sys.c index 18a037cc6f61..1e28b40053ce 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2372,54 +2372,14 @@ int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long st #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) -#ifdef CONFIG_ANON_VMA_NAME - -#define ANON_VMA_NAME_MAX_LEN 80 -#define ANON_VMA_NAME_INVALID_CHARS "\\`$[]" - -static inline bool is_valid_name_char(char ch) -{ - /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */ - return ch > 0x1f && ch < 0x7f && - !strchr(ANON_VMA_NAME_INVALID_CHARS, ch); -} - static int prctl_set_vma(unsigned long opt, unsigned long addr, unsigned long size, unsigned long arg) { - struct mm_struct *mm = current->mm; - const char __user *uname; - struct anon_vma_name *anon_name = NULL; int error; switch (opt) { case PR_SET_VMA_ANON_NAME: - uname = (const char __user *)arg; - if (uname) { - char *name, *pch; - - name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); - if (IS_ERR(name)) - return PTR_ERR(name); - - for (pch = name; *pch != '\0'; pch++) { - if (!is_valid_name_char(*pch)) { - kfree(name); - return -EINVAL; - 
} - } - /* anon_vma has its own copy */ - anon_name = anon_vma_name_alloc(name); - kfree(name); - if (!anon_name) - return -ENOMEM; - - } - - mmap_write_lock(mm); - error = madvise_set_anon_name(mm, addr, size, anon_name); - mmap_write_unlock(mm); - anon_vma_name_put(anon_name); + error = set_anon_vma_name(addr, size, (const char __user *)arg); break; default: error = -EINVAL; @@ -2428,14 +2388,6 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr, return error; } -#else /* CONFIG_ANON_VMA_NAME */ -static int prctl_set_vma(unsigned long opt, unsigned long start, - unsigned long size, unsigned long arg) -{ - return -EINVAL; -} -#endif /* CONFIG_ANON_VMA_NAME */ - static inline unsigned long get_current_mdwe(void) { unsigned long ret = 0; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e400fe150f9d..0aef0e349e49 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -340,10 +340,7 @@ static void clocksource_verify_choose_cpus(void) * CPUs that are currently online. */ for (i = 1; i < n; i++) { - cpu = get_random_u32_below(nr_cpu_ids); - cpu = cpumask_next(cpu - 1, cpu_online_mask); - if (cpu >= nr_cpu_ids) - cpu = cpumask_first(cpu_online_mask); + cpu = cpumask_random(cpu_online_mask); if (!WARN_ON_ONCE(cpu >= nr_cpu_ids)) cpumask_set_cpu(cpu, &cpus_chosen); } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f135d04e1860..d2c79da81e4f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -200,6 +200,19 @@ menuconfig FTRACE if FTRACE +config TRACEFS_AUTOMOUNT_DEPRECATED + bool "Automount tracefs on debugfs [DEPRECATED]" + depends on TRACING + default y + help + The tracing interface was moved from /sys/kernel/debug/tracing + to /sys/kernel/tracing in 2015, but the tracing file system + was still automounted in /sys/kernel/debug for backward + compatibility with tooling. + + The new interface has been around for more than 10 years and + the old debug mount will soon be removed. + config BOOTTIME_TRACING bool "Boot-time Tracing support" depends on TRACING @@ -780,6 +793,20 @@ config UPROBE_EVENTS This option is required if you plan to use perf-probe subcommand of perf tools on user space applications. +config EPROBE_EVENTS + bool "Enable event-based dynamic events" + depends on TRACING + depends on HAVE_REGS_AND_STACK_ACCESS_API + select PROBE_EVENTS + select DYNAMIC_EVENTS + default y + help + Eprobes are dynamic events that can be placed on other existing + events. It can be used to limit what fields are recorded in + an event or even dereference a field of an event. It can + convert the type of an event field. For example, turn an + address into a string. 
+ config BPF_EVENTS depends on BPF_SYSCALL depends on (KPROBE_EVENTS || UPROBE_EVENTS) && PERF_EVENTS diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 057cd975d014..dcb4e02afc5f 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -82,7 +82,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o -obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o +obj-$(CONFIG_EPROBE_EVENTS) += trace_eprobe.o obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 47168d2afbf1..6941145b5058 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -415,9 +415,10 @@ static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { struct blk_trace *bt = filp->private_data; + size_t dropped = relay_stats(bt->rchan, RELAY_STATS_BUF_FULL); char buf[16]; - snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); + snprintf(buf, sizeof(buf), "%zu\n", dropped); return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); } @@ -456,23 +457,6 @@ static const struct file_operations blk_msg_fops = { .llseek = noop_llseek, }; -/* - * Keep track of how many times we encountered a full subbuffer, to aid - * the user space app in telling how many lost events there were. - */ -static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) -{ - struct blk_trace *bt; - - if (!relay_buf_full(buf)) - return 1; - - bt = buf->chan->private_data; - atomic_inc(&bt->dropped); - return 0; -} - static int blk_remove_buf_file_callback(struct dentry *dentry) { debugfs_remove(dentry); @@ -491,7 +475,6 @@ static struct dentry *blk_create_buf_file_callback(const char *filename, } static const struct rchan_callbacks blk_relay_callbacks = { - .subbuf_start = blk_subbuf_start_callback, .create_buf_file = blk_create_buf_file_callback, .remove_buf_file = blk_remove_buf_file_callback, }; @@ -580,7 +563,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, } bt->dev = dev; - atomic_set(&bt->dropped, 0); INIT_LIST_HEAD(&bt->running_list); ret = -EIO; diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index 314ffc143039..acb0c971a408 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -117,12 +117,15 @@ static int preemptirq_delay_run(void *data) { int i; int s = MIN(burst_size, NR_TEST_FUNCS); - struct cpumask cpu_mask; + cpumask_var_t cpu_mask; + + if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; if (cpu_affinity > -1) { - cpumask_clear(&cpu_mask); - cpumask_set_cpu(cpu_affinity, &cpu_mask); - if (set_cpus_allowed_ptr(current, &cpu_mask)) + cpumask_clear(cpu_mask); + cpumask_set_cpu(cpu_affinity, cpu_mask); + if (set_cpus_allowed_ptr(current, cpu_mask)) pr_err("cpu_affinity:%d, failed\n", cpu_affinity); } @@ -139,6 +142,8 @@ static int preemptirq_delay_run(void *data) __set_current_state(TASK_RUNNING); + free_cpumask_var(cpu_mask); + return 0; } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 5176e0270f07..bb71a0dc9d69 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4812,26 +4812,26 @@ int ring_buffer_write(struct trace_buffer *buffer, int ret = 
-EBUSY; int cpu; - preempt_disable_notrace(); + guard(preempt_notrace)(); if (atomic_read(&buffer->record_disabled)) - goto out; + return -EBUSY; cpu = raw_smp_processor_id(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return -EBUSY; cpu_buffer = buffer->buffers[cpu]; if (atomic_read(&cpu_buffer->record_disabled)) - goto out; + return -EBUSY; if (length > buffer->max_data_size) - goto out; + return -EBUSY; if (unlikely(trace_recursive_lock(cpu_buffer))) - goto out; + return -EBUSY; event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) @@ -4849,10 +4849,6 @@ int ring_buffer_write(struct trace_buffer *buffer, out_unlock: trace_recursive_unlock(cpu_buffer); - - out: - preempt_enable_notrace(); - return ret; } EXPORT_SYMBOL_GPL(ring_buffer_write); diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index bd7d56dbf6c2..1482e91c39f4 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -674,8 +674,6 @@ static bool __read_mostly monitoring_on; */ bool rv_monitoring_on(void) { - /* Ensures that concurrent monitors read consistent monitoring_on */ - smp_rmb(); return READ_ONCE(monitoring_on); } @@ -695,8 +693,6 @@ static ssize_t monitoring_on_read_data(struct file *filp, char __user *user_buf, static void turn_monitoring_off(void) { WRITE_ONCE(monitoring_on, false); - /* Ensures that concurrent monitors read consistent monitoring_on */ - smp_wmb(); } static void reset_all_monitors(void) @@ -712,8 +708,6 @@ static void reset_all_monitors(void) static void turn_monitoring_on(void) { WRITE_ONCE(monitoring_on, true); - /* Ensures that concurrent monitors read consistent monitoring_on */ - smp_wmb(); } static void turn_monitoring_on_with_reset(void) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7996f26c3f46..4283ed4e8f59 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -432,15 +432,13 @@ static void ftrace_exports(struct ring_buffer_event *event, int flag) { struct trace_export *export; - preempt_disable_notrace(); + guard(preempt_notrace)(); export = rcu_dereference_raw_check(ftrace_exports_list); while (export) { trace_process_export(export, event, flag); export = rcu_dereference_raw_check(export->next); } - - preempt_enable_notrace(); } static inline void @@ -497,27 +495,18 @@ int register_ftrace_export(struct trace_export *export) if (WARN_ON_ONCE(!export->write)) return -1; - mutex_lock(&ftrace_export_lock); + guard(mutex)(&ftrace_export_lock); add_ftrace_export(&ftrace_exports_list, export); - mutex_unlock(&ftrace_export_lock); - return 0; } EXPORT_SYMBOL_GPL(register_ftrace_export); int unregister_ftrace_export(struct trace_export *export) { - int ret; - - mutex_lock(&ftrace_export_lock); - - ret = rm_ftrace_export(&ftrace_exports_list, export); - - mutex_unlock(&ftrace_export_lock); - - return ret; + guard(mutex)(&ftrace_export_lock); + return rm_ftrace_export(&ftrace_exports_list, export); } EXPORT_SYMBOL_GPL(unregister_ftrace_export); @@ -640,9 +629,8 @@ void trace_array_put(struct trace_array *this_tr) if (!this_tr) return; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); __trace_array_put(this_tr); - mutex_unlock(&trace_types_lock); } EXPORT_SYMBOL_GPL(trace_array_put); @@ -936,7 +924,6 @@ int tracing_is_enabled(void) * return the mirror variable of the state of the ring buffer. * It's a little racy, but we don't really care. */ - smp_rmb(); return !global_trace.buffer_disabled; } @@ -1107,8 +1094,6 @@ void tracer_tracing_on(struct trace_array *tr) * important to be fast than accurate. 
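The trace.c and ring_buffer.c hunks above repeatedly replace explicit mutex_lock()/mutex_unlock() and preempt_disable_notrace()/preempt_enable_notrace() pairs with the scope-based guard() and scoped_guard() helpers from <linux/cleanup.h>, which release the resource automatically when the scope ends and so let the goto-out labels be dropped. A minimal sketch of the idiom; my_lock, my_count and the function names are invented for illustration and are not part of the patch:

#include <linux/cleanup.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(my_lock);
static int my_count;

/* Before: every exit path must unlock (or jump to a shared out: label). */
static int add_if_positive_old(int v)
{
	int ret = -EINVAL;

	mutex_lock(&my_lock);
	if (v > 0) {
		my_count += v;
		ret = 0;
	}
	mutex_unlock(&my_lock);
	return ret;
}

/* After: guard() unlocks automatically whenever the function returns. */
static int add_if_positive(int v)
{
	guard(mutex)(&my_lock);

	if (v <= 0)
		return -EINVAL;

	my_count += v;
	return 0;
}

/* scoped_guard() limits the critical section to an explicit block. */
static int read_count(void)
{
	int v;

	scoped_guard(mutex, &my_lock)
		v = my_count;

	return v;
}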
*/ tr->buffer_disabled = 0; - /* Make the flag seen by readers */ - smp_wmb(); } /** @@ -1163,13 +1148,11 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip, trace_ctx = tracing_gen_ctx(); buffer = tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); + guard(ring_buffer_nest)(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, trace_ctx); - if (!event) { - size = 0; - goto out; - } + if (!event) + return 0; entry = ring_buffer_event_data(event); entry->ip = ip; @@ -1185,8 +1168,6 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip, __buffer_unlock_commit(buffer, event); ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); - out: - ring_buffer_nest_end(buffer); return size; } EXPORT_SYMBOL_GPL(__trace_array_puts); @@ -1216,7 +1197,6 @@ int __trace_bputs(unsigned long ip, const char *str) struct bputs_entry *entry; unsigned int trace_ctx; int size = sizeof(struct bputs_entry); - int ret = 0; if (!printk_binsafe(tr)) return __trace_puts(ip, str, strlen(str)); @@ -1230,11 +1210,11 @@ int __trace_bputs(unsigned long ip, const char *str) trace_ctx = tracing_gen_ctx(); buffer = tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); + guard(ring_buffer_nest)(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, trace_ctx); if (!event) - goto out; + return 0; entry = ring_buffer_event_data(event); entry->ip = ip; @@ -1243,10 +1223,7 @@ int __trace_bputs(unsigned long ip, const char *str) __buffer_unlock_commit(buffer, event); ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); - ret = 1; - out: - ring_buffer_nest_end(buffer); - return ret; + return 1; } EXPORT_SYMBOL_GPL(__trace_bputs); @@ -1435,13 +1412,8 @@ static int tracing_arm_snapshot_locked(struct trace_array *tr) int tracing_arm_snapshot(struct trace_array *tr) { - int ret; - - mutex_lock(&trace_types_lock); - ret = tracing_arm_snapshot_locked(tr); - mutex_unlock(&trace_types_lock); - - return ret; + guard(mutex)(&trace_types_lock); + return tracing_arm_snapshot_locked(tr); } void tracing_disarm_snapshot(struct trace_array *tr) @@ -1640,8 +1612,6 @@ void tracer_tracing_off(struct trace_array *tr) * important to be fast than accurate. */ tr->buffer_disabled = 1; - /* Make the flag seen by readers */ - smp_wmb(); } /** @@ -1846,7 +1816,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, ret = get_user(ch, ubuf++); if (ret) - goto out; + return ret; read++; cnt--; @@ -1860,7 +1830,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, while (cnt && isspace(ch)) { ret = get_user(ch, ubuf++); if (ret) - goto out; + return ret; read++; cnt--; } @@ -1870,8 +1840,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, /* only spaces were written */ if (isspace(ch) || !ch) { *ppos += read; - ret = read; - goto out; + return read; } } @@ -1879,13 +1848,12 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, while (cnt && !isspace(ch) && ch) { if (parser->idx < parser->size - 1) parser->buffer[parser->idx++] = ch; - else { - ret = -EINVAL; - goto out; - } + else + return -EINVAL; + ret = get_user(ch, ubuf++); if (ret) - goto out; + return ret; read++; cnt--; } @@ -1900,15 +1868,11 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, /* Make sure the parsed string always terminates with '\0'. 
*/ parser->buffer[parser->idx] = 0; } else { - ret = -EINVAL; - goto out; + return -EINVAL; } *ppos += read; - ret = read; - -out: - return ret; + return read; } /* TODO add a seq_buf_to_buffer() */ @@ -2410,10 +2374,10 @@ int __init register_tracer(struct tracer *type) mutex_unlock(&trace_types_lock); if (ret || !default_bootup_tracer) - goto out_unlock; + return ret; if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE)) - goto out_unlock; + return 0; printk(KERN_INFO "Starting tracer '%s'\n", type->name); /* Do we want this tracer to start on bootup? */ @@ -2425,8 +2389,7 @@ int __init register_tracer(struct tracer *type) /* disable other selftests, since this will break it. */ disable_tracing_selftest("running a tracer"); - out_unlock: - return ret; + return 0; } static void tracing_reset_cpu(struct array_buffer *buf, int cpu) @@ -2503,9 +2466,8 @@ void tracing_reset_all_online_cpus_unlocked(void) void tracing_reset_all_online_cpus(void) { - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); tracing_reset_all_online_cpus_unlocked(); - mutex_unlock(&trace_types_lock); } int is_tracing_stopped(void) @@ -2516,18 +2478,17 @@ int is_tracing_stopped(void) static void tracing_start_tr(struct trace_array *tr) { struct trace_buffer *buffer; - unsigned long flags; if (tracing_disabled) return; - raw_spin_lock_irqsave(&tr->start_lock, flags); + guard(raw_spinlock_irqsave)(&tr->start_lock); if (--tr->stop_count) { if (WARN_ON_ONCE(tr->stop_count < 0)) { /* Someone screwed up their debugging */ tr->stop_count = 0; } - goto out; + return; } /* Prevent the buffers from switching */ @@ -2544,9 +2505,6 @@ static void tracing_start_tr(struct trace_array *tr) #endif arch_spin_unlock(&tr->max_lock); - - out: - raw_spin_unlock_irqrestore(&tr->start_lock, flags); } /** @@ -2564,11 +2522,10 @@ void tracing_start(void) static void tracing_stop_tr(struct trace_array *tr) { struct trace_buffer *buffer; - unsigned long flags; - raw_spin_lock_irqsave(&tr->start_lock, flags); + guard(raw_spinlock_irqsave)(&tr->start_lock); if (tr->stop_count++) - goto out; + return; /* Prevent the buffers from switching */ arch_spin_lock(&tr->max_lock); @@ -2584,9 +2541,6 @@ static void tracing_stop_tr(struct trace_array *tr) #endif arch_spin_unlock(&tr->max_lock); - - out: - raw_spin_unlock_irqrestore(&tr->start_lock, flags); } /** @@ -2699,19 +2653,17 @@ void trace_buffered_event_enable(void) per_cpu(trace_buffered_event, cpu) = event; - preempt_disable(); - if (cpu == smp_processor_id() && - __this_cpu_read(trace_buffered_event) != - per_cpu(trace_buffered_event, cpu)) - WARN_ON_ONCE(1); - preempt_enable(); + scoped_guard(preempt,) { + if (cpu == smp_processor_id() && + __this_cpu_read(trace_buffered_event) != + per_cpu(trace_buffered_event, cpu)) + WARN_ON_ONCE(1); + } } } static void enable_trace_buffered_event(void *data) { - /* Probably not needed, but do it anyway */ - smp_rmb(); this_cpu_dec(trace_buffered_event_cnt); } @@ -3051,7 +3003,7 @@ static void __ftrace_trace_stack(struct trace_array *tr, skip++; #endif - preempt_disable_notrace(); + guard(preempt_notrace)(); stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; @@ -3109,8 +3061,6 @@ static void __ftrace_trace_stack(struct trace_array *tr, /* Again, don't let gcc optimize things here */ barrier(); __this_cpu_dec(ftrace_stack_reserve); - preempt_enable_notrace(); - } static inline void ftrace_trace_stack(struct trace_array *tr, @@ -3193,9 +3143,9 @@ ftrace_trace_userstack(struct trace_array *tr, * prevent recursion, since the user 
stack tracing may * trigger other kernel events. */ - preempt_disable(); + guard(preempt)(); if (__this_cpu_read(user_stack_count)) - goto out; + return; __this_cpu_inc(user_stack_count); @@ -3213,8 +3163,6 @@ ftrace_trace_userstack(struct trace_array *tr, out_drop_count: __this_cpu_dec(user_stack_count); - out: - preempt_enable(); } #else /* CONFIG_USER_STACKTRACE_SUPPORT */ static void ftrace_trace_userstack(struct trace_array *tr, @@ -3396,7 +3344,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) pause_graph_tracing(); trace_ctx = tracing_gen_ctx(); - preempt_disable_notrace(); + guard(preempt_notrace)(); tbuffer = get_trace_buf(); if (!tbuffer) { @@ -3411,26 +3359,23 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) size = sizeof(*entry) + sizeof(u32) * len; buffer = tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); - event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, - trace_ctx); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->ip = ip; - entry->fmt = fmt; - - memcpy(entry->buf, tbuffer, sizeof(u32) * len); - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); + scoped_guard(ring_buffer_nest, buffer) { + event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, + trace_ctx); + if (!event) + goto out_put; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->fmt = fmt; -out: - ring_buffer_nest_end(buffer); + memcpy(entry->buf, tbuffer, sizeof(u32) * len); + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); + } out_put: put_trace_buf(); out_nobuffer: - preempt_enable_notrace(); unpause_graph_tracing(); return len; @@ -3454,7 +3399,7 @@ int __trace_array_vprintk(struct trace_buffer *buffer, pause_graph_tracing(); trace_ctx = tracing_gen_ctx(); - preempt_disable_notrace(); + guard(preempt_notrace)(); tbuffer = get_trace_buf(); @@ -3466,24 +3411,22 @@ int __trace_array_vprintk(struct trace_buffer *buffer, len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); size = sizeof(*entry) + len + 1; - ring_buffer_nest_start(buffer); - event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, - trace_ctx); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->ip = ip; - - memcpy(&entry->buf, tbuffer, len + 1); - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); + scoped_guard(ring_buffer_nest, buffer) { + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + trace_ctx); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->ip = ip; + memcpy(&entry->buf, tbuffer, len + 1); + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); + } out: - ring_buffer_nest_end(buffer); put_trace_buf(); out_nobuffer: - preempt_enable_notrace(); unpause_graph_tracing(); return len; @@ -4807,20 +4750,16 @@ int tracing_open_file_tr(struct inode *inode, struct file *filp) if (ret) return ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); /* Fail if the file is marked for removal */ if (file->flags & EVENT_FILE_FL_FREED) { trace_array_put(file->tr); - ret = -ENODEV; + return -ENODEV; } else { event_file_get(file); } - mutex_unlock(&event_mutex); - if (ret) - return ret; - filp->private_data = inode->i_private; return 0; @@ -5097,7 +5036,7 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct trace_array *tr = 
file_inode(filp)->i_private; - char *mask_str; + char *mask_str __free(kfree) = NULL; int len; len = snprintf(NULL, 0, "%*pb\n", @@ -5108,16 +5047,10 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf, len = snprintf(mask_str, len, "%*pb\n", cpumask_pr_args(tr->tracing_cpumask)); - if (len >= count) { - count = -EINVAL; - goto out_err; - } - count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); - -out_err: - kfree(mask_str); + if (len >= count) + return -EINVAL; - return count; + return simple_read_from_buffer(ubuf, count, ppos, mask_str, len); } int tracing_set_cpumask(struct trace_array *tr, @@ -5931,17 +5864,27 @@ static inline void trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, int len) { } #endif /* !CONFIG_TRACE_EVAL_MAP_FILE */ -static void trace_insert_eval_map(struct module *mod, - struct trace_eval_map **start, int len) +static void +trace_event_update_with_eval_map(struct module *mod, + struct trace_eval_map **start, + int len) { struct trace_eval_map **map; - if (len <= 0) - return; + /* Always run sanitizer only if btf_type_tag attr exists. */ + if (len <= 0) { + if (!(IS_ENABLED(CONFIG_DEBUG_INFO_BTF) && + IS_ENABLED(CONFIG_PAHOLE_HAS_BTF_TAG) && + __has_attribute(btf_type_tag))) + return; + } map = start; - trace_event_eval_update(map, len); + trace_event_update_all(map, len); + + if (len <= 0) + return; trace_insert_eval_map_file(mod, start, len); } @@ -5954,9 +5897,9 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, char buf[MAX_TRACER_SIZE+2]; int r; - mutex_lock(&trace_types_lock); - r = sprintf(buf, "%s\n", tr->current_trace->name); - mutex_unlock(&trace_types_lock); + scoped_guard(mutex, &trace_types_lock) { + r = sprintf(buf, "%s\n", tr->current_trace->name); + } return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -6258,15 +6201,13 @@ int tracing_update_buffers(struct trace_array *tr) { int ret = 0; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); update_last_data(tr); if (!tr->ring_buffer_expanded) ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); - mutex_unlock(&trace_types_lock); - return ret; } @@ -6297,7 +6238,7 @@ static bool tracer_options_updated; static void add_tracer_options(struct trace_array *tr, struct tracer *t) { /* Only enable if the directory has been created already. 
*/ - if (!tr->dir) + if (!tr->dir && !(tr->flags & TRACE_ARRAY_FL_GLOBAL)) return; /* Only create trace option files after update_tracer_options finish */ @@ -6563,7 +6504,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (ret) return ret; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); cpu = tracing_get_cpu(inode); ret = open_pipe_on_cpu(tr, cpu); if (ret) @@ -6607,7 +6548,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) tr->trace_ref++; - mutex_unlock(&trace_types_lock); return ret; fail: @@ -6616,7 +6556,6 @@ fail_alloc_iter: close_pipe_on_cpu(tr, cpu); fail_pipe_on_cpu: __trace_array_put(tr); - mutex_unlock(&trace_types_lock); return ret; } @@ -6625,14 +6564,13 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) struct trace_iterator *iter = file->private_data; struct trace_array *tr = inode->i_private; - mutex_lock(&trace_types_lock); - - tr->trace_ref--; + scoped_guard(mutex, &trace_types_lock) { + tr->trace_ref--; - if (iter->trace->pipe_close) - iter->trace->pipe_close(iter); - close_pipe_on_cpu(tr, iter->cpu_file); - mutex_unlock(&trace_types_lock); + if (iter->trace->pipe_close) + iter->trace->pipe_close(iter); + close_pipe_on_cpu(tr, iter->cpu_file); + } free_trace_iter_content(iter); kfree(iter); @@ -7435,7 +7373,7 @@ int tracing_set_clock(struct trace_array *tr, const char *clockstr) if (i == ARRAY_SIZE(trace_clocks)) return -EINVAL; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); tr->clock_id = i; @@ -7459,8 +7397,6 @@ int tracing_set_clock(struct trace_array *tr, const char *clockstr) tscratch->clock_id = i; } - mutex_unlock(&trace_types_lock); - return 0; } @@ -7512,15 +7448,13 @@ static int tracing_time_stamp_mode_show(struct seq_file *m, void *v) { struct trace_array *tr = m->private; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (ring_buffer_time_stamp_abs(tr->array_buffer.buffer)) seq_puts(m, "delta [absolute]\n"); else seq_puts(m, "[delta] absolute\n"); - mutex_unlock(&trace_types_lock); - return 0; } @@ -8108,14 +8042,14 @@ static void clear_tracing_err_log(struct trace_array *tr) { struct tracing_log_err *err, *next; - mutex_lock(&tracing_err_log_lock); + guard(mutex)(&tracing_err_log_lock); + list_for_each_entry_safe(err, next, &tr->err_log, list) { list_del(&err->list); free_tracing_log_err(err); } tr->n_err_log_entries = 0; - mutex_unlock(&tracing_err_log_lock); } static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos) @@ -8386,7 +8320,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); iter->tr->trace_ref--; @@ -8397,8 +8331,6 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) info->spare_cpu, info->spare); kvfree(info); - mutex_unlock(&trace_types_lock); - return 0; } @@ -8606,14 +8538,13 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned * An ioctl call with cmd 0 to the ring buffer file will wake up all * waiters */ - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); /* Make sure the waiters see the new wait_index */ (void)atomic_fetch_inc_release(&iter->wait_index); ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); - mutex_unlock(&trace_types_lock); return 0; } @@ -8954,12 +8885,12 @@ ftrace_trace_snapshot_callback(struct 
trace_array *tr, struct ftrace_hash *hash, out_reg: ret = tracing_arm_snapshot(tr); if (ret < 0) - goto out; + return ret; ret = register_ftrace_function_probe(glob, tr, ops, count); if (ret < 0) tracing_disarm_snapshot(tr); - out: + return ret < 0 ? ret : 0; } @@ -8978,13 +8909,13 @@ static inline __init int register_snapshot_cmd(void) { return 0; } static struct dentry *tracing_get_dentry(struct trace_array *tr) { - if (WARN_ON(!tr->dir)) - return ERR_PTR(-ENODEV); - /* Top directory uses NULL as the parent */ if (tr->flags & TRACE_ARRAY_FL_GLOBAL) return NULL; + if (WARN_ON(!tr->dir)) + return ERR_PTR(-ENODEV); + /* All sub buffers have a descriptor */ return tr->dir; } @@ -9103,10 +9034,9 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, return -EINVAL; if (!!(topt->flags->val & topt->opt->bit) != val) { - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); ret = __set_tracer_option(topt->tr, topt->flags, topt->opt, !val); - mutex_unlock(&trace_types_lock); if (ret) return ret; } @@ -9415,7 +9345,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf, return ret; if (buffer) { - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (!!val == tracer_tracing_is_on(tr)) { val = 0; /* do nothing */ } else if (val) { @@ -9429,7 +9359,6 @@ rb_simple_write(struct file *filp, const char __user *ubuf, /* Wake up any waiters */ ring_buffer_wake_waiters(buffer, RING_BUFFER_ALL_CPUS); } - mutex_unlock(&trace_types_lock); } (*ppos)++; @@ -9813,10 +9742,9 @@ static void __update_tracer_options(struct trace_array *tr) static void update_tracer_options(struct trace_array *tr) { - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); tracer_options_updated = true; __update_tracer_options(tr); - mutex_unlock(&trace_types_lock); } /* Must have trace_types_lock held */ @@ -9838,11 +9766,10 @@ struct trace_array *trace_array_find_get(const char *instance) { struct trace_array *tr; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); tr = trace_array_find(instance); if (tr) tr->ref++; - mutex_unlock(&trace_types_lock); return tr; } @@ -10250,6 +10177,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) ftrace_init_tracefs(tr, d_tracer); } +#ifdef CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) { struct vfsmount *mnt; @@ -10271,6 +10199,8 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) if (IS_ERR(fc)) return ERR_CAST(fc); + pr_warn("NOTICE: Automounting of tracing to debugfs is deprecated and will be removed in 2030\n"); + ret = vfs_parse_fs_string(fc, "source", "tracefs", strlen("tracefs")); if (!ret) @@ -10281,6 +10211,7 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) put_fs_context(fc); return mnt; } +#endif /** * tracing_init_dentry - initialize top level trace array @@ -10305,6 +10236,7 @@ int tracing_init_dentry(void) if (WARN_ON(!tracefs_initialized())) return -ENODEV; +#ifdef CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED /* * As there may still be users that expect the tracing * files to exist in debugfs/tracing, we must automount @@ -10313,6 +10245,7 @@ int tracing_init_dentry(void) */ tr->dir = debugfs_create_automount("tracing", NULL, trace_automount, NULL); +#endif return 0; } @@ -10329,7 +10262,7 @@ static void __init eval_map_work_func(struct work_struct *work) int len; len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps; - trace_insert_eval_map(NULL, 
__start_ftrace_eval_maps, len); + trace_event_update_with_eval_map(NULL, __start_ftrace_eval_maps, len); } static int __init trace_eval_init(void) @@ -10367,7 +10300,7 @@ bool module_exists(const char *module) { /* All modules have the symbol __this_module */ static const char this_mod[] = "__this_module"; - char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2]; + char modname[MODULE_NAME_LEN + sizeof(this_mod) + 2]; unsigned long val; int n; @@ -10382,9 +10315,6 @@ bool module_exists(const char *module) static void trace_module_add_evals(struct module *mod) { - if (!mod->num_trace_evals) - return; - /* * Modules with bad taint do not have events created, do * not bother with enums either. @@ -10392,7 +10322,8 @@ static void trace_module_add_evals(struct module *mod) if (trace_module_has_bad_taint(mod)) return; - trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals); + /* Even if no trace_evals, this need to sanitize field types. */ + trace_event_update_with_eval_map(mod, mod->trace_evals, mod->num_trace_evals); } #ifdef CONFIG_TRACE_EVAL_MAP_FILE @@ -10796,7 +10727,8 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, size_t count, loff_t *ppos, int (*createfn)(const char *)) { - char *kbuf, *buf, *tmp; + char *kbuf __free(kfree) = NULL; + char *buf, *tmp; int ret = 0; size_t done = 0; size_t size; @@ -10811,10 +10743,9 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, if (size >= WRITE_BUFSIZE) size = WRITE_BUFSIZE - 1; - if (copy_from_user(kbuf, buffer + done, size)) { - ret = -EFAULT; - goto out; - } + if (copy_from_user(kbuf, buffer + done, size)) + return -EFAULT; + kbuf[size] = '\0'; buf = kbuf; do { @@ -10830,8 +10761,7 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */ pr_warn("Line length is too long: Should be less than %d\n", WRITE_BUFSIZE - 2); - ret = -EINVAL; - goto out; + return -EINVAL; } } done += size; @@ -10844,17 +10774,12 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, ret = createfn(buf); if (ret) - goto out; + return ret; buf += size; } while (done < count); } - ret = done; - -out: - kfree(kbuf); - - return ret; + return done; } #ifdef CONFIG_TRACER_MAX_TRACE @@ -11057,7 +10982,7 @@ __init static int tracer_alloc_buffers(void) BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE); if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) - goto out; + return -ENOMEM; if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; @@ -11175,7 +11100,6 @@ out_free_cpumask: free_cpumask_var(global_trace.tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); -out: return ret; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index bd084953a98b..1dbf1d3cf2f1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -2125,13 +2125,13 @@ static inline const char *get_syscall_name(int syscall) #ifdef CONFIG_EVENT_TRACING void trace_event_init(void); -void trace_event_eval_update(struct trace_eval_map **map, int len); +void trace_event_update_all(struct trace_eval_map **map, int len); /* Used from boot time tracer */ extern int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); extern int trigger_process_regex(struct trace_event_file *file, char *buff); #else static inline void __init trace_event_init(void) { } -static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { } 
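In the same spirit, tracing_cpumask_read() and trace_parse_run_command() above switch their temporary buffers to the __free(kfree) auto-cleanup annotation from <linux/cleanup.h>, removing the kfree()-on-every-exit-path bookkeeping. A minimal, self-contained sketch of that idiom; the function and buffer names are invented for illustration:

#include <linux/cleanup.h>
#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>

/* Copy @src into @dst lower-cased; the temporary is freed on every return path. */
static int copy_lowercased(const char *src, char *dst, size_t len)
{
	char *tmp __free(kfree) = kstrdup(src, GFP_KERNEL);
	int i;

	if (!tmp)
		return -ENOMEM;

	for (i = 0; tmp[i]; i++)
		tmp[i] = tolower(tmp[i]);

	if (strscpy(dst, tmp, len) < 0)
		return -E2BIG;		/* tmp is kfree()d here as well */

	return 0;
}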
+static inline void trace_event_update_all(struct trace_eval_map **map, int len) { } #endif #ifdef CONFIG_TRACER_SNAPSHOT diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index d01e5c910ce1..9f3e9537417d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -768,6 +768,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, { struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; + bool soft_mode = atomic_read(&file->sm_ref) != 0; int ret = 0; int disable; @@ -782,7 +783,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, * is set we do not want the event to be enabled before we * clear the bit. * - * When soft_disable is not set but the SOFT_MODE flag is, + * When soft_disable is not set but the soft_mode is, * we do nothing. Do not disable the tracepoint, otherwise * "soft enable"s (clearing the SOFT_DISABLED bit) wont work. */ @@ -790,11 +791,11 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, if (atomic_dec_return(&file->sm_ref) > 0) break; disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED; - clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); + soft_mode = false; /* Disable use of trace_buffered_event */ trace_buffered_event_disable(); } else - disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE); + disable = !soft_mode; if (disable && (file->flags & EVENT_FILE_FL_ENABLED)) { clear_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags); @@ -812,8 +813,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, WARN_ON_ONCE(ret); } - /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ - if (file->flags & EVENT_FILE_FL_SOFT_MODE) + /* If in soft mode, just set the SOFT_DISABLE_BIT, else clear it */ + if (soft_mode) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); else clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); @@ -823,7 +824,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, * When soft_disable is set and enable is set, we want to * register the tracepoint for the event, but leave the event * as is. That means, if the event was already enabled, we do - * nothing (but set SOFT_MODE). If the event is disabled, we + * nothing (but set soft_mode). If the event is disabled, we * set SOFT_DISABLED before enabling the event tracepoint, so * it still seems to be disabled. */ @@ -832,7 +833,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, else { if (atomic_inc_return(&file->sm_ref) > 1) break; - set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); + soft_mode = true; /* Enable use of trace_buffered_event */ trace_buffered_event_enable(); } @@ -840,7 +841,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, if (!(file->flags & EVENT_FILE_FL_ENABLED)) { bool cmd = false, tgid = false; - /* Keep the event disabled, when going to SOFT_MODE. */ + /* Keep the event disabled, when going to soft mode. 
*/ if (soft_disable) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); @@ -1792,8 +1793,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, !(flags & EVENT_FILE_FL_SOFT_DISABLED)) strcpy(buf, "1"); - if (flags & EVENT_FILE_FL_SOFT_DISABLED || - flags & EVENT_FILE_FL_SOFT_MODE) + if (atomic_read(&file->sm_ref) != 0) strcat(buf, "*"); strcat(buf, "\n"); @@ -3267,43 +3267,120 @@ static void add_str_to_module(struct module *module, char *str) list_add(&modstr->next, &module_strings); } +#define ATTRIBUTE_STR "__attribute__(" +#define ATTRIBUTE_STR_LEN (sizeof(ATTRIBUTE_STR) - 1) + +/* Remove all __attribute__() from @type. Return allocated string or @type. */ +static char *sanitize_field_type(const char *type) +{ + char *attr, *tmp, *next, *ret = (char *)type; + int depth; + + next = (char *)type; + while ((attr = strstr(next, ATTRIBUTE_STR))) { + /* Retry if "__attribute__(" is a part of another word. */ + if (attr != next && !isspace(attr[-1])) { + next = attr + ATTRIBUTE_STR_LEN; + continue; + } + + if (ret == type) { + ret = kstrdup(type, GFP_KERNEL); + if (WARN_ON_ONCE(!ret)) + return NULL; + attr = ret + (attr - type); + } + + /* the ATTRIBUTE_STR already has the first '(' */ + depth = 1; + next = attr + ATTRIBUTE_STR_LEN; + do { + tmp = strpbrk(next, "()"); + /* There is unbalanced parentheses */ + if (WARN_ON_ONCE(!tmp)) { + kfree(ret); + return (char *)type; + } + + if (*tmp == '(') + depth++; + else + depth--; + next = tmp + 1; + } while (depth > 0); + next = skip_spaces(next); + strcpy(attr, next); + next = attr; + } + return ret; +} + +static char *find_replacable_eval(const char *type, const char *eval_string, + int len) +{ + char *ptr; + + if (!eval_string) + return NULL; + + ptr = strchr(type, '['); + if (!ptr) + return NULL; + ptr++; + + if (!isalpha(*ptr) && *ptr != '_') + return NULL; + + if (strncmp(eval_string, ptr, len) != 0) + return NULL; + + return ptr; +} + static void update_event_fields(struct trace_event_call *call, struct trace_eval_map *map) { struct ftrace_event_field *field; + const char *eval_string = NULL; struct list_head *head; + int len = 0; char *ptr; char *str; - int len = strlen(map->eval_string); /* Dynamic events should never have field maps */ - if (WARN_ON_ONCE(call->flags & TRACE_EVENT_FL_DYNAMIC)) + if (call->flags & TRACE_EVENT_FL_DYNAMIC) return; + if (map) { + eval_string = map->eval_string; + len = strlen(map->eval_string); + } + head = trace_get_fields(call); list_for_each_entry(field, head, link) { - ptr = strchr(field->type, '['); - if (!ptr) - continue; - ptr++; - - if (!isalpha(*ptr) && *ptr != '_') - continue; + str = sanitize_field_type(field->type); + if (!str) + return; - if (strncmp(map->eval_string, ptr, len) != 0) - continue; + ptr = find_replacable_eval(str, eval_string, len); + if (ptr) { + if (str == field->type) { + str = kstrdup(field->type, GFP_KERNEL); + if (WARN_ON_ONCE(!str)) + return; + ptr = str + (ptr - field->type); + } - str = kstrdup(field->type, GFP_KERNEL); - if (WARN_ON_ONCE(!str)) - return; - ptr = str + (ptr - field->type); - ptr = eval_replace(ptr, map, len); - /* enum/sizeof string smaller than value */ - if (WARN_ON_ONCE(!ptr)) { - kfree(str); - continue; + ptr = eval_replace(ptr, map, len); + /* enum/sizeof string smaller than value */ + if (WARN_ON_ONCE(!ptr)) { + kfree(str); + continue; + } } + if (str == field->type) + continue; /* * If the event is part of a module, then we need to free the string * when the module is removed. 
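To make the effect of sanitize_field_type() above concrete, here is a hypothetical before/after pair; the annotated type string is an invented example of a btf_type_tag-tagged field, not taken from a real event definition:

/*
 * Possible field->type as recorded when the kernel is built with
 * CONFIG_DEBUG_INFO_BTF and the compiler supports btf_type_tag:
 *
 *	const char __attribute__((btf_type_tag("user"))) *ubuf
 *
 * After sanitize_field_type() strips the balanced __attribute__() block:
 *
 *	const char *ubuf
 *
 * The rest of update_event_fields() (below) re-runs filter_assign_type()
 * on the cleaned string when filter_type was FILTER_OTHER, so event
 * filters see a plain C type rather than the attribute-annotated one.
 */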
Otherwise, it will stay allocated @@ -3313,14 +3390,18 @@ static void update_event_fields(struct trace_event_call *call, add_str_to_module(call->module, str); field->type = str; + if (field->filter_type == FILTER_OTHER) + field->filter_type = filter_assign_type(field->type); } } -void trace_event_eval_update(struct trace_eval_map **map, int len) +/* Update all events for replacing eval and sanitizing */ +void trace_event_update_all(struct trace_eval_map **map, int len) { struct trace_event_call *call, *p; const char *last_system = NULL; bool first = false; + bool updated; int last_i; int i; @@ -3333,6 +3414,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) last_system = call->class->system; } + updated = false; /* * Since calls are grouped by systems, the likelihood that the * next call in the iteration belongs to the same system as the @@ -3352,8 +3434,12 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) } update_event_printk(call, map[i]); update_event_fields(call, map[i]); + updated = true; } } + /* If not updated yet, update field for sanitizing. */ + if (!updated) + update_event_fields(call, NULL); cond_resched(); } up_write(&trace_event_sem); @@ -3587,7 +3673,7 @@ static int probe_remove_event_call(struct trace_event_call *call) continue; /* * We can't rely on ftrace_event_enable_disable(enable => 0) - * we are going to do, EVENT_FILE_FL_SOFT_MODE can suppress + * we are going to do, soft mode can suppress * TRACE_REG_UNREGISTER. */ if (file->flags & EVENT_FILE_FL_ENABLED) @@ -3698,7 +3784,7 @@ static void trace_module_remove_events(struct module *mod) if (call->module == mod) __trace_remove_event_call(call); } - /* Check for any strings allocade for this module */ + /* Check for any strings allocated for this module */ list_for_each_entry_safe(modstr, m, &module_strings, next) { if (modstr->module != mod) continue; @@ -4002,7 +4088,7 @@ static int free_probe_data(void *data) edata->ref--; if (!edata->ref) { - /* Remove the SOFT_MODE flag */ + /* Remove soft mode */ __ftrace_event_enable_disable(edata->file, 0, 1); trace_event_put_ref(edata->file->event_call); kfree(edata); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e4581e10782b..54226b48b2d1 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1344,13 +1344,14 @@ struct filter_list { struct filter_head { struct list_head list; - struct rcu_head rcu; + union { + struct rcu_head rcu; + struct rcu_work rwork; + }; }; - -static void free_filter_list(struct rcu_head *rhp) +static void free_filter_list(struct filter_head *filter_list) { - struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu); struct filter_list *filter_item, *tmp; list_for_each_entry_safe(filter_item, tmp, &filter_list->list, list) { @@ -1361,9 +1362,20 @@ static void free_filter_list(struct rcu_head *rhp) kfree(filter_list); } +static void free_filter_list_work(struct work_struct *work) +{ + struct filter_head *filter_list; + + filter_list = container_of(to_rcu_work(work), struct filter_head, rwork); + free_filter_list(filter_list); +} + static void free_filter_list_tasks(struct rcu_head *rhp) { - call_rcu(rhp, free_filter_list); + struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu); + + INIT_RCU_WORK(&filter_list->rwork, free_filter_list_work); + queue_rcu_work(system_wq, &filter_list->rwork); } /* @@ -1460,7 +1472,7 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, 
tracepoint_synchronize_unregister(); if (head) - free_filter_list(&head->rcu); + free_filter_list(head); list_for_each_entry(file, &tr->events, list) { if (file->system != dir || !file->filter) @@ -2305,7 +2317,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, return 0; fail: /* No call succeeded */ - free_filter_list(&filter_list->rcu); + free_filter_list(filter_list); parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); return -EINVAL; fail_mem: @@ -2315,7 +2327,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, if (!fail) delay_free_filter(filter_list); else - free_filter_list(&filter_list->rcu); + free_filter_list(filter_list); return -ENOMEM; } diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 33cfbd4ed76d..f24ee61f8884 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -536,12 +536,12 @@ static notrace void trace_event_raw_event_synth(void *__data, * is being performed within another event. */ buffer = trace_file->tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); + guard(ring_buffer_nest)(buffer); entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry) + fields_size); if (!entry) - goto out; + return; for (i = 0, n_u64 = 0; i < event->n_fields; i++) { val_idx = var_ref_idx[i]; @@ -584,8 +584,6 @@ static notrace void trace_event_raw_event_synth(void *__data, } trace_event_buffer_commit(&fbuffer); -out: - ring_buffer_nest_end(buffer); } static void free_synth_event_print_fmt(struct trace_event_call *call) diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index b65353ec2837..2f7b94e98317 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -325,12 +325,9 @@ static void move_to_next_cpu(void) cpus_read_lock(); cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); - next_cpu = cpumask_next(raw_smp_processor_id(), current_mask); + next_cpu = cpumask_next_wrap(raw_smp_processor_id(), current_mask); cpus_read_unlock(); - if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(current_mask); - if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! 
*/ goto change_mode; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0b3db02030a7..97db0b0ccf3e 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -701,6 +701,7 @@ void print_function_args(struct trace_seq *s, unsigned long *args, struct btf *btf; s32 tid, nr = 0; int a, p, x; + u16 encode; trace_seq_printf(s, "("); @@ -744,7 +745,12 @@ void print_function_args(struct trace_seq *s, unsigned long *args, trace_seq_printf(s, "0x%lx", arg); break; case BTF_KIND_INT: - trace_seq_printf(s, "%ld", arg); + encode = btf_int_encoding(t); + /* Print unsigned ints as hex */ + if (encode & BTF_INT_SIGNED) + trace_seq_printf(s, "%ld", arg); + else + trace_seq_printf(s, "0x%lx", arg); break; case BTF_KIND_ENUM: trace_seq_printf(s, "%ld", arg); diff --git a/kernel/ucount.c b/kernel/ucount.c index 8686e329b8f2..586af49fc03e 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -199,18 +199,16 @@ void put_ucounts(struct ucounts *ucounts) } } -static inline bool atomic_long_inc_below(atomic_long_t *v, int u) +static inline bool atomic_long_inc_below(atomic_long_t *v, long u) { - long c, old; - c = atomic_long_read(v); - for (;;) { + long c = atomic_long_read(v); + + do { if (unlikely(c >= u)) return false; - old = atomic_long_cmpxchg(v, c, c+1); - if (likely(old == c)) - return true; - c = old; - } + } while (!atomic_long_try_cmpxchg(v, &c, c+1)); + + return true; } struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile new file mode 100644 index 000000000000..eae37bea54fd --- /dev/null +++ b/kernel/unwind/Makefile @@ -0,0 +1 @@ + obj-$(CONFIG_UNWIND_USER) += user.o deferred.o diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c new file mode 100644 index 000000000000..dc6040aae3ee --- /dev/null +++ b/kernel/unwind/deferred.c @@ -0,0 +1,362 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Deferred user space unwinding + */ +#include <linux/sched/task_stack.h> +#include <linux/unwind_deferred.h> +#include <linux/sched/clock.h> +#include <linux/task_work.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <linux/mm.h> + +/* + * For requesting a deferred user space stack trace from NMI context + * the architecture must support a safe cmpxchg in NMI context. + * For those architectures that do not have that, then it cannot ask + * for a deferred user space stack trace from an NMI context. If it + * does, then it will get -EINVAL. 
+ */ +#if defined(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) +# define CAN_USE_IN_NMI 1 +static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt) +{ + u32 old = 0; + + return try_cmpxchg(&info->id.cnt, &old, cnt); +} +#else +# define CAN_USE_IN_NMI 0 +/* When NMIs are not allowed, this always succeeds */ +static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt) +{ + info->id.cnt = cnt; + return true; +} +#endif + +/* Make the cache fit in a 4K page */ +#define UNWIND_MAX_ENTRIES \ + ((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long)) + +/* Guards adding to or removing from the list of callbacks */ +static DEFINE_MUTEX(callback_mutex); +static LIST_HEAD(callbacks); + +#define RESERVED_BITS (UNWIND_PENDING | UNWIND_USED) + +/* Zero'd bits are available for assigning callback users */ +static unsigned long unwind_mask = RESERVED_BITS; +DEFINE_STATIC_SRCU(unwind_srcu); + +static inline bool unwind_pending(struct unwind_task_info *info) +{ + return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask); +} + +/* + * This is a unique percpu identifier for a given task entry context. + * Conceptually, it's incremented every time the CPU enters the kernel from + * user space, so that each "entry context" on the CPU gets a unique ID. In + * reality, as an optimization, it's only incremented on demand for the first + * deferred unwind request after a given entry-from-user. + * + * It's combined with the CPU id to make a systemwide-unique "context cookie". + */ +static DEFINE_PER_CPU(u32, unwind_ctx_ctr); + +/* + * The context cookie is a unique identifier that is assigned to a user + * space stacktrace. As the user space stacktrace remains the same while + * the task is in the kernel, the cookie is an identifier for the stacktrace. + * Although it is possible for the stacktrace to get another cookie if another + * request is made after the cookie was cleared and before reentering user + * space. + */ +static u64 get_cookie(struct unwind_task_info *info) +{ + u32 cnt = 1; + + if (info->id.cpu) + return info->id.id; + + /* LSB is always set to ensure 0 is an invalid value */ + cnt |= __this_cpu_read(unwind_ctx_ctr) + 2; + if (try_assign_cnt(info, cnt)) { + /* Update the per cpu counter */ + __this_cpu_write(unwind_ctx_ctr, cnt); + } + /* Interrupts are disabled, the CPU will always be same */ + info->id.cpu = smp_processor_id() + 1; /* Must be non zero */ + + return info->id.id; +} + +/** + * unwind_user_faultable - Produce a user stacktrace in faultable context + * @trace: The descriptor that will store the user stacktrace + * + * This must be called in a known faultable context (usually when entering + * or exiting user space). Depending on the available implementations + * the @trace will be loaded with the addresses of the user space stacktrace + * if it can be found. + * + * Return: 0 on success and negative on error + * On success @trace will contain the user space stacktrace + */ +int unwind_user_faultable(struct unwind_stacktrace *trace) +{ + struct unwind_task_info *info = ¤t->unwind_info; + struct unwind_cache *cache; + + /* Should always be called from faultable context */ + might_fault(); + + if (!current->mm) + return -EINVAL; + + if (!info->cache) { + info->cache = kzalloc(struct_size(cache, entries, UNWIND_MAX_ENTRIES), + GFP_KERNEL); + if (!info->cache) + return -ENOMEM; + } + + cache = info->cache; + trace->entries = cache->entries; + + if (cache->nr_entries) { + /* + * The user stack has already been previously unwound in this + * entry context. 
Skip the unwind and use the cache. + */ + trace->nr = cache->nr_entries; + return 0; + } + + trace->nr = 0; + unwind_user(trace, UNWIND_MAX_ENTRIES); + + cache->nr_entries = trace->nr; + + /* Clear nr_entries on way back to user space */ + set_bit(UNWIND_USED_BIT, &info->unwind_mask); + + return 0; +} + +static void process_unwind_deferred(struct task_struct *task) +{ + struct unwind_task_info *info = &task->unwind_info; + struct unwind_stacktrace trace; + struct unwind_work *work; + unsigned long bits; + u64 cookie; + + if (WARN_ON_ONCE(!unwind_pending(info))) + return; + + /* Clear pending bit but make sure to have the current bits */ + bits = atomic_long_fetch_andnot(UNWIND_PENDING, + (atomic_long_t *)&info->unwind_mask); + /* + * From here on out, the callback must always be called, even if it's + * just an empty trace. + */ + trace.nr = 0; + trace.entries = NULL; + + unwind_user_faultable(&trace); + + if (info->cache) + bits &= ~(info->cache->unwind_completed); + + cookie = info->id.id; + + guard(srcu)(&unwind_srcu); + list_for_each_entry_srcu(work, &callbacks, list, + srcu_read_lock_held(&unwind_srcu)) { + if (test_bit(work->bit, &bits)) { + work->func(work, &trace, cookie); + if (info->cache) + info->cache->unwind_completed |= BIT(work->bit); + } + } +} + +static void unwind_deferred_task_work(struct callback_head *head) +{ + process_unwind_deferred(current); +} + +void unwind_deferred_task_exit(struct task_struct *task) +{ + struct unwind_task_info *info = ¤t->unwind_info; + + if (!unwind_pending(info)) + return; + + process_unwind_deferred(task); + + task_work_cancel(task, &info->work); +} + +/** + * unwind_deferred_request - Request a user stacktrace on task kernel exit + * @work: Unwind descriptor requesting the trace + * @cookie: The cookie of the first request made for this task + * + * Schedule a user space unwind to be done in task work before exiting the + * kernel. + * + * The returned @cookie output is the generated cookie of the very first + * request for a user space stacktrace for this task since it entered the + * kernel. It can be from a request by any caller of this infrastructure. + * Its value will also be passed to the callback function. It can be + * used to stitch kernel and user stack traces together in post-processing. + * + * It's valid to call this function multiple times for the same @work within + * the same task entry context. Each call will return the same cookie + * while the task hasn't left the kernel. If the callback is not pending + * because it has already been previously called for the same entry context, + * it will be called again with the same stack trace and cookie. + * + * Return: 0 if the callback successfully was queued. + * 1 if the callback is pending or was already executed. + * Negative if there's an error. + * @cookie holds the cookie of the first request by any user + */ +int unwind_deferred_request(struct unwind_work *work, u64 *cookie) +{ + struct unwind_task_info *info = ¤t->unwind_info; + unsigned long old, bits; + unsigned long bit; + int ret; + + *cookie = 0; + + if ((current->flags & (PF_KTHREAD | PF_EXITING)) || + !user_mode(task_pt_regs(current))) + return -EINVAL; + + /* + * NMI requires having safe cmpxchg operations. + * Trigger a warning to make it obvious that an architecture + * is using this in NMI when it should not be. 
+         */
+        if (WARN_ON_ONCE(!CAN_USE_IN_NMI && in_nmi()))
+                return -EINVAL;
+
+        /* Do not allow cancelled works to request again */
+        bit = READ_ONCE(work->bit);
+        if (WARN_ON_ONCE(bit < 0))
+                return -EINVAL;
+
+        /* Only need the mask now */
+        bit = BIT(bit);
+
+        guard(irqsave)();
+
+        *cookie = get_cookie(info);
+
+        old = READ_ONCE(info->unwind_mask);
+
+        /* Is this already queued or executed */
+        if (old & bit)
+                return 1;
+
+        /*
+         * This work's bit hasn't been set yet. Now set it with the PENDING
+         * bit and fetch the current value of unwind_mask. If either the
+         * work's bit or PENDING was already set, then this is already queued
+         * to have a callback.
+         */
+        bits = UNWIND_PENDING | bit;
+        old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask);
+        if (old & bits) {
+                /*
+                 * If the work's bit was set, whatever set it had better
+                 * have also set pending and queued a callback.
+                 */
+                WARN_ON_ONCE(!(old & UNWIND_PENDING));
+                return old & bit;
+        }
+
+        /* The work has been claimed, now schedule it. */
+        ret = task_work_add(current, &info->work, TWA_RESUME);
+
+        if (WARN_ON_ONCE(ret))
+                WRITE_ONCE(info->unwind_mask, 0);
+
+        return ret;
+}
+
+void unwind_deferred_cancel(struct unwind_work *work)
+{
+        struct task_struct *g, *t;
+        int bit;
+
+        if (!work)
+                return;
+
+        bit = work->bit;
+
+        /* No work should be using a reserved bit */
+        if (WARN_ON_ONCE(BIT(bit) & RESERVED_BITS))
+                return;
+
+        guard(mutex)(&callback_mutex);
+        list_del_rcu(&work->list);
+
+        /* Do not allow any more requests and prevent callbacks */
+        work->bit = -1;
+
+        __clear_bit(bit, &unwind_mask);
+
+        synchronize_srcu(&unwind_srcu);
+
+        guard(rcu)();
+        /* Clear this bit from all threads */
+        for_each_process_thread(g, t) {
+                clear_bit(bit, &t->unwind_info.unwind_mask);
+                if (t->unwind_info.cache)
+                        clear_bit(bit, &t->unwind_info.cache->unwind_completed);
+        }
+}
+
+int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
+{
+        memset(work, 0, sizeof(*work));
+
+        guard(mutex)(&callback_mutex);
+
+        /* See if there's a bit in the mask available */
+        if (unwind_mask == ~0UL)
+                return -EBUSY;
+
+        work->bit = ffz(unwind_mask);
+        __set_bit(work->bit, &unwind_mask);
+
+        list_add_rcu(&work->list, &callbacks);
+        work->func = func;
+        return 0;
+}
+
+void unwind_task_init(struct task_struct *task)
+{
+        struct unwind_task_info *info = &task->unwind_info;
+
+        memset(info, 0, sizeof(*info));
+        init_task_work(&info->work, unwind_deferred_task_work);
+        info->unwind_mask = 0;
+}
+
+void unwind_task_free(struct task_struct *task)
+{
+        struct unwind_task_info *info = &task->unwind_info;
+
+        kfree(info->cache);
+        task_work_cancel(task, &info->work);
+}
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
new file mode 100644
index 000000000000..97a8415e3216
--- /dev/null
+++ b/kernel/unwind/user.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generic interfaces for unwinding user space
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/unwind_user.h>
+#include <linux/uaccess.h>
+
+static const struct unwind_user_frame fp_frame = {
+        ARCH_INIT_USER_FP_FRAME
+};
+
+#define for_each_user_frame(state) \
+        for (unwind_user_start(state); !(state)->done; unwind_user_next(state))
+
+static int unwind_user_next_fp(struct unwind_user_state *state)
+{
+        const struct unwind_user_frame *frame = &fp_frame;
+        unsigned long cfa, fp, ra;
+        unsigned int shift;
+
+        if (frame->use_fp) {
+                if (state->fp < state->sp)
+                        return -EINVAL;
+                cfa = state->fp;
+        } else {
+                cfa = state->sp;
+        }
+
+        /* Get the Canonical Frame Address (CFA) */
+        cfa += frame->cfa_off;
+
+        /* stack going in wrong direction? */
+        if (cfa <= state->sp)
+                return -EINVAL;
+
+        /* Make sure that the address is word aligned */
+        shift = sizeof(long) == 4 ? 2 : 3;
+        if (cfa & ((1 << shift) - 1))
+                return -EINVAL;
+
+        /* Find the Return Address (RA) */
+        if (get_user(ra, (unsigned long *)(cfa + frame->ra_off)))
+                return -EINVAL;
+
+        if (frame->fp_off && get_user(fp, (unsigned long __user *)(cfa + frame->fp_off)))
+                return -EINVAL;
+
+        state->ip = ra;
+        state->sp = cfa;
+        if (frame->fp_off)
+                state->fp = fp;
+        return 0;
+}
+
+static int unwind_user_next(struct unwind_user_state *state)
+{
+        unsigned long iter_mask = state->available_types;
+        unsigned int bit;
+
+        if (state->done)
+                return -EINVAL;
+
+        for_each_set_bit(bit, &iter_mask, NR_UNWIND_USER_TYPE_BITS) {
+                enum unwind_user_type type = BIT(bit);
+
+                state->current_type = type;
+                switch (type) {
+                case UNWIND_USER_TYPE_FP:
+                        if (!unwind_user_next_fp(state))
+                                return 0;
+                        continue;
+                default:
+                        WARN_ONCE(1, "Undefined unwind bit %d", bit);
+                        break;
+                }
+                break;
+        }
+
+        /* No successful unwind method. */
+        state->current_type = UNWIND_USER_TYPE_NONE;
+        state->done = true;
+        return -EINVAL;
+}
+
+static int unwind_user_start(struct unwind_user_state *state)
+{
+        struct pt_regs *regs = task_pt_regs(current);
+
+        memset(state, 0, sizeof(*state));
+
+        if ((current->flags & PF_KTHREAD) || !user_mode(regs)) {
+                state->done = true;
+                return -EINVAL;
+        }
+
+        if (IS_ENABLED(CONFIG_HAVE_UNWIND_USER_FP))
+                state->available_types |= UNWIND_USER_TYPE_FP;
+
+        state->ip = instruction_pointer(regs);
+        state->sp = user_stack_pointer(regs);
+        state->fp = frame_pointer(regs);
+
+        return 0;
+}
+
+int unwind_user(struct unwind_stacktrace *trace, unsigned int max_entries)
+{
+        struct unwind_user_state state;
+
+        trace->nr = 0;
+
+        if (!max_entries)
+                return -EINVAL;
+
+        if (current->flags & PF_KTHREAD)
+                return 0;
+
+        for_each_user_frame(&state) {
+                trace->entries[trace->nr++] = state.ip;
+                if (trace->nr >= max_entries)
+                        break;
+        }
+
+        return 0;
+}
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
index 2f844c279a3e..bc738fa90c1d 100644
--- a/kernel/vhost_task.c
+++ b/kernel/vhost_task.c
@@ -145,7 +145,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *),
         tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args);
         if (IS_ERR(tsk)) {
                 kfree(vtsk);
-                return ERR_PTR(PTR_ERR(tsk));
+                return ERR_CAST(tsk);
         }
         vtsk->task = tsk;
diff --git a/kernel/watchdog_buddy.c b/kernel/watchdog_buddy.c
index 34dbfe091f4b..ee754d767c21 100644
--- a/kernel/watchdog_buddy.c
+++ b/kernel/watchdog_buddy.c
@@ -12,10 +12,7 @@ static unsigned int watchdog_next_cpu(unsigned int cpu)
 {
         unsigned int next_cpu;
-        next_cpu = cpumask_next(cpu, &watchdog_cpus);
-        if (next_cpu >= nr_cpu_ids)
-                next_cpu = cpumask_first(&watchdog_cpus);
-
+        next_cpu = cpumask_next_wrap(cpu, &watchdog_cpus);
         if (next_cpu == cpu)
                 return nr_cpu_ids;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9f9148075828..c6b79b3675c3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -505,12 +505,16 @@ static struct kthread_worker *pwq_release_worker __ro_after_init;
 struct workqueue_struct *system_wq __ro_after_init;
 EXPORT_SYMBOL(system_wq);
+struct workqueue_struct *system_percpu_wq __ro_after_init;
+EXPORT_SYMBOL(system_percpu_wq);
 struct workqueue_struct *system_highpri_wq __ro_after_init;
 EXPORT_SYMBOL_GPL(system_highpri_wq);
 struct workqueue_struct *system_long_wq __ro_after_init;
 EXPORT_SYMBOL_GPL(system_long_wq);
 struct workqueue_struct *system_unbound_wq __ro_after_init;
 EXPORT_SYMBOL_GPL(system_unbound_wq);
+struct workqueue_struct *system_dfl_wq __ro_after_init;
+EXPORT_SYMBOL_GPL(system_dfl_wq);
 struct workqueue_struct *system_freezable_wq __ro_after_init;
 EXPORT_SYMBOL_GPL(system_freezable_wq);
 struct workqueue_struct *system_power_efficient_wq __ro_after_init;
@@ -1686,17 +1690,14 @@ static void __pwq_activate_work(struct pool_workqueue *pwq,
 static bool tryinc_node_nr_active(struct wq_node_nr_active *nna)
 {
         int max = READ_ONCE(nna->max);
+        int old = atomic_read(&nna->nr);
-        while (true) {
-                int old, tmp;
-
-                old = atomic_read(&nna->nr);
+        do {
                 if (old >= max)
                         return false;
-                tmp = atomic_cmpxchg_relaxed(&nna->nr, old, old + 1);
-                if (tmp == old)
-                        return true;
-        }
+        } while (!atomic_try_cmpxchg_relaxed(&nna->nr, &old, old + 1));
+
+        return true;
 }
 /**
@@ -2221,12 +2222,9 @@ static int wq_select_unbound_cpu(int cpu)
         }
         new_cpu = __this_cpu_read(wq_rr_cpu_last);
-        new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
-        if (unlikely(new_cpu >= nr_cpu_ids)) {
-                new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask);
-                if (unlikely(new_cpu >= nr_cpu_ids))
-                        return cpu;
-        }
+        new_cpu = cpumask_next_and_wrap(new_cpu, wq_unbound_cpumask, cpu_online_mask);
+        if (unlikely(new_cpu >= nr_cpu_ids))
+                return cpu;
         __this_cpu_write(wq_rr_cpu_last, new_cpu);
         return new_cpu;
@@ -4629,7 +4627,7 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs)
 *
 * Return: The allocated new workqueue_attr on success. %NULL on failure.
 */
-struct workqueue_attrs *alloc_workqueue_attrs(void)
+struct workqueue_attrs *alloc_workqueue_attrs_noprof(void)
 {
         struct workqueue_attrs *attrs;
@@ -5682,12 +5680,12 @@ static struct workqueue_struct *__alloc_workqueue(const char *fmt,
         else
                 wq_size = sizeof(*wq);
-        wq = kzalloc(wq_size, GFP_KERNEL);
+        wq = kzalloc_noprof(wq_size, GFP_KERNEL);
         if (!wq)
                 return NULL;
         if (flags & WQ_UNBOUND) {
-                wq->unbound_attrs = alloc_workqueue_attrs();
+                wq->unbound_attrs = alloc_workqueue_attrs_noprof();
                 if (!wq->unbound_attrs)
                         goto err_free_wq;
         }
@@ -5777,9 +5775,9 @@ err_destroy:
 }
 __printf(1, 4)
-struct workqueue_struct *alloc_workqueue(const char *fmt,
-                                         unsigned int flags,
-                                         int max_active, ...)
+struct workqueue_struct *alloc_workqueue_noprof(const char *fmt,
+                                                unsigned int flags,
+                                                int max_active, ...)
 {
         struct workqueue_struct *wq;
         va_list args;
@@ -5794,7 +5792,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
         return wq;
 }
-EXPORT_SYMBOL_GPL(alloc_workqueue);
+EXPORT_SYMBOL_GPL(alloc_workqueue_noprof);
 #ifdef CONFIG_LOCKDEP
 __printf(1, 5)
@@ -6770,31 +6768,6 @@ long work_on_cpu_key(int cpu, long (*fn)(void *),
         return wfc.ret;
 }
 EXPORT_SYMBOL_GPL(work_on_cpu_key);
-
-/**
- * work_on_cpu_safe_key - run a function in thread context on a particular cpu
- * @cpu: the cpu to run on
- * @fn: the function to run
- * @arg: the function argument
- * @key: The lock class key for lock debugging purposes
- *
- * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
- * any locks which would prevent @fn from completing.
- *
- * Return: The value @fn returns.
- */
-long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
-                          void *arg, struct lock_class_key *key)
-{
-        long ret = -ENODEV;
-
-        cpus_read_lock();
-        if (cpu_online(cpu))
-                ret = work_on_cpu_key(cpu, fn, arg, key);
-        cpus_read_unlock();
-        return ret;
-}
-EXPORT_SYMBOL_GPL(work_on_cpu_safe_key);
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_FREEZER
@@ -7830,10 +7803,11 @@ void __init workqueue_init_early(void)
         }
         system_wq = alloc_workqueue("events", 0, 0);
+        system_percpu_wq = alloc_workqueue("events", 0, 0);
         system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
         system_long_wq = alloc_workqueue("events_long", 0, 0);
-        system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
-                                            WQ_MAX_ACTIVE);
+        system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
+        system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
         system_freezable_wq = alloc_workqueue("events_freezable", WQ_FREEZABLE, 0);
         system_power_efficient_wq = alloc_workqueue("events_power_efficient",
@@ -7844,8 +7818,8 @@ void __init workqueue_init_early(void)
         system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0);
         system_bh_highpri_wq = alloc_workqueue("events_bh_highpri", WQ_BH | WQ_HIGHPRI, 0);
-        BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
-               !system_unbound_wq || !system_freezable_wq ||
+        BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq ||
+               !system_unbound_wq || !system_freezable_wq || !system_dfl_wq ||
               !system_power_efficient_wq || !system_freezable_power_efficient_wq ||
               !system_bh_wq || !system_bh_highpri_wq);
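
The new kernel/unwind/deferred.c is consumed through a small callback API: unwind_deferred_init() reserves a bit in unwind_mask and registers a callback, unwind_deferred_request() arms a task_work that performs the user unwind once per entry context, and each registered callback then receives the same trace and context cookie before the task returns to user space. A minimal consumer might look like the sketch below; the header name <linux/unwind_deferred.h> and all my_* names are assumptions for illustration, only the unwind_* calls come from the diff above.

/* Sketch of a deferred-unwind consumer; everything except the unwind_* API is made up. */
#include <linux/unwind_deferred.h>      /* assumed location of the new declarations */
#include <linux/printk.h>

static struct unwind_work my_unwind_work;

/* Runs in faultable task context shortly before the task returns to user space. */
static void my_unwind_callback(struct unwind_work *work,
                               struct unwind_stacktrace *trace, u64 cookie)
{
        pr_debug("cookie %llx: %u user frames\n",
                 (unsigned long long)cookie, trace->nr);
}

static int my_tracer_register(void)
{
        /* Reserves one bit in unwind_mask for this user. */
        return unwind_deferred_init(&my_unwind_work, my_unwind_callback);
}

/* Typically called from an event handler, e.g. in interrupt context. */
static void my_event_hit(void)
{
        u64 cookie;

        /* 0: queued, 1: already pending/done for this entry context, <0: error. */
        if (unwind_deferred_request(&my_unwind_work, &cookie) < 0)
                return;
        /* Record @cookie with the kernel-side sample to stitch traces later. */
}

static void my_tracer_unregister(void)
{
        /* Releases the bit and clears per-task pending state. */
        unwind_deferred_cancel(&my_unwind_work);
}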
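get_cookie() builds the 64-bit context cookie from two halves: a per-CPU counter (kept odd by setting the LSB and bumping by 2, so 0 can never be handed out) and the CPU number plus one. The layout of info->id is defined in a header that is not part of this diff, so the union below is only an assumed sketch that is consistent with how the code uses id.cnt, id.cpu and id.id.

/* Assumed shape of the per-task cookie storage; the real definition is outside this diff. */
#include <linux/types.h>

union unwind_task_id {
        struct {
                u32     cpu;    /* smp_processor_id() + 1, 0 means "no cookie yet" */
                u32     cnt;    /* per-CPU counter with the LSB always set */
        };
        u64     id;             /* the value handed to callbacks as @cookie */
};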
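unwind_user_next_fp() in kernel/unwind/user.c walks frames purely from the offsets in struct unwind_user_frame, which each architecture supplies through ARCH_INIT_USER_FP_FRAME (used to seed fp_frame above). No architecture definition is part of this diff, so the values below are only an illustrative guess for a conventional frame-pointer ABI where the saved frame pointer and return address sit immediately below the CFA; the field names are the ones the unwinder dereferences.

/*
 * Hypothetical ARCH_INIT_USER_FP_FRAME for an x86-64-style frame-pointer ABI.
 * The real macro comes from the architecture's headers.
 *
 *   cfa = fp + 2 * wordsize   (CFA sits just above the saved FP/RA pair)
 *   ra  is read from cfa - 1 * wordsize
 *   fp  is read from cfa - 2 * wordsize
 */
#define ARCH_INIT_USER_FP_FRAME                         \
        .cfa_off        = (int)sizeof(long) *  2,       \
        .ra_off         = (int)sizeof(long) * -1,       \
        .fp_off         = (int)sizeof(long) * -2,       \
        .use_fp         = true,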
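The tryinc_node_nr_active() hunk in kernel/workqueue.c replaces an open-coded cmpxchg loop with atomic_try_cmpxchg_relaxed(), which refreshes its "old" argument with the current value on failure and so removes the explicit re-read and temporary. The same bounded-increment pattern in isolation, with hypothetical names:

#include <linux/atomic.h>
#include <linux/types.h>

/* Increment @counter only while it stays below @limit; returns true if incremented. */
static bool try_inc_below(atomic_t *counter, int limit)
{
        int old = atomic_read(counter);

        do {
                if (old >= limit)
                        return false;
                /* On failure, @old is reloaded with the current counter value. */
        } while (!atomic_try_cmpxchg_relaxed(counter, &old, old + 1));

        return true;
}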
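workqueue_init_early() now also creates system_percpu_wq (allocated exactly like the per-CPU "events" pool behind system_wq) and system_dfl_wq (allocated exactly like the unbound "events_unbound" pool behind system_unbound_wq). Judging only from the allocation flags in this hunk, they appear to be more explicitly named handles for the existing default pools; the sketch below shows how a caller would pick between them, with the work items being hypothetical.

#include <linux/workqueue.h>

static void my_local_fn(struct work_struct *work)    { /* ... */ }
static void my_anywhere_fn(struct work_struct *work) { /* ... */ }

static DECLARE_WORK(my_local_work, my_local_fn);
static DECLARE_WORK(my_anywhere_work, my_anywhere_fn);

static void my_queue_examples(void)
{
        /* Explicitly per-CPU behaviour, like the traditional system_wq. */
        queue_work(system_percpu_wq, &my_local_work);

        /* No locality requirement; the unbound default, like system_unbound_wq. */
        queue_work(system_dfl_wq, &my_anywhere_work);
}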
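Finally, the kernel/vhost_task.c hunk swaps ERR_PTR(PTR_ERR(tsk)) for ERR_CAST(tsk), the helper intended for forwarding an error pointer while changing its pointee type. A tiny sketch of the same idiom with made-up types; only the IS_ERR()/ERR_CAST() usage mirrors the hunk:

#include <linux/err.h>

struct gadget;
struct widget;

static struct widget *widget_from_gadget(struct gadget *(*make_gadget)(void))
{
        struct gadget *g = make_gadget();

        if (IS_ERR(g))
                return ERR_CAST(g);     /* forward the encoded errno under the new type */

        /* ... wrap g in a widget (details elided in this sketch) ... */
        return NULL;
}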