Diffstat (limited to 'kernel')
111 files changed, 5405 insertions, 2432 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 6fc72b3afbde..b69c95315480 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -13,7 +13,6 @@ obj-y = fork.o exec_domain.o panic.o \ async.o range.o smpboot.o ucount.o regset.o obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o -obj-$(CONFIG_MODULES) += kmod.o obj-$(CONFIG_MULTIUSER) += groups.o obj-$(CONFIG_VHOST_TASK) += vhost_task.o diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index a06e118a9be5..517b6a5928cc 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -173,11 +173,11 @@ void bpf_cgroup_atype_put(int cgroup_atype) { int i = cgroup_atype - CGROUP_LSM_START; - mutex_lock(&cgroup_mutex); + cgroup_lock(); if (--cgroup_lsm_atype[i].refcnt <= 0) cgroup_lsm_atype[i].attach_btf_id = 0; WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); } #else static enum cgroup_bpf_attach_type @@ -282,7 +282,7 @@ static void cgroup_bpf_release(struct work_struct *work) unsigned int atype; - mutex_lock(&cgroup_mutex); + cgroup_lock(); for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) { struct hlist_head *progs = &cgrp->bpf.progs[atype]; @@ -315,7 +315,7 @@ static void cgroup_bpf_release(struct work_struct *work) bpf_cgroup_storage_free(storage); } - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) cgroup_bpf_put(p); @@ -729,9 +729,9 @@ static int cgroup_bpf_attach(struct cgroup *cgrp, { int ret; - mutex_lock(&cgroup_mutex); + cgroup_lock(); ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); return ret; } @@ -831,7 +831,7 @@ static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog, cg_link = container_of(link, struct bpf_cgroup_link, link); - mutex_lock(&cgroup_mutex); + cgroup_lock(); /* link might have been auto-released by dying cgroup, so fail */ if (!cg_link->cgroup) { ret = -ENOLINK; @@ -843,7 +843,7 @@ static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog, } ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog); out_unlock: - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); return ret; } @@ -1009,9 +1009,9 @@ static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, { int ret; - mutex_lock(&cgroup_mutex); + cgroup_lock(); ret = __cgroup_bpf_detach(cgrp, prog, NULL, type); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); return ret; } @@ -1120,9 +1120,9 @@ static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, { int ret; - mutex_lock(&cgroup_mutex); + cgroup_lock(); ret = __cgroup_bpf_query(cgrp, attr, uattr); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); return ret; } @@ -1189,11 +1189,11 @@ static void bpf_cgroup_link_release(struct bpf_link *link) if (!cg_link->cgroup) return; - mutex_lock(&cgroup_mutex); + cgroup_lock(); /* re-check cgroup under lock again */ if (!cg_link->cgroup) { - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); return; } @@ -1205,7 +1205,7 @@ static void bpf_cgroup_link_release(struct bpf_link *link) cg = cg_link->cgroup; cg_link->cgroup = NULL; - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); cgroup_put(cg); } @@ -1232,10 +1232,10 @@ static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link, container_of(link, struct bpf_cgroup_link, link); u64 cg_id = 0; - mutex_lock(&cgroup_mutex); + cgroup_lock(); if (cg_link->cgroup) cg_id = cgroup_id(cg_link->cgroup); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); 
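The recurring change in the hunks above swaps direct mutex_lock(&cgroup_mutex)/mutex_unlock(&cgroup_mutex) calls for the cgroup_lock()/cgroup_unlock() helpers, so code outside the cgroup core no longer names the mutex itself. A minimal sketch of that accessor pattern, written as a userspace pthread analogue rather than the kernel implementation:

#include <pthread.h>

/* Subsystem-private lock; callers outside the subsystem never name it. */
static pthread_mutex_t cgroup_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Exported accessors: call sites depend only on these, so the lock's
 * type, lockdep annotations, or tracing can change in one place. */
void cgroup_lock(void)
{
	pthread_mutex_lock(&cgroup_mutex);
}

void cgroup_unlock(void)
{
	pthread_mutex_unlock(&cgroup_mutex);
}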
seq_printf(seq, "cgroup_id:\t%llu\n" @@ -1251,10 +1251,10 @@ static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, container_of(link, struct bpf_cgroup_link, link); u64 cg_id = 0; - mutex_lock(&cgroup_mutex); + cgroup_lock(); if (cg_link->cgroup) cg_id = cgroup_id(cg_link->cgroup); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); info->cgroup.cgroup_id = cg_id; info->cgroup.attach_type = cg_link->type; diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c index 06989d278846..810378f04fbc 100644 --- a/kernel/bpf/cgroup_iter.c +++ b/kernel/bpf/cgroup_iter.c @@ -58,7 +58,7 @@ static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos) { struct cgroup_iter_priv *p = seq->private; - mutex_lock(&cgroup_mutex); + cgroup_lock(); /* cgroup_iter doesn't support read across multiple sessions. */ if (*pos > 0) { @@ -89,7 +89,7 @@ static void cgroup_iter_seq_stop(struct seq_file *seq, void *v) { struct cgroup_iter_priv *p = seq->private; - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); /* pass NULL to the prog for post-processing */ if (!v) { diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 00c253b84bf5..9901efee4339 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1215,7 +1215,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value ret = htab_lock_bucket(htab, b, hash, &flags); if (ret) - return ret; + goto err_lock_bucket; l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1236,6 +1236,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value err: htab_unlock_bucket(htab, b, hash, flags); +err_lock_bucket: if (ret) htab_lru_push_free(htab, l_new); else if (l_old) @@ -1338,7 +1339,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, ret = htab_lock_bucket(htab, b, hash, &flags); if (ret) - return ret; + goto err_lock_bucket; l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1361,6 +1362,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, ret = 0; err: htab_unlock_bucket(htab, b, hash, flags); +err_lock_bucket: if (l_new) bpf_lru_push_free(&htab->lru, &l_new->lru_node); return ret; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 4c7bbec4a9e4..a04f505aefe9 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -333,14 +333,14 @@ static void cgroup_storage_map_free(struct bpf_map *_map) struct list_head *storages = &map->list; struct bpf_cgroup_storage *storage, *stmp; - mutex_lock(&cgroup_mutex); + cgroup_lock(); list_for_each_entry_safe(storage, stmp, storages, list_map) { bpf_cgroup_storage_unlink(storage); bpf_cgroup_storage_free(storage); } - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); WARN_ON(!RB_EMPTY_ROOT(&map->root)); WARN_ON(!list_empty(&map->list)); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index d9c9f45e3529..8a26cd8814c1 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -859,4 +859,4 @@ static int __init bpf_offload_init(void) return rhashtable_init(&offdevs, &offdevs_params); } -late_initcall(bpf_offload_init); +core_initcall(bpf_offload_init); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fbcf5a4e2fcd..5871aa78d01a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17033,7 +17033,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH, insn->dst_reg, shift); - insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, + 
insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, (1ULL << size * 8) - 1); } } diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 52bb5a74a23b..aeef06c465ef 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -58,7 +58,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) struct cgroup_root *root; int retval = 0; - mutex_lock(&cgroup_mutex); + cgroup_lock(); cgroup_attach_lock(true); for_each_root(root) { struct cgroup *from_cgrp; @@ -72,7 +72,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) break; } cgroup_attach_unlock(true); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); return retval; } @@ -106,7 +106,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) if (ret) return ret; - mutex_lock(&cgroup_mutex); + cgroup_lock(); percpu_down_write(&cgroup_threadgroup_rwsem); @@ -145,7 +145,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) out_err: cgroup_migrate_finish(&mgctx); percpu_up_write(&cgroup_threadgroup_rwsem); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); return ret; } @@ -847,13 +847,13 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent kernfs_break_active_protection(new_parent); kernfs_break_active_protection(kn); - mutex_lock(&cgroup_mutex); + cgroup_lock(); ret = kernfs_rename(kn, new_parent, new_name_str); if (!ret) TRACE_CGROUP_PATH(rename, cgrp); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); kernfs_unbreak_active_protection(kn); kernfs_unbreak_active_protection(new_parent); @@ -1119,7 +1119,7 @@ int cgroup1_reconfigure(struct fs_context *fc) trace_cgroup_remount(root); out_unlock: - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); return ret; } @@ -1246,7 +1246,7 @@ int cgroup1_get_tree(struct fs_context *fc) if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt)) ret = 1; /* restart */ - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); if (!ret) ret = cgroup_do_get_tree(fc); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index ae518166d70a..625d7483951c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1391,7 +1391,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_favor_dynmods(root, false); cgroup_exit_root_id(root); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); cgroup_rstat_exit(cgrp); kernfs_destroy_root(root->kf_root); @@ -1635,7 +1635,7 @@ void cgroup_kn_unlock(struct kernfs_node *kn) else cgrp = kn->parent->priv; - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); kernfs_unbreak_active_protection(kn); cgroup_put(cgrp); @@ -1680,7 +1680,7 @@ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline) if (drain_offline) cgroup_lock_and_drain_offline(cgrp); else - mutex_lock(&cgroup_mutex); + cgroup_lock(); if (!cgroup_is_dead(cgrp)) return cgrp; @@ -2177,13 +2177,13 @@ int cgroup_do_get_tree(struct fs_context *fc) struct super_block *sb = fc->root->d_sb; struct cgroup *cgrp; - mutex_lock(&cgroup_mutex); + cgroup_lock(); spin_lock_irq(&css_set_lock); cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root); spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); nsdentry = kernfs_node_dentry(cgrp->kn, sb); dput(fc->root); @@ -2366,13 +2366,13 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, { int ret; - mutex_lock(&cgroup_mutex); + cgroup_lock(); spin_lock_irq(&css_set_lock); ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); 
spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); return ret; } @@ -2398,7 +2398,7 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) int hierarchy_id = 1; int ret; - mutex_lock(&cgroup_mutex); + cgroup_lock(); spin_lock_irq(&css_set_lock); root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); @@ -2412,7 +2412,7 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) } spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); return ret; } EXPORT_SYMBOL_GPL(task_cgroup_path); @@ -3121,7 +3121,7 @@ void cgroup_lock_and_drain_offline(struct cgroup *cgrp) int ssid; restart: - mutex_lock(&cgroup_mutex); + cgroup_lock(); cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { @@ -3135,7 +3135,7 @@ restart: prepare_to_wait(&dsct->offline_waitq, &wait, TASK_UNINTERRUPTIBLE); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); schedule(); finish_wait(&dsct->offline_waitq, &wait); @@ -3771,7 +3771,7 @@ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, } psi = cgroup_psi(cgrp); - new = psi_trigger_create(psi, buf, res); + new = psi_trigger_create(psi, buf, res, of->file); if (IS_ERR(new)) { cgroup_put(cgrp); return PTR_ERR(new); @@ -4384,9 +4384,9 @@ int cgroup_rm_cftypes(struct cftype *cfts) if (!(cfts[0].flags & __CFTYPE_ADDED)) return -ENOENT; - mutex_lock(&cgroup_mutex); + cgroup_lock(); ret = cgroup_rm_cftypes_locked(cfts); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); return ret; } @@ -4418,14 +4418,14 @@ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) if (ret) return ret; - mutex_lock(&cgroup_mutex); + cgroup_lock(); list_add_tail(&cfts->node, &ss->cfts); ret = cgroup_apply_cftypes(cfts, true); if (ret) cgroup_rm_cftypes_locked(cfts); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); return ret; } @@ -5395,7 +5395,7 @@ static void css_release_work_fn(struct work_struct *work) struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; - mutex_lock(&cgroup_mutex); + cgroup_lock(); css->flags |= CSS_RELEASED; list_del_rcu(&css->sibling); @@ -5436,7 +5436,7 @@ static void css_release_work_fn(struct work_struct *work) NULL); } - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); @@ -5784,7 +5784,7 @@ static void css_killed_work_fn(struct work_struct *work) struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); - mutex_lock(&cgroup_mutex); + cgroup_lock(); do { offline_css(css); @@ -5793,7 +5793,7 @@ static void css_killed_work_fn(struct work_struct *work) css = css->parent; } while (css && atomic_dec_and_test(&css->online_cnt)); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); } /* css kill confirmation processing requires process context, bounce */ @@ -5977,7 +5977,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) pr_debug("Initializing cgroup subsys %s\n", ss->name); - mutex_lock(&cgroup_mutex); + cgroup_lock(); idr_init(&ss->css_idr); INIT_LIST_HEAD(&ss->cfts); @@ -6021,7 +6021,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) BUG_ON(online_css(css)); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); } /** @@ -6081,7 +6081,7 @@ int __init cgroup_init(void) get_user_ns(init_cgroup_ns.user_ns); - mutex_lock(&cgroup_mutex); + cgroup_lock(); /* * Add init_css_set to the hash table so that 
dfl_root can link to @@ -6092,7 +6092,7 @@ int __init cgroup_init(void) BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); for_each_subsys(ss, ssid) { if (ss->early_init) { @@ -6144,9 +6144,9 @@ int __init cgroup_init(void) if (ss->bind) ss->bind(init_css_set.subsys[ssid]); - mutex_lock(&cgroup_mutex); + cgroup_lock(); css_populate_dir(init_css_set.subsys[ssid]); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); } /* init_css_set.subsys[] has been updated, re-hash */ @@ -6251,7 +6251,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, if (!buf) goto out; - mutex_lock(&cgroup_mutex); + cgroup_lock(); spin_lock_irq(&css_set_lock); for_each_root(root) { @@ -6306,7 +6306,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, retval = 0; out_unlock: spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); kfree(buf); out: return retval; @@ -6390,7 +6390,7 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) struct file *f; if (kargs->flags & CLONE_INTO_CGROUP) - mutex_lock(&cgroup_mutex); + cgroup_lock(); cgroup_threadgroup_change_begin(current); @@ -6465,7 +6465,7 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) err: cgroup_threadgroup_change_end(current); - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); if (f) fput(f); if (dst_cgrp) @@ -6492,7 +6492,7 @@ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) struct cgroup *cgrp = kargs->cgrp; struct css_set *cset = kargs->cset; - mutex_unlock(&cgroup_mutex); + cgroup_unlock(); if (cset) { put_css_set(cset); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 505d86b16642..e4ca2dd2b764 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1209,7 +1209,9 @@ void rebuild_sched_domains(void) * * Iterate through each task of @cs updating its cpus_allowed to the * effective cpuset's. As this function is called with cpuset_rwsem held, - * cpuset membership stays stable. + * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask() + * is used instead of effective_cpus to make sure all offline CPUs are also + * included as hotplug code won't update cpumasks for tasks in top_cpuset. 
*/ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) { @@ -1219,15 +1221,18 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) { - /* - * Percpu kthreads in top_cpuset are ignored - */ - if (top_cs && (task->flags & PF_KTHREAD) && - kthread_is_per_cpu(task)) - continue; + const struct cpumask *possible_mask = task_cpu_possible_mask(task); - cpumask_and(new_cpus, cs->effective_cpus, - task_cpu_possible_mask(task)); + if (top_cs) { + /* + * Percpu kthreads in top_cpuset are ignored + */ + if ((task->flags & PF_KTHREAD) && kthread_is_per_cpu(task)) + continue; + cpumask_andnot(new_cpus, possible_mask, cs->subparts_cpus); + } else { + cpumask_and(new_cpus, possible_mask, cs->effective_cpus); + } set_cpus_allowed_ptr(task, new_cpus); } css_task_iter_end(&it); @@ -3618,6 +3623,8 @@ retry: update_tasks: cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); mems_updated = !nodes_equal(new_mems, cs->effective_mems); + if (!cpus_updated && !mems_updated) + goto unlock; /* Hotplug doesn't affect this cpuset */ if (mems_updated) check_insane_mems_config(&new_mems); @@ -3629,6 +3636,7 @@ update_tasks: hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); +unlock: percpu_up_write(&cpuset_rwsem); } @@ -3941,7 +3949,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) } /* - * __cpuset_node_allowed - Can we allocate on a memory node? + * cpuset_node_allowed - Can we allocate on a memory node? * @node: is this an allowed node? * @gfp_mask: memory allocation flags * @@ -3980,7 +3988,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ -bool __cpuset_node_allowed(int node, gfp_t gfp_mask) +bool cpuset_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ bool allowed; /* is allocation in zone z allowed? */ diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 0a2b4967e333..9c4c55228567 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -241,12 +241,12 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp) } /** - * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush() + * cgroup_rstat_flush_atomic- atomic version of cgroup_rstat_flush() * @cgrp: target cgroup * * This function can be called from any context. 
*/ -void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp) +void cgroup_rstat_flush_atomic(struct cgroup *cgrp) { unsigned long flags; diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config deleted file mode 100644 index 44b0f0146a3f..000000000000 --- a/kernel/configs/android-base.config +++ /dev/null @@ -1,159 +0,0 @@ -# KEEP ALPHABETICALLY SORTED -# CONFIG_DEVMEM is not set -# CONFIG_FHANDLE is not set -# CONFIG_INET_LRO is not set -# CONFIG_NFSD is not set -# CONFIG_NFS_FS is not set -# CONFIG_OABI_COMPAT is not set -# CONFIG_SYSVIPC is not set -# CONFIG_USELIB is not set -CONFIG_ANDROID_BINDER_IPC=y -CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder -CONFIG_ANDROID_LOW_MEMORY_KILLER=y -CONFIG_ARMV8_DEPRECATED=y -CONFIG_ASHMEM=y -CONFIG_AUDIT=y -CONFIG_BLK_DEV_INITRD=y -CONFIG_CGROUPS=y -CONFIG_CGROUP_BPF=y -CONFIG_CGROUP_CPUACCT=y -CONFIG_CGROUP_DEBUG=y -CONFIG_CGROUP_FREEZER=y -CONFIG_CGROUP_SCHED=y -CONFIG_CP15_BARRIER_EMULATION=y -CONFIG_DEFAULT_SECURITY_SELINUX=y -CONFIG_EMBEDDED=y -CONFIG_FB=y -CONFIG_HARDENED_USERCOPY=y -CONFIG_HIGH_RES_TIMERS=y -CONFIG_IKCONFIG=y -CONFIG_IKCONFIG_PROC=y -CONFIG_INET6_AH=y -CONFIG_INET6_ESP=y -CONFIG_INET6_IPCOMP=y -CONFIG_INET=y -CONFIG_INET_DIAG_DESTROY=y -CONFIG_INET_ESP=y -CONFIG_INET_XFRM_MODE_TUNNEL=y -CONFIG_IP6_NF_FILTER=y -CONFIG_IP6_NF_IPTABLES=y -CONFIG_IP6_NF_MANGLE=y -CONFIG_IP6_NF_RAW=y -CONFIG_IP6_NF_TARGET_REJECT=y -CONFIG_IPV6=y -CONFIG_IPV6_MIP6=y -CONFIG_IPV6_MULTIPLE_TABLES=y -CONFIG_IPV6_OPTIMISTIC_DAD=y -CONFIG_IPV6_ROUTER_PREF=y -CONFIG_IPV6_ROUTE_INFO=y -CONFIG_IP_ADVANCED_ROUTER=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_NF_ARPFILTER=y -CONFIG_IP_NF_ARPTABLES=y -CONFIG_IP_NF_ARP_MANGLE=y -CONFIG_IP_NF_FILTER=y -CONFIG_IP_NF_IPTABLES=y -CONFIG_IP_NF_MANGLE=y -CONFIG_IP_NF_MATCH_AH=y -CONFIG_IP_NF_MATCH_ECN=y -CONFIG_IP_NF_MATCH_TTL=y -CONFIG_IP_NF_NAT=y -CONFIG_IP_NF_RAW=y -CONFIG_IP_NF_SECURITY=y -CONFIG_IP_NF_TARGET_MASQUERADE=y -CONFIG_IP_NF_TARGET_NETMAP=y -CONFIG_IP_NF_TARGET_REDIRECT=y -CONFIG_IP_NF_TARGET_REJECT=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODVERSIONS=y -CONFIG_NET=y -CONFIG_NETDEVICES=y -CONFIG_NETFILTER=y -CONFIG_NETFILTER_TPROXY=y -CONFIG_NETFILTER_XT_MATCH_COMMENT=y -CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y -CONFIG_NETFILTER_XT_MATCH_CONNMARK=y -CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y -CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y -CONFIG_NETFILTER_XT_MATCH_HELPER=y -CONFIG_NETFILTER_XT_MATCH_IPRANGE=y -CONFIG_NETFILTER_XT_MATCH_LENGTH=y -CONFIG_NETFILTER_XT_MATCH_LIMIT=y -CONFIG_NETFILTER_XT_MATCH_MAC=y -CONFIG_NETFILTER_XT_MATCH_MARK=y -CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y -CONFIG_NETFILTER_XT_MATCH_POLICY=y -CONFIG_NETFILTER_XT_MATCH_QUOTA=y -CONFIG_NETFILTER_XT_MATCH_SOCKET=y -CONFIG_NETFILTER_XT_MATCH_STATE=y -CONFIG_NETFILTER_XT_MATCH_STATISTIC=y -CONFIG_NETFILTER_XT_MATCH_STRING=y -CONFIG_NETFILTER_XT_MATCH_TIME=y -CONFIG_NETFILTER_XT_MATCH_U32=y -CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y -CONFIG_NETFILTER_XT_TARGET_CONNMARK=y -CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y -CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y -CONFIG_NETFILTER_XT_TARGET_MARK=y -CONFIG_NETFILTER_XT_TARGET_NFLOG=y -CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y -CONFIG_NETFILTER_XT_TARGET_SECMARK=y -CONFIG_NETFILTER_XT_TARGET_TCPMSS=y -CONFIG_NETFILTER_XT_TARGET_TPROXY=y -CONFIG_NETFILTER_XT_TARGET_TRACE=y -CONFIG_NET_CLS_ACT=y -CONFIG_NET_CLS_U32=y -CONFIG_NET_EMATCH=y -CONFIG_NET_EMATCH_U32=y -CONFIG_NET_KEY=y -CONFIG_NET_SCHED=y -CONFIG_NET_SCH_HTB=y -CONFIG_NF_CONNTRACK=y 
-CONFIG_NF_CONNTRACK_AMANDA=y -CONFIG_NF_CONNTRACK_EVENTS=y -CONFIG_NF_CONNTRACK_FTP=y -CONFIG_NF_CONNTRACK_H323=y -CONFIG_NF_CONNTRACK_IPV4=y -CONFIG_NF_CONNTRACK_IPV6=y -CONFIG_NF_CONNTRACK_IRC=y -CONFIG_NF_CONNTRACK_NETBIOS_NS=y -CONFIG_NF_CONNTRACK_PPTP=y -CONFIG_NF_CONNTRACK_SANE=y -CONFIG_NF_CONNTRACK_SECMARK=y -CONFIG_NF_CONNTRACK_TFTP=y -CONFIG_NF_CT_NETLINK=y -CONFIG_NF_CT_PROTO_DCCP=y -CONFIG_NF_CT_PROTO_SCTP=y -CONFIG_NF_CT_PROTO_UDPLITE=y -CONFIG_NF_NAT=y -CONFIG_NO_HZ=y -CONFIG_PACKET=y -CONFIG_PM_AUTOSLEEP=y -CONFIG_PM_WAKELOCKS=y -CONFIG_PPP=y -CONFIG_PPP_BSDCOMP=y -CONFIG_PPP_DEFLATE=y -CONFIG_PPP_MPPE=y -CONFIG_PREEMPT=y -CONFIG_QUOTA=y -CONFIG_RANDOMIZE_BASE=y -CONFIG_RTC_CLASS=y -CONFIG_RT_GROUP_SCHED=y -CONFIG_SECCOMP=y -CONFIG_SECURITY=y -CONFIG_SECURITY_NETWORK=y -CONFIG_SECURITY_SELINUX=y -CONFIG_SETEND_EMULATION=y -CONFIG_STAGING=y -CONFIG_SWP_EMULATION=y -CONFIG_SYNC=y -CONFIG_TUN=y -CONFIG_UNIX=y -CONFIG_USB_GADGET=y -CONFIG_USB_CONFIGFS=y -CONFIG_USB_CONFIGFS_F_FS=y -CONFIG_USB_CONFIGFS_F_MIDI=y -CONFIG_USB_OTG_WAKELOCK=y -CONFIG_XFRM_USER=y diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config deleted file mode 100644 index e400fbbc8aba..000000000000 --- a/kernel/configs/android-recommended.config +++ /dev/null @@ -1,127 +0,0 @@ -# KEEP ALPHABETICALLY SORTED -# CONFIG_BPF_UNPRIV_DEFAULT_OFF is not set -# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set -# CONFIG_INPUT_MOUSE is not set -# CONFIG_LEGACY_PTYS is not set -# CONFIG_NF_CONNTRACK_SIP is not set -# CONFIG_PM_WAKELOCKS_GC is not set -# CONFIG_VT is not set -CONFIG_ARM64_SW_TTBR0_PAN=y -CONFIG_BACKLIGHT_LCD_SUPPORT=y -CONFIG_BLK_DEV_DM=y -CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_SIZE=8192 -CONFIG_STACKPROTECTOR_STRONG=y -CONFIG_COMPACTION=y -CONFIG_CPU_SW_DOMAIN_PAN=y -CONFIG_DM_CRYPT=y -CONFIG_DM_UEVENT=y -CONFIG_DM_VERITY=y -CONFIG_DM_VERITY_FEC=y -CONFIG_DRAGONRISE_FF=y -CONFIG_ENABLE_DEFAULT_TRACERS=y -CONFIG_EXT4_FS=y -CONFIG_EXT4_FS_SECURITY=y -CONFIG_FUSE_FS=y -CONFIG_GREENASIA_FF=y -CONFIG_HIDRAW=y -CONFIG_HID_A4TECH=y -CONFIG_HID_ACRUX=y -CONFIG_HID_ACRUX_FF=y -CONFIG_HID_APPLE=y -CONFIG_HID_BELKIN=y -CONFIG_HID_CHERRY=y -CONFIG_HID_CHICONY=y -CONFIG_HID_CYPRESS=y -CONFIG_HID_DRAGONRISE=y -CONFIG_HID_ELECOM=y -CONFIG_HID_EMS_FF=y -CONFIG_HID_EZKEY=y -CONFIG_HID_GREENASIA=y -CONFIG_HID_GYRATION=y -CONFIG_HID_HOLTEK=y -CONFIG_HID_KENSINGTON=y -CONFIG_HID_KEYTOUCH=y -CONFIG_HID_KYE=y -CONFIG_HID_LCPOWER=y -CONFIG_HID_LOGITECH=y -CONFIG_HID_LOGITECH_DJ=y -CONFIG_HID_MAGICMOUSE=y -CONFIG_HID_MICROSOFT=y -CONFIG_HID_MONTEREY=y -CONFIG_HID_MULTITOUCH=y -CONFIG_HID_NTRIG=y -CONFIG_HID_ORTEK=y -CONFIG_HID_PANTHERLORD=y -CONFIG_HID_PETALYNX=y -CONFIG_HID_PICOLCD=y -CONFIG_HID_PRIMAX=y -CONFIG_HID_PRODIKEYS=y -CONFIG_HID_ROCCAT=y -CONFIG_HID_SAITEK=y -CONFIG_HID_SAMSUNG=y -CONFIG_HID_SMARTJOYPLUS=y -CONFIG_HID_SONY=y -CONFIG_HID_SPEEDLINK=y -CONFIG_HID_SUNPLUS=y -CONFIG_HID_THRUSTMASTER=y -CONFIG_HID_TIVO=y -CONFIG_HID_TOPSEED=y -CONFIG_HID_TWINHAN=y -CONFIG_HID_UCLOGIC=y -CONFIG_HID_WACOM=y -CONFIG_HID_WALTOP=y -CONFIG_HID_WIIMOTE=y -CONFIG_HID_ZEROPLUS=y -CONFIG_HID_ZYDACRON=y -CONFIG_INPUT_EVDEV=y -CONFIG_INPUT_GPIO=y -CONFIG_INPUT_JOYSTICK=y -CONFIG_INPUT_MISC=y -CONFIG_INPUT_TABLET=y -CONFIG_INPUT_UINPUT=y -CONFIG_JOYSTICK_XPAD=y -CONFIG_JOYSTICK_XPAD_FF=y -CONFIG_JOYSTICK_XPAD_LEDS=y -CONFIG_KALLSYMS_ALL=y -CONFIG_KSM=y -CONFIG_LOGIG940_FF=y -CONFIG_LOGIRUMBLEPAD2_FF=y -CONFIG_LOGITECH_FF=y -CONFIG_MD=y -CONFIG_MEDIA_SUPPORT=y 
-CONFIG_MSDOS_FS=y -CONFIG_PANIC_TIMEOUT=5 -CONFIG_PANTHERLORD_FF=y -CONFIG_PERF_EVENTS=y -CONFIG_PM_DEBUG=y -CONFIG_PM_RUNTIME=y -CONFIG_PM_WAKELOCKS_LIMIT=0 -CONFIG_POWER_SUPPLY=y -CONFIG_PSTORE=y -CONFIG_PSTORE_CONSOLE=y -CONFIG_PSTORE_RAM=y -CONFIG_SCHEDSTATS=y -CONFIG_SMARTJOYPLUS_FF=y -CONFIG_SND=y -CONFIG_SOUND=y -CONFIG_STRICT_KERNEL_RWX=y -CONFIG_SUSPEND_TIME=y -CONFIG_TABLET_USB_ACECAD=y -CONFIG_TABLET_USB_AIPTEK=y -CONFIG_TABLET_USB_HANWANG=y -CONFIG_TABLET_USB_KBTAB=y -CONFIG_TASKSTATS=y -CONFIG_TASK_DELAY_ACCT=y -CONFIG_TASK_IO_ACCOUNTING=y -CONFIG_TASK_XACCT=y -CONFIG_TIMER_STATS=y -CONFIG_TMPFS=y -CONFIG_TMPFS_POSIX_ACL=y -CONFIG_UHID=y -CONFIG_USB_ANNOUNCE_NEW_DEVICES=y -CONFIG_USB_EHCI_HCD=y -CONFIG_USB_HIDDEV=y -CONFIG_USB_USBNET=y -CONFIG_VFAT_FS=y diff --git a/kernel/cpu.c b/kernel/cpu.c index 6c0a92ca6bb5..f4a2c5845bcb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -623,7 +623,7 @@ static int finish_cpu(unsigned int cpu) */ if (mm != &init_mm) idle->active_mm = &init_mm; - mmdrop(mm); + mmdrop_lazy_tlb(mm); return 0; } @@ -2569,22 +2569,33 @@ static const struct attribute_group cpuhp_smt_attr_group = { static int __init cpu_smt_sysfs_init(void) { - return sysfs_create_group(&cpu_subsys.dev_root->kobj, - &cpuhp_smt_attr_group); + struct device *dev_root; + int ret = -ENODEV; + + dev_root = bus_get_dev_root(&cpu_subsys); + if (dev_root) { + ret = sysfs_create_group(&dev_root->kobj, &cpuhp_smt_attr_group); + put_device(dev_root); + } + return ret; } static int __init cpuhp_sysfs_init(void) { + struct device *dev_root; int cpu, ret; ret = cpu_smt_sysfs_init(); if (ret) return ret; - ret = sysfs_create_group(&cpu_subsys.dev_root->kobj, - &cpuhp_cpu_root_attr_group); - if (ret) - return ret; + dev_root = bus_get_dev_root(&cpu_subsys); + if (dev_root) { + ret = sysfs_create_group(&dev_root->kobj, &cpuhp_cpu_root_attr_group); + put_device(dev_root); + if (ret) + return ret; + } for_each_possible_cpu(cpu) { struct device *dev = get_cpu_device(cpu); diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 755f5f08ab38..90ce1dfd591c 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -474,7 +474,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(list_head, prev); VMCOREINFO_OFFSET(vmap_area, va_start); VMCOREINFO_OFFSET(vmap_area, list); - VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); + VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER + 1); log_buf_vmcoreinfo_setup(); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); VMCOREINFO_NUMBER(NR_FREE_PAGES); diff --git a/kernel/delayacct.c b/kernel/delayacct.c index e39cb696cfbd..6f0c358e73d8 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -179,12 +179,15 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp; tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay; d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 0 : tmp; + tmp = d->irq_delay_total + tsk->delays->irq_delay; + d->irq_delay_total = (tmp < d->irq_delay_total) ? 
0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; d->thrashing_count += tsk->delays->thrashing_count; d->compact_count += tsk->delays->compact_count; d->wpcopy_count += tsk->delays->wpcopy_count; + d->irq_count += tsk->delays->irq_count; raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; @@ -274,3 +277,14 @@ void __delayacct_wpcopy_end(void) ¤t->delays->wpcopy_delay, ¤t->delays->wpcopy_count); } + +void __delayacct_irq(struct task_struct *task, u32 delta) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&task->delays->lock, flags); + task->delays->irq_delay += delta; + task->delays->irq_count++; + raw_spin_unlock_irqrestore(&task->delays->lock, flags); +} + diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 56866aaa2ae1..6677d0e64d27 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -76,6 +76,13 @@ config ARCH_HAS_DMA_PREP_COHERENT config ARCH_HAS_FORCE_DMA_UNENCRYPTED bool +# +# Select this option if the architecture assumes DMA devices are coherent +# by default. +# +config ARCH_DMA_DEFAULT_COHERENT + bool + config SWIOTLB bool select NEED_DMA_MAP_STATE diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 18c93c2276ca..f190651bcadd 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -53,6 +53,7 @@ enum map_err_types { * struct dma_debug_entry - track a dma_map* or dma_alloc_coherent mapping * @list: node on pre-allocated free_entries list * @dev: 'dev' argument to dma_map_{page|single|sg} or dma_alloc_coherent + * @dev_addr: dma address * @size: length of the mapping * @type: single, page, sg, coherent * @direction: enum dma_data_direction @@ -396,37 +397,6 @@ static unsigned long long phys_addr(struct dma_debug_entry *entry) } /* - * Dump mapping entries for debugging purposes - */ -void debug_dma_dump_mappings(struct device *dev) -{ - int idx; - - for (idx = 0; idx < HASH_SIZE; idx++) { - struct hash_bucket *bucket = &dma_entry_hash[idx]; - struct dma_debug_entry *entry; - unsigned long flags; - - spin_lock_irqsave(&bucket->lock, flags); - - list_for_each_entry(entry, &bucket->list, list) { - if (!dev || dev == entry->dev) { - dev_info(entry->dev, - "%s idx %d P=%Lx N=%lx D=%Lx L=%Lx %s %s\n", - type2name[entry->type], idx, - phys_addr(entry), entry->pfn, - entry->dev_addr, entry->size, - dir2name[entry->direction], - maperr2str[entry->map_err_type]); - } - } - - spin_unlock_irqrestore(&bucket->lock, flags); - cond_resched(); - } -} - -/* * For each mapping (initial cacheline in the case of * dma_alloc_coherent/dma_map_page, initial cacheline in each page of a * scatterlist, or the cacheline specified in dma_map_single) insert @@ -547,6 +517,70 @@ static void active_cacheline_remove(struct dma_debug_entry *entry) } /* + * Dump mappings entries on kernel space for debugging purposes + */ +void debug_dma_dump_mappings(struct device *dev) +{ + int idx; + phys_addr_t cln; + + for (idx = 0; idx < HASH_SIZE; idx++) { + struct hash_bucket *bucket = &dma_entry_hash[idx]; + struct dma_debug_entry *entry; + unsigned long flags; + + spin_lock_irqsave(&bucket->lock, flags); + list_for_each_entry(entry, &bucket->list, list) { + if (!dev || dev == entry->dev) { + cln = to_cacheline_number(entry); + dev_info(entry->dev, + "%s idx %d P=%llx N=%lx D=%llx L=%llx cln=%pa %s %s\n", + type2name[entry->type], idx, + phys_addr(entry), entry->pfn, + entry->dev_addr, entry->size, + &cln, dir2name[entry->direction], + maperr2str[entry->map_err_type]); + } + } + 
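The delayacct changes above guard each 64-bit running total against wrap-around: the new per-task delta is added into a temporary, and the total is reset to zero if the unsigned sum wrapped. A standalone sketch of that guard in plain C, with a hypothetical helper name:

#include <stdint.h>

/* Mirrors the d->*_delay_total updates: if total + delta wraps past
 * UINT64_MAX, restart at zero instead of keeping a corrupted value. */
static uint64_t accumulate_delay(uint64_t total, uint64_t delta)
{
	uint64_t tmp = total + delta;

	return (tmp < total) ? 0 : tmp;
}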
spin_unlock_irqrestore(&bucket->lock, flags); + + cond_resched(); + } +} + +/* + * Dump mappings entries on user space via debugfs + */ +static int dump_show(struct seq_file *seq, void *v) +{ + int idx; + phys_addr_t cln; + + for (idx = 0; idx < HASH_SIZE; idx++) { + struct hash_bucket *bucket = &dma_entry_hash[idx]; + struct dma_debug_entry *entry; + unsigned long flags; + + spin_lock_irqsave(&bucket->lock, flags); + list_for_each_entry(entry, &bucket->list, list) { + cln = to_cacheline_number(entry); + seq_printf(seq, + "%s %s %s idx %d P=%llx N=%lx D=%llx L=%llx cln=%pa %s %s\n", + dev_driver_string(entry->dev), + dev_name(entry->dev), + type2name[entry->type], idx, + phys_addr(entry), entry->pfn, + entry->dev_addr, entry->size, + &cln, dir2name[entry->direction], + maperr2str[entry->map_err_type]); + } + spin_unlock_irqrestore(&bucket->lock, flags); + } + return 0; +} +DEFINE_SHOW_ATTRIBUTE(dump); + +/* * Wrapper function for adding an entry to the hash. * This function takes care of locking itself. */ @@ -764,33 +798,6 @@ static const struct file_operations filter_fops = { .llseek = default_llseek, }; -static int dump_show(struct seq_file *seq, void *v) -{ - int idx; - - for (idx = 0; idx < HASH_SIZE; idx++) { - struct hash_bucket *bucket = &dma_entry_hash[idx]; - struct dma_debug_entry *entry; - unsigned long flags; - - spin_lock_irqsave(&bucket->lock, flags); - list_for_each_entry(entry, &bucket->list, list) { - seq_printf(seq, - "%s %s %s idx %d P=%llx N=%lx D=%llx L=%llx %s %s\n", - dev_name(entry->dev), - dev_driver_string(entry->dev), - type2name[entry->type], idx, - phys_addr(entry), entry->pfn, - entry->dev_addr, entry->size, - dir2name[entry->direction], - maperr2str[entry->map_err_type]); - } - spin_unlock_irqrestore(&bucket->lock, flags); - } - return 0; -} -DEFINE_SHOW_ATTRIBUTE(dump); - static int __init dma_debug_fs_init(void) { struct dentry *dentry = debugfs_create_dir("dma-api", NULL); @@ -1262,13 +1269,13 @@ void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) } EXPORT_SYMBOL(debug_dma_mapping_error); -void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, +void debug_dma_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, int direction) { struct dma_debug_entry ref = { .type = dma_debug_single, .dev = dev, - .dev_addr = addr, + .dev_addr = dma_addr, .size = size, .direction = direction, }; @@ -1403,13 +1410,13 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size, } void debug_dma_free_coherent(struct device *dev, size_t size, - void *virt, dma_addr_t addr) + void *virt, dma_addr_t dma_addr) { struct dma_debug_entry ref = { .type = dma_debug_coherent, .dev = dev, .offset = offset_in_page(virt), - .dev_addr = addr, + .dev_addr = dma_addr, .size = size, .direction = DMA_BIDIRECTIONAL, }; diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 63859a101ed8..5595d1d5cdcc 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -44,10 +44,11 @@ u64 dma_direct_get_required_mask(struct device *dev) return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } -static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, - u64 *phys_limit) +static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit) { - u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit); + u64 dma_limit = min_not_zero( + dev->coherent_dma_mask, + dev->bus_dma_limit); /* * Optimistically try the zone that the physical address mask falls @@ -126,8 +127,7 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, 
size_t size, if (is_swiotlb_for_alloc(dev)) return dma_direct_alloc_swiotlb(dev, size); - gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, - &phys_limit); + gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit); page = dma_alloc_contiguous(dev, size, gfp); if (page) { if (!dma_coherent_ok(dev, page_to_phys(page), size) || @@ -172,14 +172,13 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { struct page *page; - u64 phys_mask; + u64 phys_limit; void *ret; if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))) return NULL; - gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, - &phys_mask); + gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit); page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok); if (!page) return NULL; diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index 0520a8f4fb1d..02205ab53b7e 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -356,4 +356,3 @@ module_exit(map_benchmark_cleanup); MODULE_AUTHOR("Barry Song <song.bao.hua@hisilicon.com>"); MODULE_DESCRIPTION("dma_map benchmark driver"); -MODULE_LICENSE("GPL"); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 68106e3791f6..9a4db5cce600 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -17,7 +17,11 @@ #include "debug.h" #include "direct.h" -bool dma_default_coherent; +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) +bool dma_default_coherent = IS_ENABLED(CONFIG_ARCH_DMA_DEFAULT_COHERENT); +#endif /* * Managed DMA API diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 4d40dcce7604..1acec2e22827 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -84,8 +84,8 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, void *addr; int ret = -ENOMEM; - /* Cannot allocate larger than MAX_ORDER-1 */ - order = min(get_order(pool_size), MAX_ORDER-1); + /* Cannot allocate larger than MAX_ORDER */ + order = min(get_order(pool_size), MAX_ORDER); do { pool_size = 1 << (PAGE_SHIFT + order); @@ -190,7 +190,7 @@ static int __init dma_atomic_pool_init(void) /* * If coherent_pool was not used on the command line, default the pool - * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER-1. + * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER. */ if (!atomic_pool_size) { unsigned long pages = totalram_pages() / (SZ_1G / SZ_128K); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index dac42a2ad588..af2e304c672c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -73,8 +73,6 @@ static bool swiotlb_force_disable; struct io_tlb_mem io_tlb_default_mem; -phys_addr_t swiotlb_unencrypted_base; - static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT; static unsigned long default_nareas; @@ -202,34 +200,6 @@ static inline unsigned long nr_slots(u64 val) } /* - * Remap swioltb memory in the unencrypted physical address space - * when swiotlb_unencrypted_base is set. (e.g. for Hyper-V AMD SEV-SNP - * Isolation VMs). 
- */ -#ifdef CONFIG_HAS_IOMEM -static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes) -{ - void *vaddr = NULL; - - if (swiotlb_unencrypted_base) { - phys_addr_t paddr = mem->start + swiotlb_unencrypted_base; - - vaddr = memremap(paddr, bytes, MEMREMAP_WB); - if (!vaddr) - pr_err("Failed to map the unencrypted memory %pa size %lx.\n", - &paddr, bytes); - } - - return vaddr; -} -#else -static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes) -{ - return NULL; -} -#endif - -/* * Early SWIOTLB allocation may be too early to allow an architecture to * perform the desired operations. This function allows the architecture to * call SWIOTLB when the operations are possible. It needs to be called @@ -238,18 +208,12 @@ static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes) void __init swiotlb_update_mem_attributes(void) { struct io_tlb_mem *mem = &io_tlb_default_mem; - void *vaddr; unsigned long bytes; if (!mem->nslabs || mem->late_alloc) return; - vaddr = phys_to_virt(mem->start); bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT); - set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); - - mem->vaddr = swiotlb_mem_remap(mem, bytes); - if (!mem->vaddr) - mem->vaddr = vaddr; + set_memory_decrypted((unsigned long)mem->vaddr, bytes >> PAGE_SHIFT); } static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, @@ -280,13 +244,6 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, mem->slots[i].alloc_size = 0; } - /* - * If swiotlb_unencrypted_base is set, the bounce buffer memory will - * be remapped and cleared in swiotlb_update_mem_attributes. - */ - if (swiotlb_unencrypted_base) - return; - memset(vaddr, 0, bytes); mem->vaddr = vaddr; return; @@ -609,6 +566,40 @@ static unsigned int wrap_area_index(struct io_tlb_mem *mem, unsigned int index) } /* + * Track the total used slots with a global atomic value in order to have + * correct information to determine the high water mark. The mem_used() + * function gives imprecise results because there's no locking across + * multiple areas. + */ +#ifdef CONFIG_DEBUG_FS +static void inc_used_and_hiwater(struct io_tlb_mem *mem, unsigned int nslots) +{ + unsigned long old_hiwater, new_used; + + new_used = atomic_long_add_return(nslots, &mem->total_used); + old_hiwater = atomic_long_read(&mem->used_hiwater); + do { + if (new_used <= old_hiwater) + break; + } while (!atomic_long_try_cmpxchg(&mem->used_hiwater, + &old_hiwater, new_used)); +} + +static void dec_used(struct io_tlb_mem *mem, unsigned int nslots) +{ + atomic_long_sub(nslots, &mem->total_used); +} + +#else /* !CONFIG_DEBUG_FS */ +static void inc_used_and_hiwater(struct io_tlb_mem *mem, unsigned int nslots) +{ +} +static void dec_used(struct io_tlb_mem *mem, unsigned int nslots) +{ +} +#endif /* CONFIG_DEBUG_FS */ + +/* * Find a suitable number of IO TLB entries size that will fit this request and * allocate a buffer from that IO TLB pool. 
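inc_used_and_hiwater() above maintains a global used-slot count and a monotonic high-water mark: the count is bumped with an atomic add, and the mark is advanced only through a compare-and-swap loop, so concurrent updaters can never move it backwards. A minimal userspace sketch of the same pattern, assuming C11 <stdatomic.h> in place of the kernel's atomic_long API:

#include <stdatomic.h>

static atomic_ulong total_used;
static atomic_ulong used_hiwater;

static void inc_used_and_hiwater(unsigned long nslots)
{
	unsigned long new_used = atomic_fetch_add(&total_used, nslots) + nslots;
	unsigned long old_hiwater = atomic_load(&used_hiwater);

	/* Advance the mark only if we exceeded it; a failed CAS reloads
	 * old_hiwater and retries against the newer value. */
	while (new_used > old_hiwater &&
	       !atomic_compare_exchange_weak(&used_hiwater, &old_hiwater, new_used))
		;
}

static void dec_used(unsigned long nslots)
{
	atomic_fetch_sub(&total_used, nslots);
}

The hunks below expose the mark as the io_tlb_used_hiwater debugfs file, whose setter accepts only 0, so the watermark can be reset but not set to an arbitrary value.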
*/ @@ -702,6 +693,8 @@ found: area->index = wrap_area_index(mem, index + nslots); area->used += nslots; spin_unlock_irqrestore(&area->lock, flags); + + inc_used_and_hiwater(mem, nslots); return slot_index; } @@ -834,6 +827,8 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr) mem->slots[i].list = ++count; area->used -= nslots; spin_unlock_irqrestore(&area->lock, flags); + + dec_used(mem, nslots); } /* @@ -928,34 +923,73 @@ bool is_swiotlb_active(struct device *dev) } EXPORT_SYMBOL_GPL(is_swiotlb_active); +#ifdef CONFIG_DEBUG_FS + static int io_tlb_used_get(void *data, u64 *val) { - *val = mem_used(&io_tlb_default_mem); + struct io_tlb_mem *mem = data; + + *val = mem_used(mem); + return 0; +} + +static int io_tlb_hiwater_get(void *data, u64 *val) +{ + struct io_tlb_mem *mem = data; + + *val = atomic_long_read(&mem->used_hiwater); + return 0; +} + +static int io_tlb_hiwater_set(void *data, u64 val) +{ + struct io_tlb_mem *mem = data; + + /* Only allow setting to zero */ + if (val != 0) + return -EINVAL; + + atomic_long_set(&mem->used_hiwater, val); return 0; } + DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_used, io_tlb_used_get, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_hiwater, io_tlb_hiwater_get, + io_tlb_hiwater_set, "%llu\n"); static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, const char *dirname) { + atomic_long_set(&mem->total_used, 0); + atomic_long_set(&mem->used_hiwater, 0); + mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs); if (!mem->nslabs) return; debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs); - debugfs_create_file("io_tlb_used", 0400, mem->debugfs, NULL, + debugfs_create_file("io_tlb_used", 0400, mem->debugfs, mem, &fops_io_tlb_used); + debugfs_create_file("io_tlb_used_hiwater", 0600, mem->debugfs, mem, + &fops_io_tlb_hiwater); } -static int __init __maybe_unused swiotlb_create_default_debugfs(void) +static int __init swiotlb_create_default_debugfs(void) { swiotlb_create_debugfs_files(&io_tlb_default_mem, "swiotlb"); return 0; } -#ifdef CONFIG_DEBUG_FS late_initcall(swiotlb_create_default_debugfs); -#endif + +#else /* !CONFIG_DEBUG_FS */ + +static inline void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, + const char *dirname) +{ +} + +#endif /* CONFIG_DEBUG_FS */ #ifdef CONFIG_DMA_RESTRICTED_POOL @@ -998,6 +1032,11 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, /* Set Per-device io tlb area to one */ unsigned int nareas = 1; + if (PageHighMem(pfn_to_page(PHYS_PFN(rmem->base)))) { + dev_err(dev, "Restricted DMA pool must be accessible within the linear mapping."); + return -EINVAL; + } + /* * Since multiple devices can share the same pool, the private data, * io_tlb_mem struct, will be initialized by the first device attached @@ -1059,11 +1098,6 @@ static int __init rmem_swiotlb_setup(struct reserved_mem *rmem) of_get_flat_dt_prop(node, "no-map", NULL)) return -EINVAL; - if (PageHighMem(pfn_to_page(PHYS_PFN(rmem->base)))) { - pr_err("Restricted DMA pool must be accessible within the linear mapping."); - return -EINVAL; - } - rmem->ops = &rmem_swiotlb_ops; pr_info("Reserved memory: created restricted DMA pool at %pa, size %ld MiB\n", &rmem->base, (unsigned long)rmem->size / SZ_1M); diff --git a/kernel/events/core.c b/kernel/events/core.c index 435815d3be3f..db016e418931 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9433,8 +9433,8 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle) hwc->interrupts = 1; } else { 
hwc->interrupts++; - if (unlikely(throttle - && hwc->interrupts >= max_samples_per_tick)) { + if (unlikely(throttle && + hwc->interrupts > max_samples_per_tick)) { __this_cpu_inc(perf_throttled_count); tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); hwc->interrupts = MAX_INTERRUPTS; @@ -10150,8 +10150,20 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, perf_trace_buf_update(record, event_type); hlist_for_each_entry_rcu(event, head, hlist_entry) { - if (perf_tp_event_match(event, &data, regs)) + if (perf_tp_event_match(event, &data, regs)) { perf_swevent_event(event, count, &data, regs); + + /* + * Here use the same on-stack perf_sample_data, + * some members in data are event-specific and + * need to be re-computed for different sweveents. + * Re-initialize data->sample_flags safely to avoid + * the problem that next event skips preparing data + * because data->sample_flags is set. + */ + perf_sample_data_init(&data, 0, 0); + perf_sample_save_raw_data(&data, &raw); + } } /* diff --git a/kernel/events/hw_breakpoint_test.c b/kernel/events/hw_breakpoint_test.c index c57610f52bb4..2cfeeecf8de9 100644 --- a/kernel/events/hw_breakpoint_test.c +++ b/kernel/events/hw_breakpoint_test.c @@ -329,5 +329,4 @@ static struct kunit_suite hw_breakpoint_test_suite = { kunit_test_suites(&hw_breakpoint_test_suite); -MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marco Elver <elver@google.com>"); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 273a0fe7910a..a0433f37b024 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -814,7 +814,7 @@ struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) size = sizeof(struct perf_buffer); size += nr_pages * sizeof(void *); - if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER) + if (order_base_2(size) > PAGE_SHIFT+MAX_ORDER) goto fail; node = (cpu == -1) ? 
cpu : cpu_to_node(cpu); diff --git a/kernel/exit.c b/kernel/exit.c index f2afdb0add7c..edb50b4c9972 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -68,6 +68,7 @@ #include <linux/kprobes.h> #include <linux/rethook.h> #include <linux/sysfs.h> +#include <linux/user_events.h> #include <linux/uaccess.h> #include <asm/unistd.h> @@ -410,7 +411,10 @@ static void coredump_task_exit(struct task_struct *tsk) tsk->flags |= PF_POSTCOREDUMP; core_state = tsk->signal->core_state; spin_unlock_irq(&tsk->sighand->siglock); - if (core_state) { + + /* The vhost_worker does not particpate in coredumps */ + if (core_state && + ((tsk->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER)) { struct core_thread self; self.task = current; @@ -537,7 +541,7 @@ static void exit_mm(void) return; sync_mm_rss(mm); mmap_read_lock(mm); - mmgrab(mm); + mmgrab_lazy_tlb(mm); BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); @@ -818,6 +822,7 @@ void __noreturn do_exit(long code) coredump_task_exit(tsk); ptrace_event(PTRACE_EVENT_EXIT, code); + user_events_exit(tsk); validate_creds_for_do_exit(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index bfe73db1c26c..81cba91f30bb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -97,6 +97,8 @@ #include <linux/io_uring.h> #include <linux/bpf.h> #include <linux/stackprotector.h> +#include <linux/user_events.h> +#include <linux/iommu.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -451,13 +453,49 @@ static struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; +#ifdef CONFIG_PER_VMA_LOCK + +/* SLAB cache for vm_area_struct.lock */ +static struct kmem_cache *vma_lock_cachep; + +static bool vma_lock_alloc(struct vm_area_struct *vma) +{ + vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL); + if (!vma->vm_lock) + return false; + + init_rwsem(&vma->vm_lock->lock); + vma->vm_lock_seq = -1; + + return true; +} + +static inline void vma_lock_free(struct vm_area_struct *vma) +{ + kmem_cache_free(vma_lock_cachep, vma->vm_lock); +} + +#else /* CONFIG_PER_VMA_LOCK */ + +static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; } +static inline void vma_lock_free(struct vm_area_struct *vma) {} + +#endif /* CONFIG_PER_VMA_LOCK */ + struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { struct vm_area_struct *vma; vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - if (vma) - vma_init(vma, mm); + if (!vma) + return NULL; + + vma_init(vma, mm); + if (!vma_lock_alloc(vma)) { + kmem_cache_free(vm_area_cachep, vma); + return NULL; + } + return vma; } @@ -465,26 +503,56 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) { struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - if (new) { - ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); - ASSERT_EXCLUSIVE_WRITER(orig->vm_file); - /* - * orig->shared.rb may be modified concurrently, but the clone - * will be reinitialized. - */ - data_race(memcpy(new, orig, sizeof(*new))); - INIT_LIST_HEAD(&new->anon_vma_chain); - dup_anon_vma_name(orig, new); + if (!new) + return NULL; + + ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); + ASSERT_EXCLUSIVE_WRITER(orig->vm_file); + /* + * orig->shared.rb may be modified concurrently, but the clone + * will be reinitialized. 
+ */ + data_race(memcpy(new, orig, sizeof(*new))); + if (!vma_lock_alloc(new)) { + kmem_cache_free(vm_area_cachep, new); + return NULL; } + INIT_LIST_HEAD(&new->anon_vma_chain); + vma_numab_state_init(new); + dup_anon_vma_name(orig, new); + return new; } -void vm_area_free(struct vm_area_struct *vma) +void __vm_area_free(struct vm_area_struct *vma) { + vma_numab_state_free(vma); free_anon_vma_name(vma); + vma_lock_free(vma); kmem_cache_free(vm_area_cachep, vma); } +#ifdef CONFIG_PER_VMA_LOCK +static void vm_area_free_rcu_cb(struct rcu_head *head) +{ + struct vm_area_struct *vma = container_of(head, struct vm_area_struct, + vm_rcu); + + /* The vma should not be locked while being destroyed. */ + VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma); + __vm_area_free(vma); +} +#endif + +void vm_area_free(struct vm_area_struct *vma) +{ +#ifdef CONFIG_PER_VMA_LOCK + call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb); +#else + __vm_area_free(vma); +#endif +} + static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { @@ -775,6 +843,67 @@ static void check_mm(struct mm_struct *mm) #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) +static void do_check_lazy_tlb(void *arg) +{ + struct mm_struct *mm = arg; + + WARN_ON_ONCE(current->active_mm == mm); +} + +static void do_shoot_lazy_tlb(void *arg) +{ + struct mm_struct *mm = arg; + + if (current->active_mm == mm) { + WARN_ON_ONCE(current->mm); + current->active_mm = &init_mm; + switch_mm(mm, &init_mm, current); + } +} + +static void cleanup_lazy_tlbs(struct mm_struct *mm) +{ + if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) { + /* + * In this case, lazy tlb mms are refounted and would not reach + * __mmdrop until all CPUs have switched away and mmdrop()ed. + */ + return; + } + + /* + * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it + * requires lazy mm users to switch to another mm when the refcount + * drops to zero, before the mm is freed. This requires IPIs here to + * switch kernel threads to init_mm. + * + * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm + * switch with the final userspace teardown TLB flush which leaves the + * mm lazy on this CPU but no others, reducing the need for additional + * IPIs here. There are cases where a final IPI is still required here, + * such as the final mmdrop being performed on a different CPU than the + * one exiting, or kernel threads using the mm when userspace exits. + * + * IPI overheads have not found to be expensive, but they could be + * reduced in a number of possible ways, for example (roughly + * increasing order of complexity): + * - The last lazy reference created by exit_mm() could instead switch + * to init_mm, however it's probable this will run on the same CPU + * immediately afterwards, so this may not reduce IPIs much. + * - A batch of mms requiring IPIs could be gathered and freed at once. + * - CPUs store active_mm where it can be remotely checked without a + * lock, to filter out false-positives in the cpumask. + * - After mm_users or mm_count reaches zero, switching away from the + * mm could clear mm_cpumask to reduce some IPIs, perhaps together + * with some batching or delaying of the final IPIs. + * - A delayed freeing and RCU-like quiescing sequence based on mm + * switching to avoid IPIs completely. 
+ */ + on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1); + if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES)) + on_each_cpu(do_check_lazy_tlb, (void *)mm, 1); +} + /* * Called when the last reference to the mm * is dropped: either by a lazy thread or by @@ -786,6 +915,10 @@ void __mmdrop(struct mm_struct *mm) BUG_ON(mm == &init_mm); WARN_ON_ONCE(mm == current->mm); + + /* Ensure no CPUs are using this as their lazy tlb mm */ + cleanup_lazy_tlbs(mm); + WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); destroy_context(mm); @@ -793,6 +926,7 @@ void __mmdrop(struct mm_struct *mm) check_mm(mm); put_user_ns(mm->user_ns); mm_pasid_drop(mm); + mm_destroy_cid(mm); for (i = 0; i < NR_MM_COUNTERS; i++) percpu_counter_destroy(&mm->rss_stat[i]); @@ -1057,7 +1191,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #ifdef CONFIG_SCHED_MM_CID tsk->mm_cid = -1; + tsk->last_mm_cid = -1; tsk->mm_cid_active = 0; + tsk->migrate_from_cpu = -1; #endif return tsk; @@ -1128,6 +1264,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, seqcount_init(&mm->write_protect_seq); mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); +#ifdef CONFIG_PER_VMA_LOCK + mm->mm_lock_seq = 0; +#endif mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; @@ -1162,18 +1301,22 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (init_new_context(p, mm)) goto fail_nocontext; + if (mm_alloc_cid(mm)) + goto fail_cid; + for (i = 0; i < NR_MM_COUNTERS; i++) if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT)) goto fail_pcpu; mm->user_ns = get_user_ns(user_ns); lru_gen_init_mm(mm); - mm_init_cid(mm); return mm; fail_pcpu: while (i > 0) percpu_counter_destroy(&mm->rss_stat[--i]); + mm_destroy_cid(mm); +fail_cid: destroy_context(mm); fail_nocontext: mm_free_pgd(mm); @@ -2193,16 +2336,16 @@ __latent_entropy struct task_struct *copy_process( p->flags &= ~PF_KTHREAD; if (args->kthread) p->flags |= PF_KTHREAD; - if (args->user_worker) - p->flags |= PF_USER_WORKER; - if (args->io_thread) { + if (args->user_worker) { /* - * Mark us an IO worker, and block any signal that isn't + * Mark us a user worker, and block any signal that isn't * fatal or STOP */ - p->flags |= PF_IO_WORKER; + p->flags |= PF_USER_WORKER; siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); } + if (args->io_thread) + p->flags |= PF_IO_WORKER; if (args->name) strscpy_pad(p->comm, args->name, sizeof(p->comm)); @@ -2374,9 +2517,6 @@ __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_cleanup_io; - if (args->ignore_signals) - ignore_signals(p); - stackleak_task_init(p); if (pid != &init_struct_pid) { @@ -2594,6 +2734,7 @@ __latent_entropy struct task_struct *copy_process( trace_task_newtask(p, clone_flags); uprobe_copy_process(p, clone_flags); + user_events_fork(p, clone_flags); copy_oom_score_adj(clone_flags, p); @@ -3159,6 +3300,9 @@ void __init proc_caches_init(void) NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); +#ifdef CONFIG_PER_VMA_LOCK + vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT); +#endif mmap_init(); nsproxy_cache_init(); } diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 322813366c6c..9a24574988d2 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -28,7 +28,7 @@ /* * The number of tasks checked: */ -int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; +static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; /* 
* Limit number of tasks checked in a batch. @@ -47,9 +47,9 @@ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_ /* * Zero (default value) means use sysctl_hung_task_timeout_secs: */ -unsigned long __read_mostly sysctl_hung_task_check_interval_secs; +static unsigned long __read_mostly sysctl_hung_task_check_interval_secs; -int __read_mostly sysctl_hung_task_warnings = 10; +static int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; static bool hung_task_show_lock; @@ -72,8 +72,8 @@ static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; * Should we panic (and reboot, if panic_timeout= is set) when a * hung task is detected: */ -unsigned int __read_mostly sysctl_hung_task_panic = - IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC); +static unsigned int __read_mostly sysctl_hung_task_panic = + IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC); static int hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 7a97bcb086bf..b4c31a5c1147 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -542,7 +542,7 @@ fail: return ret; } -#ifdef CONFIG_PCI_MSI_ARCH_FALLBACKS +#if defined(CONFIG_PCI_MSI_ARCH_FALLBACKS) || defined(CONFIG_PCI_XEN) /** * msi_device_populate_sysfs - Populate msi_irqs sysfs entries for a device * @dev: The device (PCI, platform etc) which will get sysfs entries @@ -574,7 +574,7 @@ void msi_device_destroy_sysfs(struct device *dev) msi_for_each_desc(desc, dev, MSI_DESC_ALL) msi_sysfs_remove_desc(dev, desc); } -#endif /* CONFIG_PCI_MSI_ARCH_FALLBACK */ +#endif /* CONFIG_PCI_MSI_ARCH_FALLBACK || CONFIG_PCI_XEN */ #else /* CONFIG_SYSFS */ static inline int msi_sysfs_create_group(struct device *dev) { return 0; } static inline int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc) { return 0; } diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 7afa40fe5cc4..2f4fb336dda1 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -22,6 +22,8 @@ #include <asm/processor.h> #include <linux/kasan.h> +#include <trace/events/ipi.h> + static DEFINE_PER_CPU(struct llist_head, raised_list); static DEFINE_PER_CPU(struct llist_head, lazy_list); static DEFINE_PER_CPU(struct task_struct *, irq_workd); @@ -74,6 +76,14 @@ void __weak arch_irq_work_raise(void) */ } +static __always_inline void irq_work_raise(struct irq_work *work) +{ + if (trace_ipi_send_cpu_enabled() && arch_irq_work_has_interrupt()) + trace_ipi_send_cpu(smp_processor_id(), _RET_IP_, work->func); + + arch_irq_work_raise(); +} + /* Enqueue on current CPU, work must already be claimed and preempt disabled */ static void __irq_work_queue_local(struct irq_work *work) { @@ -99,7 +109,7 @@ static void __irq_work_queue_local(struct irq_work *work) /* If the work is "lazy", handle it from next tick if any */ if (!lazy_work || tick_nohz_tick_stopped()) - arch_irq_work_raise(); + irq_work_raise(work); } /* Enqueue the irq work @work on the current CPU */ diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 83f499182c9a..77747391f49b 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -288,8 +288,7 @@ unsigned long kallsyms_lookup_name(const char *name) * Iterate over all symbols in vmlinux. For symbols from modules use * module_kallsyms_on_each_symbol instead. 
*/ -int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, - unsigned long), +int kallsyms_on_each_symbol(int (*fn)(void *, const char *, unsigned long), void *data) { char namebuf[KSYM_NAME_LEN]; @@ -299,7 +298,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, for (i = 0, off = 0; i < kallsyms_num_syms; i++) { off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); - ret = fn(data, namebuf, NULL, kallsyms_sym_address(i)); + ret = fn(data, namebuf, kallsyms_sym_address(i)); if (ret != 0) return ret; cond_resched(); diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index bfbc12da3326..a2e3745d15c4 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -95,7 +95,7 @@ static struct test_item test_items[] = { static char stub_name[KSYM_NAME_LEN]; -static int stat_symbol_len(void *data, const char *name, struct module *mod, unsigned long addr) +static int stat_symbol_len(void *data, const char *name, unsigned long addr) { *(u32 *)data += strlen(name); @@ -154,7 +154,7 @@ static void test_kallsyms_compression_ratio(void) pr_info(" ---------------------------------------------------------\n"); } -static int lookup_name(void *data, const char *name, struct module *mod, unsigned long addr) +static int lookup_name(void *data, const char *name, unsigned long addr) { u64 t0, t1, t; struct test_stat *stat = (struct test_stat *)data; @@ -207,7 +207,7 @@ static bool match_cleanup_name(const char *s, const char *name) return !strncmp(s, name, len); } -static int find_symbol(void *data, const char *name, struct module *mod, unsigned long addr) +static int find_symbol(void *data, const char *name, unsigned long addr) { struct test_stat *stat = (struct test_stat *)data; diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index a60c561724be..0ddbdab5903d 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -1572,34 +1572,26 @@ static void test_exit(struct kunit *test) } __no_kcsan -static void register_tracepoints(struct tracepoint *tp, void *ignore) +static void register_tracepoints(void) { - check_trace_callback_type_console(probe_console); - if (!strcmp(tp->name, "console")) - WARN_ON(tracepoint_probe_register(tp, probe_console, NULL)); + register_trace_console(probe_console, NULL); } __no_kcsan -static void unregister_tracepoints(struct tracepoint *tp, void *ignore) +static void unregister_tracepoints(void) { - if (!strcmp(tp->name, "console")) - tracepoint_probe_unregister(tp, probe_console, NULL); + unregister_trace_console(probe_console, NULL); } static int kcsan_suite_init(struct kunit_suite *suite) { - /* - * Because we want to be able to build the test as a module, we need to - * iterate through all known tracepoints, since the static registration - * won't work here. 
- */ - for_each_kernel_tracepoint(register_tracepoints, NULL); + register_tracepoints(); return 0; } static void kcsan_suite_exit(struct kunit_suite *suite) { - for_each_kernel_tracepoint(unregister_tracepoints, NULL); + unregister_tracepoints(); tracepoint_synchronize_unregister(); } diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f1a0e4e3fb5c..f989f5f1933b 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -65,7 +65,7 @@ int kexec_image_probe_default(struct kimage *image, void *buf, return ret; } -void *kexec_image_load_default(struct kimage *image) +static void *kexec_image_load_default(struct kimage *image) { if (!image->fops || !image->fops->load) return ERR_PTR(-ENOEXEC); @@ -249,8 +249,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, /* IMA needs to pass the measurement list to the next kernel. */ ima_add_kexec_buffer(image); - /* Call arch image load handlers */ - ldata = arch_kexec_kernel_image_load(image); + /* Call image load handler */ + ldata = kexec_image_load_default(image); if (IS_ERR(ldata)) { ret = PTR_ERR(ldata); diff --git a/kernel/kheaders.c b/kernel/kheaders.c index 8f69772af77b..42163c9e94e5 100644 --- a/kernel/kheaders.c +++ b/kernel/kheaders.c @@ -26,15 +26,15 @@ asm ( " .popsection \n" ); -extern char kernel_headers_data; -extern char kernel_headers_data_end; +extern char kernel_headers_data[]; +extern char kernel_headers_data_end[]; static ssize_t ikheaders_read(struct file *file, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) { - memcpy(buf, &kernel_headers_data + off, len); + memcpy(buf, &kernel_headers_data[off], len); return len; } @@ -48,8 +48,8 @@ static struct bin_attribute kheaders_attr __ro_after_init = { static int __init ikheaders_init(void) { - kheaders_attr.size = (&kernel_headers_data_end - - &kernel_headers_data); + kheaders_attr.size = (kernel_headers_data_end - + kernel_headers_data); return sysfs_create_bin_file(kernel_kobj, &kheaders_attr); } diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 0408aab80941..aad7a3bfd846 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -39,7 +39,7 @@ static struct kobj_attribute _name##_attr = __ATTR_RW(_name) static ssize_t uevent_seqnum_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum); + return sysfs_emit(buf, "%llu\n", (unsigned long long)uevent_seqnum); } KERNEL_ATTR_RO(uevent_seqnum); @@ -64,7 +64,7 @@ KERNEL_ATTR_RO(address_bits); static ssize_t uevent_helper_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%s\n", uevent_helper); + return sysfs_emit(buf, "%s\n", uevent_helper); } static ssize_t uevent_helper_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -85,7 +85,7 @@ KERNEL_ATTR_RW(uevent_helper); static ssize_t profiling_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", prof_on); + return sysfs_emit(buf, "%d\n", prof_on); } static ssize_t profiling_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -116,14 +116,14 @@ KERNEL_ATTR_RW(profiling); static ssize_t kexec_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", !!kexec_image); + return sysfs_emit(buf, "%d\n", !!kexec_image); } KERNEL_ATTR_RO(kexec_loaded); static ssize_t kexec_crash_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", 
kexec_crash_loaded()); + return sysfs_emit(buf, "%d\n", kexec_crash_loaded()); } KERNEL_ATTR_RO(kexec_crash_loaded); @@ -135,7 +135,7 @@ static ssize_t kexec_crash_size_show(struct kobject *kobj, if (size < 0) return size; - return sprintf(buf, "%zd\n", size); + return sysfs_emit(buf, "%zd\n", size); } static ssize_t kexec_crash_size_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -160,8 +160,8 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { phys_addr_t vmcore_base = paddr_vmcoreinfo_note(); - return sprintf(buf, "%pa %x\n", &vmcore_base, - (unsigned int)VMCOREINFO_NOTE_SIZE); + return sysfs_emit(buf, "%pa %x\n", &vmcore_base, + (unsigned int)VMCOREINFO_NOTE_SIZE); } KERNEL_ATTR_RO(vmcoreinfo); @@ -171,7 +171,7 @@ KERNEL_ATTR_RO(vmcoreinfo); static ssize_t fscaps_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", file_caps_enabled); + return sysfs_emit(buf, "%d\n", file_caps_enabled); } KERNEL_ATTR_RO(fscaps); @@ -180,7 +180,7 @@ int rcu_expedited; static ssize_t rcu_expedited_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", READ_ONCE(rcu_expedited)); + return sysfs_emit(buf, "%d\n", READ_ONCE(rcu_expedited)); } static ssize_t rcu_expedited_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -197,7 +197,7 @@ int rcu_normal; static ssize_t rcu_normal_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", READ_ONCE(rcu_normal)); + return sysfs_emit(buf, "%d\n", READ_ONCE(rcu_normal)); } static ssize_t rcu_normal_store(struct kobject *kobj, struct kobj_attribute *attr, diff --git a/kernel/kthread.c b/kernel/kthread.c index 4bc6e0971ec9..490792b1066e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1406,14 +1406,18 @@ void kthread_use_mm(struct mm_struct *mm) WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(tsk->mm); + /* + * It is possible for mm to be the same as tsk->active_mm, but + * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm), + * because these references are not equivalent. + */ + mmgrab(mm); + task_lock(tsk); /* Hold off tlb flush IPIs while switching mm's */ local_irq_disable(); active_mm = tsk->active_mm; - if (active_mm != mm) { - mmgrab(mm); - tsk->active_mm = mm; - } + tsk->active_mm = mm; tsk->mm = mm; membarrier_update_current_mm(mm); switch_mm_irqs_off(active_mm, mm, tsk); @@ -1430,12 +1434,9 @@ void kthread_use_mm(struct mm_struct *mm) * memory barrier after storing to tsk->mm, before accessing * user-space memory. A full memory barrier for membarrier * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by - * mmdrop(), or explicitly with smp_mb(). + * mmdrop_lazy_tlb(). 
*/ - if (active_mm != mm) - mmdrop(active_mm); - else - smp_mb(); + mmdrop_lazy_tlb(active_mm); } EXPORT_SYMBOL_GPL(kthread_use_mm); @@ -1463,10 +1464,13 @@ void kthread_unuse_mm(struct mm_struct *mm) local_irq_disable(); tsk->mm = NULL; membarrier_update_current_mm(NULL); + mmgrab_lazy_tlb(mm); /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); local_irq_enable(); task_unlock(tsk); + + mmdrop(mm); } EXPORT_SYMBOL_GPL(kthread_unuse_mm); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index feaf90c5e537..61328328c474 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -33,6 +33,7 @@ * * - klp_ftrace_handler() * - klp_update_patch_state() + * - __klp_sched_try_switch() */ DEFINE_MUTEX(klp_mutex); @@ -142,8 +143,7 @@ static int klp_match_callback(void *data, unsigned long addr) return 0; } -static int klp_find_callback(void *data, const char *name, - struct module *mod, unsigned long addr) +static int klp_find_callback(void *data, const char *name, unsigned long addr) { struct klp_find_arg *args = data; diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index f1b25ec581e0..e9fd83a02228 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -9,11 +9,14 @@ #include <linux/cpu.h> #include <linux/stacktrace.h> +#include <linux/static_call.h> #include "core.h" #include "patch.h" #include "transition.h" #define MAX_STACK_ENTRIES 100 +DEFINE_PER_CPU(unsigned long[MAX_STACK_ENTRIES], klp_stack_entries); + #define STACK_ERR_BUF_SIZE 128 #define SIGNALS_TIMEOUT 15 @@ -25,6 +28,25 @@ static int klp_target_state = KLP_UNDEFINED; static unsigned int klp_signals_cnt; /* + * When a livepatch is in progress, enable klp stack checking in + * cond_resched(). This helps CPU-bound kthreads get patched. + */ +#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) + +#define klp_cond_resched_enable() sched_dynamic_klp_enable() +#define klp_cond_resched_disable() sched_dynamic_klp_disable() + +#else /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ + +DEFINE_STATIC_KEY_FALSE(klp_sched_try_switch_key); +EXPORT_SYMBOL(klp_sched_try_switch_key); + +#define klp_cond_resched_enable() static_branch_enable(&klp_sched_try_switch_key) +#define klp_cond_resched_disable() static_branch_disable(&klp_sched_try_switch_key) + +#endif /* CONFIG_PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ + +/* * This work can be performed periodically to finish patching or unpatching any * "straggler" tasks which failed to transition in the first attempt. */ @@ -172,8 +194,8 @@ void klp_update_patch_state(struct task_struct *task) * barrier (smp_rmb) for two cases: * * 1) Enforce the order of the TIF_PATCH_PENDING read and the - * klp_target_state read. The corresponding write barrier is in - * klp_init_transition(). + * klp_target_state read. The corresponding write barriers are in + * klp_init_transition() and klp_reverse_transition(). 
* * 2) Enforce the order of the TIF_PATCH_PENDING read and a future read * of func->transition, if klp_ftrace_handler() is called later on @@ -240,12 +262,15 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries, */ static int klp_check_stack(struct task_struct *task, const char **oldname) { - static unsigned long entries[MAX_STACK_ENTRIES]; + unsigned long *entries = this_cpu_ptr(klp_stack_entries); struct klp_object *obj; struct klp_func *func; int ret, nr_entries; - ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries)); + /* Protect 'klp_stack_entries' */ + lockdep_assert_preemption_disabled(); + + ret = stack_trace_save_tsk_reliable(task, entries, MAX_STACK_ENTRIES); if (ret < 0) return -EINVAL; nr_entries = ret; @@ -307,7 +332,11 @@ static bool klp_try_switch_task(struct task_struct *task) * functions. If all goes well, switch the task to the target patch * state. */ - ret = task_call_func(task, klp_check_and_switch_task, &old_name); + if (task == current) + ret = klp_check_and_switch_task(current, &old_name); + else + ret = task_call_func(task, klp_check_and_switch_task, &old_name); + switch (ret) { case 0: /* success */ break; @@ -334,6 +363,44 @@ static bool klp_try_switch_task(struct task_struct *task) return !ret; } +void __klp_sched_try_switch(void) +{ + if (likely(!klp_patch_pending(current))) + return; + + /* + * This function is called from cond_resched() which is called in many + * places throughout the kernel. Using the klp_mutex here might + * deadlock. + * + * Instead, disable preemption to prevent racing with other callers of + * klp_try_switch_task(). Thanks to task_call_func() they won't be + * able to switch this task while it's running. + */ + preempt_disable(); + + /* + * Make sure current didn't get patched between the above check and + * preempt_disable(). + */ + if (unlikely(!klp_patch_pending(current))) + goto out; + + /* + * Enforce the order of the TIF_PATCH_PENDING read above and the + * klp_target_state read in klp_try_switch_task(). The corresponding + * write barriers are in klp_init_transition() and + * klp_reverse_transition(). + */ + smp_rmb(); + + klp_try_switch_task(current); + +out: + preempt_enable(); +} +EXPORT_SYMBOL(__klp_sched_try_switch); + /* * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set. * Kthreads with TIF_PATCH_PENDING set are woken up. @@ -440,7 +507,8 @@ void klp_try_complete_transition(void) return; } - /* we're done, now cleanup the data structures */ + /* Done! Now cleanup the data structures. */ + klp_cond_resched_disable(); patch = klp_transition_patch; klp_complete_transition(); @@ -492,6 +560,8 @@ void klp_start_transition(void) set_tsk_thread_flag(task, TIF_PATCH_PENDING); } + klp_cond_resched_enable(); + klp_signals_cnt = 0; } @@ -547,8 +617,9 @@ void klp_init_transition(struct klp_patch *patch, int state) * see a func in transition with a task->patch_state of KLP_UNDEFINED. * * Also enforce the order of the klp_target_state write and future - * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() doesn't - * set a task->patch_state to KLP_UNDEFINED. + * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() and + * __klp_sched_try_switch() don't set a task->patch_state to + * KLP_UNDEFINED. */ smp_wmb(); @@ -584,14 +655,10 @@ void klp_reverse_transition(void) klp_target_state == KLP_PATCHED ? 
"patching to unpatching" : "unpatching to patching"); - klp_transition_patch->enabled = !klp_transition_patch->enabled; - - klp_target_state = !klp_target_state; - /* * Clear all TIF_PATCH_PENDING flags to prevent races caused by - * klp_update_patch_state() running in parallel with - * klp_start_transition(). + * klp_update_patch_state() or __klp_sched_try_switch() running in + * parallel with the reverse transition. */ read_lock(&tasklist_lock); for_each_process_thread(g, task) @@ -601,9 +668,28 @@ void klp_reverse_transition(void) for_each_possible_cpu(cpu) clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING); - /* Let any remaining calls to klp_update_patch_state() complete */ + /* + * Make sure all existing invocations of klp_update_patch_state() and + * __klp_sched_try_switch() see the cleared TIF_PATCH_PENDING before + * starting the reverse transition. + */ klp_synchronize_transition(); + /* + * All patching has stopped, now re-initialize the global variables to + * prepare for the reverse transition. + */ + klp_transition_patch->enabled = !klp_transition_patch->enabled; + klp_target_state = !klp_target_state; + + /* + * Enforce the order of the klp_target_state write and the + * TIF_PATCH_PENDING writes in klp_start_transition() to ensure + * klp_update_patch_state() and __klp_sched_try_switch() don't set + * task->patch_state to the wrong value. + */ + smp_wmb(); + klp_start_transition(); } @@ -617,9 +703,9 @@ void klp_copy_process(struct task_struct *child) * the task flag up to date with the parent here. * * The operation is serialized against all klp_*_transition() - * operations by the tasklist_lock. The only exception is - * klp_update_patch_state(current), but we cannot race with - * that because we are current. + * operations by the tasklist_lock. The only exceptions are + * klp_update_patch_state(current) and __klp_sched_try_switch(), but we + * cannot race with them because we are current. */ if (test_tsk_thread_flag(current, TIF_PATCH_PENDING)) set_tsk_thread_flag(child, TIF_PATCH_PENDING); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index dcd1d5bfc1e0..4dfd2f3e09b2 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2263,6 +2263,9 @@ static inline bool usage_match(struct lock_list *entry, void *mask) static inline bool usage_skip(struct lock_list *entry, void *mask) { + if (entry->class->lock_type == LD_LOCK_NORMAL) + return false; + /* * Skip local_lock() for irq inversion detection. * @@ -2289,14 +2292,16 @@ static inline bool usage_skip(struct lock_list *entry, void *mask) * As a result, we will skip local_lock(), when we search for irq * inversion bugs. */ - if (entry->class->lock_type == LD_LOCK_PERCPU) { - if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG)) - return false; + if (entry->class->lock_type == LD_LOCK_PERCPU && + DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG)) + return false; - return true; - } + /* + * Skip WAIT_OVERRIDE for irq inversion detection -- it's not actually + * a lock and only used to override the wait_type. 
+ */ - return false; + return true; } /* @@ -4768,7 +4773,8 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next) for (; depth < curr->lockdep_depth; depth++) { struct held_lock *prev = curr->held_locks + depth; - u8 prev_inner = hlock_class(prev)->wait_type_inner; + struct lock_class *class = hlock_class(prev); + u8 prev_inner = class->wait_type_inner; if (prev_inner) { /* @@ -4778,6 +4784,14 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next) * Also due to trylocks. */ curr_inner = min(curr_inner, prev_inner); + + /* + * Allow override for annotations -- this is typically + * only valid/needed for code that only exists when + * CONFIG_PREEMPT_RT=n. + */ + if (unlikely(class->lock_type == LD_LOCK_WAIT_OVERRIDE)) + curr_inner = prev_inner; } } diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index c201aadb9301..25ec0239477c 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -72,15 +72,6 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, int ret; raw_spin_lock_irq(&rtm->wait_lock); - /* - * Allow readers, as long as the writer has not completely - * acquired the semaphore for write. - */ - if (atomic_read(&rwb->readers) != WRITER_BIAS) { - atomic_inc(&rwb->readers); - raw_spin_unlock_irq(&rtm->wait_lock); - return 0; - } /* * Call into the slow lock path with the rtmutex->wait_lock diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index acb5a50309a1..9eabd585ce7a 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1240,7 +1240,7 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) /* * lock for reading */ -static inline int __down_read_common(struct rw_semaphore *sem, int state) +static __always_inline int __down_read_common(struct rw_semaphore *sem, int state) { int ret = 0; long count; @@ -1258,17 +1258,17 @@ out: return ret; } -static inline void __down_read(struct rw_semaphore *sem) +static __always_inline void __down_read(struct rw_semaphore *sem) { __down_read_common(sem, TASK_UNINTERRUPTIBLE); } -static inline int __down_read_interruptible(struct rw_semaphore *sem) +static __always_inline int __down_read_interruptible(struct rw_semaphore *sem) { return __down_read_common(sem, TASK_INTERRUPTIBLE); } -static inline int __down_read_killable(struct rw_semaphore *sem) +static __always_inline int __down_read_killable(struct rw_semaphore *sem) { return __down_read_common(sem, TASK_KILLABLE); } diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 424b3bc58f3f..33a2e991f608 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -22,6 +22,104 @@ menuconfig MODULES if MODULES +config MODULE_DEBUGFS + bool + +config MODULE_DEBUG + bool "Module debugging" + depends on DEBUG_FS + help + Allows you to enable / disable features which can help you debug + modules. You don't need these options on production systems. + +if MODULE_DEBUG + +config MODULE_STATS + bool "Module statistics" + depends on DEBUG_FS + select MODULE_DEBUGFS + help + This option allows you to maintain a record of module statistics. + For example, size of all modules, average size, text size, a list + of failed modules and the size for each of those. For failed + modules we keep track of modules which failed due to either the + existing module taking too long to load or that module was already + loaded. 
+ + You should enable this if you are debugging production loads + and want to see if userspace or the kernel is doing stupid things + with loading modules when it shouldn't or if you want to help + optimize userspace / kernel space module autoloading schemes. + You might want to do this because failed modules tend to use + up a significant amount of memory, and so you'd be doing everyone a + favor in avoiding these failures proactively. + + This functionality is also useful for those experimenting with + module .text ELF section optimization. + + If unsure, say N. + +config MODULE_DEBUG_AUTOLOAD_DUPS + bool "Debug duplicate modules with auto-loading" + help + Module autoloading allows in-kernel code to request modules through + the *request_module*() API calls. This in turn just calls userspace + modprobe. Although modprobe checks to see if a module is already + loaded before trying to load a module, there is a small time window in + which multiple duplicate requests can end up in userspace and multiple + modprobe calls race calling finit_module() around the same time for + duplicate modules. The finit_module() system call can consume, in the + worst case, more than twice the respective module size in virtual + memory for each duplicate module request. Although duplicate module + requests are non-fatal, virtual memory is a limited resource and each + duplicate module request ends up just unnecessarily straining virtual + memory. + + This debugging facility will create pr_warn() splats for duplicate + module requests to help identify if module auto-loading may be the + culprit for your early boot virtual memory pressure. Since virtual + memory abuse caused by duplicate module requests could render a + system unusable, this functionality will also converge races in + requests for the same module to a single request. You can boot with + the module.enable_dups_trace=1 kernel parameter to use WARN_ON() + instead of the pr_warn(). + + If the first module request used request_module_nowait() we cannot + use that as the anchor to wait for duplicate module requests, since + users of request_module() do want a proper return value. If a call + for the same module happened earlier with request_module() though, + then a duplicate request_module_nowait() would be detected. A + plain request_module() call is synchronous and waits until modprobe + completes. Subsequent auto-loading requests for the same module do + not trigger new finit_module() calls and do not strain virtual + memory, and so as soon as modprobe successfully completes we remove + tracking for duplicates for that module. + + Enable this functionality to try to debug virtual memory abuse during + boot on systems which are failing to boot or if you suspect you may be + straining virtual memory during boot, and you want to identify if the + abuse was due to module auto-loading. These issues are currently only + known to occur on systems with many CPUs (over 400) and are likely the + result of udev issuing duplicate module requests for each CPU; in that + case in-kernel module auto-loading itself is not the culprit. There may very well still be + many duplicate module auto-loading requests which could be optimized + for and this debugging facility can be used to help identify them. + + Only enable this for debugging system functionality; never leave it + enabled on production systems.
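An editorial aside, not part of the patch: a minimal, hypothetical illustration of the race the MODULE_DEBUG_AUTOLOAD_DUPS help text above describes. Two work items auto-load the same module name concurrently; request_module() is the real API, while the module name, function names and the demo module itself are made up. Without this option both requests fork modprobe and may both end up in finit_module(); with it, the duplicate is reported and coalesced onto the first request.

#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/workqueue.h>

static void demo_autoload_fn(struct work_struct *work)
{
	/* Both workers race to auto-load the same (made-up) module name. */
	int ret = request_module("demo-duplicated-module");

	pr_info("request_module() returned %d\n", ret);
}

static DECLARE_WORK(demo_autoload_a, demo_autoload_fn);
static DECLARE_WORK(demo_autoload_b, demo_autoload_fn);

static int __init demo_autoload_init(void)
{
	/* Queue the two requests back to back so they overlap in modprobe. */
	schedule_work(&demo_autoload_a);
	schedule_work(&demo_autoload_b);
	return 0;
}
module_init(demo_autoload_init);
MODULE_LICENSE("GPL");

Booting with module.enable_dups_trace=1 turns the duplicate report from a pr_warn() into a full WARN_ON() backtrace, as the help text notes.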
+ +config MODULE_DEBUG_AUTOLOAD_DUPS_TRACE + bool "Force full stack trace when duplicates are found" + depends on MODULE_DEBUG_AUTOLOAD_DUPS + help + Enabling this will force a full stack trace for duplicate module + auto-loading requests using WARN_ON() instead of pr_warn(). You + should keep this disabled at all times unless you are a developer + and are doing a manual inspection and want to debug exactly why + these duplicates occur. + +endif # MODULE_DEBUG + config MODULE_FORCE_LOAD bool "Forced module loading" default n @@ -51,7 +149,7 @@ config MODULE_FORCE_UNLOAD config MODULE_UNLOAD_TAINT_TRACKING bool "Tainted module unload tracking" depends on MODULE_UNLOAD - default n + select MODULE_DEBUGFS help This option allows you to maintain a record of each unloaded module that tainted the kernel. In addition to displaying a diff --git a/kernel/module/Makefile b/kernel/module/Makefile index 948efea81e85..a10b2b9a6fdf 100644 --- a/kernel/module/Makefile +++ b/kernel/module/Makefile @@ -7,7 +7,10 @@ # and produce insane amounts of uninteresting coverage. KCOV_INSTRUMENT_module.o := n -obj-y += main.o strict_rwx.o +obj-y += main.o +obj-y += strict_rwx.o +obj-y += kmod.o +obj-$(CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS) += dups.o obj-$(CONFIG_MODULE_DECOMPRESS) += decompress.o obj-$(CONFIG_MODULE_SIG) += signing.o obj-$(CONFIG_LIVEPATCH) += livepatch.o @@ -19,3 +22,4 @@ obj-$(CONFIG_SYSFS) += sysfs.o obj-$(CONFIG_KGDB_KDB) += kdb.o obj-$(CONFIG_MODVERSIONS) += version.o obj-$(CONFIG_MODULE_UNLOAD_TAINT_TRACKING) += tracking.o +obj-$(CONFIG_MODULE_STATS) += stats.o diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c index bb79ac1a6d8f..8a5d6d63b06c 100644 --- a/kernel/module/decompress.c +++ b/kernel/module/decompress.c @@ -257,7 +257,7 @@ static ssize_t module_zstd_decompress(struct load_info *info, do { struct page *page = module_get_next_page(info); - if (!IS_ERR(page)) { + if (IS_ERR(page)) { retval = PTR_ERR(page); goto out; } @@ -267,7 +267,7 @@ static ssize_t module_zstd_decompress(struct load_info *info, zstd_dec.size = PAGE_SIZE; ret = zstd_decompress_stream(dstream, &zstd_dec, &zstd_buf); - kunmap(page); + kunmap_local(zstd_dec.dst); retval = zstd_get_error_code(ret); if (retval) break; @@ -297,6 +297,10 @@ int module_decompress(struct load_info *info, const void *buf, size_t size) ssize_t data_size; int error; +#if defined(CONFIG_MODULE_STATS) + info->compressed_len = size; +#endif + /* * Start with number of pages twice as big as needed for * compressed data. 
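Before the new kernel/module/dups.c file below, a rough editorial sketch of how its two entry points are meant to be wired up, condensed from the kernel/module/kmod.c hunk later in this diff. The helper declarations match the ones added to kernel/module/internal.h; request_module_flow_sketch() and run_modprobe() are stand-in names, the kmod_concurrent_max throttling and error handling are omitted, and in the actual patch the announce step happens inside call_modprobe() itself.

#include <linux/types.h>
#include <linux/umh.h>

/* Declarations as added to kernel/module/internal.h in this series. */
bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret);
void kmod_dup_request_announce(char *module_name, int ret);

/* Stand-in for the static call_modprobe() helper in kmod.c. */
static int run_modprobe(char *module_name, int wait_flag)
{
	return 0;
}

static int request_module_flow_sketch(char *module_name, bool wait)
{
	int ret, dup_ret;

	/* An equivalent request is already in flight: wait for it and reuse its result. */
	if (kmod_dup_request_exists_wait(module_name, wait, &dup_ret))
		return dup_ret;

	/* First request for this name: actually run modprobe ... */
	ret = run_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);

	/* ... then publish the result so queued duplicates can return it too. */
	kmod_dup_request_announce(module_name, ret);
	return ret;
}

The net effect is that racing requests for the same module cost a single modprobe/finit_module() round trip instead of one per caller, which is exactly the waste the MODULE_DEBUG_AUTOLOAD_DUPS help text above is trying to surface.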
diff --git a/kernel/module/dups.c b/kernel/module/dups.c new file mode 100644 index 000000000000..f3d7ea1e96d8 --- /dev/null +++ b/kernel/module/dups.c @@ -0,0 +1,248 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * kmod dups - the kernel module autoloader duplicate suppressor + * + * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org> + */ + +#define pr_fmt(fmt) "module: " fmt + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/binfmts.h> +#include <linux/syscalls.h> +#include <linux/unistd.h> +#include <linux/kmod.h> +#include <linux/slab.h> +#include <linux/completion.h> +#include <linux/cred.h> +#include <linux/file.h> +#include <linux/fdtable.h> +#include <linux/workqueue.h> +#include <linux/security.h> +#include <linux/mount.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/resource.h> +#include <linux/notifier.h> +#include <linux/suspend.h> +#include <linux/rwsem.h> +#include <linux/ptrace.h> +#include <linux/async.h> +#include <linux/uaccess.h> + +#include "internal.h" + +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "module." +static bool enable_dups_trace = IS_ENABLED(CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS_TRACE); +module_param(enable_dups_trace, bool_enable_only, 0644); + +/* + * Protects dup_kmod_reqs list, adds / removals with RCU. + */ +static DEFINE_MUTEX(kmod_dup_mutex); +static LIST_HEAD(dup_kmod_reqs); + +struct kmod_dup_req { + struct list_head list; + char name[MODULE_NAME_LEN]; + struct completion first_req_done; + struct work_struct complete_work; + struct delayed_work delete_work; + int dup_ret; +}; + +static struct kmod_dup_req *kmod_dup_request_lookup(char *module_name) +{ + struct kmod_dup_req *kmod_req; + + list_for_each_entry_rcu(kmod_req, &dup_kmod_reqs, list, + lockdep_is_held(&kmod_dup_mutex)) { + if (strlen(kmod_req->name) == strlen(module_name) && + !memcmp(kmod_req->name, module_name, strlen(module_name))) { + return kmod_req; + } + } + + return NULL; +} + +static void kmod_dup_request_delete(struct work_struct *work) +{ + struct kmod_dup_req *kmod_req; + kmod_req = container_of(to_delayed_work(work), struct kmod_dup_req, delete_work); + + /* + * The typical situation is a module successfully loaded. In that + * situation the module will be present already in userspace. If + * new requests come in after that, userspace will already know the + * module is loaded so will just return 0 right away. There is still + * a small chance that new request_module() calls may happen right + * after we delete this entry. These heuristics + * are there to protect against finit_module() abuse for auto-loading: if modules + * are still trying to auto-load even if a module is already loaded, + * that's on them, and those inefficiencies should not be fixed by + * kmod. The inefficiencies there are a call to modprobe and modprobe + * just returning 0. + */ + mutex_lock(&kmod_dup_mutex); + list_del_rcu(&kmod_req->list); + synchronize_rcu(); + mutex_unlock(&kmod_dup_mutex); + kfree(kmod_req); +} + +static void kmod_dup_request_complete(struct work_struct *work) +{ + struct kmod_dup_req *kmod_req; + + kmod_req = container_of(work, struct kmod_dup_req, complete_work); + + /* + * This will ensure that the kernel will let all the waiters get + * informed it's time to check the return value. It's time to + * go home. + */ + complete_all(&kmod_req->first_req_done); + + /* + * Now that we have allowed prior request_module() calls to go on + * with life, let's schedule deleting this entry.
We don't have + * to do it right away, but we *eventually* want to do it so as to not + * let this linger forever as this is just a boot optimization for + * possible abuses of vmalloc() incurred by finit_module() thrashing. + */ + queue_delayed_work(system_wq, &kmod_req->delete_work, 60 * HZ); +} + +bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret) +{ + struct kmod_dup_req *kmod_req, *new_kmod_req; + int ret; + + /* + * Pre-allocate the entry in case we have to use it later + * to avoid contention with the mutex. + */ + new_kmod_req = kzalloc(sizeof(*new_kmod_req), GFP_KERNEL); + if (!new_kmod_req) + return false; + + memcpy(new_kmod_req->name, module_name, strlen(module_name)); + INIT_WORK(&new_kmod_req->complete_work, kmod_dup_request_complete); + INIT_DELAYED_WORK(&new_kmod_req->delete_work, kmod_dup_request_delete); + init_completion(&new_kmod_req->first_req_done); + + mutex_lock(&kmod_dup_mutex); + + kmod_req = kmod_dup_request_lookup(module_name); + if (!kmod_req) { + /* + * If the first request that came through for a module + * was with request_module_nowait() we cannot wait for it + * and share its return value with other users which may + * have used request_module() and need a proper return value + * so just skip using them as an anchor. + * + * If a prior request to this one came through with + * request_module() though, then a request_module_nowait() + * would benefit from duplicate detection. + */ + if (!wait) { + kfree(new_kmod_req); + pr_debug("New request_module_nowait() for %s -- cannot track duplicates for this request\n", module_name); + mutex_unlock(&kmod_dup_mutex); + return false; + } + + /* + * There was no duplicate, just add the request so we can + * keep tabs on duplicates later. + */ + pr_debug("New request_module() for %s\n", module_name); + list_add_rcu(&new_kmod_req->list, &dup_kmod_reqs); + mutex_unlock(&kmod_dup_mutex); + return false; + } + mutex_unlock(&kmod_dup_mutex); + + /* We are dealing with a duplicate request now */ + kfree(new_kmod_req); + + /* + * To fix these, try to use try_then_request_module() instead as that + * will check if the component you are looking for is present or not. + * You could also just queue a single request to load the module once, + * instead of having each and every thing you need try to request + * the module. + * + * Duplicate request_module() calls can cause quite a bit of wasted + * vmalloc() space when racing with userspace. + */ + if (enable_dups_trace) + WARN(1, "module-autoload: duplicate request for module %s\n", module_name); + else + pr_warn("module-autoload: duplicate request for module %s\n", module_name); + + if (!wait) { + /* + * If request_module_nowait() was used then the user just + * wanted to issue the request and if another module request + * was already on its way with the same name we don't care for + * the return value either. Let duplicate request_module_nowait() + * calls bail out right away. + */ + *dup_ret = 0; + return true; + } + + /* + * If a duplicate request_module() was used they *may* care for + * the return value, so we have no other option but to wait for + * the first caller to complete. If the first caller used + * the request_module_nowait() call, subsequent callers will + * deal with the compromise of getting a successful call with this + * optimization enabled ...
+ */ + ret = wait_for_completion_state(&kmod_req->first_req_done, + TASK_UNINTERRUPTIBLE | TASK_KILLABLE); + if (ret) { + *dup_ret = ret; + return true; + } + + /* Now the duplicate request has the same exact return value as the first request */ + *dup_ret = kmod_req->dup_ret; + + return true; +} + +void kmod_dup_request_announce(char *module_name, int ret) +{ + struct kmod_dup_req *kmod_req; + + mutex_lock(&kmod_dup_mutex); + + kmod_req = kmod_dup_request_lookup(module_name); + if (!kmod_req) + goto out; + + kmod_req->dup_ret = ret; + + /* + * If we complete() here we may allow duplicate threads + * to continue before the first one that submitted the + * request. We're in no rush either; given that each and + * every bounce back to userspace is slow, we avoid that + * with a slight delay here. So queue up the completion + * and let duplicates suffer, just wait a tad bit longer. + * There is no rush. But we also don't want to hold the + * caller up forever or introduce any boot delays. + */ + queue_work(system_wq, &kmod_req->complete_work); + +out: + mutex_unlock(&kmod_dup_mutex); +} diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 1c877561a7d2..dc7b0160c480 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -3,6 +3,7 @@ * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) + * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org> */ #include <linux/elf.h> @@ -17,27 +18,19 @@ #define ARCH_SHF_SMALL 0 #endif -/* If this is set, the section belongs in the init part of the module */ -#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG - 1)) -/* Maximum number of characters written by module_flags() */ -#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4) - -#ifndef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC -#define data_layout core_layout -#endif - /* - * Modules' sections will be aligned on page boundaries - * to ensure complete separation of code and data, but - * only when CONFIG_STRICT_MODULE_RWX=y + * Use highest 4 bits of sh_entsize to store the mod_mem_type of this + * section. This leaves 28 bits for offset on 32-bit systems, which is + * about 256 MiB (WARN_ON_ONCE if we exceed that).
*/ -static inline unsigned int strict_align(unsigned int size) -{ - if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) - return PAGE_ALIGN(size); - else - return size; -} + +#define SH_ENTSIZE_TYPE_BITS 4 +#define SH_ENTSIZE_TYPE_SHIFT (BITS_PER_LONG - SH_ENTSIZE_TYPE_BITS) +#define SH_ENTSIZE_TYPE_MASK ((1UL << SH_ENTSIZE_TYPE_BITS) - 1) +#define SH_ENTSIZE_OFFSET_MASK ((1UL << (BITS_PER_LONG - SH_ENTSIZE_TYPE_BITS)) - 1) + +/* Maximum number of characters written by module_flags() */ +#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4) extern struct mutex module_mutex; extern struct list_head modules; @@ -53,7 +46,6 @@ extern const struct kernel_symbol __stop___ksymtab_gpl[]; extern const s32 __start___kcrctab[]; extern const s32 __start___kcrctab_gpl[]; -#include <linux/dynamic_debug.h> struct load_info { const char *name; /* pointer to module in temporary copy, freed at end of load_module() */ @@ -63,12 +55,14 @@ struct load_info { Elf_Shdr *sechdrs; char *secstrings, *strtab; unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs; - struct _ddebug_info dyndbg; bool sig_ok; #ifdef CONFIG_KALLSYMS unsigned long mod_kallsyms_init_off; #endif #ifdef CONFIG_MODULE_DECOMPRESS +#ifdef CONFIG_MODULE_STATS + unsigned long compressed_len; +#endif struct page **pages; unsigned int max_pages; unsigned int used_pages; @@ -101,11 +95,16 @@ int try_to_force_load(struct module *mod, const char *reason); bool find_symbol(struct find_symbol_arg *fsa); struct module *find_module_all(const char *name, size_t len, bool even_unformed); int cmp_name(const void *name, const void *sym); -long module_get_offset(struct module *mod, unsigned int *size, Elf_Shdr *sechdr, - unsigned int section); +long module_get_offset_and_type(struct module *mod, enum mod_mem_type type, + Elf_Shdr *sechdr, unsigned int section); char *module_flags(struct module *mod, char *buf, bool show_state); size_t module_flags_taint(unsigned long taints, char *buf); +char *module_next_tag_pair(char *string, unsigned long *secsize); + +#define for_each_modinfo_entry(entry, info, name) \ + for (entry = get_modinfo(info, name); entry; entry = get_next_modinfo(info, name, entry)) + static inline void module_assert_mutex_or_preempt(void) { #ifdef CONFIG_LOCKDEP @@ -148,6 +147,95 @@ static inline bool set_livepatch_module(struct module *mod) #endif } +/** + * enum fail_dup_mod_reason - state at which a duplicate module was detected + * + * @FAIL_DUP_MOD_BECOMING: the module is read properly, passes all checks but + * we've determined that another module with the same name is already loaded + * or being processed on our &modules list. This happens on early_mod_check() + * right before layout_and_allocate(). The kernel would have already + * vmalloc()'d space for the entire module through finit_module(). If + * decompression was used two vmap() spaces were used. These failures can + * happen when userspace has not seen the module present on the kernel and + * tries to load the module multiple times at same time. + * @FAIL_DUP_MOD_LOAD: the module has been read properly, passes all validation + * checks and the kernel determines that the module was unique and because + * of this allocated yet another private kernel copy of the module space in + * layout_and_allocate() but after this determined in add_unformed_module() + * that another module with the same name is already loaded or being processed. + * These failures should be mitigated as much as possible and are indicative + * of really fast races in loading modules. 
Without module decompression + * they waste twice as much vmap space. With module decompression three + * times the module's size vmap space is wasted. + */ +enum fail_dup_mod_reason { + FAIL_DUP_MOD_BECOMING = 0, + FAIL_DUP_MOD_LOAD, +}; + +#ifdef CONFIG_MODULE_DEBUGFS +extern struct dentry *mod_debugfs_root; +#endif + +#ifdef CONFIG_MODULE_STATS + +#define mod_stat_add_long(count, var) atomic_long_add(count, var) +#define mod_stat_inc(name) atomic_inc(name) + +extern atomic_long_t total_mod_size; +extern atomic_long_t total_text_size; +extern atomic_long_t invalid_kread_bytes; +extern atomic_long_t invalid_decompress_bytes; + +extern atomic_t modcount; +extern atomic_t failed_kreads; +extern atomic_t failed_decompress; +struct mod_fail_load { + struct list_head list; + char name[MODULE_NAME_LEN]; + atomic_long_t count; + unsigned long dup_fail_mask; +}; + +int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason); +void mod_stat_bump_invalid(struct load_info *info, int flags); +void mod_stat_bump_becoming(struct load_info *info, int flags); + +#else + +#define mod_stat_add_long(name, var) +#define mod_stat_inc(name) + +static inline int try_add_failed_module(const char *name, + enum fail_dup_mod_reason reason) +{ + return 0; +} + +static inline void mod_stat_bump_invalid(struct load_info *info, int flags) +{ +} + +static inline void mod_stat_bump_becoming(struct load_info *info, int flags) +{ +} + +#endif /* CONFIG_MODULE_STATS */ + +#ifdef CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS +bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret); +void kmod_dup_request_announce(char *module_name, int ret); +#else +static inline bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret) +{ + return false; +} + +static inline void kmod_dup_request_announce(char *module_name, int ret) +{ +} +#endif + #ifdef CONFIG_MODULE_UNLOAD_TAINT_TRACKING struct mod_unload_taint { struct list_head list; @@ -190,10 +278,13 @@ struct mod_tree_root { #endif unsigned long addr_min; unsigned long addr_max; +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + unsigned long data_addr_min; + unsigned long data_addr_max; +#endif }; extern struct mod_tree_root mod_tree; -extern struct mod_tree_root mod_data_tree; #ifdef CONFIG_MODULES_TREE_LOOKUP void mod_tree_insert(struct module *mod); @@ -224,7 +315,6 @@ void module_enable_nx(const struct module *mod); void module_enable_x(const struct module *mod); int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, char *secstrings, struct module *mod); -bool module_check_misalignment(const struct module *mod); #ifdef CONFIG_MODULE_SIG int module_sig_check(struct load_info *info, int flags); diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index bdc911dbcde5..c550d7d45f2f 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -6,6 +6,7 @@ */ #include <linux/module.h> +#include <linux/module_symbol.h> #include <linux/kallsyms.h> #include <linux/buildid.h> #include <linux/bsearch.h> @@ -78,6 +79,7 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, unsigned int shnum, unsigned int pcpundx) { const Elf_Shdr *sec; + enum mod_mem_type type; if (src->st_shndx == SHN_UNDEF || src->st_shndx >= shnum || @@ -90,11 +92,12 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, #endif sec = sechdrs + src->st_shndx; + type = sec->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT; if (!(sec->sh_flags & SHF_ALLOC) #ifndef CONFIG_KALLSYMS_ALL || !(sec->sh_flags & 
SHF_EXECINSTR) #endif - || (sec->sh_entsize & INIT_OFFSET_MASK)) + || mod_mem_type_is_init(type)) return false; return true; @@ -113,11 +116,13 @@ void layout_symtab(struct module *mod, struct load_info *info) Elf_Shdr *strsect = info->sechdrs + info->index.str; const Elf_Sym *src; unsigned int i, nsrc, ndst, strtab_size = 0; + struct module_memory *mod_mem_data = &mod->mem[MOD_DATA]; + struct module_memory *mod_mem_init_data = &mod->mem[MOD_INIT_DATA]; /* Put symbol section at end of init part of module. */ symsect->sh_flags |= SHF_ALLOC; - symsect->sh_entsize = module_get_offset(mod, &mod->init_layout.size, symsect, - info->index.sym) | INIT_OFFSET_MASK; + symsect->sh_entsize = module_get_offset_and_type(mod, MOD_INIT_DATA, + symsect, info->index.sym); pr_debug("\t%s\n", info->secstrings + symsect->sh_name); src = (void *)info->hdr + symsect->sh_offset; @@ -134,28 +139,27 @@ void layout_symtab(struct module *mod, struct load_info *info) } /* Append room for core symbols at end of core part. */ - info->symoffs = ALIGN(mod->data_layout.size, symsect->sh_addralign ?: 1); - info->stroffs = mod->data_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); - mod->data_layout.size += strtab_size; + info->symoffs = ALIGN(mod_mem_data->size, symsect->sh_addralign ?: 1); + info->stroffs = mod_mem_data->size = info->symoffs + ndst * sizeof(Elf_Sym); + mod_mem_data->size += strtab_size; /* Note add_kallsyms() computes strtab_size as core_typeoffs - stroffs */ - info->core_typeoffs = mod->data_layout.size; - mod->data_layout.size += ndst * sizeof(char); - mod->data_layout.size = strict_align(mod->data_layout.size); + info->core_typeoffs = mod_mem_data->size; + mod_mem_data->size += ndst * sizeof(char); /* Put string table section at end of init part of module. */ strsect->sh_flags |= SHF_ALLOC; - strsect->sh_entsize = module_get_offset(mod, &mod->init_layout.size, strsect, - info->index.str) | INIT_OFFSET_MASK; + strsect->sh_entsize = module_get_offset_and_type(mod, MOD_INIT_DATA, + strsect, info->index.str); pr_debug("\t%s\n", info->secstrings + strsect->sh_name); /* We'll tack temporary mod_kallsyms on the end. */ - mod->init_layout.size = ALIGN(mod->init_layout.size, - __alignof__(struct mod_kallsyms)); - info->mod_kallsyms_init_off = mod->init_layout.size; - mod->init_layout.size += sizeof(struct mod_kallsyms); - info->init_typeoffs = mod->init_layout.size; - mod->init_layout.size += nsrc * sizeof(char); - mod->init_layout.size = strict_align(mod->init_layout.size); + mod_mem_init_data->size = ALIGN(mod_mem_init_data->size, + __alignof__(struct mod_kallsyms)); + info->mod_kallsyms_init_off = mod_mem_init_data->size; + + mod_mem_init_data->size += sizeof(struct mod_kallsyms); + info->init_typeoffs = mod_mem_init_data->size; + mod_mem_init_data->size += nsrc * sizeof(char); } /* @@ -171,9 +175,11 @@ void add_kallsyms(struct module *mod, const struct load_info *info) char *s; Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; unsigned long strtab_size; + void *data_base = mod->mem[MOD_DATA].base; + void *init_data_base = mod->mem[MOD_INIT_DATA].base; /* Set up to point into init section. */ - mod->kallsyms = (void __rcu *)mod->init_layout.base + + mod->kallsyms = (void __rcu *)init_data_base + info->mod_kallsyms_init_off; rcu_read_lock(); @@ -183,15 +189,15 @@ void add_kallsyms(struct module *mod, const struct load_info *info) /* Make sure we get permanent strtab: don't use info->strtab. 
*/ rcu_dereference(mod->kallsyms)->strtab = (void *)info->sechdrs[info->index.str].sh_addr; - rcu_dereference(mod->kallsyms)->typetab = mod->init_layout.base + info->init_typeoffs; + rcu_dereference(mod->kallsyms)->typetab = init_data_base + info->init_typeoffs; /* * Now populate the cut down core kallsyms for after init * and set types up while we still have access to sections. */ - mod->core_kallsyms.symtab = dst = mod->data_layout.base + info->symoffs; - mod->core_kallsyms.strtab = s = mod->data_layout.base + info->stroffs; - mod->core_kallsyms.typetab = mod->data_layout.base + info->core_typeoffs; + mod->core_kallsyms.symtab = dst = data_base + info->symoffs; + mod->core_kallsyms.strtab = s = data_base + info->stroffs; + mod->core_kallsyms.typetab = data_base + info->core_typeoffs; strtab_size = info->core_typeoffs - info->stroffs; src = rcu_dereference(mod->kallsyms)->symtab; for (ndst = i = 0; i < rcu_dereference(mod->kallsyms)->num_symtab; i++) { @@ -238,18 +244,6 @@ void init_build_id(struct module *mod, const struct load_info *info) } #endif -/* - * This ignores the intensely annoying "mapping symbols" found - * in ARM ELF files: $a, $t and $d. - */ -static inline int is_arm_mapping_symbol(const char *str) -{ - if (str[0] == '.' && str[1] == 'L') - return true; - return str[0] == '$' && strchr("axtd", str[1]) && - (str[2] == '\0' || str[2] == '.'); -} - static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum) { return kallsyms->strtab + kallsyms->symtab[symnum].st_name; @@ -267,12 +261,15 @@ static const char *find_kallsyms_symbol(struct module *mod, unsigned int i, best = 0; unsigned long nextval, bestval; struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); + struct module_memory *mod_mem; /* At worse, next value is at end of module */ if (within_module_init(addr, mod)) - nextval = (unsigned long)mod->init_layout.base + mod->init_layout.text_size; + mod_mem = &mod->mem[MOD_INIT_TEXT]; else - nextval = (unsigned long)mod->core_layout.base + mod->core_layout.text_size; + mod_mem = &mod->mem[MOD_TEXT]; + + nextval = (unsigned long)mod_mem->base + mod_mem->size; bestval = kallsyms_symbol_value(&kallsyms->symtab[best]); @@ -292,7 +289,7 @@ static const char *find_kallsyms_symbol(struct module *mod, * and inserted at a whim. 
*/ if (*kallsyms_symbol_name(kallsyms, i) == '\0' || - is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i))) + is_mapping_symbol(kallsyms_symbol_name(kallsyms, i))) continue; if (thisval <= addr && thisval > bestval) { @@ -505,8 +502,7 @@ unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name) } int module_kallsyms_on_each_symbol(const char *modname, - int (*fn)(void *, const char *, - struct module *, unsigned long), + int (*fn)(void *, const char *, unsigned long), void *data) { struct module *mod; @@ -535,7 +531,7 @@ int module_kallsyms_on_each_symbol(const char *modname, continue; ret = fn(data, kallsyms_symbol_name(kallsyms, i), - mod, kallsyms_symbol_value(sym)); + kallsyms_symbol_value(sym)); if (ret != 0) goto out; } diff --git a/kernel/module/kdb.c b/kernel/module/kdb.c index f4317f92e189..995c32d3698f 100644 --- a/kernel/module/kdb.c +++ b/kernel/module/kdb.c @@ -26,10 +26,11 @@ int kdb_lsmod(int argc, const char **argv) if (mod->state == MODULE_STATE_UNFORMED) continue; - kdb_printf("%-20s%8u", mod->name, mod->core_layout.size); -#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC - kdb_printf("/%8u", mod->data_layout.size); -#endif + kdb_printf("%-20s%8u", mod->name, mod->mem[MOD_TEXT].size); + kdb_printf("/%8u", mod->mem[MOD_RODATA].size); + kdb_printf("/%8u", mod->mem[MOD_RO_AFTER_INIT].size); + kdb_printf("/%8u", mod->mem[MOD_DATA].size); + kdb_printf(" 0x%px ", (void *)mod); #ifdef CONFIG_MODULE_UNLOAD kdb_printf("%4d ", module_refcount(mod)); @@ -40,10 +41,10 @@ int kdb_lsmod(int argc, const char **argv) kdb_printf(" (Loading)"); else kdb_printf(" (Live)"); - kdb_printf(" 0x%px", mod->core_layout.base); -#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC - kdb_printf("/0x%px", mod->data_layout.base); -#endif + kdb_printf(" 0x%px", mod->mem[MOD_TEXT].base); + kdb_printf("/0x%px", mod->mem[MOD_RODATA].base); + kdb_printf("/0x%px", mod->mem[MOD_RO_AFTER_INIT].base); + kdb_printf("/0x%px", mod->mem[MOD_DATA].base); #ifdef CONFIG_MODULE_UNLOAD { diff --git a/kernel/kmod.c b/kernel/module/kmod.c index b717134ebe17..0800d9891692 100644 --- a/kernel/kmod.c +++ b/kernel/module/kmod.c @@ -1,6 +1,9 @@ /* * kmod - the kernel module loader + * + * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org> */ + #include <linux/module.h> #include <linux/sched.h> #include <linux/sched/task.h> @@ -27,6 +30,7 @@ #include <linux/uaccess.h> #include <trace/events/module.h> +#include "internal.h" /* * Assuming: @@ -40,8 +44,7 @@ * effect. Systems like these are very unlikely if modules are enabled. 
*/ #define MAX_KMOD_CONCURRENT 50 -static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); -static DECLARE_WAIT_QUEUE_HEAD(kmod_wq); +static DEFINE_SEMAPHORE(kmod_concurrent_max, MAX_KMOD_CONCURRENT); /* * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads @@ -66,7 +69,7 @@ static void free_modprobe_argv(struct subprocess_info *info) kfree(info->argv); } -static int call_modprobe(char *module_name, int wait) +static int call_modprobe(char *orig_module_name, int wait) { struct subprocess_info *info; static char *envp[] = { @@ -75,12 +78,14 @@ static int call_modprobe(char *module_name, int wait) "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; + char *module_name; + int ret; char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL); if (!argv) goto out; - module_name = kstrdup(module_name, GFP_KERNEL); + module_name = kstrdup(orig_module_name, GFP_KERNEL); if (!module_name) goto free_argv; @@ -95,13 +100,16 @@ static int call_modprobe(char *module_name, int wait) if (!info) goto free_module_name; - return call_usermodehelper_exec(info, wait | UMH_KILLABLE); + ret = call_usermodehelper_exec(info, wait | UMH_KILLABLE); + kmod_dup_request_announce(orig_module_name, ret); + return ret; free_module_name: kfree(module_name); free_argv: kfree(argv); out: + kmod_dup_request_announce(orig_module_name, -ENOMEM); return -ENOMEM; } @@ -125,7 +133,7 @@ int __request_module(bool wait, const char *fmt, ...) { va_list args; char module_name[MODULE_NAME_LEN]; - int ret; + int ret, dup_ret; /* * We don't allow synchronous module loading from async. Module @@ -148,29 +156,24 @@ int __request_module(bool wait, const char *fmt, ...) if (ret) return ret; - if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) { - pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...", - atomic_read(&kmod_concurrent_max), - MAX_KMOD_CONCURRENT, module_name); - ret = wait_event_killable_timeout(kmod_wq, - atomic_dec_if_positive(&kmod_concurrent_max) >= 0, - MAX_KMOD_ALL_BUSY_TIMEOUT * HZ); - if (!ret) { - pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now", - module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT); - return -ETIME; - } else if (ret == -ERESTARTSYS) { - pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name); - return ret; - } + ret = down_timeout(&kmod_concurrent_max, MAX_KMOD_ALL_BUSY_TIMEOUT * HZ); + if (ret) { + pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now", + module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT); + return ret; } trace_module_request(module_name, wait, _RET_IP_); + if (kmod_dup_request_exists_wait(module_name, wait, &dup_ret)) { + ret = dup_ret; + goto out; + } + ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); - atomic_inc(&kmod_concurrent_max); - wake_up(&kmod_wq); +out: + up(&kmod_concurrent_max); return ret; } diff --git a/kernel/module/main.c b/kernel/module/main.c index d3be89de706d..4e2cf784cf8c 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2002 Richard Henderson * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. 
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org> */ #define INCLUDE_VERMAGIC @@ -55,6 +56,7 @@ #include <linux/dynamic_debug.h> #include <linux/audit.h> #include <linux/cfi.h> +#include <linux/debugfs.h> #include <uapi/linux/module.h> #include "internal.h" @@ -80,12 +82,6 @@ struct mod_tree_root mod_tree __cacheline_aligned = { .addr_min = -1UL, }; -#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC -struct mod_tree_root mod_data_tree __cacheline_aligned = { - .addr_min = -1UL, -}; -#endif - struct symsearch { const struct kernel_symbol *start, *stop; const s32 *crcs; @@ -93,14 +89,24 @@ struct symsearch { }; /* - * Bounds of module text, for speeding up __module_address. + * Bounds of module memory, for speeding up __module_address. * Protected by module_mutex. */ -static void __mod_update_bounds(void *base, unsigned int size, struct mod_tree_root *tree) +static void __mod_update_bounds(enum mod_mem_type type __maybe_unused, void *base, + unsigned int size, struct mod_tree_root *tree) { unsigned long min = (unsigned long)base; unsigned long max = min + size; +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + if (mod_mem_type_is_core_data(type)) { + if (min < tree->data_addr_min) + tree->data_addr_min = min; + if (max > tree->data_addr_max) + tree->data_addr_max = max; + return; + } +#endif if (min < tree->addr_min) tree->addr_min = min; if (max > tree->addr_max) @@ -109,12 +115,12 @@ static void __mod_update_bounds(void *base, unsigned int size, struct mod_tree_r static void mod_update_bounds(struct module *mod) { - __mod_update_bounds(mod->core_layout.base, mod->core_layout.size, &mod_tree); - if (mod->init_layout.size) - __mod_update_bounds(mod->init_layout.base, mod->init_layout.size, &mod_tree); -#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC - __mod_update_bounds(mod->data_layout.base, mod->data_layout.size, &mod_data_tree); -#endif + for_each_mod_mem_type(type) { + struct module_memory *mod_mem = &mod->mem[type]; + + if (mod_mem->size) + __mod_update_bounds(type, mod_mem->base, mod_mem->size, &mod_tree); + } } /* Block module loading/unloading? 
*/ @@ -559,10 +565,8 @@ static int already_uses(struct module *a, struct module *b) struct module_use *use; list_for_each_entry(use, &b->source_list, source_list) { - if (use->source == a) { - pr_debug("%s uses %s!\n", a->name, b->name); + if (use->source == a) return 1; - } } pr_debug("%s does not use %s!\n", a->name, b->name); return 0; @@ -926,7 +930,13 @@ struct module_attribute module_uevent = static ssize_t show_coresize(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%u\n", mk->mod->core_layout.size); + unsigned int size = mk->mod->mem[MOD_TEXT].size; + + if (!IS_ENABLED(CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC)) { + for_class_mod_mem_type(type, core_data) + size += mk->mod->mem[type].size; + } + return sprintf(buffer, "%u\n", size); } static struct module_attribute modinfo_coresize = @@ -936,7 +946,11 @@ static struct module_attribute modinfo_coresize = static ssize_t show_datasize(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%u\n", mk->mod->data_layout.size); + unsigned int size = 0; + + for_class_mod_mem_type(type, core_data) + size += mk->mod->mem[type].size; + return sprintf(buffer, "%u\n", size); } static struct module_attribute modinfo_datasize = @@ -946,7 +960,11 @@ static struct module_attribute modinfo_datasize = static ssize_t show_initsize(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%u\n", mk->mod->init_layout.size); + unsigned int size = 0; + + for_class_mod_mem_type(type, init) + size += mk->mod->mem[type].size; + return sprintf(buffer, "%u\n", size); } static struct module_attribute modinfo_initsize = @@ -998,9 +1016,55 @@ int try_to_force_load(struct module *mod, const char *reason) #endif } -static char *get_modinfo(const struct load_info *info, const char *tag); +/* Parse tag=value strings from .modinfo section */ +char *module_next_tag_pair(char *string, unsigned long *secsize) +{ + /* Skip non-zero chars */ + while (string[0]) { + string++; + if ((*secsize)-- <= 1) + return NULL; + } + + /* Skip any zero padding. */ + while (!string[0]) { + string++; + if ((*secsize)-- <= 1) + return NULL; + } + return string; +} + static char *get_next_modinfo(const struct load_info *info, const char *tag, - char *prev); + char *prev) +{ + char *p; + unsigned int taglen = strlen(tag); + Elf_Shdr *infosec = &info->sechdrs[info->index.info]; + unsigned long size = infosec->sh_size; + + /* + * get_modinfo() calls made before rewrite_section_headers() + * must use sh_offset, as sh_addr isn't set! 
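module_next_tag_pair() above walks the .modinfo payload, which is a run of NUL-terminated "tag=value" records possibly followed by zero padding. A self-contained userspace sketch of the same scan over a small in-memory blob; the buffer contents and the next_tag_pair() name are illustrative:

#include <stdio.h>

/*
 * Skip the current NUL-terminated record, then skip any zero padding,
 * mirroring module_next_tag_pair() above. Returns NULL once the section
 * size is exhausted.
 */
static char *next_tag_pair(char *string, unsigned long *secsize)
{
	while (string[0]) {		/* skip non-zero chars */
		string++;
		if ((*secsize)-- <= 1)
			return NULL;
	}
	while (!string[0]) {		/* skip any zero padding */
		string++;
		if ((*secsize)-- <= 1)
			return NULL;
	}
	return string;
}

int main(void)
{
	/* Example payload: two records separated by a NUL, as in .modinfo. */
	char blob[] = "license=GPL\0author=Example Author";
	unsigned long size = sizeof(blob);
	char *p = blob;

	do {
		printf("%s\n", p);
		p = next_tag_pair(p, &size);
	} while (p);

	return 0;
}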
+ */ + char *modinfo = (char *)info->hdr + infosec->sh_offset; + + if (prev) { + size -= prev - modinfo; + modinfo = module_next_tag_pair(prev, &size); + } + + for (p = modinfo; p; p = module_next_tag_pair(p, &size)) { + if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') + return p + taglen + 1; + } + return NULL; +} + +static char *get_modinfo(const struct load_info *info, const char *tag) +{ + return get_next_modinfo(info, tag, NULL); +} static int verify_namespace_is_imported(const struct load_info *info, const struct kernel_symbol *sym, @@ -1011,12 +1075,9 @@ static int verify_namespace_is_imported(const struct load_info *info, namespace = kernel_symbol_namespace(sym); if (namespace && namespace[0]) { - imported_namespace = get_modinfo(info, "import_ns"); - while (imported_namespace) { + for_each_modinfo_entry(imported_namespace, info, "import_ns") { if (strcmp(namespace, imported_namespace) == 0) return 0; - imported_namespace = get_next_modinfo( - info, "import_ns", imported_namespace); } #ifdef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS pr_warn( @@ -1143,6 +1204,46 @@ void __weak module_arch_freeing_init(struct module *mod) { } +static bool mod_mem_use_vmalloc(enum mod_mem_type type) +{ + return IS_ENABLED(CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC) && + mod_mem_type_is_core_data(type); +} + +static void *module_memory_alloc(unsigned int size, enum mod_mem_type type) +{ + if (mod_mem_use_vmalloc(type)) + return vzalloc(size); + return module_alloc(size); +} + +static void module_memory_free(void *ptr, enum mod_mem_type type) +{ + if (mod_mem_use_vmalloc(type)) + vfree(ptr); + else + module_memfree(ptr); +} + +static void free_mod_mem(struct module *mod) +{ + for_each_mod_mem_type(type) { + struct module_memory *mod_mem = &mod->mem[type]; + + if (type == MOD_DATA) + continue; + + /* Free lock-classes; relies on the preceding sync_rcu(). */ + lockdep_free_key_range(mod_mem->base, mod_mem->size); + if (mod_mem->size) + module_memory_free(mod_mem->base, type); + } + + /* MOD_DATA hosts mod, so free it at last */ + lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size); + module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA); +} + /* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { @@ -1158,9 +1259,6 @@ static void free_module(struct module *mod) mod->state = MODULE_STATE_UNFORMED; mutex_unlock(&module_mutex); - /* Remove dynamic debug info */ - ddebug_remove_module(mod->name); - /* Arch-specific cleanup. */ module_arch_cleanup(mod); @@ -1189,18 +1287,10 @@ static void free_module(struct module *mod) /* This may be empty, but that's OK */ module_arch_freeing_init(mod); - module_memfree(mod->init_layout.base); kfree(mod->args); percpu_modfree(mod); - /* Free lock-classes; relies on the preceding sync_rcu(). 
*/ - lockdep_free_key_range(mod->data_layout.base, mod->data_layout.size); - - /* Finally, free the core (containing the module structure) */ - module_memfree(mod->core_layout.base); -#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC - vfree(mod->data_layout.base); -#endif + free_mod_mem(mod); } void *__symbol_get(const char *symbol) @@ -1303,8 +1393,8 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) case SHN_ABS: /* Don't need to do anything */ - pr_debug("Absolute symbol: 0x%08lx\n", - (long)sym[i].st_value); + pr_debug("Absolute symbol: 0x%08lx %s\n", + (long)sym[i].st_value, name); break; case SHN_LIVEPATCH: @@ -1387,16 +1477,18 @@ unsigned int __weak arch_mod_section_prepend(struct module *mod, return 0; } -/* Update size with this section: return offset. */ -long module_get_offset(struct module *mod, unsigned int *size, - Elf_Shdr *sechdr, unsigned int section) +long module_get_offset_and_type(struct module *mod, enum mod_mem_type type, + Elf_Shdr *sechdr, unsigned int section) { - long ret; + long offset; + long mask = ((unsigned long)(type) & SH_ENTSIZE_TYPE_MASK) << SH_ENTSIZE_TYPE_SHIFT; - *size += arch_mod_section_prepend(mod, section); - ret = ALIGN(*size, sechdr->sh_addralign ?: 1); - *size = ret + sechdr->sh_size; - return ret; + mod->mem[type].size += arch_mod_section_prepend(mod, section); + offset = ALIGN(mod->mem[type].size, sechdr->sh_addralign ?: 1); + mod->mem[type].size = offset + sechdr->sh_size; + + WARN_ON_ONCE(offset & mask); + return offset | mask; } static bool module_init_layout_section(const char *sname) @@ -1408,15 +1500,11 @@ static bool module_init_layout_section(const char *sname) return module_init_section(sname); } -/* - * Lay out the SHF_ALLOC sections in a way not dissimilar to how ld - * might -- code, read-only data, read-write data, small data. Tally - * sizes, and place the offsets into sh_entsize fields: high bit means it - * belongs in init. - */ -static void layout_sections(struct module *mod, struct load_info *info) +static void __layout_sections(struct module *mod, struct load_info *info, bool is_init) { - static unsigned long const masks[][2] = { + unsigned int m, i; + + static const unsigned long masks[][2] = { /* * NOTE: all executable code must be the first section * in this array; otherwise modify the text_size @@ -1428,85 +1516,64 @@ static void layout_sections(struct module *mod, struct load_info *info) { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL }, { ARCH_SHF_SMALL | SHF_ALLOC, 0 } }; - unsigned int m, i; - - for (i = 0; i < info->hdr->e_shnum; i++) - info->sechdrs[i].sh_entsize = ~0UL; + static const int core_m_to_mem_type[] = { + MOD_TEXT, + MOD_RODATA, + MOD_RO_AFTER_INIT, + MOD_DATA, + MOD_DATA, + }; + static const int init_m_to_mem_type[] = { + MOD_INIT_TEXT, + MOD_INIT_RODATA, + MOD_INVALID, + MOD_INIT_DATA, + MOD_INIT_DATA, + }; - pr_debug("Core section allocation order:\n"); for (m = 0; m < ARRAY_SIZE(masks); ++m) { + enum mod_mem_type type = is_init ? init_m_to_mem_type[m] : core_m_to_mem_type[m]; + for (i = 0; i < info->hdr->e_shnum; ++i) { Elf_Shdr *s = &info->sechdrs[i]; const char *sname = info->secstrings + s->sh_name; - unsigned int *sizep; if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || module_init_layout_section(sname)) + || is_init != module_init_layout_section(sname)) continue; - sizep = m ? 
&mod->data_layout.size : &mod->core_layout.size; - s->sh_entsize = module_get_offset(mod, sizep, s, i); - pr_debug("\t%s\n", sname); - } - switch (m) { - case 0: /* executable */ - mod->core_layout.size = strict_align(mod->core_layout.size); - mod->core_layout.text_size = mod->core_layout.size; - break; - case 1: /* RO: text and ro-data */ - mod->data_layout.size = strict_align(mod->data_layout.size); - mod->data_layout.ro_size = mod->data_layout.size; - break; - case 2: /* RO after init */ - mod->data_layout.size = strict_align(mod->data_layout.size); - mod->data_layout.ro_after_init_size = mod->data_layout.size; - break; - case 4: /* whole core */ - mod->data_layout.size = strict_align(mod->data_layout.size); - break; - } - } - - pr_debug("Init section allocation order:\n"); - for (m = 0; m < ARRAY_SIZE(masks); ++m) { - for (i = 0; i < info->hdr->e_shnum; ++i) { - Elf_Shdr *s = &info->sechdrs[i]; - const char *sname = info->secstrings + s->sh_name; - if ((s->sh_flags & masks[m][0]) != masks[m][0] - || (s->sh_flags & masks[m][1]) - || s->sh_entsize != ~0UL - || !module_init_layout_section(sname)) + if (WARN_ON_ONCE(type == MOD_INVALID)) continue; - s->sh_entsize = (module_get_offset(mod, &mod->init_layout.size, s, i) - | INIT_OFFSET_MASK); + + s->sh_entsize = module_get_offset_and_type(mod, type, s, i); pr_debug("\t%s\n", sname); } - switch (m) { - case 0: /* executable */ - mod->init_layout.size = strict_align(mod->init_layout.size); - mod->init_layout.text_size = mod->init_layout.size; - break; - case 1: /* RO: text and ro-data */ - mod->init_layout.size = strict_align(mod->init_layout.size); - mod->init_layout.ro_size = mod->init_layout.size; - break; - case 2: - /* - * RO after init doesn't apply to init_layout (only - * core_layout), so it just takes the value of ro_size. - */ - mod->init_layout.ro_after_init_size = mod->init_layout.ro_size; - break; - case 4: /* whole init */ - mod->init_layout.size = strict_align(mod->init_layout.size); - break; - } } } -static void set_license(struct module *mod, const char *license) +/* + * Lay out the SHF_ALLOC sections in a way not dissimilar to how ld + * might -- code, read-only data, read-write data, small data. Tally + * sizes, and place the offsets into sh_entsize fields: high bit means it + * belongs in init. + */ +static void layout_sections(struct module *mod, struct load_info *info) +{ + unsigned int i; + + for (i = 0; i < info->hdr->e_shnum; i++) + info->sechdrs[i].sh_entsize = ~0UL; + + pr_debug("Core section allocation order for %s:\n", mod->name); + __layout_sections(mod, info, false); + + pr_debug("Init section allocation order for %s:\n", mod->name); + __layout_sections(mod, info, true); +} + +static void module_license_taint_check(struct module *mod, const char *license) { if (!license) license = "unspecified"; @@ -1520,56 +1587,6 @@ static void set_license(struct module *mod, const char *license) } } -/* Parse tag=value strings from .modinfo section */ -static char *next_string(char *string, unsigned long *secsize) -{ - /* Skip non-zero chars */ - while (string[0]) { - string++; - if ((*secsize)-- <= 1) - return NULL; - } - - /* Skip any zero padding. 
*/ - while (!string[0]) { - string++; - if ((*secsize)-- <= 1) - return NULL; - } - return string; -} - -static char *get_next_modinfo(const struct load_info *info, const char *tag, - char *prev) -{ - char *p; - unsigned int taglen = strlen(tag); - Elf_Shdr *infosec = &info->sechdrs[info->index.info]; - unsigned long size = infosec->sh_size; - - /* - * get_modinfo() calls made before rewrite_section_headers() - * must use sh_offset, as sh_addr isn't set! - */ - char *modinfo = (char *)info->hdr + infosec->sh_offset; - - if (prev) { - size -= prev - modinfo; - modinfo = next_string(prev, &size); - } - - for (p = modinfo; p; p = next_string(p, &size)) { - if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') - return p + taglen + 1; - } - return NULL; -} - -static char *get_modinfo(const struct load_info *info, const char *tag) -{ - return get_next_modinfo(info, tag, NULL); -} - static void setup_modinfo(struct module *mod, struct load_info *info) { struct module_attribute *attr; @@ -1592,19 +1609,6 @@ static void free_modinfo(struct module *mod) } } -static void dynamic_debug_setup(struct module *mod, struct _ddebug_info *dyndbg) -{ - if (!dyndbg->num_descs) - return; - ddebug_add_module(dyndbg, mod->name); -} - -static void dynamic_debug_remove(struct module *mod, struct _ddebug_info *dyndbg) -{ - if (dyndbg->num_descs) - ddebug_remove_module(mod->name); -} - void * __weak module_alloc(unsigned long size) { return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, @@ -1642,16 +1646,33 @@ static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr) } /* - * Sanity checks against invalid binaries, wrong arch, weird elf version. + * Check userspace passed ELF module against our expectations, and cache + * useful variables for further processing as we go. * - * Also do basic validity checks against section offsets and sizes, the + * This does basic validity checks against section offsets and sizes, the * section name string table, and the indices used for it (sh_name). + * + * As a last step, since we're already checking the ELF sections we cache + * useful variables which will be used later for our convenience: + * + * o pointers to section headers + * o cache the modinfo symbol section + * o cache the string symbol section + * o cache the module section + * + * As a last step we set info->mod to the temporary copy of the module in + * info->hdr. The final one will be allocated in move_module(). Any + * modifications we make to our copy of the module will be carried over + * to the final minted module. 
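The comment above describes how elf_validity_cache_copy(), defined just below, validates the module ELF and caches the .modinfo and .gnu.linkonce.this_module section indices. As a plain userspace illustration of that kind of lookup, the sketch below resolves a section by name through the section header string table of its own binary; 64-bit ELF is assumed and error handling is trimmed:

#include <elf.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>

/* Resolve a section header by name in a mapped 64-bit ELF image. */
static const Elf64_Shdr *find_section(const void *image, const char *wanted)
{
	const Elf64_Ehdr *ehdr = image;
	const Elf64_Shdr *shdrs = (const void *)((const char *)image + ehdr->e_shoff);
	const char *shstrtab = (const char *)image + shdrs[ehdr->e_shstrndx].sh_offset;

	/* Section 0 is the reserved null section, so start at 1. */
	for (unsigned int i = 1; i < ehdr->e_shnum; i++) {
		if (strcmp(shstrtab + shdrs[i].sh_name, wanted) == 0)
			return &shdrs[i];
	}
	return NULL;
}

int main(void)
{
	const Elf64_Shdr *shdr;
	struct stat st;
	void *image;
	int fd;

	fd = open("/proc/self/exe", O_RDONLY);
	if (fd < 0 || fstat(fd, &st) < 0)
		return 1;
	image = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (image == MAP_FAILED)
		return 1;

	shdr = find_section(image, ".text");
	if (shdr)
		printf(".text: %llu bytes, SHF_ALLOC=%d\n",
		       (unsigned long long)shdr->sh_size,
		       !!(shdr->sh_flags & SHF_ALLOC));
	return 0;
}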
*/ -static int elf_validity_check(struct load_info *info) +static int elf_validity_cache_copy(struct load_info *info, int flags) { unsigned int i; Elf_Shdr *shdr, *strhdr; int err; + unsigned int num_mod_secs = 0, mod_idx; + unsigned int num_info_secs = 0, info_idx; + unsigned int num_sym_secs = 0, sym_idx; if (info->len < sizeof(*(info->hdr))) { pr_err("Invalid ELF header len %lu\n", info->len); @@ -1755,6 +1776,8 @@ static int elf_validity_check(struct load_info *info) info->hdr->e_shnum); goto no_exec; } + num_sym_secs++; + sym_idx = i; fallthrough; default: err = validate_section_offset(info, shdr); @@ -1763,6 +1786,15 @@ static int elf_validity_check(struct load_info *info) i, shdr->sh_type); return err; } + if (strcmp(info->secstrings + shdr->sh_name, + ".gnu.linkonce.this_module") == 0) { + num_mod_secs++; + mod_idx = i; + } else if (strcmp(info->secstrings + shdr->sh_name, + ".modinfo") == 0) { + num_info_secs++; + info_idx = i; + } if (shdr->sh_flags & SHF_ALLOC) { if (shdr->sh_name >= strhdr->sh_size) { @@ -1775,6 +1807,91 @@ static int elf_validity_check(struct load_info *info) } } + if (num_info_secs > 1) { + pr_err("Only one .modinfo section must exist.\n"); + goto no_exec; + } else if (num_info_secs == 1) { + /* Try to find a name early so we can log errors with a module name */ + info->index.info = info_idx; + info->name = get_modinfo(info, "name"); + } + + if (num_sym_secs != 1) { + pr_warn("%s: module has no symbols (stripped?)\n", + info->name ?: "(missing .modinfo section or name field)"); + goto no_exec; + } + + /* Sets internal symbols and strings. */ + info->index.sym = sym_idx; + shdr = &info->sechdrs[sym_idx]; + info->index.str = shdr->sh_link; + info->strtab = (char *)info->hdr + info->sechdrs[info->index.str].sh_offset; + + /* + * The ".gnu.linkonce.this_module" ELF section is special. It is + * what modpost uses to refer to __this_module and let's use rely + * on THIS_MODULE to point to &__this_module properly. The kernel's + * modpost declares it on each modules's *.mod.c file. If the struct + * module of the kernel changes a full kernel rebuild is required. + * + * We have a few expectaions for this special section, the following + * code validates all this for us: + * + * o Only one section must exist + * o We expect the kernel to always have to allocate it: SHF_ALLOC + * o The section size must match the kernel's run time's struct module + * size + */ + if (num_mod_secs != 1) { + pr_err("module %s: Only one .gnu.linkonce.this_module section must exist.\n", + info->name ?: "(missing .modinfo section or name field)"); + goto no_exec; + } + + shdr = &info->sechdrs[mod_idx]; + + /* + * This is already implied on the switch above, however let's be + * pedantic about it. + */ + if (shdr->sh_type == SHT_NOBITS) { + pr_err("module %s: .gnu.linkonce.this_module section must have a size set\n", + info->name ?: "(missing .modinfo section or name field)"); + goto no_exec; + } + + if (!(shdr->sh_flags & SHF_ALLOC)) { + pr_err("module %s: .gnu.linkonce.this_module must occupy memory during process execution\n", + info->name ?: "(missing .modinfo section or name field)"); + goto no_exec; + } + + if (shdr->sh_size != sizeof(struct module)) { + pr_err("module %s: .gnu.linkonce.this_module section size must match the kernel's built struct module size at run time\n", + info->name ?: "(missing .modinfo section or name field)"); + goto no_exec; + } + + info->index.mod = mod_idx; + + /* This is temporary: point mod into copy of data. 
*/ + info->mod = (void *)info->hdr + shdr->sh_offset; + + /* + * If we didn't load the .modinfo 'name' field earlier, fall back to + * on-disk struct mod 'name' field. + */ + if (!info->name) + info->name = info->mod->name; + + if (flags & MODULE_INIT_IGNORE_MODVERSIONS) + info->index.vers = 0; /* Pretend no __versions section! */ + else + info->index.vers = find_sec(info, "__versions"); + + info->index.pcpu = find_pcpusec(info); + return 0; no_exec: @@ -1804,12 +1921,8 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info) /* Nothing more to do */ return 0; - if (set_livepatch_module(mod)) { - add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK); - pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n", - mod->name); + if (set_livepatch_module(mod)) return 0; - } pr_err("%s: module is marked as livepatch module, but livepatch support is disabled", mod->name); @@ -1892,63 +2005,71 @@ static int rewrite_section_headers(struct load_info *info, int flags) } /* - * Set up our basic convenience variables (pointers to section headers, - * search for module section index etc), and do some basic section - * verification. - * - * Set info->mod to the temporary copy of the module in info->hdr. The final one - * will be allocated in move_module(). - */ -static int setup_load_info(struct load_info *info, int flags) + * These calls taint the kernel depending certain module circumstances */ +static void module_augment_kernel_taints(struct module *mod, struct load_info *info) { - unsigned int i; + int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE); - /* Try to find a name early so we can log errors with a module name */ - info->index.info = find_sec(info, ".modinfo"); - if (info->index.info) - info->name = get_modinfo(info, "name"); + if (!get_modinfo(info, "intree")) { + if (!test_taint(TAINT_OOT_MODULE)) + pr_warn("%s: loading out-of-tree module taints kernel.\n", + mod->name); + add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK); + } - /* Find internal symbols and strings. */ - for (i = 1; i < info->hdr->e_shnum; i++) { - if (info->sechdrs[i].sh_type == SHT_SYMTAB) { - info->index.sym = i; - info->index.str = info->sechdrs[i].sh_link; - info->strtab = (char *)info->hdr - + info->sechdrs[info->index.str].sh_offset; - break; - } + check_modinfo_retpoline(mod, info); + + if (get_modinfo(info, "staging")) { + add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); + pr_warn("%s: module is from the staging directory, the quality " + "is unknown, you have been warned.\n", mod->name); } - if (info->index.sym == 0) { - pr_warn("%s: module has no symbols (stripped?)\n", - info->name ?: "(missing .modinfo section or name field)"); - return -ENOEXEC; + if (is_livepatch_module(mod)) { + add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK); + pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n", + mod->name); } - info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); - if (!info->index.mod) { - pr_warn("%s: No module found in object\n", - info->name ?: "(missing .modinfo section or name field)"); - return -ENOEXEC; + module_license_taint_check(mod, get_modinfo(info, "license")); + + if (get_modinfo(info, "test")) { + if (!test_taint(TAINT_TEST)) + pr_warn("%s: loading test module taints kernel.\n", + mod->name); + add_taint_module(mod, TAINT_TEST, LOCKDEP_STILL_OK); } - /* This is temporary: point mod into copy of data. 
*/ - info->mod = (void *)info->hdr + info->sechdrs[info->index.mod].sh_offset; +#ifdef CONFIG_MODULE_SIG + mod->sig_ok = info->sig_ok; + if (!mod->sig_ok) { + pr_notice_once("%s: module verification failed: signature " + "and/or required key missing - tainting " + "kernel\n", mod->name); + add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK); + } +#endif /* - * If we didn't load the .modinfo 'name' field earlier, fall back to - * on-disk struct mod 'name' field. + * ndiswrapper is under GPL by itself, but loads proprietary modules. + * Don't use add_taint_module(), as it would prevent ndiswrapper from + * using GPL-only symbols it needs. */ - if (!info->name) - info->name = info->mod->name; + if (strcmp(mod->name, "ndiswrapper") == 0) + add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE); - if (flags & MODULE_INIT_IGNORE_MODVERSIONS) - info->index.vers = 0; /* Pretend no __versions section! */ - else - info->index.vers = find_sec(info, "__versions"); + /* driverloader was caught wrongly pretending to be under GPL */ + if (strcmp(mod->name, "driverloader") == 0) + add_taint_module(mod, TAINT_PROPRIETARY_MODULE, + LOCKDEP_NOW_UNRELIABLE); - info->index.pcpu = find_pcpusec(info); + /* lve claims to be GPL but upstream won't provide source */ + if (strcmp(mod->name, "lve") == 0) + add_taint_module(mod, TAINT_PROPRIETARY_MODULE, + LOCKDEP_NOW_UNRELIABLE); + + if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE)) + pr_warn("%s: module license taints kernel.\n", mod->name); - return 0; } static int check_modinfo(struct module *mod, struct load_info *info, int flags) @@ -1970,35 +2091,10 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) return -ENOEXEC; } - if (!get_modinfo(info, "intree")) { - if (!test_taint(TAINT_OOT_MODULE)) - pr_warn("%s: loading out-of-tree module taints kernel.\n", - mod->name); - add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK); - } - - check_modinfo_retpoline(mod, info); - - if (get_modinfo(info, "staging")) { - add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); - pr_warn("%s: module is from the staging directory, the quality " - "is unknown, you have been warned.\n", mod->name); - } - err = check_modinfo_livepatch(mod, info); if (err) return err; - /* Set up license info based on the info section */ - set_license(mod, get_modinfo(info, "license")); - - if (get_modinfo(info, "test")) { - if (!test_taint(TAINT_TEST)) - pr_warn("%s: loading test module taints kernel.\n", - mod->name); - add_taint_module(mod, TAINT_TEST, LOCKDEP_STILL_OK); - } - return 0; } @@ -2110,10 +2206,14 @@ static int find_module_sections(struct module *mod, struct load_info *info) if (section_addr(info, "__obsparm")) pr_warn("%s: Ignoring obsolete parameters\n", mod->name); - info->dyndbg.descs = section_objs(info, "__dyndbg", - sizeof(*info->dyndbg.descs), &info->dyndbg.num_descs); - info->dyndbg.classes = section_objs(info, "__dyndbg_classes", - sizeof(*info->dyndbg.classes), &info->dyndbg.num_classes); +#ifdef CONFIG_DYNAMIC_DEBUG_CORE + mod->dyndbg_info.descs = section_objs(info, "__dyndbg", + sizeof(*mod->dyndbg_info.descs), + &mod->dyndbg_info.num_descs); + mod->dyndbg_info.classes = section_objs(info, "__dyndbg_classes", + sizeof(*mod->dyndbg_info.classes), + &mod->dyndbg_info.num_classes); +#endif return 0; } @@ -2122,109 +2222,82 @@ static int move_module(struct module *mod, struct load_info *info) { int i; void *ptr; + enum mod_mem_type t = 0; + int ret = -ENOMEM; - /* Do the allocs. 
*/ - ptr = module_alloc(mod->core_layout.size); - /* - * The pointer to this block is stored in the module structure - * which is inside the block. Just mark it as not being a - * leak. - */ - kmemleak_not_leak(ptr); - if (!ptr) - return -ENOMEM; - - memset(ptr, 0, mod->core_layout.size); - mod->core_layout.base = ptr; - - if (mod->init_layout.size) { - ptr = module_alloc(mod->init_layout.size); + for_each_mod_mem_type(type) { + if (!mod->mem[type].size) { + mod->mem[type].base = NULL; + continue; + } + mod->mem[type].size = PAGE_ALIGN(mod->mem[type].size); + ptr = module_memory_alloc(mod->mem[type].size, type); /* - * The pointer to this block is stored in the module structure - * which is inside the block. This block doesn't need to be - * scanned as it contains data and code that will be freed - * after the module is initialized. + * The pointer to these blocks of memory are stored on the module + * structure and we keep that around so long as the module is + * around. We only free that memory when we unload the module. + * Just mark them as not being a leak then. The .init* ELF + * sections *do* get freed after boot so we *could* treat them + * slightly differently with kmemleak_ignore() and only grey + * them out as they work as typical memory allocations which + * *do* eventually get freed, but let's just keep things simple + * and avoid *any* false positives. */ - kmemleak_ignore(ptr); + kmemleak_not_leak(ptr); if (!ptr) { - module_memfree(mod->core_layout.base); - return -ENOMEM; + t = type; + goto out_enomem; } - memset(ptr, 0, mod->init_layout.size); - mod->init_layout.base = ptr; - } else - mod->init_layout.base = NULL; - -#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC - /* Do the allocs. */ - ptr = vzalloc(mod->data_layout.size); - /* - * The pointer to this block is stored in the module structure - * which is inside the block. Just mark it as not being a - * leak. - */ - kmemleak_not_leak(ptr); - if (!ptr) { - module_memfree(mod->core_layout.base); - module_memfree(mod->init_layout.base); - return -ENOMEM; + memset(ptr, 0, mod->mem[type].size); + mod->mem[type].base = ptr; } - mod->data_layout.base = ptr; -#endif /* Transfer each section which specifies SHF_ALLOC */ - pr_debug("final section addresses:\n"); + pr_debug("Final section addresses for %s:\n", mod->name); for (i = 0; i < info->hdr->e_shnum; i++) { void *dest; Elf_Shdr *shdr = &info->sechdrs[i]; + enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT; if (!(shdr->sh_flags & SHF_ALLOC)) continue; - if (shdr->sh_entsize & INIT_OFFSET_MASK) - dest = mod->init_layout.base - + (shdr->sh_entsize & ~INIT_OFFSET_MASK); - else if (!(shdr->sh_flags & SHF_EXECINSTR)) - dest = mod->data_layout.base + shdr->sh_entsize; - else - dest = mod->core_layout.base + shdr->sh_entsize; + dest = mod->mem[type].base + (shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK); - if (shdr->sh_type != SHT_NOBITS) + if (shdr->sh_type != SHT_NOBITS) { + /* + * Our ELF checker already validated this, but let's + * be pedantic and make the goal clearer. We actually + * end up copying over all modifications made to the + * userspace copy of the entire struct module. + */ + if (i == info->index.mod && + (WARN_ON_ONCE(shdr->sh_size != sizeof(struct module)))) { + ret = -ENOEXEC; + goto out_enomem; + } memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); - /* Update sh_addr to point to copy in image. 
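move_module() here recovers the memory type from the high bits of sh_entsize and the section offset from the low bits, matching the encoding done by module_get_offset_and_type() earlier in this patch. A small sketch of that packing; the shift and mask values are placeholders, since the real SH_ENTSIZE_TYPE_SHIFT/SH_ENTSIZE_OFFSET_MASK live in kernel/module/internal.h and are not part of this diff:

#include <stdio.h>

/* Placeholder constants: type in the top nibble, offset in the low bits. */
#define TYPE_SHIFT	60ULL
#define TYPE_MASK	(0xfULL << TYPE_SHIFT)
#define OFFSET_MASK	(~TYPE_MASK)

enum mem_type { TEXT, RODATA, RO_AFTER_INIT, DATA, INIT_TEXT, INIT_DATA };

/* What the layout step stores into sh_entsize. */
static unsigned long long pack_entsize(enum mem_type type,
				       unsigned long long offset)
{
	return offset | ((unsigned long long)type << TYPE_SHIFT);
}

int main(void)
{
	unsigned long long entsize = pack_entsize(RODATA, 0x1240);

	printf("type=%llu offset=0x%llx\n",
	       entsize >> TYPE_SHIFT,	/* what move_module() recovers */
	       entsize & OFFSET_MASK);	/* where in mem[type] the section lands */
	return 0;
}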
*/ + } + /* + * Update the userspace copy's ELF section address to point to + * our newly allocated memory as a pure convenience so that + * users of info can keep taking advantage and using the newly + * minted official memory area. + */ shdr->sh_addr = (unsigned long)dest; - pr_debug("\t0x%lx %s\n", - (long)shdr->sh_addr, info->secstrings + shdr->sh_name); + pr_debug("\t0x%lx 0x%.8lx %s\n", (long)shdr->sh_addr, + (long)shdr->sh_size, info->secstrings + shdr->sh_name); } return 0; +out_enomem: + for (t--; t >= 0; t--) + module_memory_free(mod->mem[t].base, t); + return ret; } -static int check_module_license_and_versions(struct module *mod) +static int check_export_symbol_versions(struct module *mod) { - int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE); - - /* - * ndiswrapper is under GPL by itself, but loads proprietary modules. - * Don't use add_taint_module(), as it would prevent ndiswrapper from - * using GPL-only symbols it needs. - */ - if (strcmp(mod->name, "ndiswrapper") == 0) - add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE); - - /* driverloader was caught wrongly pretending to be under GPL */ - if (strcmp(mod->name, "driverloader") == 0) - add_taint_module(mod, TAINT_PROPRIETARY_MODULE, - LOCKDEP_NOW_UNRELIABLE); - - /* lve claims to be GPL but upstream won't provide source */ - if (strcmp(mod->name, "lve") == 0) - add_taint_module(mod, TAINT_PROPRIETARY_MODULE, - LOCKDEP_NOW_UNRELIABLE); - - if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE)) - pr_warn("%s: module license taints kernel.\n", mod->name); - #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs)) { @@ -2242,12 +2315,14 @@ static void flush_module_icache(const struct module *mod) * Do it before processing of module parameters, so the module * can provide parameter accessor functions of its own. */ - if (mod->init_layout.base) - flush_icache_range((unsigned long)mod->init_layout.base, - (unsigned long)mod->init_layout.base - + mod->init_layout.size); - flush_icache_range((unsigned long)mod->core_layout.base, - (unsigned long)mod->core_layout.base + mod->core_layout.size); + for_each_mod_mem_type(type) { + const struct module_memory *mod_mem = &mod->mem[type]; + + if (mod_mem->size) { + flush_icache_range((unsigned long)mod_mem->base, + (unsigned long)mod_mem->base + mod_mem->size); + } + } } bool __weak module_elf_check_arch(Elf_Ehdr *hdr) @@ -2290,10 +2365,6 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) unsigned int ndx; int err; - err = check_modinfo(info->mod, info, flags); - if (err) - return ERR_PTR(err); - /* Allow arches to frob section contents and sizes. */ err = module_frob_arch_sections(info->hdr, info->sechdrs, info->secstrings, info->mod); @@ -2350,11 +2421,8 @@ static void module_deallocate(struct module *mod, struct load_info *info) { percpu_modfree(mod); module_arch_freeing_init(mod); - module_memfree(mod->init_layout.base); - module_memfree(mod->core_layout.base); -#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC - vfree(mod->data_layout.base); -#endif + + free_mod_mem(mod); } int __weak module_finalize(const Elf_Ehdr *hdr, @@ -2380,27 +2448,6 @@ static int post_relocation(struct module *mod, const struct load_info *info) return module_finalize(info->hdr, info->sechdrs, mod); } -/* Is this module of this name done loading? No locks held. 
*/ -static bool finished_loading(const char *name) -{ - struct module *mod; - bool ret; - - /* - * The module_mutex should not be a heavily contended lock; - * if we get the occasional sleep here, we'll go an extra iteration - * in the wait_event_interruptible(), which is harmless. - */ - sched_annotate_sleep(); - mutex_lock(&module_mutex); - mod = find_module_all(name, strlen(name), true); - ret = !mod || mod->state == MODULE_STATE_LIVE - || mod->state == MODULE_STATE_GOING; - mutex_unlock(&module_mutex); - - return ret; -} - /* Call module constructors. */ static void do_mod_ctors(struct module *mod) { @@ -2415,7 +2462,9 @@ static void do_mod_ctors(struct module *mod) /* For freeing module_init on success, in case kallsyms traversing */ struct mod_initfree { struct llist_node node; - void *module_init; + void *init_text; + void *init_data; + void *init_rodata; }; static void do_free_init(struct work_struct *w) @@ -2429,7 +2478,9 @@ static void do_free_init(struct work_struct *w) llist_for_each_safe(pos, n, list) { initfree = container_of(pos, struct mod_initfree, node); - module_memfree(initfree->module_init); + module_memfree(initfree->init_text); + module_memfree(initfree->init_data); + module_memfree(initfree->init_rodata); kfree(initfree); } } @@ -2450,13 +2501,27 @@ static noinline int do_init_module(struct module *mod) { int ret = 0; struct mod_initfree *freeinit; +#if defined(CONFIG_MODULE_STATS) + unsigned int text_size = 0, total_size = 0; + + for_each_mod_mem_type(type) { + const struct module_memory *mod_mem = &mod->mem[type]; + if (mod_mem->size) { + total_size += mod_mem->size; + if (type == MOD_TEXT || type == MOD_INIT_TEXT) + text_size += mod_mem->size; + } + } +#endif freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL); if (!freeinit) { ret = -ENOMEM; goto fail; } - freeinit->module_init = mod->init_layout.base; + freeinit->init_text = mod->mem[MOD_INIT_TEXT].base; + freeinit->init_data = mod->mem[MOD_INIT_DATA].base; + freeinit->init_rodata = mod->mem[MOD_INIT_RODATA].base; do_mod_ctors(mod); /* Start the module */ @@ -2492,8 +2557,8 @@ static noinline int do_init_module(struct module *mod) if (!mod->async_probe_requested) async_synchronize_full(); - ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base + - mod->init_layout.size); + ftrace_free_mem(mod, mod->mem[MOD_INIT_TEXT].base, + mod->mem[MOD_INIT_TEXT].base + mod->mem[MOD_INIT_TEXT].size); mutex_lock(&module_mutex); /* Drop initial reference. 
*/ module_put(mod); @@ -2505,11 +2570,11 @@ static noinline int do_init_module(struct module *mod) module_enable_ro(mod, true); mod_tree_remove_init(mod); module_arch_freeing_init(mod); - mod->init_layout.base = NULL; - mod->init_layout.size = 0; - mod->init_layout.ro_size = 0; - mod->init_layout.ro_after_init_size = 0; - mod->init_layout.text_size = 0; + for_class_mod_mem_type(type, init) { + mod->mem[type].base = NULL; + mod->mem[type].size = 0; + } + #ifdef CONFIG_DEBUG_INFO_BTF_MODULES /* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */ mod->btf_data = NULL; @@ -2533,6 +2598,11 @@ static noinline int do_init_module(struct module *mod) mutex_unlock(&module_mutex); wake_up_all(&module_wq); + mod_stat_add_long(text_size, &total_text_size); + mod_stat_add_long(total_size, &total_mod_size); + + mod_stat_inc(&modcount); + return 0; fail_free_freeinit: @@ -2548,6 +2618,7 @@ fail: ftrace_release_mod(mod); free_module(mod); wake_up_all(&module_wq); + return ret; } @@ -2559,6 +2630,67 @@ static int may_init_module(void) return 0; } +/* Is this module of this name done loading? No locks held. */ +static bool finished_loading(const char *name) +{ + struct module *mod; + bool ret; + + /* + * The module_mutex should not be a heavily contended lock; + * if we get the occasional sleep here, we'll go an extra iteration + * in the wait_event_interruptible(), which is harmless. + */ + sched_annotate_sleep(); + mutex_lock(&module_mutex); + mod = find_module_all(name, strlen(name), true); + ret = !mod || mod->state == MODULE_STATE_LIVE + || mod->state == MODULE_STATE_GOING; + mutex_unlock(&module_mutex); + + return ret; +} + +/* Must be called with module_mutex held */ +static int module_patient_check_exists(const char *name, + enum fail_dup_mod_reason reason) +{ + struct module *old; + int err = 0; + + old = find_module_all(name, strlen(name), true); + if (old == NULL) + return 0; + + if (old->state == MODULE_STATE_COMING || + old->state == MODULE_STATE_UNFORMED) { + /* Wait in case it fails to load. */ + mutex_unlock(&module_mutex); + err = wait_event_interruptible(module_wq, + finished_loading(name)); + mutex_lock(&module_mutex); + if (err) + return err; + + /* The module might have gone in the meantime. */ + old = find_module_all(name, strlen(name), true); + } + + if (try_add_failed_module(name, reason)) + pr_warn("Could not add fail-tracking for module: %s\n", name); + + /* + * We are here only when the same module was being loaded. Do + * not try to load it again right now. It prevents long delays + * caused by serialized module load failures. It might happen + * when more devices of the same type trigger load of + * a particular module. + */ + if (old && old->state == MODULE_STATE_LIVE) + return -EEXIST; + return -EBUSY; +} + /* * We try to place it in the list now to make sure it's unique before * we dedicate too many resources. In particular, temporary percpu @@ -2567,41 +2699,14 @@ static int may_init_module(void) static int add_unformed_module(struct module *mod) { int err; - struct module *old; mod->state = MODULE_STATE_UNFORMED; mutex_lock(&module_mutex); - old = find_module_all(mod->name, strlen(mod->name), true); - if (old != NULL) { - if (old->state == MODULE_STATE_COMING - || old->state == MODULE_STATE_UNFORMED) { - /* Wait in case it fails to load. */ - mutex_unlock(&module_mutex); - err = wait_event_interruptible(module_wq, - finished_loading(mod->name)); - if (err) - goto out_unlocked; - - /* The module might have gone in the meantime. 
*/ - mutex_lock(&module_mutex); - old = find_module_all(mod->name, strlen(mod->name), - true); - } - - /* - * We are here only when the same module was being loaded. Do - * not try to load it again right now. It prevents long delays - * caused by serialized module load failures. It might happen - * when more devices of the same type trigger load of - * a particular module. - */ - if (old && old->state == MODULE_STATE_LIVE) - err = -EEXIST; - else - err = -EBUSY; + err = module_patient_check_exists(mod->name, FAIL_DUP_MOD_LOAD); + if (err) goto out; - } + mod_update_bounds(mod); list_add_rcu(&mod->list, &modules); mod_tree_insert(mod); @@ -2609,7 +2714,6 @@ static int add_unformed_module(struct module *mod) out: mutex_unlock(&module_mutex); -out_unlocked: return err; } @@ -2628,9 +2732,6 @@ static int complete_formation(struct module *mod, struct load_info *info) module_bug_finalize(info->hdr, info->sechdrs, mod); module_cfi_finalize(info->hdr, info->sechdrs, mod); - if (module_check_misalignment(mod)) - goto out_misaligned; - module_enable_ro(mod, false); module_enable_nx(mod); module_enable_x(mod); @@ -2644,8 +2745,6 @@ static int complete_formation(struct module *mod, struct load_info *info) return 0; -out_misaligned: - err = -EINVAL; out: mutex_unlock(&module_mutex); return err; @@ -2688,6 +2787,39 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname, return 0; } +/* Module within temporary copy, this doesn't do any allocation */ +static int early_mod_check(struct load_info *info, int flags) +{ + int err; + + /* + * Now that we know we have the correct module name, check + * if it's blacklisted. + */ + if (blacklisted(info->name)) { + pr_err("Module %s is blacklisted\n", info->name); + return -EPERM; + } + + err = rewrite_section_headers(info, flags); + if (err) + return err; + + /* Check module struct version now, before we try to use module. */ + if (!check_modstruct_version(info, info->mod)) + return -ENOEXEC; + + err = check_modinfo(info->mod, info, flags); + if (err) + return err; + + mutex_lock(&module_mutex); + err = module_patient_check_exists(info->mod->name, FAIL_DUP_MOD_BECOMING); + mutex_unlock(&module_mutex); + + return err; +} + /* * Allocate and load the module: note that size of section 0 is always * zero, and we rely on this for optional sections. @@ -2696,6 +2828,7 @@ static int load_module(struct load_info *info, const char __user *uargs, int flags) { struct module *mod; + bool module_allocated = false; long err = 0; char *after_dashes; @@ -2717,40 +2850,17 @@ static int load_module(struct load_info *info, const char __user *uargs, /* * Do basic sanity checks against the ELF header and - * sections. + * sections. Cache useful sections and set the + * info->mod to the userspace passed struct module. */ - err = elf_validity_check(info); + err = elf_validity_cache_copy(info, flags); if (err) goto free_copy; - /* - * Everything checks out, so set up the section info - * in the info structure. - */ - err = setup_load_info(info, flags); + err = early_mod_check(info, flags); if (err) goto free_copy; - /* - * Now that we know we have the correct module name, check - * if it's blacklisted. - */ - if (blacklisted(info->name)) { - err = -EPERM; - pr_err("Module %s is blacklisted\n", info->name); - goto free_copy; - } - - err = rewrite_section_headers(info, flags); - if (err) - goto free_copy; - - /* Check module struct version now, before we try to use module. 
*/ - if (!check_modstruct_version(info, info->mod)) { - err = -ENOEXEC; - goto free_copy; - } - /* Figure out module layout, and allocate all the memory. */ mod = layout_and_allocate(info, flags); if (IS_ERR(mod)) { @@ -2758,6 +2868,8 @@ static int load_module(struct load_info *info, const char __user *uargs, goto free_copy; } + module_allocated = true; + audit_log_kern_module(mod->name); /* Reserve our place in the list. */ @@ -2765,15 +2877,11 @@ static int load_module(struct load_info *info, const char __user *uargs, if (err) goto free_module; -#ifdef CONFIG_MODULE_SIG - mod->sig_ok = info->sig_ok; - if (!mod->sig_ok) { - pr_notice_once("%s: module verification failed: signature " - "and/or required key missing - tainting " - "kernel\n", mod->name); - add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK); - } -#endif + /* + * We are tainting your kernel if your module gets into + * the modules linked list somehow. + */ + module_augment_kernel_taints(mod, info); /* To avoid stressing percpu allocator, do this once we're unique. */ err = percpu_modalloc(mod, info); @@ -2795,7 +2903,7 @@ static int load_module(struct load_info *info, const char __user *uargs, if (err) goto free_unload; - err = check_module_license_and_versions(mod); + err = check_export_symbol_versions(mod); if (err) goto free_unload; @@ -2825,7 +2933,6 @@ static int load_module(struct load_info *info, const char __user *uargs, } init_build_id(mod, info); - dynamic_debug_setup(mod, &info->dyndbg); /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */ ftrace_module_init(mod); @@ -2889,7 +2996,6 @@ static int load_module(struct load_info *info, const char __user *uargs, ddebug_cleanup: ftrace_release_mod(mod); - dynamic_debug_remove(mod, &info->dyndbg); synchronize_rcu(); kfree(mod->args); free_arch_cleanup: @@ -2908,11 +3014,22 @@ static int load_module(struct load_info *info, const char __user *uargs, synchronize_rcu(); mutex_unlock(&module_mutex); free_module: + mod_stat_bump_invalid(info, flags); /* Free lock-classes; relies on the preceding sync_rcu() */ - lockdep_free_key_range(mod->data_layout.base, mod->data_layout.size); + for_class_mod_mem_type(type, core_data) { + lockdep_free_key_range(mod->mem[type].base, + mod->mem[type].size); + } module_deallocate(mod, info); free_copy: + /* + * The info->len is always set. We distinguish between + * failures once the proper module was allocated and + * before that. 
+ */ + if (!module_allocated) + mod_stat_bump_becoming(info, flags); free_copy(info, flags); return err; } @@ -2931,8 +3048,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, umod, len, uargs); err = copy_module_from_user(umod, len, &info); - if (err) + if (err) { + mod_stat_inc(&failed_kreads); + mod_stat_add_long(len, &invalid_kread_bytes); return err; + } return load_module(&info, uargs, 0); } @@ -2957,14 +3077,20 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) len = kernel_read_file_from_fd(fd, 0, &buf, INT_MAX, NULL, READING_MODULE); - if (len < 0) + if (len < 0) { + mod_stat_inc(&failed_kreads); + mod_stat_add_long(len, &invalid_kread_bytes); return len; + } if (flags & MODULE_INIT_COMPRESSED_FILE) { err = module_decompress(&info, buf, len); vfree(buf); /* compressed data is no longer needed */ - if (err) + if (err) { + mod_stat_inc(&failed_decompress); + mod_stat_add_long(len, &invalid_decompress_bytes); return err; + } } else { info.hdr = buf; info.len = len; @@ -2973,11 +3099,6 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) return load_module(&info, uargs, flags); } -static inline int within(unsigned long addr, void *start, unsigned long size) -{ - return ((void *)addr >= start && (void *)addr < start + size); -} - /* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */ char *module_flags(struct module *mod, char *buf, bool show_state) { @@ -3060,20 +3181,21 @@ bool is_module_address(unsigned long addr) struct module *__module_address(unsigned long addr) { struct module *mod; - struct mod_tree_root *tree; if (addr >= mod_tree.addr_min && addr <= mod_tree.addr_max) - tree = &mod_tree; + goto lookup; + #ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC - else if (addr >= mod_data_tree.addr_min && addr <= mod_data_tree.addr_max) - tree = &mod_data_tree; + if (addr >= mod_tree.data_addr_min && addr <= mod_tree.data_addr_max) + goto lookup; #endif - else - return NULL; + return NULL; + +lookup: module_assert_mutex_or_preempt(); - mod = mod_find(addr, tree); + mod = mod_find(addr, &mod_tree); if (mod) { BUG_ON(!within_module(addr, mod)); if (mod->state == MODULE_STATE_UNFORMED) @@ -3113,8 +3235,8 @@ struct module *__module_text_address(unsigned long addr) struct module *mod = __module_address(addr); if (mod) { /* Make sure it's within the text section. 
*/ - if (!within(addr, mod->init_layout.base, mod->init_layout.text_size) - && !within(addr, mod->core_layout.base, mod->core_layout.text_size)) + if (!within_module_mem_type(addr, mod, MOD_TEXT) && + !within_module_mem_type(addr, mod, MOD_INIT_TEXT)) mod = NULL; } return mod; @@ -3142,3 +3264,14 @@ void print_modules(void) last_unloaded_module.taints); pr_cont("\n"); } + +#ifdef CONFIG_MODULE_DEBUGFS +struct dentry *mod_debugfs_root; + +static int module_debugfs_init(void) +{ + mod_debugfs_root = debugfs_create_dir("modules", NULL); + return 0; +} +module_init(module_debugfs_init); +#endif diff --git a/kernel/module/procfs.c b/kernel/module/procfs.c index cf5b9f1e6ec4..0a4841e88adb 100644 --- a/kernel/module/procfs.c +++ b/kernel/module/procfs.c @@ -62,6 +62,15 @@ static void m_stop(struct seq_file *m, void *p) mutex_unlock(&module_mutex); } +static unsigned int module_total_size(struct module *mod) +{ + int size = 0; + + for_each_mod_mem_type(type) + size += mod->mem[type].size; + return size; +} + static int m_show(struct seq_file *m, void *p) { struct module *mod = list_entry(p, struct module, list); @@ -73,10 +82,7 @@ static int m_show(struct seq_file *m, void *p) if (mod->state == MODULE_STATE_UNFORMED) return 0; - size = mod->init_layout.size + mod->core_layout.size; -#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC - size += mod->data_layout.size; -#endif + size = module_total_size(mod); seq_printf(m, "%s %u", mod->name, size); print_unload_info(m, mod); @@ -86,7 +92,7 @@ static int m_show(struct seq_file *m, void *p) mod->state == MODULE_STATE_COMING ? "Loading" : "Live"); /* Used by oprofile and other similar tools. */ - value = m->private ? NULL : mod->core_layout.base; + value = m->private ? NULL : mod->mem[MOD_TEXT].base; seq_printf(m, " 0x%px", value); /* Taints info */ diff --git a/kernel/module/stats.c b/kernel/module/stats.c new file mode 100644 index 000000000000..6ab2c94d6bc3 --- /dev/null +++ b/kernel/module/stats.c @@ -0,0 +1,432 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Debugging module statistics. + * + * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org> + */ + +#include <linux/module.h> +#include <uapi/linux/module.h> +#include <linux/string.h> +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/debugfs.h> +#include <linux/rculist.h> +#include <linux/math.h> + +#include "internal.h" + +/** + * DOC: module debugging statistics overview + * + * Enabling CONFIG_MODULE_STATS enables module debugging statistics which + * are useful to monitor and root cause memory pressure issues with module + * loading. These statistics are useful to allow us to improve production + * workloads. + * + * The current module debugging statistics supported help keep track of module + * loading failures to enable improvements either for kernel module auto-loading + * usage (request_module()) or interactions with userspace. Statistics are + * provided to track all possible failures in the finit_module() path and memory + * wasted in this process space. Each of the failure counters are associated + * to a type of module loading failure which is known to incur a certain amount + * of memory allocation loss. 
In the worst case loading a module will fail after + * a 3 step memory allocation process: + * + * a) memory allocated with kernel_read_file_from_fd() + * b) module decompression processes the file read from + * kernel_read_file_from_fd(), and vmap() is used to map + * the decompressed module to a new local buffer which represents + * a copy of the decompressed module passed from userspace. The buffer + * from kernel_read_file_from_fd() is freed right away. + * c) layout_and_allocate() allocates space for the final resting + * place where we would keep the module if it were to be processed + * successfully. + * + * If a failure occurs after these three different allocations only one + * counter will be incremented with the summation of the allocated bytes freed + * incurred during this failure. Likewise, if module loading failed only after + * step b) a separate counter is used and incremented for the bytes freed and + * not used during both of those allocations. + * + * Virtual memory space can be limited, for example on x86 virtual memory size + * defaults to 128 MiB. We should strive to limit and avoid wasting virtual + * memory allocations when possible. These module debugging statistics help + * to evaluate how much memory is being wasted on bootup due to module loading + * failures. + * + * All counters are designed to be incremental. Atomic counters are used so to + * remain simple and avoid delays and deadlocks. + */ + +/** + * DOC: dup_failed_modules - tracks duplicate failed modules + * + * Linked list of modules which failed to be loaded because an already existing + * module with the same name was already being processed or already loaded. + * The finit_module() system call incurs heavy virtual memory allocations. In + * the worst case an finit_module() system call can end up allocating virtual + * memory 3 times: + * + * 1) kernel_read_file_from_fd() call uses vmalloc() + * 2) optional module decompression uses vmap() + * 3) layout_and allocate() can use vzalloc() or an arch specific variation of + * vmalloc to deal with ELF sections requiring special permissions + * + * In practice on a typical boot today most finit_module() calls fail due to + * the module with the same name already being loaded or about to be processed. + * All virtual memory allocated to these failed modules will be freed with + * no functional use. + * + * To help with this the dup_failed_modules allows us to track modules which + * failed to load due to the fact that a module was already loaded or being + * processed. There are only two points at which we can fail such calls, + * we list them below along with the number of virtual memory allocation + * calls: + * + * a) FAIL_DUP_MOD_BECOMING: at the end of early_mod_check() before + * layout_and_allocate(). + * - with module decompression: 2 virtual memory allocation calls + * - without module decompression: 1 virtual memory allocation calls + * b) FAIL_DUP_MOD_LOAD: after layout_and_allocate() on add_unformed_module() + * - with module decompression 3 virtual memory allocation calls + * - without module decompression 2 virtual memory allocation calls + * + * We should strive to get this list to be as small as possible. If this list + * is not empty it is a reflection of possible work or optimizations possible + * either in-kernel or in userspace. 
+ */ +static LIST_HEAD(dup_failed_modules); + +/** + * DOC: module statistics debugfs counters + * + * The total amount of wasted virtual memory allocation space during module + * loading can be computed by adding the total from the summation: + * + * * @invalid_kread_bytes + + * @invalid_decompress_bytes + + * @invalid_becoming_bytes + + * @invalid_mod_bytes + * + * The following debugfs counters are available to inspect module loading + * failures: + * + * * total_mod_size: total bytes ever used by all modules we've dealt with on + * this system + * * total_text_size: total bytes of the .text and .init.text ELF section + * sizes we've dealt with on this system + * * invalid_kread_bytes: bytes allocated and then freed on failures which + * happen due to the initial kernel_read_file_from_fd(). kernel_read_file_from_fd() + * uses vmalloc(). These should typically not happen unless your system is + * under memory pressure. + * * invalid_decompress_bytes: number of bytes allocated and freed due to + * memory allocations in the module decompression path that use vmap(). + * These typically should not happen unless your system is under memory + * pressure. + * * invalid_becoming_bytes: total number of bytes allocated and freed used + * used to read the kernel module userspace wants us to read before we + * promote it to be processed to be added to our @modules linked list. These + * failures can happen if we had a check in between a successful kernel_read_file_from_fd() + * call and right before we allocate the our private memory for the module + * which would be kept if the module is successfully loaded. The most common + * reason for this failure is when userspace is racing to load a module + * which it does not yet see loaded. The first module to succeed in + * add_unformed_module() will add a module to our &modules list and + * subsequent loads of modules with the same name will error out at the + * end of early_mod_check(). The check for module_patient_check_exists() + * at the end of early_mod_check() prevents duplicate allocations + * on layout_and_allocate() for modules already being processed. These + * duplicate failed modules are non-fatal, however they typically are + * indicative of userspace not seeing a module in userspace loaded yet and + * unnecessarily trying to load a module before the kernel even has a chance + * to begin to process prior requests. Although duplicate failures can be + * non-fatal, we should try to reduce vmalloc() pressure proactively, so + * ideally after boot this will be close to as 0 as possible. If module + * decompression was used we also add to this counter the cost of the + * initial kernel_read_file_from_fd() of the compressed module. If module + * decompression was not used the value represents the total allocated and + * freed bytes in kernel_read_file_from_fd() calls for these type of + * failures. These failures can occur because: + * + * * module_sig_check() - module signature checks + * * elf_validity_cache_copy() - some ELF validation issue + * * early_mod_check(): + * + * * blacklisting + * * failed to rewrite section headers + * * version magic + * * live patch requirements didn't check out + * * the module was detected as being already present + * + * * invalid_mod_bytes: these are the total number of bytes allocated and + * freed due to failures after we did all the sanity checks of the module + * which userspace passed to us and after our first check that the module + * is unique. 
A module can still fail to load if we detect the module is + * already loaded after we allocate space for it with layout_and_allocate(); we do + * this check right before processing the module as live and running its + * initialization routines. Note that if you have a failure of this type it + * also means the respective kernel_read_file_from_fd() memory space was + * also freed and not used, and so we increment this counter by twice + * the size of the module. Additionally, if you used module decompression, + * the size of the compressed module is also added to this counter. + * + * * modcount: how many modules we've loaded in our kernel lifetime + * * failed_kreads: how many modules failed due to a failed kernel_read_file_from_fd() + * * failed_decompress: how many failed module decompression attempts we've had. + * These really should not happen unless your compression / decompression + * is broken. + * * failed_becoming: how many modules failed after we read them with + * kernel_read_file_from_fd() and before we allocate memory for them with + * layout_and_allocate(). This counter is never incremented if you manage + * to validate the module and call layout_and_allocate() for it. + * * failed_load_modules: how many modules failed once we've allocated our + * private space for our module using layout_and_allocate(). These failures + * should hopefully mostly be dealt with already. Races in theory could + * still exist here, but it would just mean the kernel had started processing + * two threads concurrently up to early_mod_check() and one thread won. + * These failures are a good sign that the kernel or userspace is doing something + * seriously wrong or that could be improved. We should strive to fix these, + * but it is perhaps not easy to fix them. A recent example is the module + * requests incurred for frequency modules, where a separate module request was + * being issued for each CPU on a system.
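As a concrete illustration of the accounting described above (not part of the patch; the module sizes are invented), a load that fails after layout_and_allocate() wastes both copies of the module plus, when decompression was used, the compressed file that was read:

/* Illustrative only: mirrors the "failed after layout_and_allocate()"
 * accounting above, with assumed sizes. Both the kernel_read_file_from_fd()
 * copy and the layout_and_allocate() copy of the module are wasted, plus the
 * compressed file when decompression was used.
 */
#include <stdio.h>

int main(void)
{
	unsigned long module_len     = 4UL << 20;	/* decompressed module: 4 MiB (assumed) */
	unsigned long compressed_len = 1UL << 20;	/* compressed file: 1 MiB (assumed) */

	unsigned long invalid_mod_bytes = 2 * module_len + compressed_len;

	printf("virtual memory wasted by one such failure: %lu bytes\n",
	       invalid_mod_bytes);
	return 0;
}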
+ */ + +atomic_long_t total_mod_size; +atomic_long_t total_text_size; +atomic_long_t invalid_kread_bytes; +atomic_long_t invalid_decompress_bytes; +static atomic_long_t invalid_becoming_bytes; +static atomic_long_t invalid_mod_bytes; +atomic_t modcount; +atomic_t failed_kreads; +atomic_t failed_decompress; +static atomic_t failed_becoming; +static atomic_t failed_load_modules; + +static const char *mod_fail_to_str(struct mod_fail_load *mod_fail) +{ + if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask) && + test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask)) + return "Becoming & Load"; + if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask)) + return "Becoming"; + if (test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask)) + return "Load"; + return "Bug-on-stats"; +} + +void mod_stat_bump_invalid(struct load_info *info, int flags) +{ + atomic_long_add(info->len * 2, &invalid_mod_bytes); + atomic_inc(&failed_load_modules); +#if defined(CONFIG_MODULE_DECOMPRESS) + if (flags & MODULE_INIT_COMPRESSED_FILE) + atomic_long_add(info->compressed_len, &invalid_mod_bytes); +#endif +} + +void mod_stat_bump_becoming(struct load_info *info, int flags) +{ + atomic_inc(&failed_becoming); + atomic_long_add(info->len, &invalid_becoming_bytes); +#if defined(CONFIG_MODULE_DECOMPRESS) + if (flags & MODULE_INIT_COMPRESSED_FILE) + atomic_long_add(info->compressed_len, &invalid_becoming_bytes); +#endif +} + +int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason) +{ + struct mod_fail_load *mod_fail; + + list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list, + lockdep_is_held(&module_mutex)) { + if (!strcmp(mod_fail->name, name)) { + atomic_long_inc(&mod_fail->count); + __set_bit(reason, &mod_fail->dup_fail_mask); + goto out; + } + } + + mod_fail = kzalloc(sizeof(*mod_fail), GFP_KERNEL); + if (!mod_fail) + return -ENOMEM; + memcpy(mod_fail->name, name, strlen(name)); + __set_bit(reason, &mod_fail->dup_fail_mask); + atomic_long_inc(&mod_fail->count); + list_add_rcu(&mod_fail->list, &dup_failed_modules); +out: + return 0; +} + +/* + * At 64 bytes per module and assuming a 1024 bytes preamble we can fit the + * 112 module prints within 8k. 
+ * + * 1024 + (64*112) = 8k + */ +#define MAX_PREAMBLE 1024 +#define MAX_FAILED_MOD_PRINT 112 +#define MAX_BYTES_PER_MOD 64 +static ssize_t read_file_mod_stats(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct mod_fail_load *mod_fail; + unsigned int len, size, count_failed = 0; + char *buf; + int ret; + u32 live_mod_count, fkreads, fdecompress, fbecoming, floads; + unsigned long total_size, text_size, ikread_bytes, ibecoming_bytes, + idecompress_bytes, imod_bytes, total_virtual_lost; + + live_mod_count = atomic_read(&modcount); + fkreads = atomic_read(&failed_kreads); + fdecompress = atomic_read(&failed_decompress); + fbecoming = atomic_read(&failed_becoming); + floads = atomic_read(&failed_load_modules); + + total_size = atomic_long_read(&total_mod_size); + text_size = atomic_long_read(&total_text_size); + ikread_bytes = atomic_long_read(&invalid_kread_bytes); + idecompress_bytes = atomic_long_read(&invalid_decompress_bytes); + ibecoming_bytes = atomic_long_read(&invalid_becoming_bytes); + imod_bytes = atomic_long_read(&invalid_mod_bytes); + + total_virtual_lost = ikread_bytes + idecompress_bytes + ibecoming_bytes + imod_bytes; + + size = MAX_PREAMBLE + min((unsigned int)(floads + fbecoming), + (unsigned int)MAX_FAILED_MOD_PRINT) * MAX_BYTES_PER_MOD; + buf = kzalloc(size, GFP_KERNEL); + if (buf == NULL) + return -ENOMEM; + + /* The beginning of our debug preamble */ + len = scnprintf(buf, size, "%25s\t%u\n", "Mods ever loaded", live_mod_count); + + len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on kread", fkreads); + + len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on decompress", + fdecompress); + len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on becoming", fbecoming); + + len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on load", floads); + + len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Total module size", total_size); + len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Total mod text size", text_size); + + len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed kread bytes", ikread_bytes); + + len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed decompress bytes", + idecompress_bytes); + + len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed becoming bytes", ibecoming_bytes); + + len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed kmod bytes", imod_bytes); + + len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Virtual mem wasted bytes", total_virtual_lost); + + if (live_mod_count && total_size) { + len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average mod size", + DIV_ROUND_UP(total_size, live_mod_count)); + } + + if (live_mod_count && text_size) { + len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average mod text size", + DIV_ROUND_UP(text_size, live_mod_count)); + } + + /* + * We use WARN_ON_ONCE() for the counters to ensure we always have parity + * for keeping tabs on a type of failure with one type of byte counter. + * The counters for imod_bytes does not increase for fkreads failures + * for example, and so on. 
+ */ + + WARN_ON_ONCE(ikread_bytes && !fkreads); + if (fkreads && ikread_bytes) { + len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail kread bytes", + DIV_ROUND_UP(ikread_bytes, fkreads)); + } + + WARN_ON_ONCE(ibecoming_bytes && !fbecoming); + if (fbecoming && ibecoming_bytes) { + len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail becoming bytes", + DIV_ROUND_UP(ibecoming_bytes, fbecoming)); + } + + WARN_ON_ONCE(idecompress_bytes && !fdecompress); + if (fdecompress && idecompress_bytes) { + len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail decomp bytes", + DIV_ROUND_UP(idecompress_bytes, fdecompress)); + } + + WARN_ON_ONCE(imod_bytes && !floads); + if (floads && imod_bytes) { + len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average fail load bytes", + DIV_ROUND_UP(imod_bytes, floads)); + } + + /* End of our debug preamble header. */ + + /* Catch when we've gone beyond our expected preamble */ + WARN_ON_ONCE(len >= MAX_PREAMBLE); + + if (list_empty(&dup_failed_modules)) + goto out; + + len += scnprintf(buf + len, size - len, "Duplicate failed modules:\n"); + len += scnprintf(buf + len, size - len, "%25s\t%15s\t%25s\n", + "Module-name", "How-many-times", "Reason"); + mutex_lock(&module_mutex); + + + list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list) { + if (WARN_ON_ONCE(++count_failed >= MAX_FAILED_MOD_PRINT)) + goto out_unlock; + len += scnprintf(buf + len, size - len, "%25s\t%15lu\t%25s\n", mod_fail->name, + atomic_long_read(&mod_fail->count), mod_fail_to_str(mod_fail)); + } +out_unlock: + mutex_unlock(&module_mutex); +out: + ret = simple_read_from_buffer(user_buf, count, ppos, buf, len); + kfree(buf); + return ret; +} +#undef MAX_PREAMBLE +#undef MAX_FAILED_MOD_PRINT +#undef MAX_BYTES_PER_MOD + +static const struct file_operations fops_mod_stats = { + .read = read_file_mod_stats, + .open = simple_open, + .owner = THIS_MODULE, + .llseek = default_llseek, +}; + +#define mod_debug_add_ulong(name) debugfs_create_ulong(#name, 0400, mod_debugfs_root, (unsigned long *) &name.counter) +#define mod_debug_add_atomic(name) debugfs_create_atomic_t(#name, 0400, mod_debugfs_root, &name) +static int __init module_stats_init(void) +{ + mod_debug_add_ulong(total_mod_size); + mod_debug_add_ulong(total_text_size); + mod_debug_add_ulong(invalid_kread_bytes); + mod_debug_add_ulong(invalid_decompress_bytes); + mod_debug_add_ulong(invalid_becoming_bytes); + mod_debug_add_ulong(invalid_mod_bytes); + + mod_debug_add_atomic(modcount); + mod_debug_add_atomic(failed_kreads); + mod_debug_add_atomic(failed_decompress); + mod_debug_add_atomic(failed_becoming); + mod_debug_add_atomic(failed_load_modules); + + debugfs_create_file("stats", 0400, mod_debugfs_root, mod_debugfs_root, &fops_mod_stats); + + return 0; +} +#undef mod_debug_add_ulong +#undef mod_debug_add_atomic +module_init(module_stats_init); diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c index 14fbea66f12f..a2b656b4e3d2 100644 --- a/kernel/module/strict_rwx.c +++ b/kernel/module/strict_rwx.c @@ -11,82 +11,25 @@ #include <linux/set_memory.h> #include "internal.h" -/* - * LKM RO/NX protection: protect module's text/ro-data - * from modification and any data from execution. 
- * - * General layout of module is: - * [text] [read-only-data] [ro-after-init] [writable data] - * text_size -----^ ^ ^ ^ - * ro_size ------------------------| | | - * ro_after_init_size -----------------------------| | - * size -----------------------------------------------------------| - * - * These values are always page-aligned (as is base) when - * CONFIG_STRICT_MODULE_RWX is set. - */ +static void module_set_memory(const struct module *mod, enum mod_mem_type type, + int (*set_memory)(unsigned long start, int num_pages)) +{ + const struct module_memory *mod_mem = &mod->mem[type]; + + set_vm_flush_reset_perms(mod_mem->base); + set_memory((unsigned long)mod_mem->base, mod_mem->size >> PAGE_SHIFT); +} /* * Since some arches are moving towards PAGE_KERNEL module allocations instead - * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() independent of + * of PAGE_KERNEL_EXEC, keep module_enable_x() independent of * CONFIG_STRICT_MODULE_RWX because they are needed regardless of whether we * are strict. */ -static void frob_text(const struct module_layout *layout, - int (*set_memory)(unsigned long start, int num_pages)) -{ - set_memory((unsigned long)layout->base, - PAGE_ALIGN(layout->text_size) >> PAGE_SHIFT); -} - -static void frob_rodata(const struct module_layout *layout, - int (*set_memory)(unsigned long start, int num_pages)) -{ - set_memory((unsigned long)layout->base + layout->text_size, - (layout->ro_size - layout->text_size) >> PAGE_SHIFT); -} - -static void frob_ro_after_init(const struct module_layout *layout, - int (*set_memory)(unsigned long start, int num_pages)) -{ - set_memory((unsigned long)layout->base + layout->ro_size, - (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT); -} - -static void frob_writable_data(const struct module_layout *layout, - int (*set_memory)(unsigned long start, int num_pages)) -{ - set_memory((unsigned long)layout->base + layout->ro_after_init_size, - (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); -} - -static bool layout_check_misalignment(const struct module_layout *layout) -{ - return WARN_ON(!PAGE_ALIGNED(layout->base)) || - WARN_ON(!PAGE_ALIGNED(layout->text_size)) || - WARN_ON(!PAGE_ALIGNED(layout->ro_size)) || - WARN_ON(!PAGE_ALIGNED(layout->ro_after_init_size)) || - WARN_ON(!PAGE_ALIGNED(layout->size)); -} - -bool module_check_misalignment(const struct module *mod) -{ - if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) - return false; - - return layout_check_misalignment(&mod->core_layout) || - layout_check_misalignment(&mod->data_layout) || - layout_check_misalignment(&mod->init_layout); -} - void module_enable_x(const struct module *mod) { - if (!PAGE_ALIGNED(mod->core_layout.base) || - !PAGE_ALIGNED(mod->init_layout.base)) - return; - - frob_text(&mod->core_layout, set_memory_x); - frob_text(&mod->init_layout, set_memory_x); + for_class_mod_mem_type(type, text) + module_set_memory(mod, type, set_memory_x); } void module_enable_ro(const struct module *mod, bool after_init) @@ -98,16 +41,13 @@ void module_enable_ro(const struct module *mod, bool after_init) return; #endif - set_vm_flush_reset_perms(mod->core_layout.base); - set_vm_flush_reset_perms(mod->init_layout.base); - frob_text(&mod->core_layout, set_memory_ro); - - frob_rodata(&mod->data_layout, set_memory_ro); - frob_text(&mod->init_layout, set_memory_ro); - frob_rodata(&mod->init_layout, set_memory_ro); + module_set_memory(mod, MOD_TEXT, set_memory_ro); + module_set_memory(mod, MOD_INIT_TEXT, set_memory_ro); + module_set_memory(mod, MOD_RODATA, 
set_memory_ro); + module_set_memory(mod, MOD_INIT_RODATA, set_memory_ro); if (after_init) - frob_ro_after_init(&mod->data_layout, set_memory_ro); + module_set_memory(mod, MOD_RO_AFTER_INIT, set_memory_ro); } void module_enable_nx(const struct module *mod) @@ -115,11 +55,8 @@ void module_enable_nx(const struct module *mod) if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) return; - frob_rodata(&mod->data_layout, set_memory_nx); - frob_ro_after_init(&mod->data_layout, set_memory_nx); - frob_writable_data(&mod->data_layout, set_memory_nx); - frob_rodata(&mod->init_layout, set_memory_nx); - frob_writable_data(&mod->init_layout, set_memory_nx); + for_class_mod_mem_type(type, data) + module_set_memory(mod, type, set_memory_nx); } int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, diff --git a/kernel/module/tracking.c b/kernel/module/tracking.c index 26d812e07615..16742d1c630c 100644 --- a/kernel/module/tracking.c +++ b/kernel/module/tracking.c @@ -15,6 +15,7 @@ #include "internal.h" static LIST_HEAD(unloaded_tainted_modules); +extern struct dentry *mod_debugfs_root; int try_add_tainted_module(struct module *mod) { @@ -120,12 +121,8 @@ static const struct file_operations unloaded_tainted_modules_fops = { static int __init unloaded_tainted_modules_init(void) { - struct dentry *dir; - - dir = debugfs_create_dir("modules", NULL); - debugfs_create_file("unloaded_tainted", 0444, dir, NULL, + debugfs_create_file("unloaded_tainted", 0444, mod_debugfs_root, NULL, &unloaded_tainted_modules_fops); - return 0; } module_init(unloaded_tainted_modules_init); diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c index 8ec5cfd60496..277197977d43 100644 --- a/kernel/module/tree_lookup.c +++ b/kernel/module/tree_lookup.c @@ -21,16 +21,16 @@ static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n) { - struct module_layout *layout = container_of(n, struct module_layout, mtn.node); + struct module_memory *mod_mem = container_of(n, struct module_memory, mtn.node); - return (unsigned long)layout->base; + return (unsigned long)mod_mem->base; } static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n) { - struct module_layout *layout = container_of(n, struct module_layout, mtn.node); + struct module_memory *mod_mem = container_of(n, struct module_memory, mtn.node); - return (unsigned long)layout->size; + return (unsigned long)mod_mem->size; } static __always_inline bool @@ -77,32 +77,27 @@ static void __mod_tree_remove(struct mod_tree_node *node, struct mod_tree_root * */ void mod_tree_insert(struct module *mod) { - mod->core_layout.mtn.mod = mod; - mod->init_layout.mtn.mod = mod; - - __mod_tree_insert(&mod->core_layout.mtn, &mod_tree); - if (mod->init_layout.size) - __mod_tree_insert(&mod->init_layout.mtn, &mod_tree); - -#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC - mod->data_layout.mtn.mod = mod; - __mod_tree_insert(&mod->data_layout.mtn, &mod_data_tree); -#endif + for_each_mod_mem_type(type) { + mod->mem[type].mtn.mod = mod; + if (mod->mem[type].size) + __mod_tree_insert(&mod->mem[type].mtn, &mod_tree); + } } void mod_tree_remove_init(struct module *mod) { - if (mod->init_layout.size) - __mod_tree_remove(&mod->init_layout.mtn, &mod_tree); + for_class_mod_mem_type(type, init) { + if (mod->mem[type].size) + __mod_tree_remove(&mod->mem[type].mtn, &mod_tree); + } } void mod_tree_remove(struct module *mod) { - __mod_tree_remove(&mod->core_layout.mtn, &mod_tree); - mod_tree_remove_init(mod); -#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC - 
__mod_tree_remove(&mod->data_layout.mtn, &mod_data_tree); -#endif + for_each_mod_mem_type(type) { + if (mod->mem[type].size) + __mod_tree_remove(&mod->mem[type].mtn, &mod_tree); + } } struct module *mod_find(unsigned long addr, struct mod_tree_root *tree) diff --git a/kernel/notifier.c b/kernel/notifier.c index d353e4b5402d..b3ce28f39eb6 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -7,6 +7,9 @@ #include <linux/vmalloc.h> #include <linux/reboot.h> +#define CREATE_TRACE_POINTS +#include <trace/events/notifier.h> + /* * Notifier list for kernel code which wants to be called * at shutdown. This is used to stop any idling DMA operations @@ -37,6 +40,7 @@ static int notifier_chain_register(struct notifier_block **nl, } n->next = *nl; rcu_assign_pointer(*nl, n); + trace_notifier_register((void *)n->notifier_call); return 0; } @@ -46,6 +50,7 @@ static int notifier_chain_unregister(struct notifier_block **nl, while ((*nl) != NULL) { if ((*nl) == n) { rcu_assign_pointer(*nl, n->next); + trace_notifier_unregister((void *)n->notifier_call); return 0; } nl = &((*nl)->next); @@ -84,6 +89,7 @@ static int notifier_call_chain(struct notifier_block **nl, continue; } #endif + trace_notifier_run((void *)nb->notifier_call); ret = nb->notifier_call(nb, val, v); if (nr_calls) diff --git a/kernel/panic.c b/kernel/panic.c index 5cfea8302d23..886d2ebd0a0d 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -141,7 +141,7 @@ EXPORT_SYMBOL(panic_blink); /* * Stop ourself in panic -- architecture code may override this */ -void __weak panic_smp_self_stop(void) +void __weak __noreturn panic_smp_self_stop(void) { while (1) cpu_relax(); @@ -151,7 +151,7 @@ void __weak panic_smp_self_stop(void) * Stop ourselves in NMI context if another CPU has already panicked. Arch code * may override this to prepare for crash dumping, e.g. save regs info. 
*/ -void __weak nmi_panic_self_stop(struct pt_regs *regs) +void __weak __noreturn nmi_panic_self_stop(struct pt_regs *regs) { panic_smp_self_stop(); } diff --git a/kernel/params.c b/kernel/params.c index 6e34ca89ebae..6a7548979aa9 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -948,7 +948,7 @@ static void module_kobj_release(struct kobject *kobj) complete(mk->kobj_completion); } -struct kobj_type module_ktype = { +const struct kobj_type module_ktype = { .release = module_kobj_release, .sysfs_ops = &module_sysfs_ops, }; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 46e0d5a3f91f..b43eee07b00c 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -314,7 +314,6 @@ static struct ctl_table pid_ns_ctl_table[] = { }, { } }; -static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; #endif /* CONFIG_CHECKPOINT_RESTORE */ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) @@ -473,7 +472,7 @@ static __init int pid_namespaces_init(void) pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT); #ifdef CONFIG_CHECKPOINT_RESTORE - register_sysctl_paths(kern_path, pid_ns_ctl_table); + register_sysctl_init("kernel", pid_ns_ctl_table); #endif register_pid_ns_sysctl_table_vm(); diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h index e22d072e1e24..d67a4d45bb42 100644 --- a/kernel/pid_sysctl.h +++ b/kernel/pid_sysctl.h @@ -46,10 +46,9 @@ static struct ctl_table pid_ns_ctl_table_vm[] = { }, { } }; -static struct ctl_path vm_path[] = { { .procname = "vm", }, { } }; static inline void register_pid_ns_sysctl_table_vm(void) { - register_sysctl_paths(vm_path, pid_ns_ctl_table_vm); + register_sysctl("vm", pid_ns_ctl_table_vm); } #else static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) {} diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 793c55a2becb..30d1274f03f6 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -64,6 +64,7 @@ enum { static int hibernation_mode = HIBERNATION_SHUTDOWN; bool freezer_test_done; +bool snapshot_test; static const struct platform_hibernation_ops *hibernation_ops; @@ -687,18 +688,22 @@ static int load_image_and_restore(void) { int error; unsigned int flags; + fmode_t mode = FMODE_READ; + + if (snapshot_test) + mode |= FMODE_EXCL; pm_pr_dbg("Loading hibernation image.\n"); lock_device_hotplug(); error = create_basic_memory_bitmaps(); if (error) { - swsusp_close(FMODE_READ | FMODE_EXCL); + swsusp_close(mode); goto Unlock; } error = swsusp_read(&flags); - swsusp_close(FMODE_READ | FMODE_EXCL); + swsusp_close(mode); if (!error) error = hibernation_restore(flags & SF_PLATFORM_MODE); @@ -716,7 +721,6 @@ static int load_image_and_restore(void) */ int hibernate(void) { - bool snapshot_test = false; unsigned int sleep_flags; int error; @@ -744,6 +748,9 @@ int hibernate(void) if (error) goto Exit; + /* protected by system_transition_mutex */ + snapshot_test = false; + lock_device_hotplug(); /* Allocate memory management structures */ error = create_basic_memory_bitmaps(); @@ -940,6 +947,8 @@ static int software_resume(void) */ mutex_lock_nested(&system_transition_mutex, SINGLE_DEPTH_NESTING); + snapshot_test = false; + if (swsusp_resume_device) goto Check_image; diff --git a/kernel/power/power.h b/kernel/power/power.h index b4f433943209..b83c8d5e188d 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -59,6 +59,7 @@ asmlinkage int swsusp_save(void); /* kernel/power/hibernate.c */ extern bool freezer_test_done; +extern bool snapshot_test; extern 
int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); diff --git a/kernel/power/process.c b/kernel/power/process.c index 6c1c7e566d35..cae81a87cc91 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -93,7 +93,7 @@ static int try_to_freeze_tasks(bool user_only) todo - wq_busy, wq_busy); if (wq_busy) - show_all_workqueues(); + show_freezable_workqueues(); if (!wakeup || pm_debug_messages_on) { read_lock(&tasklist_lock); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 36a1df48280c..92e41ed292ad 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1518,9 +1518,13 @@ int swsusp_check(void) { int error; void *holder; + fmode_t mode = FMODE_READ; + + if (snapshot_test) + mode |= FMODE_EXCL; hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, - FMODE_READ | FMODE_EXCL, &holder); + mode, &holder); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); @@ -1547,7 +1551,7 @@ int swsusp_check(void) put: if (error) - blkdev_put(hib_resume_bdev, FMODE_READ | FMODE_EXCL); + blkdev_put(hib_resume_bdev, mode); else pr_debug("Image signature found, resuming\n"); } else { diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9644f6e5bf15..6a333adce3b3 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -71,6 +71,8 @@ EXPORT_SYMBOL_GPL(console_printk); atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0); EXPORT_SYMBOL(ignore_console_lock_warning); +EXPORT_TRACEPOINT_SYMBOL_GPL(console); + /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. @@ -89,7 +91,7 @@ static DEFINE_MUTEX(console_mutex); * console_sem protects updates to console->seq and console_suspended, * and also provides serialization for console printing. 
*/ -static DEFINE_SEMAPHORE(console_sem); +static DEFINE_SEMAPHORE(console_sem, 1); HLIST_HEAD(console_list); EXPORT_SYMBOL_GPL(console_list); DEFINE_STATIC_SRCU(console_srcu); diff --git a/kernel/relay.c b/kernel/relay.c index 9aa70ae53d24..a80fa01042e9 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -989,7 +989,8 @@ static size_t relay_file_read_start_pos(struct rchan_buf *buf) size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; size_t consumed = buf->subbufs_consumed % n_subbufs; - size_t read_pos = consumed * subbuf_size + buf->bytes_consumed; + size_t read_pos = (consumed * subbuf_size + buf->bytes_consumed) + % (n_subbufs * subbuf_size); read_subbuf = read_pos / subbuf_size; padding = buf->padding[read_subbuf]; diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 5732fa75ebab..b5cc2b53464d 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -300,6 +300,9 @@ noinstr u64 local_clock(void) if (static_branch_likely(&__sched_clock_stable)) return sched_clock() + __sched_clock_offset; + if (!static_branch_likely(&sched_clock_running)) + return sched_clock(); + preempt_disable_notrace(); clock = sched_clock_local(this_scd()); preempt_enable_notrace(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0d18c3969f90..a68d1276bab0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -80,6 +80,7 @@ #define CREATE_TRACE_POINTS #include <linux/sched/rseq_api.h> #include <trace/events/sched.h> +#include <trace/events/ipi.h> #undef CREATE_TRACE_POINTS #include "sched.h" @@ -95,6 +96,9 @@ #include "../../io_uring/io-wq.h" #include "../smpboot.h" +EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); +EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); + /* * Export tracepoints that act as a bare tracehook (ie: have no trace event * associated with them) to allow external modules to probe them. @@ -261,36 +265,51 @@ void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) resched_curr(rq); } -/* - * Find left-most (aka, highest priority) task matching @cookie. - */ -static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie) +static int sched_task_is_throttled(struct task_struct *p, int cpu) { - struct rb_node *node; - - node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp); - /* - * The idle task always matches any cookie! - */ - if (!node) - return idle_sched_class.pick_task(rq); + if (p->sched_class->task_is_throttled) + return p->sched_class->task_is_throttled(p, cpu); - return __node_2_sc(node); + return 0; } static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie) { struct rb_node *node = &p->core_node; + int cpu = task_cpu(p); + + do { + node = rb_next(node); + if (!node) + return NULL; + + p = __node_2_sc(node); + if (p->core_cookie != cookie) + return NULL; + + } while (sched_task_is_throttled(p, cpu)); - node = rb_next(node); + return p; +} + +/* + * Find left-most (aka, highest priority) and unthrottled task matching @cookie. + * If no suitable task is found, NULL will be returned. 
+ */ +static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie) +{ + struct task_struct *p; + struct rb_node *node; + + node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp); if (!node) return NULL; - p = container_of(node, struct task_struct, core_node); - if (p->core_cookie != cookie) - return NULL; + p = __node_2_sc(node); + if (!sched_task_is_throttled(p, rq->cpu)) + return p; - return p; + return sched_core_next(p, cookie); } /* @@ -704,6 +723,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->prev_irq_time += irq_delta; delta -= irq_delta; psi_account_irqtime(rq->curr, irq_delta); + delayacct_irq(rq->curr, irq_delta); #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING if (static_key_false((¶virt_steal_rq_enabled))) { @@ -2086,6 +2106,8 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_on_rq_migrating(p)) flags |= ENQUEUE_MIGRATED; + if (flags & ENQUEUE_MIGRATED) + sched_mm_cid_migrate_to(rq, p); enqueue_task(rq, p, flags); @@ -3195,6 +3217,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; rseq_migrate(p); + sched_mm_cid_migrate_from(p); perf_event_task_migrate(p); } @@ -3829,14 +3852,20 @@ void sched_ttwu_pending(void *arg) rq_unlock_irqrestore(rq, &rf); } -void send_call_function_single_ipi(int cpu) +/* + * Prepare the scene for sending an IPI for a remote smp_call + * + * Returns true if the caller can proceed with sending the IPI. + * Returns false otherwise. + */ +bool call_function_single_prep_ipi(int cpu) { - struct rq *rq = cpu_rq(cpu); - - if (!set_nr_if_polling(rq->idle)) - arch_send_call_function_single_ipi(cpu); - else + if (set_nr_if_polling(cpu_rq(cpu)->idle)) { trace_sched_wake_idle_without_ipi(cpu); + return false; + } + + return true; } /* @@ -4468,6 +4497,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->wake_entry.u_flags = CSD_TYPE_TTWU; p->migration_pending = NULL; #endif + init_sched_mm_cid(p); } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -5114,7 +5144,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); rseq_preempt(prev); - switch_mm_cid(prev, next); fire_sched_out_preempt_notifiers(prev, next); kmap_local_sched_out(); prepare_task(next); @@ -5203,13 +5232,14 @@ static struct rq *finish_task_switch(struct task_struct *prev) * rq->curr, before returning to userspace, so provide them here: * * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly - * provided by mmdrop(), + * provided by mmdrop_lazy_tlb(), * - a sync_core for SYNC_CORE. */ if (mm) { membarrier_mm_sync_core_before_usermode(mm); - mmdrop_sched(mm); + mmdrop_lazy_tlb_sched(mm); } + if (unlikely(prev_state == TASK_DEAD)) { if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); @@ -5266,17 +5296,20 @@ context_switch(struct rq *rq, struct task_struct *prev, /* * kernel -> kernel lazy + transfer active - * user -> kernel lazy + mmgrab() active + * user -> kernel lazy + mmgrab_lazy_tlb() active * - * kernel -> user switch + mmdrop() active + * kernel -> user switch + mmdrop_lazy_tlb() active * user -> user switch + * + * switch_mm_cid() needs to be updated if the barriers provided + * by context_switch() are modified. 
*/ if (!next->mm) { // to kernel enter_lazy_tlb(prev->active_mm, next); next->active_mm = prev->active_mm; if (prev->mm) // from user - mmgrab(prev->active_mm); + mmgrab_lazy_tlb(prev->active_mm); else prev->active_mm = NULL; } else { // to user @@ -5293,12 +5326,15 @@ context_switch(struct rq *rq, struct task_struct *prev, lru_gen_use_mm(next->mm); if (!prev->mm) { // from kernel - /* will mmdrop() in finish_task_switch(). */ + /* will mmdrop_lazy_tlb() in finish_task_switch(). */ rq->prev_mm = prev->active_mm; prev->active_mm = NULL; } } + /* switch_mm_cid() requires the memory barriers above. */ + switch_mm_cid(rq, prev, next); + rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); prepare_lock_switch(rq, next, rf); @@ -5587,6 +5623,7 @@ void scheduler_tick(void) resched_latency = cpu_resched_latency(rq); calc_global_load_tick(rq); sched_core_tick(rq); + task_tick_mm_cid(rq, curr); rq_unlock(rq, &rf); @@ -6239,7 +6276,7 @@ static bool try_steal_cookie(int this, int that) goto unlock; p = sched_core_find(src, cookie); - if (p == src->idle) + if (!p) goto unlock; do { @@ -6251,6 +6288,13 @@ static bool try_steal_cookie(int this, int that) if (p->core_occupation > dst->idle->core_occupation) goto next; + /* + * sched_core_find() and sched_core_next() will ensure that task @p + * is not throttled now, we also need to check whether the runqueue + * of the destination CPU is being throttled. + */ + if (sched_task_is_throttled(p, this)) + goto next; deactivate_task(src, p, 0); set_task_cpu(p, this); @@ -8506,6 +8550,7 @@ EXPORT_STATIC_CALL_TRAMP(might_resched); static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); int __sched dynamic_cond_resched(void) { + klp_sched_try_switch(); if (!static_branch_unlikely(&sk_dynamic_cond_resched)) return 0; return __cond_resched(); @@ -8654,13 +8699,17 @@ int sched_dynamic_mode(const char *str) #error "Unsupported PREEMPT_DYNAMIC mechanism" #endif -void sched_dynamic_update(int mode) +static DEFINE_MUTEX(sched_dynamic_mutex); +static bool klp_override; + +static void __sched_dynamic_update(int mode) { /* * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in * the ZERO state, which is invalid. 
*/ - preempt_dynamic_enable(cond_resched); + if (!klp_override) + preempt_dynamic_enable(cond_resched); preempt_dynamic_enable(might_resched); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); @@ -8668,36 +8717,79 @@ void sched_dynamic_update(int mode) switch (mode) { case preempt_dynamic_none: - preempt_dynamic_enable(cond_resched); + if (!klp_override) + preempt_dynamic_enable(cond_resched); preempt_dynamic_disable(might_resched); preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notrace); preempt_dynamic_disable(irqentry_exit_cond_resched); - pr_info("Dynamic Preempt: none\n"); + if (mode != preempt_dynamic_mode) + pr_info("Dynamic Preempt: none\n"); break; case preempt_dynamic_voluntary: - preempt_dynamic_enable(cond_resched); + if (!klp_override) + preempt_dynamic_enable(cond_resched); preempt_dynamic_enable(might_resched); preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notrace); preempt_dynamic_disable(irqentry_exit_cond_resched); - pr_info("Dynamic Preempt: voluntary\n"); + if (mode != preempt_dynamic_mode) + pr_info("Dynamic Preempt: voluntary\n"); break; case preempt_dynamic_full: - preempt_dynamic_disable(cond_resched); + if (!klp_override) + preempt_dynamic_disable(cond_resched); preempt_dynamic_disable(might_resched); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); preempt_dynamic_enable(irqentry_exit_cond_resched); - pr_info("Dynamic Preempt: full\n"); + if (mode != preempt_dynamic_mode) + pr_info("Dynamic Preempt: full\n"); break; } preempt_dynamic_mode = mode; } +void sched_dynamic_update(int mode) +{ + mutex_lock(&sched_dynamic_mutex); + __sched_dynamic_update(mode); + mutex_unlock(&sched_dynamic_mutex); +} + +#ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL + +static int klp_cond_resched(void) +{ + __klp_sched_try_switch(); + return __cond_resched(); +} + +void sched_dynamic_klp_enable(void) +{ + mutex_lock(&sched_dynamic_mutex); + + klp_override = true; + static_call_update(cond_resched, klp_cond_resched); + + mutex_unlock(&sched_dynamic_mutex); +} + +void sched_dynamic_klp_disable(void) +{ + mutex_lock(&sched_dynamic_mutex); + + klp_override = false; + __sched_dynamic_update(preempt_dynamic_mode); + + mutex_unlock(&sched_dynamic_mutex); +} + +#endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ + static int __init setup_preempt_mode(char *str) { int mode = sched_dynamic_mode(str); @@ -9935,7 +10027,7 @@ void __init sched_init(void) /* * The boot idle thread does lazy MMU switching as well: */ - mmgrab(&init_mm); + mmgrab_lazy_tlb(&init_mm); enter_lazy_tlb(&init_mm, current); /* @@ -10332,7 +10424,7 @@ void sched_release_group(struct task_group *tg) spin_unlock_irqrestore(&task_group_lock, flags); } -static void sched_change_group(struct task_struct *tsk) +static struct task_group *sched_get_task_group(struct task_struct *tsk) { struct task_group *tg; @@ -10344,7 +10436,13 @@ static void sched_change_group(struct task_struct *tsk) tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), struct task_group, css); tg = autogroup_task_group(tsk, tg); - tsk->sched_task_group = tg; + + return tg; +} + +static void sched_change_group(struct task_struct *tsk, struct task_group *group) +{ + tsk->sched_task_group = group; #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) @@ -10365,10 +10463,19 @@ void sched_move_task(struct task_struct *tsk) { int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | 
DEQUEUE_NOCLOCK; + struct task_group *group; struct rq_flags rf; struct rq *rq; rq = task_rq_lock(tsk, &rf); + /* + * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous + * group changes. + */ + group = sched_get_task_group(tsk); + if (group == tsk->sched_task_group) + goto unlock; + update_rq_clock(rq); running = task_current(rq, tsk); @@ -10379,7 +10486,7 @@ void sched_move_task(struct task_struct *tsk) if (running) put_prev_task(rq, tsk); - sched_change_group(tsk); + sched_change_group(tsk, group); if (queued) enqueue_task(rq, tsk, queue_flags); @@ -10393,6 +10500,7 @@ void sched_move_task(struct task_struct *tsk) resched_curr(rq); } +unlock: task_rq_unlock(rq, tsk, &rf); } @@ -11383,45 +11491,524 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) } #ifdef CONFIG_SCHED_MM_CID -void sched_mm_cid_exit_signals(struct task_struct *t) + +/* + * @cid_lock: Guarantee forward-progress of cid allocation. + * + * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock + * is only used when contention is detected by the lock-free allocation so + * forward progress can be guaranteed. + */ +DEFINE_RAW_SPINLOCK(cid_lock); + +/* + * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock. + * + * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is + * detected, it is set to 1 to ensure that all newly coming allocations are + * serialized by @cid_lock until the allocation which detected contention + * completes and sets @use_cid_lock back to 0. This guarantees forward progress + * of a cid allocation. + */ +int use_cid_lock; + +/* + * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid + * concurrently with respect to the execution of the source runqueue context + * switch. + * + * There is one basic properties we want to guarantee here: + * + * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively + * used by a task. That would lead to concurrent allocation of the cid and + * userspace corruption. + * + * Provide this guarantee by introducing a Dekker memory ordering to guarantee + * that a pair of loads observe at least one of a pair of stores, which can be + * shown as: + * + * X = Y = 0 + * + * w[X]=1 w[Y]=1 + * MB MB + * r[Y]=y r[X]=x + * + * Which guarantees that x==0 && y==0 is impossible. But rather than using + * values 0 and 1, this algorithm cares about specific state transitions of the + * runqueue current task (as updated by the scheduler context switch), and the + * per-mm/cpu cid value. + * + * Let's introduce task (Y) which has task->mm == mm and task (N) which has + * task->mm != mm for the rest of the discussion. There are two scheduler state + * transitions on context switch we care about: + * + * (TSA) Store to rq->curr with transition from (N) to (Y) + * + * (TSB) Store to rq->curr with transition from (Y) to (N) + * + * On the remote-clear side, there is one transition we care about: + * + * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag + * + * There is also a transition to UNSET state which can be performed from all + * sides (scheduler, remote-clear). It is always performed with a cmpxchg which + * guarantees that only a single thread will succeed: + * + * (TMB) cmpxchg to *pcpu_cid to mark UNSET + * + * Just to be clear, what we do _not_ want to happen is a transition to UNSET + * when a thread is actively using the cid (property (1)). + * + * Let's looks at the relevant combinations of TSA/TSB, and TMA transitions. 
+ * + * Scenario A) (TSA)+(TMA) (from next task perspective) + * + * CPU0 CPU1 + * + * Context switch CS-1 Remote-clear + * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_id to LAZY (TMA) + * (implied barrier after cmpxchg) + * - switch_mm_cid() + * - memory barrier (see switch_mm_cid() + * comment explaining how this barrier + * is combined with other scheduler + * barriers) + * - mm_cid_get (next) + * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->curr) + * + * This Dekker ensures that either task (Y) is observed by the + * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are + * observed. + * + * If task (Y) store is observed by rcu_dereference(), it means that there is + * still an active task on the cpu. Remote-clear will therefore not transition + * to UNSET, which fulfills property (1). + * + * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(), + * it will move its state to UNSET, which clears the percpu cid perhaps + * uselessly (which is not an issue for correctness). Because task (Y) is not + * observed, CPU1 can move ahead to set the state to UNSET. Because moving + * state to UNSET is done with a cmpxchg expecting that the old state has the + * LAZY flag set, only one thread will successfully UNSET. + * + * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0 + * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and + * CPU1 will observe task (Y) and do nothing more, which is fine. + * + * What we are effectively preventing with this Dekker is a scenario where + * neither LAZY flag nor store (Y) are observed, which would fail property (1) + * because this would UNSET a cid which is actively used. + */ + +void sched_mm_cid_migrate_from(struct task_struct *t) +{ + t->migrate_from_cpu = task_cpu(t); +} + +static +int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq, + struct task_struct *t, + struct mm_cid *src_pcpu_cid) { struct mm_struct *mm = t->mm; - unsigned long flags; + struct task_struct *src_task; + int src_cid, last_mm_cid; + + if (!mm) + return -1; + + last_mm_cid = t->last_mm_cid; + /* + * If the migrated task has no last cid, or if the current + * task on src rq uses the cid, it means the source cid does not need + * to be moved to the destination cpu. + */ + if (last_mm_cid == -1) + return -1; + src_cid = READ_ONCE(src_pcpu_cid->cid); + if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid) + return -1; + + /* + * If we observe an active task using the mm on this rq, it means we + * are not the last task to be migrated from this cpu for this mm, so + * there is no need to move src_cid to the destination cpu. + */ + rcu_read_lock(); + src_task = rcu_dereference(src_rq->curr); + if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { + rcu_read_unlock(); + t->last_mm_cid = -1; + return -1; + } + rcu_read_unlock(); + + return src_cid; +} + +static +int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, + struct task_struct *t, + struct mm_cid *src_pcpu_cid, + int src_cid) +{ + struct task_struct *src_task; + struct mm_struct *mm = t->mm; + int lazy_cid; + + if (src_cid == -1) + return -1; + + /* + * Attempt to clear the source cpu cid to move it to the destination + * cpu. 
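The ordering argument above is the classic store-buffering (Dekker) pattern. The following stand-alone C11 sketch is illustrative only; plain atomics and threads stand in for the real rq->curr and per-mm/cpu cid state. It shows the property being relied on: with a full barrier between each store and the following load, the two readers cannot both observe 0.

/* Minimal store-buffering sketch: with full (seq_cst) fences,
 * r_x == 0 && r_y == 0 is impossible, which is the property the
 * rq->curr / pcpu_cid ordering above relies on.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <threads.h>

static atomic_int x, y;
static int r_x, r_y;

static int cpu0(void *arg)
{
	(void)arg;
	atomic_store_explicit(&x, 1, memory_order_relaxed);	/* w[X]=1 */
	atomic_thread_fence(memory_order_seq_cst);		/* MB     */
	r_y = atomic_load_explicit(&y, memory_order_relaxed);	/* r[Y]   */
	return 0;
}

static int cpu1(void *arg)
{
	(void)arg;
	atomic_store_explicit(&y, 1, memory_order_relaxed);	/* w[Y]=1 */
	atomic_thread_fence(memory_order_seq_cst);		/* MB     */
	r_x = atomic_load_explicit(&x, memory_order_relaxed);	/* r[X]   */
	return 0;
}

int main(void)
{
	thrd_t t0, t1;

	thrd_create(&t0, cpu0, NULL);
	thrd_create(&t1, cpu1, NULL);
	thrd_join(t0, NULL);
	thrd_join(t1, NULL);

	/* Never prints: at least one side observes the other's store. */
	if (r_x == 0 && r_y == 0)
		printf("impossible outcome observed\n");
	return 0;
}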
+ */ + lazy_cid = mm_cid_set_lazy_put(src_cid); + if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid)) + return -1; + + /* + * The implicit barrier after cmpxchg per-mm/cpu cid before loading + * rq->curr->mm matches the scheduler barrier in context_switch() + * between store to rq->curr and load of prev and next task's + * per-mm/cpu cid. + * + * The implicit barrier after cmpxchg per-mm/cpu cid before loading + * rq->curr->mm_cid_active matches the barrier in + * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and + * sched_mm_cid_after_execve() between store to t->mm_cid_active and + * load of per-mm/cpu cid. + */ + + /* + * If we observe an active task using the mm on this rq after setting + * the lazy-put flag, this task will be responsible for transitioning + * from lazy-put flag set to MM_CID_UNSET. + */ + rcu_read_lock(); + src_task = rcu_dereference(src_rq->curr); + if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { + rcu_read_unlock(); + /* + * We observed an active task for this mm, there is therefore + * no point in moving this cid to the destination cpu. + */ + t->last_mm_cid = -1; + return -1; + } + rcu_read_unlock(); + + /* + * The src_cid is unused, so it can be unset. + */ + if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) + return -1; + return src_cid; +} + +/* + * Migration to dst cpu. Called with dst_rq lock held. + * Interrupts are disabled, which keeps the window of cid ownership without the + * source rq lock held small. + */ +void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) +{ + struct mm_cid *src_pcpu_cid, *dst_pcpu_cid; + struct mm_struct *mm = t->mm; + int src_cid, dst_cid, src_cpu; + struct rq *src_rq; + + lockdep_assert_rq_held(dst_rq); if (!mm) return; + src_cpu = t->migrate_from_cpu; + if (src_cpu == -1) { + t->last_mm_cid = -1; + return; + } + /* + * Move the src cid if the dst cid is unset. This keeps id + * allocation closest to 0 in cases where few threads migrate around + * many cpus. + * + * If destination cid is already set, we may have to just clear + * the src cid to ensure compactness in frequent migrations + * scenarios. + * + * It is not useful to clear the src cid when the number of threads is + * greater or equal to the number of allowed cpus, because user-space + * can expect that the number of allowed cids can reach the number of + * allowed cpus. + */ + dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq)); + dst_cid = READ_ONCE(dst_pcpu_cid->cid); + if (!mm_cid_is_unset(dst_cid) && + atomic_read(&mm->mm_users) >= t->nr_cpus_allowed) + return; + src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu); + src_rq = cpu_rq(src_cpu); + src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid); + if (src_cid == -1) + return; + src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid, + src_cid); + if (src_cid == -1) + return; + if (!mm_cid_is_unset(dst_cid)) { + __mm_cid_put(mm, src_cid); + return; + } + /* Move src_cid to dst cpu. */ + mm_cid_snapshot_time(dst_rq, mm); + WRITE_ONCE(dst_pcpu_cid->cid, src_cid); +} + +static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid, + int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct task_struct *t; + unsigned long flags; + int cid, lazy_cid; + + cid = READ_ONCE(pcpu_cid->cid); + if (!mm_cid_is_valid(cid)) + return; + + /* + * Clear the cpu cid if it is set to keep cid allocation compact. 
If + * there happens to be other tasks left on the source cpu using this + * mm, the next task using this mm will reallocate its cid on context + * switch. + */ + lazy_cid = mm_cid_set_lazy_put(cid); + if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid)) + return; + + /* + * The implicit barrier after cmpxchg per-mm/cpu cid before loading + * rq->curr->mm matches the scheduler barrier in context_switch() + * between store to rq->curr and load of prev and next task's + * per-mm/cpu cid. + * + * The implicit barrier after cmpxchg per-mm/cpu cid before loading + * rq->curr->mm_cid_active matches the barrier in + * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and + * sched_mm_cid_after_execve() between store to t->mm_cid_active and + * load of per-mm/cpu cid. + */ + + /* + * If we observe an active task using the mm on this rq after setting + * the lazy-put flag, that task will be responsible for transitioning + * from lazy-put flag set to MM_CID_UNSET. + */ + rcu_read_lock(); + t = rcu_dereference(rq->curr); + if (READ_ONCE(t->mm_cid_active) && t->mm == mm) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + + /* + * The cid is unused, so it can be unset. + * Disable interrupts to keep the window of cid ownership without rq + * lock small. + */ local_irq_save(flags); - mm_cid_put(mm, t->mm_cid); - t->mm_cid = -1; - t->mm_cid_active = 0; + if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) + __mm_cid_put(mm, cid); local_irq_restore(flags); } +static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct mm_cid *pcpu_cid; + struct task_struct *curr; + u64 rq_clock; + + /* + * rq->clock load is racy on 32-bit but one spurious clear once in a + * while is irrelevant. + */ + rq_clock = READ_ONCE(rq->clock); + pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); + + /* + * In order to take care of infrequently scheduled tasks, bump the time + * snapshot associated with this cid if an active task using the mm is + * observed on this rq. 
+ */ + rcu_read_lock(); + curr = rcu_dereference(rq->curr); + if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { + WRITE_ONCE(pcpu_cid->time, rq_clock); + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + + if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS) + return; + sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); +} + +static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu, + int weight) +{ + struct mm_cid *pcpu_cid; + int cid; + + pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); + cid = READ_ONCE(pcpu_cid->cid); + if (!mm_cid_is_valid(cid) || cid < weight) + return; + sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); +} + +static void task_mm_cid_work(struct callback_head *work) +{ + unsigned long now = jiffies, old_scan, next_scan; + struct task_struct *t = current; + struct cpumask *cidmask; + struct mm_struct *mm; + int weight, cpu; + + SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work)); + + work->next = work; /* Prevent double-add */ + if (t->flags & PF_EXITING) + return; + mm = t->mm; + if (!mm) + return; + old_scan = READ_ONCE(mm->mm_cid_next_scan); + next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY); + if (!old_scan) { + unsigned long res; + + res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan); + if (res != old_scan) + old_scan = res; + else + old_scan = next_scan; + } + if (time_before(now, old_scan)) + return; + if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan)) + return; + cidmask = mm_cidmask(mm); + /* Clear cids that were not recently used. */ + for_each_possible_cpu(cpu) + sched_mm_cid_remote_clear_old(mm, cpu); + weight = cpumask_weight(cidmask); + /* + * Clear cids that are greater or equal to the cidmask weight to + * recompact it. + */ + for_each_possible_cpu(cpu) + sched_mm_cid_remote_clear_weight(mm, cpu, weight); +} + +void init_sched_mm_cid(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + int mm_users = 0; + + if (mm) { + mm_users = atomic_read(&mm->mm_users); + if (mm_users == 1) + mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY); + } + t->cid_work.next = &t->cid_work; /* Protect against double add */ + init_task_work(&t->cid_work, task_mm_cid_work); +} + +void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) +{ + struct callback_head *work = &curr->cid_work; + unsigned long now = jiffies; + + if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || + work->next != work) + return; + if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) + return; + task_work_add(curr, work, TWA_RESUME); +} + +void sched_mm_cid_exit_signals(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + struct rq_flags rf; + struct rq *rq; + + if (!mm) + return; + + preempt_disable(); + rq = this_rq(); + rq_lock_irqsave(rq, &rf); + preempt_enable_no_resched(); /* holding spinlock */ + WRITE_ONCE(t->mm_cid_active, 0); + /* + * Store t->mm_cid_active before loading per-mm/cpu cid. + * Matches barrier in sched_mm_cid_remote_clear_old(). 
+ */ + smp_mb(); + mm_cid_put(mm); + t->last_mm_cid = t->mm_cid = -1; + rq_unlock_irqrestore(rq, &rf); +} + void sched_mm_cid_before_execve(struct task_struct *t) { struct mm_struct *mm = t->mm; - unsigned long flags; + struct rq_flags rf; + struct rq *rq; if (!mm) return; - local_irq_save(flags); - mm_cid_put(mm, t->mm_cid); - t->mm_cid = -1; - t->mm_cid_active = 0; - local_irq_restore(flags); + + preempt_disable(); + rq = this_rq(); + rq_lock_irqsave(rq, &rf); + preempt_enable_no_resched(); /* holding spinlock */ + WRITE_ONCE(t->mm_cid_active, 0); + /* + * Store t->mm_cid_active before loading per-mm/cpu cid. + * Matches barrier in sched_mm_cid_remote_clear_old(). + */ + smp_mb(); + mm_cid_put(mm); + t->last_mm_cid = t->mm_cid = -1; + rq_unlock_irqrestore(rq, &rf); } void sched_mm_cid_after_execve(struct task_struct *t) { struct mm_struct *mm = t->mm; - unsigned long flags; + struct rq_flags rf; + struct rq *rq; if (!mm) return; - local_irq_save(flags); - t->mm_cid = mm_cid_get(mm); - t->mm_cid_active = 1; - local_irq_restore(flags); + + preempt_disable(); + rq = this_rq(); + rq_lock_irqsave(rq, &rf); + preempt_enable_no_resched(); /* holding spinlock */ + WRITE_ONCE(t->mm_cid_active, 1); + /* + * Store t->mm_cid_active before loading per-mm/cpu cid. + * Matches barrier in sched_mm_cid_remote_clear_old(). + */ + smp_mb(); + t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm); + rq_unlock_irqrestore(rq, &rf); rseq_set_notify_resume(t); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 71b24371a6f7..5a9a4b81c972 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2246,6 +2246,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) || task_on_cpu(rq, task) || !dl_task(task) || + is_migration_disabled(task) || !task_on_rq_queued(task))) { double_unlock_balance(rq, later_rq); later_rq = NULL; @@ -2704,6 +2705,13 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, #endif } +#ifdef CONFIG_SCHED_CORE +static int task_is_throttled_dl(struct task_struct *p, int cpu) +{ + return p->dl.dl_throttled; +} +#endif + DEFINE_SCHED_CLASS(dl) = { .enqueue_task = enqueue_task_dl, @@ -2736,6 +2744,9 @@ DEFINE_SCHED_CLASS(dl) = { .switched_to = switched_to_dl, .update_curr = update_curr_dl, +#ifdef CONFIG_SCHED_CORE + .task_is_throttled = task_is_throttled_dl, +#endif }; /* Used for dl_bw check and update, used under sched_rt_handler()::mutex */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 1637b65ba07a..0b2340a79b65 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -280,6 +280,45 @@ static const struct file_operations sched_dynamic_fops = { __read_mostly bool sched_debug_verbose; +#ifdef CONFIG_SMP +static struct dentry *sd_dentry; + + +static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + ssize_t result; + bool orig; + + cpus_read_lock(); + mutex_lock(&sched_domains_mutex); + + orig = sched_debug_verbose; + result = debugfs_write_file_bool(filp, ubuf, cnt, ppos); + + if (sched_debug_verbose && !orig) + update_sched_domain_debugfs(); + else if (!sched_debug_verbose && orig) { + debugfs_remove(sd_dentry); + sd_dentry = NULL; + } + + mutex_unlock(&sched_domains_mutex); + cpus_read_unlock(); + + return result; +} +#else +#define sched_verbose_write debugfs_write_file_bool +#endif + +static const struct file_operations sched_verbose_fops = { + .read = debugfs_read_file_bool, + .write = 
sched_verbose_write, + .open = simple_open, + .llseek = default_llseek, +}; + static const struct seq_operations sched_debug_sops; static int sched_debug_open(struct inode *inode, struct file *filp) @@ -303,7 +342,7 @@ static __init int sched_init_debug(void) debugfs_sched = debugfs_create_dir("sched", NULL); debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops); - debugfs_create_bool("verbose", 0644, debugfs_sched, &sched_debug_verbose); + debugfs_create_file_unsafe("verbose", 0644, debugfs_sched, &sched_debug_verbose, &sched_verbose_fops); #ifdef CONFIG_PREEMPT_DYNAMIC debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); #endif @@ -345,7 +384,6 @@ late_initcall(sched_init_debug); #ifdef CONFIG_SMP static cpumask_var_t sd_sysctl_cpus; -static struct dentry *sd_dentry; static int sd_flags_show(struct seq_file *m, void *v) { @@ -402,15 +440,23 @@ void update_sched_domain_debugfs(void) if (!debugfs_sched) return; + if (!sched_debug_verbose) + return; + if (!cpumask_available(sd_sysctl_cpus)) { if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) return; cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); } - if (!sd_dentry) + if (!sd_dentry) { sd_dentry = debugfs_create_dir("domains", debugfs_sched); + /* rebuild sd_sysctl_cpus if empty since it gets cleared below */ + if (cpumask_empty(sd_sysctl_cpus)) + cpumask_copy(sd_sysctl_cpus, cpu_online_mask); + } + for_each_cpu(cpu, sd_sysctl_cpus) { struct sched_domain *sd; struct dentry *d_cpu; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5f6587d94c1d..373ff5f55884 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2928,6 +2928,24 @@ static void reset_ptenuma_scan(struct task_struct *p) p->mm->numa_scan_offset = 0; } +static bool vma_is_accessed(struct vm_area_struct *vma) +{ + unsigned long pids; + /* + * Allow unconditional access first two times, so that all the (pages) + * of VMAs get prot_none fault introduced irrespective of accesses. + * This is also done to avoid any side effect of task scanning + * amplifying the unfairness of disjoint set of VMAs' access. + */ + if (READ_ONCE(current->mm->numa_scan_seq) < 2) + return true; + + pids = vma->numab_state->access_pids[0] | vma->numab_state->access_pids[1]; + return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids); +} + +#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay) + /* * The expensive part of numa migration is done from task_work context. * Triggered from task_tick_numa(). @@ -3027,6 +3045,45 @@ static void task_numa_work(struct callback_head *work) if (!vma_is_accessible(vma)) continue; + /* Initialise new per-VMA NUMAB state. */ + if (!vma->numab_state) { + vma->numab_state = kzalloc(sizeof(struct vma_numab_state), + GFP_KERNEL); + if (!vma->numab_state) + continue; + + vma->numab_state->next_scan = now + + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); + + /* Reset happens after 4 times scan delay of scan start */ + vma->numab_state->next_pid_reset = vma->numab_state->next_scan + + msecs_to_jiffies(VMA_PID_RESET_PERIOD); + } + + /* + * Scanning the VMA's of short lived tasks add more overhead. So + * delay the scan for new VMAs. + */ + if (mm->numa_scan_seq && time_before(jiffies, + vma->numab_state->next_scan)) + continue; + + /* Do not scan the VMA if task has not accessed */ + if (!vma_is_accessed(vma)) + continue; + + /* + * RESET access PIDs regularly for old VMAs. Resetting after checking + * vma for recent access to avoid clearing PID info before access.. 
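For reference, the access filter in vma_is_accessed() above amounts to hashing the accessing PID into one bit of two rotating bitmask windows. Here is a stand-alone user-space approximation: hash_pid() stands in for the kernel's hash_32(), a 64-bit long is assumed, and recording into the second slot is an assumption based on the reset logic shown below.

/* User-space approximation of the per-VMA access-PID filter: a task's PID
 * is hashed to one bit of a 64-bit window; the VMA counts as accessed by
 * that task if its bit is set in either the current or the previous window.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for the kernel's hash_32(val, bits). */
static unsigned int hash_pid(uint32_t pid, unsigned int bits)
{
	return (uint32_t)(pid * 0x61C88647u) >> (32 - bits);
}

static unsigned long access_pids[2];	/* [1] current window, [0] previous window (assumed) */

static void record_access(uint32_t pid)
{
	access_pids[1] |= 1UL << hash_pid(pid, 6);	/* ilog2(BITS_PER_LONG) == 6 on 64-bit */
}

static bool vma_was_accessed(uint32_t pid)
{
	unsigned long pids = access_pids[0] | access_pids[1];

	return pids & (1UL << hash_pid(pid, 6));
}

/* Periodic reset: the current window becomes the previous one. */
static void reset_window(void)
{
	access_pids[0] = access_pids[1];
	access_pids[1] = 0;
}

int main(void)
{
	record_access(1234);
	reset_window();
	printf("pid 1234 accessed recently: %d\n", vma_was_accessed(1234));
	return 0;
}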
+ */ + if (mm->numa_scan_seq && + time_after(jiffies, vma->numab_state->next_pid_reset)) { + vma->numab_state->next_pid_reset = vma->numab_state->next_pid_reset + + msecs_to_jiffies(VMA_PID_RESET_PERIOD); + vma->numab_state->access_pids[0] = READ_ONCE(vma->numab_state->access_pids[1]); + vma->numab_state->access_pids[1] = 0; + } + do { start = max(start, vma->vm_start); end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); @@ -5959,6 +6016,10 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); cfs_b->period_timer.function = sched_cfs_period_timer; + + /* Add a random offset so that timers interleave */ + hrtimer_set_expires(&cfs_b->period_timer, + get_random_u32_below(cfs_b->period)); hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->slack_timer.function = sched_cfs_slack_timer; cfs_b->slack_started = false; @@ -6614,7 +6675,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); schedstat_inc(p->stats.nr_wakeups_affine_attempts); - if (target == nr_cpumask_bits) + if (target != this_cpu) return prev_cpu; schedstat_inc(sd->ttwu_move_affine); @@ -11976,6 +12037,18 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, return delta > 0; } + +static int task_is_throttled_fair(struct task_struct *p, int cpu) +{ + struct cfs_rq *cfs_rq; + +#ifdef CONFIG_FAIR_GROUP_SCHED + cfs_rq = task_group(p)->cfs_rq[cpu]; +#else + cfs_rq = &cpu_rq(cpu)->cfs; +#endif + return throttled_hierarchy(cfs_rq); +} #else static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} #endif @@ -12602,6 +12675,10 @@ DEFINE_SCHED_CLASS(fair) = { .task_change_group = task_change_group_fair, #endif +#ifdef CONFIG_SCHED_CORE + .task_is_throttled = task_is_throttled_fair, +#endif + #ifdef CONFIG_UCLAMP_TASK .uclamp_enabled = 1, #endif diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index e9ef66be2870..342f58a329f5 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -75,7 +75,7 @@ static noinline int __cpuidle cpu_idle_poll(void) void __weak arch_cpu_idle_prepare(void) { } void __weak arch_cpu_idle_enter(void) { } void __weak arch_cpu_idle_exit(void) { } -void __weak arch_cpu_idle_dead(void) { } +void __weak __noreturn arch_cpu_idle_dead(void) { while (1); } void __weak arch_cpu_idle(void) { cpu_idle_force_poll = 1; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 02e011cabe91..e072f6b31bf3 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -186,17 +186,22 @@ static void group_init(struct psi_group *group) seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); group->avg_last_update = sched_clock(); group->avg_next_update = group->avg_last_update + psi_period; - INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); mutex_init(&group->avgs_lock); - /* Init trigger-related members */ - atomic_set(&group->poll_scheduled, 0); - mutex_init(&group->trigger_lock); - INIT_LIST_HEAD(&group->triggers); - group->poll_min_period = U32_MAX; - group->polling_next_update = ULLONG_MAX; - init_waitqueue_head(&group->poll_wait); - timer_setup(&group->poll_timer, poll_timer_fn, 0); - rcu_assign_pointer(group->poll_task, NULL); + + /* Init avg trigger-related members */ + INIT_LIST_HEAD(&group->avg_triggers); + memset(group->avg_nr_triggers, 0, sizeof(group->avg_nr_triggers)); + INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); + + /* Init 
rtpoll trigger-related members */ + atomic_set(&group->rtpoll_scheduled, 0); + mutex_init(&group->rtpoll_trigger_lock); + INIT_LIST_HEAD(&group->rtpoll_triggers); + group->rtpoll_min_period = U32_MAX; + group->rtpoll_next_update = ULLONG_MAX; + init_waitqueue_head(&group->rtpoll_wait); + timer_setup(&group->rtpoll_timer, poll_timer_fn, 0); + rcu_assign_pointer(group->rtpoll_task, NULL); } void __init psi_init(void) @@ -384,92 +389,6 @@ static void collect_percpu_times(struct psi_group *group, *pchanged_states = changed_states; } -static u64 update_averages(struct psi_group *group, u64 now) -{ - unsigned long missed_periods = 0; - u64 expires, period; - u64 avg_next_update; - int s; - - /* avgX= */ - expires = group->avg_next_update; - if (now - expires >= psi_period) - missed_periods = div_u64(now - expires, psi_period); - - /* - * The periodic clock tick can get delayed for various - * reasons, especially on loaded systems. To avoid clock - * drift, we schedule the clock in fixed psi_period intervals. - * But the deltas we sample out of the per-cpu buckets above - * are based on the actual time elapsing between clock ticks. - */ - avg_next_update = expires + ((1 + missed_periods) * psi_period); - period = now - (group->avg_last_update + (missed_periods * psi_period)); - group->avg_last_update = now; - - for (s = 0; s < NR_PSI_STATES - 1; s++) { - u32 sample; - - sample = group->total[PSI_AVGS][s] - group->avg_total[s]; - /* - * Due to the lockless sampling of the time buckets, - * recorded time deltas can slip into the next period, - * which under full pressure can result in samples in - * excess of the period length. - * - * We don't want to report non-sensical pressures in - * excess of 100%, nor do we want to drop such events - * on the floor. Instead we punt any overage into the - * future until pressure subsides. By doing this we - * don't underreport the occurring pressure curve, we - * just report it delayed by one period length. - * - * The error isn't cumulative. As soon as another - * delta slips from a period P to P+1, by definition - * it frees up its time T in P. - */ - if (sample > period) - sample = period; - group->avg_total[s] += sample; - calc_avgs(group->avg[s], missed_periods, sample, period); - } - - return avg_next_update; -} - -static void psi_avgs_work(struct work_struct *work) -{ - struct delayed_work *dwork; - struct psi_group *group; - u32 changed_states; - u64 now; - - dwork = to_delayed_work(work); - group = container_of(dwork, struct psi_group, avgs_work); - - mutex_lock(&group->avgs_lock); - - now = sched_clock(); - - collect_percpu_times(group, PSI_AVGS, &changed_states); - /* - * If there is task activity, periodically fold the per-cpu - * times and feed samples into the running averages. If things - * are idle and there is no data to process, stop the clock. - * Once restarted, we'll catch up the running averages in one - * go - see calc_avgs() and missed_periods. 
- */ - if (now >= group->avg_next_update) - group->avg_next_update = update_averages(group, now); - - if (changed_states & PSI_STATE_RESCHEDULE) { - schedule_delayed_work(dwork, nsecs_to_jiffies( - group->avg_next_update - now) + 1); - } - - mutex_unlock(&group->avgs_lock); -} - /* Trigger tracking window manipulations */ static void window_reset(struct psi_window *win, u64 now, u64 value, u64 prev_growth) @@ -516,33 +435,32 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value) return growth; } -static void init_triggers(struct psi_group *group, u64 now) -{ - struct psi_trigger *t; - - list_for_each_entry(t, &group->triggers, node) - window_reset(&t->win, now, - group->total[PSI_POLL][t->state], 0); - memcpy(group->polling_total, group->total[PSI_POLL], - sizeof(group->polling_total)); - group->polling_next_update = now + group->poll_min_period; -} - -static u64 update_triggers(struct psi_group *group, u64 now) +static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total, + enum psi_aggregators aggregator) { struct psi_trigger *t; - bool update_total = false; - u64 *total = group->total[PSI_POLL]; + u64 *total = group->total[aggregator]; + struct list_head *triggers; + u64 *aggregator_total; + *update_total = false; + + if (aggregator == PSI_AVGS) { + triggers = &group->avg_triggers; + aggregator_total = group->avg_total; + } else { + triggers = &group->rtpoll_triggers; + aggregator_total = group->rtpoll_total; + } /* * On subsequent updates, calculate growth deltas and let * watchers know when their specified thresholds are exceeded. */ - list_for_each_entry(t, &group->triggers, node) { + list_for_each_entry(t, triggers, node) { u64 growth; bool new_stall; - new_stall = group->polling_total[t->state] != total[t->state]; + new_stall = aggregator_total[t->state] != total[t->state]; /* Check for stall activity or a previous threshold breach */ if (!new_stall && !t->pending_event) @@ -560,7 +478,7 @@ static u64 update_triggers(struct psi_group *group, u64 now) * been through all of them. Also remember to extend the * polling time if we see new stall activity. */ - update_total = true; + *update_total = true; /* Calculate growth since last update */ growth = window_update(&t->win, now, total[t->state]); @@ -583,52 +501,150 @@ static u64 update_triggers(struct psi_group *group, u64 now) t->pending_event = false; } - if (update_total) - memcpy(group->polling_total, total, - sizeof(group->polling_total)); + return now + group->rtpoll_min_period; +} + +static u64 update_averages(struct psi_group *group, u64 now) +{ + unsigned long missed_periods = 0; + u64 expires, period; + u64 avg_next_update; + int s; + + /* avgX= */ + expires = group->avg_next_update; + if (now - expires >= psi_period) + missed_periods = div_u64(now - expires, psi_period); + + /* + * The periodic clock tick can get delayed for various + * reasons, especially on loaded systems. To avoid clock + * drift, we schedule the clock in fixed psi_period intervals. + * But the deltas we sample out of the per-cpu buckets above + * are based on the actual time elapsing between clock ticks. 
+ */ + avg_next_update = expires + ((1 + missed_periods) * psi_period); + period = now - (group->avg_last_update + (missed_periods * psi_period)); + group->avg_last_update = now; + + for (s = 0; s < NR_PSI_STATES - 1; s++) { + u32 sample; + + sample = group->total[PSI_AVGS][s] - group->avg_total[s]; + /* + * Due to the lockless sampling of the time buckets, + * recorded time deltas can slip into the next period, + * which under full pressure can result in samples in + * excess of the period length. + * + * We don't want to report non-sensical pressures in + * excess of 100%, nor do we want to drop such events + * on the floor. Instead we punt any overage into the + * future until pressure subsides. By doing this we + * don't underreport the occurring pressure curve, we + * just report it delayed by one period length. + * + * The error isn't cumulative. As soon as another + * delta slips from a period P to P+1, by definition + * it frees up its time T in P. + */ + if (sample > period) + sample = period; + group->avg_total[s] += sample; + calc_avgs(group->avg[s], missed_periods, sample, period); + } + + return avg_next_update; +} + +static void psi_avgs_work(struct work_struct *work) +{ + struct delayed_work *dwork; + struct psi_group *group; + u32 changed_states; + bool update_total; + u64 now; + + dwork = to_delayed_work(work); + group = container_of(dwork, struct psi_group, avgs_work); + + mutex_lock(&group->avgs_lock); + + now = sched_clock(); + + collect_percpu_times(group, PSI_AVGS, &changed_states); + /* + * If there is task activity, periodically fold the per-cpu + * times and feed samples into the running averages. If things + * are idle and there is no data to process, stop the clock. + * Once restarted, we'll catch up the running averages in one + * go - see calc_avgs() and missed_periods. + */ + if (now >= group->avg_next_update) { + update_triggers(group, now, &update_total, PSI_AVGS); + group->avg_next_update = update_averages(group, now); + } + + if (changed_states & PSI_STATE_RESCHEDULE) { + schedule_delayed_work(dwork, nsecs_to_jiffies( + group->avg_next_update - now) + 1); + } - return now + group->poll_min_period; + mutex_unlock(&group->avgs_lock); +} + +static void init_rtpoll_triggers(struct psi_group *group, u64 now) +{ + struct psi_trigger *t; + + list_for_each_entry(t, &group->rtpoll_triggers, node) + window_reset(&t->win, now, + group->total[PSI_POLL][t->state], 0); + memcpy(group->rtpoll_total, group->total[PSI_POLL], + sizeof(group->rtpoll_total)); + group->rtpoll_next_update = now + group->rtpoll_min_period; } /* Schedule polling if it's not already scheduled or forced. */ -static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay, +static void psi_schedule_rtpoll_work(struct psi_group *group, unsigned long delay, bool force) { struct task_struct *task; /* * atomic_xchg should be called even when !force to provide a - * full memory barrier (see the comment inside psi_poll_work). + * full memory barrier (see the comment inside psi_rtpoll_work). 
*/ - if (atomic_xchg(&group->poll_scheduled, 1) && !force) + if (atomic_xchg(&group->rtpoll_scheduled, 1) && !force) return; rcu_read_lock(); - task = rcu_dereference(group->poll_task); + task = rcu_dereference(group->rtpoll_task); /* * kworker might be NULL in case psi_trigger_destroy races with * psi_task_change (hotpath) which can't use locks */ if (likely(task)) - mod_timer(&group->poll_timer, jiffies + delay); + mod_timer(&group->rtpoll_timer, jiffies + delay); else - atomic_set(&group->poll_scheduled, 0); + atomic_set(&group->rtpoll_scheduled, 0); rcu_read_unlock(); } -static void psi_poll_work(struct psi_group *group) +static void psi_rtpoll_work(struct psi_group *group) { bool force_reschedule = false; u32 changed_states; + bool update_total; u64 now; - mutex_lock(&group->trigger_lock); + mutex_lock(&group->rtpoll_trigger_lock); now = sched_clock(); - if (now > group->polling_until) { + if (now > group->rtpoll_until) { /* * We are either about to start or might stop polling if no * state change was recorded. Resetting poll_scheduled leaves @@ -638,7 +654,7 @@ static void psi_poll_work(struct psi_group *group) * should be negligible and polling_next_update still keeps * updates correctly on schedule. */ - atomic_set(&group->poll_scheduled, 0); + atomic_set(&group->rtpoll_scheduled, 0); /* * A task change can race with the poll worker that is supposed to * report on it. To avoid missing events, ensure ordering between @@ -667,60 +683,64 @@ static void psi_poll_work(struct psi_group *group) collect_percpu_times(group, PSI_POLL, &changed_states); - if (changed_states & group->poll_states) { + if (changed_states & group->rtpoll_states) { /* Initialize trigger windows when entering polling mode */ - if (now > group->polling_until) - init_triggers(group, now); + if (now > group->rtpoll_until) + init_rtpoll_triggers(group, now); /* * Keep the monitor active for at least the duration of the * minimum tracking window as long as monitor states are * changing. 
*/ - group->polling_until = now + - group->poll_min_period * UPDATES_PER_WINDOW; + group->rtpoll_until = now + + group->rtpoll_min_period * UPDATES_PER_WINDOW; } - if (now > group->polling_until) { - group->polling_next_update = ULLONG_MAX; + if (now > group->rtpoll_until) { + group->rtpoll_next_update = ULLONG_MAX; goto out; } - if (now >= group->polling_next_update) - group->polling_next_update = update_triggers(group, now); + if (now >= group->rtpoll_next_update) { + group->rtpoll_next_update = update_triggers(group, now, &update_total, PSI_POLL); + if (update_total) + memcpy(group->rtpoll_total, group->total[PSI_POLL], + sizeof(group->rtpoll_total)); + } - psi_schedule_poll_work(group, - nsecs_to_jiffies(group->polling_next_update - now) + 1, + psi_schedule_rtpoll_work(group, + nsecs_to_jiffies(group->rtpoll_next_update - now) + 1, force_reschedule); out: - mutex_unlock(&group->trigger_lock); + mutex_unlock(&group->rtpoll_trigger_lock); } -static int psi_poll_worker(void *data) +static int psi_rtpoll_worker(void *data) { struct psi_group *group = (struct psi_group *)data; sched_set_fifo_low(current); while (true) { - wait_event_interruptible(group->poll_wait, - atomic_cmpxchg(&group->poll_wakeup, 1, 0) || + wait_event_interruptible(group->rtpoll_wait, + atomic_cmpxchg(&group->rtpoll_wakeup, 1, 0) || kthread_should_stop()); if (kthread_should_stop()) break; - psi_poll_work(group); + psi_rtpoll_work(group); } return 0; } static void poll_timer_fn(struct timer_list *t) { - struct psi_group *group = from_timer(group, t, poll_timer); + struct psi_group *group = from_timer(group, t, rtpoll_timer); - atomic_set(&group->poll_wakeup, 1); - wake_up_interruptible(&group->poll_wait); + atomic_set(&group->rtpoll_wakeup, 1); + wake_up_interruptible(&group->rtpoll_wait); } static void record_times(struct psi_group_cpu *groupc, u64 now) @@ -851,8 +871,8 @@ static void psi_group_change(struct psi_group *group, int cpu, write_seqcount_end(&groupc->seq); - if (state_mask & group->poll_states) - psi_schedule_poll_work(group, 1, false); + if (state_mask & group->rtpoll_states) + psi_schedule_rtpoll_work(group, 1, false); if (wake_clock && !delayed_work_pending(&group->avgs_work)) schedule_delayed_work(&group->avgs_work, PSI_FREQ); @@ -1005,8 +1025,8 @@ void psi_account_irqtime(struct task_struct *task, u32 delta) write_seqcount_end(&groupc->seq); - if (group->poll_states & (1 << PSI_IRQ_FULL)) - psi_schedule_poll_work(group, 1, false); + if (group->rtpoll_states & (1 << PSI_IRQ_FULL)) + psi_schedule_rtpoll_work(group, 1, false); } while ((group = group->parent)); } #endif @@ -1101,7 +1121,7 @@ void psi_cgroup_free(struct cgroup *cgroup) cancel_delayed_work_sync(&cgroup->psi->avgs_work); free_percpu(cgroup->psi->pcpu); /* All triggers must be removed by now */ - WARN_ONCE(cgroup->psi->poll_states, "psi: trigger leak\n"); + WARN_ONCE(cgroup->psi->rtpoll_states, "psi: trigger leak\n"); kfree(cgroup->psi); } @@ -1253,16 +1273,23 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) } struct psi_trigger *psi_trigger_create(struct psi_group *group, - char *buf, enum psi_res res) + char *buf, enum psi_res res, struct file *file) { struct psi_trigger *t; enum psi_states state; u32 threshold_us; + bool privileged; u32 window_us; if (static_branch_likely(&psi_disabled)) return ERR_PTR(-EOPNOTSUPP); + /* + * Checking the privilege here on file->f_cred implies that a privileged user + * could open the file and delegate the write to an unprivileged one. 
+ */ + privileged = cap_raised(file->f_cred->cap_effective, CAP_SYS_RESOURCE); + if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2) state = PSI_IO_SOME + res * 2; else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2) @@ -1282,6 +1309,13 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, window_us > WINDOW_MAX_US) return ERR_PTR(-EINVAL); + /* + * Unprivileged users can only use 2s windows so that averages aggregation + * work is used, and no RT threads need to be spawned. + */ + if (!privileged && window_us % 2000000) + return ERR_PTR(-EINVAL); + /* Check threshold */ if (threshold_us == 0 || threshold_us > window_us) return ERR_PTR(-EINVAL); @@ -1301,31 +1335,40 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->last_event_time = 0; init_waitqueue_head(&t->event_wait); t->pending_event = false; + t->aggregator = privileged ? PSI_POLL : PSI_AVGS; - mutex_lock(&group->trigger_lock); + if (privileged) { + mutex_lock(&group->rtpoll_trigger_lock); - if (!rcu_access_pointer(group->poll_task)) { - struct task_struct *task; + if (!rcu_access_pointer(group->rtpoll_task)) { + struct task_struct *task; - task = kthread_create(psi_poll_worker, group, "psimon"); - if (IS_ERR(task)) { - kfree(t); - mutex_unlock(&group->trigger_lock); - return ERR_CAST(task); + task = kthread_create(psi_rtpoll_worker, group, "psimon"); + if (IS_ERR(task)) { + kfree(t); + mutex_unlock(&group->rtpoll_trigger_lock); + return ERR_CAST(task); + } + atomic_set(&group->rtpoll_wakeup, 0); + wake_up_process(task); + rcu_assign_pointer(group->rtpoll_task, task); } - atomic_set(&group->poll_wakeup, 0); - wake_up_process(task); - rcu_assign_pointer(group->poll_task, task); - } - list_add(&t->node, &group->triggers); - group->poll_min_period = min(group->poll_min_period, - div_u64(t->win.size, UPDATES_PER_WINDOW)); - group->nr_triggers[t->state]++; - group->poll_states |= (1 << t->state); + list_add(&t->node, &group->rtpoll_triggers); + group->rtpoll_min_period = min(group->rtpoll_min_period, + div_u64(t->win.size, UPDATES_PER_WINDOW)); + group->rtpoll_nr_triggers[t->state]++; + group->rtpoll_states |= (1 << t->state); + + mutex_unlock(&group->rtpoll_trigger_lock); + } else { + mutex_lock(&group->avgs_lock); - mutex_unlock(&group->trigger_lock); + list_add(&t->node, &group->avg_triggers); + group->avg_nr_triggers[t->state]++; + mutex_unlock(&group->avgs_lock); + } return t; } @@ -1349,51 +1392,59 @@ void psi_trigger_destroy(struct psi_trigger *t) */ wake_up_pollfree(&t->event_wait); - mutex_lock(&group->trigger_lock); - - if (!list_empty(&t->node)) { - struct psi_trigger *tmp; - u64 period = ULLONG_MAX; - - list_del(&t->node); - group->nr_triggers[t->state]--; - if (!group->nr_triggers[t->state]) - group->poll_states &= ~(1 << t->state); - /* reset min update period for the remaining triggers */ - list_for_each_entry(tmp, &group->triggers, node) - period = min(period, div_u64(tmp->win.size, - UPDATES_PER_WINDOW)); - group->poll_min_period = period; - /* Destroy poll_task when the last trigger is destroyed */ - if (group->poll_states == 0) { - group->polling_until = 0; - task_to_destroy = rcu_dereference_protected( - group->poll_task, - lockdep_is_held(&group->trigger_lock)); - rcu_assign_pointer(group->poll_task, NULL); - del_timer(&group->poll_timer); + if (t->aggregator == PSI_AVGS) { + mutex_lock(&group->avgs_lock); + if (!list_empty(&t->node)) { + list_del(&t->node); + group->avg_nr_triggers[t->state]--; } + mutex_unlock(&group->avgs_lock); + } else { + 
mutex_lock(&group->rtpoll_trigger_lock); + if (!list_empty(&t->node)) { + struct psi_trigger *tmp; + u64 period = ULLONG_MAX; + + list_del(&t->node); + group->rtpoll_nr_triggers[t->state]--; + if (!group->rtpoll_nr_triggers[t->state]) + group->rtpoll_states &= ~(1 << t->state); + /* reset min update period for the remaining triggers */ + list_for_each_entry(tmp, &group->rtpoll_triggers, node) + period = min(period, div_u64(tmp->win.size, + UPDATES_PER_WINDOW)); + group->rtpoll_min_period = period; + /* Destroy rtpoll_task when the last trigger is destroyed */ + if (group->rtpoll_states == 0) { + group->rtpoll_until = 0; + task_to_destroy = rcu_dereference_protected( + group->rtpoll_task, + lockdep_is_held(&group->rtpoll_trigger_lock)); + rcu_assign_pointer(group->rtpoll_task, NULL); + del_timer(&group->rtpoll_timer); + } + } + mutex_unlock(&group->rtpoll_trigger_lock); } - mutex_unlock(&group->trigger_lock); - /* - * Wait for psi_schedule_poll_work RCU to complete its read-side + * Wait for psi_schedule_rtpoll_work RCU to complete its read-side * critical section before destroying the trigger and optionally the - * poll_task. + * rtpoll_task. */ synchronize_rcu(); /* - * Stop kthread 'psimon' after releasing trigger_lock to prevent a - * deadlock while waiting for psi_poll_work to acquire trigger_lock + * Stop kthread 'psimon' after releasing rtpoll_trigger_lock to prevent + * a deadlock while waiting for psi_rtpoll_work to acquire + * rtpoll_trigger_lock */ if (task_to_destroy) { /* * After the RCU grace period has expired, the worker - * can no longer be found through group->poll_task. + * can no longer be found through group->rtpoll_task. */ kthread_stop(task_to_destroy); - atomic_set(&group->poll_scheduled, 0); + atomic_set(&group->rtpoll_scheduled, 0); } kfree(t); } @@ -1435,27 +1486,19 @@ static int psi_cpu_show(struct seq_file *m, void *v) return psi_show(m, &psi_system, PSI_CPU); } -static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *)) -{ - if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) - return -EPERM; - - return single_open(file, psi_show, NULL); -} - static int psi_io_open(struct inode *inode, struct file *file) { - return psi_open(file, psi_io_show); + return single_open(file, psi_io_show, NULL); } static int psi_memory_open(struct inode *inode, struct file *file) { - return psi_open(file, psi_memory_show); + return single_open(file, psi_memory_show, NULL); } static int psi_cpu_open(struct inode *inode, struct file *file) { - return psi_open(file, psi_cpu_show); + return single_open(file, psi_cpu_show, NULL); } static ssize_t psi_write(struct file *file, const char __user *user_buf, @@ -1489,7 +1532,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, return -EBUSY; } - new = psi_trigger_create(&psi_system, buf, res); + new = psi_trigger_create(&psi_system, buf, res, file); if (IS_ERR(new)) { mutex_unlock(&seq->lock); return PTR_ERR(new); @@ -1569,7 +1612,7 @@ static int psi_irq_show(struct seq_file *m, void *v) static int psi_irq_open(struct inode *inode, struct file *file) { - return psi_open(file, psi_irq_show); + return single_open(file, psi_irq_show, NULL); } static ssize_t psi_irq_write(struct file *file, const char __user *user_buf, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0a11f44adee5..00e0e5074115 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2000,11 +2000,15 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) * the mean time, task could have 
* migrated already or had its affinity changed. * Also make sure that it wasn't scheduled on its rq. + * It is possible the task was scheduled, set + * "migrate_disabled" and then got preempted, so we must + * check the task migration disable flag here too. */ if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || task_on_cpu(rq, task) || !rt_task(task) || + is_migration_disabled(task) || !task_on_rq_queued(task))) { double_unlock_balance(rq, lowest_rq); @@ -2677,6 +2681,21 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) return 0; } +#ifdef CONFIG_SCHED_CORE +static int task_is_throttled_rt(struct task_struct *p, int cpu) +{ + struct rt_rq *rt_rq; + +#ifdef CONFIG_RT_GROUP_SCHED + rt_rq = task_group(p)->rt_rq[cpu]; +#else + rt_rq = &cpu_rq(cpu)->rt; +#endif + + return rt_rq_throttled(rt_rq); +} +#endif + DEFINE_SCHED_CLASS(rt) = { .enqueue_task = enqueue_task_rt, @@ -2710,6 +2729,10 @@ DEFINE_SCHED_CLASS(rt) = { .update_curr = update_curr_rt, +#ifdef CONFIG_SCHED_CORE + .task_is_throttled = task_is_throttled_rt, +#endif + #ifdef CONFIG_UCLAMP_TASK .uclamp_enabled = 1, #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3e8df6d31c1e..ec7b3e0a2b20 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2224,6 +2224,10 @@ struct sched_class { #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_change_group)(struct task_struct *p); #endif + +#ifdef CONFIG_SCHED_CORE + int (*task_is_throttled)(struct task_struct *p, int cpu); +#endif }; static inline void put_prev_task(struct rq *rq, struct task_struct *prev) @@ -3249,61 +3253,238 @@ static inline void update_current_exec_runtime(struct task_struct *curr, } #ifdef CONFIG_SCHED_MM_CID -static inline int __mm_cid_get(struct mm_struct *mm) + +#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */ +#define MM_CID_SCAN_DELAY 100 /* 100ms */ + +extern raw_spinlock_t cid_lock; +extern int use_cid_lock; + +extern void sched_mm_cid_migrate_from(struct task_struct *t); +extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t); +extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr); +extern void init_sched_mm_cid(struct task_struct *t); + +static inline void __mm_cid_put(struct mm_struct *mm, int cid) +{ + if (cid < 0) + return; + cpumask_clear_cpu(cid, mm_cidmask(mm)); +} + +/* + * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to + * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to + * be held to transition to other states. + * + * State transitions synchronized with cmpxchg or try_cmpxchg need to be + * consistent across cpus, which prevents use of this_cpu_cmpxchg. + */ +static inline void mm_cid_put_lazy(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; + int cid; + + lockdep_assert_irqs_disabled(); + cid = __this_cpu_read(pcpu_cid->cid); + if (!mm_cid_is_lazy_put(cid) || + !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) + return; + __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); +} + +static inline int mm_cid_pcpu_unset(struct mm_struct *mm) +{ + struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; + int cid, res; + + lockdep_assert_irqs_disabled(); + cid = __this_cpu_read(pcpu_cid->cid); + for (;;) { + if (mm_cid_is_unset(cid)) + return MM_CID_UNSET; + /* + * Attempt transition from valid or lazy-put to unset. 
+ */ + res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET); + if (res == cid) + break; + cid = res; + } + return cid; +} + +static inline void mm_cid_put(struct mm_struct *mm) +{ + int cid; + + lockdep_assert_irqs_disabled(); + cid = mm_cid_pcpu_unset(mm); + if (cid == MM_CID_UNSET) + return; + __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); +} + +static inline int __mm_cid_try_get(struct mm_struct *mm) { struct cpumask *cpumask; int cid; cpumask = mm_cidmask(mm); - cid = cpumask_first_zero(cpumask); - if (cid >= nr_cpu_ids) + /* + * Retry finding first zero bit if the mask is temporarily + * filled. This only happens during concurrent remote-clear + * which owns a cid without holding a rq lock. + */ + for (;;) { + cid = cpumask_first_zero(cpumask); + if (cid < nr_cpu_ids) + break; + cpu_relax(); + } + if (cpumask_test_and_set_cpu(cid, cpumask)) return -1; - __cpumask_set_cpu(cid, cpumask); return cid; } -static inline void mm_cid_put(struct mm_struct *mm, int cid) +/* + * Save a snapshot of the current runqueue time of this cpu + * with the per-cpu cid value, allowing to estimate how recently it was used. + */ +static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm) +{ + struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq)); + + lockdep_assert_rq_held(rq); + WRITE_ONCE(pcpu_cid->time, rq->clock); +} + +static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) { - lockdep_assert_irqs_disabled(); - if (cid < 0) - return; - raw_spin_lock(&mm->cid_lock); - __cpumask_clear_cpu(cid, mm_cidmask(mm)); - raw_spin_unlock(&mm->cid_lock); + int cid; + + /* + * All allocations (even those using the cid_lock) are lock-free. If + * use_cid_lock is set, hold the cid_lock to perform cid allocation to + * guarantee forward progress. + */ + if (!READ_ONCE(use_cid_lock)) { + cid = __mm_cid_try_get(mm); + if (cid >= 0) + goto end; + raw_spin_lock(&cid_lock); + } else { + raw_spin_lock(&cid_lock); + cid = __mm_cid_try_get(mm); + if (cid >= 0) + goto unlock; + } + + /* + * cid concurrently allocated. Retry while forcing following + * allocations to use the cid_lock to ensure forward progress. + */ + WRITE_ONCE(use_cid_lock, 1); + /* + * Set use_cid_lock before allocation. Only care about program order + * because this is only required for forward progress. + */ + barrier(); + /* + * Retry until it succeeds. It is guaranteed to eventually succeed once + * all newcoming allocations observe the use_cid_lock flag set. + */ + do { + cid = __mm_cid_try_get(mm); + cpu_relax(); + } while (cid < 0); + /* + * Allocate before clearing use_cid_lock. Only care about + * program order because this is for forward progress. 
+ */ + barrier(); + WRITE_ONCE(use_cid_lock, 0); +unlock: + raw_spin_unlock(&cid_lock); +end: + mm_cid_snapshot_time(rq, mm); + return cid; } -static inline int mm_cid_get(struct mm_struct *mm) +static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm) { - int ret; + struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; + struct cpumask *cpumask; + int cid; - lockdep_assert_irqs_disabled(); - raw_spin_lock(&mm->cid_lock); - ret = __mm_cid_get(mm); - raw_spin_unlock(&mm->cid_lock); - return ret; + lockdep_assert_rq_held(rq); + cpumask = mm_cidmask(mm); + cid = __this_cpu_read(pcpu_cid->cid); + if (mm_cid_is_valid(cid)) { + mm_cid_snapshot_time(rq, mm); + return cid; + } + if (mm_cid_is_lazy_put(cid)) { + if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) + __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); + } + cid = __mm_cid_get(rq, mm); + __this_cpu_write(pcpu_cid->cid, cid); + return cid; } -static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) +static inline void switch_mm_cid(struct rq *rq, + struct task_struct *prev, + struct task_struct *next) { + /* + * Provide a memory barrier between rq->curr store and load of + * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition. + * + * Should be adapted if context_switch() is modified. + */ + if (!next->mm) { // to kernel + /* + * user -> kernel transition does not guarantee a barrier, but + * we can use the fact that it performs an atomic operation in + * mmgrab(). + */ + if (prev->mm) // from user + smp_mb__after_mmgrab(); + /* + * kernel -> kernel transition does not change rq->curr->mm + * state. It stays NULL. + */ + } else { // to user + /* + * kernel -> user transition does not provide a barrier + * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu]. + * Provide it here. + */ + if (!prev->mm) // from kernel + smp_mb(); + /* + * user -> user transition guarantees a memory barrier through + * switch_mm() when current->mm changes. If current->mm is + * unchanged, no barrier is needed. + */ + } if (prev->mm_cid_active) { - if (next->mm_cid_active && next->mm == prev->mm) { - /* - * Context switch between threads in same mm, hand over - * the mm_cid from prev to next. 
- */ - next->mm_cid = prev->mm_cid; - prev->mm_cid = -1; - return; - } - mm_cid_put(prev->mm, prev->mm_cid); + mm_cid_snapshot_time(rq, prev->mm); + mm_cid_put_lazy(prev); prev->mm_cid = -1; } if (next->mm_cid_active) - next->mm_cid = mm_cid_get(next->mm); + next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm); } #else -static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { } +static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { } +static inline void sched_mm_cid_migrate_from(struct task_struct *t) { } +static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { } +static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } +static inline void init_sched_mm_cid(struct task_struct *t) { } #endif #endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h index 2eb23dd0f285..21ac44428bb0 100644 --- a/kernel/sched/smp.h +++ b/kernel/sched/smp.h @@ -6,7 +6,7 @@ extern void sched_ttwu_pending(void *arg); -extern void send_call_function_single_ipi(int cpu); +extern bool call_function_single_prep_ipi(int cpu); #ifdef CONFIG_SMP extern void flush_smp_call_function_queue(void); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 051aaf65c749..6682535e37c8 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -209,8 +209,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) DEFINE_STATIC_KEY_FALSE(sched_energy_present); static unsigned int sysctl_sched_energy_aware = 1; -DEFINE_MUTEX(sched_energy_mutex); -bool sched_energy_update; +static DEFINE_MUTEX(sched_energy_mutex); +static bool sched_energy_update; void rebuild_sched_domains_energy(void) { diff --git a/kernel/seccomp.c b/kernel/seccomp.c index cebf26445f9e..d3e584065c7f 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -2368,12 +2368,6 @@ static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, return ret; } -static struct ctl_path seccomp_sysctl_path[] = { - { .procname = "kernel", }, - { .procname = "seccomp", }, - { } -}; - static struct ctl_table seccomp_sysctl_table[] = { { .procname = "actions_avail", @@ -2392,14 +2386,7 @@ static struct ctl_table seccomp_sysctl_table[] = { static int __init seccomp_sysctl_init(void) { - struct ctl_table_header *hdr; - - hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table); - if (!hdr) - pr_warn("sysctl registration failed\n"); - else - kmemleak_not_leak(hdr); - + register_sysctl_init("kernel/seccomp", seccomp_sysctl_table); return 0; } diff --git a/kernel/signal.c b/kernel/signal.c index 8f6330f0e9ca..2547fa73bde5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1368,7 +1368,9 @@ int zap_other_threads(struct task_struct *p) while_each_thread(p, t) { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); - count++; + /* Don't require de_thread to wait for the vhost_worker */ + if ((t->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER) + count++; /* Don't bother with already dead threads */ if (t->exit_state) @@ -2861,11 +2863,11 @@ relock: } /* - * PF_IO_WORKER threads will catch and exit on fatal signals + * PF_USER_WORKER threads will catch and exit on fatal signals * themselves. They have cleanup that must be performed, so * we cannot call do_exit() on their behalf. 
*/ - if (current->flags & PF_IO_WORKER) + if (current->flags & PF_USER_WORKER) goto out; /* diff --git a/kernel/smp.c b/kernel/smp.c index 06a413987a14..ab3e5dad6cfe 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -26,68 +26,15 @@ #include <linux/sched/debug.h> #include <linux/jump_label.h> +#include <trace/events/ipi.h> + #include "smpboot.h" #include "sched/smp.h" #define CSD_TYPE(_csd) ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK) -#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG -union cfd_seq_cnt { - u64 val; - struct { - u64 src:16; - u64 dst:16; -#define CFD_SEQ_NOCPU 0xffff - u64 type:4; -#define CFD_SEQ_QUEUE 0 -#define CFD_SEQ_IPI 1 -#define CFD_SEQ_NOIPI 2 -#define CFD_SEQ_PING 3 -#define CFD_SEQ_PINGED 4 -#define CFD_SEQ_HANDLE 5 -#define CFD_SEQ_DEQUEUE 6 -#define CFD_SEQ_IDLE 7 -#define CFD_SEQ_GOTIPI 8 -#define CFD_SEQ_HDLEND 9 - u64 cnt:28; - } u; -}; - -static char *seq_type[] = { - [CFD_SEQ_QUEUE] = "queue", - [CFD_SEQ_IPI] = "ipi", - [CFD_SEQ_NOIPI] = "noipi", - [CFD_SEQ_PING] = "ping", - [CFD_SEQ_PINGED] = "pinged", - [CFD_SEQ_HANDLE] = "handle", - [CFD_SEQ_DEQUEUE] = "dequeue (src CPU 0 == empty)", - [CFD_SEQ_IDLE] = "idle", - [CFD_SEQ_GOTIPI] = "gotipi", - [CFD_SEQ_HDLEND] = "hdlend (src CPU 0 == early)", -}; - -struct cfd_seq_local { - u64 ping; - u64 pinged; - u64 handle; - u64 dequeue; - u64 idle; - u64 gotipi; - u64 hdlend; -}; -#endif - -struct cfd_percpu { - call_single_data_t csd; -#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG - u64 seq_queue; - u64 seq_ipi; - u64 seq_noipi; -#endif -}; - struct call_function_data { - struct cfd_percpu __percpu *pcpu; + call_single_data_t __percpu *csd; cpumask_var_t cpumask; cpumask_var_t cpumask_ipi; }; @@ -110,8 +57,8 @@ int smpcfd_prepare_cpu(unsigned int cpu) free_cpumask_var(cfd->cpumask); return -ENOMEM; } - cfd->pcpu = alloc_percpu(struct cfd_percpu); - if (!cfd->pcpu) { + cfd->csd = alloc_percpu(call_single_data_t); + if (!cfd->csd) { free_cpumask_var(cfd->cpumask); free_cpumask_var(cfd->cpumask_ipi); return -ENOMEM; @@ -126,7 +73,7 @@ int smpcfd_dead_cpu(unsigned int cpu) free_cpumask_var(cfd->cpumask); free_cpumask_var(cfd->cpumask_ipi); - free_percpu(cfd->pcpu); + free_percpu(cfd->csd); return 0; } @@ -156,23 +103,49 @@ void __init call_function_init(void) smpcfd_prepare_cpu(smp_processor_id()); } +static __always_inline void +send_call_function_single_ipi(int cpu) +{ + if (call_function_single_prep_ipi(cpu)) { + trace_ipi_send_cpu(cpu, _RET_IP_, + generic_smp_call_function_single_interrupt); + arch_send_call_function_single_ipi(cpu); + } +} + +static __always_inline void +send_call_function_ipi_mask(struct cpumask *mask) +{ + trace_ipi_send_cpumask(mask, _RET_IP_, + generic_smp_call_function_single_interrupt); + arch_send_call_function_ipi_mask(mask); +} + #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG -static DEFINE_STATIC_KEY_FALSE(csdlock_debug_enabled); -static DEFINE_STATIC_KEY_FALSE(csdlock_debug_extended); +static DEFINE_STATIC_KEY_MAYBE(CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT, csdlock_debug_enabled); +/* + * Parse the csdlock_debug= kernel boot parameter. 
+ * + * If you need to restore the old "ext" value that once provided + * additional debugging information, reapply the following commits: + * + * de7b09ef658d ("locking/csd_lock: Prepare more CSD lock debugging") + * a5aabace5fb8 ("locking/csd_lock: Add more data to CSD lock debugging") + */ static int __init csdlock_debug(char *str) { + int ret; unsigned int val = 0; - if (str && !strcmp(str, "ext")) { - val = 1; - static_branch_enable(&csdlock_debug_extended); - } else - get_option(&str, &val); - - if (val) - static_branch_enable(&csdlock_debug_enabled); + ret = get_option(&str, &val); + if (ret) { + if (val) + static_branch_enable(&csdlock_debug_enabled); + else + static_branch_disable(&csdlock_debug_enabled); + } return 1; } @@ -181,36 +154,11 @@ __setup("csdlock_debug=", csdlock_debug); static DEFINE_PER_CPU(call_single_data_t *, cur_csd); static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); static DEFINE_PER_CPU(void *, cur_csd_info); -static DEFINE_PER_CPU(struct cfd_seq_local, cfd_seq_local); static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */ module_param(csd_lock_timeout, ulong, 0444); static atomic_t csd_bug_count = ATOMIC_INIT(0); -static u64 cfd_seq; - -#define CFD_SEQ(s, d, t, c) \ - (union cfd_seq_cnt){ .u.src = s, .u.dst = d, .u.type = t, .u.cnt = c } - -static u64 cfd_seq_inc(unsigned int src, unsigned int dst, unsigned int type) -{ - union cfd_seq_cnt new, old; - - new = CFD_SEQ(src, dst, type, 0); - - do { - old.val = READ_ONCE(cfd_seq); - new.u.cnt = old.u.cnt + 1; - } while (cmpxchg(&cfd_seq, old.val, new.val) != old.val); - - return old.val; -} - -#define cfd_seq_store(var, src, dst, type) \ - do { \ - if (static_branch_unlikely(&csdlock_debug_extended)) \ - var = cfd_seq_inc(src, dst, type); \ - } while (0) /* Record current CSD work for current CPU, NULL to erase. */ static void __csd_lock_record(struct __call_single_data *csd) @@ -244,80 +192,6 @@ static int csd_lock_wait_getcpu(struct __call_single_data *csd) return -1; } -static void cfd_seq_data_add(u64 val, unsigned int src, unsigned int dst, - unsigned int type, union cfd_seq_cnt *data, - unsigned int *n_data, unsigned int now) -{ - union cfd_seq_cnt new[2]; - unsigned int i, j, k; - - new[0].val = val; - new[1] = CFD_SEQ(src, dst, type, new[0].u.cnt + 1); - - for (i = 0; i < 2; i++) { - if (new[i].u.cnt <= now) - new[i].u.cnt |= 0x80000000U; - for (j = 0; j < *n_data; j++) { - if (new[i].u.cnt == data[j].u.cnt) { - /* Direct read value trumps generated one. */ - if (i == 0) - data[j].val = new[i].val; - break; - } - if (new[i].u.cnt < data[j].u.cnt) { - for (k = *n_data; k > j; k--) - data[k].val = data[k - 1].val; - data[j].val = new[i].val; - (*n_data)++; - break; - } - } - if (j == *n_data) { - data[j].val = new[i].val; - (*n_data)++; - } - } -} - -static const char *csd_lock_get_type(unsigned int type) -{ - return (type >= ARRAY_SIZE(seq_type)) ? "?" 
: seq_type[type]; -} - -static void csd_lock_print_extended(struct __call_single_data *csd, int cpu) -{ - struct cfd_seq_local *seq = &per_cpu(cfd_seq_local, cpu); - unsigned int srccpu = csd->node.src; - struct call_function_data *cfd = per_cpu_ptr(&cfd_data, srccpu); - struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu); - unsigned int now; - union cfd_seq_cnt data[2 * ARRAY_SIZE(seq_type)]; - unsigned int n_data = 0, i; - - data[0].val = READ_ONCE(cfd_seq); - now = data[0].u.cnt; - - cfd_seq_data_add(pcpu->seq_queue, srccpu, cpu, CFD_SEQ_QUEUE, data, &n_data, now); - cfd_seq_data_add(pcpu->seq_ipi, srccpu, cpu, CFD_SEQ_IPI, data, &n_data, now); - cfd_seq_data_add(pcpu->seq_noipi, srccpu, cpu, CFD_SEQ_NOIPI, data, &n_data, now); - - cfd_seq_data_add(per_cpu(cfd_seq_local.ping, srccpu), srccpu, CFD_SEQ_NOCPU, CFD_SEQ_PING, data, &n_data, now); - cfd_seq_data_add(per_cpu(cfd_seq_local.pinged, srccpu), srccpu, CFD_SEQ_NOCPU, CFD_SEQ_PINGED, data, &n_data, now); - - cfd_seq_data_add(seq->idle, CFD_SEQ_NOCPU, cpu, CFD_SEQ_IDLE, data, &n_data, now); - cfd_seq_data_add(seq->gotipi, CFD_SEQ_NOCPU, cpu, CFD_SEQ_GOTIPI, data, &n_data, now); - cfd_seq_data_add(seq->handle, CFD_SEQ_NOCPU, cpu, CFD_SEQ_HANDLE, data, &n_data, now); - cfd_seq_data_add(seq->dequeue, CFD_SEQ_NOCPU, cpu, CFD_SEQ_DEQUEUE, data, &n_data, now); - cfd_seq_data_add(seq->hdlend, CFD_SEQ_NOCPU, cpu, CFD_SEQ_HDLEND, data, &n_data, now); - - for (i = 0; i < n_data; i++) { - pr_alert("\tcsd: cnt(%07x): %04x->%04x %s\n", - data[i].u.cnt & ~0x80000000U, data[i].u.src, - data[i].u.dst, csd_lock_get_type(data[i].u.type)); - } - pr_alert("\tcsd: cnt now: %07x\n", now); -} - /* * Complain if too much time spent waiting. Note that only * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, @@ -368,8 +242,6 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * *bug_id, !cpu_cur_csd ? 
"unresponsive" : "handling this request"); } if (cpu >= 0) { - if (static_branch_unlikely(&csdlock_debug_extended)) - csd_lock_print_extended(csd, cpu); dump_cpu_task(cpu); if (!cpu_cur_csd) { pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu); @@ -412,27 +284,7 @@ static __always_inline void csd_lock_wait(struct __call_single_data *csd) smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } - -static void __smp_call_single_queue_debug(int cpu, struct llist_node *node) -{ - unsigned int this_cpu = smp_processor_id(); - struct cfd_seq_local *seq = this_cpu_ptr(&cfd_seq_local); - struct call_function_data *cfd = this_cpu_ptr(&cfd_data); - struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu); - - cfd_seq_store(pcpu->seq_queue, this_cpu, cpu, CFD_SEQ_QUEUE); - if (llist_add(node, &per_cpu(call_single_queue, cpu))) { - cfd_seq_store(pcpu->seq_ipi, this_cpu, cpu, CFD_SEQ_IPI); - cfd_seq_store(seq->ping, this_cpu, cpu, CFD_SEQ_PING); - send_call_function_single_ipi(cpu); - cfd_seq_store(seq->pinged, this_cpu, cpu, CFD_SEQ_PINGED); - } else { - cfd_seq_store(pcpu->seq_noipi, this_cpu, cpu, CFD_SEQ_NOIPI); - } -} #else -#define cfd_seq_store(var, src, dst, type) - static void csd_lock_record(struct __call_single_data *csd) { } @@ -470,23 +322,29 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); void __smp_call_single_queue(int cpu, struct llist_node *node) { -#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG - if (static_branch_unlikely(&csdlock_debug_extended)) { - unsigned int type; - - type = CSD_TYPE(container_of(node, call_single_data_t, - node.llist)); - if (type == CSD_TYPE_SYNC || type == CSD_TYPE_ASYNC) { - __smp_call_single_queue_debug(cpu, node); - return; - } + /* + * We have to check the type of the CSD before queueing it, because + * once queued it can have its flags cleared by + * flush_smp_call_function_queue() + * even if we haven't sent the smp_call IPI yet (e.g. the stopper + * executes migration_cpu_stop() on the remote CPU). + */ + if (trace_ipi_send_cpu_enabled()) { + call_single_data_t *csd; + smp_call_func_t func; + + csd = container_of(node, call_single_data_t, node.llist); + func = CSD_TYPE(csd) == CSD_TYPE_TTWU ? + sched_ttwu_pending : csd->func; + + trace_ipi_send_cpu(cpu, _RET_IP_, func); } -#endif /* - * The list addition should be visible before sending the IPI - * handler locks the list to pull the entry off it because of - * normal cache coherency rules implied by spinlocks. + * The list addition should be visible to the target CPU when it pops + * the head of the list to pull the entry off it in the IPI handler + * because of normal cache coherency rules implied by the underlying + * llist ops. 
* * If IPIs can go out of order to the cache coherency protocol * in an architecture, sufficient synchronisation should be added @@ -541,8 +399,6 @@ static int generic_exec_single(int cpu, struct __call_single_data *csd) */ void generic_smp_call_function_single_interrupt(void) { - cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->gotipi, CFD_SEQ_NOCPU, - smp_processor_id(), CFD_SEQ_GOTIPI); __flush_smp_call_function_queue(true); } @@ -570,13 +426,7 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline) lockdep_assert_irqs_disabled(); head = this_cpu_ptr(&call_single_queue); - cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->handle, CFD_SEQ_NOCPU, - smp_processor_id(), CFD_SEQ_HANDLE); entry = llist_del_all(head); - cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->dequeue, - /* Special meaning of source cpu: 0 == queue empty */ - entry ? CFD_SEQ_NOCPU : 0, - smp_processor_id(), CFD_SEQ_DEQUEUE); entry = llist_reverse_order(entry); /* There shouldn't be any pending callbacks on an offline CPU. */ @@ -635,12 +485,8 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline) } } - if (!entry) { - cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->hdlend, - 0, smp_processor_id(), - CFD_SEQ_HDLEND); + if (!entry) return; - } /* * Second; run all !SYNC callbacks. @@ -678,9 +524,6 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline) */ if (entry) sched_ttwu_pending(entry); - - cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->hdlend, CFD_SEQ_NOCPU, - smp_processor_id(), CFD_SEQ_HDLEND); } @@ -704,8 +547,6 @@ void flush_smp_call_function_queue(void) if (llist_empty(this_cpu_ptr(&call_single_queue))) return; - cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU, - smp_processor_id(), CFD_SEQ_IDLE); local_irq_save(flags); /* Get the already pending soft interrupts for RT enabled kernels */ was_pending = local_softirq_pending(); @@ -887,9 +728,9 @@ static void smp_call_function_many_cond(const struct cpumask *mask, int cpu, last_cpu, this_cpu = smp_processor_id(); struct call_function_data *cfd; bool wait = scf_flags & SCF_WAIT; + int nr_cpus = 0, nr_queued = 0; bool run_remote = false; bool run_local = false; - int nr_cpus = 0; lockdep_assert_preemption_disabled(); @@ -929,11 +770,12 @@ static void smp_call_function_many_cond(const struct cpumask *mask, cpumask_clear(cfd->cpumask_ipi); for_each_cpu(cpu, cfd->cpumask) { - struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu); - call_single_data_t *csd = &pcpu->csd; + call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu); - if (cond_func && !cond_func(cpu, info)) + if (cond_func && !cond_func(cpu, info)) { + __cpumask_clear_cpu(cpu, cfd->cpumask); continue; + } csd_lock(csd); if (wait) @@ -944,19 +786,20 @@ static void smp_call_function_many_cond(const struct cpumask *mask, csd->node.src = smp_processor_id(); csd->node.dst = cpu; #endif - cfd_seq_store(pcpu->seq_queue, this_cpu, cpu, CFD_SEQ_QUEUE); if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) { __cpumask_set_cpu(cpu, cfd->cpumask_ipi); nr_cpus++; last_cpu = cpu; - - cfd_seq_store(pcpu->seq_ipi, this_cpu, cpu, CFD_SEQ_IPI); - } else { - cfd_seq_store(pcpu->seq_noipi, this_cpu, cpu, CFD_SEQ_NOIPI); } + nr_queued++; } - cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->ping, this_cpu, CFD_SEQ_NOCPU, CFD_SEQ_PING); + /* + * Trace each smp_function_call_*() as an IPI, actual IPIs + * will be traced with func==generic_smp_call_function_single_ipi(). 
+ */ + if (nr_queued) + trace_ipi_send_cpumask(cfd->cpumask, _RET_IP_, func); /* * Choose the most efficient way to send an IPI. Note that the @@ -966,9 +809,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, if (nr_cpus == 1) send_call_function_single_ipi(last_cpu); else if (likely(nr_cpus > 1)) - arch_send_call_function_ipi_mask(cfd->cpumask_ipi); - - cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->pinged, this_cpu, CFD_SEQ_NOCPU, CFD_SEQ_PINGED); + send_call_function_ipi_mask(cfd->cpumask_ipi); } if (run_local && (!cond_func || cond_func(this_cpu, info))) { @@ -983,7 +824,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, for_each_cpu(cpu, cfd->cpumask) { call_single_data_t *csd; - csd = &per_cpu_ptr(cfd->pcpu, cpu)->csd; + csd = per_cpu_ptr(cfd->csd, cpu); csd_lock_wait(csd); } } diff --git a/kernel/stackleak.c b/kernel/stackleak.c index c2c33d2202e9..34c9d81eea94 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -70,6 +70,18 @@ late_initcall(stackleak_sysctls_init); #define skip_erasing() false #endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */ +#ifndef __stackleak_poison +static __always_inline void __stackleak_poison(unsigned long erase_low, + unsigned long erase_high, + unsigned long poison) +{ + while (erase_low < erase_high) { + *(unsigned long *)erase_low = poison; + erase_low += sizeof(unsigned long); + } +} +#endif + static __always_inline void __stackleak_erase(bool on_task_stack) { const unsigned long task_stack_low = stackleak_task_low_bound(current); @@ -101,10 +113,7 @@ static __always_inline void __stackleak_erase(bool on_task_stack) else erase_high = task_stack_high; - while (erase_low < erase_high) { - *(unsigned long *)erase_low = STACKLEAK_POISON; - erase_low += sizeof(unsigned long); - } + __stackleak_poison(erase_low, erase_high, STACKLEAK_POISON); /* Reset the 'lowest_stack' value for the next syscall */ current->lowest_stack = task_stack_high; diff --git a/kernel/sys.c b/kernel/sys.c index 351de7916302..339fee3eff6a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -15,6 +15,7 @@ #include <linux/highuid.h> #include <linux/fs.h> #include <linux/kmod.h> +#include <linux/ksm.h> #include <linux/perf_event.h> #include <linux/resource.h> #include <linux/kernel.h> @@ -2388,6 +2389,16 @@ static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3, PR_MDWE_REFUSE_EXEC_GAIN : 0; } +static int prctl_get_auxv(void __user *addr, unsigned long len) +{ + struct mm_struct *mm = current->mm; + unsigned long size = min_t(unsigned long, sizeof(mm->saved_auxv), len); + + if (size && copy_to_user(addr, mm->saved_auxv, size)) + return -EFAULT; + return sizeof(mm->saved_auxv); +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2518,6 +2529,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else return -EINVAL; break; + case PR_GET_AUXV: + if (arg4 || arg5) + return -EINVAL; + error = prctl_get_auxv((void __user *)arg2, arg3); + break; default: return -EINVAL; } @@ -2672,6 +2688,26 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_VMA: error = prctl_set_vma(arg2, arg3, arg4, arg5); break; +#ifdef CONFIG_KSM + case PR_SET_MEMORY_MERGE: + if (arg3 || arg4 || arg5) + return -EINVAL; + if (mmap_write_lock_killable(me->mm)) + return -EINTR; + + if (arg2) + error = ksm_enable_merge_any(me->mm); + else + error = ksm_disable_merge_any(me->mm); + mmap_write_unlock(me->mm); + break; + case 
PR_GET_MEMORY_MERGE: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + + error = !!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags); + break; +#endif default: error = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1c240d2c99bc..bfe53e835524 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -42,7 +42,6 @@ #include <linux/highuid.h> #include <linux/writeback.h> #include <linux/ratelimit.h> -#include <linux/compaction.h> #include <linux/hugetlb.h> #include <linux/initrd.h> #include <linux/key.h> @@ -746,27 +745,6 @@ int proc_dointvec(struct ctl_table *table, int write, void *buffer, return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); } -#ifdef CONFIG_COMPACTION -static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, - int write, void *buffer, size_t *lenp, loff_t *ppos) -{ - int ret, old; - - if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write) - return proc_dointvec_minmax(table, write, buffer, lenp, ppos); - - old = *(int *)table->data; - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (ret) - return ret; - if (old != *(int *)table->data) - pr_warn_once("sysctl attribute %s changed by %s[%d]\n", - table->procname, current->comm, - task_pid_nr(current)); - return ret; -} -#endif - /** * proc_douintvec - read a vector of unsigned integers * @table: the sysctl table @@ -2141,38 +2119,6 @@ static struct ctl_table vm_table[] = { .extra2 = SYSCTL_ONE, }, #endif -#ifdef CONFIG_HUGETLB_PAGE - { - .procname = "nr_hugepages", - .data = NULL, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = hugetlb_sysctl_handler, - }, -#ifdef CONFIG_NUMA - { - .procname = "nr_hugepages_mempolicy", - .data = NULL, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = &hugetlb_mempolicy_sysctl_handler, - }, -#endif - { - .procname = "hugetlb_shm_group", - .data = &sysctl_hugetlb_shm_group, - .maxlen = sizeof(gid_t), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "nr_overcommit_hugepages", - .data = NULL, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = hugetlb_overcommit_handler, - }, -#endif { .procname = "lowmem_reserve_ratio", .data = &sysctl_lowmem_reserve_ratio, @@ -2189,43 +2135,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_FOUR, }, -#ifdef CONFIG_COMPACTION - { - .procname = "compact_memory", - .data = NULL, - .maxlen = sizeof(int), - .mode = 0200, - .proc_handler = sysctl_compaction_handler, - }, - { - .procname = "compaction_proactiveness", - .data = &sysctl_compaction_proactiveness, - .maxlen = sizeof(sysctl_compaction_proactiveness), - .mode = 0644, - .proc_handler = compaction_proactiveness_sysctl_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "extfrag_threshold", - .data = &sysctl_extfrag_threshold, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_THOUSAND, - }, - { - .procname = "compact_unevictable_allowed", - .data = &sysctl_compact_unevictable_allowed, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_warn_RT_change, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - -#endif /* CONFIG_COMPACTION */ { .procname = "min_free_kbytes", .data = &min_free_kbytes, @@ -2383,26 +2292,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, }, #endif -#ifdef CONFIG_MEMORY_FAILURE - { - .procname = "memory_failure_early_kill", - .data = 
&sysctl_memory_failure_early_kill, - .maxlen = sizeof(sysctl_memory_failure_early_kill), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "memory_failure_recovery", - .data = &sysctl_memory_failure_recovery, - .maxlen = sizeof(sysctl_memory_failure_recovery), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif { .procname = "user_reserve_kbytes", .data = &sysctl_user_reserve_kbytes, @@ -2439,17 +2328,6 @@ static struct ctl_table vm_table[] = { .extra2 = (void *)&mmap_rnd_compat_bits_max, }, #endif -#ifdef CONFIG_USERFAULTFD - { - .procname = "unprivileged_userfaultfd", - .data = &sysctl_unprivileged_userfaultfd, - .maxlen = sizeof(sysctl_unprivileged_userfaultfd), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif { } }; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 7e5dff602585..82b28ab0f328 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -81,8 +81,7 @@ struct rtc_device *alarmtimer_get_rtcdev(void) } EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev); -static int alarmtimer_rtc_add_device(struct device *dev, - struct class_interface *class_intf) +static int alarmtimer_rtc_add_device(struct device *dev) { unsigned long flags; struct rtc_device *rtc = to_rtc_device(dev); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 93bf2b4e47e5..771d1e040303 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -35,14 +35,15 @@ static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock); #ifdef CONFIG_TICK_ONESHOT static DEFINE_PER_CPU(struct clock_event_device *, tick_oneshot_wakeup_device); -static void tick_broadcast_setup_oneshot(struct clock_event_device *bc); +static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic); static void tick_broadcast_clear_oneshot(int cpu); static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); # ifdef CONFIG_HOTPLUG_CPU static void tick_broadcast_oneshot_offline(unsigned int cpu); # endif #else -static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } +static inline void +tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic) { BUG(); } static inline void tick_broadcast_clear_oneshot(int cpu) { } static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } # ifdef CONFIG_HOTPLUG_CPU @@ -264,7 +265,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_broadcast_start_periodic(bc); else - tick_broadcast_setup_oneshot(bc); + tick_broadcast_setup_oneshot(bc, false); ret = 1; } else { /* @@ -500,7 +501,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_broadcast_start_periodic(bc); else - tick_broadcast_setup_oneshot(bc); + tick_broadcast_setup_oneshot(bc, false); } } out: @@ -1020,48 +1021,101 @@ static inline ktime_t tick_get_next_period(void) /** * tick_broadcast_setup_oneshot - setup the broadcast device */ -static void tick_broadcast_setup_oneshot(struct clock_event_device *bc) +static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, + bool from_periodic) { int cpu = smp_processor_id(); + ktime_t nexttick = 0; if (!bc) return; - /* Set it up only 
once ! */ - if (bc->event_handler != tick_handle_oneshot_broadcast) { - int was_periodic = clockevent_state_periodic(bc); - - bc->event_handler = tick_handle_oneshot_broadcast; - + /* + * When the broadcast device was switched to oneshot by the first + * CPU handling the NOHZ change, the other CPUs will reach this + * code via hrtimer_run_queues() -> tick_check_oneshot_change() + * too. Set up the broadcast device only once! + */ + if (bc->event_handler == tick_handle_oneshot_broadcast) { /* - * We must be careful here. There might be other CPUs - * waiting for periodic broadcast. We need to set the - * oneshot_mask bits for those and program the - * broadcast device to fire. + * The CPU which switched from periodic to oneshot mode + * set the broadcast oneshot bit for all other CPUs which + * are in the general (periodic) broadcast mask to ensure + * that CPUs which wait for the periodic broadcast are + * woken up. + * + * Clear the bit for the local CPU as the set bit would + * prevent the first tick_broadcast_enter() after this CPU + * switched to oneshot state to program the broadcast + * device. + * + * This code can also be reached via tick_broadcast_control(), + * but this cannot avoid the tick_broadcast_clear_oneshot() + * as that would break the periodic to oneshot transition of + * secondary CPUs. But that's harmless as the below only + * clears already cleared bits. */ + tick_broadcast_clear_oneshot(cpu); + return; + } + + + bc->event_handler = tick_handle_oneshot_broadcast; + bc->next_event = KTIME_MAX; + + /* + * When the tick mode is switched from periodic to oneshot it must + * be ensured that CPUs which are waiting for periodic broadcast + * get their wake-up at the next tick. This is achieved by ORing + * tick_broadcast_mask into tick_broadcast_oneshot_mask. + * + * For other callers, e.g. broadcast device replacement, + * tick_broadcast_oneshot_mask must not be touched as this would + * set bits for CPUs which are already NOHZ, but not idle. Their + * next tick_broadcast_enter() would observe the bit set and fail + * to update the expiry time and the broadcast event device. + */ + if (from_periodic) { cpumask_copy(tmpmask, tick_broadcast_mask); + /* Remove the local CPU as it is obviously not idle */ cpumask_clear_cpu(cpu, tmpmask); - cpumask_or(tick_broadcast_oneshot_mask, - tick_broadcast_oneshot_mask, tmpmask); + cpumask_or(tick_broadcast_oneshot_mask, tick_broadcast_oneshot_mask, tmpmask); - if (was_periodic && !cpumask_empty(tmpmask)) { - ktime_t nextevt = tick_get_next_period(); + /* + * Ensure that the oneshot broadcast handler will wake the + * CPUs which are still waiting for periodic broadcast. + */ + nexttick = tick_get_next_period(); + tick_broadcast_init_next_event(tmpmask, nexttick); - clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); - tick_broadcast_init_next_event(tmpmask, nextevt); - tick_broadcast_set_event(bc, cpu, nextevt); - } else - bc->next_event = KTIME_MAX; - } else { /* - * The first cpu which switches to oneshot mode sets - * the bit for all other cpus which are in the general - * (periodic) broadcast mask. So the bit is set and - * would prevent the first broadcast enter after this - * to program the bc device. + * If the underlying broadcast clock event device is + * already in oneshot state, then there is nothing to do. 
+ * The device was already armed for the next tick + * in tick_handle_broadcast_periodic() */ - tick_broadcast_clear_oneshot(cpu); + if (clockevent_state_oneshot(bc)) + return; } + + /* + * When switching from periodic to oneshot mode arm the broadcast + * device for the next tick. + * + * If the broadcast device has been replaced in oneshot mode and + * the oneshot broadcast mask is not empty, then arm it to expire + * immediately in order to reevaluate the next expiring timer. + * @nexttick is 0 and therefore in the past which will cause the + * clockevent code to force an event. + * + * For both cases the programming can be avoided when the oneshot + * broadcast mask is empty. + * + * tick_broadcast_set_event() implicitly switches the broadcast + * device to oneshot state. + */ + if (!cpumask_empty(tick_broadcast_oneshot_mask)) + tick_broadcast_set_event(bc, cpu, nexttick); } /* @@ -1070,14 +1124,16 @@ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc) void tick_broadcast_switch_to_oneshot(void) { struct clock_event_device *bc; + enum tick_device_mode oldmode; unsigned long flags; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + oldmode = tick_broadcast_device.mode; tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; bc = tick_broadcast_device.evtdev; if (bc) - tick_broadcast_setup_oneshot(bc); + tick_broadcast_setup_oneshot(bc, oldmode == TICKDEV_MODE_PERIODIC); raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5579ead449f2..09d594900ee0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -526,7 +526,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); * partially updated. Since the tk->offs_boot update is a rare event, this * should be a rare occurrence which postprocessing should be able to handle. * - * The caveats vs. timestamp ordering as documented for ktime_get_fast_ns() + * The caveats vs. timestamp ordering as documented for ktime_get_mono_fast_ns() * apply as well. */ u64 notrace ktime_get_boot_fast_ns(void) @@ -576,7 +576,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) /** * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. * - * See ktime_get_fast_ns() for documentation of the time stamp ordering. + * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering. */ u64 ktime_get_real_fast_ns(void) { diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5b1e7fa41ca8..8cf97fa4a4b3 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -792,15 +792,15 @@ config USER_EVENTS bool "User trace events" select TRACING select DYNAMIC_EVENTS - depends on BROKEN || COMPILE_TEST # API needs to be straighten out help User trace events are user-defined trace events that can be used like an existing kernel trace event. User trace events are generated by writing to a tracefs file. User processes can determine if their tracing events should be - generated by memory mapping a tracefs file and checking for - an associated byte being non-zero. + generated by registering a value and bit with the kernel + that reflects when it is enabled or not. + See Documentation/trace/user_events.rst. If in doubt, say N. 
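/*
 * A note on the registration model described in the USER_EVENTS help text
 * above: instead of mmap()ing a status page, a user process hands the kernel
 * the address of a word in its own memory plus a bit number, and the kernel
 * sets or clears that bit as tracers attach or detach.  The sketch below
 * shows the shape of that flow from userspace.  It assumes the DIAG_IOCSREG
 * ioctl and struct user_reg layout from the uapi header shipped alongside
 * this series (<linux/user_events.h>); the tracefs path, event name and
 * payload are illustrative only.
 */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/uio.h>
#include <linux/user_events.h>

/* Word the kernel updates when a tracer attaches to or detaches from us. */
static __u32 my_event_enabled;

int main(void)
{
	struct user_reg reg = { 0 };
	__u32 write_index;
	int data_fd;

	data_fd = open("/sys/kernel/tracing/user_events_data", O_RDWR);
	if (data_fd < 0)
		return 1;

	reg.size = sizeof(reg);
	reg.enable_bit = 0;				/* bit to flip in the word  */
	reg.enable_size = sizeof(my_event_enabled);	/* 32-bit word in this case */
	reg.enable_addr = (__u64)(unsigned long)&my_event_enabled;
	reg.name_args = (__u64)(unsigned long)"example_event u32 value";

	if (ioctl(data_fd, DIAG_IOCSREG, &reg) < 0)
		return 1;
	write_index = reg.write_index;

	/* Emit the event only if some tracer flipped our bit. */
	if (my_event_enabled & (1U << reg.enable_bit)) {
		__u32 value = 42;
		struct iovec io[2] = {
			{ &write_index, sizeof(write_index) },
			{ &value, sizeof(value) },
		};

		writev(data_fd, io, 2);
	}

	close(data_fd);
	return 0;
}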
config HIST_TRIGGERS diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index bcf91bc7bf71..9a050e36dc6c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2640,9 +2640,20 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, return err; } -static void +static int kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, - struct pt_regs *regs) + struct pt_regs *regs, void *data) +{ + struct bpf_kprobe_multi_link *link; + + link = container_of(fp, struct bpf_kprobe_multi_link, fp); + kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs); + return 0; +} + +static void +kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip, + struct pt_regs *regs, void *data) { struct bpf_kprobe_multi_link *link; @@ -2844,7 +2855,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr goto error; if (flags & BPF_F_KPROBE_MULTI_RETURN) - link->fp.exit_handler = kprobe_multi_link_handler; + link->fp.exit_handler = kprobe_multi_link_exit_handler; else link->fp.entry_handler = kprobe_multi_link_handler; diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index e8143e368074..18d36842faf5 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -17,57 +17,98 @@ struct fprobe_rethook_node { struct rethook_node node; unsigned long entry_ip; + unsigned long entry_parent_ip; + char data[]; }; -static void fprobe_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct ftrace_regs *fregs) +static inline void __fprobe_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct fprobe_rethook_node *fpr; - struct rethook_node *rh; + struct rethook_node *rh = NULL; struct fprobe *fp; - int bit; + void *entry_data = NULL; + int ret = 0; fp = container_of(ops, struct fprobe, ops); - if (fprobe_disabled(fp)) - return; - - bit = ftrace_test_recursion_trylock(ip, parent_ip); - if (bit < 0) { - fp->nmissed++; - return; - } - - if (fp->entry_handler) - fp->entry_handler(fp, ip, ftrace_get_regs(fregs)); if (fp->exit_handler) { rh = rethook_try_get(fp->rethook); if (!rh) { fp->nmissed++; - goto out; + return; } fpr = container_of(rh, struct fprobe_rethook_node, node); fpr->entry_ip = ip; - rethook_hook(rh, ftrace_get_regs(fregs), true); + fpr->entry_parent_ip = parent_ip; + if (fp->entry_data_size) + entry_data = fpr->data; } -out: + if (fp->entry_handler) + ret = fp->entry_handler(fp, ip, ftrace_get_regs(fregs), entry_data); + + /* If entry_handler returns !0, nmissed is not counted. 
*/ + if (rh) { + if (ret) + rethook_recycle(rh); + else + rethook_hook(rh, ftrace_get_regs(fregs), true); + } +} + +static void fprobe_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) +{ + struct fprobe *fp; + int bit; + + fp = container_of(ops, struct fprobe, ops); + if (fprobe_disabled(fp)) + return; + + /* recursion detection has to go before any traceable function and + * all functions before this point should be marked as notrace + */ + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) { + fp->nmissed++; + return; + } + __fprobe_handler(ip, parent_ip, ops, fregs); ftrace_test_recursion_unlock(bit); + } NOKPROBE_SYMBOL(fprobe_handler); static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { - struct fprobe *fp = container_of(ops, struct fprobe, ops); + struct fprobe *fp; + int bit; + + fp = container_of(ops, struct fprobe, ops); + if (fprobe_disabled(fp)) + return; + + /* recursion detection has to go before any traceable function and + * all functions called before this point should be marked as notrace + */ + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) { + fp->nmissed++; + return; + } if (unlikely(kprobe_running())) { fp->nmissed++; return; } + kprobe_busy_begin(); - fprobe_handler(ip, parent_ip, ops, fregs); + __fprobe_handler(ip, parent_ip, ops, fregs); kprobe_busy_end(); + ftrace_test_recursion_unlock(bit); } static void fprobe_exit_handler(struct rethook_node *rh, void *data, @@ -75,13 +116,26 @@ static void fprobe_exit_handler(struct rethook_node *rh, void *data, { struct fprobe *fp = (struct fprobe *)data; struct fprobe_rethook_node *fpr; + int bit; if (!fp || fprobe_disabled(fp)) return; fpr = container_of(rh, struct fprobe_rethook_node, node); - fp->exit_handler(fp, fpr->entry_ip, regs); + /* + * we need to assure no calls to traceable functions in-between the + * end of fprobe_handler and the beginning of fprobe_exit_handler. + */ + bit = ftrace_test_recursion_trylock(fpr->entry_ip, fpr->entry_parent_ip); + if (bit < 0) { + fp->nmissed++; + return; + } + + fp->exit_handler(fp, fpr->entry_ip, regs, + fp->entry_data_size ? 
(void *)fpr->data : NULL); + ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(fprobe_exit_handler); @@ -136,7 +190,10 @@ static int fprobe_init_rethook(struct fprobe *fp, int num) } /* Initialize rethook if needed */ - size = num * num_possible_cpus() * 2; + if (fp->nr_maxactive) + size = fp->nr_maxactive; + else + size = num * num_possible_cpus() * 2; if (size < 0) return -E2BIG; @@ -146,7 +203,7 @@ static int fprobe_init_rethook(struct fprobe *fp, int num) for (i = 0; i < size; i++) { struct fprobe_rethook_node *node; - node = kzalloc(sizeof(*node), GFP_KERNEL); + node = kzalloc(sizeof(*node) + fp->entry_data_size, GFP_KERNEL); if (!node) { rethook_free(fp->rethook); fp->rethook = NULL; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3b46dba3f69b..764668467155 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -45,6 +45,10 @@ #include "trace_output.h" #include "trace_stat.h" +/* Flags that do not get reset */ +#define FTRACE_NOCLEAR_FLAGS (FTRACE_FL_DISABLED | FTRACE_FL_TOUCHED | \ + FTRACE_FL_MODIFIED) + #define FTRACE_INVALID_FUNCTION "__ftrace_invalid_address__" #define FTRACE_WARN_ON(cond) \ @@ -2256,7 +2260,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) flag ^= rec->flags & FTRACE_FL_ENABLED; if (update) { - rec->flags |= FTRACE_FL_ENABLED; + rec->flags |= FTRACE_FL_ENABLED | FTRACE_FL_TOUCHED; if (flag & FTRACE_FL_REGS) { if (rec->flags & FTRACE_FL_REGS) rec->flags |= FTRACE_FL_REGS_EN; @@ -2270,6 +2274,10 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) rec->flags &= ~FTRACE_FL_TRAMP_EN; } + /* Keep track of anything that modifies the function */ + if (rec->flags & (FTRACE_FL_DIRECT | FTRACE_FL_IPMODIFY)) + rec->flags |= FTRACE_FL_MODIFIED; + if (flag & FTRACE_FL_DIRECT) { /* * If there's only one user (direct_ops helper) @@ -2326,7 +2334,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) if (update) { /* If there's no more users, clear all flags */ if (!ftrace_rec_count(rec)) - rec->flags &= FTRACE_FL_DISABLED; + rec->flags &= FTRACE_NOCLEAR_FLAGS; else /* * Just disable the record, but keep the ops TRAMP @@ -3147,7 +3155,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) struct dyn_ftrace *rec; do_for_each_ftrace_rec(pg, rec) { - if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_FL_DISABLED)) + if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_NOCLEAR_FLAGS)) pr_warn(" %pS flags:%lx\n", (void *)rec->ip, rec->flags); } while_for_each_ftrace_rec(); @@ -3598,7 +3606,10 @@ t_func_next(struct seq_file *m, loff_t *pos) !ftrace_lookup_ip(iter->hash, rec->ip)) || ((iter->flags & FTRACE_ITER_ENABLED) && - !(rec->flags & FTRACE_FL_ENABLED))) { + !(rec->flags & FTRACE_FL_ENABLED)) || + + ((iter->flags & FTRACE_ITER_TOUCHED) && + !(rec->flags & FTRACE_FL_TOUCHED))) { rec = NULL; goto retry; @@ -3857,15 +3868,16 @@ static int t_show(struct seq_file *m, void *v) return 0; } - if (iter->flags & FTRACE_ITER_ENABLED) { + if (iter->flags & (FTRACE_ITER_ENABLED | FTRACE_ITER_TOUCHED)) { struct ftrace_ops *ops; - seq_printf(m, " (%ld)%s%s%s%s", + seq_printf(m, " (%ld)%s%s%s%s%s", ftrace_rec_count(rec), rec->flags & FTRACE_FL_REGS ? " R" : " ", rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ", rec->flags & FTRACE_FL_DIRECT ? " D" : " ", - rec->flags & FTRACE_FL_CALL_OPS ? " O" : " "); + rec->flags & FTRACE_FL_CALL_OPS ? " O" : " ", + rec->flags & FTRACE_FL_MODIFIED ? 
" M " : " "); if (rec->flags & FTRACE_FL_TRAMP_EN) { ops = ftrace_find_tramp_ops_any(rec); if (ops) { @@ -3959,6 +3971,31 @@ ftrace_enabled_open(struct inode *inode, struct file *file) return 0; } +static int +ftrace_touched_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + + /* + * This shows us what functions have ever been enabled + * (traced, direct, patched, etc). Not sure if we want lockdown + * to hide such critical information for an admin. + * Although, perhaps it can show information we don't + * want people to see, but if something had traced + * something, we probably want to know about it. + */ + + iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->flags = FTRACE_ITER_TOUCHED; + iter->ops = &global_ops; + + return 0; +} + /** * ftrace_regex_open - initialize function tracer filter files * @ops: The ftrace_ops that hold the hash filters @@ -5872,6 +5909,13 @@ static const struct file_operations ftrace_enabled_fops = { .release = seq_release_private, }; +static const struct file_operations ftrace_touched_fops = { + .open = ftrace_touched_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = seq_read, @@ -6336,6 +6380,9 @@ static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer) trace_create_file("enabled_functions", TRACE_MODE_READ, d_tracer, NULL, &ftrace_enabled_fops); + trace_create_file("touched_functions", TRACE_MODE_READ, + d_tracer, NULL, &ftrace_touched_fops); + ftrace_create_filter_files(&global_ops, d_tracer); #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -8006,8 +8053,7 @@ struct kallsyms_data { * and returns 1 in case we resolved all the requested symbols, * 0 otherwise. */ -static int kallsyms_callback(void *data, const char *name, - struct module *mod, unsigned long addr) +static int kallsyms_callback(void *data, const char *name, unsigned long addr) { struct kallsyms_data *args = data; const char **sym; diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index 32c3dfdb4d6a..60f6cb2b486b 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -288,7 +288,7 @@ unsigned long rethook_trampoline_handler(struct pt_regs *regs, * These loops must be protected from rethook_free_rcu() because those * are accessing 'rhn->rethook'. */ - preempt_disable(); + preempt_disable_notrace(); /* * Run the handler on the shadow stack. Do not unlink the list here because @@ -321,7 +321,7 @@ unsigned long rethook_trampoline_handler(struct pt_regs *regs, first = first->next; rethook_recycle(rhn); } - preempt_enable(); + preempt_enable_notrace(); return correct_ret_addr; } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 76a2d91eecad..834b361a4a66 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -163,7 +163,7 @@ enum { #define extended_time(event) \ (event->type_len >= RINGBUF_TYPE_TIME_EXTEND) -static inline int rb_null_event(struct ring_buffer_event *event) +static inline bool rb_null_event(struct ring_buffer_event *event) { return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; } @@ -363,11 +363,9 @@ static void free_buffer_page(struct buffer_page *bpage) /* * We need to fit the time_stamp delta into 27 bits. 
*/ -static inline int test_time_stamp(u64 delta) +static inline bool test_time_stamp(u64 delta) { - if (delta & TS_DELTA_TEST) - return 1; - return 0; + return !!(delta & TS_DELTA_TEST); } #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) @@ -696,7 +694,7 @@ rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set) return ret == expect; } -static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) +static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) { unsigned long cnt, top, bottom, msb; unsigned long cnt2, top2, bottom2, msb2; @@ -1486,7 +1484,7 @@ rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer) return NULL; } -static int rb_head_page_replace(struct buffer_page *old, +static bool rb_head_page_replace(struct buffer_page *old, struct buffer_page *new) { unsigned long *ptr = (unsigned long *)&old->list.prev->next; @@ -1565,15 +1563,12 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, } } -static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, +static void rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage) { unsigned long val = (unsigned long)bpage; - if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK)) - return 1; - - return 0; + RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK); } /** @@ -1583,30 +1578,28 @@ static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, * As a safety measure we check to make sure the data pages have not * been corrupted. */ -static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) +static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *head = rb_list_head(cpu_buffer->pages); struct list_head *tmp; if (RB_WARN_ON(cpu_buffer, rb_list_head(rb_list_head(head->next)->prev) != head)) - return -1; + return; if (RB_WARN_ON(cpu_buffer, rb_list_head(rb_list_head(head->prev)->next) != head)) - return -1; + return; for (tmp = rb_list_head(head->next); tmp != head; tmp = rb_list_head(tmp->next)) { if (RB_WARN_ON(cpu_buffer, rb_list_head(rb_list_head(tmp->next)->prev) != tmp)) - return -1; + return; if (RB_WARN_ON(cpu_buffer, rb_list_head(rb_list_head(tmp->prev)->next) != tmp)) - return -1; + return; } - - return 0; } static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, @@ -1774,6 +1767,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) struct list_head *head = cpu_buffer->pages; struct buffer_page *bpage, *tmp; + irq_work_sync(&cpu_buffer->irq_work.work); + free_buffer_page(cpu_buffer->reader_page); if (head) { @@ -1880,6 +1875,8 @@ ring_buffer_free(struct trace_buffer *buffer) cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); + irq_work_sync(&buffer->irq_work.work); + for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); @@ -1918,7 +1915,7 @@ static inline unsigned long rb_page_write(struct buffer_page *bpage) return local_read(&bpage->write) & RB_WRITE_MASK; } -static int +static bool rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) { struct list_head *tail_page, *to_remove, *next_page; @@ -2031,12 +2028,13 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) return nr_removed == 0; } -static int +static bool rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) { struct list_head *pages = &cpu_buffer->new_pages; - int retries, success; unsigned long flags; + bool success; + int retries; /* Can be called at early boot up, where interrupts must not been enabled */ 
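/*
 * The hunk above turns rb_insert_pages()'s "success"/"retries" bookkeeping
 * into a bool plus an int; the function's core is a bounded retry loop that
 * tries to swing a shared list pointer with a compare-and-exchange.  A
 * standalone C11 sketch of that general shape (plain stdatomic, not the
 * kernel's cmpxchg() or the ring buffer's RB_PAGE flag encoding):
 */
#include <stdatomic.h>
#include <stdbool.h>

struct node { struct node *next; };

/* Try to push @new in front of the current head, at most @retries times. */
static bool push_front_bounded(_Atomic(struct node *) *head,
			       struct node *new, int retries)
{
	while (retries--) {
		struct node *old = atomic_load(head);

		new->next = old;
		/* Lost a race?  Re-read the head and try again. */
		if (atomic_compare_exchange_strong(head, &old, new))
			return true;
	}
	/* The caller decides what failure means (here: drop the new pages). */
	return false;
}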
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); @@ -2055,15 +2053,16 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) * spinning. */ retries = 10; - success = 0; + success = false; while (retries--) { struct list_head *head_page, *prev_page, *r; struct list_head *last_page, *first_page; struct list_head *head_page_with_bit; + struct buffer_page *hpage = rb_set_head_page(cpu_buffer); - head_page = &rb_set_head_page(cpu_buffer)->list; - if (!head_page) + if (!hpage) break; + head_page = &hpage->list; prev_page = head_page->prev; first_page = pages->next; @@ -2084,7 +2083,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) * pointer to point to end of list */ head_page->prev = last_page; - success = 1; + success = true; break; } } @@ -2112,7 +2111,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer) { - int success; + bool success; if (cpu_buffer->nr_pages_to_update > 0) success = rb_insert_pages(cpu_buffer); @@ -2995,7 +2994,7 @@ static u64 rb_time_delta(struct ring_buffer_event *event) } } -static inline int +static inline bool rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { @@ -3016,7 +3015,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, delta = rb_time_delta(event); if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp)) - return 0; + return false; /* Make sure the write stamp is read before testing the location */ barrier(); @@ -3029,7 +3028,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, /* Something came in, can't discard */ if (!rb_time_cmpxchg(&cpu_buffer->write_stamp, write_stamp, write_stamp - delta)) - return 0; + return false; /* * It's possible that the event time delta is zero @@ -3062,12 +3061,12 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, if (index == old_index) { /* update counters */ local_sub(event_length, &cpu_buffer->entries_bytes); - return 1; + return true; } } /* could not discard */ - return 0; + return false; } static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) @@ -3288,7 +3287,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) * Note: The TRANSITION bit only handles a single transition between context. 
*/ -static __always_inline int +static __always_inline bool trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) { unsigned int val = cpu_buffer->current_context; @@ -3305,14 +3304,14 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) bit = RB_CTX_TRANSITION; if (val & (1 << (bit + cpu_buffer->nest))) { do_ring_buffer_record_recursion(); - return 1; + return true; } } val |= (1 << (bit + cpu_buffer->nest)); cpu_buffer->current_context = val; - return 0; + return false; } static __always_inline void @@ -4069,10 +4068,10 @@ void ring_buffer_record_off(struct trace_buffer *buffer) unsigned int rd; unsigned int new_rd; + rd = atomic_read(&buffer->record_disabled); do { - rd = atomic_read(&buffer->record_disabled); new_rd = rd | RB_BUFFER_OFF; - } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd); + } while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd)); } EXPORT_SYMBOL_GPL(ring_buffer_record_off); @@ -4092,10 +4091,10 @@ void ring_buffer_record_on(struct trace_buffer *buffer) unsigned int rd; unsigned int new_rd; + rd = atomic_read(&buffer->record_disabled); do { - rd = atomic_read(&buffer->record_disabled); new_rd = rd & ~RB_BUFFER_OFF; - } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd); + } while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd)); } EXPORT_SYMBOL_GPL(ring_buffer_record_on); @@ -4502,7 +4501,6 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, default: RB_WARN_ON(cpu_buffer, 1); } - return; } static void @@ -4533,7 +4531,6 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, default: RB_WARN_ON(iter->cpu_buffer, 1); } - return; } static struct buffer_page * @@ -4543,7 +4540,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) unsigned long overwrite; unsigned long flags; int nr_loops = 0; - int ret; + bool ret; local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); @@ -4953,7 +4950,6 @@ rb_reader_unlock(struct ring_buffer_per_cpu *cpu_buffer, bool locked) { if (likely(locked)) raw_spin_unlock(&cpu_buffer->reader_lock); - return; } /** @@ -5345,6 +5341,9 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); +/* Flag to ensure proper resetting of atomic variables */ +#define RESET_BIT (1 << 30) + /** * ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer * @buffer: The ring buffer to reset a per cpu buffer of @@ -5361,20 +5360,27 @@ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) for_each_online_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - atomic_inc(&cpu_buffer->resize_disabled); + atomic_add(RESET_BIT, &cpu_buffer->resize_disabled); atomic_inc(&cpu_buffer->record_disabled); } /* Make sure all commits have finished */ synchronize_rcu(); - for_each_online_buffer_cpu(buffer, cpu) { + for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; + /* + * If a CPU came online during the synchronize_rcu(), then + * ignore it. 
+ */ + if (!(atomic_read(&cpu_buffer->resize_disabled) & RESET_BIT)) + continue; + reset_disabled_cpu_buffer(cpu_buffer); atomic_dec(&cpu_buffer->record_disabled); - atomic_dec(&cpu_buffer->resize_disabled); + atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled); } mutex_unlock(&buffer->mutex); @@ -5424,8 +5430,8 @@ bool ring_buffer_empty(struct trace_buffer *buffer) struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; bool dolock; + bool ret; int cpu; - int ret; /* yes this is racy, but if you don't like the race, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { @@ -5454,7 +5460,7 @@ bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu) struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; bool dolock; - int ret; + bool ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return true; diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c index d65f6c25a87c..0186ff4cbd0b 100644 --- a/kernel/trace/rv/reactor_panic.c +++ b/kernel/trace/rv/reactor_panic.c @@ -38,6 +38,5 @@ static void __exit unregister_react_panic(void) module_init(register_react_panic); module_exit(unregister_react_panic); -MODULE_LICENSE("GPL"); MODULE_AUTHOR("Daniel Bristot de Oliveira"); MODULE_DESCRIPTION("panic rv reactor: panic if an exception is found."); diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c index 4b6b7106a477..178759dbf89f 100644 --- a/kernel/trace/rv/reactor_printk.c +++ b/kernel/trace/rv/reactor_printk.c @@ -37,6 +37,5 @@ static void __exit unregister_react_printk(void) module_init(register_react_printk); module_exit(unregister_react_printk); -MODULE_LICENSE("GPL"); MODULE_AUTHOR("Daniel Bristot de Oliveira"); MODULE_DESCRIPTION("printk rv reactor: printk if an exception is hit."); diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 7e9061828c24..2f68e93fff0b 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -290,8 +290,6 @@ static ssize_t monitor_enable_write_data(struct file *filp, const char __user *u if (retval) return retval; - retval = count; - mutex_lock(&rv_interface_lock); if (val) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 36a6037823cd..64a4dde073ef 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -60,6 +60,7 @@ */ bool ring_buffer_expanded; +#ifdef CONFIG_FTRACE_STARTUP_TEST /* * We need to change this state when a selftest is running. 
* A selftest will lurk into the ring-buffer to count the @@ -75,7 +76,6 @@ static bool __read_mostly tracing_selftest_running; */ bool __read_mostly tracing_selftest_disabled; -#ifdef CONFIG_FTRACE_STARTUP_TEST void __init disable_tracing_selftest(const char *reason) { if (!tracing_selftest_disabled) { @@ -83,6 +83,9 @@ void __init disable_tracing_selftest(const char *reason) pr_info("Ftrace startup test is disabled due to %s\n", reason); } } +#else +#define tracing_selftest_running 0 +#define tracing_selftest_disabled 0 #endif /* Pipe tracepoints to printk */ @@ -1051,7 +1054,10 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip, if (!(tr->trace_flags & TRACE_ITER_PRINTK)) return 0; - if (unlikely(tracing_selftest_running || tracing_disabled)) + if (unlikely(tracing_selftest_running && tr == &global_trace)) + return 0; + + if (unlikely(tracing_disabled)) return 0; alloc = sizeof(*entry) + size + 2; /* possible \n added */ @@ -2041,6 +2047,24 @@ static int run_tracer_selftest(struct tracer *type) return 0; } +static int do_run_tracer_selftest(struct tracer *type) +{ + int ret; + + /* + * Tests can take a long time, especially if they are run one after the + * other, as does happen during bootup when all the tracers are + * registered. This could cause the soft lockup watchdog to trigger. + */ + cond_resched(); + + tracing_selftest_running = true; + ret = run_tracer_selftest(type); + tracing_selftest_running = false; + + return ret; +} + static __init int init_trace_selftests(void) { struct trace_selftests *p, *n; @@ -2092,6 +2116,10 @@ static inline int run_tracer_selftest(struct tracer *type) { return 0; } +static inline int do_run_tracer_selftest(struct tracer *type) +{ + return 0; +} #endif /* CONFIG_FTRACE_STARTUP_TEST */ static void add_tracer_options(struct trace_array *tr, struct tracer *t); @@ -2127,8 +2155,6 @@ int __init register_tracer(struct tracer *type) mutex_lock(&trace_types_lock); - tracing_selftest_running = true; - for (t = trace_types; t; t = t->next) { if (strcmp(type->name, t->name) == 0) { /* already found */ @@ -2157,7 +2183,7 @@ int __init register_tracer(struct tracer *type) /* store the tracer for __set_tracer_option */ type->flags->trace = type; - ret = run_tracer_selftest(type); + ret = do_run_tracer_selftest(type); if (ret < 0) goto out; @@ -2166,7 +2192,6 @@ int __init register_tracer(struct tracer *type) add_tracer_options(&global_trace, type); out: - tracing_selftest_running = false; mutex_unlock(&trace_types_lock); if (ret || !default_bootup_tracer) @@ -3490,7 +3515,7 @@ __trace_array_vprintk(struct trace_buffer *buffer, unsigned int trace_ctx; char *tbuffer; - if (tracing_disabled || tracing_selftest_running) + if (tracing_disabled) return 0; /* Don't pollute graph traces with trace_vprintk internals */ @@ -3538,6 +3563,9 @@ __printf(3, 0) int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { + if (tracing_selftest_running && tr == &global_trace) + return 0; + return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args); } @@ -3726,7 +3754,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, #define STATIC_FMT_BUF_SIZE 128 static char static_fmt_buf[STATIC_FMT_BUF_SIZE]; -static char *trace_iter_expand_format(struct trace_iterator *iter) +char *trace_iter_expand_format(struct trace_iterator *iter) { char *tmp; @@ -4446,8 +4474,11 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) if (trace_seq_has_overflowed(s)) return TRACE_TYPE_PARTIAL_LINE; - if 
(event) + if (event) { + if (tr->trace_flags & TRACE_ITER_FIELDS) + return print_event_fields(iter, event); return event->funcs->trace(iter, sym_flags, event); + } trace_seq_printf(s, "Unknown type %d\n", entry->type); @@ -5749,7 +5780,7 @@ static const char readme_msg[] = "\t table using the key(s) and value(s) named, and the value of a\n" "\t sum called 'hitcount' is incremented. Keys and values\n" "\t correspond to fields in the event's format description. Keys\n" - "\t can be any field, or the special string 'stacktrace'.\n" + "\t can be any field, or the special string 'common_stacktrace'.\n" "\t Compound keys consisting of up to two fields can be specified\n" "\t by the 'keys' keyword. Values must correspond to numeric\n" "\t fields. Sort keys consisting of up to two fields can be\n" @@ -9658,7 +9689,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) tr->buffer_percent = 50; - trace_create_file("buffer_percent", TRACE_MODE_READ, d_tracer, + trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer, tr, &buffer_percent_fops); create_trace_options_dir(tr); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 616e1aa1c4da..79bdefe9261b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -619,6 +619,7 @@ bool trace_is_tracepoint_string(const char *str); const char *trace_event_format(struct trace_iterator *iter, const char *fmt); void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, va_list ap) __printf(2, 0); +char *trace_iter_expand_format(struct trace_iterator *iter); int trace_empty(struct trace_iterator *iter); @@ -1199,6 +1200,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(HEX, "hex"), \ C(BIN, "bin"), \ C(BLOCK, "block"), \ + C(FIELDS, "fields"), \ C(PRINTK, "trace_printk"), \ C(ANNOTATE, "annotate"), \ C(USERSTACKTRACE, "userstacktrace"), \ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 654ffa40457a..57e539d47989 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -194,6 +194,8 @@ static int trace_define_generic_fields(void) __generic_field(int, common_cpu, FILTER_CPU); __generic_field(char *, COMM, FILTER_COMM); __generic_field(char *, comm, FILTER_COMM); + __generic_field(char *, stacktrace, FILTER_STACKTRACE); + __generic_field(char *, STACKTRACE, FILTER_STACKTRACE); return ret; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 486cca3c2b75..b97d3ad832f1 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1364,7 +1364,7 @@ static const char *hist_field_name(struct hist_field *field, if (field->field) field_name = field->field->name; else - field_name = "stacktrace"; + field_name = "common_stacktrace"; } else if (field->flags & HIST_FIELD_FL_HITCOUNT) field_name = "hitcount"; @@ -2367,7 +2367,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, hist_data->enable_timestamps = true; if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS) hist_data->attrs->ts_in_usecs = true; - } else if (strcmp(field_name, "stacktrace") == 0) { + } else if (strcmp(field_name, "common_stacktrace") == 0) { *flags |= HIST_FIELD_FL_STACKTRACE; } else if (strcmp(field_name, "common_cpu") == 0) *flags |= HIST_FIELD_FL_CPU; @@ -2378,11 +2378,15 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, if (!field || !field->size) { /* * For backward compatibility, if field_name - * was "cpu", then we treat this the same as - 
* common_cpu. This also works for "CPU". + * was "cpu" or "stacktrace", then we treat this + * the same as common_cpu and common_stacktrace + * respectively. This also works for "CPU", and + * "STACKTRACE". */ if (field && field->filter_type == FILTER_CPU) { *flags |= HIST_FIELD_FL_CPU; + } else if (field && field->filter_type == FILTER_STACKTRACE) { + *flags |= HIST_FIELD_FL_STACKTRACE; } else { hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name)); @@ -4238,13 +4242,19 @@ static int __create_val_field(struct hist_trigger_data *hist_data, goto out; } - /* Some types cannot be a value */ - if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT | - HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2 | - HIST_FIELD_FL_SYM | HIST_FIELD_FL_SYM_OFFSET | - HIST_FIELD_FL_SYSCALL | HIST_FIELD_FL_STACKTRACE)) { - hist_err(file->tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(field_str)); - ret = -EINVAL; + /* values and variables should not have some modifiers */ + if (hist_field->flags & HIST_FIELD_FL_VAR) { + /* Variable */ + if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT | + HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2)) + goto err; + } else { + /* Value */ + if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT | + HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2 | + HIST_FIELD_FL_SYM | HIST_FIELD_FL_SYM_OFFSET | + HIST_FIELD_FL_SYSCALL | HIST_FIELD_FL_STACKTRACE)) + goto err; } hist_data->fields[val_idx] = hist_field; @@ -4256,6 +4266,9 @@ static int __create_val_field(struct hist_trigger_data *hist_data, ret = -EINVAL; out: return ret; + err: + hist_err(file->tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(field_str)); + return -EINVAL; } static int create_val_field(struct hist_trigger_data *hist_data, @@ -5385,7 +5398,7 @@ static void hist_trigger_print_key(struct seq_file *m, if (key_field->field) seq_printf(m, "%s.stacktrace", key_field->field->name); else - seq_puts(m, "stacktrace:\n"); + seq_puts(m, "common_stacktrace:\n"); hist_trigger_stacktrace_print(m, key + key_field->offset, HIST_STACKTRACE_DEPTH); @@ -5968,7 +5981,7 @@ static int event_hist_trigger_print(struct seq_file *m, if (field->field) seq_printf(m, "%s.stacktrace", field->field->name); else - seq_puts(m, "stacktrace"); + seq_puts(m, "common_stacktrace"); } else hist_field_print(m, field); } diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 908e8a13c675..dbb14705d0d3 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -19,14 +19,12 @@ #include <linux/tracefs.h> #include <linux/types.h> #include <linux/uaccess.h> -/* Reminder to move to uapi when everything works */ -#ifdef CONFIG_COMPILE_TEST +#include <linux/highmem.h> +#include <linux/init.h> #include <linux/user_events.h> -#else -#include <uapi/linux/user_events.h> -#endif -#include "trace.h" #include "trace_dynevent.h" +#include "trace_output.h" +#include "trace.h" #define USER_EVENTS_PREFIX_LEN (sizeof(USER_EVENTS_PREFIX)-1) @@ -34,35 +32,12 @@ #define FIELD_DEPTH_NAME 1 #define FIELD_DEPTH_SIZE 2 -/* - * Limits how many trace_event calls user processes can create: - * Must be a power of two of PAGE_SIZE. - */ -#define MAX_PAGE_ORDER 0 -#define MAX_PAGES (1 << MAX_PAGE_ORDER) -#define MAX_BYTES (MAX_PAGES * PAGE_SIZE) -#define MAX_EVENTS (MAX_BYTES * 8) - /* Limit how long of an event name plus args within the subsystem. 
*/ #define MAX_EVENT_DESC 512 #define EVENT_NAME(user_event) ((user_event)->tracepoint.name) #define MAX_FIELD_ARRAY_SIZE 1024 /* - * The MAP_STATUS_* macros are used for taking a index and determining the - * appropriate byte and the bit in the byte to set/reset for an event. - * - * The lower 3 bits of the index decide which bit to set. - * The remaining upper bits of the index decide which byte to use for the bit. - * - * This is used when an event has a probe attached/removed to reflect live - * status of the event wanting tracing or not to user-programs via shared - * memory maps. - */ -#define MAP_STATUS_BYTE(index) ((index) >> 3) -#define MAP_STATUS_MASK(index) BIT((index) & 7) - -/* * Internal bits (kernel side only) to keep track of connected probes: * These are used when status is requested in text form about an event. These * bits are compared against an internal byte on the event to determine which @@ -75,25 +50,25 @@ #define EVENT_STATUS_OTHER BIT(7) /* - * Stores the pages, tables, and locks for a group of events. - * Each logical grouping of events has its own group, with a - * matching page for status checks within user programs. This - * allows for isolation of events to user programs by various - * means. + * Stores the system name, tables, and locks for a group of events. This + * allows isolation for events by various means. */ struct user_event_group { - struct page *pages; - char *register_page_data; - char *system_name; - struct hlist_node node; - struct mutex reg_mutex; + char *system_name; + struct hlist_node node; + struct mutex reg_mutex; DECLARE_HASHTABLE(register_table, 8); - DECLARE_BITMAP(page_bitmap, MAX_EVENTS); }; /* Group for init_user_ns mapping, top-most group */ static struct user_event_group *init_group; +/* Max allowed events for the whole system */ +static unsigned int max_user_events = 32768; + +/* Current number of events on the whole system */ +static unsigned int current_user_events; + /* * Stores per-event properties, as users register events * within a file a user_event might be created if it does not @@ -102,21 +77,63 @@ static struct user_event_group *init_group; * refcnt reaches one. */ struct user_event { - struct user_event_group *group; - struct tracepoint tracepoint; - struct trace_event_call call; - struct trace_event_class class; - struct dyn_event devent; - struct hlist_node node; - struct list_head fields; - struct list_head validators; - refcount_t refcnt; - int index; - int flags; - int min_size; - char status; + struct user_event_group *group; + struct tracepoint tracepoint; + struct trace_event_call call; + struct trace_event_class class; + struct dyn_event devent; + struct hlist_node node; + struct list_head fields; + struct list_head validators; + refcount_t refcnt; + int min_size; + char status; +}; + +/* + * Stores per-mm/event properties that enable an address to be + * updated properly for each task. As tasks are forked, we use + * these to track enablement sites that are tied to an event. + */ +struct user_event_enabler { + struct list_head mm_enablers_link; + struct user_event *event; + unsigned long addr; + + /* Track enable bit, flags, etc. Aligned for bitops. 
*/ + unsigned long values; +}; + +/* Bits 0-5 are for the bit to update upon enable/disable (0-63 allowed) */ +#define ENABLE_VAL_BIT_MASK 0x3F + +/* Bit 6 is for faulting status of enablement */ +#define ENABLE_VAL_FAULTING_BIT 6 + +/* Bit 7 is for freeing status of enablement */ +#define ENABLE_VAL_FREEING_BIT 7 + +/* Only duplicate the bit value */ +#define ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK + +#define ENABLE_BITOPS(e) (&(e)->values) + +#define ENABLE_BIT(e) ((int)((e)->values & ENABLE_VAL_BIT_MASK)) + +/* Used for asynchronous faulting in of pages */ +struct user_event_enabler_fault { + struct work_struct work; + struct user_event_mm *mm; + struct user_event_enabler *enabler; + int attempt; }; +static struct kmem_cache *fault_cache; + +/* Global list of memory descriptors using user_events */ +static LIST_HEAD(user_event_mms); +static DEFINE_SPINLOCK(user_event_mms_lock); + /* * Stores per-file events references, as users register events * within a file this structure is modified and freed via RCU. @@ -124,23 +141,23 @@ struct user_event { * These are not shared and only accessible by the file that created it. */ struct user_event_refs { - struct rcu_head rcu; - int count; - struct user_event *events[]; + struct rcu_head rcu; + int count; + struct user_event *events[]; }; struct user_event_file_info { - struct user_event_group *group; - struct user_event_refs *refs; + struct user_event_group *group; + struct user_event_refs *refs; }; #define VALIDATOR_ENSURE_NULL (1 << 0) #define VALIDATOR_REL (1 << 1) struct user_event_validator { - struct list_head link; - int offset; - int flags; + struct list_head user_event_link; + int offset; + int flags; }; typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i, @@ -150,33 +167,17 @@ static int user_event_parse(struct user_event_group *group, char *name, char *args, char *flags, struct user_event **newuser); +static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm); +static struct user_event_mm *user_event_mm_get_all(struct user_event *user); +static void user_event_mm_put(struct user_event_mm *mm); + static u32 user_event_key(char *name) { return jhash(name, strlen(name), 0); } -static void set_page_reservations(char *pages, bool set) -{ - int page; - - for (page = 0; page < MAX_PAGES; ++page) { - void *addr = pages + (PAGE_SIZE * page); - - if (set) - SetPageReserved(virt_to_page(addr)); - else - ClearPageReserved(virt_to_page(addr)); - } -} - static void user_event_group_destroy(struct user_event_group *group) { - if (group->register_page_data) - set_page_reservations(group->register_page_data, false); - - if (group->pages) - __free_pages(group->pages, MAX_PAGE_ORDER); - kfree(group->system_name); kfree(group); } @@ -247,19 +248,6 @@ static struct user_event_group if (!group->system_name) goto error; - group->pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER); - - if (!group->pages) - goto error; - - group->register_page_data = page_address(group->pages); - - set_page_reservations(group->register_page_data, true); - - /* Zero all bits beside 0 (which is reserved for failures) */ - bitmap_zero(group->page_bitmap, MAX_EVENTS); - set_bit(0, group->page_bitmap); - mutex_init(&group->reg_mutex); hash_init(group->register_table); @@ -271,20 +259,545 @@ error: return NULL; }; -static __always_inline -void user_event_register_set(struct user_event *user) +static void user_event_enabler_destroy(struct user_event_enabler *enabler) +{ + list_del_rcu(&enabler->mm_enablers_link); + + /* No longer 
tracking the event via the enabler */ + refcount_dec(&enabler->event->refcnt); + + kfree(enabler); +} + +static int user_event_mm_fault_in(struct user_event_mm *mm, unsigned long uaddr, + int attempt) +{ + bool unlocked; + int ret; + + /* + * Normally this is low, ensure that it cannot be taken advantage of by + * bad user processes to cause excessive looping. + */ + if (attempt > 10) + return -EFAULT; + + mmap_read_lock(mm->mm); + + /* Ensure MM has tasks, cannot use after exit_mm() */ + if (refcount_read(&mm->tasks) == 0) { + ret = -ENOENT; + goto out; + } + + ret = fixup_user_fault(mm->mm, uaddr, FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE, + &unlocked); +out: + mmap_read_unlock(mm->mm); + + return ret; +} + +static int user_event_enabler_write(struct user_event_mm *mm, + struct user_event_enabler *enabler, + bool fixup_fault, int *attempt); + +static void user_event_enabler_fault_fixup(struct work_struct *work) +{ + struct user_event_enabler_fault *fault = container_of( + work, struct user_event_enabler_fault, work); + struct user_event_enabler *enabler = fault->enabler; + struct user_event_mm *mm = fault->mm; + unsigned long uaddr = enabler->addr; + int attempt = fault->attempt; + int ret; + + ret = user_event_mm_fault_in(mm, uaddr, attempt); + + if (ret && ret != -ENOENT) { + struct user_event *user = enabler->event; + + pr_warn("user_events: Fault for mm: 0x%pK @ 0x%llx event: %s\n", + mm->mm, (unsigned long long)uaddr, EVENT_NAME(user)); + } + + /* Prevent state changes from racing */ + mutex_lock(&event_mutex); + + /* User asked for enabler to be removed during fault */ + if (test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))) { + user_event_enabler_destroy(enabler); + goto out; + } + + /* + * If we managed to get the page, re-issue the write. We do not + * want to get into a possible infinite loop, which is why we only + * attempt again directly if the page came in. If we couldn't get + * the page here, then we will try again the next time the event is + * enabled/disabled. 
+ */ + clear_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)); + + if (!ret) { + mmap_read_lock(mm->mm); + user_event_enabler_write(mm, enabler, true, &attempt); + mmap_read_unlock(mm->mm); + } +out: + mutex_unlock(&event_mutex); + + /* In all cases we no longer need the mm or fault */ + user_event_mm_put(mm); + kmem_cache_free(fault_cache, fault); +} + +static bool user_event_enabler_queue_fault(struct user_event_mm *mm, + struct user_event_enabler *enabler, + int attempt) +{ + struct user_event_enabler_fault *fault; + + fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN); + + if (!fault) + return false; + + INIT_WORK(&fault->work, user_event_enabler_fault_fixup); + fault->mm = user_event_mm_get(mm); + fault->enabler = enabler; + fault->attempt = attempt; + + /* Don't try to queue in again while we have a pending fault */ + set_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)); + + if (!schedule_work(&fault->work)) { + /* Allow another attempt later */ + clear_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)); + + user_event_mm_put(mm); + kmem_cache_free(fault_cache, fault); + + return false; + } + + return true; +} + +static int user_event_enabler_write(struct user_event_mm *mm, + struct user_event_enabler *enabler, + bool fixup_fault, int *attempt) +{ + unsigned long uaddr = enabler->addr; + unsigned long *ptr; + struct page *page; + void *kaddr; + int ret; + + lockdep_assert_held(&event_mutex); + mmap_assert_locked(mm->mm); + + *attempt += 1; + + /* Ensure MM has tasks, cannot use after exit_mm() */ + if (refcount_read(&mm->tasks) == 0) + return -ENOENT; + + if (unlikely(test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)) || + test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler)))) + return -EBUSY; + + ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT, + &page, NULL, NULL); + + if (unlikely(ret <= 0)) { + if (!fixup_fault) + return -EFAULT; + + if (!user_event_enabler_queue_fault(mm, enabler, *attempt)) + pr_warn("user_events: Unable to queue fault handler\n"); + + return -EFAULT; + } + + kaddr = kmap_local_page(page); + ptr = kaddr + (uaddr & ~PAGE_MASK); + + /* Update bit atomically, user tracers must be atomic as well */ + if (enabler->event && enabler->event->status) + set_bit(ENABLE_BIT(enabler), ptr); + else + clear_bit(ENABLE_BIT(enabler), ptr); + + kunmap_local(kaddr); + unpin_user_pages_dirty_lock(&page, 1, true); + + return 0; +} + +static bool user_event_enabler_exists(struct user_event_mm *mm, + unsigned long uaddr, unsigned char bit) +{ + struct user_event_enabler *enabler; + + list_for_each_entry(enabler, &mm->enablers, mm_enablers_link) { + if (enabler->addr == uaddr && ENABLE_BIT(enabler) == bit) + return true; + } + + return false; +} + +static void user_event_enabler_update(struct user_event *user) +{ + struct user_event_enabler *enabler; + struct user_event_mm *next; + struct user_event_mm *mm; + int attempt; + + lockdep_assert_held(&event_mutex); + + /* + * We need to build a one-shot list of all the mms that have an + * enabler for the user_event passed in. This list is only valid + * while holding the event_mutex. The only reason for this is due + * to the global mm list being RCU protected and we use methods + * which can wait (mmap_read_lock and pin_user_pages_remote). + * + * NOTE: user_event_mm_get_all() increments the ref count of each + * mm that is added to the list to prevent removal timing windows. + * We must always put each mm after they are used, which may wait. 
+ */ + mm = user_event_mm_get_all(user); + + while (mm) { + next = mm->next; + mmap_read_lock(mm->mm); + + list_for_each_entry(enabler, &mm->enablers, mm_enablers_link) { + if (enabler->event == user) { + attempt = 0; + user_event_enabler_write(mm, enabler, true, &attempt); + } + } + + mmap_read_unlock(mm->mm); + user_event_mm_put(mm); + mm = next; + } +} + +static bool user_event_enabler_dup(struct user_event_enabler *orig, + struct user_event_mm *mm) +{ + struct user_event_enabler *enabler; + + /* Skip pending frees */ + if (unlikely(test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(orig)))) + return true; + + enabler = kzalloc(sizeof(*enabler), GFP_NOWAIT | __GFP_ACCOUNT); + + if (!enabler) + return false; + + enabler->event = orig->event; + enabler->addr = orig->addr; + + /* Only dup part of value (ignore future flags, etc) */ + enabler->values = orig->values & ENABLE_VAL_DUP_MASK; + + refcount_inc(&enabler->event->refcnt); + + /* Enablers not exposed yet, RCU not required */ + list_add(&enabler->mm_enablers_link, &mm->enablers); + + return true; +} + +static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm) +{ + refcount_inc(&mm->refcnt); + + return mm; +} + +static struct user_event_mm *user_event_mm_get_all(struct user_event *user) +{ + struct user_event_mm *found = NULL; + struct user_event_enabler *enabler; + struct user_event_mm *mm; + + /* + * We use the mm->next field to build a one-shot list from the global + * RCU protected list. To build this list the event_mutex must be held. + * This lets us build a list without requiring allocs that could fail + * when user based events are most wanted for diagnostics. + */ + lockdep_assert_held(&event_mutex); + + /* + * We do not want to block fork/exec while enablements are being + * updated, so we use RCU to walk the current tasks that have used + * user_events ABI for 1 or more events. Each enabler found in each + * task that matches the event being updated has a write to reflect + * the kernel state back into the process. Waits/faults must not occur + * during this. So we scan the list under RCU for all the mm that have + * the event within it. This is needed because mm_read_lock() can wait. + * Each user mm returned has a ref inc to handle remove RCU races. + */ + rcu_read_lock(); + + list_for_each_entry_rcu(mm, &user_event_mms, mms_link) { + list_for_each_entry_rcu(enabler, &mm->enablers, mm_enablers_link) { + if (enabler->event == user) { + mm->next = found; + found = user_event_mm_get(mm); + break; + } + } + } + + rcu_read_unlock(); + + return found; +} + +static struct user_event_mm *user_event_mm_alloc(struct task_struct *t) +{ + struct user_event_mm *user_mm; + + user_mm = kzalloc(sizeof(*user_mm), GFP_KERNEL_ACCOUNT); + + if (!user_mm) + return NULL; + + user_mm->mm = t->mm; + INIT_LIST_HEAD(&user_mm->enablers); + refcount_set(&user_mm->refcnt, 1); + refcount_set(&user_mm->tasks, 1); + + /* + * The lifetime of the memory descriptor can slightly outlast + * the task lifetime if a ref to the user_event_mm is taken + * between list_del_rcu() and call_rcu(). Therefore we need + * to take a reference to it to ensure it can live this long + * under this corner case. This can also occur in clones that + * outlast the parent. 
+ */ + mmgrab(user_mm->mm); + + return user_mm; +} + +static void user_event_mm_attach(struct user_event_mm *user_mm, struct task_struct *t) +{ + unsigned long flags; + + spin_lock_irqsave(&user_event_mms_lock, flags); + list_add_rcu(&user_mm->mms_link, &user_event_mms); + spin_unlock_irqrestore(&user_event_mms_lock, flags); + + t->user_event_mm = user_mm; +} + +static struct user_event_mm *current_user_event_mm(void) +{ + struct user_event_mm *user_mm = current->user_event_mm; + + if (user_mm) + goto inc; + + user_mm = user_event_mm_alloc(current); + + if (!user_mm) + goto error; + + user_event_mm_attach(user_mm, current); +inc: + refcount_inc(&user_mm->refcnt); +error: + return user_mm; +} + +static void user_event_mm_destroy(struct user_event_mm *mm) +{ + struct user_event_enabler *enabler, *next; + + list_for_each_entry_safe(enabler, next, &mm->enablers, mm_enablers_link) + user_event_enabler_destroy(enabler); + + mmdrop(mm->mm); + kfree(mm); +} + +static void user_event_mm_put(struct user_event_mm *mm) +{ + if (mm && refcount_dec_and_test(&mm->refcnt)) + user_event_mm_destroy(mm); +} + +static void delayed_user_event_mm_put(struct work_struct *work) +{ + struct user_event_mm *mm; + + mm = container_of(to_rcu_work(work), struct user_event_mm, put_rwork); + user_event_mm_put(mm); +} + +void user_event_mm_remove(struct task_struct *t) +{ + struct user_event_mm *mm; + unsigned long flags; + + might_sleep(); + + mm = t->user_event_mm; + t->user_event_mm = NULL; + + /* Clone will increment the tasks, only remove if last clone */ + if (!refcount_dec_and_test(&mm->tasks)) + return; + + /* Remove the mm from the list, so it can no longer be enabled */ + spin_lock_irqsave(&user_event_mms_lock, flags); + list_del_rcu(&mm->mms_link); + spin_unlock_irqrestore(&user_event_mms_lock, flags); + + /* + * We need to wait for currently occurring writes to stop within + * the mm. This is required since exit_mm() snaps the current rss + * stats and clears them. On the final mmdrop(), check_mm() will + * report a bug if these increment. + * + * All writes/pins are done under mmap_read lock, take the write + * lock to ensure in-progress faults have completed. Faults that + * are pending but yet to run will check the task count and skip + * the fault since the mm is going away. + */ + mmap_write_lock(mm->mm); + mmap_write_unlock(mm->mm); + + /* + * Put for mm must be done after RCU delay to handle new refs in + * between the list_del_rcu() and now. This ensures any get refs + * during rcu_read_lock() are accounted for during list removal. + * + * CPU A | CPU B + * --------------------------------------------------------------- + * user_event_mm_remove() | rcu_read_lock(); + * list_del_rcu() | list_for_each_entry_rcu(); + * call_rcu() | refcount_inc(); + * . | rcu_read_unlock(); + * schedule_work() | . + * user_event_mm_put() | . + * + * mmdrop() cannot be called in the softirq context of call_rcu() + * so we use a work queue after call_rcu() to run within. 
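
/*
 * Minimal sketch of the "free after a grace period, but from process
 * context" pattern used by user_event_mm_remove() above: queue_rcu_work()
 * waits for an RCU grace period and then runs the work item on a
 * workqueue, where sleeping operations such as mmdrop() are allowed.
 * The object name is illustrative.
 */
#include <linux/container_of.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct deferred_obj {
	struct rcu_work put_rwork;
	/* ... payload ... */
};

static void deferred_obj_put_workfn(struct work_struct *work)
{
	struct deferred_obj *obj =
		container_of(to_rcu_work(work), struct deferred_obj, put_rwork);

	kfree(obj);	/* process context: sleeping here is fine */
}

static void deferred_obj_release(struct deferred_obj *obj)
{
	INIT_RCU_WORK(&obj->put_rwork, deferred_obj_put_workfn);
	queue_rcu_work(system_wq, &obj->put_rwork);	/* runs after a grace period */
}
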
+ */ + INIT_RCU_WORK(&mm->put_rwork, delayed_user_event_mm_put); + queue_rcu_work(system_wq, &mm->put_rwork); +} + +void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm) { - int i = user->index; + struct user_event_mm *mm = user_event_mm_alloc(t); + struct user_event_enabler *enabler; + + if (!mm) + return; + + rcu_read_lock(); - user->group->register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i); + list_for_each_entry_rcu(enabler, &old_mm->enablers, mm_enablers_link) { + if (!user_event_enabler_dup(enabler, mm)) + goto error; + } + + rcu_read_unlock(); + + user_event_mm_attach(mm, t); + return; +error: + rcu_read_unlock(); + user_event_mm_destroy(mm); } -static __always_inline -void user_event_register_clear(struct user_event *user) +static bool current_user_event_enabler_exists(unsigned long uaddr, + unsigned char bit) { - int i = user->index; + struct user_event_mm *user_mm = current_user_event_mm(); + bool exists; + + if (!user_mm) + return false; + + exists = user_event_enabler_exists(user_mm, uaddr, bit); - user->group->register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i); + user_event_mm_put(user_mm); + + return exists; +} + +static struct user_event_enabler +*user_event_enabler_create(struct user_reg *reg, struct user_event *user, + int *write_result) +{ + struct user_event_enabler *enabler; + struct user_event_mm *user_mm; + unsigned long uaddr = (unsigned long)reg->enable_addr; + int attempt = 0; + + user_mm = current_user_event_mm(); + + if (!user_mm) + return NULL; + + enabler = kzalloc(sizeof(*enabler), GFP_KERNEL_ACCOUNT); + + if (!enabler) + goto out; + + enabler->event = user; + enabler->addr = uaddr; + enabler->values = reg->enable_bit; +retry: + /* Prevents state changes from racing with new enablers */ + mutex_lock(&event_mutex); + + /* Attempt to reflect the current state within the process */ + mmap_read_lock(user_mm->mm); + *write_result = user_event_enabler_write(user_mm, enabler, false, + &attempt); + mmap_read_unlock(user_mm->mm); + + /* + * If the write works, then we will track the enabler. A ref to the + * underlying user_event is held by the enabler to prevent it going + * away while the enabler is still in use by a process. The ref is + * removed when the enabler is destroyed. This means a event cannot + * be forcefully deleted from the system until all tasks using it + * exit or run exec(), which includes forks and clones. 
+ */ + if (!*write_result) { + refcount_inc(&enabler->event->refcnt); + list_add_rcu(&enabler->mm_enablers_link, &user_mm->enablers); + } + + mutex_unlock(&event_mutex); + + if (*write_result) { + /* Attempt to fault-in and retry if it worked */ + if (!user_event_mm_fault_in(user_mm, uaddr, attempt)) + goto retry; + + kfree(enabler); + enabler = NULL; + } +out: + user_event_mm_put(user_mm); + + return enabler; } static __always_inline __must_check @@ -424,8 +937,8 @@ static void user_event_destroy_validators(struct user_event *user) struct user_event_validator *validator, *next; struct list_head *head = &user->validators; - list_for_each_entry_safe(validator, next, head, link) { - list_del(&validator->link); + list_for_each_entry_safe(validator, next, head, user_event_link) { + list_del(&validator->user_event_link); kfree(validator); } } @@ -449,7 +962,7 @@ static int user_event_add_field(struct user_event *user, const char *type, struct ftrace_event_field *field; int validator_flags = 0; - field = kmalloc(sizeof(*field), GFP_KERNEL); + field = kmalloc(sizeof(*field), GFP_KERNEL_ACCOUNT); if (!field) return -ENOMEM; @@ -468,7 +981,7 @@ add_validator: if (strstr(type, "char") != NULL) validator_flags |= VALIDATOR_ENSURE_NULL; - validator = kmalloc(sizeof(*validator), GFP_KERNEL); + validator = kmalloc(sizeof(*validator), GFP_KERNEL_ACCOUNT); if (!validator) { kfree(field); @@ -479,7 +992,7 @@ add_validator: validator->offset = offset; /* Want sequential access when validating */ - list_add_tail(&validator->link, &user->validators); + list_add_tail(&validator->user_event_link, &user->validators); add_field: field->type = type; @@ -489,6 +1002,9 @@ add_field: field->is_signed = is_signed; field->filter_type = filter_type; + if (filter_type == FILTER_OTHER) + field->filter_type = filter_assign_type(type); + list_add(&field->link, &user->fields); /* @@ -754,7 +1270,7 @@ static int user_event_create_print_fmt(struct user_event *user) len = user_event_set_print_fmt(user, NULL, 0); - print_fmt = kmalloc(len, GFP_KERNEL); + print_fmt = kmalloc(len, GFP_KERNEL_ACCOUNT); if (!print_fmt) return -ENOMEM; @@ -770,11 +1286,7 @@ static enum print_line_t user_event_print_trace(struct trace_iterator *iter, int flags, struct trace_event *event) { - /* Unsafe to try to decode user provided print_fmt, use hex */ - trace_print_hex_dump_seq(&iter->seq, "", DUMP_PREFIX_OFFSET, 16, - 1, iter->ent, iter->ent_size, true); - - return trace_handle_return(&iter->seq); + return print_event_fields(iter, event); } static struct trace_event_functions user_event_funcs = { @@ -820,6 +1332,8 @@ static int destroy_user_event(struct user_event *user) { int ret = 0; + lockdep_assert_held(&event_mutex); + /* Must destroy fields before call removal */ user_event_destroy_fields(user); @@ -829,9 +1343,6 @@ static int destroy_user_event(struct user_event *user) return ret; dyn_event_remove(&user->devent); - - user_event_register_clear(user); - clear_bit(user->index, user->group->page_bitmap); hash_del(&user->node); user_event_destroy_validators(user); @@ -839,6 +1350,11 @@ static int destroy_user_event(struct user_event *user) kfree(EVENT_NAME(user)); kfree(user); + if (current_user_events > 0) + current_user_events--; + else + pr_alert("BUG: Bad current_user_events\n"); + return ret; } @@ -866,7 +1382,7 @@ static int user_event_validate(struct user_event *user, void *data, int len) void *pos, *end = data + len; u32 loc, offset, size; - list_for_each_entry(validator, head, link) { + list_for_each_entry(validator, head, 
user_event_link) { pos = data + validator->offset; /* Already done min_size check, no bounds check here */ @@ -977,9 +1493,9 @@ discard: #endif /* - * Update the register page that is shared between user processes. + * Update the enabled bit among all user processes. */ -static void update_reg_page_for(struct user_event *user) +static void update_enable_bit_for(struct user_event *user) { struct tracepoint *tp = &user->tracepoint; char status = 0; @@ -1010,12 +1526,9 @@ static void update_reg_page_for(struct user_event *user) rcu_read_unlock_sched(); } - if (status) - user_event_register_set(user); - else - user_event_register_clear(user); - user->status = status; + + user_event_enabler_update(user); } /* @@ -1072,10 +1585,10 @@ static int user_event_reg(struct trace_event_call *call, return ret; inc: refcount_inc(&user->refcnt); - update_reg_page_for(user); + update_enable_bit_for(user); return 0; dec: - update_reg_page_for(user); + update_enable_bit_for(user); refcount_dec(&user->refcnt); return 0; } @@ -1093,7 +1606,7 @@ static int user_event_create(const char *raw_command) raw_command += USER_EVENTS_PREFIX_LEN; raw_command = skip_spaces(raw_command); - name = kstrdup(raw_command, GFP_KERNEL); + name = kstrdup(raw_command, GFP_KERNEL_ACCOUNT); if (!name) return -ENOMEM; @@ -1271,7 +1784,6 @@ static int user_event_parse(struct user_event_group *group, char *name, struct user_event **newuser) { int ret; - int index; u32 key; struct user_event *user; @@ -1290,12 +1802,7 @@ static int user_event_parse(struct user_event_group *group, char *name, return 0; } - index = find_first_zero_bit(group->page_bitmap, MAX_EVENTS); - - if (index == MAX_EVENTS) - return -EMFILE; - - user = kzalloc(sizeof(*user), GFP_KERNEL); + user = kzalloc(sizeof(*user), GFP_KERNEL_ACCOUNT); if (!user) return -ENOMEM; @@ -1335,20 +1842,23 @@ static int user_event_parse(struct user_event_group *group, char *name, mutex_lock(&event_mutex); + if (current_user_events >= max_user_events) { + ret = -EMFILE; + goto put_user_lock; + } + ret = user_event_trace_register(user); if (ret) goto put_user_lock; - user->index = index; - /* Ensure we track self ref and caller ref (2) */ refcount_set(&user->refcnt, 2); dyn_event_init(&user->devent, &user_event_dops); dyn_event_add(&user->devent, &user->call); - set_bit(user->index, group->page_bitmap); hash_add(group->register_table, &user->node, key); + current_user_events++; mutex_unlock(&event_mutex); @@ -1398,6 +1908,9 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) if (unlikely(copy_from_iter(&idx, sizeof(idx), i) != sizeof(idx))) return -EFAULT; + if (idx < 0) + return -EINVAL; + rcu_read_lock_sched(); refs = rcu_dereference_sched(info->refs); @@ -1468,7 +1981,7 @@ static int user_events_open(struct inode *node, struct file *file) if (!group) return -ENOENT; - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = kzalloc(sizeof(*info), GFP_KERNEL_ACCOUNT); if (!info) return -ENOMEM; @@ -1521,7 +2034,7 @@ static int user_events_ref_add(struct user_event_file_info *info, size = struct_size(refs, events, count + 1); - new_refs = kzalloc(size, GFP_KERNEL); + new_refs = kzalloc(size, GFP_KERNEL_ACCOUNT); if (!new_refs) return -ENOMEM; @@ -1564,6 +2077,37 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg) if (ret) return ret; + /* Ensure no flags, since we don't support any yet */ + if (kreg->flags != 0) + return -EINVAL; + + /* Ensure supported size */ + switch (kreg->enable_size) { + case 4: + /* 32-bit */ + break; +#if 
BITS_PER_LONG >= 64 + case 8: + /* 64-bit */ + break; +#endif + default: + return -EINVAL; + } + + /* Ensure natural alignment */ + if (kreg->enable_addr % kreg->enable_size) + return -EINVAL; + + /* Ensure bit range for size */ + if (kreg->enable_bit > (kreg->enable_size * BITS_PER_BYTE) - 1) + return -EINVAL; + + /* Ensure accessible */ + if (!access_ok((const void __user *)(uintptr_t)kreg->enable_addr, + kreg->enable_size)) + return -EFAULT; + kreg->size = size; return 0; @@ -1578,14 +2122,26 @@ static long user_events_ioctl_reg(struct user_event_file_info *info, struct user_reg __user *ureg = (struct user_reg __user *)uarg; struct user_reg reg; struct user_event *user; + struct user_event_enabler *enabler; char *name; long ret; + int write_result; ret = user_reg_get(ureg, ®); if (ret) return ret; + /* + * Prevent users from using the same address and bit multiple times + * within the same mm address space. This can cause unexpected behavior + * for user processes that is far easier to debug if this is explictly + * an error upon registering. + */ + if (current_user_event_enabler_exists((unsigned long)reg.enable_addr, + reg.enable_bit)) + return -EADDRINUSE; + name = strndup_user((const char __user *)(uintptr_t)reg.name_args, MAX_EVENT_DESC); @@ -1610,8 +2166,28 @@ static long user_events_ioctl_reg(struct user_event_file_info *info, if (ret < 0) return ret; + /* + * user_events_ref_add succeeded: + * At this point we have a user_event, it's lifetime is bound by the + * reference count, not this file. If anything fails, the user_event + * still has a reference until the file is released. During release + * any remaining references (from user_events_ref_add) are decremented. + * + * Attempt to create an enabler, which too has a lifetime tied in the + * same way for the event. Once the task that caused the enabler to be + * created exits or issues exec() then the enablers it has created + * will be destroyed and the ref to the event will be decremented. 
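
/*
 * Hedged user-space sketch of a registration that satisfies the checks in
 * user_reg_get() above: a naturally aligned 32-bit enable word, a bit
 * index below 32, no flags, and an accessible address.  Assumes the uapi
 * of this series (struct user_reg, DIAG_IOCSREG and the tracefs
 * user_events_data file); the event name and variable names are
 * illustrative.
 */
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/user_events.h>

static uint32_t enabled;	/* kernel sets/clears bit 0 in this word */

static int register_my_event(int data_fd, int *write_index)
{
	struct user_reg reg;

	memset(&reg, 0, sizeof(reg));	/* flags must be zero */
	reg.size = sizeof(reg);
	reg.enable_bit = 0;				/* must be < enable_size * 8 */
	reg.enable_size = sizeof(enabled);		/* 4 (or 8 on 64-bit) only */
	reg.enable_addr = (uint64_t)(uintptr_t)&enabled; /* naturally aligned */
	reg.name_args = (uint64_t)(uintptr_t)"test u32 count";

	if (ioctl(data_fd, DIAG_IOCSREG, &reg) < 0)
		return -1;

	*write_index = reg.write_index;	/* goes in the first iovec of each write */
	return 0;
}
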
+ */ + enabler = user_event_enabler_create(®, user, &write_result); + + if (!enabler) + return -ENOMEM; + + /* Write failed/faulted, give error back to caller */ + if (write_result) + return write_result; + put_user((u32)ret, &ureg->write_index); - put_user(user->index, &ureg->status_bit); return 0; } @@ -1641,6 +2217,115 @@ static long user_events_ioctl_del(struct user_event_file_info *info, return ret; } +static long user_unreg_get(struct user_unreg __user *ureg, + struct user_unreg *kreg) +{ + u32 size; + long ret; + + ret = get_user(size, &ureg->size); + + if (ret) + return ret; + + if (size > PAGE_SIZE) + return -E2BIG; + + if (size < offsetofend(struct user_unreg, disable_addr)) + return -EINVAL; + + ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size); + + /* Ensure no reserved values, since we don't support any yet */ + if (kreg->__reserved || kreg->__reserved2) + return -EINVAL; + + return ret; +} + +static int user_event_mm_clear_bit(struct user_event_mm *user_mm, + unsigned long uaddr, unsigned char bit) +{ + struct user_event_enabler enabler; + int result; + int attempt = 0; + + memset(&enabler, 0, sizeof(enabler)); + enabler.addr = uaddr; + enabler.values = bit; +retry: + /* Prevents state changes from racing with new enablers */ + mutex_lock(&event_mutex); + + /* Force the bit to be cleared, since no event is attached */ + mmap_read_lock(user_mm->mm); + result = user_event_enabler_write(user_mm, &enabler, false, &attempt); + mmap_read_unlock(user_mm->mm); + + mutex_unlock(&event_mutex); + + if (result) { + /* Attempt to fault-in and retry if it worked */ + if (!user_event_mm_fault_in(user_mm, uaddr, attempt)) + goto retry; + } + + return result; +} + +/* + * Unregisters an enablement address/bit within a task/user mm. + */ +static long user_events_ioctl_unreg(unsigned long uarg) +{ + struct user_unreg __user *ureg = (struct user_unreg __user *)uarg; + struct user_event_mm *mm = current->user_event_mm; + struct user_event_enabler *enabler, *next; + struct user_unreg reg; + long ret; + + ret = user_unreg_get(ureg, ®); + + if (ret) + return ret; + + if (!mm) + return -ENOENT; + + ret = -ENOENT; + + /* + * Flags freeing and faulting are used to indicate if the enabler is in + * use at all. When faulting is set a page-fault is occurring asyncly. + * During async fault if freeing is set, the enabler will be destroyed. + * If no async fault is happening, we can destroy it now since we hold + * the event_mutex during these checks. + */ + mutex_lock(&event_mutex); + + list_for_each_entry_safe(enabler, next, &mm->enablers, mm_enablers_link) { + if (enabler->addr == reg.disable_addr && + ENABLE_BIT(enabler) == reg.disable_bit) { + set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler)); + + if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler))) + user_event_enabler_destroy(enabler); + + /* Removed at least one */ + ret = 0; + } + } + + mutex_unlock(&event_mutex); + + /* Ensure bit is now cleared for user, regardless of event status */ + if (!ret) + ret = user_event_mm_clear_bit(mm, reg.disable_addr, + reg.disable_bit); + + return ret; +} + /* * Handles the ioctl from user mode to register or alter operations. 
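
/*
 * Companion user-space sketch to user_events_ioctl_unreg() above: drop
 * any enabler matching an (address, bit) pair; the kernel also clears the
 * bit in this process regardless of event status.  Assumes the uapi of
 * this series (struct user_unreg, DIAG_IOCSUNREG); variable names are
 * illustrative.
 */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/user_events.h>

static int unregister_my_event(int data_fd, void *enable_addr, uint8_t bit)
{
	struct user_unreg unreg;

	memset(&unreg, 0, sizeof(unreg));	/* reserved fields must be zero */
	unreg.size = sizeof(unreg);
	unreg.disable_bit = bit;
	unreg.disable_addr = (uint64_t)(uintptr_t)enable_addr;

	/* Returns 0 on success, -1 with errno (e.g. ENOENT) on failure. */
	return ioctl(data_fd, DIAG_IOCSUNREG, &unreg);
}
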
*/ @@ -1663,6 +2348,12 @@ static long user_events_ioctl(struct file *file, unsigned int cmd, ret = user_events_ioctl_del(info, uarg); mutex_unlock(&group->reg_mutex); break; + + case DIAG_IOCSUNREG: + mutex_lock(&group->reg_mutex); + ret = user_events_ioctl_unreg(uarg); + mutex_unlock(&group->reg_mutex); + break; } return ret; @@ -1718,45 +2409,13 @@ out: } static const struct file_operations user_data_fops = { - .open = user_events_open, - .write = user_events_write, - .write_iter = user_events_write_iter, + .open = user_events_open, + .write = user_events_write, + .write_iter = user_events_write_iter, .unlocked_ioctl = user_events_ioctl, - .release = user_events_release, + .release = user_events_release, }; -static struct user_event_group *user_status_group(struct file *file) -{ - struct seq_file *m = file->private_data; - - if (!m) - return NULL; - - return m->private; -} - -/* - * Maps the shared page into the user process for checking if event is enabled. - */ -static int user_status_mmap(struct file *file, struct vm_area_struct *vma) -{ - char *pages; - struct user_event_group *group = user_status_group(file); - unsigned long size = vma->vm_end - vma->vm_start; - - if (size != MAX_BYTES) - return -EINVAL; - - if (!group) - return -EINVAL; - - pages = group->register_page_data; - - return remap_pfn_range(vma, vma->vm_start, - virt_to_phys(pages) >> PAGE_SHIFT, - size, vm_get_page_prot(VM_READ)); -} - static void *user_seq_start(struct seq_file *m, loff_t *pos) { if (*pos) @@ -1780,7 +2439,7 @@ static int user_seq_show(struct seq_file *m, void *p) struct user_event_group *group = m->private; struct user_event *user; char status; - int i, active = 0, busy = 0, flags; + int i, active = 0, busy = 0; if (!group) return -EINVAL; @@ -1789,11 +2448,10 @@ static int user_seq_show(struct seq_file *m, void *p) hash_for_each(group->register_table, i, user, node) { status = user->status; - flags = user->flags; - seq_printf(m, "%d:%s", user->index, EVENT_NAME(user)); + seq_printf(m, "%s", EVENT_NAME(user)); - if (flags != 0 || status != 0) + if (status != 0) seq_puts(m, " #"); if (status != 0) { @@ -1816,16 +2474,15 @@ static int user_seq_show(struct seq_file *m, void *p) seq_puts(m, "\n"); seq_printf(m, "Active: %d\n", active); seq_printf(m, "Busy: %d\n", busy); - seq_printf(m, "Max: %ld\n", MAX_EVENTS); return 0; } static const struct seq_operations user_seq_ops = { - .start = user_seq_start, - .next = user_seq_next, - .stop = user_seq_stop, - .show = user_seq_show, + .start = user_seq_start, + .next = user_seq_next, + .stop = user_seq_stop, + .show = user_seq_show, }; static int user_status_open(struct inode *node, struct file *file) @@ -1851,11 +2508,10 @@ static int user_status_open(struct inode *node, struct file *file) } static const struct file_operations user_status_fops = { - .open = user_status_open, - .mmap = user_status_mmap, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, + .open = user_status_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, }; /* @@ -1873,8 +2529,7 @@ static int create_user_tracefs(void) goto err; } - /* mmap with MAP_SHARED requires writable fd */ - emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE, + emmap = tracefs_create_file("user_events_status", TRACE_MODE_READ, NULL, NULL, &user_status_fops); if (!emmap) { @@ -1888,20 +2543,53 @@ err: return -ENODEV; } +static int set_max_user_events_sysctl(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + 
mutex_lock(&event_mutex); + + ret = proc_douintvec(table, write, buffer, lenp, ppos); + + mutex_unlock(&event_mutex); + + return ret; +} + +static struct ctl_table user_event_sysctls[] = { + { + .procname = "user_events_max", + .data = &max_user_events, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = set_max_user_events_sysctl, + }, + {} +}; + static int __init trace_events_user_init(void) { int ret; + fault_cache = KMEM_CACHE(user_event_enabler_fault, 0); + + if (!fault_cache) + return -ENOMEM; + init_group = user_event_group_create(&init_user_ns); - if (!init_group) + if (!init_group) { + kmem_cache_destroy(fault_cache); return -ENOMEM; + } ret = create_user_tracefs(); if (ret) { pr_warn("user_events could not register with tracefs\n"); user_event_group_destroy(init_group); + kmem_cache_destroy(fault_cache); init_group = NULL; return ret; } @@ -1909,6 +2597,8 @@ static int __init trace_events_user_init(void) if (dyn_event_register(&user_event_dops)) pr_warn("user_events could not register with dyn_events\n"); + register_sysctl_init("kernel", user_event_sysctls); + return 0; } diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index efbbec2caff8..e97e3fa5cbed 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1652,6 +1652,8 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer) osnoise_stop_tracing(); notify_new_max_latency(diff); + wake_up_process(tlat->kthread); + return HRTIMER_NORESTART; } } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index bd475a00f96d..15f05faaae44 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -221,8 +221,11 @@ trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len, const char *ret = trace_seq_buffer_ptr(p); const char *fmt = concatenate ? 
"%*phN" : "%*ph"; - for (i = 0; i < buf_len; i += 16) + for (i = 0; i < buf_len; i += 16) { + if (!concatenate && i != 0) + trace_seq_putc(p, ' '); trace_seq_printf(p, fmt, min(buf_len - i, 16), &buf[i]); + } trace_seq_putc(p, 0); return ret; @@ -808,6 +811,176 @@ EXPORT_SYMBOL_GPL(unregister_trace_event); * Standard events */ +static void print_array(struct trace_iterator *iter, void *pos, + struct ftrace_event_field *field) +{ + int offset; + int len; + int i; + + offset = *(int *)pos & 0xffff; + len = *(int *)pos >> 16; + + if (field) + offset += field->offset + sizeof(int); + + if (offset + len > iter->ent_size) { + trace_seq_puts(&iter->seq, "<OVERFLOW>"); + return; + } + + pos = (void *)iter->ent + offset; + + for (i = 0; i < len; i++, pos++) { + if (i) + trace_seq_putc(&iter->seq, ','); + trace_seq_printf(&iter->seq, "%02x", *(unsigned char *)pos); + } +} + +static void print_fields(struct trace_iterator *iter, struct trace_event_call *call, + struct list_head *head) +{ + struct ftrace_event_field *field; + int offset; + int len; + int ret; + void *pos; + + list_for_each_entry(field, head, link) { + trace_seq_printf(&iter->seq, " %s=", field->name); + if (field->offset + field->size > iter->ent_size) { + trace_seq_puts(&iter->seq, "<OVERFLOW>"); + continue; + } + pos = (void *)iter->ent + field->offset; + + switch (field->filter_type) { + case FILTER_COMM: + case FILTER_STATIC_STRING: + trace_seq_printf(&iter->seq, "%.*s", field->size, (char *)pos); + break; + case FILTER_RDYN_STRING: + case FILTER_DYN_STRING: + offset = *(int *)pos & 0xffff; + len = *(int *)pos >> 16; + + if (field->filter_type == FILTER_RDYN_STRING) + offset += field->offset + sizeof(int); + + if (offset + len > iter->ent_size) { + trace_seq_puts(&iter->seq, "<OVERFLOW>"); + break; + } + pos = (void *)iter->ent + offset; + trace_seq_printf(&iter->seq, "%.*s", len, (char *)pos); + break; + case FILTER_PTR_STRING: + if (!iter->fmt_size) + trace_iter_expand_format(iter); + pos = *(void **)pos; + ret = strncpy_from_kernel_nofault(iter->fmt, pos, + iter->fmt_size); + if (ret < 0) + trace_seq_printf(&iter->seq, "(0x%px)", pos); + else + trace_seq_printf(&iter->seq, "(0x%px:%s)", + pos, iter->fmt); + break; + case FILTER_TRACE_FN: + pos = *(void **)pos; + trace_seq_printf(&iter->seq, "%pS", pos); + break; + case FILTER_CPU: + case FILTER_OTHER: + switch (field->size) { + case 1: + if (isprint(*(char *)pos)) { + trace_seq_printf(&iter->seq, "'%c'", + *(unsigned char *)pos); + } + trace_seq_printf(&iter->seq, "(%d)", + *(unsigned char *)pos); + break; + case 2: + trace_seq_printf(&iter->seq, "0x%x (%d)", + *(unsigned short *)pos, + *(unsigned short *)pos); + break; + case 4: + /* dynamic array info is 4 bytes */ + if (strstr(field->type, "__data_loc")) { + print_array(iter, pos, NULL); + break; + } + + if (strstr(field->type, "__rel_loc")) { + print_array(iter, pos, field); + break; + } + + trace_seq_printf(&iter->seq, "0x%x (%d)", + *(unsigned int *)pos, + *(unsigned int *)pos); + break; + case 8: + trace_seq_printf(&iter->seq, "0x%llx (%lld)", + *(unsigned long long *)pos, + *(unsigned long long *)pos); + break; + default: + trace_seq_puts(&iter->seq, "<INVALID-SIZE>"); + break; + } + break; + default: + trace_seq_puts(&iter->seq, "<INVALID-TYPE>"); + } + } + trace_seq_putc(&iter->seq, '\n'); +} + +enum print_line_t print_event_fields(struct trace_iterator *iter, + struct trace_event *event) +{ + struct trace_event_call *call; + struct list_head *head; + + /* ftrace defined events have separate call structures */ + if 
(event->type <= __TRACE_LAST_TYPE) { + bool found = false; + + down_read(&trace_event_sem); + list_for_each_entry(call, &ftrace_events, list) { + if (call->event.type == event->type) { + found = true; + break; + } + /* No need to search all events */ + if (call->event.type > __TRACE_LAST_TYPE) + break; + } + up_read(&trace_event_sem); + if (!found) { + trace_seq_printf(&iter->seq, "UNKNOWN TYPE %d\n", event->type); + goto out; + } + } else { + call = container_of(event, struct trace_event_call, event); + } + head = trace_get_fields(call); + + trace_seq_printf(&iter->seq, "%s:", trace_event_name(call)); + + if (head && !list_empty(head)) + print_fields(iter, call, head); + else + trace_seq_puts(&iter->seq, "No fields found\n"); + + out: + return trace_handle_return(&iter->seq); +} + enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, struct trace_event *event) { diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 4c954636caf0..dca40f1f1da4 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -19,6 +19,8 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, extern void trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset); extern int trace_print_context(struct trace_iterator *iter); extern int trace_print_lat_context(struct trace_iterator *iter); +extern enum print_line_t print_event_fields(struct trace_iterator *iter, + struct trace_event *event); extern void trace_event_read_lock(void); extern void trace_event_read_unlock(void); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index ef8ed3b65d05..6a4ecfb1da43 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -308,7 +308,7 @@ trace_probe_primary_from_call(struct trace_event_call *call) { struct trace_probe_event *tpe = trace_probe_event_from_call(call); - return list_first_entry(&tpe->probes, struct trace_probe, list); + return list_first_entry_or_null(&tpe->probes, struct trace_probe, list); } static inline struct list_head *trace_probe_probe_list(struct trace_probe *tp) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a931d9aaea26..529590499b1f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -848,6 +848,12 @@ trace_selftest_startup_function_graph(struct tracer *trace, } #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + /* + * These tests can take some time to run. Make sure on non PREEMPT + * kernels, we do not trigger the softlockup detector. 
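
/*
 * Standalone sketch of the __data_loc/__rel_loc decode used by
 * print_fields()/print_array() above: the 32-bit descriptor packs the
 * payload offset in its low 16 bits and the payload length in its high
 * 16 bits; for __rel_loc the offset is relative to just past the
 * descriptor field itself.  Function and parameter names are
 * illustrative.
 */
#include <stddef.h>
#include <stdint.h>

struct dyn_loc {
	uint32_t offset;	/* from the start of the record */
	uint32_t len;
};

static struct dyn_loc decode_data_loc(uint32_t desc, size_t field_offset, int is_rel)
{
	struct dyn_loc loc;

	loc.offset = desc & 0xffff;
	loc.len    = desc >> 16;

	if (is_rel)	/* __rel_loc: offset counts from just after the u32 */
		loc.offset += field_offset + sizeof(uint32_t);

	return loc;
}
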
+ */ + cond_resched(); + tracing_reset_online_cpus(&tr->array_buffer); set_graph_array(tr); @@ -869,6 +875,8 @@ trace_selftest_startup_function_graph(struct tracer *trace, if (ret) goto out; + cond_resched(); + ret = register_ftrace_graph(&fgraph_ops); if (ret) { warn_failed_init_tracer(trace, ret); @@ -891,6 +899,8 @@ trace_selftest_startup_function_graph(struct tracer *trace, if (ret) goto out; + cond_resched(); + tracing_start(); if (!ret && !count) { diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index f50398cb790d..019e3a1566cf 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -123,15 +123,6 @@ static struct ctl_table uts_kern_table[] = { {} }; -static struct ctl_table uts_root_table[] = { - { - .procname = "kernel", - .mode = 0555, - .child = uts_kern_table, - }, - {} -}; - #ifdef CONFIG_PROC_SYSCTL /* * Notify userspace about a change in a certain entry of uts_kern_table, @@ -147,7 +138,7 @@ void uts_proc_notify(enum uts_proc proc) static int __init utsname_sysctl_init(void) { - register_sysctl_table(uts_root_table); + register_sysctl("kernel", uts_kern_table); return 0; } diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c index b7cbd66f889e..f80d5c51ae67 100644 --- a/kernel/vhost_task.c +++ b/kernel/vhost_task.c @@ -12,58 +12,88 @@ enum vhost_task_flags { VHOST_TASK_FLAGS_STOP, }; +struct vhost_task { + bool (*fn)(void *data); + void *data; + struct completion exited; + unsigned long flags; + struct task_struct *task; +}; + static int vhost_task_fn(void *data) { struct vhost_task *vtsk = data; - int ret; + bool dead = false; + + for (;;) { + bool did_work; + + /* mb paired w/ vhost_task_stop */ + if (test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags)) + break; + + if (!dead && signal_pending(current)) { + struct ksignal ksig; + /* + * Calling get_signal will block in SIGSTOP, + * or clear fatal_signal_pending, but remember + * what was set. + * + * This thread won't actually exit until all + * of the file descriptors are closed, and + * the release function is called. + */ + dead = get_signal(&ksig); + if (dead) + clear_thread_flag(TIF_SIGPENDING); + } + + did_work = vtsk->fn(vtsk->data); + if (!did_work) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } + } - ret = vtsk->fn(vtsk->data); complete(&vtsk->exited); - do_exit(ret); + do_exit(0); +} + +/** + * vhost_task_wake - wakeup the vhost_task + * @vtsk: vhost_task to wake + * + * wake up the vhost_task worker thread + */ +void vhost_task_wake(struct vhost_task *vtsk) +{ + wake_up_process(vtsk->task); } +EXPORT_SYMBOL_GPL(vhost_task_wake); /** * vhost_task_stop - stop a vhost_task * @vtsk: vhost_task to stop * - * Callers must call vhost_task_should_stop and return from their worker - * function when it returns true; + * vhost_task_fn ensures the worker thread exits after + * VHOST_TASK_FLAGS_SOP becomes true. */ void vhost_task_stop(struct vhost_task *vtsk) { - pid_t pid = vtsk->task->pid; - set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags); - wake_up_process(vtsk->task); + vhost_task_wake(vtsk); /* * Make sure vhost_task_fn is no longer accessing the vhost_task before - * freeing it below. If userspace crashed or exited without closing, - * then the vhost_task->task could already be marked dead so - * kernel_wait will return early. + * freeing it below. */ wait_for_completion(&vtsk->exited); - /* - * If we are just closing/removing a device and the parent process is - * not exiting then reap the task. 
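
/*
 * Sketch of the register_sysctl() conversion applied to utsname above:
 * instead of a one-entry root table whose .child points at the real
 * table, the directory name is passed directly.  The table below is a
 * stand-in, not the real uts_kern_table.
 */
#include <linux/sysctl.h>

static int example_value;

static struct ctl_table example_table[] = {
	{
		.procname	= "example_value",
		.data		= &example_value,
		.maxlen		= sizeof(example_value),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{}
};

static int __init example_sysctl_init(void)
{
	/* Creates /proc/sys/kernel/example_value */
	register_sysctl("kernel", example_table);
	return 0;
}
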
- */ - kernel_wait4(pid, NULL, __WCLONE, NULL); kfree(vtsk); } EXPORT_SYMBOL_GPL(vhost_task_stop); /** - * vhost_task_should_stop - should the vhost task return from the work function - * @vtsk: vhost_task to stop - */ -bool vhost_task_should_stop(struct vhost_task *vtsk) -{ - return test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags); -} -EXPORT_SYMBOL_GPL(vhost_task_should_stop); - -/** - * vhost_task_create - create a copy of a process to be used by the kernel - * @fn: thread stack + * vhost_task_create - create a copy of a task to be used by the kernel + * @fn: vhost worker function * @arg: data to be passed to fn * @name: the thread's name * @@ -71,17 +101,17 @@ EXPORT_SYMBOL_GPL(vhost_task_should_stop); * failure. The returned task is inactive, and the caller must fire it up * through vhost_task_start(). */ -struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg, +struct vhost_task *vhost_task_create(bool (*fn)(void *), void *arg, const char *name) { struct kernel_clone_args args = { - .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM, + .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM | + CLONE_THREAD | CLONE_SIGHAND, .exit_signal = 0, .fn = vhost_task_fn, .name = name, .user_worker = 1, .no_files = 1, - .ignore_signals = 1, }; struct vhost_task *vtsk; struct task_struct *tsk; diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index f10f403104e7..e91cb4c2833f 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -29,7 +29,6 @@ MODULE_DESCRIPTION("Watch queue"); MODULE_AUTHOR("Red Hat, Inc."); -MODULE_LICENSE("GPL"); #define WATCH_QUEUE_NOTE_SIZE 128 #define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b8b541caed48..4666a1a92a31 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -49,6 +49,7 @@ #include <linux/moduleparam.h> #include <linux/uaccess.h> #include <linux/sched/isolation.h> +#include <linux/sched/debug.h> #include <linux/nmi.h> #include <linux/kvm_para.h> @@ -141,6 +142,8 @@ enum { * WR: wq->mutex protected for writes. RCU protected for reads. * * MD: wq_mayday_lock protected. + * + * WD: Used internally by the watchdog. 
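
/*
 * Hedged kernel-style sketch of a caller of the reworked vhost_task API
 * above: the callback returns true when it processed work and false to
 * let vhost_task_fn() sleep until vhost_task_wake().  Assumes the updated
 * prototypes in <linux/sched/vhost_task.h>; my_dev and its helpers are
 * illustrative stand-ins, not code from drivers/vhost.
 */
#include <linux/errno.h>
#include <linux/sched/vhost_task.h>
#include <linux/types.h>

struct my_dev {
	struct vhost_task *vtsk;
	/* ... queues ... */
};

static bool my_dev_process_pending(struct my_dev *dev)
{
	/* Stub standing in for the driver's real queue processing. */
	return false;
}

static bool my_dev_work_fn(void *data)
{
	struct my_dev *dev = data;

	/* Process at most one batch; returning false lets the task sleep. */
	return my_dev_process_pending(dev);
}

static int my_dev_start(struct my_dev *dev)
{
	dev->vtsk = vhost_task_create(my_dev_work_fn, dev, "vhost-mydev");
	if (!dev->vtsk)
		return -ENOMEM;

	vhost_task_start(dev->vtsk);	/* returned task is inactive until started */
	return 0;
}

static void my_dev_kick(struct my_dev *dev)
{
	vhost_task_wake(dev->vtsk);	/* new work queued: wake the worker */
}

static void my_dev_stop(struct my_dev *dev)
{
	vhost_task_stop(dev->vtsk);	/* waits for vhost_task_fn() to exit */
}
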
*/ /* struct worker is defined in workqueue_internal.h */ @@ -153,6 +156,7 @@ struct worker_pool { unsigned int flags; /* X: flags */ unsigned long watchdog_ts; /* L: watchdog timestamp */ + bool cpu_stall; /* WD: stalled cpu bound pool */ /* * The counter is incremented in a process context on the associated CPU @@ -1392,15 +1396,13 @@ static bool is_chained_work(struct workqueue_struct *wq) */ static int wq_select_unbound_cpu(int cpu) { - static bool printed_dbg_warning; int new_cpu; if (likely(!wq_debug_force_rr_cpu)) { if (cpumask_test_cpu(cpu, wq_unbound_cpumask)) return cpu; - } else if (!printed_dbg_warning) { - pr_warn("workqueue: round-robin CPU selection forced, expect performance impact\n"); - printed_dbg_warning = true; + } else { + pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n"); } if (cpumask_empty(wq_unbound_cpumask)) @@ -1938,12 +1940,17 @@ static struct worker *create_worker(struct worker_pool *pool) /* ID is needed to determine kthread name */ id = ida_alloc(&pool->worker_ida, GFP_KERNEL); - if (id < 0) + if (id < 0) { + pr_err_once("workqueue: Failed to allocate a worker ID: %pe\n", + ERR_PTR(id)); return NULL; + } worker = alloc_worker(pool->node); - if (!worker) + if (!worker) { + pr_err_once("workqueue: Failed to allocate a worker\n"); goto fail; + } worker->id = id; @@ -1955,8 +1962,16 @@ static struct worker *create_worker(struct worker_pool *pool) worker->task = kthread_create_on_node(worker_thread, worker, pool->node, "kworker/%s", id_buf); - if (IS_ERR(worker->task)) + if (IS_ERR(worker->task)) { + if (PTR_ERR(worker->task) == -EINTR) { + pr_err("workqueue: Interrupted when creating a worker thread \"kworker/%s\"\n", + id_buf); + } else { + pr_err_once("workqueue: Failed to create a worker thread: %pe", + worker->task); + } goto fail; + } set_user_nice(worker->task, pool->attrs->nice); kthread_bind_mask(worker->task, pool->attrs->cpumask); @@ -4380,13 +4395,18 @@ static int init_rescuer(struct workqueue_struct *wq) return 0; rescuer = alloc_worker(NUMA_NO_NODE); - if (!rescuer) + if (!rescuer) { + pr_err("workqueue: Failed to allocate a rescuer for wq \"%s\"\n", + wq->name); return -ENOMEM; + } rescuer->rescue_wq = wq; rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); if (IS_ERR(rescuer->task)) { ret = PTR_ERR(rescuer->task); + pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe", + wq->name, ERR_PTR(ret)); kfree(rescuer); return ret; } @@ -5002,10 +5022,16 @@ static void show_one_worker_pool(struct worker_pool *pool) struct worker *worker; bool first = true; unsigned long flags; + unsigned long hung = 0; raw_spin_lock_irqsave(&pool->lock, flags); if (pool->nr_workers == pool->nr_idle) goto next_pool; + + /* How long the first pending work is waiting for a worker. 
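
/*
 * The error paths added to create_worker()/init_rescuer() above use the
 * %pe printk format, which prints a symbolic errno (e.g. -ENOMEM) from an
 * ERR_PTR-encoded value.  A minimal illustration; the message text and
 * function name are made up.
 */
#include <linux/err.h>
#include <linux/printk.h>

static void report_create_failure(int err)
{
	/* With err == -ENOMEM this prints "example: creation failed: -ENOMEM" */
	pr_err("example: creation failed: %pe\n", ERR_PTR(err));
}
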
*/ + if (!list_empty(&pool->worklist)) + hung = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000; + /* * Defer printing to avoid deadlocks in console drivers that * queue work while holding locks also taken in their write @@ -5014,9 +5040,7 @@ static void show_one_worker_pool(struct worker_pool *pool) printk_deferred_enter(); pr_info("pool %d:", pool->id); pr_cont_pool_info(pool); - pr_cont(" hung=%us workers=%d", - jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000, - pool->nr_workers); + pr_cont(" hung=%lus workers=%d", hung, pool->nr_workers); if (pool->manager) pr_cont(" manager: %d", task_pid_nr(pool->manager->task)); @@ -5041,8 +5065,7 @@ next_pool: /** * show_all_workqueues - dump workqueue state * - * Called from a sysrq handler or try_to_freeze_tasks() and prints out - * all busy workqueues and pools. + * Called from a sysrq handler and prints out all busy workqueues and pools. */ void show_all_workqueues(void) { @@ -5063,6 +5086,29 @@ void show_all_workqueues(void) rcu_read_unlock(); } +/** + * show_freezable_workqueues - dump freezable workqueue state + * + * Called from try_to_freeze_tasks() and prints out all freezable workqueues + * still busy. + */ +void show_freezable_workqueues(void) +{ + struct workqueue_struct *wq; + + rcu_read_lock(); + + pr_info("Showing freezable workqueues that are still busy:\n"); + + list_for_each_entry_rcu(wq, &workqueues, list) { + if (!(wq->flags & WQ_FREEZABLE)) + continue; + show_one_workqueue(wq); + } + + rcu_read_unlock(); +} + /* used to show worker information through /proc/PID/{comm,stat,status} */ void wq_worker_comm(char *buf, size_t size, struct task_struct *task) { @@ -5826,13 +5872,19 @@ static struct device_attribute wq_sysfs_cpumask_attr = static int __init wq_sysfs_init(void) { + struct device *dev_root; int err; err = subsys_virtual_register(&wq_subsys, NULL); if (err) return err; - return device_create_file(wq_subsys.dev_root, &wq_sysfs_cpumask_attr); + dev_root = bus_get_dev_root(&wq_subsys); + if (dev_root) { + err = device_create_file(dev_root, &wq_sysfs_cpumask_attr); + put_device(dev_root); + } + return err; } core_initcall(wq_sysfs_init); @@ -5956,6 +6008,57 @@ static struct timer_list wq_watchdog_timer; static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; +/* + * Show workers that might prevent the processing of pending work items. + * The only candidates are CPU-bound workers in the running state. + * Pending work items should be handled by another idle worker + * in all other situations. + */ +static void show_cpu_pool_hog(struct worker_pool *pool) +{ + struct worker *worker; + unsigned long flags; + int bkt; + + raw_spin_lock_irqsave(&pool->lock, flags); + + hash_for_each(pool->busy_hash, bkt, worker, hentry) { + if (task_is_running(worker->task)) { + /* + * Defer printing to avoid deadlocks in console + * drivers that queue work while holding locks + * also taken in their write paths. 
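
/*
 * Reduced sketch of the bus_get_dev_root() pattern used by wq_sysfs_init()
 * above: the subsystem's root device comes back with a reference held and
 * must be released with put_device() whether or not the attribute was
 * created.  The subsystem and attribute names are illustrative.
 */
#include <linux/device.h>
#include <linux/device/bus.h>

static int example_subsys_add_attr(struct bus_type *subsys,
				   struct device_attribute *attr)
{
	struct device *dev_root;
	int err = 0;

	dev_root = bus_get_dev_root(subsys);
	if (dev_root) {
		err = device_create_file(dev_root, attr);
		put_device(dev_root);	/* drop the reference taken above */
	}

	return err;
}
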
+ */ + printk_deferred_enter(); + + pr_info("pool %d:\n", pool->id); + sched_show_task(worker->task); + + printk_deferred_exit(); + } + } + + raw_spin_unlock_irqrestore(&pool->lock, flags); +} + +static void show_cpu_pools_hogs(void) +{ + struct worker_pool *pool; + int pi; + + pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n"); + + rcu_read_lock(); + + for_each_pool(pool, pi) { + if (pool->cpu_stall) + show_cpu_pool_hog(pool); + + } + + rcu_read_unlock(); +} + static void wq_watchdog_reset_touched(void) { int cpu; @@ -5969,6 +6072,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) { unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; bool lockup_detected = false; + bool cpu_pool_stall = false; unsigned long now = jiffies; struct worker_pool *pool; int pi; @@ -5981,6 +6085,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) for_each_pool(pool, pi) { unsigned long pool_ts, touched, ts; + pool->cpu_stall = false; if (list_empty(&pool->worklist)) continue; @@ -6005,11 +6110,17 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) /* did we stall? */ if (time_after(now, ts + thresh)) { lockup_detected = true; + if (pool->cpu >= 0) { + pool->cpu_stall = true; + cpu_pool_stall = true; + } pr_emerg("BUG: workqueue lockup - pool"); pr_cont_pool_info(pool); pr_cont(" stuck for %us!\n", jiffies_to_msecs(now - pool_ts) / 1000); } + + } rcu_read_unlock(); @@ -6017,6 +6128,9 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) if (lockup_detected) show_all_workqueues(); + if (cpu_pool_stall) + show_cpu_pools_hogs(); + wq_watchdog_reset_touched(); mod_timer(&wq_watchdog_timer, jiffies + thresh); } |
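
/*
 * Distilled form of the stall check in wq_watchdog_timer_fn() above: a
 * pool only counts as stalled if neither its own watchdog timestamp nor
 * the most recent "touched" timestamp has advanced within the threshold.
 * Names are illustrative; the real function also handles the per-CPU
 * touched timestamps for bound pools.
 */
#include <linux/jiffies.h>
#include <linux/types.h>

static bool pool_looks_stalled(unsigned long pool_ts, unsigned long touched,
			       unsigned long thresh_jiffies)
{
	unsigned long ts = time_after(pool_ts, touched) ? pool_ts : touched;

	return time_after(jiffies, ts + thresh_jiffies);
}
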
