From 21b195c05cf6a6cc49777d6992772bcf01502186 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 23 Dec 2021 20:31:37 +0800 Subject: workqueue: Remove the mb() pair between wq_worker_sleeping() and insert_work() In wq_worker_sleeping(), the access to worklist is protected by the pool->lock, so the memory barrier is unneeded. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 33f1106b4f99..29b070106f34 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -918,10 +918,6 @@ void wq_worker_sleeping(struct task_struct *task) } /* - * The counterpart of the following dec_and_test, implied mb, - * worklist not empty test sequence is in insert_work(). - * Please read comment there. - * * NOT_RUNNING is clear. This means that we're bound to and * running on the local cpu w/ rq lock held and preemption * disabled, which in turn means that none else could be @@ -1372,13 +1368,6 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, list_add_tail(&work->entry, head); get_pwq(pwq); - /* - * Ensure either wq_worker_sleeping() sees the above - * list_add_tail() or we see zero nr_running to avoid workers lying - * around lazily while there are works to be processed. - */ - smp_mb(); - if (__need_more_worker(pool)) wake_up_worker(pool); } -- cgit v1.2.3 From 2c1f1a9180bfacbc3c8e5b10075640cc810cf9c0 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 23 Dec 2021 20:31:38 +0800 Subject: workqueue: Change the comments of the synchronization about the idle_list The access to idle_list in wq_worker_sleeping() is changed to be protected by pool->lock, so the comments above idle_list can be changed to "L:" which is the meaning of "access with pool->lock held". 
And the outdated comments in wq_worker_sleeping() are removed since the function is not called with rq lock held any more, idle_list is dereferenced with pool lock now. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 29b070106f34..b3207722671c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -162,7 +162,7 @@ struct worker_pool { int nr_workers; /* L: total number of workers */ int nr_idle; /* L: currently idle workers */ - struct list_head idle_list; /* X: list of idle workers */ + struct list_head idle_list; /* L: list of idle workers */ struct timer_list idle_timer; /* L: worker idle timeout */ struct timer_list mayday_timer; /* L: SOS timer for workers */ @@ -826,7 +826,7 @@ static bool too_many_workers(struct worker_pool *pool) * Wake up functions. */ -/* Return the first idle worker. Safe with preemption disabled */ +/* Return the first idle worker. Called with pool->lock held. */ static struct worker *first_idle_worker(struct worker_pool *pool) { if (unlikely(list_empty(&pool->idle_list))) @@ -917,13 +917,6 @@ void wq_worker_sleeping(struct task_struct *task) return; } - /* - * NOT_RUNNING is clear. This means that we're bound to and - * running on the local cpu w/ rq lock held and preemption - * disabled, which in turn means that none else could be - * manipulating idle_list, so dereferencing idle_list without pool - * lock is safe. - */ if (atomic_dec_and_test(&pool->nr_running) && !list_empty(&pool->worklist)) { next = first_idle_worker(pool); -- cgit v1.2.3 From cc5bff38463e0894dd596befa99f9d6860e15f5e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 23 Dec 2021 20:31:39 +0800 Subject: workqueue: Use wake_up_worker() in wq_worker_sleeping() instead of open code The wakeup code in wq_worker_sleeping() is the same as wake_up_worker(). 
Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b3207722671c..69cbe9e62bf1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -887,7 +887,7 @@ void wq_worker_running(struct task_struct *task) */ void wq_worker_sleeping(struct task_struct *task) { - struct worker *next, *worker = kthread_data(task); + struct worker *worker = kthread_data(task); struct worker_pool *pool; /* @@ -918,11 +918,8 @@ void wq_worker_sleeping(struct task_struct *task) } if (atomic_dec_and_test(&pool->nr_running) && - !list_empty(&pool->worklist)) { - next = first_idle_worker(pool); - if (next) - wake_up_process(next->task); - } + !list_empty(&pool->worklist)) + wake_up_worker(pool); raw_spin_unlock_irq(&pool->lock); } -- cgit v1.2.3 From bc35f7ef96284b8c963991357a9278a6beafca54 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 23 Dec 2021 20:31:40 +0800 Subject: workqueue: Convert the type of pool->nr_running to int It is only modified in associated CPU, so it doesn't need to be atomic. tj: Comment updated. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 69cbe9e62bf1..835d25e65bb2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -154,8 +154,13 @@ struct worker_pool { unsigned long watchdog_ts; /* L: watchdog timestamp */ - /* The current concurrency level. */ - atomic_t nr_running; + /* + * The counter is incremented in a process context on the associated CPU + * w/ preemption disabled, and decremented or reset in the same context + * but w/ pool->lock held. The readers grab pool->lock and are + * guaranteed to see if the counter reached zero. 
+ */ + int nr_running; struct list_head worklist; /* L: list of pending works */ @@ -777,7 +782,7 @@ static bool work_is_canceling(struct work_struct *work) static bool __need_more_worker(struct worker_pool *pool) { - return !atomic_read(&pool->nr_running); + return !pool->nr_running; } /* @@ -802,8 +807,7 @@ static bool may_start_working(struct worker_pool *pool) /* Do I need to keep working? Called from currently running workers. */ static bool keep_working(struct worker_pool *pool) { - return !list_empty(&pool->worklist) && - atomic_read(&pool->nr_running) <= 1; + return !list_empty(&pool->worklist) && (pool->nr_running <= 1); } /* Do we need a new worker? Called from manager. */ @@ -873,7 +877,7 @@ void wq_worker_running(struct task_struct *task) */ preempt_disable(); if (!(worker->flags & WORKER_NOT_RUNNING)) - atomic_inc(&worker->pool->nr_running); + worker->pool->nr_running++; preempt_enable(); worker->sleeping = 0; } @@ -917,8 +921,8 @@ void wq_worker_sleeping(struct task_struct *task) return; } - if (atomic_dec_and_test(&pool->nr_running) && - !list_empty(&pool->worklist)) + pool->nr_running--; + if (need_more_worker(pool)) wake_up_worker(pool); raw_spin_unlock_irq(&pool->lock); } @@ -973,7 +977,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags) /* If transitioning into NOT_RUNNING, adjust nr_running. */ if ((flags & WORKER_NOT_RUNNING) && !(worker->flags & WORKER_NOT_RUNNING)) { - atomic_dec(&pool->nr_running); + pool->nr_running--; } worker->flags |= flags; @@ -1005,7 +1009,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) */ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) if (!(worker->flags & WORKER_NOT_RUNNING)) - atomic_inc(&pool->nr_running); + pool->nr_running++; } /** @@ -1806,8 +1810,7 @@ static void worker_enter_idle(struct worker *worker) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); /* Sanity check nr_running. 
*/ - WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && - atomic_read(&pool->nr_running)); + WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running); } /** @@ -4985,7 +4988,7 @@ static void unbind_workers(int cpu) * an unbound (in terms of concurrency management) pool which * are served by workers tied to the pool. */ - atomic_set(&pool->nr_running, 0); + pool->nr_running = 0; /* * With concurrency management just turned off, a busy -- cgit v1.2.3 From 4148be7de0a3316edd1af45609d354cac0e6a021 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 8 Jan 2022 00:38:16 +0000 Subject: cgroup: rstat: use same convention to assign cgroup_base_stat In function cgroup_base_stat_flush(), we update cgroup_base_stat by getting rstatc->bstat and adjust delta to related fields. There are two conventions to assign cgroup_base_stat in this function: * rstat2 = rstat1 * rstat2.cputime = rstat1.cputime The second convention may make audience think just field "cputime" is updated, while cputime is the only field in cgroup_base_stat. Let's use the same convention to eliminate this confusion. 
Signed-off-by: Wei Yang Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 9d331ba44870..0b32fa62e93c 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -325,7 +325,7 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) /* fetch the current per-cpu values */ do { seq = __u64_stats_fetch_begin(&rstatc->bsync); - cur.cputime = rstatc->bstat.cputime; + cur = rstatc->bstat; } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); /* propagate percpu delta to global */ -- cgit v1.2.3 From 95b99f353cf3d9d753796975f2093b13f75e0fbd Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 8 Jan 2022 00:38:17 +0000 Subject: cgroup: rstat: retrieve current bstat to delta directly Instead of retrieve current bstat to cur and copy it to delta, let's use delta directly. This saves one copy operation and has the same code convention as propagating delta to parent. 
Signed-off-by: Wei Yang Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 0b32fa62e93c..29ea74f0eab3 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -315,7 +315,7 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); struct cgroup *parent = cgroup_parent(cgrp); - struct cgroup_base_stat cur, delta; + struct cgroup_base_stat delta; unsigned seq; /* Root-level stats are sourced from system-wide CPU stats */ @@ -325,11 +325,10 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) /* fetch the current per-cpu values */ do { seq = __u64_stats_fetch_begin(&rstatc->bsync); - cur = rstatc->bstat; + delta = rstatc->bstat; } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); /* propagate percpu delta to global */ - delta = cur; cgroup_base_stat_sub(&delta, &rstatc->last_bstat); cgroup_base_stat_add(&cgrp->bstat, &delta); cgroup_base_stat_add(&rstatc->last_bstat, &delta); -- cgit v1.2.3 From ffacbd11e2580a93546733d2e4e320c181f76844 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Sat, 8 Jan 2022 14:38:12 +0800 Subject: cgroup: Fix cgroup_can_fork() and cgroup_post_fork() kernel-doc comment Add the description of @kargs in cgroup_can_fork() and cgroup_post_fork() kernel-doc comment to remove warnings found by running scripts/kernel-doc, which is caused by using 'make W=1'. 
kernel/cgroup/cgroup.c:6235: warning: Function parameter or member 'kargs' not described in 'cgroup_can_fork' kernel/cgroup/cgroup.c:6296: warning: Function parameter or member 'kargs' not described in 'cgroup_post_fork' Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b31e1465868a..37c49e1a672f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6224,6 +6224,7 @@ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) /** * cgroup_can_fork - called on a new task before the process is exposed * @child: the child process + * @kargs: the arguments passed to create the child process * * This prepares a new css_set for the child process which the child will * be attached to in cgroup_post_fork(). @@ -6286,6 +6287,7 @@ void cgroup_cancel_fork(struct task_struct *child, /** * cgroup_post_fork - finalize cgroup setup for the child process * @child: the child process + * @kargs: the arguments passed to create the child process * * Attach the child process to its css_set calling the subsystem fork() * callbacks. -- cgit v1.2.3 From d068eebbd4822b6c14a7ea375dfe53ca5c69c776 Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Fri, 17 Dec 2021 16:48:54 +0100 Subject: cgroup/cpuset: Make child cpusets restrict parents on v1 hierarchy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 1f1562fcd04a ("cgroup/cpuset: Don't let child cpusets restrict parent in default hierarchy") intended to relax the check only on the default hierarchy (or v2 mode) but it dropped the check in v1 too. This patch returns and separates the legacy-only validations so that they can be considered only in the v1 mode, which should enforce the old constraints for the sake of compatibility. 
Fixes: 1f1562fcd04a ("cgroup/cpuset: Don't let child cpusets restrict parent in default hierarchy") Suggested-by: Waiman Long Signed-off-by: Michal Koutný Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 52 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index dc653ab26e50..bb3531e7fda7 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -590,6 +590,35 @@ static inline void free_cpuset(struct cpuset *cs) kfree(cs); } +/* + * validate_change_legacy() - Validate conditions specific to legacy (v1) + * behavior. + */ +static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial) +{ + struct cgroup_subsys_state *css; + struct cpuset *c, *par; + int ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* Each of our child cpusets must be a subset of us */ + ret = -EBUSY; + cpuset_for_each_child(c, css, cur) + if (!is_cpuset_subset(c, trial)) + goto out; + + /* On legacy hierarchy, we must be a subset of our parent cpuset. */ + ret = -EACCES; + par = parent_cs(cur); + if (par && !is_cpuset_subset(trial, par)) + goto out; + + ret = 0; +out: + return ret; +} + /* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. @@ -614,20 +643,21 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) { struct cgroup_subsys_state *css; struct cpuset *c, *par; - int ret; - - /* The checks don't apply to root cpuset */ - if (cur == &top_cpuset) - return 0; + int ret = 0; rcu_read_lock(); - par = parent_cs(cur); - /* On legacy hierarchy, we must be a subset of our parent cpuset. 
*/ - ret = -EACCES; - if (!is_in_v2_mode() && !is_cpuset_subset(trial, par)) + if (!is_in_v2_mode()) + ret = validate_change_legacy(cur, trial); + if (ret) + goto out; + + /* Remaining checks don't apply to root cpuset */ + if (cur == &top_cpuset) goto out; + par = parent_cs(cur); + /* * If either I or some sibling (!= me) is exclusive, we can't * overlap @@ -1175,9 +1205,7 @@ enum subparts_cmd { * * Because of the implicit cpu exclusive nature of a partition root, * cpumask changes that violates the cpu exclusivity rule will not be - * permitted when checked by validate_change(). The validate_change() - * function will also prevent any changes to the cpu list if it is not - * a superset of children's cpu lists. + * permitted when checked by validate_change(). */ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, struct cpumask *newmask, -- cgit v1.2.3 From 18688de203b47e5d8d9d0953385bf30b5949324f Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 14 Jan 2022 22:09:44 +0530 Subject: bpf: Fix UAF due to race between btf_try_get_module and load_module While working on code to populate kfunc BTF ID sets for module BTF from its initcall, I noticed that by the time the initcall is invoked, the module BTF can already be seen by userspace (and the BPF verifier). The existing btf_try_get_module calls try_module_get which only fails if mod->state == MODULE_STATE_GOING, i.e. it can increment module reference when module initcall is happening in parallel. Currently, BTF parsing happens from MODULE_STATE_COMING notifier callback. At this point, the module initcalls have not been invoked. The notifier callback parses and prepares the module BTF, allocates an ID, which publishes it to userspace, and then adds it to the btf_modules list allowing the kernel to invoke btf_try_get_module for the BTF. However, at this point, the module has not been fully initialized (i.e. its initcalls have not finished). 
The code in module.c can still fail and free the module, without caring for other users. However, nothing stops btf_try_get_module from succeeding between the state transition from MODULE_STATE_COMING to MODULE_STATE_LIVE. This leads to a use-after-free issue when BPF program loads successfully in the state transition, load_module's do_init_module call fails and frees the module, and BPF program fd on close calls module_put for the freed module. Future patch has test case to verify we don't regress in this area in future. There are multiple points after prepare_coming_module (in load_module) where failure can occur and module loading can return error. We illustrate and test for the race using the last point where it can practically occur (in module __init function). An illustration of the race: CPU 0 CPU 1 load_module notifier_call(MODULE_STATE_COMING) btf_parse_module btf_alloc_id // Published to userspace list_add(&btf_mod->list, btf_modules) mod->init(...) ... ^ bpf_check | check_pseudo_btf_id | btf_try_get_module | returns true | ... ... | module __init in progress return prog_fd | ... ... V if (ret < 0) free_module(mod) ... close(prog_fd) ... bpf_prog_free_deferred module_put(used_btf.mod) // use-after-free We fix this issue by setting a flag BTF_MODULE_F_LIVE, from the notifier callback when MODULE_STATE_LIVE state is reached for the module, so that we return NULL from btf_try_get_module for modules that are not fully formed. Since try_module_get already checks that module is not in MODULE_STATE_GOING state, and that is the only transition a live module can make before being removed from btf_modules list, this is enough to close the race and prevent the bug. A later selftest patch crafts the race condition artificially to verify that it has been fixed, and that verifier fails to load program (with ENXIO). Lastly, a couple of comments: 1. 
Even if this race didn't exist, it seems more appropriate to only access resources (ksyms and kfuncs) of a fully formed module which has been initialized completely. 2. This patch was born out of need for synchronization against module initcall for the next patch, so it is needed for correctness even without the aforementioned race condition. The BTF resources initialized by module initcall are set up once and then only looked up, so just waiting until the initcall has finished ensures correct behavior. Fixes: 541c3bad8dc5 ("bpf: Support BPF ksym variables in kernel modules") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220114163953.1455836-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 33bb8ae4a804..f25bca59909d 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6200,12 +6200,17 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id) return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; } +enum { + BTF_MODULE_F_LIVE = (1 << 0), +}; + #ifdef CONFIG_DEBUG_INFO_BTF_MODULES struct btf_module { struct list_head list; struct module *module; struct btf *btf; struct bin_attribute *sysfs_attr; + int flags; }; static LIST_HEAD(btf_modules); @@ -6233,7 +6238,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, int err = 0; if (mod->btf_data_size == 0 || - (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)) + (op != MODULE_STATE_COMING && op != MODULE_STATE_LIVE && + op != MODULE_STATE_GOING)) goto out; switch (op) { @@ -6291,6 +6297,17 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, btf_mod->sysfs_attr = attr; } + break; + case MODULE_STATE_LIVE: + mutex_lock(&btf_module_mutex); + list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { + if (btf_mod->module != 
module) + continue; + + btf_mod->flags |= BTF_MODULE_F_LIVE; + break; + } + mutex_unlock(&btf_module_mutex); break; case MODULE_STATE_GOING: mutex_lock(&btf_module_mutex); @@ -6338,7 +6355,12 @@ struct module *btf_try_get_module(const struct btf *btf) if (btf_mod->btf != btf) continue; - if (try_module_get(btf_mod->module)) + /* We must only consider module whose __init routine has + * finished, hence we must check for BTF_MODULE_F_LIVE flag, + * which is set from the notifier callback for + * MODULE_STATE_LIVE. + */ + if ((btf_mod->flags & BTF_MODULE_F_LIVE) && try_module_get(btf_mod->module)) res = btf_mod->module; break; -- cgit v1.2.3 From dee872e124e8d5de22b68c58f6f6c3f5e8889160 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 14 Jan 2022 22:09:45 +0530 Subject: bpf: Populate kfunc BTF ID sets in struct btf This patch prepares the kernel to support putting all kinds of kfunc BTF ID sets in the struct btf itself. The various kernel subsystems will make register_btf_kfunc_id_set call in the initcalls (for built-in code and modules). The 'hook' is one of the many program types, e.g. XDP and TC/SCHED_CLS, STRUCT_OPS, and 'types' are check (allowed or not), acquire, release, and ret_null (with PTR_TO_BTF_ID_OR_NULL return type). A maximum of BTF_KFUNC_SET_MAX_CNT (32) kfunc BTF IDs are permitted in a set of certain hook and type for vmlinux sets, since they are allocated on demand, and otherwise set as NULL. Module sets can only be registered once per hook and type, hence they are directly assigned. A new btf_kfunc_id_set_contains function is exposed for use in verifier, this new method is faster than the existing list searching method, and is also automatic. It also lets other code not care whether the set is unallocated or not. Note that module code can only do single register_btf_kfunc_id_set call per hook. 
This is why sorting is only done for in-kernel vmlinux sets, because there might be multiple sets for the same hook and type that must be concatenated, hence sorting them is required to ensure bsearch in btf_id_set_contains continues to work correctly. Next commit will update the kernel users to make use of this infrastructure. Finally, add __maybe_unused annotation for BTF ID macros for the !CONFIG_DEBUG_INFO_BTF case, so that they don't produce warnings during build time. The previous patch is also needed to provide synchronization against initialization for module BTF's kfunc_set_tab introduced here, as described below: The kfunc_set_tab pointer in struct btf is write-once (if we consider the registration phase (comprised of multiple register_btf_kfunc_id_set calls) as a single operation). In this sense, once it has been fully prepared, it isn't modified, only used for lookup (from the verifier context). For btf_vmlinux, it is initialized fully during the do_initcalls phase, which happens fairly early in the boot process, before any processes are present. This also eliminates the possibility of bpf_check being called at that point, thus relieving us of ensuring any synchronization between the registration and lookup function (btf_kfunc_id_set_contains). However, the case for module BTF is a bit tricky. The BTF is parsed, prepared, and published from the MODULE_STATE_COMING notifier callback. After this, the module initcalls are invoked, where our registration function will be called to populate the kfunc_set_tab for module BTF. At this point, BTF may be available to userspace while its corresponding module is still initializing. A BTF fd can then be passed to verifier using bpf syscall (e.g. for kfunc call insn). Hence, there is a race window where verifier may concurrently try to lookup the kfunc_set_tab. To prevent this race, we must ensure the operations are serialized, or waiting for the __init functions to complete. 
In the earlier registration API, this race was alleviated as verifier bpf_check_mod_kfunc_call didn't find the kfunc BTF ID until it was added by the registration function (called usually at the end of module __init function after all module resources have been initialized). If the verifier made the check_kfunc_call before kfunc BTF ID was added to the list, it would fail verification (saying call isn't allowed). The access to list was protected using a mutex. Now, it would still fail verification, but for a different reason (returning ENXIO due to the failed btf_try_get_module call in add_kfunc_call), because if the __init call is in progress the module will be in the middle of MODULE_STATE_COMING -> MODULE_STATE_LIVE transition, and the BTF_MODULE_F_LIVE flag for btf_module instance will not be set, so the btf_try_get_module call will fail. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220114163953.1455836-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 39 ++++++++ include/linux/btf_ids.h | 13 +-- kernel/bpf/btf.c | 244 +++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 289 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/btf.h b/include/linux/btf.h index 0c74348cbc9d..c451f8e2612a 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -12,11 +12,33 @@ #define BTF_TYPE_EMIT(type) ((void)(type *)0) #define BTF_TYPE_EMIT_ENUM(enum_val) ((void)enum_val) +enum btf_kfunc_type { + BTF_KFUNC_TYPE_CHECK, + BTF_KFUNC_TYPE_ACQUIRE, + BTF_KFUNC_TYPE_RELEASE, + BTF_KFUNC_TYPE_RET_NULL, + BTF_KFUNC_TYPE_MAX, +}; + struct btf; struct btf_member; struct btf_type; union bpf_attr; struct btf_show; +struct btf_id_set; + +struct btf_kfunc_id_set { + struct module *owner; + union { + struct { + struct btf_id_set *check_set; + struct btf_id_set *acquire_set; + struct btf_id_set *release_set; + struct btf_id_set *ret_null_set; + }; + struct btf_id_set *sets[BTF_KFUNC_TYPE_MAX]; 
+ }; +}; extern const struct file_operations btf_fops; @@ -307,6 +329,11 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); struct btf *btf_parse_vmlinux(void); struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog); +bool btf_kfunc_id_set_contains(const struct btf *btf, + enum bpf_prog_type prog_type, + enum btf_kfunc_type type, u32 kfunc_btf_id); +int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, + const struct btf_kfunc_id_set *s); #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) @@ -318,6 +345,18 @@ static inline const char *btf_name_by_offset(const struct btf *btf, { return NULL; } +static inline bool btf_kfunc_id_set_contains(const struct btf *btf, + enum bpf_prog_type prog_type, + enum btf_kfunc_type type, + u32 kfunc_btf_id) +{ + return false; +} +static inline int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, + const struct btf_kfunc_id_set *s) +{ + return 0; +} #endif struct kfunc_btf_id_set { diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 919c0fde1c51..bc5d9cc34e4c 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -11,6 +11,7 @@ struct btf_id_set { #ifdef CONFIG_DEBUG_INFO_BTF #include /* for __PASTE */ +#include /* for __maybe_unused */ /* * Following macros help to define lists of BTF IDs placed @@ -146,14 +147,14 @@ extern struct btf_id_set name; #else -#define BTF_ID_LIST(name) static u32 name[5]; +#define BTF_ID_LIST(name) static u32 __maybe_unused name[5]; #define BTF_ID(prefix, name) #define BTF_ID_UNUSED -#define BTF_ID_LIST_GLOBAL(name, n) u32 name[n]; -#define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 name[1]; -#define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) u32 name[1]; -#define BTF_SET_START(name) static struct btf_id_set name = { 0 }; -#define BTF_SET_START_GLOBAL(name) static struct btf_id_set name = { 0 }; 
+#define BTF_ID_LIST_GLOBAL(name, n) u32 __maybe_unused name[n]; +#define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 __maybe_unused name[1]; +#define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) u32 __maybe_unused name[1]; +#define BTF_SET_START(name) static struct btf_id_set __maybe_unused name = { 0 }; +#define BTF_SET_START_GLOBAL(name) static struct btf_id_set __maybe_unused name = { 0 }; #define BTF_SET_END(name) #endif /* CONFIG_DEBUG_INFO_BTF */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f25bca59909d..74037bd65d17 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -198,6 +198,21 @@ DEFINE_IDR(btf_idr); DEFINE_SPINLOCK(btf_idr_lock); +enum btf_kfunc_hook { + BTF_KFUNC_HOOK_XDP, + BTF_KFUNC_HOOK_TC, + BTF_KFUNC_HOOK_STRUCT_OPS, + BTF_KFUNC_HOOK_MAX, +}; + +enum { + BTF_KFUNC_SET_MAX_CNT = 32, +}; + +struct btf_kfunc_set_tab { + struct btf_id_set *sets[BTF_KFUNC_HOOK_MAX][BTF_KFUNC_TYPE_MAX]; +}; + struct btf { void *data; struct btf_type **types; @@ -212,6 +227,7 @@ struct btf { refcount_t refcnt; u32 id; struct rcu_head rcu; + struct btf_kfunc_set_tab *kfunc_set_tab; /* split BTF support */ struct btf *base_btf; @@ -1531,8 +1547,30 @@ static void btf_free_id(struct btf *btf) spin_unlock_irqrestore(&btf_idr_lock, flags); } +static void btf_free_kfunc_set_tab(struct btf *btf) +{ + struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab; + int hook, type; + + if (!tab) + return; + /* For module BTF, we directly assign the sets being registered, so + * there is nothing to free except kfunc_set_tab. 
+ */ + if (btf_is_module(btf)) + goto free_tab; + for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++) { + for (type = 0; type < ARRAY_SIZE(tab->sets[0]); type++) + kfree(tab->sets[hook][type]); + } +free_tab: + kfree(tab); + btf->kfunc_set_tab = NULL; +} + static void btf_free(struct btf *btf) { + btf_free_kfunc_set_tab(btf); kvfree(btf->types); kvfree(btf->resolved_sizes); kvfree(btf->resolved_ids); @@ -6371,6 +6409,36 @@ struct module *btf_try_get_module(const struct btf *btf) return res; } +/* Returns struct btf corresponding to the struct module + * + * This function can return NULL or ERR_PTR. Note that caller must + * release reference for struct btf iff btf_is_module is true. + */ +static struct btf *btf_get_module_btf(const struct module *module) +{ + struct btf *btf = NULL; +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + struct btf_module *btf_mod, *tmp; +#endif + + if (!module) + return bpf_get_btf_vmlinux(); +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + mutex_lock(&btf_module_mutex); + list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { + if (btf_mod->module != module) + continue; + + btf_get(btf_mod->btf); + btf = btf_mod->btf; + break; + } + mutex_unlock(&btf_module_mutex); +#endif + + return btf; +} + BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags) { struct btf *btf; @@ -6438,7 +6506,181 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE) BTF_TRACING_TYPE_xxx #undef BTF_TRACING_TYPE -/* BTF ID set registration API for modules */ +/* Kernel Function (kfunc) BTF ID set registration API */ + +static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, + enum btf_kfunc_type type, + struct btf_id_set *add_set, bool vmlinux_set) +{ + struct btf_kfunc_set_tab *tab; + struct btf_id_set *set; + u32 set_cnt; + int ret; + + if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX) { + ret = -EINVAL; + goto end; + } + + if (!add_set->cnt) + return 0; + + tab = btf->kfunc_set_tab; + if (!tab) { + 
tab = kzalloc(sizeof(*tab), GFP_KERNEL | __GFP_NOWARN); + if (!tab) + return -ENOMEM; + btf->kfunc_set_tab = tab; + } + + set = tab->sets[hook][type]; + /* Warn when register_btf_kfunc_id_set is called twice for the same hook + * for module sets. + */ + if (WARN_ON_ONCE(set && !vmlinux_set)) { + ret = -EINVAL; + goto end; + } + + /* We don't need to allocate, concatenate, and sort module sets, because + * only one is allowed per hook. Hence, we can directly assign the + * pointer and return. + */ + if (!vmlinux_set) { + tab->sets[hook][type] = add_set; + return 0; + } + + /* In case of vmlinux sets, there may be more than one set being + * registered per hook. To create a unified set, we allocate a new set + * and concatenate all individual sets being registered. While each set + * is individually sorted, they may become unsorted when concatenated, + * hence re-sorting the final set again is required to make binary + * searching the set using btf_id_set_contains function work. + */ + set_cnt = set ? 
set->cnt : 0; + + if (set_cnt > U32_MAX - add_set->cnt) { + ret = -EOVERFLOW; + goto end; + } + + if (set_cnt + add_set->cnt > BTF_KFUNC_SET_MAX_CNT) { + ret = -E2BIG; + goto end; + } + + /* Grow set */ + set = krealloc(tab->sets[hook][type], + offsetof(struct btf_id_set, ids[set_cnt + add_set->cnt]), + GFP_KERNEL | __GFP_NOWARN); + if (!set) { + ret = -ENOMEM; + goto end; + } + + /* For newly allocated set, initialize set->cnt to 0 */ + if (!tab->sets[hook][type]) + set->cnt = 0; + tab->sets[hook][type] = set; + + /* Concatenate the two sets */ + memcpy(set->ids + set->cnt, add_set->ids, add_set->cnt * sizeof(set->ids[0])); + set->cnt += add_set->cnt; + + sort(set->ids, set->cnt, sizeof(set->ids[0]), btf_id_cmp_func, NULL); + + return 0; +end: + btf_free_kfunc_set_tab(btf); + return ret; +} + +static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, + const struct btf_kfunc_id_set *kset) +{ + bool vmlinux_set = !btf_is_module(btf); + int type, ret; + + for (type = 0; type < ARRAY_SIZE(kset->sets); type++) { + if (!kset->sets[type]) + continue; + + ret = __btf_populate_kfunc_set(btf, hook, type, kset->sets[type], vmlinux_set); + if (ret) + break; + } + return ret; +} + +static bool __btf_kfunc_id_set_contains(const struct btf *btf, + enum btf_kfunc_hook hook, + enum btf_kfunc_type type, + u32 kfunc_btf_id) +{ + struct btf_id_set *set; + + if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX) + return false; + if (!btf->kfunc_set_tab) + return false; + set = btf->kfunc_set_tab->sets[hook][type]; + if (!set) + return false; + return btf_id_set_contains(set, kfunc_btf_id); +} + +static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_XDP: + return BTF_KFUNC_HOOK_XDP; + case BPF_PROG_TYPE_SCHED_CLS: + return BTF_KFUNC_HOOK_TC; + case BPF_PROG_TYPE_STRUCT_OPS: + return BTF_KFUNC_HOOK_STRUCT_OPS; + default: + return BTF_KFUNC_HOOK_MAX; + } +} + +/* Caution: + * Reference to the module 
(obtained using btf_try_get_module) corresponding to + * the struct btf *MUST* be held when calling this function from verifier + * context. This is usually true as we stash references in prog's kfunc_btf_tab; + * keeping the reference for the duration of the call provides the necessary + * protection for looking up a well-formed btf->kfunc_set_tab. + */ +bool btf_kfunc_id_set_contains(const struct btf *btf, + enum bpf_prog_type prog_type, + enum btf_kfunc_type type, u32 kfunc_btf_id) +{ + enum btf_kfunc_hook hook; + + hook = bpf_prog_type_to_kfunc_hook(prog_type); + return __btf_kfunc_id_set_contains(btf, hook, type, kfunc_btf_id); +} + +/* This function must be invoked only from initcalls/module init functions */ +int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, + const struct btf_kfunc_id_set *kset) +{ + enum btf_kfunc_hook hook; + struct btf *btf; + int ret; + + btf = btf_get_module_btf(kset->owner); + if (IS_ERR_OR_NULL(btf)) + return btf ? PTR_ERR(btf) : -ENOENT; + + hook = bpf_prog_type_to_kfunc_hook(prog_type); + ret = btf_populate_kfunc_set(btf, hook, kset); + /* reference is only taken for module BTF */ + if (btf_is_module(btf)) + btf_put(btf); + return ret; +} +EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set); #ifdef CONFIG_DEBUG_INFO_BTF_MODULES -- cgit v1.2.3 From b202d84422223b7222cba5031d182f20b37e146e Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 14 Jan 2022 22:09:46 +0530 Subject: bpf: Remove check_kfunc_call callback and old kfunc BTF ID API Completely remove the old code for check_kfunc_call to help it work with modules, and also the callback itself. The previous commit adds infrastructure to register all sets and put them in vmlinux or module BTF, and concatenates all related sets organized by the hook and the type. Once populated, these sets remain immutable for the lifetime of the struct btf. 
Also, since we don't need the 'owner' module anywhere when doing check_kfunc_call, drop the 'btf_modp' module parameter from find_kfunc_desc_btf. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220114163953.1455836-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ---- include/linux/btf.h | 44 --------------------- kernel/bpf/btf.c | 46 ---------------------- kernel/bpf/verifier.c | 20 ++++------ net/bpf/test_run.c | 23 ++++++----- net/core/filter.c | 1 - net/ipv4/bpf_tcp_ca.c | 22 ++++++----- net/ipv4/tcp_bbr.c | 18 +++++---- net/ipv4/tcp_cubic.c | 17 ++++---- net/ipv4/tcp_dctcp.c | 18 +++++---- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 17 ++++---- 11 files changed, 73 insertions(+), 161 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6e947cd91152..6d7346c54d83 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -573,7 +573,6 @@ struct bpf_verifier_ops { const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); - bool (*check_kfunc_call)(u32 kfunc_btf_id, struct module *owner); }; struct bpf_prog_offload_ops { @@ -1719,7 +1718,6 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); -bool bpf_prog_test_check_kfunc_call(u32 kfunc_id, struct module *owner); bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); @@ -1971,12 +1969,6 @@ static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, return -ENOTSUPP; } -static inline bool bpf_prog_test_check_kfunc_call(u32 kfunc_id, - struct module *owner) -{ - return false; -} - static inline void bpf_map_put(struct bpf_map *map) { } diff --git a/include/linux/btf.h b/include/linux/btf.h index c451f8e2612a..b12cfe3b12bb 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h 
@@ -359,48 +359,4 @@ static inline int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, } #endif -struct kfunc_btf_id_set { - struct list_head list; - struct btf_id_set *set; - struct module *owner; -}; - -struct kfunc_btf_id_list { - struct list_head list; - struct mutex mutex; -}; - -#ifdef CONFIG_DEBUG_INFO_BTF_MODULES -void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, - struct kfunc_btf_id_set *s); -void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l, - struct kfunc_btf_id_set *s); -bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, - struct module *owner); - -extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list; -extern struct kfunc_btf_id_list prog_test_kfunc_list; -#else -static inline void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, - struct kfunc_btf_id_set *s) -{ -} -static inline void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l, - struct kfunc_btf_id_set *s) -{ -} -static inline bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, - u32 kfunc_id, struct module *owner) -{ - return false; -} - -static struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list __maybe_unused; -static struct kfunc_btf_id_list prog_test_kfunc_list __maybe_unused; -#endif - -#define DEFINE_KFUNC_BTF_ID_SET(set, name) \ - struct kfunc_btf_id_set name = { LIST_HEAD_INIT(name.list), (set), \ - THIS_MODULE } - #endif diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 74037bd65d17..4be5cf629ca9 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6682,52 +6682,6 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, } EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set); -#ifdef CONFIG_DEBUG_INFO_BTF_MODULES - -void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, - struct kfunc_btf_id_set *s) -{ - mutex_lock(&l->mutex); - list_add(&s->list, &l->list); - mutex_unlock(&l->mutex); -} -EXPORT_SYMBOL_GPL(register_kfunc_btf_id_set); - -void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l, - 
struct kfunc_btf_id_set *s) -{ - mutex_lock(&l->mutex); - list_del_init(&s->list); - mutex_unlock(&l->mutex); -} -EXPORT_SYMBOL_GPL(unregister_kfunc_btf_id_set); - -bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, - struct module *owner) -{ - struct kfunc_btf_id_set *s; - - mutex_lock(&klist->mutex); - list_for_each_entry(s, &klist->list, list) { - if (s->owner == owner && btf_id_set_contains(s->set, kfunc_id)) { - mutex_unlock(&klist->mutex); - return true; - } - } - mutex_unlock(&klist->mutex); - return false; -} - -#define DEFINE_KFUNC_BTF_ID_LIST(name) \ - struct kfunc_btf_id_list name = { LIST_HEAD_INIT(name.list), \ - __MUTEX_INITIALIZER(name.mutex) }; \ - EXPORT_SYMBOL_GPL(name) - -DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list); -DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list); - -#endif - int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bfb45381fb3f..72802c1eb5ac 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1741,7 +1741,7 @@ find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset) } static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env, - s16 offset, struct module **btf_modp) + s16 offset) { struct bpf_kfunc_btf kf_btf = { .offset = offset }; struct bpf_kfunc_btf_tab *tab; @@ -1795,8 +1795,6 @@ static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env, sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), kfunc_btf_cmp_by_off, NULL); } - if (btf_modp) - *btf_modp = b->module; return b->btf; } @@ -1813,8 +1811,7 @@ void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab) } static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, - u32 func_id, s16 offset, - struct module **btf_modp) + u32 func_id, s16 offset) { if (offset) { if (offset < 0) { @@ -1825,7 +1822,7 @@ static struct btf *find_kfunc_desc_btf(struct 
bpf_verifier_env *env, return ERR_PTR(-EINVAL); } - return __find_kfunc_desc_btf(env, offset, btf_modp); + return __find_kfunc_desc_btf(env, offset); } return btf_vmlinux ?: ERR_PTR(-ENOENT); } @@ -1888,7 +1885,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) prog_aux->kfunc_btf_tab = btf_tab; } - desc_btf = find_kfunc_desc_btf(env, func_id, offset, NULL); + desc_btf = find_kfunc_desc_btf(env, func_id, offset); if (IS_ERR(desc_btf)) { verbose(env, "failed to find BTF for kernel function\n"); return PTR_ERR(desc_btf); @@ -2349,7 +2346,7 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL) return NULL; - desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off, NULL); + desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off); if (IS_ERR(desc_btf)) return ""; @@ -6820,7 +6817,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) struct bpf_reg_state *regs = cur_regs(env); const char *func_name, *ptr_type_name; u32 i, nargs, func_id, ptr_type_id; - struct module *btf_mod = NULL; const struct btf_param *args; struct btf *desc_btf; int err; @@ -6829,7 +6825,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) if (!insn->imm) return 0; - desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off, &btf_mod); + desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off); if (IS_ERR(desc_btf)) return PTR_ERR(desc_btf); @@ -6838,8 +6834,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) func_name = btf_name_by_offset(desc_btf, func->name_off); func_proto = btf_type_by_id(desc_btf, func->type); - if (!env->ops->check_kfunc_call || - !env->ops->check_kfunc_call(func_id, btf_mod)) { + if (!btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_CHECK, func_id)) { verbose(env, "calling kernel function %s is not allowed\n", func_name); return 
-EACCES; diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 46dd95755967..7796a8c747a0 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -236,18 +237,11 @@ __diag_pop(); ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO); -BTF_SET_START(test_sk_kfunc_ids) +BTF_SET_START(test_sk_check_kfunc_ids) BTF_ID(func, bpf_kfunc_call_test1) BTF_ID(func, bpf_kfunc_call_test2) BTF_ID(func, bpf_kfunc_call_test3) -BTF_SET_END(test_sk_kfunc_ids) - -bool bpf_prog_test_check_kfunc_call(u32 kfunc_id, struct module *owner) -{ - if (btf_id_set_contains(&test_sk_kfunc_ids, kfunc_id)) - return true; - return bpf_check_mod_kfunc_call(&prog_test_kfunc_list, kfunc_id, owner); -} +BTF_SET_END(test_sk_check_kfunc_ids) static void *bpf_test_init(const union bpf_attr *kattr, u32 size, u32 headroom, u32 tailroom) @@ -1067,3 +1061,14 @@ out: kfree(ctx); return err; } + +static const struct btf_kfunc_id_set bpf_prog_test_kfunc_set = { + .owner = THIS_MODULE, + .check_set = &test_sk_check_kfunc_ids, +}; + +static int __init bpf_prog_test_run_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set); +} +late_initcall(bpf_prog_test_run_init); diff --git a/net/core/filter.c b/net/core/filter.c index 4603b7cd3cd1..f73a84c75970 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10062,7 +10062,6 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, .gen_ld_abs = bpf_gen_ld_abs, - .check_kfunc_call = bpf_prog_test_check_kfunc_call, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index de610cb83694..b60c9fd7147e 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ +#include #include #include #include @@ 
-212,26 +213,23 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, } } -BTF_SET_START(bpf_tcp_ca_kfunc_ids) +BTF_SET_START(bpf_tcp_ca_check_kfunc_ids) BTF_ID(func, tcp_reno_ssthresh) BTF_ID(func, tcp_reno_cong_avoid) BTF_ID(func, tcp_reno_undo_cwnd) BTF_ID(func, tcp_slow_start) BTF_ID(func, tcp_cong_avoid_ai) -BTF_SET_END(bpf_tcp_ca_kfunc_ids) +BTF_SET_END(bpf_tcp_ca_check_kfunc_ids) -static bool bpf_tcp_ca_check_kfunc_call(u32 kfunc_btf_id, struct module *owner) -{ - if (btf_id_set_contains(&bpf_tcp_ca_kfunc_ids, kfunc_btf_id)) - return true; - return bpf_check_mod_kfunc_call(&bpf_tcp_ca_kfunc_list, kfunc_btf_id, owner); -} +static const struct btf_kfunc_id_set bpf_tcp_ca_kfunc_set = { + .owner = THIS_MODULE, + .check_set = &bpf_tcp_ca_check_kfunc_ids, +}; static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = { .get_func_proto = bpf_tcp_ca_get_func_proto, .is_valid_access = bpf_tcp_ca_is_valid_access, .btf_struct_access = bpf_tcp_ca_btf_struct_access, - .check_kfunc_call = bpf_tcp_ca_check_kfunc_call, }; static int bpf_tcp_ca_init_member(const struct btf_type *t, @@ -300,3 +298,9 @@ struct bpf_struct_ops bpf_tcp_congestion_ops = { .init = bpf_tcp_ca_init, .name = "tcp_congestion_ops", }; + +static int __init bpf_tcp_ca_kfunc_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_tcp_ca_kfunc_set); +} +late_initcall(bpf_tcp_ca_kfunc_init); diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index ec5550089b4d..02e8626ccb27 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -1154,7 +1154,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { .set_state = bbr_set_state, }; -BTF_SET_START(tcp_bbr_kfunc_ids) +BTF_SET_START(tcp_bbr_check_kfunc_ids) #ifdef CONFIG_X86 #ifdef CONFIG_DYNAMIC_FTRACE BTF_ID(func, bbr_init) @@ -1167,25 +1167,27 @@ BTF_ID(func, bbr_min_tso_segs) BTF_ID(func, bbr_set_state) #endif #endif -BTF_SET_END(tcp_bbr_kfunc_ids) +BTF_SET_END(tcp_bbr_check_kfunc_ids) -static 
DEFINE_KFUNC_BTF_ID_SET(&tcp_bbr_kfunc_ids, tcp_bbr_kfunc_btf_set); +static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = { + .owner = THIS_MODULE, + .check_set = &tcp_bbr_check_kfunc_ids, +}; static int __init bbr_register(void) { int ret; BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); - ret = tcp_register_congestion_control(&tcp_bbr_cong_ops); - if (ret) + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_bbr_kfunc_set); + if (ret < 0) return ret; - register_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_bbr_kfunc_btf_set); - return 0; + return tcp_register_congestion_control(&tcp_bbr_cong_ops); } static void __exit bbr_unregister(void) { - unregister_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_bbr_kfunc_btf_set); tcp_unregister_congestion_control(&tcp_bbr_cong_ops); } diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index e07837e23b3f..24d562dd6225 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -485,7 +485,7 @@ static struct tcp_congestion_ops cubictcp __read_mostly = { .name = "cubic", }; -BTF_SET_START(tcp_cubic_kfunc_ids) +BTF_SET_START(tcp_cubic_check_kfunc_ids) #ifdef CONFIG_X86 #ifdef CONFIG_DYNAMIC_FTRACE BTF_ID(func, cubictcp_init) @@ -496,9 +496,12 @@ BTF_ID(func, cubictcp_cwnd_event) BTF_ID(func, cubictcp_acked) #endif #endif -BTF_SET_END(tcp_cubic_kfunc_ids) +BTF_SET_END(tcp_cubic_check_kfunc_ids) -static DEFINE_KFUNC_BTF_ID_SET(&tcp_cubic_kfunc_ids, tcp_cubic_kfunc_btf_set); +static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = { + .owner = THIS_MODULE, + .check_set = &tcp_cubic_check_kfunc_ids, +}; static int __init cubictcp_register(void) { @@ -534,16 +537,14 @@ static int __init cubictcp_register(void) /* divide by bic_scale and by constant Srtt (100ms) */ do_div(cube_factor, bic_scale * 10); - ret = tcp_register_congestion_control(&cubictcp); - if (ret) + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_cubic_kfunc_set); + if (ret < 0) return ret; - 
register_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_cubic_kfunc_btf_set); - return 0; + return tcp_register_congestion_control(&cubictcp); } static void __exit cubictcp_unregister(void) { - unregister_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_cubic_kfunc_btf_set); tcp_unregister_congestion_control(&cubictcp); } diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 0d7ab3cc7b61..1943a6630341 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -238,7 +238,7 @@ static struct tcp_congestion_ops dctcp_reno __read_mostly = { .name = "dctcp-reno", }; -BTF_SET_START(tcp_dctcp_kfunc_ids) +BTF_SET_START(tcp_dctcp_check_kfunc_ids) #ifdef CONFIG_X86 #ifdef CONFIG_DYNAMIC_FTRACE BTF_ID(func, dctcp_init) @@ -249,25 +249,27 @@ BTF_ID(func, dctcp_cwnd_undo) BTF_ID(func, dctcp_state) #endif #endif -BTF_SET_END(tcp_dctcp_kfunc_ids) +BTF_SET_END(tcp_dctcp_check_kfunc_ids) -static DEFINE_KFUNC_BTF_ID_SET(&tcp_dctcp_kfunc_ids, tcp_dctcp_kfunc_btf_set); +static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = { + .owner = THIS_MODULE, + .check_set = &tcp_dctcp_check_kfunc_ids, +}; static int __init dctcp_register(void) { int ret; BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE); - ret = tcp_register_congestion_control(&dctcp); - if (ret) + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_dctcp_kfunc_set); + if (ret < 0) return ret; - register_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_dctcp_kfunc_btf_set); - return 0; + return tcp_register_congestion_control(&dctcp); } static void __exit dctcp_unregister(void) { - unregister_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_dctcp_kfunc_btf_set); tcp_unregister_congestion_control(&dctcp); } diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index df3b292a8ffe..c0805d0d753f 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -109,26 +109,27 
@@ static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = { .write = bpf_testmod_test_write, }; -BTF_SET_START(bpf_testmod_kfunc_ids) +BTF_SET_START(bpf_testmod_check_kfunc_ids) BTF_ID(func, bpf_testmod_test_mod_kfunc) -BTF_SET_END(bpf_testmod_kfunc_ids) +BTF_SET_END(bpf_testmod_check_kfunc_ids) -static DEFINE_KFUNC_BTF_ID_SET(&bpf_testmod_kfunc_ids, bpf_testmod_kfunc_btf_set); +static const struct btf_kfunc_id_set bpf_testmod_kfunc_set = { + .owner = THIS_MODULE, + .check_set = &bpf_testmod_check_kfunc_ids, +}; static int bpf_testmod_init(void) { int ret; - ret = sysfs_create_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); - if (ret) + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_testmod_kfunc_set); + if (ret < 0) return ret; - register_kfunc_btf_id_set(&prog_test_kfunc_list, &bpf_testmod_kfunc_btf_set); - return 0; + return sysfs_create_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); } static void bpf_testmod_exit(void) { - unregister_kfunc_btf_id_set(&prog_test_kfunc_list, &bpf_testmod_kfunc_btf_set); return sysfs_remove_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); } -- cgit v1.2.3 From d583691c47dc0424ebe926000339a6d6cd590ff7 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 14 Jan 2022 22:09:47 +0530 Subject: bpf: Introduce mem, size argument pair support for kfunc BPF helpers can associate two adjacent arguments together to pass memory of certain size, using ARG_PTR_TO_MEM and ARG_CONST_SIZE arguments. Since we don't use bpf_func_proto for kfunc, we need to leverage BTF to implement similar support. The ARG_CONST_SIZE processing for helpers is refactored into a common check_mem_size_reg helper that is shared with kfunc as well. kfunc ptr_to_mem support follows logic similar to global functions, where verification is done as if pointer is not null, even when it may be null. 
This leads to a simple to follow rule for writing kfunc: always check the argument pointer for NULL, except when it is PTR_TO_CTX. Also, the PTR_TO_CTX case is also only safe when the helper expecting pointer to program ctx is not exposed to other programs where same struct is not ctx type. In that case, the type check will fall through to other cases and would permit passing other types of pointers, possibly NULL at runtime. Currently, we require the size argument to be suffixed with "__sz" in the parameter name. This information is then recorded in kernel BTF and verified during function argument checking. In the future we can use BTF tagging instead, and modify the kernel function definitions. This will be a purely kernel-side change. This allows us to have some form of backwards compatibility for structures that are passed in to the kernel function with their size, and allow variable length structures to be passed in if they are accompanied by a size parameter. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220114163953.1455836-5-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 + kernel/bpf/btf.c | 48 +++++++++++++++-- kernel/bpf/verifier.c | 124 +++++++++++++++++++++++++++---------------- 3 files changed, 126 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 143401d4c9d9..857fd687bdc2 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -521,6 +521,8 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); int check_ctx_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno); +int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + u32 regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 
4be5cf629ca9..cf46694cb266 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5654,6 +5654,32 @@ static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log, return true; } +static bool is_kfunc_arg_mem_size(const struct btf *btf, + const struct btf_param *arg, + const struct bpf_reg_state *reg) +{ + int len, sfx_len = sizeof("__sz") - 1; + const struct btf_type *t; + const char *param_name; + + t = btf_type_skip_modifiers(btf, arg->type, NULL); + if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) + return false; + + /* In the future, this can be ported to use BTF tagging */ + param_name = btf_name_by_offset(btf, arg->name_off); + if (str_is_empty(param_name)) + return false; + len = strlen(param_name); + if (len < sfx_len) + return false; + param_name += len - sfx_len; + if (strncmp(param_name, "__sz", sfx_len)) + return false; + + return true; +} + static int btf_check_func_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, @@ -5765,17 +5791,33 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, u32 type_size; if (is_kfunc) { + bool arg_mem_size = i + 1 < nargs && is_kfunc_arg_mem_size(btf, &args[i + 1], ®s[regno + 1]); + /* Permit pointer to mem, but only when argument * type is pointer to scalar, or struct composed * (recursively) of scalars. + * When arg_mem_size is true, the pointer can be + * void *. */ if (!btf_type_is_scalar(ref_t) && - !__btf_type_is_scalar_struct(log, btf, ref_t, 0)) { + !__btf_type_is_scalar_struct(log, btf, ref_t, 0) && + (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) { bpf_log(log, - "arg#%d pointer type %s %s must point to scalar or struct with scalar\n", - i, btf_type_str(ref_t), ref_tname); + "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n", + i, btf_type_str(ref_t), ref_tname, arg_mem_size ? 
"void, " : ""); return -EINVAL; } + + /* Check for mem, len pair */ + if (arg_mem_size) { + if (check_kfunc_mem_size_reg(env, ®s[regno + 1], regno + 1)) { + bpf_log(log, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", + i, i + 1); + return -EINVAL; + } + i++; + continue; + } } resolve_ret = btf_resolve_size(btf, ref_t, &type_size); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 72802c1eb5ac..2b186185b6b2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4864,6 +4864,62 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } +static int check_mem_size_reg(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, u32 regno, + bool zero_size_allowed, + struct bpf_call_arg_meta *meta) +{ + int err; + + /* This is used to refine r0 return value bounds for helpers + * that enforce this value as an upper bound on return values. + * See do_refine_retval_range() for helpers that can refine + * the return value. C type of helper is u32 so we pull register + * bound from umax_value however, if negative verifier errors + * out. Only upper bounds can be learned because retval is an + * int type and negative retvals are allowed. + */ + if (meta) + meta->msize_max_value = reg->umax_value; + + /* The register is SCALAR_VALUE; the access check + * happens using its boundaries. + */ + if (!tnum_is_const(reg->var_off)) + /* For unprivileged variable accesses, disable raw + * mode so that the program is required to + * initialize all the memory that the helper could + * just partially fill up. 
+ */ + meta = NULL; + + if (reg->smin_value < 0) { + verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", + regno); + return -EACCES; + } + + if (reg->umin_value == 0) { + err = check_helper_mem_access(env, regno - 1, 0, + zero_size_allowed, + meta); + if (err) + return err; + } + + if (reg->umax_value >= BPF_MAX_VAR_SIZ) { + verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + regno); + return -EACCES; + } + err = check_helper_mem_access(env, regno - 1, + reg->umax_value, + zero_size_allowed, meta); + if (!err) + err = mark_chain_precision(env, regno); + return err; +} + int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size) { @@ -4887,6 +4943,28 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, return check_helper_mem_access(env, regno, mem_size, true, NULL); } +int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + u32 regno) +{ + struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1]; + bool may_be_null = type_may_be_null(mem_reg->type); + struct bpf_reg_state saved_reg; + int err; + + WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5); + + if (may_be_null) { + saved_reg = *mem_reg; + mark_ptr_not_null_reg(mem_reg); + } + + err = check_mem_size_reg(env, reg, regno, true, NULL); + + if (may_be_null) + *mem_reg = saved_reg; + return err; +} + /* Implementation details: * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL * Two bpf_map_lookups (even with the same key) will have different reg->id. @@ -5408,51 +5486,7 @@ skip_type_check: } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); - /* This is used to refine r0 return value bounds for helpers - * that enforce this value as an upper bound on return values. - * See do_refine_retval_range() for helpers that can refine - * the return value. 
C type of helper is u32 so we pull register - * bound from umax_value however, if negative verifier errors - * out. Only upper bounds can be learned because retval is an - * int type and negative retvals are allowed. - */ - meta->msize_max_value = reg->umax_value; - - /* The register is SCALAR_VALUE; the access check - * happens using its boundaries. - */ - if (!tnum_is_const(reg->var_off)) - /* For unprivileged variable accesses, disable raw - * mode so that the program is required to - * initialize all the memory that the helper could - * just partially fill up. - */ - meta = NULL; - - if (reg->smin_value < 0) { - verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", - regno); - return -EACCES; - } - - if (reg->umin_value == 0) { - err = check_helper_mem_access(env, regno - 1, 0, - zero_size_allowed, - meta); - if (err) - return err; - } - - if (reg->umax_value >= BPF_MAX_VAR_SIZ) { - verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", - regno); - return -EACCES; - } - err = check_helper_mem_access(env, regno - 1, - reg->umax_value, - zero_size_allowed, meta); - if (!err) - err = mark_chain_precision(env, regno); + err = check_mem_size_reg(env, reg, regno, zero_size_allowed, meta); } else if (arg_type_is_alloc_size(arg_type)) { if (!tnum_is_const(reg->var_off)) { verbose(env, "R%d is not a known constant'\n", -- cgit v1.2.3 From 5c073f26f9dc78a6c8194b23eac7537c9692c7d7 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 14 Jan 2022 22:09:48 +0530 Subject: bpf: Add reference tracking support to kfunc This patch adds verifier support for PTR_TO_BTF_ID return type of kfunc to be a reference, by reusing acquire_reference_state/release_reference support for existing in-kernel bpf helpers. We make use of the three kfunc types: - BTF_KFUNC_TYPE_ACQUIRE Return true if kfunc_btf_id is an acquire kfunc. 
This will acquire_reference_state for the returned PTR_TO_BTF_ID (this is the only allowed return value). Note that acquire kfunc must always return a PTR_TO_BTF_ID{_OR_NULL}, otherwise the program is rejected. - BTF_KFUNC_TYPE_RELEASE Return true if kfunc_btf_id is a release kfunc. This will release the reference to the passed in PTR_TO_BTF_ID which has a reference state (from earlier acquire kfunc). The btf_check_func_arg_match returns the regno (of argument register, hence > 0) if the kfunc is a release kfunc, and a proper referenced PTR_TO_BTF_ID is being passed to it. This is similar to how helper call check uses bpf_call_arg_meta to store the ref_obj_id that is later used to release the reference. Similar to in-kernel helper, we only allow passing one referenced PTR_TO_BTF_ID as an argument. It can also be passed in to normal kfunc, but in case of release kfunc there must always be one PTR_TO_BTF_ID argument that is referenced. - BTF_KFUNC_TYPE_RET_NULL For kfunc returning PTR_TO_BTF_ID, tells if it can be NULL, hence forcing the caller to mark the pointer not null (using a check) before accessing it. Note that taking into account the case fixed by commit 93c230e3f5bd ("bpf: Enforce id generation for all may-be-null register type") we assign a non-zero id for mark_ptr_or_null_reg logic. Later, if more return types are supported by kfunc, which have a _OR_NULL variant, it might be better to move this id generation under a common reg_type_may_be_null check, similar to the case in the commit. Referenced PTR_TO_BTF_ID is currently only limited to kfunc, but can be extended in the future to other BPF helpers as well. For now, we can rely on the btf_struct_ids_match check to ensure we get the pointer to the expected struct type. In the future, care needs to be taken to avoid ambiguity for reference PTR_TO_BTF_ID passed to release function, in case multiple candidates can release same BTF ID. e.g. 
there might be two release kfuncs (or kfunc and helper): foo(struct abc *p); bar(struct abc *p); ... such that both release a PTR_TO_BTF_ID with btf_id of struct abc. In this case we would need to track the acquire function corresponding to the release function to avoid type confusion, and store this information in the register state so that an incorrect program can be rejected. This is not a problem right now, hence it is left as an exercise for the future patch introducing such a case in the kernel. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220114163953.1455836-6-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 +++++ kernel/bpf/btf.c | 32 +++++++++++++++++++++++++-- kernel/bpf/verifier.c | 52 +++++++++++++++++++++++++++++++++++--------- 3 files changed, 77 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 857fd687bdc2..ac4797155412 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -566,4 +566,9 @@ static inline u32 type_flag(u32 type) return type & ~BPF_BASE_TYPE_MASK; } +static inline enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog) +{ + return prog->aux->dst_prog ? 
prog->aux->dst_prog->type : prog->type; +} + #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index cf46694cb266..57f5fd5af2f9 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5686,11 +5686,13 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, bool ptr_to_mem_ok) { struct bpf_verifier_log *log = &env->log; + u32 i, nargs, ref_id, ref_obj_id = 0; bool is_kfunc = btf_is_kernel(btf); const char *func_name, *ref_tname; const struct btf_type *t, *ref_t; const struct btf_param *args; - u32 i, nargs, ref_id; + int ref_regno = 0; + bool rel = false; t = btf_type_by_id(btf, func_id); if (!t || !btf_type_is_func(t)) { @@ -5768,6 +5770,16 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (reg->type == PTR_TO_BTF_ID) { reg_btf = reg->btf; reg_ref_id = reg->btf_id; + /* Ensure only one argument is referenced PTR_TO_BTF_ID */ + if (reg->ref_obj_id) { + if (ref_obj_id) { + bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, ref_obj_id); + return -EFAULT; + } + ref_regno = regno; + ref_obj_id = reg->ref_obj_id; + } } else { reg_btf = btf_vmlinux; reg_ref_id = *reg2btf_ids[reg->type]; @@ -5838,7 +5850,23 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, } } - return 0; + /* Either both are set, or neither */ + WARN_ON_ONCE((ref_obj_id && !ref_regno) || (!ref_obj_id && ref_regno)); + if (is_kfunc) { + rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_RELEASE, func_id); + /* We already made sure ref_obj_id is set only for one argument */ + if (rel && !ref_obj_id) { + bpf_log(log, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n", + func_name); + return -EINVAL; + } + /* Allow (!rel && ref_obj_id), so that passing such referenced PTR_TO_BTF_ID to + * other kfuncs works + */ + } + /* returns argument register number > 0 in case of reference release kfunc */ + return rel ? 
ref_regno : 0; } /* Compare BTF of a function with given bpf_reg_state. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2b186185b6b2..8c5a46d41f28 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -452,7 +452,8 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) { return base_type(type) == PTR_TO_SOCKET || base_type(type) == PTR_TO_TCP_SOCK || - base_type(type) == PTR_TO_MEM; + base_type(type) == PTR_TO_MEM || + base_type(type) == PTR_TO_BTF_ID; } static bool type_is_rdonly_mem(u32 type) @@ -3493,11 +3494,6 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, #define MAX_PACKET_OFF 0xffff -static enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog) -{ - return prog->aux->dst_prog ? prog->aux->dst_prog->type : prog->type; -} - static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, const struct bpf_call_arg_meta *meta, enum bpf_access_type t) @@ -6845,15 +6841,17 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, } } -static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) +static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx_p) { const struct btf_type *t, *func, *func_proto, *ptr_type; struct bpf_reg_state *regs = cur_regs(env); const char *func_name, *ptr_type_name; u32 i, nargs, func_id, ptr_type_id; + int err, insn_idx = *insn_idx_p; const struct btf_param *args; struct btf *desc_btf; - int err; + bool acq; /* skip for now, but return error when we find this in fixup_kfunc_call */ if (!insn->imm) @@ -6875,16 +6873,36 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EACCES; } + acq = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_ACQUIRE, func_id); + /* Check the arguments */ err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs); - if (err) + if (err < 0) return err; + /* In case of release 
function, we get register number of refcounted + * PTR_TO_BTF_ID back from btf_check_kfunc_arg_match, do the release now + */ + if (err) { + err = release_reference(env, regs[err].ref_obj_id); + if (err) { + verbose(env, "kfunc %s#%d reference has not been acquired before\n", + func_name, func_id); + return err; + } + } for (i = 0; i < CALLER_SAVED_REGS; i++) mark_reg_not_init(env, regs, caller_saved[i]); /* Check return type */ t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL); + + if (acq && !btf_type_is_ptr(t)) { + verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); + return -EINVAL; + } + if (btf_type_is_scalar(t)) { mark_reg_unknown(env, regs, BPF_REG_0); mark_btf_func_reg_size(env, BPF_REG_0, t->size); @@ -6903,7 +6921,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) regs[BPF_REG_0].btf = desc_btf; regs[BPF_REG_0].type = PTR_TO_BTF_ID; regs[BPF_REG_0].btf_id = ptr_type_id; + if (btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_RET_NULL, func_id)) { + regs[BPF_REG_0].type |= PTR_MAYBE_NULL; + /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */ + regs[BPF_REG_0].id = ++env->id_gen; + } mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *)); + if (acq) { + int id = acquire_reference_state(env, insn_idx); + + if (id < 0) + return id; + regs[BPF_REG_0].id = id; + regs[BPF_REG_0].ref_obj_id = id; + } } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */ nargs = btf_type_vlen(func_proto); @@ -11548,7 +11580,7 @@ static int do_check(struct bpf_verifier_env *env) if (insn->src_reg == BPF_PSEUDO_CALL) err = check_func_call(env, insn, &env->insn_idx); else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) - err = check_kfunc_call(env, insn); + err = check_kfunc_call(env, insn, &env->insn_idx); else err = check_helper_call(env, insn, &env->insn_idx); if (err) -- cgit v1.2.3 From f10d059661968b01ef61a8b516775f95a18ab8ae Mon Sep 17 00:00:00 2001 From: YiFei Zhu 
Date: Thu, 16 Dec 2021 02:04:25 +0000 Subject: bpf: Make BPF_PROG_RUN_ARRAY return -err instead of allow boolean Right now BPF_PROG_RUN_ARRAY and related macros return 1 or 0 for whether the prog array allows or rejects whatever is being hooked. The callers of these macros then return -EPERM or continue processing based on the macro's return value. Unfortunately this is inflexible, since -EPERM is the only err that can be returned. This patch should be a no-op; it prepares for the next patch. The returning of the -EPERM is moved to inside the macros, so the outer functions are directly returning what the macros returned if they are non-zero. Signed-off-by: YiFei Zhu Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/r/788abcdca55886d1f43274c918eaa9f792a9f33b.1639619851.git.zhuyifei@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 16 +++++++++------- kernel/bpf/cgroup.c | 41 +++++++++++++++-------------------------- security/device_cgroup.c | 2 +- 3 files changed, 25 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6d7346c54d83..83da1764fcfe 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1277,7 +1277,7 @@ static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx) typedef u32 (*bpf_prog_run_fn)(const struct bpf_prog *prog, const void *ctx); -static __always_inline u32 +static __always_inline int BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, const void *ctx, bpf_prog_run_fn run_prog, u32 *ret_flags) @@ -1287,7 +1287,7 @@ BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, const struct bpf_prog_array *array; struct bpf_run_ctx *old_run_ctx; struct bpf_cg_run_ctx run_ctx; - u32 ret = 1; + int ret = 0; u32 func_ret; migrate_disable(); @@ -1298,7 +1298,8 @@ BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, while ((prog = READ_ONCE(item->prog))) { run_ctx.prog_item = item; func_ret = 
run_prog(prog, ctx); - ret &= (func_ret & 1); + if (!(func_ret & 1)) + ret = -EPERM; *(ret_flags) |= (func_ret >> 1); item++; } @@ -1308,7 +1309,7 @@ BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, return ret; } -static __always_inline u32 +static __always_inline int BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, const void *ctx, bpf_prog_run_fn run_prog) { @@ -1317,7 +1318,7 @@ BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, const struct bpf_prog_array *array; struct bpf_run_ctx *old_run_ctx; struct bpf_cg_run_ctx run_ctx; - u32 ret = 1; + int ret = 0; migrate_disable(); rcu_read_lock(); @@ -1326,7 +1327,8 @@ BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); while ((prog = READ_ONCE(item->prog))) { run_ctx.prog_item = item; - ret &= run_prog(prog, ctx); + if (!run_prog(prog, ctx)) + ret = -EPERM; item++; } bpf_reset_run_ctx(old_run_ctx); @@ -1394,7 +1396,7 @@ out: u32 _ret; \ _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, &_flags); \ _cn = _flags & BPF_RET_SET_CN; \ - if (_ret) \ + if (!_ret) \ _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ else \ _ret = (_cn ? NET_XMIT_DROP : -EPERM); \ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 514b4681a90a..386155d279b3 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1080,7 +1080,6 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, } else { ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb); - ret = (ret == 1 ? 0 : -EPERM); } bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); @@ -1107,10 +1106,9 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - int ret; - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, bpf_prog_run); - return ret == 1 ? 
0 : -EPERM; + return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, + bpf_prog_run); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -1142,7 +1140,6 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, }; struct sockaddr_storage unspec; struct cgroup *cgrp; - int ret; /* Check socket family since not all sockets represent network * endpoint (e.g. AF_UNIX). @@ -1156,10 +1153,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run, flags); - - return ret == 1 ? 0 : -EPERM; + return BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx, + bpf_prog_run, flags); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); @@ -1184,11 +1179,9 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - int ret; - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops, - bpf_prog_run); - return ret == 1 ? 0 : -EPERM; + return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops, + bpf_prog_run); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); @@ -1201,15 +1194,15 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, .major = major, .minor = minor, }; - int allow; + int ret; rcu_read_lock(); cgrp = task_dfl_cgroup(current); - allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, + bpf_prog_run); rcu_read_unlock(); - return !allow; + return ret; } static const struct bpf_func_proto * @@ -1350,7 +1343,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, kfree(ctx.new_val); } - return ret == 1 ? 
0 : -EPERM; + return ret; } #ifdef CONFIG_NET @@ -1455,10 +1448,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, &ctx, bpf_prog_run); release_sock(sk); - if (!ret) { - ret = -EPERM; + if (ret) goto out; - } if (ctx.optlen == -1) { /* optlen set to -1, bypass kernel */ @@ -1565,10 +1556,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, &ctx, bpf_prog_run); release_sock(sk); - if (!ret) { - ret = -EPERM; + if (ret) goto out; - } if (ctx.optlen > max_optlen || ctx.optlen < 0) { ret = -EFAULT; @@ -1624,8 +1613,8 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], &ctx, bpf_prog_run); - if (!ret) - return -EPERM; + if (ret) + return ret; if (ctx.optlen > *optlen) return -EFAULT; diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 842889f3dcb7..a9f8c63a96d1 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -838,7 +838,7 @@ int devcgroup_check_permission(short type, u32 major, u32 minor, short access) int rc = BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access); if (rc) - return -EPERM; + return rc; #ifdef CONFIG_CGROUP_DEVICE return devcgroup_legacy_check_permission(type, major, minor, access); -- cgit v1.2.3 From c4dcfdd406aa2167396ac215e351e5e4dfd7efe3 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Thu, 16 Dec 2021 02:04:26 +0000 Subject: bpf: Move getsockopt retval to struct bpf_cg_run_ctx The retval value is moved to struct bpf_cg_run_ctx for ease of access in different prog types with different context structs layouts. The helper implementation (to be added in a later patch in the series) can simply perform a container_of from current->bpf_ctx to retrieve bpf_cg_run_ctx. 
Unfortunately, there is no easy way to access the current task_struct via the verifier BPF bytecode rewrite, aside from possibly calling a helper, so a pointer to current task is added to struct bpf_sockopt_kern so that the rewritten BPF bytecode can access struct bpf_cg_run_ctx with an indirection. For backward compatibility, if a getsockopt program rejects a syscall by returning 0, an -EPERM will be generated, by having the BPF_PROG_RUN_ARRAY_CG family macros automatically set the retval to -EPERM. Unlike prior to this patch, this -EPERM will be visible to ctx->retval for any other hooks down the line in the prog array. Additionally, the restriction that getsockopt filters can only set the retval to 0 is removed, considering that certain getsockopt implementations may return optlen. Filters are now able to set the value arbitrarily. Signed-off-by: YiFei Zhu Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/r/73b0325f5c29912ccea7ea57ec1ed4d388fc1d37.1639619851.git.zhuyifei@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 20 ++++++------ include/linux/filter.h | 5 ++- kernel/bpf/cgroup.c | 82 +++++++++++++++++++++++++++++--------------------- 3 files changed, 63 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 83da1764fcfe..7b0c11f414d0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1245,6 +1245,7 @@ struct bpf_run_ctx {}; struct bpf_cg_run_ctx { struct bpf_run_ctx run_ctx; const struct bpf_prog_array_item *prog_item; + int retval; }; struct bpf_trace_run_ctx { @@ -1280,16 +1281,16 @@ typedef u32 (*bpf_prog_run_fn)(const struct bpf_prog *prog, const void *ctx); static __always_inline int BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, const void *ctx, bpf_prog_run_fn run_prog, - u32 *ret_flags) + int retval, u32 *ret_flags) { const struct bpf_prog_array_item *item; const struct bpf_prog *prog; const struct bpf_prog_array 
*array; struct bpf_run_ctx *old_run_ctx; struct bpf_cg_run_ctx run_ctx; - int ret = 0; u32 func_ret; + run_ctx.retval = retval; migrate_disable(); rcu_read_lock(); array = rcu_dereference(array_rcu); @@ -1299,27 +1300,28 @@ BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, run_ctx.prog_item = item; func_ret = run_prog(prog, ctx); if (!(func_ret & 1)) - ret = -EPERM; + run_ctx.retval = -EPERM; *(ret_flags) |= (func_ret >> 1); item++; } bpf_reset_run_ctx(old_run_ctx); rcu_read_unlock(); migrate_enable(); - return ret; + return run_ctx.retval; } static __always_inline int BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, - const void *ctx, bpf_prog_run_fn run_prog) + const void *ctx, bpf_prog_run_fn run_prog, + int retval) { const struct bpf_prog_array_item *item; const struct bpf_prog *prog; const struct bpf_prog_array *array; struct bpf_run_ctx *old_run_ctx; struct bpf_cg_run_ctx run_ctx; - int ret = 0; + run_ctx.retval = retval; migrate_disable(); rcu_read_lock(); array = rcu_dereference(array_rcu); @@ -1328,13 +1330,13 @@ BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, while ((prog = READ_ONCE(item->prog))) { run_ctx.prog_item = item; if (!run_prog(prog, ctx)) - ret = -EPERM; + run_ctx.retval = -EPERM; item++; } bpf_reset_run_ctx(old_run_ctx); rcu_read_unlock(); migrate_enable(); - return ret; + return run_ctx.retval; } static __always_inline u32 @@ -1394,7 +1396,7 @@ out: u32 _flags = 0; \ bool _cn; \ u32 _ret; \ - _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, &_flags); \ + _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, 0, &_flags); \ _cn = _flags & BPF_RET_SET_CN; \ if (!_ret) \ _ret = (_cn ? 
NET_XMIT_CN : NET_XMIT_SUCCESS); \ diff --git a/include/linux/filter.h b/include/linux/filter.h index 71fa57b88bfc..d23e999dc032 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1356,7 +1356,10 @@ struct bpf_sockopt_kern { s32 level; s32 optname; s32 optlen; - s32 retval; + /* for retval in struct bpf_cg_run_ctx */ + struct task_struct *current_task; + /* Temporary "register" for indirect stores to ppos. */ + u64 tmp_reg; }; int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 386155d279b3..b6fad0bbf5a7 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1079,7 +1079,7 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb); } else { ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb, - __bpf_prog_run_save_cb); + __bpf_prog_run_save_cb, 0); } bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); @@ -1108,7 +1108,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, - bpf_prog_run); + bpf_prog_run, 0); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -1154,7 +1154,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); return BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run, flags); + bpf_prog_run, 0, flags); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); @@ -1181,7 +1181,7 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops, - bpf_prog_run); + bpf_prog_run, 0); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); @@ -1199,7 +1199,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, rcu_read_lock(); cgrp = task_dfl_cgroup(current); 
ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run); + bpf_prog_run, 0); rcu_read_unlock(); return ret; @@ -1330,7 +1330,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, + bpf_prog_run, 0); rcu_read_unlock(); kfree(ctx.cur_val); @@ -1445,7 +1446,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, lock_sock(sk); ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT], - &ctx, bpf_prog_run); + &ctx, bpf_prog_run, 0); release_sock(sk); if (ret) @@ -1509,7 +1510,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, .sk = sk, .level = level, .optname = optname, - .retval = retval, + .current_task = current, }; int ret; @@ -1553,10 +1554,10 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, lock_sock(sk); ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], - &ctx, bpf_prog_run); + &ctx, bpf_prog_run, retval); release_sock(sk); - if (ret) + if (ret < 0) goto out; if (ctx.optlen > max_optlen || ctx.optlen < 0) { @@ -1564,14 +1565,6 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, goto out; } - /* BPF programs only allowed to set retval to 0, not some - * arbitrary value. 
- */ - if (ctx.retval != 0 && ctx.retval != retval) { - ret = -EFAULT; - goto out; - } - if (ctx.optlen != 0) { if (copy_to_user(optval, ctx.optval, ctx.optlen) || put_user(ctx.optlen, optlen)) { @@ -1580,8 +1573,6 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, } } - ret = ctx.retval; - out: sockopt_free_buf(&ctx, &buf); return ret; @@ -1596,10 +1587,10 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, .sk = sk, .level = level, .optname = optname, - .retval = retval, .optlen = *optlen, .optval = optval, .optval_end = optval + *optlen, + .current_task = current, }; int ret; @@ -1612,25 +1603,19 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, */ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], - &ctx, bpf_prog_run); - if (ret) + &ctx, bpf_prog_run, retval); + if (ret < 0) return ret; if (ctx.optlen > *optlen) return -EFAULT; - /* BPF programs only allowed to set retval to 0, not some - * arbitrary value. - */ - if (ctx.retval != 0 && ctx.retval != retval) - return -EFAULT; - /* BPF programs can shrink the buffer, export the modifications. 
*/ if (ctx.optlen != 0) *optlen = ctx.optlen; - return ctx.retval; + return ret; } #endif @@ -2046,10 +2031,39 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen); break; case offsetof(struct bpf_sockopt, retval): - if (type == BPF_WRITE) - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval); - else - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval); + BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0); + + if (type == BPF_WRITE) { + int treg = BPF_REG_9; + + if (si->src_reg == treg || si->dst_reg == treg) + --treg; + if (si->src_reg == treg || si->dst_reg == treg) + --treg; + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg, + offsetof(struct bpf_sockopt_kern, tmp_reg)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task), + treg, si->dst_reg, + offsetof(struct bpf_sockopt_kern, current_task)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx), + treg, treg, + offsetof(struct task_struct, bpf_ctx)); + *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval), + treg, si->src_reg, + offsetof(struct bpf_cg_run_ctx, retval)); + *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg, + offsetof(struct bpf_sockopt_kern, tmp_reg)); + } else { + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sockopt_kern, current_task)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx), + si->dst_reg, si->dst_reg, + offsetof(struct task_struct, bpf_ctx)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval), + si->dst_reg, si->dst_reg, + offsetof(struct bpf_cg_run_ctx, retval)); + } break; case offsetof(struct bpf_sockopt, optval): *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval); -- cgit v1.2.3 From b44123b4a3dcad4664d3a0f72c011ffd4c9c4d93 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Thu, 16 Dec 2021 02:04:27 +0000 
Subject: bpf: Add cgroup helpers bpf_{get,set}_retval to get/set syscall return value The helpers continue to use int for retval because all the hooks are int-returning rather than long-returning. The return value of bpf_set_retval is int for future-proofing, in case in the future there may be errors trying to set the retval. After the previous patch, if a program rejects a syscall by returning 0, an -EPERM will be generated no matter if the retval is already set to -err. This patch changes that so the -EPERM is forced only if retval is not already -err. This is because we want to support, for example, invoking bpf_set_retval(-EINVAL) and returning 0, and have the syscall return value be -EINVAL not -EPERM. For BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY, the prior behavior is that, if the return value is NET_XMIT_DROP, the packet is silently dropped. We preserve this behavior for backward compatibility reasons, so even if an errno is set, the errno is not returned to the caller. However, setting a non-err to retval cannot propagate so this is not allowed and we return a -EFAULT in that case. 
Signed-off-by: YiFei Zhu Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/r/b4013fd5d16bed0b01977c1fafdeae12e1de61fb.1639619851.git.zhuyifei@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 10 ++++++---- include/uapi/linux/bpf.h | 18 ++++++++++++++++++ kernel/bpf/cgroup.c | 38 +++++++++++++++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 18 ++++++++++++++++++ 4 files changed, 79 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7b0c11f414d0..dce54eb0aae8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1299,7 +1299,7 @@ BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, while ((prog = READ_ONCE(item->prog))) { run_ctx.prog_item = item; func_ret = run_prog(prog, ctx); - if (!(func_ret & 1)) + if (!(func_ret & 1) && !IS_ERR_VALUE((long)run_ctx.retval)) run_ctx.retval = -EPERM; *(ret_flags) |= (func_ret >> 1); item++; @@ -1329,7 +1329,7 @@ BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); while ((prog = READ_ONCE(item->prog))) { run_ctx.prog_item = item; - if (!run_prog(prog, ctx)) + if (!run_prog(prog, ctx) && !IS_ERR_VALUE((long)run_ctx.retval)) run_ctx.retval = -EPERM; item++; } @@ -1389,7 +1389,7 @@ out: * 0: NET_XMIT_SUCCESS skb should be transmitted * 1: NET_XMIT_DROP skb should be dropped and cn * 2: NET_XMIT_CN skb should be transmitted and cn - * 3: -EPERM skb should be dropped + * 3: -err skb should be dropped */ #define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func) \ ({ \ @@ -1398,10 +1398,12 @@ out: u32 _ret; \ _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, 0, &_flags); \ _cn = _flags & BPF_RET_SET_CN; \ + if (_ret && !IS_ERR_VALUE((long)_ret)) \ + _ret = -EFAULT; \ if (!_ret) \ _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ else \ - _ret = (_cn ? NET_XMIT_DROP : -EPERM); \ + _ret = (_cn ? 
NET_XMIT_DROP : _ret); \ _ret; \ }) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a9c96c21330a..fe2272defcd9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5033,6 +5033,22 @@ union bpf_attr { * * Return * The number of arguments of the traced function. + * + * int bpf_get_retval(void) + * Description + * Get the syscall's return value that will be returned to userspace. + * + * This helper is currently supported by cgroup programs only. + * Return + * The syscall's return value. + * + * int bpf_set_retval(int retval) + * Description + * Set the syscall's return value that will be returned to userspace. + * + * This helper is currently supported by cgroup programs only. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5221,6 +5237,8 @@ union bpf_attr { FN(get_func_arg), \ FN(get_func_ret), \ FN(get_func_arg_cnt), \ + FN(get_retval), \ + FN(set_retval), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index b6fad0bbf5a7..279ebbed75a5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1044,7 +1044,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr * NET_XMIT_CN (2) - continue with packet output and notify TCP * to call cwr - * -EPERM - drop packet + * -err - drop packet * * For ingress packets, this function will return -EPERM if any * attached program was found and if it returned != 1 during execution. 
@@ -1080,6 +1080,8 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, } else { ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb, 0); + if (ret && !IS_ERR_VALUE((long)ret)) + ret = -EFAULT; } bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); @@ -1205,6 +1207,36 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, return ret; } +BPF_CALL_0(bpf_get_retval) +{ + struct bpf_cg_run_ctx *ctx = + container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); + + return ctx->retval; +} + +static const struct bpf_func_proto bpf_get_retval_proto = { + .func = bpf_get_retval, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +BPF_CALL_1(bpf_set_retval, int, retval) +{ + struct bpf_cg_run_ctx *ctx = + container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); + + ctx->retval = retval; + return 0; +} + +static const struct bpf_func_proto bpf_set_retval_proto = { + .func = bpf_set_retval, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1217,6 +1249,10 @@ cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; + case BPF_FUNC_get_retval: + return &bpf_get_retval_proto; + case BPF_FUNC_set_retval: + return &bpf_set_retval_proto; default: return bpf_base_func_proto(func_id); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a9c96c21330a..fe2272defcd9 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5033,6 +5033,22 @@ union bpf_attr { * * Return * The number of arguments of the traced function. + * + * int bpf_get_retval(void) + * Description + * Get the syscall's return value that will be returned to userspace. 
+ * + * This helper is currently supported by cgroup programs only. + * Return + * The syscall's return value. + * + * int bpf_set_retval(int retval) + * Description + * Set the syscall's return value that will be returned to userspace. + * + * This helper is currently supported by cgroup programs only. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5221,6 +5237,8 @@ union bpf_attr { FN(get_func_arg), \ FN(get_func_ret), \ FN(get_func_arg_cnt), \ + FN(get_retval), \ + FN(set_retval), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 748cd5729ac7421091316e32dcdffb0578563880 Mon Sep 17 00:00:00 2001 From: Di Zhu Date: Wed, 19 Jan 2022 09:40:04 +0800 Subject: bpf: support BPF_PROG_QUERY for progs attached to sockmap Right now there is no way to query whether BPF programs are attached to a sockmap or not. We can use the standard interface in libbpf to query, such as: bpf_prog_query(mapFd, BPF_SK_SKB_STREAM_PARSER, 0, NULL, ...); where mapFd is the fd of the sockmap.
Signed-off-by: Di Zhu Acked-by: Yonghong Song Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20220119014005.1209-1-zhudi2@huawei.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 9 ++++++ kernel/bpf/syscall.c | 5 ++++ net/core/sock_map.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 84 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dce54eb0aae8..80e3387ea3af 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2069,6 +2069,9 @@ int bpf_prog_test_run_syscall(struct bpf_prog *prog, int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); +int sock_map_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr); + void sock_map_unhash(struct sock *sk); void sock_map_close(struct sock *sk, long timeout); #else @@ -2122,6 +2125,12 @@ static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void { return -EOPNOTSUPP; } + +static inline int sock_map_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EINVAL; +} #endif /* CONFIG_BPF_SYSCALL */ #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index fa4505f9b611..9e0631f091a6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3318,6 +3318,11 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_FLOW_DISSECTOR: case BPF_SK_LOOKUP: return netns_bpf_prog_query(attr, uattr); + case BPF_SK_SKB_STREAM_PARSER: + case BPF_SK_SKB_STREAM_VERDICT: + case BPF_SK_MSG_VERDICT: + case BPF_SK_SKB_VERDICT: + return sock_map_bpf_prog_query(attr, uattr); default: return -EINVAL; } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 1827669eedd6..2d213c4011db 100644 --- 
a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1416,38 +1416,50 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) return NULL; } -static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which) +static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog, + u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); - struct bpf_prog **pprog; if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: - pprog = &progs->msg_parser; + *pprog = &progs->msg_parser; break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: - pprog = &progs->stream_parser; + *pprog = &progs->stream_parser; break; #endif case BPF_SK_SKB_STREAM_VERDICT: if (progs->skb_verdict) return -EBUSY; - pprog = &progs->stream_verdict; + *pprog = &progs->stream_verdict; break; case BPF_SK_SKB_VERDICT: if (progs->stream_verdict) return -EBUSY; - pprog = &progs->skb_verdict; + *pprog = &progs->skb_verdict; break; default: return -EOPNOTSUPP; } + return 0; +} + +static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, u32 which) +{ + struct bpf_prog **pprog; + int ret; + + ret = sock_map_prog_lookup(map, &pprog, which); + if (ret) + return ret; + if (old) return psock_replace_prog(pprog, prog, old); @@ -1455,6 +1467,57 @@ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, return 0; } +int sock_map_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + u32 prog_cnt = 0, flags = 0, ufd = attr->target_fd; + struct bpf_prog **pprog; + struct bpf_prog *prog; + struct bpf_map *map; + struct fd f; + u32 id = 0; + int ret; + + if (attr->query.query_flags) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + rcu_read_lock(); + + ret = sock_map_prog_lookup(map, &pprog, 
attr->query.attach_type); + if (ret) + goto end; + + prog = *pprog; + prog_cnt = !prog ? 0 : 1; + + if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) + goto end; + + /* we do not hold the refcnt, the bpf prog may be released + * asynchronously and the id would be set to 0. + */ + id = data_race(prog->aux->id); + if (id == 0) + prog_cnt = 0; + +end: + rcu_read_unlock(); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)) || + (id != 0 && copy_to_user(prog_ids, &id, sizeof(u32))) || + copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) + ret = -EFAULT; + + fdput(f); + return ret; +} + static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link) { switch (link->map->map_type) { -- cgit v1.2.3 From c2f2cdbeffda7b153c19e0f3d73149c41026c0db Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 21 Jan 2022 11:09:52 +0100 Subject: bpf: introduce BPF_F_XDP_HAS_FRAGS flag in prog_flags loading the ebpf program Introduce BPF_F_XDP_HAS_FRAGS and the related field in bpf_prog_aux in order to notify the driver that the loaded program supports xdp frags.
Acked-by: Toke Hoiland-Jorgensen Acked-by: John Fastabend Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/db2e8075b7032a356003f407d1b0deb99adaa0ed.1642758637.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 5 +++++ kernel/bpf/syscall.c | 4 +++- tools/include/uapi/linux/bpf.h | 5 +++++ 4 files changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 80e3387ea3af..e93ed028a030 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -933,6 +933,7 @@ struct bpf_prog_aux { bool func_proto_unreliable; bool sleepable; bool tail_call_reachable; + bool xdp_has_frags; struct hlist_node tramp_hlist; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fe2272defcd9..945649c67e03 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1113,6 +1113,11 @@ enum bpf_link_type { */ #define BPF_F_SLEEPABLE (1U << 4) +/* If BPF_F_XDP_HAS_FRAGS is used in BPF_PROG_LOAD command, the loaded program + * fully support xdp frags. 
+ */ +#define BPF_F_XDP_HAS_FRAGS (1U << 5) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * the following extensions: * diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9e0631f091a6..f29090643c6e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2217,7 +2217,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) BPF_F_ANY_ALIGNMENT | BPF_F_TEST_STATE_FREQ | BPF_F_SLEEPABLE | - BPF_F_TEST_RND_HI32)) + BPF_F_TEST_RND_HI32 | + BPF_F_XDP_HAS_FRAGS)) return -EINVAL; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && @@ -2303,6 +2304,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) prog->aux->dst_prog = dst_prog; prog->aux->offload_requested = !!attr->prog_ifindex; prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; + prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; err = security_bpf_prog_alloc(prog->aux); if (err) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index fe2272defcd9..945649c67e03 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1113,6 +1113,11 @@ enum bpf_link_type { */ #define BPF_F_SLEEPABLE (1U << 4) +/* If BPF_F_XDP_HAS_FRAGS is used in BPF_PROG_LOAD command, the loaded program + * fully support xdp frags. 
+ */ +#define BPF_F_XDP_HAS_FRAGS (1U << 5) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * the following extensions: * -- cgit v1.2.3 From d99173027d6803430fd60e61aab3006644e18628 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Fri, 21 Jan 2022 11:09:56 +0100 Subject: bpf: add frags support to xdp copy helpers This patch adds support for frags for the following helpers: - bpf_xdp_output() - bpf_perf_event_output() Acked-by: Toke Hoiland-Jorgensen Acked-by: John Fastabend Acked-by: Jakub Kicinski Signed-off-by: Eelco Chaudron Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/340b4a99cdc24337b40eaf8bb597f9f9e7b0373e.1642758637.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 3 + net/core/filter.c | 57 ++++++++++- .../testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c | 111 +++++++++++++++------ .../testing/selftests/bpf/progs/test_xdp_bpf2bpf.c | 2 +- 4 files changed, 137 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 21aa30644219..06a9e220069e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1562,6 +1562,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { extern const struct bpf_func_proto bpf_skb_output_proto; extern const struct bpf_func_proto bpf_xdp_output_proto; +extern const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto; BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags) @@ -1661,6 +1662,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_from_file_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_ptr_cookie_proto; + case BPF_FUNC_xdp_get_buff_len: + return &bpf_xdp_get_buff_len_trace_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? 
diff --git a/net/core/filter.c b/net/core/filter.c index 70e5874f19c3..e4ce138bf925 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3796,6 +3796,15 @@ static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BTF_ID_LIST_SINGLE(bpf_xdp_get_buff_len_bpf_ids, struct, xdp_buff) + +const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto = { + .func = bpf_xdp_get_buff_len, + .gpl_only = false, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_xdp_get_buff_len_bpf_ids[0], +}; + static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) { return xdp_data_meta_unsupported(xdp) ? 0 : @@ -4668,10 +4677,48 @@ static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = { }; #endif -static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, +static unsigned long bpf_xdp_copy(void *dst_buff, const void *ctx, unsigned long off, unsigned long len) { - memcpy(dst_buff, src_buff + off, len); + struct xdp_buff *xdp = (struct xdp_buff *)ctx; + unsigned long ptr_len, ptr_off = 0; + skb_frag_t *next_frag, *end_frag; + struct skb_shared_info *sinfo; + u8 *ptr_buf; + + if (likely(xdp->data_end - xdp->data >= off + len)) { + memcpy(dst_buff, xdp->data + off, len); + return 0; + } + + sinfo = xdp_get_shared_info_from_buff(xdp); + end_frag = &sinfo->frags[sinfo->nr_frags]; + next_frag = &sinfo->frags[0]; + + ptr_len = xdp->data_end - xdp->data; + ptr_buf = xdp->data; + + while (true) { + if (off < ptr_off + ptr_len) { + unsigned long copy_off = off - ptr_off; + unsigned long copy_len = min(len, ptr_len - copy_off); + + memcpy(dst_buff, ptr_buf + copy_off, copy_len); + + off += copy_len; + len -= copy_len; + dst_buff += copy_len; + } + + if (!len || next_frag == end_frag) + break; + + ptr_off += ptr_len; + ptr_buf = skb_frag_address(next_frag); + ptr_len = skb_frag_size(next_frag); + next_frag++; + } + return 0; } @@ -4682,11 +4729,11 @@ BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct 
bpf_map *, map, if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; - if (unlikely(!xdp || - xdp_size > (unsigned long)(xdp->data_end - xdp->data))) + + if (unlikely(!xdp || xdp_size > xdp_get_buff_len(xdp))) return -EFAULT; - return bpf_event_output(map, flags, meta, meta_size, xdp->data, + return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size, bpf_xdp_copy); } diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c index 500a302cb3e9..9c395ea680c6 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c @@ -10,28 +10,97 @@ struct meta { int pkt_len; }; +struct test_ctx_s { + bool passed; + int pkt_size; +}; + +struct test_ctx_s test_ctx; + static void on_sample(void *ctx, int cpu, void *data, __u32 size) { struct meta *meta = (struct meta *)data; struct ipv4_packet *trace_pkt_v4 = data + sizeof(*meta); + unsigned char *raw_pkt = data + sizeof(*meta); + struct test_ctx_s *tst_ctx = ctx; ASSERT_GE(size, sizeof(pkt_v4) + sizeof(*meta), "check_size"); ASSERT_EQ(meta->ifindex, if_nametoindex("lo"), "check_meta_ifindex"); - ASSERT_EQ(meta->pkt_len, sizeof(pkt_v4), "check_meta_pkt_len"); + ASSERT_EQ(meta->pkt_len, tst_ctx->pkt_size, "check_meta_pkt_len"); ASSERT_EQ(memcmp(trace_pkt_v4, &pkt_v4, sizeof(pkt_v4)), 0, "check_packet_content"); - *(bool *)ctx = true; + if (meta->pkt_len > sizeof(pkt_v4)) { + for (int i = 0; i < meta->pkt_len - sizeof(pkt_v4); i++) + ASSERT_EQ(raw_pkt[i + sizeof(pkt_v4)], (unsigned char)i, + "check_packet_content"); + } + + tst_ctx->passed = true; } -void test_xdp_bpf2bpf(void) +#define BUF_SZ 9000 + +static void run_xdp_bpf2bpf_pkt_size(int pkt_fd, struct perf_buffer *pb, + struct test_xdp_bpf2bpf *ftrace_skel, + int pkt_size) { __u32 duration = 0, retval, size; - char buf[128]; + __u8 *buf, *buf_in; + int err; + + if (!ASSERT_LE(pkt_size, BUF_SZ, "pkt_size") || + 
!ASSERT_GE(pkt_size, sizeof(pkt_v4), "pkt_size")) + return; + + buf_in = malloc(BUF_SZ); + if (!ASSERT_OK_PTR(buf_in, "buf_in malloc()")) + return; + + buf = malloc(BUF_SZ); + if (!ASSERT_OK_PTR(buf, "buf malloc()")) { + free(buf_in); + return; + } + + test_ctx.passed = false; + test_ctx.pkt_size = pkt_size; + + memcpy(buf_in, &pkt_v4, sizeof(pkt_v4)); + if (pkt_size > sizeof(pkt_v4)) { + for (int i = 0; i < (pkt_size - sizeof(pkt_v4)); i++) + buf_in[i + sizeof(pkt_v4)] = i; + } + + /* Run test program */ + err = bpf_prog_test_run(pkt_fd, 1, buf_in, pkt_size, + buf, &size, &retval, &duration); + + ASSERT_OK(err, "ipv4"); + ASSERT_EQ(retval, XDP_PASS, "ipv4 retval"); + ASSERT_EQ(size, pkt_size, "ipv4 size"); + + /* Make sure bpf_xdp_output() was triggered and it sent the expected + * data to the perf ring buffer. + */ + err = perf_buffer__poll(pb, 100); + + ASSERT_GE(err, 0, "perf_buffer__poll"); + ASSERT_TRUE(test_ctx.passed, "test passed"); + /* Verify test results */ + ASSERT_EQ(ftrace_skel->bss->test_result_fentry, if_nametoindex("lo"), + "fentry result"); + ASSERT_EQ(ftrace_skel->bss->test_result_fexit, XDP_PASS, "fexit result"); + + free(buf); + free(buf_in); +} + +void test_xdp_bpf2bpf(void) +{ int err, pkt_fd, map_fd; - bool passed = false; - struct iphdr iph; - struct iptnl_info value4 = {.family = AF_INET}; + int pkt_sizes[] = {sizeof(pkt_v4), 1024, 4100, 8200}; + struct iptnl_info value4 = {.family = AF_INET6}; struct test_xdp *pkt_skel = NULL; struct test_xdp_bpf2bpf *ftrace_skel = NULL; struct vip key4 = {.protocol = 6, .family = AF_INET}; @@ -73,32 +142,14 @@ void test_xdp_bpf2bpf(void) goto out; /* Set up perf buffer */ - pb = perf_buffer__new(bpf_map__fd(ftrace_skel->maps.perf_buf_map), 1, - on_sample, NULL, &passed, NULL); + pb = perf_buffer__new(bpf_map__fd(ftrace_skel->maps.perf_buf_map), 8, + on_sample, NULL, &test_ctx, NULL); if (!ASSERT_OK_PTR(pb, "perf_buf__new")) goto out; - /* Run test program */ - err = bpf_prog_test_run(pkt_fd, 1, &pkt_v4, 
sizeof(pkt_v4), - buf, &size, &retval, &duration); - memcpy(&iph, buf + sizeof(struct ethhdr), sizeof(iph)); - - ASSERT_OK(err, "ipv4"); - ASSERT_EQ(retval, XDP_TX, "ipv4 retval"); - ASSERT_EQ(size, 74, "ipv4 size"); - ASSERT_EQ(iph.protocol, IPPROTO_IPIP, "ipv4 proto"); - - /* Make sure bpf_xdp_output() was triggered and it sent the expected - * data to the perf ring buffer. - */ - err = perf_buffer__poll(pb, 100); - - ASSERT_GE(err, 0, "perf_buffer__poll"); - ASSERT_TRUE(passed, "test passed"); - /* Verify test results */ - ASSERT_EQ(ftrace_skel->bss->test_result_fentry, if_nametoindex("lo"), - "fentry result"); - ASSERT_EQ(ftrace_skel->bss->test_result_fexit, XDP_TX, "fexit result"); + for (int i = 0; i < ARRAY_SIZE(pkt_sizes); i++) + run_xdp_bpf2bpf_pkt_size(pkt_fd, pb, ftrace_skel, + pkt_sizes[i]); out: perf_buffer__free(pb); test_xdp__destroy(pkt_skel); diff --git a/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c b/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c index 58cf4345f5cc..3379d303f41a 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c @@ -49,7 +49,7 @@ int BPF_PROG(trace_on_entry, struct xdp_buff *xdp) void *data = (void *)(long)xdp->data; meta.ifindex = xdp->rxq->dev->ifindex; - meta.pkt_len = data_end - data; + meta.pkt_len = bpf_xdp_get_buff_len((struct xdp_md *)xdp); bpf_xdp_output(xdp, &perf_buf_map, ((__u64) meta.pkt_len << 32) | BPF_F_CURRENT_CPU, -- cgit v1.2.3 From f45d5b6ce2e835834c94b8b700787984f02cd662 Mon Sep 17 00:00:00 2001 From: Toke Hoiland-Jorgensen Date: Fri, 21 Jan 2022 11:10:02 +0100 Subject: bpf: generalise tail call map compatibility check The check for tail call map compatibility ensures that tail calls only happen between maps of the same type. 
To ensure backwards compatibility for XDP frags we need a similar type of check for cpumap and devmap programs, so move the state from bpf_array_aux into bpf_map, add xdp_has_frags to the check, and apply the same check to cpumap and devmap. Acked-by: John Fastabend Co-developed-by: Lorenzo Bianconi Signed-off-by: Lorenzo Bianconi Signed-off-by: Toke Hoiland-Jorgensen Link: https://lore.kernel.org/r/f19fd97c0328a39927f3ad03e1ca6b43fd53cdfd.1642758637.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 30 +++++++++++++++++++----------- kernel/bpf/arraymap.c | 4 +--- kernel/bpf/core.c | 28 ++++++++++++++-------------- kernel/bpf/cpumap.c | 8 +++++--- kernel/bpf/devmap.c | 3 ++- kernel/bpf/syscall.c | 15 +++++++-------- 6 files changed, 48 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e93ed028a030..e8ec8d2f2fe3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -194,6 +194,17 @@ struct bpf_map { struct work_struct work; struct mutex freeze_mutex; atomic64_t writecnt; + /* 'Ownership' of program-containing map is claimed by the first program + * that is going to use this map or by the first program which FD is + * stored in the map to make sure that all callers and callees have the + * same prog type, JITed flag and xdp_has_frags flag. + */ + struct { + spinlock_t lock; + enum bpf_prog_type type; + bool jited; + bool xdp_has_frags; + } owner; }; static inline bool map_value_has_spin_lock(const struct bpf_map *map) @@ -994,16 +1005,6 @@ struct bpf_prog_aux { }; struct bpf_array_aux { - /* 'Ownership' of prog array is claimed by the first program that - * is going to use this map or by the first program which FD is - * stored in the map to make sure that all callers and callees have - * the same prog type and JITed flag. 
- */ - struct { - spinlock_t lock; - enum bpf_prog_type type; - bool jited; - } owner; /* Programs with direct jumps into programs part of this array. */ struct list_head poke_progs; struct bpf_map *map; @@ -1178,7 +1179,14 @@ struct bpf_event_entry { struct rcu_head rcu; }; -bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); +static inline bool map_type_contains_progs(struct bpf_map *map) +{ + return map->map_type == BPF_MAP_TYPE_PROG_ARRAY || + map->map_type == BPF_MAP_TYPE_DEVMAP || + map->map_type == BPF_MAP_TYPE_CPUMAP; +} + +bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp); int bpf_prog_calc_tag(struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index c7a5be3bf8be..7f145aefbff8 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -837,13 +837,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key) static void *prog_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { - struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_prog *prog = bpf_prog_get(fd); if (IS_ERR(prog)) return prog; - if (!bpf_prog_array_compatible(array, prog)) { + if (!bpf_prog_map_compatible(map, prog)) { bpf_prog_put(prog); return ERR_PTR(-EINVAL); } @@ -1071,7 +1070,6 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) INIT_WORK(&aux->work, prog_array_map_clear_deferred); INIT_LIST_HEAD(&aux->poke_progs); mutex_init(&aux->poke_mutex); - spin_lock_init(&aux->owner.lock); map = array_map_alloc(attr); if (IS_ERR(map)) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index de3e5bc6781f..0a1cfd8544b9 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1829,28 +1829,30 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, } #endif -bool bpf_prog_array_compatible(struct bpf_array *array, - const struct bpf_prog *fp) +bool 
bpf_prog_map_compatible(struct bpf_map *map, + const struct bpf_prog *fp) { bool ret; if (fp->kprobe_override) return false; - spin_lock(&array->aux->owner.lock); - - if (!array->aux->owner.type) { + spin_lock(&map->owner.lock); + if (!map->owner.type) { /* There's no owner yet where we could check for * compatibility. */ - array->aux->owner.type = fp->type; - array->aux->owner.jited = fp->jited; + map->owner.type = fp->type; + map->owner.jited = fp->jited; + map->owner.xdp_has_frags = fp->aux->xdp_has_frags; ret = true; } else { - ret = array->aux->owner.type == fp->type && - array->aux->owner.jited == fp->jited; + ret = map->owner.type == fp->type && + map->owner.jited == fp->jited && + map->owner.xdp_has_frags == fp->aux->xdp_has_frags; } - spin_unlock(&array->aux->owner.lock); + spin_unlock(&map->owner.lock); + return ret; } @@ -1862,13 +1864,11 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) mutex_lock(&aux->used_maps_mutex); for (i = 0; i < aux->used_map_cnt; i++) { struct bpf_map *map = aux->used_maps[i]; - struct bpf_array *array; - if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) + if (!map_type_contains_progs(map)) continue; - array = container_of(map, struct bpf_array, map); - if (!bpf_prog_array_compatible(array, fp)) { + if (!bpf_prog_map_compatible(map, fp)) { ret = -EINVAL; goto out; } diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index b3e6b9422238..650e5d21f90d 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -397,7 +397,8 @@ static int cpu_map_kthread_run(void *data) return 0; } -static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd) +static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, + struct bpf_map *map, int fd) { struct bpf_prog *prog; @@ -405,7 +406,8 @@ static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd) if (IS_ERR(prog)) return PTR_ERR(prog); - if (prog->expected_attach_type != BPF_XDP_CPUMAP) { + if (prog->expected_attach_type != 
BPF_XDP_CPUMAP || + !bpf_prog_map_compatible(map, prog)) { bpf_prog_put(prog); return -EINVAL; } @@ -457,7 +459,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, rcpu->map_id = map->id; rcpu->value.qsize = value->qsize; - if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd)) + if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd)) goto free_ptr_ring; /* Setup kthread */ diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index fe019dbdb3f0..038f6d7a83e4 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -858,7 +858,8 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, BPF_PROG_TYPE_XDP, false); if (IS_ERR(prog)) goto err_put_dev; - if (prog->expected_attach_type != BPF_XDP_DEVMAP) + if (prog->expected_attach_type != BPF_XDP_DEVMAP || + !bpf_prog_map_compatible(&dtab->map, prog)) goto err_put_prog; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f29090643c6e..72ce1edde950 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -556,16 +556,14 @@ static unsigned long bpf_map_memory_footprint(const struct bpf_map *map) static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { - const struct bpf_map *map = filp->private_data; - const struct bpf_array *array; + struct bpf_map *map = filp->private_data; u32 type = 0, jited = 0; - if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { - array = container_of(map, struct bpf_array, map); - spin_lock(&array->aux->owner.lock); - type = array->aux->owner.type; - jited = array->aux->owner.jited; - spin_unlock(&array->aux->owner.lock); + if (map_type_contains_progs(map)) { + spin_lock(&map->owner.lock); + type = map->owner.type; + jited = map->owner.jited; + spin_unlock(&map->owner.lock); } seq_printf(m, @@ -874,6 +872,7 @@ static int map_create(union bpf_attr *attr) atomic64_set(&map->refcnt, 1); atomic64_set(&map->usercnt, 1); mutex_init(&map->freeze_mutex); + spin_lock_init(&map->owner.lock); map->spin_lock_off = -EINVAL; 
map->timer_off = -EINVAL; -- cgit v1.2.3 From b77fb25dcb342788d72ad7533163c34b8b823a1d Mon Sep 17 00:00:00 2001 From: Kenny Yu Date: Mon, 24 Jan 2022 10:54:00 -0800 Subject: bpf: Add support for bpf iterator programs to use sleepable helpers This patch allows bpf iterator programs to use sleepable helpers by changing `bpf_iter_run_prog` to use the appropriate synchronization. With sleepable bpf iterator programs, we can no longer use `rcu_read_lock()` and must use `rcu_read_lock_trace()` instead to protect the bpf program. Signed-off-by: Kenny Yu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220124185403.468466-2-kennyyu@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_iter.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index b7aef5b3416d..110029ede71e 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -5,6 +5,7 @@ #include #include #include +#include struct bpf_iter_target_info { struct list_head list; @@ -684,11 +685,20 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) { int ret; - rcu_read_lock(); - migrate_disable(); - ret = bpf_prog_run(prog, ctx); - migrate_enable(); - rcu_read_unlock(); + if (prog->aux->sleepable) { + rcu_read_lock_trace(); + migrate_disable(); + might_fault(); + ret = bpf_prog_run(prog, ctx); + migrate_enable(); + rcu_read_unlock_trace(); + } else { + rcu_read_lock(); + migrate_disable(); + ret = bpf_prog_run(prog, ctx); + migrate_enable(); + rcu_read_unlock(); + } /* bpf program can only return 0 or 1: * 0 : okay -- cgit v1.2.3 From 376040e47334c6dc6a939a32197acceb00fe4acf Mon Sep 17 00:00:00 2001 From: Kenny Yu Date: Mon, 24 Jan 2022 10:54:01 -0800 Subject: bpf: Add bpf_copy_from_user_task() helper This adds a helper for bpf programs to read the memory of other tasks. 
As an example use case at Meta, we are using a bpf task iterator program and this new helper to print C++ async stack traces for all threads of a given process. Signed-off-by: Kenny Yu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220124185403.468466-3-kennyyu@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 11 +++++++++++ kernel/bpf/helpers.c | 34 ++++++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 11 +++++++++++ 5 files changed, 59 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8c92c974bd12..394305a5e02f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2243,6 +2243,7 @@ extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto; extern const struct bpf_func_proto bpf_find_vma_proto; extern const struct bpf_func_proto bpf_loop_proto; extern const struct bpf_func_proto bpf_strncmp_proto; +extern const struct bpf_func_proto bpf_copy_from_user_task_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 16a7574292a5..4a2f7041ebae 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5076,6 +5076,16 @@ union bpf_attr { * associated to *xdp_md*, at *offset*. * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_copy_from_user_task(void *dst, u32 size, const void *user_ptr, struct task_struct *tsk, u64 flags) + * Description + * Read *size* bytes from user space address *user_ptr* in *tsk*'s + * address space, and stores the data in *dst*. *flags* is not + * used yet and is provided for future extensibility. This helper + * can only be used by sleepable programs. + * Return + * 0 on success, or a negative error in case of failure. On error + * *dst* buffer is zeroed out. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5269,6 +5279,7 @@ union bpf_attr { FN(xdp_get_buff_len), \ FN(xdp_load_bytes), \ FN(xdp_store_bytes), \ + FN(copy_from_user_task), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 01cfdf40c838..ed2780b76cc1 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "../../lib/kstrtox.h" @@ -671,6 +672,39 @@ const struct bpf_func_proto bpf_copy_from_user_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size, + const void __user *, user_ptr, struct task_struct *, tsk, u64, flags) +{ + int ret; + + /* flags is not used yet */ + if (unlikely(flags)) + return -EINVAL; + + if (unlikely(!size)) + return 0; + + ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0); + if (ret == size) + return 0; + + memset(dst, 0, size); + /* Return -EFAULT for partial read */ + return ret < 0 ? ret : -EFAULT; +} + +const struct bpf_func_proto bpf_copy_from_user_task_proto = { + .func = bpf_copy_from_user_task, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_BTF_ID, + .arg4_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], + .arg5_type = ARG_ANYTHING +}; + BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) { if (cpu >= nr_cpu_ids) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 06a9e220069e..a2024ba32a20 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1235,6 +1235,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_task_stack_proto; case BPF_FUNC_copy_from_user: return prog->aux->sleepable ? 
&bpf_copy_from_user_proto : NULL; + case BPF_FUNC_copy_from_user_task: + return prog->aux->sleepable ? &bpf_copy_from_user_task_proto : NULL; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; case BPF_FUNC_per_cpu_ptr: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 16a7574292a5..4a2f7041ebae 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5076,6 +5076,16 @@ union bpf_attr { * associated to *xdp_md*, at *offset*. * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_copy_from_user_task(void *dst, u32 size, const void *user_ptr, struct task_struct *tsk, u64 flags) + * Description + * Read *size* bytes from user space address *user_ptr* in *tsk*'s + * address space, and stores the data in *dst*. *flags* is not + * used yet and is provided for future extensibility. This helper + * can only be used by sleepable programs. + * Return + * 0 on success, or a negative error in case of failure. On error + * *dst* buffer is zeroed out. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5269,6 +5279,7 @@ union bpf_attr { FN(xdp_get_buff_len), \ FN(xdp_load_bytes), \ FN(xdp_store_bytes), \ + FN(copy_from_user_task), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 63ee956f69d8c181e5251c7ce58b84c1edec0f6a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 24 Jan 2022 20:20:51 -0800 Subject: bpf: Fix renaming task_getsecid_subj->current_getsecid_subj. The commit 6326948f940d missed renaming of task->current LSM hook in BTF_ID. 
Fix it to silence build warning: WARN: resolve_btfids: unresolved symbol bpf_lsm_task_getsecid_subj Fixes: 6326948f940d ("lsm: security_task_getsecid_subj() -> security_current_getsecid_subj()") Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_lsm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 06062370c3b8..9e4ecc990647 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -207,7 +207,7 @@ BTF_ID(func, bpf_lsm_socket_socketpair) BTF_ID(func, bpf_lsm_syslog) BTF_ID(func, bpf_lsm_task_alloc) -BTF_ID(func, bpf_lsm_task_getsecid_subj) +BTF_ID(func, bpf_lsm_current_getsecid_subj) BTF_ID(func, bpf_lsm_task_getsecid_obj) BTF_ID(func, bpf_lsm_task_prctl) BTF_ID(func, bpf_lsm_task_setscheduler) -- cgit v1.2.3 From f26d04331360d42dbd6b58448bd98e4edbfbe1c5 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Thu, 13 Jan 2022 18:54:38 -0500 Subject: audit: improve audit queue handling when "audit=1" on cmdline When an admin enables audit at early boot via the "audit=1" kernel command line the audit queue behavior is slightly different; the audit subsystem goes to greater lengths to avoid dropping records, which unfortunately can result in problems when the audit daemon is forcibly stopped for an extended period of time. This patch makes a number of changes designed to improve the audit queuing behavior so that leaving the audit daemon in a stopped state for an extended period does not cause a significant impact to the system. - kauditd_send_queue() is now limited to looping through the passed queue only once per call. This not only prevents the function from looping indefinitely when records are returned to the current queue, it also allows any recovery handling in kauditd_thread() to take place when kauditd_send_queue() returns. 
- Transient netlink send errors seen as -EAGAIN now cause the record to be returned to the retry queue instead of going to the hold queue. The intention of the hold queue is to store, perhaps for an extended period of time, the events which led up to the audit daemon going offline. The retry queue remains a temporary queue intended to protect against transient issues between the kernel and the audit daemon. - The retry queue is now limited by the audit_backlog_limit setting, the same as the other queues. This allows admins to bound the size of all of the audit queues on the system. - kauditd_rehold_skb() now returns records to the end of the hold queue to ensure ordering is preserved in the face of recent changes to kauditd_send_queue(). Cc: stable@vger.kernel.org Fixes: 5b52330bbfe63 ("audit: fix auditd/kernel connection state tracking") Fixes: f4b3ee3c85551 ("audit: improve robustness of the audit queue handling") Reported-by: Gaosheng Cui Tested-by: Gaosheng Cui Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 62 ++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index e4bbe2c70c26..7690c29d4ee4 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -541,20 +541,22 @@ static void kauditd_printk_skb(struct sk_buff *skb) /** * kauditd_rehold_skb - Handle a audit record send failure in the hold queue * @skb: audit record + * @error: error code (unused) * * Description: * This should only be used by the kauditd_thread when it fails to flush the * hold queue. 
*/ -static void kauditd_rehold_skb(struct sk_buff *skb) +static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error) { - /* put the record back in the queue at the same place */ - skb_queue_head(&audit_hold_queue, skb); + /* put the record back in the queue */ + skb_queue_tail(&audit_hold_queue, skb); } /** * kauditd_hold_skb - Queue an audit record, waiting for auditd * @skb: audit record + * @error: error code * * Description: * Queue the audit record, waiting for an instance of auditd. When this @@ -564,19 +566,31 @@ static void kauditd_rehold_skb(struct sk_buff *skb) * and queue it, if we have room. If we want to hold on to the record, but we * don't have room, record a record lost message. */ -static void kauditd_hold_skb(struct sk_buff *skb) +static void kauditd_hold_skb(struct sk_buff *skb, int error) { /* at this point it is uncertain if we will ever send this to auditd so * try to send the message via printk before we go any further */ kauditd_printk_skb(skb); /* can we just silently drop the message? 
*/ - if (!audit_default) { - kfree_skb(skb); - return; + if (!audit_default) + goto drop; + + /* the hold queue is only for when the daemon goes away completely, + * not -EAGAIN failures; if we are in a -EAGAIN state requeue the + * record on the retry queue unless it's full, in which case drop it + */ + if (error == -EAGAIN) { + if (!audit_backlog_limit || + skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { + skb_queue_tail(&audit_retry_queue, skb); + return; + } + audit_log_lost("kauditd retry queue overflow"); + goto drop; } - /* if we have room, queue the message */ + /* if we have room in the hold queue, queue the message */ if (!audit_backlog_limit || skb_queue_len(&audit_hold_queue) < audit_backlog_limit) { skb_queue_tail(&audit_hold_queue, skb); @@ -585,24 +599,32 @@ static void kauditd_hold_skb(struct sk_buff *skb) /* we have no other options - drop the message */ audit_log_lost("kauditd hold queue overflow"); +drop: kfree_skb(skb); } /** * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd * @skb: audit record + * @error: error code (unused) * * Description: * Not as serious as kauditd_hold_skb() as we still have a connected auditd, * but for some reason we are having problems sending it audit records so * queue the given record and attempt to resend. 
*/ -static void kauditd_retry_skb(struct sk_buff *skb) +static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error) { - /* NOTE: because records should only live in the retry queue for a - * short period of time, before either being sent or moved to the hold - * queue, we don't currently enforce a limit on this queue */ - skb_queue_tail(&audit_retry_queue, skb); + if (!audit_backlog_limit || + skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { + skb_queue_tail(&audit_retry_queue, skb); + return; + } + + /* we have to drop the record, send it via printk as a last effort */ + kauditd_printk_skb(skb); + audit_log_lost("kauditd retry queue overflow"); + kfree_skb(skb); } /** @@ -640,7 +662,7 @@ static void auditd_reset(const struct auditd_connection *ac) /* flush the retry queue to the hold queue, but don't touch the main * queue since we need to process that normally for multicast */ while ((skb = skb_dequeue(&audit_retry_queue))) - kauditd_hold_skb(skb); + kauditd_hold_skb(skb, -ECONNREFUSED); } /** @@ -714,16 +736,18 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, struct sk_buff_head *queue, unsigned int retry_limit, void (*skb_hook)(struct sk_buff *skb), - void (*err_hook)(struct sk_buff *skb)) + void (*err_hook)(struct sk_buff *skb, int error)) { int rc = 0; - struct sk_buff *skb; + struct sk_buff *skb = NULL; + struct sk_buff *skb_tail; unsigned int failed = 0; /* NOTE: kauditd_thread takes care of all our locking, we just use * the netlink info passed to us (e.g. sk and portid) */ - while ((skb = skb_dequeue(queue))) { + skb_tail = skb_peek_tail(queue); + while ((skb != skb_tail) && (skb = skb_dequeue(queue))) { /* call the skb_hook for each skb we touch */ if (skb_hook) (*skb_hook)(skb); @@ -731,7 +755,7 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, /* can we send to anyone via unicast? 
*/ if (!sk) { if (err_hook) - (*err_hook)(skb); + (*err_hook)(skb, -ECONNREFUSED); continue; } @@ -745,7 +769,7 @@ retry: rc == -ECONNREFUSED || rc == -EPERM) { sk = NULL; if (err_hook) - (*err_hook)(skb); + (*err_hook)(skb, rc); if (rc == -EAGAIN) rc = 0; /* continue to drain the queue */ -- cgit v1.2.3 From e204193b138af347fbbbe026e68cb3385112f387 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Mon, 13 Dec 2021 21:26:18 +0800 Subject: lockdep: Use memset_startat() helper in reinit_class() use memset_startat() helper to simplify the code, there is no functional change in this patch. Signed-off-by: Xiu Jianfeng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211213132618.105737-1-xiujianfeng@huawei.com --- kernel/locking/lockdep.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4a882f83aeb9..89b3df51fd98 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -6011,13 +6011,10 @@ static void zap_class(struct pending_free *pf, struct lock_class *class) static void reinit_class(struct lock_class *class) { - void *const p = class; - const unsigned int offset = offsetof(struct lock_class, key); - WARN_ON_ONCE(!class->lock_entry.next); WARN_ON_ONCE(!list_empty(&class->locks_after)); WARN_ON_ONCE(!list_empty(&class->locks_before)); - memset(p + offset, 0, sizeof(*class) - offset); + memset_startat(class, 0, key); WARN_ON_ONCE(!class->lock_entry.next); WARN_ON_ONCE(!list_empty(&class->locks_after)); WARN_ON_ONCE(!list_empty(&class->locks_before)); -- cgit v1.2.3 From 61cc4534b6550997c97a03759ab46b29d44c0017 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sun, 2 Jan 2022 21:35:58 -0500 Subject: locking/lockdep: Avoid potential access of invalid memory in lock_class It was found that reading /proc/lockdep after a lockdep splat may potentially cause an access to freed memory if lockdep_unregister_key() is called after the splat but 
before access to /proc/lockdep [1]. This is due to the fact that the graph_lock() call in lockdep_unregister_key() fails after the clearing of debug_locks by the splat process. After lockdep_unregister_key() is called, the lock_name may be freed but the corresponding lock_class structure still has a reference to it. That invalid memory pointer will then be accessed when /proc/lockdep is read by a user and a use-after-free (UAF) error will be reported if KASAN is enabled. To fix this problem, lockdep_unregister_key() is now modified to always search for a matching key irrespective of the debug_locks state and zap the corresponding lock class if a matching one is found. [1] https://lore.kernel.org/lkml/77f05c15-81b6-bddd-9650-80d5f23fe330@i-love.sakura.ne.jp/ Fixes: 8b39adbee805 ("locking/lockdep: Make lockdep_unregister_key() honor 'debug_locks' again") Reported-by: Tetsuo Handa Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Bart Van Assche Link: https://lkml.kernel.org/r/20220103023558.1377055-1-longman@redhat.com --- kernel/locking/lockdep.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 89b3df51fd98..2e6892ec3756 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -6287,7 +6287,13 @@ void lockdep_reset_lock(struct lockdep_map *lock) lockdep_reset_lock_reg(lock); } -/* Unregister a dynamically allocated key. */ +/* + * Unregister a dynamically allocated key. + * + * Unlike lockdep_register_key(), a search is always done to find a matching + * key irrespective of debug_locks to avoid potential invalid access to freed + * memory in lock_class entry.
+ */ void lockdep_unregister_key(struct lock_class_key *key) { struct hlist_head *hash_head = keyhashentry(key); @@ -6302,10 +6308,8 @@ void lockdep_unregister_key(struct lock_class_key *key) return; raw_local_irq_save(flags); - if (!graph_lock()) - goto out_irq; + lockdep_lock(); - pf = get_pending_free(); hlist_for_each_entry_rcu(k, hash_head, hash_entry) { if (k == key) { hlist_del_rcu(&k->hash_entry); @@ -6313,11 +6317,13 @@ void lockdep_unregister_key(struct lock_class_key *key) break; } } - WARN_ON_ONCE(!found); - __lockdep_free_key_range(pf, key, 1); - call_rcu_zapped(pf); - graph_unlock(); -out_irq: + WARN_ON_ONCE(!found && debug_locks); + if (found) { + pf = get_pending_free(); + __lockdep_free_key_range(pf, key, 1); + call_rcu_zapped(pf); + } + lockdep_unlock(); raw_local_irq_restore(flags); /* Wait until is_dynamic_key() has finished accessing k->hash_entry. */ -- cgit v1.2.3 From 0e3135d3bfa5dfb658145238d2bc723a8e30c3a3 Mon Sep 17 00:00:00 2001 From: He Fengqing Date: Sat, 22 Jan 2022 10:29:36 +0000 Subject: bpf: Fix possible race in inc_misses_counter It seems inc_misses_counter() suffers from same issue fixed in the commit d979617aa84d ("bpf: Fixes possible race in update_prog_stats() for 32bit arches"): As it can run while interrupts are enabled, it could be re-entered and the u64_stats syncp could be mangled. 
Fixes: 9ed9e9ba2337 ("bpf: Count the number of times recursion was prevented") Signed-off-by: He Fengqing Acked-by: John Fastabend Link: https://lore.kernel.org/r/20220122102936.1219518-1-hefengqing@huawei.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/trampoline.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 4b6974a195c1..5e7edf913060 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -550,11 +550,12 @@ static __always_inline u64 notrace bpf_prog_start_time(void) static void notrace inc_misses_counter(struct bpf_prog *prog) { struct bpf_prog_stats *stats; + unsigned int flags; stats = this_cpu_ptr(prog->stats); - u64_stats_update_begin(&stats->syncp); + flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->misses); - u64_stats_update_end(&stats->syncp); + u64_stats_update_end_irqrestore(&stats->syncp, flags); } /* The logic is similar to bpf_prog_run(), but with an explicit -- cgit v1.2.3 From c446fdacb10dcb3b9a9ed3b91d91e72d71d94b03 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 25 Jan 2022 16:13:40 -0800 Subject: bpf: fix register_btf_kfunc_id_set for !CONFIG_DEBUG_INFO_BTF Commit dee872e124e8 ("bpf: Populate kfunc BTF ID sets in struct btf") breaks loading of some modules when CONFIG_DEBUG_INFO_BTF is not set. register_btf_kfunc_id_set returns -ENOENT to the callers when there is no module btf. Let's return 0 (success) instead to let those modules work in !CONFIG_DEBUG_INFO_BTF cases. 
Acked-by: Kumar Kartikeya Dwivedi Fixes: dee872e124e8 ("bpf: Populate kfunc BTF ID sets in struct btf") Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220126001340.1573649-1-sdf@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a1c44c17ea9c..b2a248956100 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6740,8 +6740,19 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, int ret; btf = btf_get_module_btf(kset->owner); - if (IS_ERR_OR_NULL(btf)) - return btf ? PTR_ERR(btf) : -ENOENT; + if (!btf) { + if (!kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { + pr_err("missing vmlinux BTF, cannot register kfuncs\n"); + return -ENOENT; + } + if (kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) { + pr_err("missing module BTF, cannot register kfuncs\n"); + return -ENOENT; + } + return 0; + } + if (IS_ERR(btf)) + return PTR_ERR(btf); hook = bpf_prog_type_to_kfunc_hook(prog_type); ret = btf_populate_kfunc_set(btf, hook, kset); -- cgit v1.2.3 From f244b4dc53e520d4570b2610436aba0593ce6f55 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 21 Jan 2022 18:36:28 +0530 Subject: printk: ringbuffer: Improve prb_next_seq() performance prb_next_seq() always iterates from the first known sequence number. In the worst case, it might loop 8k times for 256kB buffer, 15k times for 512kB buffer, and 64k times for 2MB buffer. It was reported that polling and reading using syslog interface might occupy 50% of CPU. Speedup the search by storing @id of the last finalized descriptor. The loop is still needed because the @id is stored and read in the best effort way. An atomic variable is used to keep the @id consistent. But the stores and reads are not serialized against each other. The descriptor could get reused in the meantime. 
The related sequence number will be used only when it is still valid. An invalid value should be read _only_ when there is a flood of messages and the ringbuffer is rapidly reused. The performance is the least problem in this case. Reported-by: Chunlei Wang Signed-off-by: Mukesh Ojha Reviewed-by: John Ogness Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/1642770388-17327-1-git-send-email-quic_mojha@quicinc.com Link: https://lore.kernel.org/lkml/YXlddJxLh77DKfIO@alley/T/#m43062e8b2a17f8dbc8c6ccdb8851fb0dbaabbb14 --- kernel/printk/printk_ringbuffer.c | 52 +++++++++++++++++++++++++++++++++++---- kernel/printk/printk_ringbuffer.h | 2 ++ 2 files changed, 49 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 8a7b7362c0dd..2b7b6ddab4f7 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -474,8 +474,10 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, * state has been re-checked. A memcpy() for all of @desc * cannot be used because of the atomic_t @state_var field. 
*/ - memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos, - sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */ + if (desc_out) { + memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos, + sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */ + } if (seq_out) *seq_out = info->seq; /* also part of desc_read:C */ if (caller_id_out) @@ -528,7 +530,8 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */ d_state = get_desc_state(id, state_val); out: - atomic_long_set(&desc_out->state_var, state_val); + if (desc_out) + atomic_long_set(&desc_out->state_var, state_val); return d_state; } @@ -1449,6 +1452,9 @@ static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id) atomic_long_cmpxchg_relaxed(&d->state_var, prev_state_val, DESC_SV(id, desc_finalized)); /* LMM(desc_make_final:A) */ + + /* Best effort to remember the last finalized @id. */ + atomic_long_set(&desc_ring->last_finalized_id, id); } /** @@ -1657,7 +1663,12 @@ void prb_commit(struct prb_reserved_entry *e) */ void prb_final_commit(struct prb_reserved_entry *e) { + struct prb_desc_ring *desc_ring = &e->rb->desc_ring; + _prb_commit(e, desc_finalized); + + /* Best effort to remember the last finalized @id. */ + atomic_long_set(&desc_ring->last_finalized_id, e->id); } /* @@ -2005,9 +2016,39 @@ u64 prb_first_valid_seq(struct printk_ringbuffer *rb) */ u64 prb_next_seq(struct printk_ringbuffer *rb) { - u64 seq = 0; + struct prb_desc_ring *desc_ring = &rb->desc_ring; + enum desc_state d_state; + unsigned long id; + u64 seq; + + /* Check if the cached @id still points to a valid @seq. */ + id = atomic_long_read(&desc_ring->last_finalized_id); + d_state = desc_read(desc_ring, id, NULL, &seq, NULL); - /* Search forward from the oldest descriptor. */ + if (d_state == desc_finalized || d_state == desc_reusable) { + /* + * Begin searching after the last finalized record. 
+ * + * On 0, the search must begin at 0 because of hack#2 + * of the bootstrapping phase it is not known if a + * record at index 0 exists. + */ + if (seq != 0) + seq++; + } else { + /* + * The information about the last finalized sequence number + * has gone. It should happen only when there is a flood of + * new messages and the ringbuffer is rapidly recycled. + * Give up and start from the beginning. + */ + seq = 0; + } + + /* + * The information about the last finalized @seq might be inaccurate. + * Search forward to find the current one. + */ while (_prb_read_valid(rb, &seq, NULL, NULL)) seq++; @@ -2044,6 +2085,7 @@ void prb_init(struct printk_ringbuffer *rb, rb->desc_ring.infos = infos; atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits)); atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits)); + atomic_long_set(&rb->desc_ring.last_finalized_id, DESC0_ID(descbits)); rb->text_data_ring.size_bits = textbits; rb->text_data_ring.data = text_buf; diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index 73cc80e01cef..18cd25e489b8 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -75,6 +75,7 @@ struct prb_desc_ring { struct printk_info *infos; atomic_long_t head_id; atomic_long_t tail_id; + atomic_long_t last_finalized_id; }; /* @@ -258,6 +259,7 @@ static struct printk_ringbuffer name = { \ .infos = &_##name##_infos[0], \ .head_id = ATOMIC_INIT(DESC0_ID(descbits)), \ .tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \ + .last_finalized_id = ATOMIC_INIT(DESC0_ID(descbits)), \ }, \ .text_data_ring = { \ .size_bits = (avgtextbits) + (descbits), \ -- cgit v1.2.3 From dfcf2e017f5bb928094952d5d56d3566d3d07ba7 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 25 Jan 2022 16:20:01 +0300 Subject: swiotlb: do not zero buffer in set_memory_decrypted() For larger TDX VM, memset() after set_memory_decrypted() in swiotlb_update_mem_attributes() takes substantial portion of boot time. 
Zeroing doesn't serve any functional purpose. Malicious VMM can mess with decrypted/shared buffer at any point. Remove the memset(). Signed-off-by: Kirill A. Shutemov Acked-by: Tom Lendacky Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index f1e7ea160b43..9390b38d2897 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -207,8 +207,6 @@ void __init swiotlb_update_mem_attributes(void) mem->vaddr = swiotlb_mem_remap(mem, bytes); if (!mem->vaddr) mem->vaddr = vaddr; - - memset(mem->vaddr, 0, bytes); } static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, -- cgit v1.2.3 From 35265899acef135225e946b883fb07acba1d31a2 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 24 Jan 2022 16:40:17 +0000 Subject: swiotlb: simplify debugfs setup Debugfs functions are already stubbed out for !CONFIG_DEBUG_FS, so we can remove most of the #ifdefs, just keeping one to manually optimise away the initcall when it would do nothing. We can also simplify the code itself by factoring out the directory creation and realising that the global io_tlb_default_mem now makes debugfs_dir redundant. 
Signed-off-by: Robin Murphy Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 40 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 9390b38d2897..f829259262fd 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -36,9 +36,7 @@ #include #include #include -#ifdef CONFIG_DEBUG_FS #include -#endif #ifdef CONFIG_DMA_RESTRICTED_POOL #include #include @@ -756,47 +754,29 @@ bool is_swiotlb_active(struct device *dev) } EXPORT_SYMBOL_GPL(is_swiotlb_active); -#ifdef CONFIG_DEBUG_FS -static struct dentry *debugfs_dir; - -static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem) +static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, + const char *dirname) { + mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs); + if (!mem->nslabs) + return; + debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs); debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used); } -static int __init swiotlb_create_default_debugfs(void) +static int __init __maybe_unused swiotlb_create_default_debugfs(void) { - struct io_tlb_mem *mem = &io_tlb_default_mem; - - debugfs_dir = debugfs_create_dir("swiotlb", NULL); - if (mem->nslabs) { - mem->debugfs = debugfs_dir; - swiotlb_create_debugfs_files(mem); - } + swiotlb_create_debugfs_files(&io_tlb_default_mem, "swiotlb"); return 0; } +#ifdef CONFIG_DEBUG_FS late_initcall(swiotlb_create_default_debugfs); - #endif #ifdef CONFIG_DMA_RESTRICTED_POOL -#ifdef CONFIG_DEBUG_FS -static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem) -{ - struct io_tlb_mem *mem = rmem->priv; - - mem->debugfs = debugfs_create_dir(rmem->name, debugfs_dir); - swiotlb_create_debugfs_files(mem); -} -#else -static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem) -{ -} -#endif - struct page *swiotlb_alloc(struct device *dev, size_t size) { struct io_tlb_mem *mem = 
dev->dma_io_tlb_mem; @@ -858,7 +838,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, rmem->priv = mem; - rmem_swiotlb_debugfs_init(rmem); + swiotlb_create_debugfs_files(mem, rmem->name); } dev->dma_io_tlb_mem = mem; -- cgit v1.2.3 From c0a4191c27a12d3175283fa33f16db20e91008fd Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 24 Jan 2022 16:40:18 +0000 Subject: swiotlb: tidy up includes SWIOTLB's includes have become a great big mess. Restore some order by consolidating the random different blocks, sorting alphabetically, and purging some clearly unnecessary entries - linux/io.h is now included unconditionally, so need not be duplicated in the restricted DMA pool case; similarly, linux/io.h subsumes asm/io.h; and by now it's a mystery why asm/dma.h was ever here at all. Signed-off-by: Robin Murphy Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index f829259262fd..f3ff0af49f81 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -21,38 +21,33 @@ #define pr_fmt(fmt) "software IO TLB: " fmt #include +#include +#include +#include #include #include -#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include #include -#include #include -#include -#include -#include -#include -#include -#include -#include #ifdef CONFIG_DMA_RESTRICTED_POOL -#include #include #include #include #include #endif -#include -#include - -#include -#include -#include -#include - #define CREATE_TRACE_POINTS #include -- cgit v1.2.3 From 404f9373c4e5c943ed8a5e71c8dcfef9eddd54ab Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 24 Jan 2022 16:40:19 +0000 Subject: swiotlb: simplify array allocation Prefer kcalloc() to kzalloc(array_size()) for allocating an array. 
Signed-off-by: Robin Murphy Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index f3ff0af49f81..908eac2527cb 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -818,8 +818,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, if (!mem) return -ENOMEM; - mem->slots = kzalloc(array_size(sizeof(*mem->slots), nslabs), - GFP_KERNEL); + mem->slots = kcalloc(nslabs, sizeof(*mem->slots), GFP_KERNEL); if (!mem->slots) { kfree(mem); return -ENOMEM; -- cgit v1.2.3 From c80d401c52a2d1baf2a5afeb06f0ffe678e56d23 Mon Sep 17 00:00:00 2001 From: Tianchen Ding Date: Tue, 18 Jan 2022 18:05:18 +0800 Subject: cpuset: Fix the bug that subpart_cpus updated wrongly in update_cpumask() subparts_cpus should be limited as a subset of cpus_allowed, but it is updated wrongly by using cpumask_andnot(). Use cpumask_and() instead to fix it. Fixes: ee8dde0cd2ce ("cpuset: Add new v2 cpuset.sched.partition flag") Signed-off-by: Tianchen Ding Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index bb3531e7fda7..804ff5738c5f 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1635,8 +1635,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Make sure that subparts_cpus is a subset of cpus_allowed. 
*/ if (cs->nr_subparts_cpus) { - cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus, - cs->cpus_allowed); + cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed); cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus); } spin_unlock_irq(&callback_lock); -- cgit v1.2.3 From 28c988c3ec29db74a1dda631b18785958d57df4f Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Tue, 18 Jan 2022 10:35:15 +0530 Subject: sched/debug: Remove mpol_get/put and task_lock/unlock from sched_show_numa The older format of /proc/pid/sched printed home node info which required the mempolicy and task lock around mpol_get(). However the format has changed since then and there is no need for sched_show_numa() any more to have mempolicy argument, associated mpol_get/put and task_lock/unlock. Remove them. Fixes: 397f2378f1361 ("sched/numa: Fix numa balancing stats in /proc/pid/sched") Signed-off-by: Bharata B Rao Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Srikar Dronamraju Acked-by: Mel Gorman Link: https://lore.kernel.org/r/20220118050515.2973-1-bharata@amd.com --- kernel/sched/debug.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index aa29211de1bf..102d6f70e84d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -931,25 +931,15 @@ void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, static void sched_show_numa(struct task_struct *p, struct seq_file *m) { #ifdef CONFIG_NUMA_BALANCING - struct mempolicy *pol; - if (p->mm) P(mm->numa_scan_seq); - task_lock(p); - pol = p->mempolicy; - if (pol && !(pol->flags & MPOL_F_MORON)) - pol = NULL; - mpol_get(pol); - task_unlock(p); - P(numa_pages_migrated); P(numa_preferred_nid); P(total_numa_faults); SEQ_printf(m, "current_node=%d, numa_group_id=%d\n", task_node(p), task_numa_group_id(p)); show_numa_stats(p, m); - mpol_put(pol); #endif } -- cgit v1.2.3 From 12bf8a7eb84e4d3547ebfd89bb0a9255a0f2acc7 Mon Sep 17 00:00:00 2001
From: Honglei Wang Date: Thu, 13 Jan 2022 21:39:20 +0800 Subject: sched/numa: initialize numa statistics when forking new task The child processes will inherit numa_pages_migrated and total_numa_faults from the parent. It means even if no numa fault happens on the child, the statistics in /proc/$pid of the child process might show a huge amount. This is a bit weird. Let's initialize them when forking. Signed-off-by: Honglei Wang Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Link: https://lore.kernel.org/r/20220113133920.49900-1-wanghonglei@didichuxing.com --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 095b0aa378df..5dca13ff89f2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2825,6 +2825,8 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) /* Protect against double add, see task_tick_numa and task_numa_work */ p->numa_work.next = &p->numa_work; p->numa_faults = NULL; + p->numa_pages_migrated = 0; + p->total_numa_faults = 0; RCU_INIT_POINTER(p->numa_group, NULL); p->last_task_numa_placement = 0; p->last_sum_exec_runtime = 0; -- cgit v1.2.3 From 77cf151b7bbdfa3577b3c3f3a5e267a6c60a263b Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Thu, 28 Oct 2021 12:50:05 +0100 Subject: sched/core: Export pelt_thermal_tp We can't use this tracepoint in modules without having the symbol exported first, fix that.
Fixes: 765047932f15 ("sched/pelt: Add support to track thermal pressure") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211028115005.873539-1-qais.yousef@arm.com --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2e4ae00e52d1..1d863d7f6ad7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -36,6 +36,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); -- cgit v1.2.3 From 7a17e1db1265471f7718af100cfc5e41280d53a7 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Thu, 16 Dec 2021 22:53:19 +0000 Subject: sched/sugov: Ignore 'busy' filter when rq is capped by uclamp_max sugov_update_single_{freq, perf}() contains a 'busy' filter that ensures we don't bring the frequency down if there's no idle time (CPU is busy). The problem is that with uclamp_max we will have scenarios where a busy task is capped to run at a lower frequency and this filter prevents applying the capping when this task starts running. We handle this by skipping the filter when uclamp is enabled and the rq is being capped by uclamp_max. We introduce a new function uclamp_rq_is_capped() to help detect when this capping is taking effect. Some code shuffling was required to allow using cpu_util_{cfs, rt}() in this new function. On 2 Core SMT2 Intel laptop I see: Without this patch: uclampset -M 0 sysbench --test=cpu --threads = 4 run produces a score of ~3200 consistently. Which is the highest possible.
Compiling the kernel also results in frequency running at max 3.1GHz all the time - running uclampset -M 400 to cap it has no effect without this patch. With this patch: uclampset -M 0 sysbench --test=cpu --threads = 4 run produces a score of ~1100 with some outliers in ~1700. Uclamp max aggregates the performance requirements, so having high values sometimes is expected if some other task happens to require that frequency starts running at the same time. When compiling the kernel with uclampset -M 400 I can see the frequencies mostly in the ~2GHz region. Helpful to conserve power and prevent heating when not plugged in. Fixes: 982d9cdc22c9 ("sched/cpufreq, sched/uclamp: Add clamps for FAIR and RT tasks") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211216225320.2957053-2-qais.yousef@arm.com --- kernel/sched/cpufreq_schedutil.c | 10 ++- kernel/sched/sched.h | 181 +++++++++++++++++++++------------------ 2 files changed, 107 insertions(+), 84 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 26778884d9ab..62d98b09aaa5 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -348,8 +348,11 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time, /* * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. + * + * Except when the rq is capped by uclamp_max. 
*/ - if (sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) { + if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && + sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) { next_f = sg_policy->next_freq; /* Restore cached freq as next_freq has changed */ @@ -395,8 +398,11 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, /* * Do not reduce the target performance level if the CPU has not been * idle recently, as the reduction is likely to be premature then. + * + * Except when the rq is capped by uclamp_max. */ - if (sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) + if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && + sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) sg_cpu->util = prev_util; cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index de53be905739..9b33ba9c3c42 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2841,88 +2841,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ -#ifdef CONFIG_UCLAMP_TASK -unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); - -/** - * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values. - * @rq: The rq to clamp against. Must not be NULL. - * @util: The util value to clamp. - * @p: The task to clamp against. Can be NULL if you want to clamp - * against @rq only. - * - * Clamps the passed @util to the max(@rq, @p) effective uclamp values. - * - * If sched_uclamp_used static key is disabled, then just return the util - * without any clamping since uclamp aggregation at the rq level in the fast - * path is disabled, rendering this operation a NOP. - * - * Use uclamp_eff_value() if you don't care about uclamp values at rq level. 
It - * will return the correct effective uclamp value of the task even if the - * static key is disabled. - */ -static __always_inline -unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, - struct task_struct *p) -{ - unsigned long min_util = 0; - unsigned long max_util = 0; - - if (!static_branch_likely(&sched_uclamp_used)) - return util; - - if (p) { - min_util = uclamp_eff_value(p, UCLAMP_MIN); - max_util = uclamp_eff_value(p, UCLAMP_MAX); - - /* - * Ignore last runnable task's max clamp, as this task will - * reset it. Similarly, no need to read the rq's min clamp. - */ - if (rq->uclamp_flags & UCLAMP_FLAG_IDLE) - goto out; - } - - min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value)); - max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value)); -out: - /* - * Since CPU's {min,max}_util clamps are MAX aggregated considering - * RUNNABLE tasks with _different_ clamps, we can end up with an - * inversion. Fix it now when the clamps are applied. - */ - if (unlikely(min_util >= max_util)) - return min_util; - - return clamp(util, min_util, max_util); -} - -/* - * When uclamp is compiled in, the aggregation at rq level is 'turned off' - * by default in the fast path and only gets turned on once userspace performs - * an operation that requires it. - * - * Returns true if userspace opted-in to use uclamp and aggregation at rq level - * hence is active. 
- */ -static inline bool uclamp_is_used(void) -{ - return static_branch_likely(&sched_uclamp_used); -} -#else /* CONFIG_UCLAMP_TASK */ -static inline -unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, - struct task_struct *p) -{ - return util; -} - -static inline bool uclamp_is_used(void) -{ - return false; -} -#endif /* CONFIG_UCLAMP_TASK */ - #ifdef arch_scale_freq_capacity # ifndef arch_scale_freq_invariant # define arch_scale_freq_invariant() true @@ -3020,6 +2938,105 @@ static inline unsigned long cpu_util_rt(struct rq *rq) } #endif +#ifdef CONFIG_UCLAMP_TASK +unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); + +/** + * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values. + * @rq: The rq to clamp against. Must not be NULL. + * @util: The util value to clamp. + * @p: The task to clamp against. Can be NULL if you want to clamp + * against @rq only. + * + * Clamps the passed @util to the max(@rq, @p) effective uclamp values. + * + * If sched_uclamp_used static key is disabled, then just return the util + * without any clamping since uclamp aggregation at the rq level in the fast + * path is disabled, rendering this operation a NOP. + * + * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It + * will return the correct effective uclamp value of the task even if the + * static key is disabled. + */ +static __always_inline +unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) +{ + unsigned long min_util = 0; + unsigned long max_util = 0; + + if (!static_branch_likely(&sched_uclamp_used)) + return util; + + if (p) { + min_util = uclamp_eff_value(p, UCLAMP_MIN); + max_util = uclamp_eff_value(p, UCLAMP_MAX); + + /* + * Ignore last runnable task's max clamp, as this task will + * reset it. Similarly, no need to read the rq's min clamp. 
+ */ + if (rq->uclamp_flags & UCLAMP_FLAG_IDLE) + goto out; + } + + min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value)); + max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value)); +out: + /* + * Since CPU's {min,max}_util clamps are MAX aggregated considering + * RUNNABLE tasks with _different_ clamps, we can end up with an + * inversion. Fix it now when the clamps are applied. + */ + if (unlikely(min_util >= max_util)) + return min_util; + + return clamp(util, min_util, max_util); +} + +/* Is the rq being capped/throttled by uclamp_max? */ +static inline bool uclamp_rq_is_capped(struct rq *rq) +{ + unsigned long rq_util; + unsigned long max_util; + + if (!static_branch_likely(&sched_uclamp_used)) + return false; + + rq_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq); + max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + + return max_util != SCHED_CAPACITY_SCALE && rq_util >= max_util; +} + +/* + * When uclamp is compiled in, the aggregation at rq level is 'turned off' + * by default in the fast path and only gets turned on once userspace performs + * an operation that requires it. + * + * Returns true if userspace opted-in to use uclamp and aggregation at rq level + * hence is active. 
+ */ +static inline bool uclamp_is_used(void) +{ + return static_branch_likely(&sched_uclamp_used); +} +#else /* CONFIG_UCLAMP_TASK */ +static inline +unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, + struct task_struct *p) +{ + return util; +} + +static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; } + +static inline bool uclamp_is_used(void) +{ + return false; +} +#endif /* CONFIG_UCLAMP_TASK */ + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ static inline unsigned long cpu_util_irq(struct rq *rq) { -- cgit v1.2.3 From d37aee9018e68b0d356195caefbb651910e0bbfa Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Thu, 16 Dec 2021 22:53:20 +0000 Subject: sched/uclamp: Fix iowait boost escaping uclamp restriction iowait_boost signal is applied independently of util and doesn't take into account uclamp settings of the rq. An io heavy task that is capped by uclamp_max could still request higher frequency because sugov_iowait_apply() doesn't clamp the boost via uclamp_rq_util_with() like effective_cpu_util() does. Make sure that iowait_boost honours uclamp requests by calling uclamp_rq_util_with() when applying the boost. Fixes: 982d9cdc22c9 ("sched/cpufreq, sched/uclamp: Add clamps for FAIR and RT tasks") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20211216225320.2957053-3-qais.yousef@arm.com --- kernel/sched/cpufreq_schedutil.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 62d98b09aaa5..6d65ab6e484e 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -289,6 +289,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) * into the same scale so we can compare. 
*/ boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT; + boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL); if (sg_cpu->util < boost) sg_cpu->util = boost; } -- cgit v1.2.3 From 5102bb1c9f82857a3164af9d7ab7ad628cb783ed Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 19 Jan 2022 14:39:40 -0800 Subject: psi: Fix "defined but not used" warnings when CONFIG_PROC_FS=n When CONFIG_PROC_FS is disabled psi code generates the following warnings: kernel/sched/psi.c:1364:30: warning: 'psi_cpu_proc_ops' defined but not used [-Wunused-const-variable=] 1364 | static const struct proc_ops psi_cpu_proc_ops = { | ^~~~~~~~~~~~~~~~ kernel/sched/psi.c:1355:30: warning: 'psi_memory_proc_ops' defined but not used [-Wunused-const-variable=] 1355 | static const struct proc_ops psi_memory_proc_ops = { | ^~~~~~~~~~~~~~~~~~~ kernel/sched/psi.c:1346:30: warning: 'psi_io_proc_ops' defined but not used [-Wunused-const-variable=] 1346 | static const struct proc_ops psi_io_proc_ops = { | ^~~~~~~~~~~~~~~ Make definitions of these structures and related functions conditional on CONFIG_PROC_FS config. 
Fixes: 0e94682b73bf ("psi: introduce psi monitor") Reported-by: kernel test robot Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220119223940.787748-3-surenb@google.com --- kernel/sched/psi.c | 79 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 41 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index a679613a7cb7..cfe76f704d8a 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1082,44 +1082,6 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) return 0; } -static int psi_io_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_IO); -} - -static int psi_memory_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_MEM); -} - -static int psi_cpu_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_CPU); -} - -static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *)) -{ - if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) - return -EPERM; - - return single_open(file, psi_show, NULL); -} - -static int psi_io_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_io_show); -} - -static int psi_memory_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_memory_show); -} - -static int psi_cpu_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_cpu_show); -} - struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res) { @@ -1296,6 +1258,45 @@ __poll_t psi_trigger_poll(void **trigger_ptr, return ret; } +#ifdef CONFIG_PROC_FS +static int psi_io_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IO); +} + +static int psi_memory_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_MEM); +} + +static int psi_cpu_show(struct seq_file *m, void *v) 
+{ + return psi_show(m, &psi_system, PSI_CPU); +} + +static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *)) +{ + if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + + return single_open(file, psi_show, NULL); +} + +static int psi_io_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_io_show); +} + +static int psi_memory_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_memory_show); +} + +static int psi_cpu_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_cpu_show); +} + static ssize_t psi_write(struct file *file, const char __user *user_buf, size_t nbytes, enum psi_res res) { @@ -1400,3 +1401,5 @@ static int __init psi_proc_init(void) return 0; } module_init(psi_proc_init); + +#endif /* CONFIG_PROC_FS */ -- cgit v1.2.3 From 46531a30364bd483bfa1b041c15d42a196e77e93 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 27 Jan 2022 14:09:13 +0000 Subject: cgroup/bpf: fast path skb BPF filtering Even though there is a static key protecting from overhead from cgroup-bpf skb filtering when there is nothing attached, in many cases it's not enough as registering a filter for one type will ruin the fast path for all others. It's observed in production servers I've looked at but also in laptops, where registration is done during init by systemd or something else. Add a per-socket fast path check guarding from such overhead. This affects both receive and transmit paths of TCP, UDP and other protocols. It showed ~1% tx/s improvement in small payload UDP send benchmarks using a real NIC and in a server environment and the number jumps to 2-3% for preemptible kernels.
Reviewed-by: Stanislav Fomichev Signed-off-by: Pavel Begunkov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/d8c58857113185a764927a46f4b5a058d36d3ec3.1643292455.git.asml.silence@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 24 ++++++++++++++++++++---- include/linux/bpf.h | 13 +++++++++++++ kernel/bpf/cgroup.c | 30 ------------------------------ kernel/bpf/core.c | 16 ++++------------ 4 files changed, 37 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index b525d8cdc25b..88a51b242adc 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -8,6 +8,7 @@ #include #include #include +#include #include struct sock; @@ -165,11 +166,23 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, void *value, u64 flags); +/* Opportunistic check to see whether we have any BPF program attached*/ +static inline bool cgroup_bpf_sock_enabled(struct sock *sk, + enum cgroup_bpf_attach_type type) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_prog_array *array; + + array = rcu_access_pointer(cgrp->bpf.effective[type]); + return array != &bpf_empty_prog_array.hdr; +} + /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. 
*/ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(CGROUP_INET_INGRESS)) \ + if (cgroup_bpf_enabled(CGROUP_INET_INGRESS) && \ + cgroup_bpf_sock_enabled(sk, CGROUP_INET_INGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(sk, skb, \ CGROUP_INET_INGRESS); \ \ @@ -181,7 +194,8 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \ typeof(sk) __sk = sk_to_full_sk(sk); \ - if (sk_fullsock(__sk)) \ + if (sk_fullsock(__sk) && \ + cgroup_bpf_sock_enabled(__sk, CGROUP_INET_EGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ CGROUP_INET_EGRESS); \ } \ @@ -347,7 +361,8 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, kernel_optval) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(CGROUP_SETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_SETSOCKOPT) && \ + cgroup_bpf_sock_enabled(sock, CGROUP_SETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \ optname, optval, \ optlen, \ @@ -367,7 +382,8 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, max_optlen, retval) \ ({ \ int __ret = retval; \ - if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT) && \ + cgroup_bpf_sock_enabled(sock, CGROUP_GETSOCKOPT)) \ if (!(sock)->sk_prot->bpf_bypass_getsockopt || \ !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \ tcp_bpf_bypass_getsockopt, \ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2344f793c4dc..e3b82ce51445 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1233,6 +1233,19 @@ struct bpf_prog_array { struct bpf_prog_array_item items[]; }; +struct bpf_empty_prog_array { + struct bpf_prog_array hdr; + struct bpf_prog *null_prog; +}; + +/* to avoid allocating empty bpf_prog_array for cgroups that + * don't have bpf program attached use one global 'bpf_empty_prog_array' + * It will not be modified the 
caller of bpf_prog_array_alloc() + * (since caller requested prog_cnt == 0) + * that pointer should be 'freed' by bpf_prog_array_free() + */ +extern struct bpf_empty_prog_array bpf_empty_prog_array; + struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array *progs); int bpf_prog_array_length(struct bpf_prog_array *progs); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 279ebbed75a5..098632fdbc45 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1384,20 +1384,6 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, } #ifdef CONFIG_NET -static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, - enum cgroup_bpf_attach_type attach_type) -{ - struct bpf_prog_array *prog_array; - bool empty; - - rcu_read_lock(); - prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]); - empty = bpf_prog_array_is_empty(prog_array); - rcu_read_unlock(); - - return empty; -} - static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen, struct bpf_sockopt_buf *buf) { @@ -1456,19 +1442,11 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, }; int ret, max_optlen; - /* Opportunistic check to see whether we have any BPF program - * attached to the hook so we don't waste time allocating - * memory and locking the socket. - */ - if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT)) - return 0; - /* Allocate a bit more than the initial user buffer for * BPF program. The canonical use case is overriding * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic). 
*/ max_optlen = max_t(int, 16, *optlen); - max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; @@ -1550,15 +1528,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, }; int ret; - /* Opportunistic check to see whether we have any BPF program - * attached to the hook so we don't waste time allocating - * memory and locking the socket. - */ - if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT)) - return retval; - ctx.optlen = max_optlen; - max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 0a1cfd8544b9..04a8d5bea552 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1968,18 +1968,10 @@ static struct bpf_prog_dummy { }, }; -/* to avoid allocating empty bpf_prog_array for cgroups that - * don't have bpf program attached use one global 'empty_prog_array' - * It will not be modified the caller of bpf_prog_array_alloc() - * (since caller requested prog_cnt == 0) - * that pointer should be 'freed' by bpf_prog_array_free() - */ -static struct { - struct bpf_prog_array hdr; - struct bpf_prog *null_prog; -} empty_prog_array = { +struct bpf_empty_prog_array bpf_empty_prog_array = { .null_prog = NULL, }; +EXPORT_SYMBOL(bpf_empty_prog_array); struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { @@ -1989,12 +1981,12 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) (prog_cnt + 1), flags); - return &empty_prog_array.hdr; + return &bpf_empty_prog_array.hdr; } void bpf_prog_array_free(struct bpf_prog_array *progs) { - if (!progs || progs == &empty_prog_array.hdr) + if (!progs || progs == &bpf_empty_prog_array.hdr) return; kfree_rcu(progs, rcu); } -- cgit v1.2.3 From c6f1bfe89ac95dc829dcb4ed54780da134ac5fce Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 27 Jan 2022 07:46:06 -0800 Subject: bpf: reject program if a __user tagged memory accessed in kernel 
way BPF verifier supports direct memory access for BPF_PROG_TYPE_TRACING type of bpf programs, e.g., a->b. If "a" is a pointer pointing to kernel memory, bpf verifier will allow user to write code in C like a->b and the verifier will translate it to a kernel load properly. If "a" is a pointer to user memory, it is expected that bpf developer should use the bpf_probe_read_user() helper to get the value a->b. Without utilizing BTF __user tagging information, current verifier will assume that a->b is a kernel memory access and this may generate incorrect result. Now BTF contains __user information, it can check whether the pointer points to a user memory or not. If it is, the verifier can reject the program and force users to use bpf_probe_read_user() helper explicitly. In the future, we can easily extend btf_add_space for other address space tagging, for example, rcu/percpu etc. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220127154606.654961-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 9 ++++++--- include/linux/btf.h | 5 +++++ kernel/bpf/btf.c | 34 ++++++++++++++++++++++++++++------ kernel/bpf/verifier.c | 35 ++++++++++++++++++++++++----------- net/bpf/bpf_dummy_struct_ops.c | 6 ++++-- net/ipv4/bpf_tcp_ca.c | 6 ++++-- 6 files changed, 71 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e3b82ce51445..6eb0b180d33b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -332,7 +332,10 @@ enum bpf_type_flag { */ MEM_ALLOC = BIT(2 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = MEM_ALLOC, + /* MEM is in user address space. */ + MEM_USER = BIT(3 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = MEM_USER, }; /* Max number of base types.
*/ @@ -588,7 +591,7 @@ struct bpf_verifier_ops { const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, - u32 *next_btf_id); + u32 *next_btf_id, enum bpf_type_flag *flag); }; struct bpf_prog_offload_ops { @@ -1780,7 +1783,7 @@ static inline bool bpf_tracing_btf_ctx_access(int off, int size, int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, - u32 *next_btf_id); + u32 *next_btf_id, enum bpf_type_flag *flag); bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, const struct btf *need_btf, u32 need_type_id); diff --git a/include/linux/btf.h b/include/linux/btf.h index b12cfe3b12bb..f6c43dd513fa 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -238,6 +238,11 @@ static inline bool btf_type_is_var(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; } +static inline bool btf_type_is_type_tag(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_TYPE_TAG; +} + /* union is only a special case of struct: * all its offsetof(member) == 0 */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index b2a248956100..b983cee8d196 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4886,6 +4886,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, const char *tname = prog->aux->attach_func_name; struct bpf_verifier_log *log = info->log; const struct btf_param *args; + const char *tag_value; u32 nr_args, arg; int i, ret; @@ -5038,6 +5039,13 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->btf = btf; info->btf_id = t->type; t = btf_type_by_id(btf, t->type); + + if (btf_type_is_type_tag(t)) { + tag_value = __btf_name_by_offset(btf, t->name_off); + if (strcmp(tag_value, "user") == 0) + info->reg_type |= MEM_USER; + } + /* skip modifiers */ while (btf_type_is_modifier(t)) { info->btf_id = t->type; @@ -5064,12 
+5072,12 @@ enum bpf_struct_walk_result { static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, - u32 *next_btf_id) + u32 *next_btf_id, enum bpf_type_flag *flag) { u32 i, moff, mtrue_end, msize = 0, total_nelems = 0; const struct btf_type *mtype, *elem_type = NULL; const struct btf_member *member; - const char *tname, *mname; + const char *tname, *mname, *tag_value; u32 vlen, elem_id, mid; again: @@ -5253,7 +5261,8 @@ error: } if (btf_type_is_ptr(mtype)) { - const struct btf_type *stype; + const struct btf_type *stype, *t; + enum bpf_type_flag tmp_flag = 0; u32 id; if (msize != size || off != moff) { @@ -5262,9 +5271,19 @@ error: mname, moff, tname, off, size); return -EACCES; } + + /* check __user tag */ + t = btf_type_by_id(btf, mtype->type); + if (btf_type_is_type_tag(t)) { + tag_value = __btf_name_by_offset(btf, t->name_off); + if (strcmp(tag_value, "user") == 0) + tmp_flag = MEM_USER; + } + stype = btf_type_skip_modifiers(btf, mtype->type, &id); if (btf_type_is_struct(stype)) { *next_btf_id = id; + *flag = tmp_flag; return WALK_PTR; } } @@ -5291,13 +5310,14 @@ error: int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype __maybe_unused, - u32 *next_btf_id) + u32 *next_btf_id, enum bpf_type_flag *flag) { + enum bpf_type_flag tmp_flag = 0; int err; u32 id; do { - err = btf_struct_walk(log, btf, t, off, size, &id); + err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag); switch (err) { case WALK_PTR: @@ -5305,6 +5325,7 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, * we're done. 
*/ *next_btf_id = id; + *flag = tmp_flag; return PTR_TO_BTF_ID; case WALK_SCALAR: return SCALAR_VALUE; @@ -5349,6 +5370,7 @@ bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *need_btf, u32 need_type_id) { const struct btf_type *type; + enum bpf_type_flag flag; int err; /* Are we already done? */ @@ -5359,7 +5381,7 @@ again: type = btf_type_by_id(btf, id); if (!type) return false; - err = btf_struct_walk(log, btf, type, off, 1, &id); + err = btf_struct_walk(log, btf, type, off, 1, &id, &flag); if (err != WALK_STRUCT) return false; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dcf065ec2774..1ae41d0cf96c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -536,7 +536,7 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn) static const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) { - char postfix[16] = {0}, prefix[16] = {0}; + char postfix[16] = {0}, prefix[32] = {0}; static const char * const str[] = { [NOT_INIT] = "?", [SCALAR_VALUE] = "inv", @@ -570,9 +570,11 @@ static const char *reg_type_str(struct bpf_verifier_env *env, } if (type & MEM_RDONLY) - strncpy(prefix, "rdonly_", 16); + strncpy(prefix, "rdonly_", 32); if (type & MEM_ALLOC) - strncpy(prefix, "alloc_", 16); + strncpy(prefix, "alloc_", 32); + if (type & MEM_USER) + strncpy(prefix, "user_", 32); snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", prefix, str[base_type(type)], postfix); @@ -1547,14 +1549,15 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, static void mark_btf_ld_reg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno, enum bpf_reg_type reg_type, - struct btf *btf, u32 btf_id) + struct btf *btf, u32 btf_id, + enum bpf_type_flag flag) { if (reg_type == SCALAR_VALUE) { mark_reg_unknown(env, regs, regno); return; } mark_reg_known_zero(env, regs, regno); - regs[regno].type = PTR_TO_BTF_ID; + regs[regno].type = PTR_TO_BTF_ID | flag; regs[regno].btf = btf; regs[regno].btf_id 
= btf_id; } @@ -4152,6 +4155,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg = regs + regno; const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id); const char *tname = btf_name_by_offset(reg->btf, t->name_off); + enum bpf_type_flag flag = 0; u32 btf_id; int ret; @@ -4171,9 +4175,16 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } + if (reg->type & MEM_USER) { + verbose(env, + "R%d is ptr_%s access user memory: off=%d\n", + regno, tname, off); + return -EACCES; + } + if (env->ops->btf_struct_access) { ret = env->ops->btf_struct_access(&env->log, reg->btf, t, - off, size, atype, &btf_id); + off, size, atype, &btf_id, &flag); } else { if (atype != BPF_READ) { verbose(env, "only read is supported\n"); @@ -4181,14 +4192,14 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } ret = btf_struct_access(&env->log, reg->btf, t, off, size, - atype, &btf_id); + atype, &btf_id, &flag); } if (ret < 0) return ret; if (atype == BPF_READ && value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id); + mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); return 0; } @@ -4201,6 +4212,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, { struct bpf_reg_state *reg = regs + regno; struct bpf_map *map = reg->map_ptr; + enum bpf_type_flag flag = 0; const struct btf_type *t; const char *tname; u32 btf_id; @@ -4238,12 +4250,12 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, return -EACCES; } - ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id); + ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id, &flag); if (ret < 0) return ret; if (value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id); + mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag); return 0; } @@ -4444,7 +4456,8 @@ static int 
check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err < 0) return err; - err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf, &btf_id); + err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf, + &btf_id); if (err) verbose_linfo(env, insn_idx, "; "); if (!err && t == BPF_READ && value_regno >= 0) { diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index fbc896323bec..d0e54e30658a 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -145,7 +145,8 @@ static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, - u32 *next_btf_id) + u32 *next_btf_id, + enum bpf_type_flag *flag) { const struct btf_type *state; s32 type_id; @@ -162,7 +163,8 @@ static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log, return -EACCES; } - err = btf_struct_access(log, btf, t, off, size, atype, next_btf_id); + err = btf_struct_access(log, btf, t, off, size, atype, next_btf_id, + flag); if (err < 0) return err; diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index b60c9fd7147e..f79ab942f03b 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -96,12 +96,14 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, - u32 *next_btf_id) + u32 *next_btf_id, + enum bpf_type_flag *flag) { size_t end; if (atype == BPF_READ) - return btf_struct_access(log, btf, t, off, size, atype, next_btf_id); + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id, + flag); if (t != tcp_sock_type) { bpf_log(log, "only read is supported\n"); -- cgit v1.2.3 From 1c4cafd11599abdbc53a520f0b6e6799d037eae1 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sun, 23 Jan 2022 10:38:52 -0800 Subject: padata: replace cpumask_weight with cpumask_empty in 
padata.c padata_do_parallel() calls cpumask_weight() to check if any bit of a given cpumask is set. We can do it more efficiently with cpumask_empty() because cpumask_empty() stops traversing the cpumask as soon as it finds the first set bit, while cpumask_weight() counts all bits unconditionally. Signed-off-by: Yury Norov Signed-off-by: Herbert Xu --- kernel/padata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 18d3a5c699d8..e5819bb8bd1d 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -181,7 +181,7 @@ int padata_do_parallel(struct padata_shell *ps, goto out; if (!cpumask_test_cpu(*cb_cpu, pd->cpumask.cbcpu)) { - if (!cpumask_weight(pd->cpumask.cbcpu)) + if (cpumask_empty(pd->cpumask.cbcpu)) goto out; /* Select an alternate fallback CPU and notify the caller. */ -- cgit v1.2.3 From 0407a65f356e6d9340ad673907c17e52fade43e3 Mon Sep 17 00:00:00 2001 From: Kenta Tada Date: Sat, 29 Jan 2022 02:09:06 +0900 Subject: bpf: make bpf_copy_from_user_task() gpl only access_process_vm() is exported by EXPORT_SYMBOL_GPL(). Signed-off-by: Kenta Tada Link: https://lore.kernel.org/r/20220128170906.21154-1-Kenta.Tada@sony.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ed2780b76cc1..4e5969fde0b3 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -695,7 +695,7 @@ BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size, const struct bpf_func_proto bpf_copy_from_user_task_proto = { .func = bpf_copy_from_user_task, - .gpl_only = false, + .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, -- cgit v1.2.3 From 24f6008564183aa120d07c03d9289519c2fe02af Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Thu, 20 Jan 2022 11:04:01 -0600 Subject: cgroup-v1: Require capabilities to set release_agent The cgroup release_agent is called with call_usermodehelper. The function call_usermodehelper starts the release_agent with a full set of capabilities. Therefore require capabilities when setting the release_agent. Reported-by: Tabitha Sable Tested-by: Tabitha Sable Fixes: 81a6a5cdd2c5 ("Task Control Groups: automatic userspace notification of idle cgroups") Cc: stable@vger.kernel.org # v2.6.24+ Signed-off-by: "Eric W. Biederman" Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 41e0837a5a0b..0e877dbcfeea 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -549,6 +549,14 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + /* + * Release agent gets called with all capabilities, + * require capabilities to set release agent. + */ + if ((of->file->f_cred->user_ns != &init_user_ns) || + !capable(CAP_SYS_ADMIN)) + return -EPERM; + cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; @@ -954,6 +962,12 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) /* Specifying two release agents is forbidden */ if (ctx->release_agent) return invalfc(fc, "release_agent respecified"); + /* + * Release agent gets called with all capabilities, + * require capabilities to set release agent. 
+ */ + if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) + return invalfc(fc, "Setting release_agent not allowed"); ctx->release_agent = param->string; param->string = NULL; break; -- cgit v1.2.3 From 1ddbddd7065182c10c7c50ba6daf890edfdf7377 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 31 Jan 2022 14:05:25 -0800 Subject: bpf: Remove unnecessary setrlimit from bpf preload. BPF programs and maps are memcg accounted. setrlimit is obsolete. Remove its use from bpf preload. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220131220528.98088-5-alexei.starovoitov@gmail.com --- kernel/bpf/preload/iterators/iterators.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c index 5d872a705470..2ec85fc6984f 100644 --- a/kernel/bpf/preload/iterators/iterators.c +++ b/kernel/bpf/preload/iterators/iterators.c @@ -37,7 +37,6 @@ static int send_link_to_kernel(struct bpf_link *link, const char *link_name) int main(int argc, char **argv) { - struct rlimit rlim = { RLIM_INFINITY, RLIM_INFINITY }; struct iterators_bpf *skel; int err, magic; int debug_fd; @@ -55,7 +54,6 @@ int main(int argc, char **argv) printf("bad start magic %d\n", magic); return 1; } - setrlimit(RLIMIT_MEMLOCK, &rlim); /* libbpf opens BPF object and loads it into the kernel */ skel = iterators_bpf__open_and_load(); if (!skel) { -- cgit v1.2.3 From 79b203926d18cb8c110564d8a09ff646691de9e7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 31 Jan 2022 14:05:26 -0800 Subject: bpf: Convert bpf preload to light skeleton. Convert bpffs preload iterators to light skeleton. 
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220131220528.98088-6-alexei.starovoitov@gmail.com --- kernel/bpf/preload/iterators/Makefile | 6 +- kernel/bpf/preload/iterators/iterators.c | 10 +- kernel/bpf/preload/iterators/iterators.lskel.h | 428 +++++++++++++++++++++++++ kernel/bpf/preload/iterators/iterators.skel.h | 412 ------------------------ 4 files changed, 436 insertions(+), 420 deletions(-) create mode 100644 kernel/bpf/preload/iterators/iterators.lskel.h delete mode 100644 kernel/bpf/preload/iterators/iterators.skel.h (limited to 'kernel') diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile index b8bd60511227..bfe24f8c5a20 100644 --- a/kernel/bpf/preload/iterators/Makefile +++ b/kernel/bpf/preload/iterators/Makefile @@ -35,15 +35,15 @@ endif .PHONY: all clean -all: iterators.skel.h +all: iterators.lskel.h clean: $(call msg,CLEAN) $(Q)rm -rf $(OUTPUT) iterators -iterators.skel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL) +iterators.lskel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL) $(call msg,GEN-SKEL,$@) - $(Q)$(BPFTOOL) gen skeleton $< > $@ + $(Q)$(BPFTOOL) gen skeleton -L $< > $@ $(OUTPUT)/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT) diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c index 2ec85fc6984f..23b74916fb84 100644 --- a/kernel/bpf/preload/iterators/iterators.c +++ b/kernel/bpf/preload/iterators/iterators.c @@ -10,20 +10,20 @@ #include #include #include -#include "iterators.skel.h" +#include "iterators.lskel.h" #include "bpf_preload_common.h" int to_kernel = -1; int from_kernel = 0; -static int send_link_to_kernel(struct bpf_link *link, const char *link_name) +static int send_link_to_kernel(int link_fd, const char *link_name) { struct bpf_preload_info obj = {}; struct bpf_link_info info = {}; __u32 info_len = sizeof(info); int err; - err = 
bpf_obj_get_info_by_fd(bpf_link__fd(link), &info, &info_len); + err = bpf_obj_get_info_by_fd(link_fd, &info, &info_len); if (err) return err; obj.link_id = info.id; @@ -70,10 +70,10 @@ int main(int argc, char **argv) goto cleanup; /* send two bpf_link IDs with names to the kernel */ - err = send_link_to_kernel(skel->links.dump_bpf_map, "maps.debug"); + err = send_link_to_kernel(skel->links.dump_bpf_map_fd, "maps.debug"); if (err) goto cleanup; - err = send_link_to_kernel(skel->links.dump_bpf_prog, "progs.debug"); + err = send_link_to_kernel(skel->links.dump_bpf_prog_fd, "progs.debug"); if (err) goto cleanup; diff --git a/kernel/bpf/preload/iterators/iterators.lskel.h b/kernel/bpf/preload/iterators/iterators.lskel.h new file mode 100644 index 000000000000..d90562d672d2 --- /dev/null +++ b/kernel/bpf/preload/iterators/iterators.lskel.h @@ -0,0 +1,428 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* THIS FILE IS AUTOGENERATED! */ +#ifndef __ITERATORS_BPF_SKEL_H__ +#define __ITERATORS_BPF_SKEL_H__ + +#include +#include +#include + +struct iterators_bpf { + struct bpf_loader_ctx ctx; + struct { + struct bpf_map_desc rodata; + } maps; + struct { + struct bpf_prog_desc dump_bpf_map; + struct bpf_prog_desc dump_bpf_prog; + } progs; + struct { + int dump_bpf_map_fd; + int dump_bpf_prog_fd; + } links; + struct iterators_bpf__rodata { + } *rodata; +}; + +static inline int +iterators_bpf__dump_bpf_map__attach(struct iterators_bpf *skel) +{ + int prog_fd = skel->progs.dump_bpf_map.prog_fd; + int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER); + + if (fd > 0) + skel->links.dump_bpf_map_fd = fd; + return fd; +} + +static inline int +iterators_bpf__dump_bpf_prog__attach(struct iterators_bpf *skel) +{ + int prog_fd = skel->progs.dump_bpf_prog.prog_fd; + int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER); + + if (fd > 0) + skel->links.dump_bpf_prog_fd = fd; + return fd; +} + +static inline int +iterators_bpf__attach(struct iterators_bpf *skel) +{ + int ret = 
0; + + ret = ret < 0 ? ret : iterators_bpf__dump_bpf_map__attach(skel); + ret = ret < 0 ? ret : iterators_bpf__dump_bpf_prog__attach(skel); + return ret < 0 ? ret : 0; +} + +static inline void +iterators_bpf__detach(struct iterators_bpf *skel) +{ + skel_closenz(skel->links.dump_bpf_map_fd); + skel_closenz(skel->links.dump_bpf_prog_fd); +} +static void +iterators_bpf__destroy(struct iterators_bpf *skel) +{ + if (!skel) + return; + iterators_bpf__detach(skel); + skel_closenz(skel->progs.dump_bpf_map.prog_fd); + skel_closenz(skel->progs.dump_bpf_prog.prog_fd); + munmap(skel->rodata, 4096); + skel_closenz(skel->maps.rodata.map_fd); + free(skel); +} +static inline struct iterators_bpf * +iterators_bpf__open(void) +{ + struct iterators_bpf *skel; + + skel = calloc(sizeof(*skel), 1); + if (!skel) + goto cleanup; + skel->ctx.sz = (void *)&skel->links - (void *)skel; + skel->rodata = + mmap(NULL, 4096, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (skel->rodata == (void *) -1) + goto cleanup; + memcpy(skel->rodata, (void *)"\ +\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ +\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\ +\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\ +\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\ +\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0", 98); + skel->maps.rodata.initial_value = (__u64)(long)skel->rodata; + return skel; +cleanup: + iterators_bpf__destroy(skel); + return NULL; +} + +static inline int +iterators_bpf__load(struct iterators_bpf *skel) +{ + struct bpf_load_and_run_opts opts = {}; + int err; + + opts.ctx = (struct bpf_loader_ctx *)skel; + opts.data_sz = 6056; + opts.data = (void *)"\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ 
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ 
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x9f\xeb\x01\0\ +\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\xf9\x04\0\0\0\0\0\0\0\0\0\x02\x02\0\ +\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\0\x04\ +\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\0\0\0\0\0\ +\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\0\0\0\x20\ +\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xa3\0\0\0\x03\0\0\x04\x18\0\0\0\xb1\0\ +\0\0\x09\0\0\0\0\0\0\0\xb5\0\0\0\x0b\0\0\0\x40\0\0\0\xc0\0\0\0\x0b\0\0\0\x80\0\ +\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xc8\0\0\0\0\0\0\x07\0\0\0\0\xd1\0\0\0\0\0\0\ +\x08\x0c\0\0\0\xd7\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\x94\x01\0\0\x03\0\0\x04\ +\x18\0\0\0\x9c\x01\0\0\x0e\0\0\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xa4\ +\x01\0\0\x0e\0\0\0\xa0\0\0\0\xb0\x01\0\0\0\0\0\x08\x0f\0\0\0\xb6\x01\0\0\0\0\0\ +\x01\x04\0\0\0\x20\0\0\0\xc3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\0\0\0\ +\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xc8\x01\0\0\0\0\0\x01\x04\0\0\0\ +\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x2c\x02\0\0\x02\0\0\x04\x10\0\0\0\x13\0\ +\0\0\x03\0\0\0\0\0\0\0\x3f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x18\0\ +\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x44\x02\0\0\x01\0\0\x0c\ +\x16\0\0\0\x90\x02\0\0\x01\0\0\x04\x08\0\0\0\x99\x02\0\0\x19\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\x02\x1a\0\0\0\xea\x02\0\0\x06\0\0\x04\x38\0\0\0\x9c\x01\0\0\x0e\0\0\ 
+\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xf7\x02\0\0\x1b\0\0\0\xc0\0\0\0\x08\ +\x03\0\0\x15\0\0\0\0\x01\0\0\x11\x03\0\0\x1d\0\0\0\x40\x01\0\0\x1b\x03\0\0\x1e\ +\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\0\0\0\0\ +\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x65\x03\0\0\x02\0\0\x04\ +\x08\0\0\0\x73\x03\0\0\x0e\0\0\0\0\0\0\0\x7c\x03\0\0\x0e\0\0\0\x20\0\0\0\x1b\ +\x03\0\0\x03\0\0\x04\x18\0\0\0\x86\x03\0\0\x1b\0\0\0\0\0\0\0\x8e\x03\0\0\x21\0\ +\0\0\x40\0\0\0\x94\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\0\0\0\ +\0\0\0\0\0\x02\x24\0\0\0\x98\x03\0\0\x01\0\0\x04\x04\0\0\0\xa3\x03\0\0\x0e\0\0\ +\0\0\0\0\0\x0c\x04\0\0\x01\0\0\x04\x04\0\0\0\x15\x04\0\0\x0e\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x8b\x04\0\0\0\0\0\x0e\x25\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\x9f\x04\ +\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\ +\x20\0\0\0\xb5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\ +\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xca\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xe1\x04\0\0\0\0\0\x0e\x2d\0\0\ +\0\x01\0\0\0\xe9\x04\0\0\x04\0\0\x0f\x62\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\0\x28\ +\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\0\0\ +\x11\0\0\0\xf1\x04\0\0\x01\0\0\x0f\x04\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\x62\ +\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\ +\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\x70\ +\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x30\ +\x3a\x30\0\x2f\x77\x2f\x6e\x65\x74\x2d\x6e\x65\x78\x74\x2f\x6b\x65\x72\x6e\x65\ +\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\ +\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\ 
+\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\ +\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\ +\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\ +\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\ +\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\x6e\ +\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\x74\ +\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\ +\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\ +\x29\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x20\x63\ +\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\x3b\0\x30\ +\x3a\x32\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\ +\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\ +\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\ +\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\ +\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\ +\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\ +\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\ +\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\ +\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\ +\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\ +\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\ +\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\ +\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\ +\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\ +\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\ 
+\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\ +\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\ +\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\ +\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\ +\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\ +\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\ +\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\ +\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\ +\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\ +\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\ +\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\ +\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\ +\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\ +\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\ +\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\ +\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\ +\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\ +\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\ +\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\ +\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\ +\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\ +\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\ +\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\ +\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\ 
+\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\ +\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\ +\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\ +\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ +\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ +\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\ +\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\ +\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\ +\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x2d\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\ +\0\x04\0\0\0\x62\0\0\0\x01\0\0\0\x80\x04\0\0\0\0\0\0\0\0\0\0\x69\x74\x65\x72\ +\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\x2f\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\ +\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\ +\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\ +\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ +\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\ +\x25\x73\x20\x25\x73\x0a\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\ +\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\x1b\0\0\ +\0\0\0\x79\x11\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\ +\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\ +\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\ 
+\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\ +\x7b\x1a\xf8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\ +\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x23\0\0\0\xb7\x03\0\0\x0e\0\0\0\ +\xb7\x05\0\0\x18\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\ +\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x3c\x01\0\x01\0\0\0\x42\0\0\ +\0\x7b\0\0\0\x24\x3c\x01\0\x02\0\0\0\x42\0\0\0\xee\0\0\0\x1d\x44\x01\0\x03\0\0\ +\0\x42\0\0\0\x0f\x01\0\0\x06\x4c\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\x40\ +\x01\0\x05\0\0\0\x42\0\0\0\x1a\x01\0\0\x1d\x40\x01\0\x06\0\0\0\x42\0\0\0\x43\ +\x01\0\0\x06\x58\x01\0\x08\0\0\0\x42\0\0\0\x56\x01\0\0\x03\x5c\x01\0\x0f\0\0\0\ +\x42\0\0\0\xdc\x01\0\0\x02\x64\x01\0\x1f\0\0\0\x42\0\0\0\x2a\x02\0\0\x01\x6c\ +\x01\0\0\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\ +\0\x10\0\0\0\x02\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\ +\x28\0\0\0\x08\0\0\0\x3f\x01\0\0\0\0\0\0\x78\0\0\0\x0d\0\0\0\x3e\0\0\0\0\0\0\0\ +\x88\0\0\0\x0d\0\0\0\xea\0\0\0\0\0\0\0\xa8\0\0\0\x0d\0\0\0\x3f\x01\0\0\0\0\0\0\ +\x1a\0\0\0\x21\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\ +\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\ +\0\0\0\0\0\x0a\0\0\0\x01\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x6d\ +\x61\x70\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\ +\0\0\0\0\x79\x12\x08\0\0\0\0\0\x15\x02\x3c\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x79\ +\x27\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\ +\0\x07\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\ +\x31\0\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\ 
+\x6a\xc8\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\ +\x04\0\0\0\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\ +\x78\x30\0\0\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\ +\0\x61\x11\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\ +\0\0\0\0\0\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\ +\xb7\x02\0\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\ +\0\0\0\0\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\ +\xb7\x02\0\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\ +\xff\0\0\0\0\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\ +\xbf\x69\0\0\0\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\ +\xff\0\0\0\0\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\ +\x1a\xe8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\ +\xc8\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x51\0\0\0\xb7\x03\0\0\x11\0\0\0\ +\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\ +\0\0\0\0\x17\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x80\x01\0\x01\0\0\0\x42\0\0\ +\0\x7b\0\0\0\x24\x80\x01\0\x02\0\0\0\x42\0\0\0\x60\x02\0\0\x1f\x88\x01\0\x03\0\ +\0\0\x42\0\0\0\x84\x02\0\0\x06\x94\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\ +\x84\x01\0\x05\0\0\0\x42\0\0\0\x9d\x02\0\0\x0e\xa0\x01\0\x06\0\0\0\x42\0\0\0\ +\x1a\x01\0\0\x1d\x84\x01\0\x07\0\0\0\x42\0\0\0\x43\x01\0\0\x06\xa4\x01\0\x09\0\ +\0\0\x42\0\0\0\xaf\x02\0\0\x03\xa8\x01\0\x11\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\ +\xb0\x01\0\x18\0\0\0\x42\0\0\0\x5a\x03\0\0\x06\x04\x01\0\x1b\0\0\0\x42\0\0\0\0\ +\0\0\0\0\0\0\0\x1c\0\0\0\x42\0\0\0\xab\x03\0\0\x0f\x10\x01\0\x1d\0\0\0\x42\0\0\ +\0\xc0\x03\0\0\x2d\x14\x01\0\x1f\0\0\0\x42\0\0\0\xf7\x03\0\0\x0d\x0c\x01\0\x21\ +\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x22\0\0\0\x42\0\0\0\xc0\x03\0\0\x02\x14\x01\0\ 
+\x25\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x28\0\0\0\x42\0\0\0\0\0\0\0\0\0\ +\0\0\x29\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x2c\0\0\0\x42\0\0\0\x1e\x04\ +\0\0\x0d\x18\x01\0\x2d\0\0\0\x42\0\0\0\x4c\x04\0\0\x1b\x1c\x01\0\x2e\0\0\0\x42\ +\0\0\0\x4c\x04\0\0\x06\x1c\x01\0\x2f\0\0\0\x42\0\0\0\x6f\x04\0\0\x0d\x24\x01\0\ +\x31\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\xb0\x01\0\x40\0\0\0\x42\0\0\0\x2a\x02\0\0\ +\x01\xc0\x01\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\ +\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x14\0\0\0\x3e\0\0\0\ +\0\0\0\0\x28\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x30\0\0\0\x08\0\0\0\x3f\x01\0\0\ +\0\0\0\0\x88\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x98\0\0\0\x1a\0\0\0\xea\0\0\0\0\ +\0\0\0\xb0\0\0\0\x1a\0\0\0\x52\x03\0\0\0\0\0\0\xb8\0\0\0\x1a\0\0\0\x56\x03\0\0\ +\0\0\0\0\xc8\0\0\0\x1f\0\0\0\x84\x03\0\0\0\0\0\0\xe0\0\0\0\x20\0\0\0\xea\0\0\0\ +\0\0\0\0\xf8\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x20\x01\0\0\x24\0\0\0\x3e\0\0\0\ +\0\0\0\0\x58\x01\0\0\x1a\0\0\0\xea\0\0\0\0\0\0\0\x68\x01\0\0\x20\0\0\0\x46\x04\ +\0\0\0\0\0\0\x90\x01\0\0\x1a\0\0\0\x3f\x01\0\0\0\0\0\0\xa0\x01\0\0\x1a\0\0\0\ +\x87\x04\0\0\0\0\0\0\xa8\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x42\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\x1c\0\0\ +\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x1a\0\ +\0\0\x01\0\0\0\0\0\0\0\x13\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\ +\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\ +\0\0\0\0"; + opts.insns_sz = 2184; + opts.insns = (void *)"\ +\xbf\x16\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\x78\xff\xff\xff\xb7\x02\0\ +\0\x88\0\0\0\xb7\x03\0\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x05\0\x14\0\0\0\0\0\x61\ +\xa1\x78\xff\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x7c\xff\ 
+\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x80\xff\0\0\0\0\xd5\ +\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x84\xff\0\0\0\0\xd5\x01\x01\0\0\ +\0\0\0\x85\0\0\0\xa8\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\ +\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xbf\x70\0\0\ +\0\0\0\0\x95\0\0\0\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ +\x48\x0e\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\ +\0\0\x44\x0e\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\ +\0\0\0\0\x38\x0e\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x05\0\0\ +\x18\x61\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x12\0\ +\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\xb7\x03\0\0\x1c\0\0\0\x85\0\0\0\ +\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\xd4\xff\0\0\0\0\x63\x7a\x78\xff\0\0\0\0\ +\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x0e\0\0\x63\x01\0\0\0\ +\0\0\0\x61\x60\x20\0\0\0\0\0\x15\0\x03\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ +\x5c\x0e\0\0\x63\x01\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\ +\0\x50\x0e\0\0\xb7\x03\0\0\x48\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\ +\xc5\x07\xc3\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x71\0\0\0\0\0\ +\0\x79\x63\x18\0\0\0\0\0\x15\x03\x04\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x98\ +\x0e\0\0\xb7\x02\0\0\x62\0\0\0\x85\0\0\0\x94\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\x63\x01\0\ +\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ +\x10\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x0e\0\0\x18\ +\x61\0\0\0\0\0\0\0\0\0\0\x18\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\0\0\ +\x18\x62\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\xa6\0\ +\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\xa3\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\ 
+\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\x63\x01\0\0\ +\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\xb7\x03\ +\0\0\x04\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x96\xff\0\0\0\0\ +\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x78\x11\0\ +\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\x18\x61\0\0\0\0\ +\0\0\0\0\0\0\x70\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x40\ +\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\ +\0\0\0\0\0\0\0\0\0\x48\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc8\x11\0\0\x7b\x01\ +\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\ +\0\xe8\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x18\x61\ +\0\0\0\0\0\0\0\0\0\0\xe0\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\ +\x61\0\0\0\0\0\0\0\0\0\0\x80\x11\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\ +\x18\x61\0\0\0\0\0\0\0\0\0\0\x84\x11\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\ +\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\xa0\x78\ +\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x11\0\0\x63\x01\0\0\0\0\0\0\x18\ +\x61\0\0\0\0\0\0\0\0\0\0\xf8\x11\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\x0c\0\0\ +\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x60\xff\ +\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x68\x11\0\0\x63\x70\x6c\0\0\0\0\0\x77\x07\ +\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\0\0\0\0\ +\0\0\0\0\x68\x11\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\ +\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd8\x11\0\0\x61\x01\0\0\0\0\0\0\xd5\x01\x02\0\ +\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x4e\xff\0\0\0\0\x63\ +\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x18\x61\0\0\0\0\0\ +\0\0\0\0\0\x10\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x18\x12\ 
+\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\ +\0\0\0\0\0\0\0\x28\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x50\x17\0\0\x7b\x01\0\0\ +\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ +\x60\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x15\0\0\x18\ +\x61\0\0\0\0\0\0\0\0\0\0\x80\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x78\x17\0\0\x7b\x01\0\0\0\0\0\0\x61\ +\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x17\0\0\x63\x01\0\0\0\0\0\0\ +\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x1c\x17\0\0\x63\x01\0\0\0\0\ +\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\x7b\x01\0\0\ +\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x48\x17\0\0\x63\ +\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x90\x17\0\0\xb7\x02\0\0\x12\0\0\0\ +\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\ +\0\0\xc5\x07\x17\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\x63\x70\x6c\ +\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\0\0\0\ +\x18\x62\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\ +\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x70\x17\0\0\x61\x01\0\0\0\0\ +\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x05\ +\xff\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\xd5\x01\x02\0\0\0\ +\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\0\0\0\0\x63\x06\ +\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\x18\x61\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\xb7\0\0\0\0\0\0\0\ +\x95\0\0\0\0\0\0\0"; + err = bpf_load_and_run(&opts); + if (err < 0) + return err; + skel->rodata = + mmap(skel->rodata, 4096, PROT_READ, MAP_SHARED | MAP_FIXED, + skel->maps.rodata.map_fd, 0); + return 0; +} + +static inline struct iterators_bpf * +iterators_bpf__open_and_load(void) +{ + struct 
iterators_bpf *skel; + + skel = iterators_bpf__open(); + if (!skel) + return NULL; + if (iterators_bpf__load(skel)) { + iterators_bpf__destroy(skel); + return NULL; + } + return skel; +} + +#endif /* __ITERATORS_BPF_SKEL_H__ */ diff --git a/kernel/bpf/preload/iterators/iterators.skel.h b/kernel/bpf/preload/iterators/iterators.skel.h deleted file mode 100644 index cf9a6a94b3a4..000000000000 --- a/kernel/bpf/preload/iterators/iterators.skel.h +++ /dev/null @@ -1,412 +0,0 @@ -/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ - -/* THIS FILE IS AUTOGENERATED! */ -#ifndef __ITERATORS_BPF_SKEL_H__ -#define __ITERATORS_BPF_SKEL_H__ - -#include -#include - -struct iterators_bpf { - struct bpf_object_skeleton *skeleton; - struct bpf_object *obj; - struct { - struct bpf_map *rodata; - } maps; - struct { - struct bpf_program *dump_bpf_map; - struct bpf_program *dump_bpf_prog; - } progs; - struct { - struct bpf_link *dump_bpf_map; - struct bpf_link *dump_bpf_prog; - } links; - struct iterators_bpf__rodata { - char dump_bpf_map____fmt[35]; - char dump_bpf_map____fmt_1[14]; - char dump_bpf_prog____fmt[32]; - char dump_bpf_prog____fmt_2[17]; - } *rodata; -}; - -static void -iterators_bpf__destroy(struct iterators_bpf *obj) -{ - if (!obj) - return; - if (obj->skeleton) - bpf_object__destroy_skeleton(obj->skeleton); - free(obj); -} - -static inline int -iterators_bpf__create_skeleton(struct iterators_bpf *obj); - -static inline struct iterators_bpf * -iterators_bpf__open_opts(const struct bpf_object_open_opts *opts) -{ - struct iterators_bpf *obj; - - obj = (struct iterators_bpf *)calloc(1, sizeof(*obj)); - if (!obj) - return NULL; - if (iterators_bpf__create_skeleton(obj)) - goto err; - if (bpf_object__open_skeleton(obj->skeleton, opts)) - goto err; - - return obj; -err: - iterators_bpf__destroy(obj); - return NULL; -} - -static inline struct iterators_bpf * -iterators_bpf__open(void) -{ - return iterators_bpf__open_opts(NULL); -} - -static inline int 
-iterators_bpf__load(struct iterators_bpf *obj) -{ - return bpf_object__load_skeleton(obj->skeleton); -} - -static inline struct iterators_bpf * -iterators_bpf__open_and_load(void) -{ - struct iterators_bpf *obj; - - obj = iterators_bpf__open(); - if (!obj) - return NULL; - if (iterators_bpf__load(obj)) { - iterators_bpf__destroy(obj); - return NULL; - } - return obj; -} - -static inline int -iterators_bpf__attach(struct iterators_bpf *obj) -{ - return bpf_object__attach_skeleton(obj->skeleton); -} - -static inline void -iterators_bpf__detach(struct iterators_bpf *obj) -{ - return bpf_object__detach_skeleton(obj->skeleton); -} - -static inline int -iterators_bpf__create_skeleton(struct iterators_bpf *obj) -{ - struct bpf_object_skeleton *s; - - s = (struct bpf_object_skeleton *)calloc(1, sizeof(*s)); - if (!s) - return -1; - obj->skeleton = s; - - s->sz = sizeof(*s); - s->name = "iterators_bpf"; - s->obj = &obj->obj; - - /* maps */ - s->map_cnt = 1; - s->map_skel_sz = sizeof(*s->maps); - s->maps = (struct bpf_map_skeleton *)calloc(s->map_cnt, s->map_skel_sz); - if (!s->maps) - goto err; - - s->maps[0].name = "iterator.rodata"; - s->maps[0].map = &obj->maps.rodata; - s->maps[0].mmaped = (void **)&obj->rodata; - - /* programs */ - s->prog_cnt = 2; - s->prog_skel_sz = sizeof(*s->progs); - s->progs = (struct bpf_prog_skeleton *)calloc(s->prog_cnt, s->prog_skel_sz); - if (!s->progs) - goto err; - - s->progs[0].name = "dump_bpf_map"; - s->progs[0].prog = &obj->progs.dump_bpf_map; - s->progs[0].link = &obj->links.dump_bpf_map; - - s->progs[1].name = "dump_bpf_prog"; - s->progs[1].prog = &obj->progs.dump_bpf_prog; - s->progs[1].link = &obj->links.dump_bpf_prog; - - s->data_sz = 7176; - s->data = (void *)"\ -\x7f\x45\x4c\x46\x02\x01\x01\0\0\0\0\0\0\0\0\0\x01\0\xf7\0\x01\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\x48\x18\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\x40\0\x0f\0\ -\x0e\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\ 
-\x1a\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\ -\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x61\x71\0\ -\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\0\0\0\0\0\ -\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\x7b\x1a\xf8\ -\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\ -\0\x18\x02\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x0e\0\0\0\xb7\x05\0\0\x18\ -\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x79\x12\0\0\0\0\ -\0\0\x79\x26\0\0\0\0\0\0\x79\x11\x08\0\0\0\0\0\x15\x01\x3b\0\0\0\0\0\x79\x17\0\ -\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\ -\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\x31\0\0\0\0\0\0\0\0\0\ -\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\x6a\xc8\ -\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\x04\0\0\0\ -\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\x78\x30\0\0\ -\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x61\x11\ -\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\0\0\0\0\0\ -\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\xb7\x02\0\ -\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\0\0\0\0\ -\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\xb7\x02\0\ -\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\xff\0\0\0\0\ -\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\xbf\x69\0\0\0\ -\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\ -\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\x1a\xe8\xff\ -\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\xc8\xff\0\0\0\ 
-\0\x18\x02\0\0\x51\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x11\0\0\0\xb7\x05\0\0\x20\ -\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x20\x20\x69\x64\ -\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\ -\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\ -\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\ -\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\ -\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\x47\x50\x4c\0\x9f\ -\xeb\x01\0\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\x09\x05\0\0\0\0\0\0\0\0\0\ -\x02\x02\0\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\ -\0\0\0\x04\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\ -\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\ -\0\0\0\x20\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xaf\0\0\0\x03\0\0\x04\x18\0\ -\0\0\xbd\0\0\0\x09\0\0\0\0\0\0\0\xc1\0\0\0\x0b\0\0\0\x40\0\0\0\xcc\0\0\0\x0b\0\ -\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xd4\0\0\0\0\0\0\x07\0\0\0\0\xdd\0\0\ -\0\0\0\0\x08\x0c\0\0\0\xe3\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\xa4\x01\0\0\x03\ -\0\0\x04\x18\0\0\0\xac\x01\0\0\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\ -\0\xb4\x01\0\0\x0e\0\0\0\xa0\0\0\0\xc0\x01\0\0\0\0\0\x08\x0f\0\0\0\xc6\x01\0\0\ -\0\0\0\x01\x04\0\0\0\x20\0\0\0\xd3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\ -\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xd8\x01\0\0\0\0\0\x01\x04\ -\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x3c\x02\0\0\x02\0\0\x04\x10\0\0\0\ -\x13\0\0\0\x03\0\0\0\0\0\0\0\x4f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\ -\x18\0\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x54\x02\0\0\x01\0\ -\0\x0c\x16\0\0\0\xa0\x02\0\0\x01\0\0\x04\x08\0\0\0\xa9\x02\0\0\x19\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\x02\x1a\0\0\0\xfa\x02\0\0\x06\0\0\x04\x38\0\0\0\xac\x01\0\0\ 
-\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\0\x07\x03\0\0\x1b\0\0\0\xc0\0\ -\0\0\x18\x03\0\0\x15\0\0\0\0\x01\0\0\x21\x03\0\0\x1d\0\0\0\x40\x01\0\0\x2b\x03\ -\0\0\x1e\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\ -\0\0\0\0\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x75\x03\0\0\x02\0\ -\0\x04\x08\0\0\0\x83\x03\0\0\x0e\0\0\0\0\0\0\0\x8c\x03\0\0\x0e\0\0\0\x20\0\0\0\ -\x2b\x03\0\0\x03\0\0\x04\x18\0\0\0\x96\x03\0\0\x1b\0\0\0\0\0\0\0\x9e\x03\0\0\ -\x21\0\0\0\x40\0\0\0\xa4\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\ -\0\0\0\0\0\0\0\0\x02\x24\0\0\0\xa8\x03\0\0\x01\0\0\x04\x04\0\0\0\xb3\x03\0\0\ -\x0e\0\0\0\0\0\0\0\x1c\x04\0\0\x01\0\0\x04\x04\0\0\0\x25\x04\0\0\x0e\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x9b\x04\0\0\0\0\0\ -\x0e\x25\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\ -\xaf\x04\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\ -\x12\0\0\0\x20\0\0\0\xc5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\ -\0\0\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xda\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xf1\x04\0\0\0\0\0\x0e\ -\x2d\0\0\0\x01\0\0\0\xf9\x04\0\0\x04\0\0\x0f\0\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\ -\0\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\ -\0\0\x11\0\0\0\x01\x05\0\0\x01\0\0\x0f\0\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\ -\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\ -\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\ -\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\ -\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x2f\x61\x6c\x72\x75\x61\x2f\x62\x75\x69\x6c\ -\x64\x2f\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\ -\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\ 
-\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\ -\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\ -\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\ -\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\ -\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\ -\x65\0\x5f\x5f\x75\x36\x34\0\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\x20\x75\x6e\ -\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x30\x3a\x31\0\x09\x73\x74\x72\x75\ -\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\x20\x63\ -\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\x29\0\ -\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\ -\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\ -\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\x29\ -\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\ -\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\ -\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\ -\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\ -\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\ -\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\ -\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\ -\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\ -\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\ -\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\ -\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\ -\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\ -\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\ 
-\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\ -\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\ -\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\ -\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\ -\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\ -\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\ -\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\ -\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\ -\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\ -\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\ -\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\ -\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\ -\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\ -\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\ -\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\ -\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\ -\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\ -\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\ -\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\ -\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\ -\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\ -\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\ -\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\ -\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\ 
-\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\ -\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\ -\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\ -\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\ -\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ -\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ -\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\ -\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\ -\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\ -\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x9f\ -\xeb\x01\0\x20\0\0\0\0\0\0\0\x24\0\0\0\x24\0\0\0\x44\x02\0\0\x68\x02\0\0\xa4\ -\x01\0\0\x08\0\0\0\x31\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\x62\x02\0\0\x01\0\0\0\ -\0\0\0\0\x17\0\0\0\x10\0\0\0\x31\0\0\0\x09\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\ -\x1e\x40\x01\0\x08\0\0\0\x42\0\0\0\x87\0\0\0\x24\x40\x01\0\x10\0\0\0\x42\0\0\0\ -\xfe\0\0\0\x1d\x48\x01\0\x18\0\0\0\x42\0\0\0\x1f\x01\0\0\x06\x50\x01\0\x20\0\0\ -\0\x42\0\0\0\x2e\x01\0\0\x1d\x44\x01\0\x28\0\0\0\x42\0\0\0\x53\x01\0\0\x06\x5c\ -\x01\0\x38\0\0\0\x42\0\0\0\x66\x01\0\0\x03\x60\x01\0\x70\0\0\0\x42\0\0\0\xec\ -\x01\0\0\x02\x68\x01\0\xf0\0\0\0\x42\0\0\0\x3a\x02\0\0\x01\x70\x01\0\x62\x02\0\ -\0\x1a\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\x1e\x84\x01\0\x08\0\0\0\x42\0\0\0\x87\ -\0\0\0\x24\x84\x01\0\x10\0\0\0\x42\0\0\0\x70\x02\0\0\x1f\x8c\x01\0\x18\0\0\0\ -\x42\0\0\0\x94\x02\0\0\x06\x98\x01\0\x20\0\0\0\x42\0\0\0\xad\x02\0\0\x0e\xa4\ -\x01\0\x28\0\0\0\x42\0\0\0\x2e\x01\0\0\x1d\x88\x01\0\x30\0\0\0\x42\0\0\0\x53\ -\x01\0\0\x06\xa8\x01\0\x40\0\0\0\x42\0\0\0\xbf\x02\0\0\x03\xac\x01\0\x80\0\0\0\ -\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xb8\0\0\0\x42\0\0\0\x6a\x03\0\0\x06\x08\ 
-\x01\0\xd0\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\x42\0\0\0\xbb\x03\0\0\x0f\ -\x14\x01\0\xe0\0\0\0\x42\0\0\0\xd0\x03\0\0\x2d\x18\x01\0\xf0\0\0\0\x42\0\0\0\ -\x07\x04\0\0\x0d\x10\x01\0\0\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x08\x01\0\0\x42\ -\0\0\0\xd0\x03\0\0\x02\x18\x01\0\x20\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\ -\0\x38\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x40\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\ -\x1c\x01\0\x58\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\0\x60\x01\0\0\x42\0\0\ -\0\x5c\x04\0\0\x1b\x20\x01\0\x68\x01\0\0\x42\0\0\0\x5c\x04\0\0\x06\x20\x01\0\ -\x70\x01\0\0\x42\0\0\0\x7f\x04\0\0\x0d\x28\x01\0\x78\x01\0\0\x42\0\0\0\0\0\0\0\ -\0\0\0\0\x80\x01\0\0\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xf8\x01\0\0\x42\0\0\0\ -\x3a\x02\0\0\x01\xc4\x01\0\x10\0\0\0\x31\0\0\0\x07\0\0\0\0\0\0\0\x02\0\0\0\x3e\ -\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\0\0\xfa\0\ -\0\0\0\0\0\0\x20\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x70\0\0\0\x0d\0\0\0\x3e\0\ -\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\0\xfa\0\0\0\0\0\0\0\xa0\0\0\0\x0d\0\0\0\x2a\x01\ -\0\0\0\0\0\0\x62\x02\0\0\x12\0\0\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\ -\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xfa\0\0\0\0\0\0\0\x20\0\0\0\ -\x18\0\0\0\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x80\0\0\0\ -\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\xa8\0\0\0\ -\x1a\0\0\0\x62\x03\0\0\0\0\0\0\xb0\0\0\0\x1a\0\0\0\x66\x03\0\0\0\0\0\0\xc0\0\0\ -\0\x1f\0\0\0\x94\x03\0\0\0\0\0\0\xd8\0\0\0\x20\0\0\0\xfa\0\0\0\0\0\0\0\xf0\0\0\ -\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x18\x01\0\0\x24\0\0\0\x3e\0\0\0\0\0\0\0\x50\x01\ -\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\x60\x01\0\0\x20\0\0\0\x56\x04\0\0\0\0\0\0\x88\ -\x01\0\0\x1a\0\0\0\x2a\x01\0\0\0\0\0\0\x98\x01\0\0\x1a\0\0\0\x97\x04\0\0\0\0\0\ -\0\xa0\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\x91\0\0\0\x04\0\xf1\xff\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xe6\0\0\ 
-\0\0\0\x02\0\x70\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\0\0\x02\0\xf0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\xdf\0\0\0\0\0\x03\0\x78\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\xd1\0\0\0\0\0\x03\0\x80\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xca\0\0\0\0\0\x03\0\ -\xf8\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x14\0\0\0\x01\0\x04\0\0\0\0\0\0\0\0\0\x23\ -\0\0\0\0\0\0\0\x04\x01\0\0\x01\0\x04\0\x23\0\0\0\0\0\0\0\x0e\0\0\0\0\0\0\0\x28\ -\0\0\0\x01\0\x04\0\x31\0\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\xed\0\0\0\x01\0\x04\0\ -\x51\0\0\0\0\0\0\0\x11\0\0\0\0\0\0\0\0\0\0\0\x03\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\x03\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\ -\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xc2\0\0\0\x11\0\x05\0\0\0\0\0\0\0\0\0\ -\x04\0\0\0\0\0\0\0\x3d\0\0\0\x12\0\x02\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\x5b\ -\0\0\0\x12\0\x03\0\0\0\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\x48\0\0\0\0\0\0\0\x01\0\ -\0\0\x0d\0\0\0\xc8\0\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\x50\0\0\0\0\0\0\0\x01\0\0\ -\0\x0d\0\0\0\xd0\x01\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\xf0\x03\0\0\0\0\0\0\x0a\0\ -\0\0\x0d\0\0\0\xfc\x03\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x08\x04\0\0\0\0\0\0\x0a\ -\0\0\0\x0d\0\0\0\x14\x04\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x2c\x04\0\0\0\0\0\0\0\ -\0\0\0\x0e\0\0\0\x2c\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x3c\0\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x50\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x60\0\0\0\0\0\0\0\0\0\0\0\x0b\0\ -\0\0\x70\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\ -\x90\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xb0\0\ -\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xd0\0\0\0\0\ -\0\0\0\0\0\0\0\x0b\0\0\0\xe8\0\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\0\0\0\0\0\0\0\ -\0\0\0\0\x0c\0\0\0\x08\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x01\0\0\0\0\0\0\0\ -\0\0\0\x0c\0\0\0\x28\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x01\0\0\0\0\0\0\0\0\ -\0\0\x0c\0\0\0\x48\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x01\0\0\0\0\0\0\0\0\0\ 
-\0\x0c\0\0\0\x68\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x01\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x88\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x98\x01\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\xa8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xb8\x01\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\xc8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xd8\x01\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\xe8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\x01\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x28\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x02\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x48\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x02\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x68\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x02\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x94\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa4\x02\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\xb4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc4\x02\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\xd4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xe4\x02\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\xf4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x0c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x1c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x2c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x3c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x5c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x6c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x7c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x8c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x9c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xac\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\xbc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xcc\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\xdc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xec\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\xfc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x0c\x04\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x1c\x04\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4d\x4e\x40\x41\x42\x43\x4c\0\ -\x2e\x74\x65\x78\x74\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\x2e\x65\x78\x74\0\x64\ -\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\ -\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\ 
-\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x2e\x72\x65\x6c\x69\x74\x65\ -\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\ -\x72\x6f\x67\0\x2e\x72\x65\x6c\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\ -\x67\0\x2e\x6c\x6c\x76\x6d\x5f\x61\x64\x64\x72\x73\x69\x67\0\x6c\x69\x63\x65\ -\x6e\x73\x65\0\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\ -\x2e\x73\x74\x72\x74\x61\x62\0\x2e\x73\x79\x6d\x74\x61\x62\0\x2e\x72\x6f\x64\ -\x61\x74\x61\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\0\x4c\x49\x43\x45\x4e\x53\x45\0\ -\x4c\x42\x42\x31\x5f\x37\0\x4c\x42\x42\x31\x5f\x36\0\x4c\x42\x42\x30\x5f\x34\0\ -\x4c\x42\x42\x31\x5f\x33\0\x4c\x42\x42\x30\x5f\x33\0\x64\x75\x6d\x70\x5f\x62\ -\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x64\x75\x6d\ -\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x01\0\0\ -\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x4e\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\x6d\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\x40\x01\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\xb1\0\0\0\x01\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x48\x03\0\ -\0\0\0\0\0\x62\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\x89\0\0\0\x01\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xaa\x03\0\0\0\0\0\0\x04\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xbd\0\0\0\x01\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xae\x03\0\0\0\0\0\0\x3d\x09\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x01\0\0\0\0\0\0\0\ 
-\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x0c\0\0\0\0\0\0\x2c\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa9\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\x18\x11\0\0\0\0\0\0\x98\x01\0\0\0\0\0\0\x0e\0\0\0\x0e\0\0\0\x08\0\0\ -\0\0\0\0\0\x18\0\0\0\0\0\0\0\x4a\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\xb0\x12\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x02\0\0\0\x08\0\0\0\0\0\0\0\ -\x10\0\0\0\0\0\0\0\x69\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd0\x12\ -\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x03\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\ -\0\0\0\0\xb9\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xf0\x12\0\0\0\0\0\ -\0\x50\0\0\0\0\0\0\0\x08\0\0\0\x06\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\ -\x07\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\x13\0\0\0\0\0\0\xe0\ -\x03\0\0\0\0\0\0\x08\0\0\0\x07\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x7b\0\ -\0\0\x03\x4c\xff\x6f\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\0\0\0\0\x07\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa1\0\0\0\x03\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x27\x17\0\0\0\0\0\0\x1a\x01\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"; - - return 0; -err: - bpf_object__destroy_skeleton(s); - return -1; -} - -#endif /* __ITERATORS_BPF_SKEL_H__ */ -- cgit v1.2.3 From 18ef5dac934a0fb1aeb7d6bee6c81e1fa6bcc598 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 31 Jan 2022 14:05:27 -0800 Subject: bpf: Open code obj_get_info_by_fd in bpf preload. Open code obj_get_info_by_fd in bpf preload. It's the last part of libbpf that preload/iterators were using. 
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220131220528.98088-7-alexei.starovoitov@gmail.com --- kernel/bpf/preload/iterators/iterators.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c index 23b74916fb84..4dafe0f4f2b2 100644 --- a/kernel/bpf/preload/iterators/iterators.c +++ b/kernel/bpf/preload/iterators/iterators.c @@ -16,6 +16,22 @@ int to_kernel = -1; int from_kernel = 0; +static int __bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len) +{ + union bpf_attr attr; + int err; + + memset(&attr, 0, sizeof(attr)); + attr.info.bpf_fd = bpf_fd; + attr.info.info_len = *info_len; + attr.info.info = (long) info; + + err = skel_sys_bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)); + if (!err) + *info_len = attr.info.info_len; + return err; +} + static int send_link_to_kernel(int link_fd, const char *link_name) { struct bpf_preload_info obj = {}; @@ -23,7 +39,7 @@ static int send_link_to_kernel(int link_fd, const char *link_name) __u32 info_len = sizeof(info); int err; - err = bpf_obj_get_info_by_fd(link_fd, &info, &info_len); + err = __bpf_obj_get_info_by_fd(link_fd, &info, &info_len); if (err) return err; obj.link_id = info.id; -- cgit v1.2.3 From e96f2d64c812d9c20adea38a9b5e08feaa21fcf5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 31 Jan 2022 14:05:28 -0800 Subject: bpf: Drop libbpf, libelf, libz dependency from bpf preload. Drop libbpf, libelf, libz dependency from bpf preload. This reduces bpf_preload_umd binary size from 1.7M to 30k unstripped with debug info and from 300k to 19k stripped. 
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220131220528.98088-8-alexei.starovoitov@gmail.com --- kernel/bpf/preload/Makefile | 28 ++-------------------------- 1 file changed, 2 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/preload/Makefile b/kernel/bpf/preload/Makefile index 1400ac58178e..baf47d9c7557 100644 --- a/kernel/bpf/preload/Makefile +++ b/kernel/bpf/preload/Makefile @@ -1,40 +1,16 @@ # SPDX-License-Identifier: GPL-2.0 LIBBPF_SRCS = $(srctree)/tools/lib/bpf/ -LIBBPF_OUT = $(abspath $(obj))/libbpf -LIBBPF_A = $(LIBBPF_OUT)/libbpf.a -LIBBPF_DESTDIR = $(LIBBPF_OUT) -LIBBPF_INCLUDE = $(LIBBPF_DESTDIR)/include - -# Although not in use by libbpf's Makefile, set $(O) so that the "dummy" test -# in tools/scripts/Makefile.include always succeeds when building the kernel -# with $(O) pointing to a relative path, as in "make O=build bindeb-pkg". -$(LIBBPF_A): | $(LIBBPF_OUT) - $(Q)$(MAKE) -C $(LIBBPF_SRCS) O=$(LIBBPF_OUT)/ OUTPUT=$(LIBBPF_OUT)/ \ - DESTDIR=$(LIBBPF_DESTDIR) prefix= \ - $(LIBBPF_OUT)/libbpf.a install_headers - -libbpf_hdrs: $(LIBBPF_A) - -.PHONY: libbpf_hdrs - -$(LIBBPF_OUT): - $(call msg,MKDIR,$@) - $(Q)mkdir -p $@ +LIBBPF_INCLUDE = $(LIBBPF_SRCS)/.. 
userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \ -I $(LIBBPF_INCLUDE) -Wno-unused-result userprogs := bpf_preload_umd -clean-files := libbpf/ - -$(obj)/iterators/iterators.o: | libbpf_hdrs - bpf_preload_umd-objs := iterators/iterators.o -bpf_preload_umd-userldlibs := $(LIBBPF_A) -lelf -lz -$(obj)/bpf_preload_umd: $(LIBBPF_A) +$(obj)/bpf_preload_umd: $(obj)/bpf_preload_umd_blob.o: $(obj)/bpf_preload_umd -- cgit v1.2.3 From 4d266c247d56751c2c97e0c411212b59e90922fc Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Mon, 13 Dec 2021 11:40:24 +0530 Subject: rcu/exp: Fix check for idle context in rcu_exp_handler For PREEMPT_RCU, the rcu_exp_handler() function checks whether the current CPU is in idle, by calling rcu_dynticks_curr_cpu_in_eqs(). However, rcu_exp_handler() is called in IPI handler context. So, it should be checking the idle context using rcu_is_cpu_rrupt_from_idle(). Fix this by using rcu_is_cpu_rrupt_from_idle() instead of rcu_dynticks_curr_cpu_in_eqs(). Non-preempt configuration already uses the correct check. Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 237a79989aba..1568c8ef185b 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -656,7 +656,7 @@ static void rcu_exp_handler(void *unused) */ if (!depth) { if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || - rcu_dynticks_curr_cpu_in_eqs()) { + rcu_is_cpu_rrupt_from_idle()) { rcu_report_exp_rdp(rdp); } else { WRITE_ONCE(rdp->cpu_no_qs.b.exp, true); -- cgit v1.2.3 From 63c564da11cbed96ec6cf0b5faf6af0b7e3624d2 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 13 Dec 2021 21:00:02 -0800 Subject: rcu: Mark ->expmask access in synchronize_rcu_expedited_wait() This commit adds a READ_ONCE() to an access to the rcu_node structure's ->expmask field to prevent compiler mischief. Detected by KCSAN. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 1568c8ef185b..60197ea24ceb 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -502,7 +502,8 @@ static void synchronize_rcu_expedited_wait(void) if (synchronize_rcu_expedited_wait_once(1)) return; rcu_for_each_leaf_node(rnp) { - for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + mask = READ_ONCE(rnp->expmask); + for_each_leaf_node_cpu_mask(rnp, cpu, mask) { rdp = per_cpu_ptr(&rcu_data, cpu); if (rdp->rcu_forced_tick_exp) continue; -- cgit v1.2.3 From a47f9f131dfe4f765e385fa90e13032eadb00bac Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Dec 2021 11:05:07 -0800 Subject: rcu: Mark accesses to boost_starttime The boost_starttime shared variable has conflicting unmarked C-language accesses, which are dangerous at best. This commit therefore adds appropriate marking. This was found by KCSAN. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 422f7e4cc08d..829ae0b7d3c0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -997,7 +997,7 @@ static int rcu_torture_boost(void *arg) goto checkwait; /* Wait for the next test interval. 
*/ - oldstarttime = boost_starttime; + oldstarttime = READ_ONCE(boost_starttime); while (time_before(jiffies, oldstarttime)) { schedule_timeout_interruptible(oldstarttime - jiffies); if (stutter_wait("rcu_torture_boost")) @@ -1041,10 +1041,11 @@ static int rcu_torture_boost(void *arg) * interval. Besides, we are running at RT priority, * so delays should be relatively rare. */ - while (oldstarttime == boost_starttime && !kthread_should_stop()) { + while (oldstarttime == READ_ONCE(boost_starttime) && !kthread_should_stop()) { if (mutex_trylock(&boost_mutex)) { if (oldstarttime == boost_starttime) { - boost_starttime = jiffies + test_boost_interval * HZ; + WRITE_ONCE(boost_starttime, + jiffies + test_boost_interval * HZ); n_rcu_torture_boosts++; } mutex_unlock(&boost_mutex); -- cgit v1.2.3 From 02e3024175274ed4bf7912e7a1281b300cec76b5 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Sat, 11 Dec 2021 22:31:39 +0530 Subject: rcu/nocb: Handle concurrent nocb kthreads creation When multiple CPUs in the same nocb gp/cb group concurrently come online, they might try to concurrently create the same rcuog kthread. Fix this by using nocb gp CPU's spawn mutex to provide mutual exclusion for the rcuog kthread creation code. [ paulmck: Whitespace fixes per kernel test robot feedback. ] Acked-by: David Woodhouse Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 2 ++ kernel/rcu/tree_nocb.h | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 486fc901bd08..24dd4b0d805f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -203,6 +203,8 @@ struct rcu_data { int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ struct timer_list nocb_timer; /* Enforce finite deferral. */ unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). 
*/ + struct mutex nocb_gp_kthread_mutex; /* Exclusion for nocb gp kthread */ + /* spawning */ /* The following fields are used by call_rcu, hence own cacheline. */ raw_spinlock_t nocb_bypass_lock ____cacheline_internodealigned_in_smp; diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index eeafb546a7a0..1e40519d1a05 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1226,6 +1226,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) raw_spin_lock_init(&rdp->nocb_gp_lock); timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); rcu_cblist_init(&rdp->nocb_bypass); + mutex_init(&rdp->nocb_gp_kthread_mutex); } /* @@ -1248,13 +1249,17 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) /* If we didn't spawn the GP kthread first, reorganize! */ rdp_gp = rdp->nocb_gp_rdp; + mutex_lock(&rdp_gp->nocb_gp_kthread_mutex); if (!rdp_gp->nocb_gp_kthread) { t = kthread_run(rcu_nocb_gp_kthread, rdp_gp, "rcuog/%d", rdp_gp->cpu); - if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) + if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) { + mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); return; + } WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); } + mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); /* Spawn the kthread for this CPU. */ t = kthread_run(rcu_nocb_cb_kthread, rdp, -- cgit v1.2.3 From eae9f147a4b02e132187a2d88a403b9ccc28212a Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Mon, 13 Dec 2021 12:32:09 +0530 Subject: rcu: Remove unused rcu_state.boost Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.h | 5 ++--- kernel/rcu/tree_plugin.h | 2 -- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 24dd4b0d805f..e9990945483f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -304,9 +304,8 @@ struct rcu_state { /* The following fields are guarded by the root rcu_node's lock. */ - u8 boost ____cacheline_internodealigned_in_smp; - /* Subject to priority boost. */ - unsigned long gp_seq; /* Grace-period sequence #. */ + unsigned long gp_seq ____cacheline_internodealigned_in_smp; + /* Grace-period sequence #. */ unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ struct task_struct *gp_kthread; /* Task for grace periods. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c5b45c2f68a1..109429e70a64 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1175,8 +1175,6 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) if (rnp->boost_kthread_task || !rcu_scheduler_fully_active) return; - rcu_state.boost = 1; - t = kthread_create(rcu_boost_kthread, (void *)rnp, "rcub/%d", rnp_index); if (WARN_ON_ONCE(IS_ERR(t))) -- cgit v1.2.3 From 4b4399b2450de38916718ba9947e6cdb69c99c55 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 29 Dec 2021 00:05:10 +0800 Subject: rcu: Create per-cpu rcuc kthreads only when rcutree.use_softirq=0 The per-CPU "rcuc" kthreads are used only by kernels booted with rcutree.use_softirq=0, but they are nevertheless unconditionally created by kernels built with CONFIG_RCU_BOOST=y. This results in "rcuc" kthreads being created that are never actually used. This commit therefore refrains from creating these kthreads unless the kernel is actually booted with rcutree.use_softirq=0. Acked-by: Sebastian Andrzej Siewior Signed-off-by: Zqiang Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4c25a6283b0..4e5e37e5ee3c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2894,7 +2894,7 @@ static int __init rcu_spawn_core_kthreads(void) for_each_possible_cpu(cpu) per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0; - if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq) + if (use_softirq) return 0; WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__); -- cgit v1.2.3 From c8db27dd0ea8071d2ea29a1a401c4ccc611ec6c1 Mon Sep 17 00:00:00 2001 From: Alison Chaiken Date: Tue, 11 Jan 2022 15:32:50 -0800 Subject: rcu: Move kthread_prio bounds-check to a separate function Move the bounds-check of the kthread_prio cmdline parameter to a new function in order to facilitate a different callsite. Signed-off-by: Alison Chaiken Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4e5e37e5ee3c..5bf0312f6676 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4440,26 +4440,10 @@ static int rcu_pm_notify(struct notifier_block *self, static int __init rcu_spawn_gp_kthread(void) { unsigned long flags; - int kthread_prio_in = kthread_prio; struct rcu_node *rnp; struct sched_param sp; struct task_struct *t; - /* Force priority into range.
*/ - if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2 - && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) - kthread_prio = 2; - else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) - kthread_prio = 1; - else if (kthread_prio < 0) - kthread_prio = 0; - else if (kthread_prio > 99) - kthread_prio = 99; - - if (kthread_prio != kthread_prio_in) - pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n", - kthread_prio, kthread_prio_in); - rcu_scheduler_fully_active = 1; t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name); if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__)) @@ -4584,6 +4568,28 @@ static void __init rcu_init_one(void) } } +/* + * Force priority from the kernel command-line into range. + */ +static void __init sanitize_kthread_prio(void) +{ + int kthread_prio_in = kthread_prio; + + if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2 + && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) + kthread_prio = 2; + else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) + kthread_prio = 1; + else if (kthread_prio < 0) + kthread_prio = 0; + else if (kthread_prio > 99) + kthread_prio = 99; + + if (kthread_prio != kthread_prio_in) + pr_alert("%s: Limited prio to %d from %d\n", + __func__, kthread_prio, kthread_prio_in); +} + /* * Compute the rcu_node tree geometry from kernel parameters. This cannot * replace the definitions in tree.h because those are needed to size @@ -4744,6 +4750,7 @@ void __init rcu_init(void) kfree_rcu_batch_init(); rcu_bootup_announce(); + sanitize_kthread_prio(); rcu_init_geometry(); rcu_init_one(); if (dump_tree) -- cgit v1.2.3 From 54577e23fa0791599db1a3d86fc8e7a205d3da75 Mon Sep 17 00:00:00 2001 From: Alison Chaiken Date: Tue, 11 Jan 2022 15:32:51 -0800 Subject: rcu: Make priority of grace-period thread consistent The priority of RCU grace period threads is set to kthread_prio when they are launched from rcu_spawn_gp_kthread(). 
The same is not true of rcu_spawn_one_nocb_kthread(). Accordingly, add priority elevation to rcu_spawn_one_nocb_kthread(). Signed-off-by: Alison Chaiken Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_nocb.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 1e40519d1a05..ea889cbfc3b9 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1239,6 +1239,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_data *rdp_gp; struct task_struct *t; + struct sched_param sp; if (!rcu_scheduler_fully_active || !rcu_nocb_is_setup) return; @@ -1248,6 +1249,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) return; /* If we didn't spawn the GP kthread first, reorganize! */ + sp.sched_priority = kthread_prio; rdp_gp = rdp->nocb_gp_rdp; mutex_lock(&rdp_gp->nocb_gp_kthread_mutex); if (!rdp_gp->nocb_gp_kthread) { @@ -1258,6 +1260,8 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) return; } WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); + if (kthread_prio) + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); } mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); -- cgit v1.2.3 From c8b16a65267e35ecc5621dbc81cbe7e5b0992fce Mon Sep 17 00:00:00 2001 From: Alison Chaiken Date: Tue, 11 Jan 2022 15:32:52 -0800 Subject: rcu: Elevate priority of offloaded callback threads When CONFIG_PREEMPT_RT=y, the rcutree.kthread_prio command-line parameter signals initialization code to boost the priority of rcuc callbacks to the designated value. With the additional CONFIG_RCU_NOCB_CPU=y configuration and an additional rcu_nocbs command-line parameter, the callbacks on the listed cores are offloaded to new rcuop kthreads that are not pinned to the cores whose post-grace-period work is performed. While the rcuop kthreads perform the same function as the rcuc kthreads they offload, the kthread_prio parameter only boosts the priority of the rcuc kthreads. 
Fix this inconsistency by elevating rcuop kthreads to the same priority as the rcuc kthreads. Signed-off-by: Alison Chaiken Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_nocb.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5bf0312f6676..9e4c5b281f00 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -153,7 +153,7 @@ static void sync_sched_exp_online_cleanup(int cpu); static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); static bool rcu_rdp_is_offloaded(struct rcu_data *rdp); -/* rcuc/rcub kthread realtime priority */ +/* rcuc/rcub/rcuop kthread realtime priority */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; module_param(kthread_prio, int, 0444); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index ea889cbfc3b9..547c41437c76 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1270,6 +1270,9 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) "rcuo%c/%d", rcu_state.abbr, cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) return; + + if (kthread_prio) + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); WRITE_ONCE(rdp->nocb_cb_kthread, t); WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); } -- cgit v1.2.3 From 10c535787436d62ea28156a4b91365fd89b5a432 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Jan 2022 12:40:08 -0800 Subject: rcu: Don't deboost before reporting expedited quiescent state Currently rcu_preempt_deferred_qs_irqrestore() releases rnp->boost_mtx before reporting the expedited quiescent state. Under heavy real-time load, this can result in this function being preempted before the quiescent state is reported, which can in turn prevent the expedited grace period from completing. 
Tim Murray reports that the resulting expedited grace periods can take hundreds of milliseconds and even more than one second, when they should normally complete in less than a millisecond. This was fine given that there were no particular response-time constraints for synchronize_rcu_expedited(), as it was designed for throughput rather than latency. However, some users now need sub-100-millisecond response-time constraints. This patch therefore follows Neeraj's suggestion (seconded by Tim and by Uladzislau Rezki) of simply reversing the two operations. Reported-by: Tim Murray Reported-by: Joel Fernandes Reported-by: Neeraj Upadhyay Reviewed-by: Neeraj Upadhyay Reviewed-by: Uladzislau Rezki (Sony) Tested-by: Tim Murray Cc: Todd Kjos Cc: Sandeep Patil Cc: # 5.4.x Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 109429e70a64..02ac057ba3f8 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -556,16 +556,16 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } - /* Unboost if we were boosted. */ - if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex) - rt_mutex_futex_unlock(&rnp->boost_mtx.rtmutex); - /* * If this was the last task on the expedited lists, * then we need to report up the rcu_node hierarchy. */ if (!empty_exp && empty_exp_now) rcu_report_exp_rnp(rnp, true); + + /* Unboost if we were boosted.
*/ + if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex) + rt_mutex_futex_unlock(&rnp->boost_mtx.rtmutex); } else { local_irq_restore(flags); } -- cgit v1.2.3 From c9515875850fefcc79492c5189fe8431e75ddec5 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Tue, 25 Jan 2022 10:47:44 +0800 Subject: rcu: Add per-CPU rcuc task dumps to RCU CPU stall warnings When the rcutree.use_softirq kernel boot parameter is set to zero, all RCU_SOFTIRQ processing is carried out by the per-CPU rcuc kthreads. If these kthreads are being starved, quiescent states will not be reported, which in turn means that the grace period will not end, which can in turn trigger RCU CPU stall warnings. This commit therefore dumps stack traces of stalled CPUs' rcuc kthreads, which can help identify what is preventing those kthreads from running. Suggested-by: Ammar Faizi Reviewed-by: Ammar Faizi Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 +++ kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 3 +++ kernel/rcu/tree_stall.h | 35 +++++++++++++++++++++++++++++++++++ 4 files changed, 42 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9e4c5b281f00..bd9b2af247ab 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2850,10 +2850,12 @@ static void rcu_cpu_kthread(unsigned int cpu) { unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status); char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); + unsigned long *j = this_cpu_ptr(&rcu_data.rcuc_activity); int spincnt; trace_rcu_utilization(TPS("Start CPU kthread@rcu_run")); for (spincnt = 0; spincnt < 10; spincnt++) { + WRITE_ONCE(*j, jiffies); local_bh_disable(); *statusp = RCU_KTHREAD_RUNNING; local_irq_disable(); @@ -2874,6 +2876,7 @@ static void rcu_cpu_kthread(unsigned int cpu) schedule_timeout_idle(2); trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); *statusp = RCU_KTHREAD_WAITING; + WRITE_ONCE(*j, jiffies); } static struct smp_hotplug_thread 
rcu_cpu_thread_spec = { diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e9990945483f..b84cc5742c31 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -239,6 +239,7 @@ struct rcu_data { /* rcuc per-CPU kthread or NULL. */ unsigned int rcu_cpu_kthread_status; char rcu_cpu_has_work; + unsigned long rcuc_activity; /* 7) Diagnostic data, including RCU CPU stall warnings. */ unsigned int softirq_snap; /* Snapshot of softirq activity. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 02ac057ba3f8..8167cab1bffc 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -996,12 +996,15 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) */ static void rcu_cpu_kthread_setup(unsigned int cpu) { + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); #ifdef CONFIG_RCU_BOOST struct sched_param sp; sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); #endif /* #ifdef CONFIG_RCU_BOOST */ + + WRITE_ONCE(rdp->rcuc_activity, jiffies); } #ifdef CONFIG_RCU_BOOST diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 21bebf7c9030..0c5d8516516a 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -379,6 +379,15 @@ static bool rcu_is_gp_kthread_starving(unsigned long *jp) return j > 2 * HZ; } +static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp) +{ + unsigned long j = jiffies - READ_ONCE(rdp->rcuc_activity); + + if (jp) + *jp = j; + return j > 2 * HZ; +} + /* * Print out diagnostic information for the specified stalled CPU. * @@ -430,6 +439,29 @@ static void print_cpu_stall_info(int cpu) falsepositive ? 
" (false positive?)" : ""); } +static void rcuc_kthread_dump(struct rcu_data *rdp) +{ + int cpu; + unsigned long j; + struct task_struct *rcuc; + + rcuc = rdp->rcu_cpu_kthread_task; + if (!rcuc) + return; + + cpu = task_cpu(rcuc); + if (cpu_is_offline(cpu) || idle_cpu(cpu)) + return; + + if (!rcu_is_rcuc_kthread_starving(rdp, &j)) + return; + + pr_err("%s kthread starved for %ld jiffies\n", rcuc->comm, j); + sched_show_task(rcuc); + if (!trigger_single_cpu_backtrace(cpu)) + dump_cpu_task(cpu); +} + /* Complain about starvation of grace-period kthread. */ static void rcu_check_gp_kthread_starvation(void) { @@ -601,6 +633,9 @@ static void print_cpu_stall(unsigned long gps) rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); + if (!use_softirq) + rcuc_kthread_dump(rdp); + rcu_dump_cpu_stacks(); raw_spin_lock_irqsave_rcu_node(rnp, flags); -- cgit v1.2.3 From 6f81bd6a4e305d15d9c2a6a350e2876a7a814d7e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 6 Dec 2021 15:12:14 -0800 Subject: rcutorture: Print message before invoking ->cb_barrier() The various ->cb_barrier() functions, for example, rcu_barrier(), sometimes cause rcutorture hangs. But currently, the last console message is the unenlightening "Stopping rcu_torture_stats". This commit therefore prints a message of the form "rcu_torture_cleanup: Invoking rcu_barrier+0x0/0x1e0()" to help point people in the right direction. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 422f7e4cc08d..00400aef5818 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2905,8 +2905,10 @@ rcu_torture_cleanup(void) int i; if (torture_cleanup_begin()) { - if (cur_ops->cb_barrier != NULL) + if (cur_ops->cb_barrier != NULL) { + pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier); cur_ops->cb_barrier(); + } return; } if (!cur_ops) { @@ -2961,8 +2963,10 @@ rcu_torture_cleanup(void) * Wait for all RCU callbacks to fire, then do torture-type-specific * cleanup operations. */ - if (cur_ops->cb_barrier != NULL) + if (cur_ops->cb_barrier != NULL) { + pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier); cur_ops->cb_barrier(); + } if (cur_ops->cleanup != NULL) cur_ops->cleanup(); -- cgit v1.2.3 From 2b4a7f20f160e6440848c62a70ee5dc5237a2c8b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 16 Dec 2021 12:23:31 -0800 Subject: torture: Distinguish kthread stopping and being asked to stop Right now, if a given kthread (call it "kthread") realizes that it needs to stop, "Stopping kthread" is written to the console. When the cleanup code decides that it is time to stop that kthread, "Stopping kthread tasks" is written to the console. These two events might happen in either order, especially in the case of time-based torture-test shutdown. But it is hard to distinguish these, especially for those unfamiliar with the torture tests. This commit therefore changes the first case from "Stopping kthread" to "kthread is stopping" to make things more clear. Signed-off-by: Paul E. 
McKenney --- kernel/torture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index ef27a6c82451..f55d803f995d 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -911,7 +911,7 @@ void torture_kthread_stopping(char *title) { char buf[128]; - snprintf(buf, sizeof(buf), "Stopping %s", title); + snprintf(buf, sizeof(buf), "%s is stopping", title); VERBOSE_TOROUT_STRING(buf); while (!kthread_should_stop()) { torture_shutdown_absorb(title); -- cgit v1.2.3 From 05b724655bf1908abf531dd0bce455e55703a3a8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 16 Dec 2021 15:36:02 -0800 Subject: rcutorture: Increase visibility of forward-progress hangs This commit adds a few pr_alert() calls to rcutorture's forward-progress testing in order to better diagnose shutdown-time hangs. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 00400aef5818..fefc3fa1a9c2 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2281,6 +2281,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp, unsigned long stopat; static DEFINE_TORTURE_RANDOM(trs); + pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id); if (!cur_ops->sync) return; // Cannot do need_resched() forward progress testing without ->sync. if (cur_ops->call && cur_ops->cb_barrier) { @@ -2325,6 +2326,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp, if (selfpropcb) { WRITE_ONCE(fcs.stop, 1); cur_ops->sync(); /* Wait for running CB to complete. */ + pr_alert("%s: Waiting for CBs: %pS() %d\n", __func__, cur_ops->cb_barrier, rfp->rcu_fwd_id); cur_ops->cb_barrier(); /* Wait for queued callbacks. 
*/ } @@ -2353,6 +2355,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) unsigned long stopat; unsigned long stoppedat; + pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id); if (READ_ONCE(rcu_fwd_emergency_stop)) return; /* Get out of the way quickly, no GP wait! */ if (!cur_ops->call) @@ -2414,6 +2417,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) n_launders_cb_snap = READ_ONCE(rfp->n_launders_cb); cver = READ_ONCE(rcu_torture_current_version) - cver; gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); + pr_alert("%s: Waiting for CBs: %pS() %d\n", __func__, cur_ops->cb_barrier, rfp->rcu_fwd_id); cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ (void)rcu_torture_fwd_prog_cbfree(rfp); -- cgit v1.2.3 From e22ef8df415d924428e35c9c112526306e684adc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 17 Dec 2021 12:33:53 -0800 Subject: rcutorture: Make rcu_fwd_cb_nodelay be a counter Back when only one rcutorture kthread could do forward-progress testing, it was just fine for rcu_fwd_cb_nodelay to be a non-atomic bool. It was set at the start of forward-progress testing and cleared at the end. But now that there are multiple threads, the value can be cleared while one of the threads is still doing forward-progress testing. This commit therefore makes rcu_fwd_cb_nodelay be an atomic counter, replacing the WRITE_ONCE() operations with atomic_inc() and atomic_dec(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index fefc3fa1a9c2..afe95c694895 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -284,7 +284,7 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. 
*/ static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); -static bool rcu_fwd_cb_nodelay; /* Short rcu_torture_delay() delays. */ +static atomic_t rcu_fwd_cb_nodelay; /* Short rcu_torture_delay() delays. */ /* * Allocate an element from the rcu_tortures pool. @@ -387,7 +387,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) * period, and we want a long delay occasionally to trigger * force_quiescent_state. */ - if (!READ_ONCE(rcu_fwd_cb_nodelay) && + if (!atomic_read(&rcu_fwd_cb_nodelay) && !(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); @@ -1276,7 +1276,7 @@ rcu_torture_writer(void *arg) boot_ended = rcu_inkernel_boot_has_ended(); stutter_waited = stutter_wait("rcu_torture_writer"); if (stutter_waited && - !READ_ONCE(rcu_fwd_cb_nodelay) && + !atomic_read(&rcu_fwd_cb_nodelay) && !cur_ops->slow_gps && !torture_must_stop() && boot_ended) @@ -2290,7 +2290,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp, } /* Tight loop containing cond_resched(). */ - WRITE_ONCE(rcu_fwd_cb_nodelay, true); + atomic_inc(&rcu_fwd_cb_nodelay); cur_ops->sync(); /* Later readers see above write. */ if (selfpropcb) { WRITE_ONCE(fcs.stop, 0); @@ -2335,7 +2335,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp, destroy_rcu_head_on_stack(&fcs.rh); } schedule_timeout_uninterruptible(HZ / 10); /* Let kthreads recover. */ - WRITE_ONCE(rcu_fwd_cb_nodelay, false); + atomic_dec(&rcu_fwd_cb_nodelay); } /* Carry out call_rcu() forward-progress testing. */ @@ -2362,7 +2362,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) return; /* Can't do call_rcu() fwd prog without ->call. */ /* Loop continuously posting RCU callbacks. */ - WRITE_ONCE(rcu_fwd_cb_nodelay, true); + atomic_inc(&rcu_fwd_cb_nodelay); cur_ops->sync(); /* Later readers see above write. 
*/ WRITE_ONCE(rfp->rcu_fwd_startat, jiffies); stopat = rfp->rcu_fwd_startat + MAX_FWD_CB_JIFFIES; @@ -2435,7 +2435,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) } schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */ tick_dep_clear_task(current, TICK_DEP_BIT_RCU); - WRITE_ONCE(rcu_fwd_cb_nodelay, false); + atomic_dec(&rcu_fwd_cb_nodelay); } -- cgit v1.2.3 From 02b51a1cf47977d8772c7dcc363ef6a1e6e59f21 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 17 Dec 2021 15:05:05 -0800 Subject: rcutorture: Add end-of-test check to rcu_torture_fwd_prog() loop The second and subsequent forward-progress kthreads loop waiting for the first forward-progress kthread to start the next test interval. Unfortunately, if the test ends while one of those kthreads is waiting, the test will hang. This hang occurs because that wait loop fails to check for the end of the test. This commit therefore adds an end-of-test check to that wait loop. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index afe95c694895..e99658efd97f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2515,7 +2515,7 @@ static int rcu_torture_fwd_prog(void *args) firsttime = false; WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1); } else { - while (READ_ONCE(rcu_fwd_seq) == oldseq) + while (READ_ONCE(rcu_fwd_seq) == oldseq && !torture_must_stop()) schedule_timeout_interruptible(1); oldseq = READ_ONCE(rcu_fwd_seq); } -- cgit v1.2.3 From 89440d2dad0cc2a781290470cb90402ebba481fc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 28 Dec 2021 15:59:38 -0800 Subject: rcutorture: Fix rcu_fwd_mutex deadlock The rcu_torture_fwd_cb_hist() function acquires rcu_fwd_mutex, but is invoked from the rcutorture_oom_notify() function, which holds this same mutex across this call. This commit fixes the resulting deadlock.
Reported-by: kernel test robot Tested-by: Oliver Sang Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index e99658efd97f..1c8f40b90f70 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2180,7 +2180,6 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--) if (rfp->n_launders_hist[i].n_launders > 0) break; - mutex_lock(&rcu_fwd_mutex); // Serialize histograms. pr_alert("%s: Callback-invocation histogram %d (duration %lu jiffies):", __func__, rfp->rcu_fwd_id, jiffies - rfp->rcu_fwd_startat); gps_old = rfp->rcu_launder_gp_seq_start; @@ -2193,7 +2192,6 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) gps_old = gps; } pr_cont("\n"); - mutex_unlock(&rcu_fwd_mutex); } /* Callback function for continuous-flood RCU callbacks. */ @@ -2431,7 +2429,9 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) n_launders, n_launders_sa, n_max_gps, n_max_cbs, cver, gps); atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs); + mutex_lock(&rcu_fwd_mutex); // Serialize histograms. rcu_torture_fwd_cb_hist(rfp); + mutex_unlock(&rcu_fwd_mutex); } schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */ tick_dep_clear_task(current, TICK_DEP_BIT_RCU); -- cgit v1.2.3 From 6b8646a9d37c6324cf994dbefb75f3eb20b109ab Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Jan 2022 06:07:09 -0800 Subject: torture: Wake up kthreads after storing task_struct pointer Currently, _torture_create_kthread() uses kthread_run() to create torture-test kthreads, which means that the resulting task_struct pointer is stored after the newly created kthread has been marked runnable. This in turn can cause spurious failure of checks for code being run by a particular kthread. 
This commit therefore changes _torture_create_kthread() to use kthread_create(), then to do an explicit wake_up_process() after the task_struct pointer has been stored. Reported-by: Frederic Weisbecker Reviewed-by: Neeraj Upadhyay Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/torture.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index f55d803f995d..789aeb0e1159 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -931,12 +931,14 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, int ret = 0; VERBOSE_TOROUT_STRING(m); - *tp = kthread_run(fn, arg, "%s", s); + *tp = kthread_create(fn, arg, "%s", s); if (IS_ERR(*tp)) { ret = PTR_ERR(*tp); TOROUT_ERRSTRING(f); *tp = NULL; + return ret; } + wake_up_process(*tp); // Process is sleeping, so ordering provided. torture_shuffle_task_register(*tp); return ret; } -- cgit v1.2.3 From 9c0f1c7fd7c6e1e0f5b84e20c577fbab62563d03 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 Jan 2022 20:29:10 -0800 Subject: rcutorture: Enable limited callback-flooding tests of SRCU This commit allows up to 50,000 callbacks worth of callback-flooding tests of SRCU. The goal of this change is to exercise Tree SRCU's ability to transition from SRCU_SIZE_SMALL to SRCU_SIZE_BIG triggered by callback-queue-time lock contention. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 1c8f40b90f70..b41db719085e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -674,6 +674,7 @@ static struct rcu_torture_ops srcu_ops = { .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, + .cbflood_max = 50000, .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .name = "srcu" @@ -708,6 +709,7 @@ static struct rcu_torture_ops srcud_ops = { .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, + .cbflood_max = 50000, .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .name = "srcud" -- cgit v1.2.3 From fc153c1c58cb8c3bb3b443b4d7dc3211ff5f65fc Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sun, 5 Dec 2021 22:38:15 -0500 Subject: clocksource: Add a Kconfig option for WATCHDOG_MAX_SKEW A watchdog maximum skew of 100us may still be too small for some systems or archs. It may also be too small when some kernel debug config options are enabled. So add a new Kconfig option CLOCKSOURCE_WATCHDOG_MAX_SKEW_US to allow kernel builders to have more control on the threshold for marking clocksource as unstable. Signed-off-by: Waiman Long Signed-off-by: Paul E. McKenney --- kernel/time/Kconfig | 9 +++++++++ kernel/time/clocksource.c | 8 +++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 04bfd62f5e5c..27b7868b5c30 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -181,5 +181,14 @@ config HIGH_RES_TIMERS hardware is not capable then this option only increases the size of the kernel image. 
+config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US + int "Clocksource watchdog maximum allowable skew (in μs)" + depends on CLOCKSOURCE_WATCHDOG + range 50 1000 + default 100 + help + Specify the maximum amount of allowable watchdog skew in + microseconds before reporting the clocksource to be unstable. + endmenu endif diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 1cf73807b450..95d7ca35bdf2 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -107,7 +107,13 @@ static u64 suspend_start; * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as * a lower bound for cs->uncertainty_margin values when registering clocks. */ -#define WATCHDOG_MAX_SKEW (100 * NSEC_PER_USEC) +#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US +#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US +#else +#define MAX_SKEW_USEC 100 +#endif + +#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC) #ifdef CONFIG_CLOCKSOURCE_WATCHDOG static void clocksource_watchdog_work(struct work_struct *work); -- cgit v1.2.3 From bfdf4e6208051ed7165b2e92035b4bf11f43eb63 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 27 Jan 2022 10:27:20 -0500 Subject: rseq: Remove broken uapi field layout on 32-bit little endian The rseq rseq_cs.ptr.{ptr32,padding} uapi endianness handling is entirely wrong on 32-bit little endian: a preprocessor logic mistake wrongly uses the big endian field layout on 32-bit little endian architectures. Fortunately, those ptr32 accessors were never used within the kernel, and only meant as a convenience for user-space. Remove those and replace the whole rseq_cs union by a __u64 type, as this is the only thing really needed to express the ABI. Document how 32-bit architectures are meant to interact with this field. 
Fixes: ec9c82e03a74 ("rseq: uapi: Declare rseq_cs field as union, update includes") Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220127152720.25898-1-mathieu.desnoyers@efficios.com --- include/uapi/linux/rseq.h | 20 ++++---------------- kernel/rseq.c | 8 ++++---- 2 files changed, 8 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index 9a402fdb60e9..77ee207623a9 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -105,23 +105,11 @@ struct rseq { * Read and set by the kernel. Set by user-space with single-copy * atomicity semantics. This field should only be updated by the * thread which registered this data structure. Aligned on 64-bit. + * + * 32-bit architectures should update the low order bits of the + * rseq_cs field, leaving the high order bits initialized to 0. */ - union { - __u64 ptr64; -#ifdef __LP64__ - __u64 ptr; -#else - struct { -#if (defined(__BYTE_ORDER) && (__BYTE_ORDER == __BIG_ENDIAN)) || defined(__BIG_ENDIAN) - __u32 padding; /* Initialized to zero. */ - __u32 ptr32; -#else /* LITTLE */ - __u32 ptr32; - __u32 padding; /* Initialized to zero. */ -#endif /* ENDIAN */ - } ptr; -#endif - } rseq_cs; + __u64 rseq_cs; /* * Restartable sequences flags field. diff --git a/kernel/rseq.c b/kernel/rseq.c index 6d45ac3dae7f..97ac20b4f738 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -128,10 +128,10 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) int ret; #ifdef CONFIG_64BIT - if (get_user(ptr, &t->rseq->rseq_cs.ptr64)) + if (get_user(ptr, &t->rseq->rseq_cs)) return -EFAULT; #else - if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr))) + if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr))) return -EFAULT; #endif if (!ptr) { @@ -217,9 +217,9 @@ static int clear_rseq_cs(struct task_struct *t) * Set rseq_cs to NULL. 
*/ #ifdef CONFIG_64BIT - return put_user(0UL, &t->rseq->rseq_cs.ptr64); + return put_user(0UL, &t->rseq->rseq_cs); #else - if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64))) + if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs))) return -EFAULT; return 0; #endif -- cgit v1.2.3 From c8eaf6ac76f40f6c59fc7d056e2e08c4a57ea9c7 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Fri, 28 Jan 2022 17:50:25 +0800 Subject: sched: move autogroup sysctls into its own file move autogroup sysctls to autogroup.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220128095025.8745-1-nizhen@uniontech.com --- include/linux/sched/sysctl.h | 4 ---- kernel/sched/autogroup.c | 23 +++++++++++++++++++++++ kernel/sched/autogroup.h | 1 + kernel/sysctl.c | 11 ----------- 4 files changed, 24 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index c19dd5a2c05c..3f2b70f8d32c 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -45,10 +45,6 @@ extern unsigned int sysctl_sched_uclamp_util_min_rt_default; extern unsigned int sysctl_sched_cfs_bandwidth_slice; #endif -#ifdef CONFIG_SCHED_AUTOGROUP -extern unsigned int sysctl_sched_autogroup_enabled; -#endif - extern int sysctl_sched_rr_timeslice; extern int sched_rr_timeslice; diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 8629b37d118e..31dd2593145e 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -9,6 +9,28 @@ unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; static struct autogroup autogroup_default; static atomic_t autogroup_seq_nr; +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_autogroup_sysctls[] = { + { + .procname = "sched_autogroup_enabled", + .data = &sysctl_sched_autogroup_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + 
.proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static void __init sched_autogroup_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_autogroup_sysctls); +} +#else +#define sched_autogroup_sysctl_init() do { } while (0) +#endif + void __init autogroup_init(struct task_struct *init_task) { autogroup_default.tg = &root_task_group; @@ -198,6 +220,7 @@ void sched_autogroup_exit(struct signal_struct *sig) static int __init setup_autogroup(char *str) { sysctl_sched_autogroup_enabled = 0; + sched_autogroup_sysctl_init(); return 1; } diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index b96419974a1f..90fcbfdd70c3 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h @@ -27,6 +27,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) { + extern unsigned int sysctl_sched_autogroup_enabled; int enabled = READ_ONCE(sysctl_sched_autogroup_enabled); if (enabled && task_wants_autogroup(p, tg)) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5ae443b2882e..1cb7ca68cd4e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1750,17 +1750,6 @@ static struct ctl_table kern_table[] = { .proc_handler = sysctl_sched_uclamp_handler, }, #endif -#ifdef CONFIG_SCHED_AUTOGROUP - { - .procname = "sched_autogroup_enabled", - .data = &sysctl_sched_autogroup_enabled, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif #ifdef CONFIG_CFS_BANDWIDTH { .procname = "sched_cfs_bandwidth_slice_us", -- cgit v1.2.3 From 3c25fc97f5590060464cabfa25710970ecddbc96 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 31 Jan 2022 11:34:05 +0100 Subject: perf: Copy perf_event_attr::sig_data on modification The intent has always been that perf_event_attr::sig_data should also be 
modifiable along with PERF_EVENT_IOC_MODIFY_ATTRIBUTES, because it is observable by user space if SIGTRAP on events is requested. Currently only PERF_TYPE_BREAKPOINT is modifiable, and explicitly copies relevant breakpoint-related attributes in hw_breakpoint_copy_attr(). This misses copying perf_event_attr::sig_data. Since sig_data is not specific to PERF_TYPE_BREAKPOINT, introduce a helper to copy generic event-type-independent attributes on modification. Fixes: 97ba62b27867 ("perf: Add support for SIGTRAP on perf events") Reported-by: Dmitry Vyukov Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Link: https://lore.kernel.org/r/20220131103407.1971678-1-elver@google.com --- kernel/events/core.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 76c754e45d01..57c7197838db 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3238,6 +3238,15 @@ static int perf_event_modify_breakpoint(struct perf_event *bp, return err; } +/* + * Copy event-type-independent attributes that may be modified. + */ +static void perf_event_modify_copy_attr(struct perf_event_attr *to, + const struct perf_event_attr *from) +{ + to->sig_data = from->sig_data; +} + static int perf_event_modify_attr(struct perf_event *event, struct perf_event_attr *attr) { @@ -3260,10 +3269,17 @@ static int perf_event_modify_attr(struct perf_event *event, WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->child_mutex); + /* + * Event-type-independent attributes must be copied before event-type + * modification, which will validate that final attributes match the + * source attributes after all relevant attributes have been copied. 
+ */ + perf_event_modify_copy_attr(&event->attr, attr); err = func(event, attr); if (err) goto out; list_for_each_entry(child, &event->child_list, child_list) { + perf_event_modify_copy_attr(&child->attr, attr); err = func(child, attr); if (err) goto out; -- cgit v1.2.3 From d680ff24e9e14444c63945b43a37ede7cd6958f9 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 31 Jan 2022 09:24:51 +0200 Subject: perf/core: Fix address filter parser for multiple filters Reset appropriate variables in the parser loop between parsing separate filters, so that they do not interfere with parsing the next filter. Fixes: 375637bc524952 ("perf/core: Introduce address range filtering") Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220131072453.2839535-4-adrian.hunter@intel.com --- kernel/events/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 76c754e45d01..2889b82fb75e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10558,8 +10558,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, } /* ready to consume more filters */ + kfree(filename); + filename = NULL; state = IF_STATE_ACTION; filter = NULL; + kernel = 0; } } -- cgit v1.2.3 From 58b2ff2c18b1e1d7232b8007a5698ec4ee7a7a0d Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 31 Jan 2022 09:24:53 +0200 Subject: perf/core: Allow kernel address filter when not filtering the kernel The so-called 'kernel' address filter can also be useful for filtering fixed addresses in user space. Allow that. 
Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220131072453.2839535-6-adrian.hunter@intel.com --- kernel/events/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 2889b82fb75e..afbf388a5176 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10515,8 +10515,6 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, */ if (state == IF_STATE_END) { ret = -EINVAL; - if (kernel && event->attr.exclude_kernel) - goto fail; /* * ACTION "filter" must have a non-zero length region -- cgit v1.2.3 From 322cbb50de711814c42fb088f6d31901502c711a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:39:13 +0100 Subject: block: remove genhd.h There is no good reason to keep genhd.h separate from the main blkdev.h header that includes it. So fold the contents of genhd.h into blkdev.h and remove genhd.h entirely. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20220124093913.742411-4-hch@lst.de Signed-off-by: Jens Axboe --- Documentation/block/capability.rst | 2 +- arch/m68k/atari/stdma.c | 1 - arch/m68k/bvme6000/config.c | 1 - arch/m68k/emu/nfblock.c | 1 - arch/m68k/kernel/setup_mm.c | 1 - arch/m68k/mvme147/config.c | 1 - arch/m68k/mvme16x/config.c | 1 - block/blk-cgroup.c | 1 - block/disk-events.c | 2 +- block/genhd.c | 1 - block/holder.c | 2 +- block/partitions/check.h | 1 - block/partitions/core.c | 1 - block/partitions/efi.h | 1 - block/partitions/ldm.h | 1 - block/sed-opal.c | 2 +- drivers/base/class.c | 2 +- drivers/base/core.c | 2 +- drivers/base/devtmpfs.c | 2 +- drivers/block/aoe/aoeblk.c | 1 - drivers/block/aoe/aoecmd.c | 1 - drivers/block/drbd/drbd_int.h | 1 - drivers/block/mtip32xx/mtip32xx.c | 1 - drivers/block/mtip32xx/mtip32xx.h | 1 - drivers/block/rnbd/rnbd-srv-sysfs.c | 1 - drivers/block/sunvdc.c | 1 - drivers/block/zram/zram_drv.c | 1 - drivers/cdrom/gdrom.c | 1 - drivers/char/random.c | 2 +- drivers/md/bcache/super.c | 1 - drivers/md/dm-core.h | 1 - drivers/mtd/mtdswap.c | 2 +- drivers/mtd/nand/raw/sharpsl.c | 1 - drivers/nvdimm/blk.c | 1 - drivers/nvdimm/btt.c | 1 - drivers/nvdimm/btt_devs.c | 1 - drivers/nvdimm/bus.c | 1 - drivers/nvdimm/pfn_devs.c | 1 - drivers/s390/block/dasd_int.h | 1 - drivers/s390/block/scm_blk.c | 1 - drivers/s390/block/scm_blk.h | 1 - drivers/scsi/scsi_debug.c | 1 - drivers/scsi/scsicam.c | 1 - drivers/scsi/sd.c | 1 - drivers/scsi/sr.h | 1 - drivers/target/target_core_iblock.c | 1 - drivers/target/target_core_pscsi.c | 1 - fs/btrfs/check-integrity.c | 1 - fs/dax.c | 1 - fs/gfs2/sys.c | 2 +- fs/hfs/mdb.c | 2 +- fs/hfsplus/wrapper.c | 1 - fs/ksmbd/vfs.c | 1 - fs/nfs/blocklayout/rpc_pipefs.c | 1 - fs/nfsd/blocklayout.c | 1 - include/linux/blkdev.h | 273 +++++++++++++++++++++++++++++++++- include/linux/genhd.h | 287 ------------------------------------ include/linux/part_stat.h | 2 +- init/do_mounts.c | 1 - kernel/power/hibernate.c | 1 - 
kernel/power/swap.c | 1 - security/integrity/ima/ima_policy.c | 1 - 62 files changed, 282 insertions(+), 350 deletions(-) delete mode 100644 include/linux/genhd.h (limited to 'kernel') diff --git a/Documentation/block/capability.rst b/Documentation/block/capability.rst index 160a5148b915..2ae7f064736a 100644 --- a/Documentation/block/capability.rst +++ b/Documentation/block/capability.rst @@ -7,4 +7,4 @@ This file documents the sysfs file ``block//capability``. ``capability`` is a bitfield, printed in hexadecimal, indicating which capabilities a specific block device supports: -.. kernel-doc:: include/linux/genhd.h +.. kernel-doc:: include/linux/blkdev.h diff --git a/arch/m68k/atari/stdma.c b/arch/m68k/atari/stdma.c index ba65f942d0c7..ce6818eff75e 100644 --- a/arch/m68k/atari/stdma.c +++ b/arch/m68k/atari/stdma.c @@ -30,7 +30,6 @@ #include #include -#include #include #include #include diff --git a/arch/m68k/bvme6000/config.c b/arch/m68k/bvme6000/config.c index 0c6feafbbd11..0fe0f3e888fb 100644 --- a/arch/m68k/bvme6000/config.c +++ b/arch/m68k/bvme6000/config.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index 9c57b245dc12..267b02cc5655 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/m68k/kernel/setup_mm.c b/arch/m68k/kernel/setup_mm.c index 49e573b94326..ee268055bdce 100644 --- a/arch/m68k/kernel/setup_mm.c +++ b/arch/m68k/kernel/setup_mm.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/m68k/mvme147/config.c b/arch/m68k/mvme147/config.c index dfd6202fd403..db1430dc411f 100644 --- a/arch/m68k/mvme147/config.c +++ b/arch/m68k/mvme147/config.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/arch/m68k/mvme16x/config.c b/arch/m68k/mvme16x/config.c index 
b4422c2dfbbf..45a07ab3123a 100644 --- a/arch/m68k/mvme16x/config.c +++ b/arch/m68k/mvme16x/config.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 650f7e27989f..671debbae941 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/block/disk-events.c b/block/disk-events.c index 8d5496e7592a..aee25a7e1ab7 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -4,7 +4,7 @@ */ #include #include -#include +#include #include "blk.h" struct disk_events { diff --git a/block/genhd.c b/block/genhd.c index 6ae990ff0266..9589d1d59afa 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include diff --git a/block/holder.c b/block/holder.c index 27cddce1b446..8d750281a1cd 100644 --- a/block/holder.c +++ b/block/holder.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -#include +#include #include struct bd_holder_disk { diff --git a/block/partitions/check.h b/block/partitions/check.h index d5b28e309d64..4ffa2359b1a3 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -1,7 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include #include -#include #include "../blk.h" /* diff --git a/block/partitions/core.c b/block/partitions/core.c index c2a1635922b1..2ef8dfa1e5c8 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include diff --git a/block/partitions/efi.h b/block/partitions/efi.h index 8cc2b88d0aa8..84b9f36b9e47 100644 --- a/block/partitions/efi.h +++ b/block/partitions/efi.h @@ -13,7 +13,6 @@ #include #include -#include #include #include #include diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h index 8693704dcf5e..0a747a0c782d 100644 --- a/block/partitions/ldm.h +++ b/block/partitions/ldm.h @@ 
-14,7 +14,6 @@ #include #include -#include #include #include #include diff --git a/block/sed-opal.c b/block/sed-opal.c index daafadbb88ca..9700197000f2 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/base/class.c b/drivers/base/class.c index 7476f393df97..8feb85e186e3 100644 --- a/drivers/base/class.c +++ b/drivers/base/class.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include "base.h" diff --git a/drivers/base/core.c b/drivers/base/core.c index 7bb957b11861..3d6430eb0c6a 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index f41063ac1aee..db5a03a0618e 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 52484bcdedb9..8a91fcac6f82 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 6af111f568e4..cc11f89a0928 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index f27d5b0f9a0b..acb1ad3c0603 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index e6005c232328..cba956881d55 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ 
b/drivers/block/mtip32xx/mtip32xx.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 88f4206310e4..6816beb45352 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -15,7 +15,6 @@ #include #include #include -#include /* Offset of Subsystem Device ID in pci confoguration space */ #define PCI_SUBSYSTEM_DEVICEID 0x2E diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c index 4db98e0e76f0..feaa76c5a342 100644 --- a/drivers/block/rnbd/rnbd-srv-sysfs.c +++ b/drivers/block/rnbd/rnbd-srv-sysfs.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 146d85d80e0e..dd0a1a6fed29 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index cb253d80d72b..342dbcb3f220 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index faead41709bc..8e78b37d0f6a 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/char/random.c b/drivers/char/random.c index 68613f0b6887..f206c87c6202 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -330,7 +330,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 140f35dc0c45..c31a62b963f0 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -18,7 +18,6 @@ #include #include #include -#include #include 
#include #include diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index b855fef4f38a..72d18c3fbf1f 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -11,7 +11,6 @@ #include #include -#include #include #include diff --git a/drivers/mtd/mtdswap.c b/drivers/mtd/mtdswap.c index e86b04bc1d6b..dc7f1532a37f 100644 --- a/drivers/mtd/mtdswap.c +++ b/drivers/mtd/mtdswap.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/raw/sharpsl.c b/drivers/mtd/nand/raw/sharpsl.c index 5612ee628425..52ce5162538a 100644 --- a/drivers/mtd/nand/raw/sharpsl.c +++ b/drivers/mtd/nand/raw/sharpsl.c @@ -6,7 +6,6 @@ * Based on Sharp's NAND driver sharp_sl.c */ -#include #include #include #include diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 228c33b8d1d6..c1db43524d75 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -6,7 +6,6 @@ #include #include -#include #include #include #include diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index da3f007a1211..cbd994f7f1fe 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c index 8b52e5144f08..e5a58520d398 100644 --- a/drivers/nvdimm/btt_devs.c +++ b/drivers/nvdimm/btt_devs.c @@ -4,7 +4,6 @@ */ #include #include -#include #include #include #include diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 9dc7f3edd42b..5bbe31b08581 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 58eda16f5c53..c31e184bfa45 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/s390/block/dasd_int.h 
b/drivers/s390/block/dasd_int.h index 8b458010f88a..3b7af00a7825 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -47,7 +47,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index 61ecdcb2cc6a..2a9c0ddcade5 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/s390/block/scm_blk.h b/drivers/s390/block/scm_blk.h index a05a4297cfae..af82b3214774 100644 --- a/drivers/s390/block/scm_blk.h +++ b/drivers/s390/block/scm_blk.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 2104973a35cd..911cc72dd7ac 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/scsicam.c b/drivers/scsi/scsicam.c index 0ffdb8f2995f..acdc0aceca5e 100644 --- a/drivers/scsi/scsicam.c +++ b/drivers/scsi/scsicam.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 62eb9921cc94..2d648d27bfd7 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/sr.h b/drivers/scsi/sr.h index 339c624e04d8..1609f02ed29a 100644 --- a/drivers/scsi/sr.h +++ b/drivers/scsi/sr.h @@ -18,7 +18,6 @@ #ifndef _SR_H #define _SR_H -#include #include #include diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index bf8ae4825a06..6045678365a5 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git 
a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 807d06ecadee..0fae71ac5cc8 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 7e9f90fa0388..abac86a75840 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -78,7 +78,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/dax.c b/fs/dax.c index cd03485867a7..ab0978739eaa 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index a6002b2d146d..d87ea98cf535 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include "gfs2.h" #include "incore.h" diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index 5beb82652435..8082eb01127c 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -9,7 +9,7 @@ */ #include -#include +#include #include #include diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 51ae6f1eb4a5..4688cc7b3692 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include "hfsplus_fs.h" diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 19d36393974c..9cebb6ba555b 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c index ef9db135c649..6c977288cc28 100644 --- a/fs/nfs/blocklayout/rpc_pipefs.c +++ b/fs/nfs/blocklayout/rpc_pipefs.c @@ -27,7 +27,6 @@ */ #include -#include #include #include "blocklayout.h" diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index e5c0982a381d..b6d01d51a746 100644 --- a/fs/nfsd/blocklayout.c +++ 
b/fs/nfsd/blocklayout.c @@ -4,7 +4,6 @@ */ #include #include -#include #include #include diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f35aea98bc35..99a4384bb8a5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1,9 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * Portions Copyright (C) 1992 Drew Eckhardt + */ #ifndef _LINUX_BLKDEV_H #define _LINUX_BLKDEV_H -#include -#include +#include +#include +#include #include #include #include @@ -12,11 +16,15 @@ #include #include #include +#include #include #include #include +#include #include #include +#include +#include struct module; struct request_queue; @@ -33,6 +41,10 @@ struct blk_queue_stats; struct blk_stat_callback; struct blk_crypto_profile; +extern const struct device_type disk_type; +extern struct device_type part_type; +extern struct class block_class; + /* Must be consistent with blk_mq_poll_stats_bkt() */ #define BLK_MQ_POLL_STATS_BKTS 16 @@ -45,6 +57,144 @@ struct blk_crypto_profile; */ #define BLKCG_MAX_POLS 6 +#define DISK_MAX_PARTS 256 +#define DISK_NAME_LEN 32 + +#define PARTITION_META_INFO_VOLNAMELTH 64 +/* + * Enough for the string representation of any kind of UUID plus NULL. + * EFI UUID is 36 characters. MSDOS UUID is 11 characters. + */ +#define PARTITION_META_INFO_UUIDLTH (UUID_STRING_LEN + 1) + +struct partition_meta_info { + char uuid[PARTITION_META_INFO_UUIDLTH]; + u8 volname[PARTITION_META_INFO_VOLNAMELTH]; +}; + +/** + * DOC: genhd capability flags + * + * ``GENHD_FL_REMOVABLE``: indicates that the block device gives access to + * removable media. When set, the device remains present even when media is not + * inserted. Shall not be set for devices which are removed entirely when the + * media is removed. + * + * ``GENHD_FL_HIDDEN``: the block device is hidden; it doesn't produce events, + * doesn't appear in sysfs, and can't be opened from userspace or using + * blkdev_get*. Used for the underlying components of multipath devices. 
+ * + * ``GENHD_FL_NO_PART``: partition support is disabled. The kernel will not + * scan for partitions from add_disk, and users can't add partitions manually. + * + */ +enum { + GENHD_FL_REMOVABLE = 1 << 0, + GENHD_FL_HIDDEN = 1 << 1, + GENHD_FL_NO_PART = 1 << 2, +}; + +enum { + DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ + DISK_EVENT_EJECT_REQUEST = 1 << 1, /* eject requested */ +}; + +enum { + /* Poll even if events_poll_msecs is unset */ + DISK_EVENT_FLAG_POLL = 1 << 0, + /* Forward events to udev */ + DISK_EVENT_FLAG_UEVENT = 1 << 1, + /* Block event polling when open for exclusive write */ + DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE = 1 << 2, +}; + +struct disk_events; +struct badblocks; + +struct blk_integrity { + const struct blk_integrity_profile *profile; + unsigned char flags; + unsigned char tuple_size; + unsigned char interval_exp; + unsigned char tag_size; +}; + +struct gendisk { + /* + * major/first_minor/minors should not be set by any new driver, the + * block core will take care of allocating them automatically. 
+ */ + int major; + int first_minor; + int minors; + + char disk_name[DISK_NAME_LEN]; /* name of major driver */ + + unsigned short events; /* supported events */ + unsigned short event_flags; /* flags related to event processing */ + + struct xarray part_tbl; + struct block_device *part0; + + const struct block_device_operations *fops; + struct request_queue *queue; + void *private_data; + + int flags; + unsigned long state; +#define GD_NEED_PART_SCAN 0 +#define GD_READ_ONLY 1 +#define GD_DEAD 2 +#define GD_NATIVE_CAPACITY 3 + + struct mutex open_mutex; /* open/close mutex */ + unsigned open_partitions; /* number of open partitions */ + + struct backing_dev_info *bdi; + struct kobject *slave_dir; +#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED + struct list_head slave_bdevs; +#endif + struct timer_rand_state *random; + atomic_t sync_io; /* RAID */ + struct disk_events *ev; +#ifdef CONFIG_BLK_DEV_INTEGRITY + struct kobject integrity_kobj; +#endif /* CONFIG_BLK_DEV_INTEGRITY */ +#if IS_ENABLED(CONFIG_CDROM) + struct cdrom_device_info *cdi; +#endif + int node_id; + struct badblocks *bb; + struct lockdep_map lockdep_map; + u64 diskseq; +}; + +static inline bool disk_live(struct gendisk *disk) +{ + return !inode_unhashed(disk->part0->bd_inode); +} + +/* + * The gendisk is refcounted by the part0 block_device, and the bd_device + * therein is also used for device model presentation in sysfs. 
+ */ +#define dev_to_disk(device) \ + (dev_to_bdev(device)->bd_disk) +#define disk_to_dev(disk) \ + (&((disk)->part0->bd_device)) + +#if IS_REACHABLE(CONFIG_CDROM) +#define disk_to_cdi(disk) ((disk)->cdi) +#else +#define disk_to_cdi(disk) NULL +#endif + +static inline dev_t disk_devt(struct gendisk *disk) +{ + return MKDEV(disk->major, disk->first_minor); +} + static inline int blk_validate_block_size(unsigned long bsize) { if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) @@ -596,6 +746,118 @@ static inline unsigned int blk_queue_depth(struct request_queue *q) #define for_each_bio(_bio) \ for (; _bio; _bio = _bio->bi_next) +int __must_check device_add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups); +static inline int __must_check add_disk(struct gendisk *disk) +{ + return device_add_disk(NULL, disk, NULL); +} +void del_gendisk(struct gendisk *gp); +void invalidate_disk(struct gendisk *disk); +void set_disk_ro(struct gendisk *disk, bool read_only); +void disk_uevent(struct gendisk *disk, enum kobject_action action); + +static inline int get_disk_ro(struct gendisk *disk) +{ + return disk->part0->bd_read_only || + test_bit(GD_READ_ONLY, &disk->state); +} + +static inline int bdev_read_only(struct block_device *bdev) +{ + return bdev->bd_read_only || get_disk_ro(bdev->bd_disk); +} + +bool set_capacity_and_notify(struct gendisk *disk, sector_t size); +bool disk_force_media_change(struct gendisk *disk, unsigned int events); + +void add_disk_randomness(struct gendisk *disk) __latent_entropy; +void rand_initialize_disk(struct gendisk *disk); + +static inline sector_t get_start_sect(struct block_device *bdev) +{ + return bdev->bd_start_sect; +} + +static inline sector_t bdev_nr_sectors(struct block_device *bdev) +{ + return bdev->bd_nr_sectors; +} + +static inline loff_t bdev_nr_bytes(struct block_device *bdev) +{ + return (loff_t)bdev_nr_sectors(bdev) << SECTOR_SHIFT; +} + +static inline sector_t get_capacity(struct 
gendisk *disk) +{ + return bdev_nr_sectors(disk->part0); +} + +static inline u64 sb_bdev_nr_blocks(struct super_block *sb) +{ + return bdev_nr_sectors(sb->s_bdev) >> + (sb->s_blocksize_bits - SECTOR_SHIFT); +} + +int bdev_disk_changed(struct gendisk *disk, bool invalidate); + +struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, + struct lock_class_key *lkclass); +void put_disk(struct gendisk *disk); +struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass); + +/** + * blk_alloc_disk - allocate a gendisk structure + * @node_id: numa node to allocate on + * + * Allocate and pre-initialize a gendisk structure for use with BIO based + * drivers. + * + * Context: can sleep + */ +#define blk_alloc_disk(node_id) \ +({ \ + static struct lock_class_key __key; \ + \ + __blk_alloc_disk(node_id, &__key); \ +}) +void blk_cleanup_disk(struct gendisk *disk); + +int __register_blkdev(unsigned int major, const char *name, + void (*probe)(dev_t devt)); +#define register_blkdev(major, name) \ + __register_blkdev(major, name, NULL) +void unregister_blkdev(unsigned int major, const char *name); + +bool bdev_check_media_change(struct block_device *bdev); +int __invalidate_device(struct block_device *bdev, bool kill_dirty); +void set_capacity(struct gendisk *disk, sector_t size); + +#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED +int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); +void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk); +int bd_register_pending_holders(struct gendisk *disk); +#else +static inline int bd_link_disk_holder(struct block_device *bdev, + struct gendisk *disk) +{ + return 0; +} +static inline void bd_unlink_disk_holder(struct block_device *bdev, + struct gendisk *disk) +{ +} +static inline int bd_register_pending_holders(struct gendisk *disk) +{ + return 0; +} +#endif /* CONFIG_BLOCK_HOLDER_DEPRECATED */ + +dev_t part_devt(struct gendisk *disk, u8 partno); +void inc_diskseq(struct gendisk 
*disk); +dev_t blk_lookup_devt(const char *name, int partno); +void blk_request_module(dev_t devt); extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); @@ -1311,6 +1573,7 @@ void invalidate_bdev(struct block_device *bdev); int sync_blockdev(struct block_device *bdev); int sync_blockdev_nowait(struct block_device *bdev); void sync_bdevs(bool wait); +void printk_all_partitions(void); #else static inline void invalidate_bdev(struct block_device *bdev) { @@ -1326,7 +1589,11 @@ static inline int sync_blockdev_nowait(struct block_device *bdev) static inline void sync_bdevs(bool wait) { } -#endif +static inline void printk_all_partitions(void) +{ +} +#endif /* CONFIG_BLOCK */ + int fsync_bdev(struct block_device *bdev); int freeze_bdev(struct block_device *bdev); diff --git a/include/linux/genhd.h b/include/linux/genhd.h deleted file mode 100644 index aa4bd985dbe5..000000000000 --- a/include/linux/genhd.h +++ /dev/null @@ -1,287 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_GENHD_H -#define _LINUX_GENHD_H - -/* - * genhd.h Copyright (C) 1992 Drew Eckhardt - * Generic hard disk header file by - * Drew Eckhardt - * - * - */ - -#include -#include -#include -#include -#include -#include - -extern const struct device_type disk_type; -extern struct device_type part_type; -extern struct class block_class; - -#define DISK_MAX_PARTS 256 -#define DISK_NAME_LEN 32 - -#define PARTITION_META_INFO_VOLNAMELTH 64 -/* - * Enough for the string representation of any kind of UUID plus NULL. - * EFI UUID is 36 characters. MSDOS UUID is 11 characters. - */ -#define PARTITION_META_INFO_UUIDLTH (UUID_STRING_LEN + 1) - -struct partition_meta_info { - char uuid[PARTITION_META_INFO_UUIDLTH]; - u8 volname[PARTITION_META_INFO_VOLNAMELTH]; -}; - -/** - * DOC: genhd capability flags - * - * ``GENHD_FL_REMOVABLE``: indicates that the block device gives access to - * removable media. 
When set, the device remains present even when media is not - * inserted. Shall not be set for devices which are removed entirely when the - * media is removed. - * - * ``GENHD_FL_HIDDEN``: the block device is hidden; it doesn't produce events, - * doesn't appear in sysfs, and can't be opened from userspace or using - * blkdev_get*. Used for the underlying components of multipath devices. - * - * ``GENHD_FL_NO_PART``: partition support is disabled. The kernel will not - * scan for partitions from add_disk, and users can't add partitions manually. - * - */ -enum { - GENHD_FL_REMOVABLE = 1 << 0, - GENHD_FL_HIDDEN = 1 << 1, - GENHD_FL_NO_PART = 1 << 2, -}; - -enum { - DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ - DISK_EVENT_EJECT_REQUEST = 1 << 1, /* eject requested */ -}; - -enum { - /* Poll even if events_poll_msecs is unset */ - DISK_EVENT_FLAG_POLL = 1 << 0, - /* Forward events to udev */ - DISK_EVENT_FLAG_UEVENT = 1 << 1, - /* Block event polling when open for exclusive write */ - DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE = 1 << 2, -}; - -struct disk_events; -struct badblocks; - -struct blk_integrity { - const struct blk_integrity_profile *profile; - unsigned char flags; - unsigned char tuple_size; - unsigned char interval_exp; - unsigned char tag_size; -}; - -struct gendisk { - /* - * major/first_minor/minors should not be set by any new driver, the - * block core will take care of allocating them automatically. 
- */ - int major; - int first_minor; - int minors; - - char disk_name[DISK_NAME_LEN]; /* name of major driver */ - - unsigned short events; /* supported events */ - unsigned short event_flags; /* flags related to event processing */ - - struct xarray part_tbl; - struct block_device *part0; - - const struct block_device_operations *fops; - struct request_queue *queue; - void *private_data; - - int flags; - unsigned long state; -#define GD_NEED_PART_SCAN 0 -#define GD_READ_ONLY 1 -#define GD_DEAD 2 -#define GD_NATIVE_CAPACITY 3 - - struct mutex open_mutex; /* open/close mutex */ - unsigned open_partitions; /* number of open partitions */ - - struct backing_dev_info *bdi; - struct kobject *slave_dir; -#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED - struct list_head slave_bdevs; -#endif - struct timer_rand_state *random; - atomic_t sync_io; /* RAID */ - struct disk_events *ev; -#ifdef CONFIG_BLK_DEV_INTEGRITY - struct kobject integrity_kobj; -#endif /* CONFIG_BLK_DEV_INTEGRITY */ -#if IS_ENABLED(CONFIG_CDROM) - struct cdrom_device_info *cdi; -#endif - int node_id; - struct badblocks *bb; - struct lockdep_map lockdep_map; - u64 diskseq; -}; - -static inline bool disk_live(struct gendisk *disk) -{ - return !inode_unhashed(disk->part0->bd_inode); -} - -/* - * The gendisk is refcounted by the part0 block_device, and the bd_device - * therein is also used for device model presentation in sysfs. 
- */ -#define dev_to_disk(device) \ - (dev_to_bdev(device)->bd_disk) -#define disk_to_dev(disk) \ - (&((disk)->part0->bd_device)) - -#if IS_REACHABLE(CONFIG_CDROM) -#define disk_to_cdi(disk) ((disk)->cdi) -#else -#define disk_to_cdi(disk) NULL -#endif - -static inline dev_t disk_devt(struct gendisk *disk) -{ - return MKDEV(disk->major, disk->first_minor); -} - -void disk_uevent(struct gendisk *disk, enum kobject_action action); - -/* block/genhd.c */ -int __must_check device_add_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups); -static inline int __must_check add_disk(struct gendisk *disk) -{ - return device_add_disk(NULL, disk, NULL); -} -extern void del_gendisk(struct gendisk *gp); - -void invalidate_disk(struct gendisk *disk); - -void set_disk_ro(struct gendisk *disk, bool read_only); - -static inline int get_disk_ro(struct gendisk *disk) -{ - return disk->part0->bd_read_only || - test_bit(GD_READ_ONLY, &disk->state); -} - -static inline int bdev_read_only(struct block_device *bdev) -{ - return bdev->bd_read_only || get_disk_ro(bdev->bd_disk); -} - -bool set_capacity_and_notify(struct gendisk *disk, sector_t size); -bool disk_force_media_change(struct gendisk *disk, unsigned int events); - -/* drivers/char/random.c */ -extern void add_disk_randomness(struct gendisk *disk) __latent_entropy; -extern void rand_initialize_disk(struct gendisk *disk); - -static inline sector_t get_start_sect(struct block_device *bdev) -{ - return bdev->bd_start_sect; -} - -static inline sector_t bdev_nr_sectors(struct block_device *bdev) -{ - return bdev->bd_nr_sectors; -} - -static inline loff_t bdev_nr_bytes(struct block_device *bdev) -{ - return (loff_t)bdev_nr_sectors(bdev) << SECTOR_SHIFT; -} - -static inline sector_t get_capacity(struct gendisk *disk) -{ - return bdev_nr_sectors(disk->part0); -} - -static inline u64 sb_bdev_nr_blocks(struct super_block *sb) -{ - return bdev_nr_sectors(sb->s_bdev) >> - (sb->s_blocksize_bits - 
SECTOR_SHIFT); -} - -int bdev_disk_changed(struct gendisk *disk, bool invalidate); - -struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, - struct lock_class_key *lkclass); -extern void put_disk(struct gendisk *disk); -struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass); - -/** - * blk_alloc_disk - allocate a gendisk structure - * @node_id: numa node to allocate on - * - * Allocate and pre-initialize a gendisk structure for use with BIO based - * drivers. - * - * Context: can sleep - */ -#define blk_alloc_disk(node_id) \ -({ \ - static struct lock_class_key __key; \ - \ - __blk_alloc_disk(node_id, &__key); \ -}) -void blk_cleanup_disk(struct gendisk *disk); - -int __register_blkdev(unsigned int major, const char *name, - void (*probe)(dev_t devt)); -#define register_blkdev(major, name) \ - __register_blkdev(major, name, NULL) -void unregister_blkdev(unsigned int major, const char *name); - -bool bdev_check_media_change(struct block_device *bdev); -int __invalidate_device(struct block_device *bdev, bool kill_dirty); -void set_capacity(struct gendisk *disk, sector_t size); - -#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED -int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); -void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk); -int bd_register_pending_holders(struct gendisk *disk); -#else -static inline int bd_link_disk_holder(struct block_device *bdev, - struct gendisk *disk) -{ - return 0; -} -static inline void bd_unlink_disk_holder(struct block_device *bdev, - struct gendisk *disk) -{ -} -static inline int bd_register_pending_holders(struct gendisk *disk) -{ - return 0; -} -#endif /* CONFIG_BLOCK_HOLDER_DEPRECATED */ - -dev_t part_devt(struct gendisk *disk, u8 partno); -void inc_diskseq(struct gendisk *disk); -dev_t blk_lookup_devt(const char *name, int partno); -void blk_request_module(dev_t devt); -#ifdef CONFIG_BLOCK -void printk_all_partitions(void); -#else /* CONFIG_BLOCK */ 
-static inline void printk_all_partitions(void) -{ -} -#endif /* CONFIG_BLOCK */ - -#endif /* _LINUX_GENHD_H */ diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index 6f7949b2fd8d..abeba356bc3f 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -2,7 +2,7 @@ #ifndef _LINUX_PART_STAT_H #define _LINUX_PART_STAT_H -#include +#include #include struct disk_stats { diff --git a/init/do_mounts.c b/init/do_mounts.c index 762b534978d9..7058e14ad5f7 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e6af502c2fd7..a94044197c4a 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/power/swap.c b/kernel/power/swap.c index ad10359030a4..f1bd03129575 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 320ca80aacab..02882526ba9a 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3 From 07888c665b405b1cd3577ddebfeb74f4717a84c4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:11:05 +0100 Subject: block: pass a block_device and opf to bio_alloc Pass the block_device and operation that we plan to use this bio for to bio_alloc to optimize the assignment. NULL/0 can be passed, both for the passthrough case on a raw request_queue and to temporarily avoid refactoring some nasty code. Also move the gfp_mask argument after the nr_vecs argument for a much more logical calling convention matching what most of the kernel does. 
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220124091107.642561-18-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 5 +---- block/fops.c | 4 +--- drivers/block/drbd/drbd_receiver.c | 10 ++++------ drivers/block/rnbd/rnbd-srv.c | 5 ++--- drivers/block/xen-blkback/blkback.c | 11 +++++------ drivers/block/zram/zram_drv.c | 11 ++++------- drivers/md/dm-log-writes.c | 21 ++++++++------------- drivers/md/dm-thin.c | 9 ++++----- drivers/md/dm-zoned-metadata.c | 15 ++++++--------- drivers/nvdimm/nd_virtio.c | 6 +++--- drivers/nvme/target/io-cmd-bdev.c | 12 ++++++------ drivers/nvme/target/passthru.c | 5 +++-- drivers/nvme/target/zns.c | 6 +++--- drivers/scsi/ufs/ufshpb.c | 4 ++-- drivers/target/target_core_iblock.c | 5 ++--- fs/btrfs/disk-io.c | 6 +++--- fs/buffer.c | 14 ++++++-------- fs/crypto/bio.c | 13 +++++++------ fs/direct-io.c | 5 +---- fs/erofs/zdata.c | 5 ++--- fs/ext4/page-io.c | 3 +-- fs/ext4/readpage.c | 8 ++++---- fs/gfs2/lops.c | 8 +++----- fs/gfs2/meta_io.c | 4 +--- fs/gfs2/ops_fstype.c | 4 +--- fs/hfsplus/wrapper.c | 4 +--- fs/iomap/buffered-io.c | 16 ++++++++-------- fs/iomap/direct-io.c | 8 ++------ fs/jfs/jfs_logmgr.c | 11 ++--------- fs/jfs/jfs_metapage.c | 9 +++------ fs/mpage.c | 7 +++---- fs/nfs/blocklayout/blocklayout.c | 4 +--- fs/nilfs2/segbuf.c | 4 ++-- fs/ntfs3/fsntfs.c | 8 ++------ fs/ocfs2/cluster/heartbeat.c | 4 +--- fs/squashfs/block.c | 11 ++++++----- fs/xfs/xfs_bio_io.c | 10 ++++------ fs/xfs/xfs_buf.c | 4 +--- fs/zonefs/super.c | 5 ++--- include/linux/bio.h | 5 +++-- kernel/power/swap.c | 5 ++--- mm/page_io.c | 10 ++++------ 42 files changed, 130 insertions(+), 194 deletions(-) (limited to 'kernel') diff --git a/block/bio.c b/block/bio.c index 6c3efb0fd12b..b73c9babd583 100644 --- a/block/bio.c +++ b/block/bio.c @@ -347,10 +347,7 @@ EXPORT_SYMBOL(bio_chain); struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, unsigned int nr_pages, unsigned int opf, gfp_t gfp) 
{ - struct bio *new = bio_alloc(gfp, nr_pages); - - bio_set_dev(new, bdev); - new->bi_opf = opf; + struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp); if (bio) { bio_chain(bio, new); diff --git a/block/fops.c b/block/fops.c index 3a62b8b91275..c68359684773 100644 --- a/block/fops.c +++ b/block/fops.c @@ -256,9 +256,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, } atomic_inc(&dio->ref); submit_bio(bio); - bio = bio_alloc(GFP_KERNEL, nr_pages); - bio_set_dev(bio, bdev); - bio->bi_opf = opf; + bio = bio_alloc(bdev, nr_pages, opf, GFP_KERNEL); } blk_finish_plug(&plug); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index fb59b263deee..04e3ec12d8b4 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1279,7 +1279,8 @@ static void one_flush_endio(struct bio *bio) static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx) { - struct bio *bio = bio_alloc(GFP_NOIO, 0); + struct bio *bio = bio_alloc(device->ldev->backing_bdev, 0, + REQ_OP_FLUSH | REQ_PREFLUSH, GFP_NOIO); struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO); if (!octx) { @@ -1297,10 +1298,8 @@ static void submit_one_flush(struct drbd_device *device, struct issue_flush_cont octx->device = device; octx->ctx = ctx; - bio_set_dev(bio, device->ldev->backing_bdev); bio->bi_private = octx; bio->bi_end_io = one_flush_endio; - bio->bi_opf = REQ_OP_FLUSH | REQ_PREFLUSH; device->flush_jif = jiffies; set_bit(FLUSH_PENDING, &device->flags); @@ -1685,11 +1684,10 @@ int drbd_submit_peer_request(struct drbd_device *device, * generated bio, but a bio allocated on behalf of the peer. 
*/ next_bio: - bio = bio_alloc(GFP_NOIO, nr_pages); + bio = bio_alloc(device->ldev->backing_bdev, nr_pages, op | op_flags, + GFP_NOIO); /* > peer_req->i.sector, unless this is the first bio */ bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, device->ldev->backing_bdev); - bio_set_op_attrs(bio, op, op_flags); bio->bi_private = peer_req; bio->bi_end_io = drbd_peer_request_endio; diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index ff9b38997607..132e950685d5 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -149,7 +149,8 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, priv->sess_dev = sess_dev; priv->id = id; - bio = bio_alloc(GFP_KERNEL, 1); + bio = bio_alloc(sess_dev->rnbd_dev->bdev, 1, + rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL); if (bio_add_page(bio, virt_to_page(data), datalen, offset_in_page(data)) != datalen) { rnbd_srv_err(sess_dev, "Failed to map data to bio\n"); @@ -159,13 +160,11 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, bio->bi_end_io = rnbd_dev_bi_end_io; bio->bi_private = priv; - bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw)); bio->bi_iter.bi_sector = le64_to_cpu(msg->sector); bio->bi_iter.bi_size = le32_to_cpu(msg->bi_size); prio = srv_sess->ver < RNBD_PROTO_VER_MAJOR || usrlen < sizeof(*msg) ? 
0 : le16_to_cpu(msg->prio); bio_set_prio(bio, prio); - bio_set_dev(bio, sess_dev->rnbd_dev->bdev); submit_bio(bio); diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 6bb2ad769206..d1e26461a64e 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -1326,13 +1326,13 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, pages[i]->page, seg[i].nsec << 9, seg[i].offset) == 0)) { - bio = bio_alloc(GFP_KERNEL, bio_max_segs(nseg - i)); + bio = bio_alloc(preq.bdev, bio_max_segs(nseg - i), + operation | operation_flags, + GFP_KERNEL); biolist[nbio++] = bio; - bio_set_dev(bio, preq.bdev); bio->bi_private = pending_req; bio->bi_end_io = end_block_io_op; bio->bi_iter.bi_sector = preq.sector_number; - bio_set_op_attrs(bio, operation, operation_flags); } preq.sector_number += seg[i].nsec; @@ -1342,12 +1342,11 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, if (!bio) { BUG_ON(operation_flags != REQ_PREFLUSH); - bio = bio_alloc(GFP_KERNEL, 0); + bio = bio_alloc(preq.bdev, 0, operation | operation_flags, + GFP_KERNEL); biolist[nbio++] = bio; - bio_set_dev(bio, preq.bdev); bio->bi_private = pending_req; bio->bi_end_io = end_block_io_op; - bio_set_op_attrs(bio, operation, operation_flags); } atomic_set(&pending_req->pendcnt, nbio); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 342dbcb3f220..f3fe0ea8aa80 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -616,24 +616,21 @@ static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec, { struct bio *bio; - bio = bio_alloc(GFP_NOIO, 1); + bio = bio_alloc(zram->bdev, 1, parent ? 
parent->bi_opf : REQ_OP_READ, + GFP_NOIO); if (!bio) return -ENOMEM; bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9); - bio_set_dev(bio, zram->bdev); if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset)) { bio_put(bio); return -EIO; } - if (!parent) { - bio->bi_opf = REQ_OP_READ; + if (!parent) bio->bi_end_io = zram_page_end_io; - } else { - bio->bi_opf = parent->bi_opf; + else bio_chain(bio, parent); - } submit_bio(bio); return 1; diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 25f5e8d2d417..c9d036d6bb2e 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -217,14 +217,12 @@ static int write_metadata(struct log_writes_c *lc, void *entry, void *ptr; size_t ret; - bio = bio_alloc(GFP_KERNEL, 1); + bio = bio_alloc(lc->logdev->bdev, 1, REQ_OP_WRITE, GFP_KERNEL); bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, lc->logdev->bdev); bio->bi_end_io = (sector == WRITE_LOG_SUPER_SECTOR) ? log_end_super : log_end_io; bio->bi_private = lc; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); page = alloc_page(GFP_KERNEL); if (!page) { @@ -271,13 +269,12 @@ static int write_inline_data(struct log_writes_c *lc, void *entry, atomic_inc(&lc->io_blocks); - bio = bio_alloc(GFP_KERNEL, bio_pages); + bio = bio_alloc(lc->logdev->bdev, bio_pages, REQ_OP_WRITE, + GFP_KERNEL); bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, lc->logdev->bdev); bio->bi_end_io = log_end_io; bio->bi_private = lc; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); for (i = 0; i < bio_pages; i++) { pg_datalen = min_t(int, datalen, PAGE_SIZE); @@ -353,13 +350,12 @@ static int log_one_block(struct log_writes_c *lc, goto out; atomic_inc(&lc->io_blocks); - bio = bio_alloc(GFP_KERNEL, bio_max_segs(block->vec_cnt)); + bio = bio_alloc(lc->logdev->bdev, bio_max_segs(block->vec_cnt), + REQ_OP_WRITE, GFP_KERNEL); bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, lc->logdev->bdev); 
bio->bi_end_io = log_end_io; bio->bi_private = lc; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); for (i = 0; i < block->vec_cnt; i++) { /* @@ -371,14 +367,13 @@ static int log_one_block(struct log_writes_c *lc, if (ret != block->vecs[i].bv_len) { atomic_inc(&lc->io_blocks); submit_bio(bio); - bio = bio_alloc(GFP_KERNEL, - bio_max_segs(block->vec_cnt - i)); + bio = bio_alloc(lc->logdev->bdev, + bio_max_segs(block->vec_cnt - i), + REQ_OP_WRITE, GFP_KERNEL); bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, lc->logdev->bdev); bio->bi_end_io = log_end_io; bio->bi_private = lc; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); ret = bio_add_page(bio, block->vecs[i].bv_page, block->vecs[i].bv_len, 0); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 411a3f56ed90..f4234d615aa1 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1177,13 +1177,12 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m) return; } - discard_parent = bio_alloc(GFP_NOIO, 1); + discard_parent = bio_alloc(NULL, 1, 0, GFP_NOIO); discard_parent->bi_end_io = passdown_endio; discard_parent->bi_private = m; - - if (m->maybe_shared) - passdown_double_checking_shared_status(m, discard_parent); - else { + if (m->maybe_shared) + passdown_double_checking_shared_status(m, discard_parent); + else { struct discard_op op; begin_discard(&op, tc, discard_parent); diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 5718b83cc718..e5f1eb27ce2e 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -550,7 +550,8 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd, if (!mblk) return ERR_PTR(-ENOMEM); - bio = bio_alloc(GFP_NOIO, 1); + bio = bio_alloc(dev->bdev, 1, REQ_OP_READ | REQ_META | REQ_PRIO, + GFP_NOIO); spin_lock(&zmd->mblk_lock); @@ -574,10 +575,8 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd, /* Submit read BIO */ 
bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio_set_dev(bio, dev->bdev); bio->bi_private = mblk; bio->bi_end_io = dmz_mblock_bio_end_io; - bio_set_op_attrs(bio, REQ_OP_READ, REQ_META | REQ_PRIO); bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0); submit_bio(bio); @@ -721,15 +720,14 @@ static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, if (dmz_bdev_is_dying(dev)) return -EIO; - bio = bio_alloc(GFP_NOIO, 1); + bio = bio_alloc(dev->bdev, 1, REQ_OP_WRITE | REQ_META | REQ_PRIO, + GFP_NOIO); set_bit(DMZ_META_WRITING, &mblk->state); bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio_set_dev(bio, dev->bdev); bio->bi_private = mblk; bio->bi_end_io = dmz_mblock_bio_end_io; - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO); bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0); submit_bio(bio); @@ -751,10 +749,9 @@ static int dmz_rdwr_block(struct dmz_dev *dev, int op, if (dmz_bdev_is_dying(dev)) return -EIO; - bio = bio_alloc(GFP_NOIO, 1); + bio = bio_alloc(dev->bdev, 1, op | REQ_SYNC | REQ_META | REQ_PRIO, + GFP_NOIO); bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio_set_dev(bio, dev->bdev); - bio_set_op_attrs(bio, op, REQ_SYNC | REQ_META | REQ_PRIO); bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0); ret = submit_bio_wait(bio); bio_put(bio); diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c index 10351d5b49fa..c6a648fd8744 100644 --- a/drivers/nvdimm/nd_virtio.c +++ b/drivers/nvdimm/nd_virtio.c @@ -105,12 +105,12 @@ int async_pmem_flush(struct nd_region *nd_region, struct bio *bio) * parent bio. Otherwise directly call nd_region flush. 
*/ if (bio && bio->bi_iter.bi_sector != -1) { - struct bio *child = bio_alloc(GFP_ATOMIC, 0); + struct bio *child = bio_alloc(bio->bi_bdev, 0, REQ_PREFLUSH, + GFP_ATOMIC); if (!child) return -ENOMEM; - bio_copy_dev(child, bio); - child->bi_opf = REQ_PREFLUSH; + bio_clone_blkg_association(child, bio); child->bi_iter.bi_sector = -1; bio_chain(child, bio); submit_bio(child); diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 70ca9dfc1771..e092af3abc71 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -268,14 +268,15 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) if (nvmet_use_inline_bvec(req)) { bio = &req->b.inline_bio; bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); + bio_set_dev(bio, req->ns->bdev); + bio->bi_opf = op; } else { - bio = bio_alloc(GFP_KERNEL, bio_max_segs(sg_cnt)); + bio = bio_alloc(req->ns->bdev, bio_max_segs(sg_cnt), op, + GFP_KERNEL); } - bio_set_dev(bio, req->ns->bdev); bio->bi_iter.bi_sector = sector; bio->bi_private = req; bio->bi_end_io = nvmet_bio_done; - bio->bi_opf = op; blk_start_plug(&plug); if (req->metadata_len) @@ -296,10 +297,9 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) } } - bio = bio_alloc(GFP_KERNEL, bio_max_segs(sg_cnt)); - bio_set_dev(bio, req->ns->bdev); + bio = bio_alloc(req->ns->bdev, bio_max_segs(sg_cnt), + op, GFP_KERNEL); bio->bi_iter.bi_sector = sector; - bio->bi_opf = op; bio_chain(bio, prev); submit_bio(prev); diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 9e5b89ae29df..38f72968c3fd 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -207,11 +207,12 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) if (nvmet_use_inline_bvec(req)) { bio = &req->p.inline_bio; bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); + bio->bi_opf = req_op(rq); } else { - bio = bio_alloc(GFP_KERNEL, 
bio_max_segs(req->sg_cnt)); + bio = bio_alloc(NULL, bio_max_segs(req->sg_cnt), req_op(rq), + GFP_KERNEL); bio->bi_end_io = bio_put; } - bio->bi_opf = req_op(rq); for_each_sg(req->sg, sg, req->sg_cnt, i) { if (bio_add_pc_page(rq->q, bio, sg_page(sg), sg->length, diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c index 247de74247fa..62c53e8f26d3 100644 --- a/drivers/nvme/target/zns.c +++ b/drivers/nvme/target/zns.c @@ -522,6 +522,7 @@ static void nvmet_bdev_zone_append_bio_done(struct bio *bio) void nvmet_bdev_execute_zone_append(struct nvmet_req *req) { sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->rw.slba); + const unsigned int op = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE; u16 status = NVME_SC_SUCCESS; unsigned int total_len = 0; struct scatterlist *sg; @@ -552,13 +553,12 @@ void nvmet_bdev_execute_zone_append(struct nvmet_req *req) if (nvmet_use_inline_bvec(req)) { bio = &req->z.inline_bio; bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); + bio->bi_opf = op; } else { - bio = bio_alloc(GFP_KERNEL, req->sg_cnt); + bio = bio_alloc(req->ns->bdev, req->sg_cnt, op, GFP_KERNEL); } - bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE; bio->bi_end_io = nvmet_bdev_zone_append_bio_done; - bio_set_dev(bio, req->ns->bdev); bio->bi_iter.bi_sector = sect; bio->bi_private = req; if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) diff --git a/drivers/scsi/ufs/ufshpb.c b/drivers/scsi/ufs/ufshpb.c index 2d36a0715fca..8970068314ef 100644 --- a/drivers/scsi/ufs/ufshpb.c +++ b/drivers/scsi/ufs/ufshpb.c @@ -494,7 +494,7 @@ static struct ufshpb_req *ufshpb_get_map_req(struct ufshpb_lu *hpb, if (!map_req) return NULL; - bio = bio_alloc(GFP_KERNEL, hpb->pages_per_srgn); + bio = bio_alloc(NULL, hpb->pages_per_srgn, 0, GFP_KERNEL); if (!bio) { ufshpb_put_req(hpb, map_req); return NULL; @@ -2050,7 +2050,7 @@ static int ufshpb_pre_req_mempool_init(struct ufshpb_lu *hpb) INIT_LIST_HEAD(&pre_req->list_req); pre_req->req = NULL; - pre_req->bio = 
bio_alloc(GFP_KERNEL, 1); + pre_req->bio = bio_alloc(NULL, 1, 0, GFP_KERNEL); if (!pre_req->bio) goto release_mem; diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 3c92ba374819..87ede165ddba 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -415,10 +415,9 @@ iblock_execute_sync_cache(struct se_cmd *cmd) if (immed) target_complete_cmd(cmd, SAM_STAT_GOOD); - bio = bio_alloc(GFP_KERNEL, 0); + bio = bio_alloc(ib_dev->ibd_bd, 0, REQ_OP_WRITE | REQ_PREFLUSH, + GFP_KERNEL); bio->bi_end_io = iblock_end_io_flush; - bio_set_dev(bio, ib_dev->ibd_bd); - bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; if (!immed) bio->bi_private = cmd; submit_bio(bio); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 87a5addbedf6..f45aa506f9a6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4029,8 +4029,9 @@ static int write_dev_supers(struct btrfs_device *device, * to do I/O, so we don't lose the ability to do integrity * checking. */ - bio = bio_alloc(GFP_NOFS, 1); - bio_set_dev(bio, device->bdev); + bio = bio_alloc(device->bdev, 1, + REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, + GFP_NOFS); bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT; bio->bi_private = device; bio->bi_end_io = btrfs_end_super_write; @@ -4042,7 +4043,6 @@ static int write_dev_supers(struct btrfs_device *device, * go down lazy and there's a short window where the on-disk * copies might still contain the older version. 
*/ - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO; if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) bio->bi_opf |= REQ_FUA; diff --git a/fs/buffer.c b/fs/buffer.c index 8e112b6bd371..a17c386a142c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3024,12 +3024,16 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE)) clear_buffer_write_io_error(bh); - bio = bio_alloc(GFP_NOIO, 1); + if (buffer_meta(bh)) + op_flags |= REQ_META; + if (buffer_prio(bh)) + op_flags |= REQ_PRIO; + + bio = bio_alloc(bh->b_bdev, 1, op | op_flags, GFP_NOIO); fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio_set_dev(bio, bh->b_bdev); bio->bi_write_hint = write_hint; bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); @@ -3038,12 +3042,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; - if (buffer_meta(bh)) - op_flags |= REQ_META; - if (buffer_prio(bh)) - op_flags |= REQ_PRIO; - bio_set_op_attrs(bio, op, op_flags); - /* Take care of bh's that straddle the end of the device */ guard_bio_eod(bio); diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index bfc2a5b74ed3..755e985a42e0 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -54,7 +54,8 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, int num_pages = 0; /* This always succeeds since __GFP_DIRECT_RECLAIM is set. 
*/ - bio = bio_alloc(GFP_NOFS, BIO_MAX_VECS); + bio = bio_alloc(inode->i_sb->s_bdev, BIO_MAX_VECS, REQ_OP_WRITE, + GFP_NOFS); while (len) { unsigned int blocks_this_page = min(len, blocks_per_page); @@ -62,10 +63,8 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, if (num_pages == 0) { fscrypt_set_bio_crypt_ctx(bio, inode, lblk, GFP_NOFS); - bio_set_dev(bio, inode->i_sb->s_bdev); bio->bi_iter.bi_sector = pblk << (blockbits - SECTOR_SHIFT); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); } ret = bio_add_page(bio, ZERO_PAGE(0), bytes_this_page, 0); if (WARN_ON(ret != bytes_this_page)) { @@ -82,6 +81,8 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, if (err) goto out; bio_reset(bio); + bio_set_dev(bio, inode->i_sb->s_bdev); + bio->bi_opf = REQ_OP_WRITE; num_pages = 0; } } @@ -150,12 +151,10 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, return -EINVAL; /* This always succeeds since __GFP_DIRECT_RECLAIM is set. */ - bio = bio_alloc(GFP_NOFS, nr_pages); + bio = bio_alloc(inode->i_sb->s_bdev, nr_pages, REQ_OP_WRITE, GFP_NOFS); do { - bio_set_dev(bio, inode->i_sb->s_bdev); bio->bi_iter.bi_sector = pblk << (blockbits - 9); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); i = 0; offset = 0; @@ -183,6 +182,8 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, if (err) goto out; bio_reset(bio); + bio_set_dev(bio, inode->i_sb->s_bdev); + bio->bi_opf = REQ_OP_WRITE; } while (len != 0); err = 0; out: diff --git a/fs/direct-io.c b/fs/direct-io.c index 654443558047..38bca4980a1c 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -396,11 +396,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, * bio_alloc() is guaranteed to return a bio when allowed to sleep and * we request a valid number of vectors. 
*/ - bio = bio_alloc(GFP_KERNEL, nr_vecs); - - bio_set_dev(bio, bdev); + bio = bio_alloc(bdev, nr_vecs, dio->op | dio->op_flags, GFP_KERNEL); bio->bi_iter.bi_sector = first_sector; - bio_set_op_attrs(bio, dio->op, dio->op_flags); if (dio->is_async) bio->bi_end_io = dio_bio_end_aio; else diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 498b7666efe8..db7de2dbac73 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1371,15 +1371,14 @@ submit_bio_retry: } if (!bio) { - bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS); + bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, + REQ_OP_READ, GFP_NOIO); bio->bi_end_io = z_erofs_decompressqueue_endio; - bio_set_dev(bio, mdev.m_bdev); last_bdev = mdev.m_bdev; bio->bi_iter.bi_sector = (sector_t)cur << LOG_SECTORS_PER_BLOCK; bio->bi_private = bi_private; - bio->bi_opf = REQ_OP_READ; if (f->readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 1d370364230e..125398226873 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -398,10 +398,9 @@ static void io_submit_init_bio(struct ext4_io_submit *io, * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset(). */ - bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS); + bio = bio_alloc(bh->b_bdev, BIO_MAX_VECS, 0, GFP_NOIO); fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio_set_dev(bio, bh->b_bdev); bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); io->io_bio = bio; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 4cd62f1d848c..1aa26d6634fc 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -365,15 +365,15 @@ int ext4_mpage_readpages(struct inode *inode, * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset(). 
*/ - bio = bio_alloc(GFP_KERNEL, bio_max_segs(nr_pages)); + bio = bio_alloc(bdev, bio_max_segs(nr_pages), + REQ_OP_READ, GFP_KERNEL); fscrypt_set_bio_crypt_ctx(bio, inode, next_block, GFP_KERNEL); ext4_set_bio_post_read_ctx(bio, inode, page->index); - bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; - bio_set_op_attrs(bio, REQ_OP_READ, - rac ? REQ_RAHEAD : 0); + if (rac) + bio->bi_opf |= REQ_RAHEAD; } length = first_hole << blkbits; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index ca0bb3a73912..4ae1eefae616 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -265,10 +265,9 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno, bio_end_io_t *end_io) { struct super_block *sb = sdp->sd_vfs; - struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS); + struct bio *bio = bio_alloc(sb->s_bdev, BIO_MAX_VECS, 0, GFP_NOIO); bio->bi_iter.bi_sector = blkno << sdp->sd_fsb2bb_shift; - bio_set_dev(bio, sb->s_bdev); bio->bi_end_io = end_io; bio->bi_private = sdp; @@ -489,10 +488,9 @@ static struct bio *gfs2_chain_bio(struct bio *prev, unsigned int nr_iovecs) { struct bio *new; - new = bio_alloc(GFP_NOIO, nr_iovecs); - bio_copy_dev(new, prev); + new = bio_alloc(prev->bi_bdev, nr_iovecs, prev->bi_opf, GFP_NOIO); + bio_clone_blkg_association(new, prev); new->bi_iter.bi_sector = bio_end_sector(prev); - new->bi_opf = prev->bi_opf; new->bi_write_hint = prev->bi_write_hint; bio_chain(new, prev); submit_bio(prev); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 72d30a682ece..a580b90b7522 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -222,9 +222,8 @@ static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[], struct buffer_head *bh = *bhs; struct bio *bio; - bio = bio_alloc(GFP_NOIO, num); + bio = bio_alloc(bh->b_bdev, num, op | op_flags, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio_set_dev(bio, bh->b_bdev); while (num > 0) { bh = *bhs; if 
(!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) { @@ -235,7 +234,6 @@ static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[], num--; } bio->bi_end_io = gfs2_meta_read_endio; - bio_set_op_attrs(bio, op, op_flags); submit_bio(bio); } } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 7f8410d8fdc1..c9b423c874a3 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -251,14 +251,12 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) ClearPageDirty(page); lock_page(page); - bio = bio_alloc(GFP_NOFS, 1); + bio = bio_alloc(sb->s_bdev, 1, REQ_OP_READ | REQ_META, GFP_NOFS); bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9); - bio_set_dev(bio, sb->s_bdev); bio_add_page(bio, page, PAGE_SIZE, 0); bio->bi_end_io = end_bio_io_page; bio->bi_private = page; - bio_set_op_attrs(bio, REQ_OP_READ, REQ_META); submit_bio(bio); wait_on_page_locked(page); bio_put(bio); diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 4688cc7b3692..0b8ad6586df5 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -63,10 +63,8 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, offset = start & (io_size - 1); sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1); - bio = bio_alloc(GFP_NOIO, 1); + bio = bio_alloc(sb->s_bdev, 1, op | op_flags, GFP_NOIO); bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, sb->s_bdev); - bio_set_op_attrs(bio, op, op_flags); if (op != WRITE && data) *data = (u8 *)buf + offset; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 340d373cb1bf..70f3657a6ec0 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -290,19 +290,20 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, if (ctx->rac) /* same as readahead_gfp_mask */ gfp |= __GFP_NORETRY | __GFP_NOWARN; - ctx->bio = bio_alloc(gfp, bio_max_segs(nr_vecs)); + ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs), + REQ_OP_READ, gfp); /* * If the bio_alloc 
fails, try it again for a single page to * avoid having to deal with partial page reads. This emulates * what do_mpage_readpage does. */ - if (!ctx->bio) - ctx->bio = bio_alloc(orig_gfp, 1); - ctx->bio->bi_opf = REQ_OP_READ; + if (!ctx->bio) { + ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ, + orig_gfp); + } if (ctx->rac) ctx->bio->bi_opf |= REQ_RAHEAD; ctx->bio->bi_iter.bi_sector = sector; - bio_set_dev(ctx->bio, iomap->bdev); ctx->bio->bi_end_io = iomap_read_end_io; bio_add_folio(ctx->bio, folio, plen, poff); } @@ -1226,10 +1227,9 @@ iomap_chain_bio(struct bio *prev) { struct bio *new; - new = bio_alloc(GFP_NOFS, BIO_MAX_VECS); - bio_copy_dev(new, prev);/* also copies over blkcg information */ + new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS); + bio_clone_blkg_association(new, prev); new->bi_iter.bi_sector = bio_end_sector(prev); - new->bi_opf = prev->bi_opf; new->bi_write_hint = prev->bi_write_hint; bio_chain(prev, new); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 03ea367df19a..e2ba13645ef2 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -183,15 +183,13 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, int flags = REQ_SYNC | REQ_IDLE; struct bio *bio; - bio = bio_alloc(GFP_KERNEL, 1); - bio_set_dev(bio, iter->iomap.bdev); + bio = bio_alloc(iter->iomap.bdev, 1, REQ_OP_WRITE | flags, GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos); bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; get_page(page); __bio_add_page(bio, page, len, 0); - bio_set_op_attrs(bio, REQ_OP_WRITE, flags); iomap_dio_submit_bio(iter, dio, bio, pos); } @@ -309,14 +307,12 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, goto out; } - bio = bio_alloc(GFP_KERNEL, nr_pages); - bio_set_dev(bio, iomap->bdev); + bio = bio_alloc(iomap->bdev, nr_pages, bio_opf, GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(iomap, pos); bio->bi_write_hint = 
dio->iocb->ki_hint; bio->bi_ioprio = dio->iocb->ki_ioprio; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - bio->bi_opf = bio_opf; ret = bio_iov_iter_get_pages(bio, dio->submit.iter); if (unlikely(ret)) { diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 78fd136ac13b..997c81fcea34 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1980,17 +1980,13 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bp->l_flag |= lbmREAD; - bio = bio_alloc(GFP_NOFS, 1); - + bio = bio_alloc(log->bdev, 1, REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); - bio_set_dev(bio, log->bdev); - bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); bio->bi_end_io = lbmIODone; bio->bi_private = bp; - bio->bi_opf = REQ_OP_READ; /*check if journaling to disk has been disabled*/ if (log->no_integrity) { bio->bi_iter.bi_size = 0; @@ -2125,16 +2121,13 @@ static void lbmStartIO(struct lbuf * bp) jfs_info("lbmStartIO"); - bio = bio_alloc(GFP_NOFS, 1); + bio = bio_alloc(log->bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); - bio_set_dev(bio, log->bdev); - bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); bio->bi_end_io = lbmIODone; bio->bi_private = bp; - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; /* check if journaling to disk has been disabled */ if (log->no_integrity) { diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 104ae698443e..fde1a9cf902e 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -417,12 +417,10 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) } len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage); - bio = bio_alloc(GFP_NOFS, 1); - bio_set_dev(bio, inode->i_sb->s_bdev); + bio = bio_alloc(inode->i_sb->s_bdev, 1, REQ_OP_WRITE, GFP_NOFS); bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 
9); bio->bi_end_io = metapage_write_end_io; bio->bi_private = page; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); /* Don't call bio_add_page yet, we may add to this vec */ bio_offset = offset; @@ -497,13 +495,12 @@ static int metapage_readpage(struct file *fp, struct page *page) if (bio) submit_bio(bio); - bio = bio_alloc(GFP_NOFS, 1); - bio_set_dev(bio, inode->i_sb->s_bdev); + bio = bio_alloc(inode->i_sb->s_bdev, 1, REQ_OP_READ, + GFP_NOFS); bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_read_end_io; bio->bi_private = page; - bio_set_op_attrs(bio, REQ_OP_READ, 0); len = xlen << inode->i_blkbits; offset = block_offset << inode->i_blkbits; if (bio_add_page(bio, page, len, offset) < len) diff --git a/fs/mpage.c b/fs/mpage.c index 06e95d777e94..dbfc02e23d97 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -273,10 +273,10 @@ alloc_new: page)) goto out; } - args->bio = bio_alloc(gfp, bio_max_segs(args->nr_pages)); + args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), 0, + gfp); if (args->bio == NULL) goto confused; - bio_set_dev(args->bio, bdev); args->bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); } @@ -586,8 +586,7 @@ alloc_new: page, wbc)) goto out; } - bio = bio_alloc(GFP_NOFS, BIO_MAX_VECS); - bio_set_dev(bio, bdev); + bio = bio_alloc(bdev, BIO_MAX_VECS, 0, GFP_NOFS); bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); wbc_init_bio(wbc, bio); diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 38e063af7e98..79a8b451791f 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -154,12 +154,10 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, retry: if (!bio) { - bio = bio_alloc(GFP_NOIO, bio_max_segs(npg)); + bio = bio_alloc(map->bdev, bio_max_segs(npg), rw, GFP_NOIO); bio->bi_iter.bi_sector = disk_addr >> SECTOR_SHIFT; - bio_set_dev(bio, map->bdev); bio->bi_end_io = end_io; bio->bi_private = par; - bio_set_op_attrs(bio, rw, 0); } if 
(bio_add_page(bio, page, *len, offset) < *len) { bio = bl_submit_bio(bio); diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 53b7c6d21cdd..4f71faacd825 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -391,8 +391,8 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf, BUG_ON(wi->nr_vecs <= 0); repeat: if (!wi->bio) { - wi->bio = bio_alloc(GFP_NOIO, wi->nr_vecs); - bio_set_dev(wi->bio, wi->nilfs->ns_bdev); + wi->bio = bio_alloc(wi->nilfs->ns_bdev, wi->nr_vecs, 0, + GFP_NOIO); wi->bio->bi_iter.bi_sector = (wi->blocknr + wi->end) << (wi->nilfs->ns_blocksize_bits - 9); } diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 4a255e21ecf5..0660a07c5a96 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -1485,15 +1485,13 @@ int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run, lbo = ((u64)lcn << cluster_bits) + off; len = ((u64)clen << cluster_bits) - off; new_bio: - new = bio_alloc(GFP_NOFS, nr_pages - page_idx); + new = bio_alloc(bdev, nr_pages - page_idx, op, GFP_NOFS); if (bio) { bio_chain(bio, new); submit_bio(bio); } bio = new; - bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = lbo >> 9; - bio->bi_opf = op; while (len) { off = vbo & (PAGE_SIZE - 1); @@ -1584,14 +1582,12 @@ int ntfs_bio_fill_1(struct ntfs_sb_info *sbi, const struct runs_tree *run) lbo = (u64)lcn << cluster_bits; len = (u64)clen << cluster_bits; new_bio: - new = bio_alloc(GFP_NOFS, BIO_MAX_VECS); + new = bio_alloc(bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOFS); if (bio) { bio_chain(bio, new); submit_bio(bio); } bio = new; - bio_set_dev(bio, bdev); - bio->bi_opf = REQ_OP_WRITE; bio->bi_iter.bi_sector = lbo >> 9; for (;;) { diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index a17be1618bf7..ea0e70c0fce0 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -518,7 +518,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, * GFP_KERNEL that the local node can get fenced. 
It would be * nicest if we could pre-allocate these bios and avoid this * all together. */ - bio = bio_alloc(GFP_ATOMIC, 16); + bio = bio_alloc(reg->hr_bdev, 16, op | op_flags, GFP_ATOMIC); if (!bio) { mlog(ML_ERROR, "Could not alloc slots BIO!\n"); bio = ERR_PTR(-ENOMEM); @@ -527,10 +527,8 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, /* Must put everything in 512 byte sectors for the bio... */ bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9); - bio_set_dev(bio, reg->hr_bdev); bio->bi_private = wc; bio->bi_end_io = o2hb_bio_end_io; - bio_set_op_attrs(bio, op, op_flags); vec_start = (cs << bits) % PAGE_SIZE; while(cs < max_slots) { diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 2db8bcf7ff85..622c844f6d11 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -86,16 +86,17 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length, int error, i; struct bio *bio; - if (page_count <= BIO_MAX_VECS) - bio = bio_alloc(GFP_NOIO, page_count); - else + if (page_count <= BIO_MAX_VECS) { + bio = bio_alloc(sb->s_bdev, page_count, REQ_OP_READ, GFP_NOIO); + } else { bio = bio_kmalloc(GFP_NOIO, page_count); + bio_set_dev(bio, sb->s_bdev); + bio->bi_opf = REQ_OP_READ; + } if (!bio) return -ENOMEM; - bio_set_dev(bio, sb->s_bdev); - bio->bi_opf = READ; bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT); for (i = 0; i < page_count; ++i) { diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c index 667e297f59b1..eff4a9f21dcf 100644 --- a/fs/xfs/xfs_bio_io.c +++ b/fs/xfs/xfs_bio_io.c @@ -61,10 +61,9 @@ xfs_rw_bdev( if (is_vmalloc && op == REQ_OP_WRITE) flush_kernel_vmap_range(data, count); - bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left)); - bio_set_dev(bio, bdev); + bio = bio_alloc(bdev, bio_max_vecs(left), op | REQ_META | REQ_SYNC, + GFP_KERNEL); bio->bi_iter.bi_sector = sector; - bio->bi_opf = op | REQ_META | REQ_SYNC; do { struct page *page = kmem_to_page(data); @@ -74,10 +73,9 @@ 
xfs_rw_bdev( while (bio_add_page(bio, page, len, off) != len) { struct bio *prev = bio; - bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left)); - bio_copy_dev(bio, prev); + bio = bio_alloc(prev->bi_bdev, bio_max_vecs(left), + prev->bi_opf, GFP_KERNEL); bio->bi_iter.bi_sector = bio_end_sector(prev); - bio->bi_opf = prev->bi_opf; bio_chain(prev, bio); submit_bio(prev); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b45e0d50a405..ae87fd95b17e 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1440,12 +1440,10 @@ next_chunk: atomic_inc(&bp->b_io_remaining); nr_pages = bio_max_segs(total_nr_pages); - bio = bio_alloc(GFP_NOIO, nr_pages); - bio_set_dev(bio, bp->b_target->bt_bdev); + bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO); bio->bi_iter.bi_sector = sector; bio->bi_end_io = xfs_buf_bio_end_io; bio->bi_private = bp; - bio->bi_opf = op; for (; size && nr_pages; nr_pages--, page_index++) { int rbytes, nbytes = PAGE_SIZE - offset; diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index b76dfb310ab6..c0fc2c326dce 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -692,12 +692,11 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) if (!nr_pages) return 0; - bio = bio_alloc(GFP_NOFS, nr_pages); - bio_set_dev(bio, bdev); + bio = bio_alloc(bdev, nr_pages, + REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); bio->bi_iter.bi_sector = zi->i_zsector; bio->bi_write_hint = iocb->ki_hint; bio->bi_ioprio = iocb->ki_ioprio; - bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE; if (iocb->ki_flags & IOCB_DSYNC) bio->bi_opf |= REQ_FUA; diff --git a/include/linux/bio.h b/include/linux/bio.h index 5c5ada2ebb27..be6ac92913d4 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -418,9 +418,10 @@ extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); extern struct bio_set fs_bio_set; -static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned short nr_iovecs) +static inline struct bio 
*bio_alloc(struct block_device *bdev, + unsigned short nr_vecs, unsigned int opf, gfp_t gfp_mask) { - return bio_alloc_bioset(NULL, nr_iovecs, 0, gfp_mask, &fs_bio_set); + return bio_alloc_bioset(bdev, nr_vecs, opf, gfp_mask, &fs_bio_set); } void submit_bio(struct bio *bio); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index f1bd03129575..6c4f983cbacc 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -276,10 +276,9 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, struct bio *bio; int error = 0; - bio = bio_alloc(GFP_NOIO | __GFP_HIGH, 1); + bio = bio_alloc(hib_resume_bdev, 1, op | op_flags, + GFP_NOIO | __GFP_HIGH); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); - bio_set_dev(bio, hib_resume_bdev); - bio_set_op_attrs(bio, op, op_flags); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { pr_err("Adding page to bio failed at %llu\n", diff --git a/mm/page_io.c b/mm/page_io.c index 0bf8e40f4e57..61c792f916fa 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -338,10 +338,10 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, return 0; } - bio = bio_alloc(GFP_NOIO, 1); - bio_set_dev(bio, sis->bdev); + bio = bio_alloc(sis->bdev, 1, + REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc), + GFP_NOIO); bio->bi_iter.bi_sector = swap_page_sector(page); - bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc); bio->bi_end_io = end_write_func; bio_add_page(bio, page, thp_size(page), 0); @@ -403,9 +403,7 @@ int swap_readpage(struct page *page, bool synchronous) } ret = 0; - bio = bio_alloc(GFP_KERNEL, 1); - bio_set_dev(bio, sis->bdev); - bio->bi_opf = REQ_OP_READ; + bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL); bio->bi_iter.bi_sector = swap_page_sector(page); bio->bi_end_io = end_swap_bio_read; bio_add_page(bio, page, thp_size(page), 0); -- cgit v1.2.3 From b1f866b013e6e5583f2f0bf4a61d13eddb9a1799 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Jan 2022 08:05:48 
+0100 Subject: block: remove blk_needs_flush_plug blk_needs_flush_plug fails to account for the cb_list, which needs flushing as well. Remove it and just check if there is a plug instead of poking into the internals of the plug structure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220127070549.1377856-1-hch@lst.de Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 2 +- include/linux/blkdev.h | 13 ------------- kernel/exit.c | 2 +- kernel/sched/core.c | 2 +- 4 files changed, 3 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f8d7fe6db989..f4ce38f6fc31 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2301,7 +2301,7 @@ void wakeup_flusher_threads(enum wb_reason reason) /* * If we are expecting writeback progress we must submit plugged IO. */ - if (blk_needs_flush_plug(current)) + if (current->plug) blk_flush_plug(current->plug, true); rcu_read_lock(); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 99a4384bb8a5..f902a1c2fac0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1055,14 +1055,6 @@ extern void blk_finish_plug(struct blk_plug *); void blk_flush_plug(struct blk_plug *plug, bool from_schedule); -static inline bool blk_needs_flush_plug(struct task_struct *tsk) -{ - struct blk_plug *plug = tsk->plug; - - return plug && - (plug->mq_list || !list_empty(&plug->cb_list)); -} - int blkdev_issue_flush(struct block_device *bdev); long nr_blockdev_pages(void); #else /* CONFIG_BLOCK */ @@ -1086,11 +1078,6 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async) { } -static inline bool blk_needs_flush_plug(struct task_struct *tsk) -{ - return false; -} - static inline int blkdev_issue_flush(struct block_device *bdev) { return 0; diff --git a/kernel/exit.c b/kernel/exit.c index b00a25bb4ab9..11fc6c9df9f2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -735,7 +735,7 @@ void __noreturn do_exit(long code) struct task_struct *tsk 
= current; int group_dead; - WARN_ON(blk_needs_flush_plug(tsk)); + WARN_ON(tsk->plug); /* * If do_dead is called because this processes oopsed, it's possible diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 848eaa0efe0e..3487bb92d1f2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6344,7 +6344,7 @@ static inline void sched_submit_work(struct task_struct *tsk) * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. */ - if (blk_needs_flush_plug(tsk)) + if (tsk->plug) blk_flush_plug(tsk->plug, true); } -- cgit v1.2.3 From aa8dcccaf32bfdc09f2aff089d5d60c37da5b7b5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Jan 2022 08:05:49 +0100 Subject: block: check that there is a plug in blk_flush_plug Rename blk_flush_plug to __blk_flush_plug and add a wrapper that includes the NULL check instead of open coding that check everywhere. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220127070549.1377856-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 7 +++---- fs/fs-writeback.c | 6 ++---- include/linux/blkdev.h | 7 ++++++- kernel/sched/core.c | 7 ++----- 4 files changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/block/blk-core.c b/block/blk-core.c index d93e3bb9a769..61f6a0dc4511 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -991,8 +991,7 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return 0; - if (current->plug) - blk_flush_plug(current->plug, false); + blk_flush_plug(current->plug, false); if (blk_queue_enter(q, BLK_MQ_REQ_NOWAIT)) return 0; @@ -1274,7 +1273,7 @@ struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, } EXPORT_SYMBOL(blk_check_plugged); -void blk_flush_plug(struct blk_plug *plug, bool from_schedule) +void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) { if 
(!list_empty(&plug->cb_list)) flush_plug_callbacks(plug, from_schedule); @@ -1303,7 +1302,7 @@ void blk_flush_plug(struct blk_plug *plug, bool from_schedule) void blk_finish_plug(struct blk_plug *plug) { if (plug == current->plug) { - blk_flush_plug(plug, false); + __blk_flush_plug(plug, false); current->plug = NULL; } } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f4ce38f6fc31..33d54c9fbefc 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1903,8 +1903,7 @@ static long writeback_sb_inodes(struct super_block *sb, * unplug, so get our IOs out the door before we * give up the CPU. */ - if (current->plug) - blk_flush_plug(current->plug, false); + blk_flush_plug(current->plug, false); cond_resched(); } @@ -2301,8 +2300,7 @@ void wakeup_flusher_threads(enum wb_reason reason) /* * If we are expecting writeback progress we must submit plugged IO. */ - if (current->plug) - blk_flush_plug(current->plug, true); + blk_flush_plug(current->plug, true); rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f902a1c2fac0..654163d3b903 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1053,7 +1053,12 @@ extern void blk_start_plug(struct blk_plug *); extern void blk_start_plug_nr_ios(struct blk_plug *, unsigned short); extern void blk_finish_plug(struct blk_plug *); -void blk_flush_plug(struct blk_plug *plug, bool from_schedule); +void __blk_flush_plug(struct blk_plug *plug, bool from_schedule); +static inline void blk_flush_plug(struct blk_plug *plug, bool async) +{ + if (plug) + __blk_flush_plug(plug, async); +} int blkdev_issue_flush(struct block_device *bdev); long nr_blockdev_pages(void); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3487bb92d1f2..46152982e400 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6344,8 +6344,7 @@ static inline void sched_submit_work(struct task_struct *tsk) * If we are going to sleep and we have 
plugged IO queued, * make sure to submit it to avoid deadlocks. */ - if (tsk->plug) - blk_flush_plug(tsk->plug, true); + blk_flush_plug(tsk->plug, true); } static void sched_update_worker(struct task_struct *tsk) @@ -8371,9 +8370,7 @@ int io_schedule_prepare(void) int old_iowait = current->in_iowait; current->in_iowait = 1; - if (current->plug) - blk_flush_plug(current->plug, true); - + blk_flush_plug(current->plug, true); return old_iowait; } -- cgit v1.2.3 From fe13889c390e14205e064d7e159e61eb5da4b1c3 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Fri, 28 Jan 2022 19:07:27 +0800 Subject: genirq, softirq: Use in_hardirq() instead of in_irq() Replace the obsolete and ambiguous macro in_irq() with the new macro in_hardirq(). Signed-off-by: Changbin Du Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220128110727.5110-1-changbin.du@gmail.com --- kernel/irq/irqdesc.c | 4 ++-- kernel/softirq.c | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 2267e6527db3..6167d32e27da 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -640,7 +640,7 @@ int handle_irq_desc(struct irq_desc *desc) return -EINVAL; data = irq_desc_get_irq_data(desc); - if (WARN_ON_ONCE(!in_irq() && handle_enforce_irqctx(data))) + if (WARN_ON_ONCE(!in_hardirq() && handle_enforce_irqctx(data))) return -EPERM; generic_handle_irq_desc(desc); @@ -676,7 +676,7 @@ EXPORT_SYMBOL_GPL(generic_handle_irq); */ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq) { - WARN_ON_ONCE(!in_irq()); + WARN_ON_ONCE(!in_hardirq()); return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); } EXPORT_SYMBOL_GPL(generic_handle_domain_irq); diff --git a/kernel/softirq.c b/kernel/softirq.c index 41f470929e99..fac801815554 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -222,7 +222,7 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) u32 pending; int curcnt; -
WARN_ON_ONCE(in_irq()); + WARN_ON_ONCE(in_hardirq()); lockdep_assert_irqs_enabled(); local_irq_save(flags); @@ -305,7 +305,7 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { unsigned long flags; - WARN_ON_ONCE(in_irq()); + WARN_ON_ONCE(in_hardirq()); raw_local_irq_save(flags); /* @@ -352,14 +352,14 @@ static void __local_bh_enable(unsigned int cnt) */ void _local_bh_enable(void) { - WARN_ON_ONCE(in_irq()); + WARN_ON_ONCE(in_hardirq()); __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); } EXPORT_SYMBOL(_local_bh_enable); void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) { - WARN_ON_ONCE(in_irq()); + WARN_ON_ONCE(in_hardirq()); lockdep_assert_irqs_enabled(); #ifdef CONFIG_TRACE_IRQFLAGS local_irq_disable(); @@ -618,7 +618,7 @@ static inline void tick_irq_exit(void) /* Make sure that timer wheel updates are propagated */ if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) { - if (!in_irq()) + if (!in_hardirq()) tick_nohz_irq_exit(); } #endif -- cgit v1.2.3 From b293dcc473d22a62dc6d78de2b15e4f49515db56 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 2 Feb 2022 14:01:58 +0800 Subject: bpf: Use VM_MAP instead of VM_ALLOC for ringbuf After commit 2fd3fb0be1d1 ("kasan, vmalloc: unpoison VM_ALLOC pages after mapping"), non-VM_ALLOC mappings will be marked as accessible in __get_vm_area_node() when KASAN is enabled. But now the flag for ringbuf area is VM_ALLOC, so KASAN will complain out-of-bound access after vmap() returns. Because the ringbuf area is created by mapping allocated pages, so use VM_MAP instead. 
After the change, info in /proc/vmallocinfo also changes from [start]-[end] 24576 ringbuf_map_alloc+0x171/0x290 vmalloc user to [start]-[end] 24576 ringbuf_map_alloc+0x171/0x290 vmap user Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Reported-by: syzbot+5ad567a418794b9b5983@syzkaller.appspotmail.com Signed-off-by: Hou Tao Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220202060158.6260-1-houtao1@huawei.com --- kernel/bpf/ringbuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 638d7fd7b375..710ba9de12ce 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -104,7 +104,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) } rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages, - VM_ALLOC | VM_USERMAP, PAGE_KERNEL); + VM_MAP | VM_USERMAP, PAGE_KERNEL); if (rb) { kmemleak_not_leak(pages); rb->pages = pages; -- cgit v1.2.3 From 2bdfd2825c9662463371e6691b1a794e97fa36b4 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 2 Feb 2022 22:31:03 -0500 Subject: cgroup/cpuset: Fix "suspicious RCU usage" lockdep warning It was found that a "suspicious RCU usage" lockdep warning was issued with the rcu_read_lock() call in update_sibling_cpumasks(). It is because the update_cpumasks_hier() function may sleep. So we have to release the RCU lock, call update_cpumasks_hier() and reacquire it afterward. Also add a percpu_rwsem_assert_held() in update_sibling_cpumasks() instead of stating that in the comment. 
Fixes: 4716909cc5c5 ("cpuset: Track cpusets that use parent's effective_cpus") Signed-off-by: Waiman Long Tested-by: Phil Auld Reviewed-by: Phil Auld Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 804ff5738c5f..4c7254e8f49a 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1550,10 +1550,15 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct cpuset *sibling; struct cgroup_subsys_state *pos_css; + percpu_rwsem_assert_held(&cpuset_rwsem); + /* * Check all its siblings and call update_cpumasks_hier() * if their use_parent_ecpus flag is set in order for them * to use the right effective_cpus value. + * + * The update_cpumasks_hier() function may sleep. So we have to + * release the RCU read lock before calling it. */ rcu_read_lock(); cpuset_for_each_child(sibling, pos_css, parent) { @@ -1561,8 +1566,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, continue; if (!sibling->use_parent_ecpus) continue; + if (!css_tryget_online(&sibling->css)) + continue; + rcu_read_unlock(); update_cpumasks_hier(sibling, tmp); + rcu_read_lock(); + css_put(&sibling->css); } rcu_read_unlock(); } -- cgit v1.2.3 From 67d6212afda218d564890d1674bab28e8612170f Mon Sep 17 00:00:00 2001 From: Igor Pylypiv Date: Thu, 27 Jan 2022 15:39:53 -0800 Subject: Revert "module, async: async_synchronize_full() on module init iff async is used" This reverts commit 774a1221e862b343388347bac9b318767336b20b. We need to finish all async code before the module init sequence is done. In the reverted commit the PF_USED_ASYNC flag was added to mark a thread that called async_schedule(). Then the PF_USED_ASYNC flag was used to determine whether or not async_synchronize_full() needs to be invoked. 
This works when modprobe thread is calling async_schedule(), but it does not work if module dispatches init code to a worker thread which then calls async_schedule(). For example, PCI driver probing is invoked from a worker thread based on a node where device is attached: if (cpu < nr_cpu_ids) error = work_on_cpu(cpu, local_pci_probe, &ddi); else error = local_pci_probe(&ddi); We end up in a situation where a worker thread gets the PF_USED_ASYNC flag set instead of the modprobe thread. As a result, async_synchronize_full() is not invoked and modprobe completes without waiting for the async code to finish. The issue was discovered while loading the pm80xx driver: (scsi_mod.scan=async) modprobe pm80xx worker ... do_init_module() ... pci_call_probe() work_on_cpu(local_pci_probe) local_pci_probe() pm8001_pci_probe() scsi_scan_host() async_schedule() worker->flags |= PF_USED_ASYNC; ... < return from worker > ... if (current->flags & PF_USED_ASYNC) <--- false async_synchronize_full(); Commit 21c3c5d28007 ("block: don't request module during elevator init") fixed the deadlock issue which the reverted commit 774a1221e862 ("module, async: async_synchronize_full() on module init iff async is used") tried to fix. Since commit 0fdff3ec6d87 ("async, kmod: warn on synchronous request_module() from async workers") synchronous module loading from async is not allowed. Given that the original deadlock issue is fixed and it is no longer allowed to call synchronous request_module() from async we can remove PF_USED_ASYNC flag to make module init consistently invoke async_synchronize_full() unless async module probe is requested. 
Signed-off-by: Igor Pylypiv Reviewed-by: Changyuan Lyu Reviewed-by: Luis Chamberlain Acked-by: Tejun Heo Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 - kernel/async.c | 3 --- kernel/module.c | 25 +++++-------------------- 3 files changed, 5 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index f5b2be39a78c..75ba8aa60248 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1680,7 +1680,6 @@ extern struct pid *cad_pid; #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ -#define PF_USED_ASYNC 0x00004000 /* Used async_schedule*(), used by module init */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_FROZEN 0x00010000 /* Frozen for system suspend */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ diff --git a/kernel/async.c b/kernel/async.c index b8d7a663497f..b2c4ba5686ee 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -205,9 +205,6 @@ async_cookie_t async_schedule_node_domain(async_func_t func, void *data, atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); - /* mark that this task has queued an async job, used by module init */ - current->flags |= PF_USED_ASYNC; - /* schedule for execution */ queue_work_node(node, system_unbound_wq, &entry->work); diff --git a/kernel/module.c b/kernel/module.c index 24dab046e16c..46a5c2ed1928 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3725,12 +3725,6 @@ static noinline int do_init_module(struct module *mod) } freeinit->module_init = mod->init_layout.base; - /* - * We want to find out whether @mod uses async during init. Clear - * PF_USED_ASYNC. async_schedule*() will set it. 
- */ - current->flags &= ~PF_USED_ASYNC; - do_mod_ctors(mod); /* Start the module */ if (mod->init != NULL) @@ -3756,22 +3750,13 @@ static noinline int do_init_module(struct module *mod) /* * We need to finish all async code before the module init sequence - * is done. This has potential to deadlock. For example, a newly - * detected block device can trigger request_module() of the - * default iosched from async probing task. Once userland helper - * reaches here, async_synchronize_full() will wait on the async - * task waiting on request_module() and deadlock. - * - * This deadlock is avoided by perfomring async_synchronize_full() - * iff module init queued any async jobs. This isn't a full - * solution as it will deadlock the same if module loading from - * async jobs nests more than once; however, due to the various - * constraints, this hack seems to be the best option for now. - * Please refer to the following thread for details. + * is done. This has potential to deadlock if synchronous module + * loading is requested from async (which is not allowed!). * - * http://thread.gmane.org/gmane.linux.kernel/1420814 + * See commit 0fdff3ec6d87 ("async, kmod: warn on synchronous + * request_module() from async workers") for more details. */ - if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC)) + if (!mod->async_probe_requested) async_synchronize_full(); ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base + -- cgit v1.2.3 From 1f2cfdd349b7647f438c1e552dc1b983da86d830 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Thu, 3 Feb 2022 15:50:29 +0100 Subject: printk: Fix incorrect __user type in proc_dointvec_minmax_sysadmin() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The move of proc_dointvec_minmax_sysadmin() from kernel/sysctl.c to kernel/printk/sysctl.c introduced an incorrect __user attribute to the buffer argument. I spotted this change in [1] as well as the kernel test robot. 
Revert this change to please sparse: kernel/printk/sysctl.c:20:51: warning: incorrect type in argument 3 (different address spaces) kernel/printk/sysctl.c:20:51: expected void * kernel/printk/sysctl.c:20:51: got void [noderef] __user *buffer Fixes: faaa357a55e0 ("printk: move printk sysctl to printk/sysctl.c") Link: https://lore.kernel.org/r/20220104155024.48023-2-mic@digikod.net [1] Reported-by: kernel test robot Cc: Andrew Morton Cc: John Ogness Cc: Luis Chamberlain Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Xiaoming Ni Signed-off-by: Mickaël Salaün Link: https://lore.kernel.org/r/20220203145029.272640-1-mic@digikod.net Signed-off-by: Linus Torvalds --- kernel/printk/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c index 653ae04aab7f..c228343eeb97 100644 --- a/kernel/printk/sysctl.c +++ b/kernel/printk/sysctl.c @@ -12,7 +12,7 @@ static const int ten_thousand = 10000; static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; -- cgit v1.2.3 From d7e7b42f4f956f2c68ad8cda87d750093dbba737 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 3 Feb 2022 11:17:27 -0800 Subject: bpf: Fix a btf decl_tag bug when tagging a function syzbot reported a btf decl_tag bug with stack trace below: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 0 PID: 3592 Comm: syz-executor914 Not tainted 5.16.0-syzkaller-11424-gb7892f7d5cb2 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:btf_type_vlen include/linux/btf.h:231 [inline] RIP: 0010:btf_decl_tag_resolve+0x83e/0xaa0 kernel/bpf/btf.c:3910 ... 
Call Trace: btf_resolve+0x251/0x1020 kernel/bpf/btf.c:4198 btf_check_all_types kernel/bpf/btf.c:4239 [inline] btf_parse_type_sec kernel/bpf/btf.c:4280 [inline] btf_parse kernel/bpf/btf.c:4513 [inline] btf_new_fd+0x19fe/0x2370 kernel/bpf/btf.c:6047 bpf_btf_load kernel/bpf/syscall.c:4039 [inline] __sys_bpf+0x1cbb/0x5970 kernel/bpf/syscall.c:4679 __do_sys_bpf kernel/bpf/syscall.c:4738 [inline] __se_sys_bpf kernel/bpf/syscall.c:4736 [inline] __x64_sys_bpf+0x75/0xb0 kernel/bpf/syscall.c:4736 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae The kasan error is triggered with an illegal BTF like below: type 0: void type 1: int type 2: decl_tag to func type 3 type 3: func to func_proto type 8 The total number of types is 4 and the type 3 is illegal since its func_proto type is out of range. Currently, the target type of decl_tag can be struct/union, var or func. Both struct/union and var implemented their own 'resolve' callback functions and hence are handled properly in the kernel. But func type doesn't have a 'resolve' callback function. When btf_decl_tag_resolve() tries to check func type, it tries to get vlen of its func_proto type, which triggered the above kasan error. To fix the issue, btf_decl_tag_resolve() needs to do btf_func_check() before trying to access the func_proto type. In the current implementation, func type is checked with btf_func_check() in the main checking function btf_check_all_types(). To fix the above kasan issue, let us implement the 'resolve' callback for func type properly. The 'resolve' callback will also be called in btf_check_all_types() for func types.
Fixes: b5ea834dde6b ("bpf: Support for new btf kind BTF_KIND_TAG") Reported-by: syzbot+53619be9444215e785ed@syzkaller.appspotmail.com Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220203191727.741862-1-yhs@fb.com --- kernel/bpf/btf.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index b983cee8d196..9b47972c57a4 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -419,6 +419,9 @@ static struct btf_type btf_void; static int btf_resolve(struct btf_verifier_env *env, const struct btf_type *t, u32 type_id); +static int btf_func_check(struct btf_verifier_env *env, + const struct btf_type *t); + static bool btf_type_is_modifier(const struct btf_type *t) { /* Some of them is not strictly a C modifier @@ -595,6 +598,7 @@ static bool btf_type_needs_resolve(const struct btf_type *t) btf_type_is_struct(t) || btf_type_is_array(t) || btf_type_is_var(t) || + btf_type_is_func(t) || btf_type_is_decl_tag(t) || btf_type_is_datasec(t); } @@ -3571,9 +3575,24 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env, return 0; } +static int btf_func_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_type *t = v->t; + u32 next_type_id = t->type; + int err; + + err = btf_func_check(env, t); + if (err) + return err; + + env_stack_pop_resolved(env, next_type_id, 0); + return 0; +} + static struct btf_kind_operations func_ops = { .check_meta = btf_func_check_meta, - .resolve = btf_df_resolve, + .resolve = btf_func_resolve, .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_ref_type_log, @@ -4194,7 +4213,7 @@ static bool btf_resolve_valid(struct btf_verifier_env *env, return !btf_resolved_type_id(btf, type_id) && !btf_resolved_type_size(btf, type_id); - if (btf_type_is_decl_tag(t)) + if 
(btf_type_is_decl_tag(t) || btf_type_is_func(t)) return btf_resolved_type_id(btf, type_id) && !btf_resolved_type_size(btf, type_id); @@ -4284,12 +4303,6 @@ static int btf_check_all_types(struct btf_verifier_env *env) if (err) return err; } - - if (btf_type_is_func(t)) { - err = btf_func_check(env, t); - if (err) - return err; - } } return 0; -- cgit v1.2.3 From dcb85f85fa6f142aae1fe86f399d4503d49f2b60 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 3 Feb 2022 12:17:54 -0800 Subject: gcc-plugins/stackleak: Use noinstr in favor of notrace While the stackleak plugin was already using notrace, objtool is now a bit more picky. Update the notrace uses to noinstr. Silences the following objtool warnings when building with: CONFIG_DEBUG_ENTRY=y CONFIG_STACK_VALIDATION=y CONFIG_VMLINUX_VALIDATION=y CONFIG_GCC_PLUGIN_STACKLEAK=y vmlinux.o: warning: objtool: do_syscall_64()+0x9: call to stackleak_track_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: do_int80_syscall_32()+0x9: call to stackleak_track_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: exc_general_protection()+0x22: call to stackleak_track_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: fixup_bad_iret()+0x20: call to stackleak_track_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: do_machine_check()+0x27: call to stackleak_track_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: .text+0x5346e: call to stackleak_erase() leaves .noinstr.text section vmlinux.o: warning: objtool: .entry.text+0x143: call to stackleak_erase() leaves .noinstr.text section vmlinux.o: warning: objtool: .entry.text+0x10eb: call to stackleak_erase() leaves .noinstr.text section vmlinux.o: warning: objtool: .entry.text+0x17f9: call to stackleak_erase() leaves .noinstr.text section Note that the plugin's addition of calls to stackleak_track_stack() from noinstr functions is expected to be safe, as it isn't runtime instrumentation and is self-contained. 
Cc: Alexander Popov Suggested-by: Peter Zijlstra Signed-off-by: Kees Cook Signed-off-by: Linus Torvalds --- kernel/stackleak.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/stackleak.c b/kernel/stackleak.c index 66b8af394e58..ddb5a7f48d69 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -70,7 +70,7 @@ late_initcall(stackleak_sysctls_init); #define skip_erasing() false #endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */ -asmlinkage void notrace stackleak_erase(void) +asmlinkage void noinstr stackleak_erase(void) { /* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */ unsigned long kstack_ptr = current->lowest_stack; @@ -124,9 +124,8 @@ asmlinkage void notrace stackleak_erase(void) /* Reset the 'lowest_stack' value for the next syscall */ current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64; } -NOKPROBE_SYMBOL(stackleak_erase); -void __used __no_caller_saved_registers notrace stackleak_track_stack(void) +void __used __no_caller_saved_registers noinstr stackleak_track_stack(void) { unsigned long sp = current_stack_pointer; -- cgit v1.2.3 From 53725c4cbd4567423ff6143c5d10300e53ecf52a Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Sun, 23 Jan 2022 20:45:07 +0800 Subject: cpufreq: schedutil: Use to_gov_attr_set() to get the gov_attr_set The to_gov_attr_set() has been moved to the cpufreq.h, so use it to get the gov_attr_set. Signed-off-by: Kevin Hao Signed-off-by: Rafael J. 
Wysocki --- kernel/sched/cpufreq_schedutil.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 26778884d9ab..cffcd08f4ec8 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -539,7 +539,7 @@ ATTRIBUTE_GROUPS(sugov); static void sugov_tunables_free(struct kobject *kobj) { - struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj); + struct gov_attr_set *attr_set = to_gov_attr_set(kobj); kfree(to_sugov_tunables(attr_set)); } -- cgit v1.2.3 From e70e13e7d4ab8f932f49db1c9500b30a34a6d420 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Fri, 4 Feb 2022 01:55:18 +0100 Subject: bpf: Implement bpf_core_types_are_compat(). Adopt libbpf's bpf_core_types_are_compat() for kernel duty by adding explicit recursion limit of 2 which is enough to handle 2 levels of function prototypes. Signed-off-by: Matteo Croce Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220204005519.60361-2-mcroce@linux.microsoft.com --- include/linux/btf.h | 5 +++ kernel/bpf/btf.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 109 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/btf.h b/include/linux/btf.h index f6c43dd513fa..36bc09b8e890 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -327,6 +327,11 @@ static inline const struct btf_var_secinfo *btf_type_var_secinfo( return (const struct btf_var_secinfo *)(t + 1); } +static inline struct btf_param *btf_params(const struct btf_type *t) +{ + return (struct btf_param *)(t + 1); +} + #ifdef CONFIG_BPF_SYSCALL struct bpf_prog; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9b47972c57a4..11740b300de9 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6798,10 +6798,113 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, } EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set); +#define 
MAX_TYPES_ARE_COMPAT_DEPTH 2 + +static +int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id, + int level) +{ + const struct btf_type *local_type, *targ_type; + int depth = 32; /* max recursion depth */ + + /* caller made sure that names match (ignoring flavor suffix) */ + local_type = btf_type_by_id(local_btf, local_id); + targ_type = btf_type_by_id(targ_btf, targ_id); + if (btf_kind(local_type) != btf_kind(targ_type)) + return 0; + +recur: + depth--; + if (depth < 0) + return -EINVAL; + + local_type = btf_type_skip_modifiers(local_btf, local_id, &local_id); + targ_type = btf_type_skip_modifiers(targ_btf, targ_id, &targ_id); + if (!local_type || !targ_type) + return -EINVAL; + + if (btf_kind(local_type) != btf_kind(targ_type)) + return 0; + + switch (btf_kind(local_type)) { + case BTF_KIND_UNKN: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + case BTF_KIND_FWD: + return 1; + case BTF_KIND_INT: + /* just reject deprecated bitfield-like integers; all other + * integers are by default compatible between each other + */ + return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0; + case BTF_KIND_PTR: + local_id = local_type->type; + targ_id = targ_type->type; + goto recur; + case BTF_KIND_ARRAY: + local_id = btf_array(local_type)->type; + targ_id = btf_array(targ_type)->type; + goto recur; + case BTF_KIND_FUNC_PROTO: { + struct btf_param *local_p = btf_params(local_type); + struct btf_param *targ_p = btf_params(targ_type); + __u16 local_vlen = btf_vlen(local_type); + __u16 targ_vlen = btf_vlen(targ_type); + int i, err; + + if (local_vlen != targ_vlen) + return 0; + + for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { + if (level <= 0) + return -EINVAL; + + btf_type_skip_modifiers(local_btf, local_p->type, &local_id); + btf_type_skip_modifiers(targ_btf, targ_p->type, &targ_id); + err = __bpf_core_types_are_compat(local_btf, local_id, + targ_btf, targ_id, + 
level - 1); + if (err <= 0) + return err; + } + + /* tail recurse for return type check */ + btf_type_skip_modifiers(local_btf, local_type->type, &local_id); + btf_type_skip_modifiers(targ_btf, targ_type->type, &targ_id); + goto recur; + } + default: + return 0; + } +} + +/* Check local and target types for compatibility. This check is used for + * type-based CO-RE relocations and follow slightly different rules than + * field-based relocations. This function assumes that root types were already + * checked for name match. Beyond that initial root-level name check, names + * are completely ignored. Compatibility rules are as follows: + * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but + * kind should match for local and target types (i.e., STRUCT is not + * compatible with UNION); + * - for ENUMs, the size is ignored; + * - for INT, size and signedness are ignored; + * - for ARRAY, dimensionality is ignored, element types are checked for + * compatibility recursively; + * - CONST/VOLATILE/RESTRICT modifiers are ignored; + * - TYPEDEFs/PTRs are compatible if types they pointing to are compatible; + * - FUNC_PROTOs are compatible if they have compatible signature: same + * number of input args and compatible return and argument types. + * These rules are not set in stone and probably will be adjusted as we get + * more experience with using BPF CO-RE relocations. 
+ */ int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id) { - return -EOPNOTSUPP; + return __bpf_core_types_are_compat(local_btf, local_id, + targ_btf, targ_id, + MAX_TYPES_ARE_COMPAT_DEPTH); } static bool bpf_core_is_flavor_sep(const char *s) -- cgit v1.2.3 From 13765de8148f71fa795e0a6607de37c49ea5915a Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Thu, 3 Feb 2022 08:18:46 -0800 Subject: sched/fair: Fix fault in reweight_entity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Syzbot found a GPF in reweight_entity. This has been bisected to commit 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group") There is a race between sched_post_fork() and setpriority(PRIO_PGRP) within a thread group that causes a null-ptr-deref in reweight_entity() in CFS. The scenario is that the main process spawns a number of new threads, which then call setpriority(PRIO_PGRP, 0, -20), wait, and exit. For each of the new threads the copy_process() gets invoked, which adds the new task_struct and calls sched_post_fork() for it. In the above scenario there is a possibility that setpriority(PRIO_PGRP) and set_one_prio() will be called for a thread in the group that is just being created by copy_process(), and for which the sched_post_fork() has not been executed yet. This will trigger a null pointer dereference in reweight_entity(), as it will try to access the run queue pointer, which hasn't been set. Before the mentioned change the cfs_rq pointer for the task has been set in sched_fork(), which is called much earlier in copy_process(), before the new task is added to the thread_group. Now it is done in the sched_post_fork(), which is called after that. To fix the issue, remove the update_load param from the set_load_weight() function and call reweight_task() only if the task state doesn't have the TASK_NEW flag set.
Fixes: 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group") Reported-by: syzbot+af7a719bc92395ee41b3@syzkaller.appspotmail.com Signed-off-by: Tadeusz Struk Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20220203161846.1160750-1-tadeusz.struk@linaro.org --- kernel/sched/core.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 848eaa0efe0e..fcf0c180617c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1214,8 +1214,9 @@ int tg_nop(struct task_group *tg, void *data) } #endif -static void set_load_weight(struct task_struct *p, bool update_load) +static void set_load_weight(struct task_struct *p) { + bool update_load = !(READ_ONCE(p->__state) & TASK_NEW); int prio = p->static_prio - MAX_RT_PRIO; struct load_weight *load = &p->se.load; @@ -4406,7 +4407,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->static_prio = NICE_TO_PRIO(0); p->prio = p->normal_prio = p->static_prio; - set_load_weight(p, false); + set_load_weight(p); /* * We don't need the reset flag anymore after the fork. 
It has @@ -6921,7 +6922,7 @@ void set_user_nice(struct task_struct *p, long nice) put_prev_task(rq, p); p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p, true); + set_load_weight(p); old_prio = p->prio; p->prio = effective_prio(p); @@ -7212,7 +7213,7 @@ static void __setscheduler_params(struct task_struct *p, */ p->rt_priority = attr->sched_priority; p->normal_prio = normal_prio(p); - set_load_weight(p, true); + set_load_weight(p); } /* @@ -9445,7 +9446,7 @@ void __init sched_init(void) #endif } - set_load_weight(&init_task, false); + set_load_weight(&init_task); /* * The boot idle thread does lazy MMU switching as well: -- cgit v1.2.3 From 5f4e5ce638e6a490b976ade4a40017b40abb2da0 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 3 Feb 2022 16:40:57 -0800 Subject: perf: Fix list corruption in perf_cgroup_switch() There's list corruption on cgrp_cpuctx_list. This happens on the following path: perf_cgroup_switch: list_for_each_entry(cgrp_cpuctx_list) cpu_ctx_sched_in ctx_sched_in ctx_pinned_sched_in merge_sched_in perf_cgroup_event_disable: remove the event from the list Use list_for_each_entry_safe() to allow removing an entry during iteration. 
Fixes: 058fe1c0440e ("perf/core: Make cgroup switch visit only cpuctxs with cgroup events") Signed-off-by: Song Liu Reviewed-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220204004057.2961252-1-song@kernel.org --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 57c7197838db..6859229497b1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -839,7 +839,7 @@ static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); */ static void perf_cgroup_switch(struct task_struct *task, int mode) { - struct perf_cpu_context *cpuctx; + struct perf_cpu_context *cpuctx, *tmp; struct list_head *list; unsigned long flags; @@ -850,7 +850,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) local_irq_save(flags); list = this_cpu_ptr(&cgrp_cpuctx_list); - list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) { + list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) { WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); perf_ctx_lock(cpuctx, cpuctx->task_ctx); -- cgit v1.2.3 From 2f293651eca3eacaeb56747dede31edace7329d2 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 20 Dec 2021 16:38:02 +0000 Subject: livepatch: Fix build failure on 32 bits processors Trying to build livepatch on powerpc/32 results in: kernel/livepatch/core.c: In function 'klp_resolve_symbols': kernel/livepatch/core.c:221:23: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] 221 | sym = (Elf64_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info); | ^ kernel/livepatch/core.c:221:21: error: assignment to 'Elf32_Sym *' {aka 'struct elf32_sym *'} from incompatible pointer type 'Elf64_Sym *' {aka 'struct elf64_sym *'} [-Werror=incompatible-pointer-types] 221 | sym = (Elf64_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info); | ^ kernel/livepatch/core.c: In function 
'klp_apply_section_relocs': kernel/livepatch/core.c:312:35: error: passing argument 1 of 'klp_resolve_symbols' from incompatible pointer type [-Werror=incompatible-pointer-types] 312 | ret = klp_resolve_symbols(sechdrs, strtab, symndx, sec, sec_objname); | ^~~~~~~ | | | Elf32_Shdr * {aka struct elf32_shdr *} kernel/livepatch/core.c:193:44: note: expected 'Elf64_Shdr *' {aka 'struct elf64_shdr *'} but argument is of type 'Elf32_Shdr *' {aka 'struct elf32_shdr *'} 193 | static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab, | ~~~~~~~~~~~~^~~~~~~ Fix it by using the right types instead of forcing 64 bits types. Fixes: 7c8e2bdd5f0d ("livepatch: Apply vmlinux-specific KLP relocations early") Signed-off-by: Christophe Leroy Acked-by: Petr Mladek Acked-by: Joe Lawrence Acked-by: Miroslav Benes Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/5288e11b018a762ea3351cc8fb2d4f15093a4457.1640017960.git.christophe.leroy@csgroup.eu --- kernel/livepatch/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 585494ec464f..bc475e62279d 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -190,7 +190,7 @@ static int klp_find_object_symbol(const char *objname, const char *name, return -EINVAL; } -static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab, +static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab, unsigned int symndx, Elf_Shdr *relasec, const char *sec_objname) { @@ -218,7 +218,7 @@ static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab, relas = (Elf_Rela *) relasec->sh_addr; /* For each rela in this klp relocation section */ for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) { - sym = (Elf64_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info); + sym = (Elf_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info); if (sym->st_shndx != SHN_LIVEPATCH) { pr_err("symbol 
%s is not marked as a livepatch symbol\n", strtab + sym->st_name); -- cgit v1.2.3 From cb1f65c1e1424a4b5e4a86da8aa3b8fd8459c8ec Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 4 Feb 2022 18:35:22 +0100 Subject: PM: s2idle: ACPI: Fix wakeup interrupts handling After commit e3728b50cd9b ("ACPI: PM: s2idle: Avoid possible race related to the EC GPE") wakeup interrupts occurring immediately after the one discarded by acpi_s2idle_wake() may be missed. Moreover, if the SCI triggers again immediately after the rearming in acpi_s2idle_wake(), that wakeup may be missed too. The problem is that pm_system_irq_wakeup() only calls pm_system_wakeup() when pm_wakeup_irq is 0, but that's not the case any more after the interrupt causing acpi_s2idle_wake() to run until pm_wakeup_irq is cleared by the pm_wakeup_clear() call in s2idle_loop(). However, there may be wakeup interrupts occurring in that time frame and if that happens, they will be missed. To address that issue, first move the clearing of pm_wakeup_irq to the point at which it is known that the interrupt causing acpi_s2idle_wake() to run will be discarded, before rearming the SCI for wakeup. Moreover, because that only reduces the size of the time window in which the issue may manifest itself, allow pm_system_irq_wakeup() to register two wakeup interrupts in a row and, when discarding the first one, replace it with the second one. [Of course, this assumes that only one wakeup interrupt can be discarded in one go, but currently that is the case and I am not aware of any plans to change that.] Fixes: e3728b50cd9b ("ACPI: PM: s2idle: Avoid possible race related to the EC GPE") Cc: 5.4+ # 5.4+ Signed-off-by: Rafael J.
Wysocki --- drivers/acpi/sleep.c | 1 + drivers/base/power/wakeup.c | 41 ++++++++++++++++++++++++++++++++++------- include/linux/suspend.h | 4 ++-- kernel/power/main.c | 5 ++++- kernel/power/process.c | 2 +- kernel/power/suspend.c | 2 -- 6 files changed, 42 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index fac7c9d4c9a1..d4fbea91ab6b 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -758,6 +758,7 @@ bool acpi_s2idle_wake(void) return true; } + pm_wakeup_clear(acpi_sci_irq); rearm_wake_irq(acpi_sci_irq); } diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 99bda0da23a8..8666590201c9 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -34,7 +34,8 @@ suspend_state_t pm_suspend_target_state; bool events_check_enabled __read_mostly; /* First wakeup IRQ seen by the kernel in the last cycle. */ -unsigned int pm_wakeup_irq __read_mostly; +static unsigned int wakeup_irq[2] __read_mostly; +static DEFINE_RAW_SPINLOCK(wakeup_irq_lock); /* If greater than 0 and the system is suspending, terminate the suspend. 
*/ static atomic_t pm_abort_suspend __read_mostly; @@ -942,19 +943,45 @@ void pm_system_cancel_wakeup(void) atomic_dec_if_positive(&pm_abort_suspend); } -void pm_wakeup_clear(bool reset) +void pm_wakeup_clear(unsigned int irq_number) { - pm_wakeup_irq = 0; - if (reset) + raw_spin_lock_irq(&wakeup_irq_lock); + + if (irq_number && wakeup_irq[0] == irq_number) + wakeup_irq[0] = wakeup_irq[1]; + else + wakeup_irq[0] = 0; + + wakeup_irq[1] = 0; + + raw_spin_unlock_irq(&wakeup_irq_lock); + + if (!irq_number) atomic_set(&pm_abort_suspend, 0); } void pm_system_irq_wakeup(unsigned int irq_number) { - if (pm_wakeup_irq == 0) { - pm_wakeup_irq = irq_number; + unsigned long flags; + + raw_spin_lock_irqsave(&wakeup_irq_lock, flags); + + if (wakeup_irq[0] == 0) + wakeup_irq[0] = irq_number; + else if (wakeup_irq[1] == 0) + wakeup_irq[1] = irq_number; + else + irq_number = 0; + + raw_spin_unlock_irqrestore(&wakeup_irq_lock, flags); + + if (irq_number) pm_system_wakeup(); - } +} + +unsigned int pm_wakeup_irq(void) +{ + return wakeup_irq[0]; } /** diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 3e8ecdebe601..300273ff40cc 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -497,14 +497,14 @@ extern void ksys_sync_helper(void); /* drivers/base/power/wakeup.c */ extern bool events_check_enabled; -extern unsigned int pm_wakeup_irq; extern suspend_state_t pm_suspend_target_state; extern bool pm_wakeup_pending(void); extern void pm_system_wakeup(void); extern void pm_system_cancel_wakeup(void); -extern void pm_wakeup_clear(bool reset); +extern void pm_wakeup_clear(unsigned int irq_number); extern void pm_system_irq_wakeup(unsigned int irq_number); +extern unsigned int pm_wakeup_irq(void); extern bool pm_get_wakeup_count(unsigned int *count, bool block); extern bool pm_save_wakeup_count(unsigned int count); extern void pm_wakep_autosleep_enabled(bool set); diff --git a/kernel/power/main.c b/kernel/power/main.c index 44169f3081fd..7e646079fbeb 100644 
--- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -504,7 +504,10 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA; + if (!pm_wakeup_irq()) + return -ENODATA; + + return sprintf(buf, "%u\n", pm_wakeup_irq()); } power_attr_ro(pm_wakeup_irq); diff --git a/kernel/power/process.c b/kernel/power/process.c index b7e7798637b8..11b570fcf049 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -134,7 +134,7 @@ int freeze_processes(void) if (!pm_freezing) atomic_inc(&system_freezing_cnt); - pm_wakeup_clear(true); + pm_wakeup_clear(0); pr_info("Freezing user space processes ... "); pm_freezing = true; error = try_to_freeze_tasks(true); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 80cc1f0f502b..6fcdee7e87a5 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -136,8 +136,6 @@ static void s2idle_loop(void) break; } - pm_wakeup_clear(false); - s2idle_enter(); } -- cgit v1.2.3 From 3486bedd99196ecdfe99c0ab5b67ad3c47e8a8fa Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 4 Feb 2022 10:57:35 -0800 Subject: bpf: Use bytes instead of pages for bpf_jit_[charge|uncharge]_modmem This enables sub-page memory charge and allocation. 
Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220204185742.271030-3-song@kernel.org --- include/linux/bpf.h | 4 ++-- kernel/bpf/core.c | 17 ++++++++--------- kernel/bpf/trampoline.c | 6 +++--- 3 files changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6eb0b180d33b..366f88afd56b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -846,8 +846,8 @@ void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); -int bpf_jit_charge_modmem(u32 pages); -void bpf_jit_uncharge_modmem(u32 pages); +int bpf_jit_charge_modmem(u32 size); +void bpf_jit_uncharge_modmem(u32 size); bool bpf_prog_has_trampoline(const struct bpf_prog *prog); #else static inline int bpf_trampoline_link_prog(struct bpf_prog *prog, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 04a8d5bea552..6ca0550c4b24 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -833,12 +833,11 @@ static int __init bpf_jit_charge_init(void) } pure_initcall(bpf_jit_charge_init); -int bpf_jit_charge_modmem(u32 pages) +int bpf_jit_charge_modmem(u32 size) { - if (atomic_long_add_return(pages, &bpf_jit_current) > - (bpf_jit_limit >> PAGE_SHIFT)) { + if (atomic_long_add_return(size, &bpf_jit_current) > bpf_jit_limit) { if (!bpf_capable()) { - atomic_long_sub(pages, &bpf_jit_current); + atomic_long_sub(size, &bpf_jit_current); return -EPERM; } } @@ -846,9 +845,9 @@ int bpf_jit_charge_modmem(u32 pages) return 0; } -void bpf_jit_uncharge_modmem(u32 pages) +void bpf_jit_uncharge_modmem(u32 size) { - atomic_long_sub(pages, &bpf_jit_current); + atomic_long_sub(size, &bpf_jit_current); } void *__weak bpf_jit_alloc_exec(unsigned long size) @@ -879,11 +878,11 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, size = round_up(proglen + sizeof(*hdr) + 128, 
PAGE_SIZE); pages = size / PAGE_SIZE; - if (bpf_jit_charge_modmem(pages)) + if (bpf_jit_charge_modmem(size)) return NULL; hdr = bpf_jit_alloc_exec(size); if (!hdr) { - bpf_jit_uncharge_modmem(pages); + bpf_jit_uncharge_modmem(size); return NULL; } @@ -906,7 +905,7 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr) u32 pages = hdr->pages; bpf_jit_free_exec(hdr); - bpf_jit_uncharge_modmem(pages); + bpf_jit_uncharge_modmem(pages << PAGE_SHIFT); } /* This symbol is only overridden by archs that have different diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 4b6974a195c1..e76a488c09c3 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -213,7 +213,7 @@ static void __bpf_tramp_image_put_deferred(struct work_struct *work) im = container_of(work, struct bpf_tramp_image, work); bpf_image_ksym_del(&im->ksym); bpf_jit_free_exec(im->image); - bpf_jit_uncharge_modmem(1); + bpf_jit_uncharge_modmem(PAGE_SIZE); percpu_ref_exit(&im->pcref); kfree_rcu(im, rcu); } @@ -310,7 +310,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) if (!im) goto out; - err = bpf_jit_charge_modmem(1); + err = bpf_jit_charge_modmem(PAGE_SIZE); if (err) goto out_free_im; @@ -332,7 +332,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) out_free_image: bpf_jit_free_exec(im->image); out_uncharge: - bpf_jit_uncharge_modmem(1); + bpf_jit_uncharge_modmem(PAGE_SIZE); out_free_im: kfree(im); out: -- cgit v1.2.3 From ed2d9e1a26cca963ff5ed3b76326d70f7d8201a9 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 4 Feb 2022 10:57:36 -0800 Subject: bpf: Use size instead of pages in bpf_binary_header This is necessary to charge sub page memory for the BPF program. 
Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220204185742.271030-4-song@kernel.org --- include/linux/filter.h | 6 +++--- kernel/bpf/core.c | 11 +++++------ 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index d23e999dc032..5855eb474c62 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -548,7 +548,7 @@ struct sock_fprog_kern { #define BPF_IMAGE_ALIGNMENT 8 struct bpf_binary_header { - u32 pages; + u32 size; u8 image[] __aligned(BPF_IMAGE_ALIGNMENT); }; @@ -886,8 +886,8 @@ static inline void bpf_prog_lock_ro(struct bpf_prog *fp) static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) { set_vm_flush_reset_perms(hdr); - set_memory_ro((unsigned long)hdr, hdr->pages); - set_memory_x((unsigned long)hdr, hdr->pages); + set_memory_ro((unsigned long)hdr, hdr->size >> PAGE_SHIFT); + set_memory_x((unsigned long)hdr, hdr->size >> PAGE_SHIFT); } static inline struct bpf_binary_header * diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6ca0550c4b24..14199228a6f0 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -543,7 +543,7 @@ bpf_prog_ksym_set_addr(struct bpf_prog *prog) WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); prog->aux->ksym.start = (unsigned long) prog->bpf_func; - prog->aux->ksym.end = addr + hdr->pages * PAGE_SIZE; + prog->aux->ksym.end = addr + hdr->size; } static void @@ -866,7 +866,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_binary_header *hdr; - u32 size, hole, start, pages; + u32 size, hole, start; WARN_ON_ONCE(!is_power_of_2(alignment) || alignment > BPF_IMAGE_ALIGNMENT); @@ -876,7 +876,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, * random section of illegal instructions. 
*/ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); - pages = size / PAGE_SIZE; if (bpf_jit_charge_modmem(size)) return NULL; @@ -889,7 +888,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, /* Fill space with illegal/arch-dep instructions. */ bpf_fill_ill_insns(hdr, size); - hdr->pages = pages; + hdr->size = size; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = (get_random_int() % hole) & ~(alignment - 1); @@ -902,10 +901,10 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, void bpf_jit_binary_free(struct bpf_binary_header *hdr) { - u32 pages = hdr->pages; + u32 size = hdr->size; bpf_jit_free_exec(hdr); - bpf_jit_uncharge_modmem(pages << PAGE_SHIFT); + bpf_jit_uncharge_modmem(size); } /* This symbol is only overridden by archs that have different -- cgit v1.2.3 From d00c6473b1ee9050cc36d008c6d30bf0d3de0524 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 4 Feb 2022 10:57:37 -0800 Subject: bpf: Use prog->jited_len in bpf_prog_ksym_set_addr() Using prog->jited_len is simpler and more accurate than current estimation (header + header->size). Also, fix missing prog->jited_len with multi function program. This hasn't been a real issue before this. 
Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220204185742.271030-5-song@kernel.org --- kernel/bpf/core.c | 5 +---- kernel/bpf/verifier.c | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 14199228a6f0..e3fe53df0a71 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -537,13 +537,10 @@ long bpf_jit_limit_max __read_mostly; static void bpf_prog_ksym_set_addr(struct bpf_prog *prog) { - const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog); - unsigned long addr = (unsigned long)hdr; - WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); prog->aux->ksym.start = (unsigned long) prog->bpf_func; - prog->aux->ksym.end = addr + hdr->size; + prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len; } static void diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1ae41d0cf96c..bbef86cb4e72 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13067,6 +13067,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->jited = 1; prog->bpf_func = func[0]->bpf_func; + prog->jited_len = func[0]->jited_len; prog->aux->func = func; prog->aux->func_cnt = env->subprog_cnt; bpf_prog_jit_attempt_done(prog); -- cgit v1.2.3 From ebc1415d9b4f043cef5a1fb002ec316e32167e7a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 4 Feb 2022 10:57:39 -0800 Subject: bpf: Introduce bpf_arch_text_copy This will be used to copy JITed text to RO protected module memory. On x86, bpf_arch_text_copy is implemented with text_poke_copy. bpf_arch_text_copy returns pointer to dst on success, and ERR_PTR(errno) on errors. 
Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220204185742.271030-7-song@kernel.org --- arch/x86/net/bpf_jit_comp.c | 7 +++++++ include/linux/bpf.h | 2 ++ kernel/bpf/core.c | 5 +++++ 3 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 36f6fc3e6e69..c13d148f7396 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2412,3 +2412,10 @@ bool bpf_jit_supports_kfunc_call(void) { return true; } + +void *bpf_arch_text_copy(void *dst, void *src, size_t len) +{ + if (text_poke_copy(dst, src, len) == NULL) + return ERR_PTR(-EINVAL); + return dst; +} diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 366f88afd56b..ea0d7fd4a410 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2362,6 +2362,8 @@ enum bpf_text_poke_type { int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); +void *bpf_arch_text_copy(void *dst, void *src, size_t len); + struct btf_id_set; bool btf_id_set_contains(const struct btf_id_set *set, u32 id); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e3fe53df0a71..a5ec480f9862 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2440,6 +2440,11 @@ int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, return -ENOTSUPP; } +void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len) +{ + return ERR_PTR(-ENOTSUPP); +} + DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); -- cgit v1.2.3 From 57631054fae6dcc9c892ae6310b58bbb6f6e5048 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 4 Feb 2022 10:57:40 -0800 Subject: bpf: Introduce bpf_prog_pack allocator Most BPF programs are small, but they consume a page each. For systems with busy traffic and many BPF programs, this could add significant pressure to instruction TLB. 
High iTLB pressure usually causes slow down for the whole system, which includes visible performance degradation for production workloads. Introduce bpf_prog_pack allocator to pack multiple BPF programs in a huge page. The memory is then allocated in 64 byte chunks. Memory allocated by bpf_prog_pack allocator is RO protected after initial allocation. To write to it, the user (jit engine) need to use text poke API. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220204185742.271030-8-song@kernel.org --- kernel/bpf/core.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a5ec480f9862..7ae590897b73 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -805,6 +805,133 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, return slot; } +/* + * BPF program pack allocator. + * + * Most BPF programs are pretty small. Allocating a hole page for each + * program is sometime a waste. Many small bpf program also adds pressure + * to instruction TLB. To solve this issue, we introduce a BPF program pack + * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86) + * to host BPF programs. 
+ */ +#define BPF_PROG_PACK_SIZE HPAGE_PMD_SIZE +#define BPF_PROG_CHUNK_SHIFT 6 +#define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT) +#define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1)) +#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE) + +struct bpf_prog_pack { + struct list_head list; + void *ptr; + unsigned long bitmap[BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)]; +}; + +#define BPF_PROG_MAX_PACK_PROG_SIZE HPAGE_PMD_SIZE +#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE) + +static DEFINE_MUTEX(pack_mutex); +static LIST_HEAD(pack_list); + +static struct bpf_prog_pack *alloc_new_pack(void) +{ + struct bpf_prog_pack *pack; + + pack = kzalloc(sizeof(*pack), GFP_KERNEL); + if (!pack) + return NULL; + pack->ptr = module_alloc(BPF_PROG_PACK_SIZE); + if (!pack->ptr) { + kfree(pack); + return NULL; + } + bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE); + list_add_tail(&pack->list, &pack_list); + + set_vm_flush_reset_perms(pack->ptr); + set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); + set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); + return pack; +} + +static void *bpf_prog_pack_alloc(u32 size) +{ + unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size); + struct bpf_prog_pack *pack; + unsigned long pos; + void *ptr = NULL; + + if (size > BPF_PROG_MAX_PACK_PROG_SIZE) { + size = round_up(size, PAGE_SIZE); + ptr = module_alloc(size); + if (ptr) { + set_vm_flush_reset_perms(ptr); + set_memory_ro((unsigned long)ptr, size / PAGE_SIZE); + set_memory_x((unsigned long)ptr, size / PAGE_SIZE); + } + return ptr; + } + mutex_lock(&pack_mutex); + list_for_each_entry(pack, &pack_list, list) { + pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, + nbits, 0); + if (pos < BPF_PROG_CHUNK_COUNT) + goto found_free_area; + } + + pack = alloc_new_pack(); + if (!pack) + goto out; + + pos = 0; + +found_free_area: + bitmap_set(pack->bitmap, 
pos, nbits); + ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT); + +out: + mutex_unlock(&pack_mutex); + return ptr; +} + +static void bpf_prog_pack_free(struct bpf_binary_header *hdr) +{ + struct bpf_prog_pack *pack = NULL, *tmp; + unsigned int nbits; + unsigned long pos; + void *pack_ptr; + + if (hdr->size > BPF_PROG_MAX_PACK_PROG_SIZE) { + module_memfree(hdr); + return; + } + + pack_ptr = (void *)((unsigned long)hdr & ~(BPF_PROG_PACK_SIZE - 1)); + mutex_lock(&pack_mutex); + + list_for_each_entry(tmp, &pack_list, list) { + if (tmp->ptr == pack_ptr) { + pack = tmp; + break; + } + } + + if (WARN_ONCE(!pack, "bpf_prog_pack bug\n")) + goto out; + + nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size); + pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT; + + bitmap_clear(pack->bitmap, pos, nbits); + if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, + BPF_PROG_CHUNK_COUNT, 0) == 0) { + list_del(&pack->list); + module_memfree(pack->ptr); + kfree(pack); + } +out: + mutex_unlock(&pack_mutex); +} + static atomic_long_t bpf_jit_current; /* Can be overridden by an arch's JIT compiler if it has a custom, -- cgit v1.2.3 From 33c9805860e584b194199cab1a1e81f4e6395408 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 4 Feb 2022 10:57:41 -0800 Subject: bpf: Introduce bpf_jit_binary_pack_[alloc|finalize|free] This is the jit binary allocator built on top of bpf_prog_pack. bpf_prog_pack allocates RO memory, which cannot be used directly by the JIT engine. Therefore, a temporary rw buffer is allocated for the JIT engine. Once JIT is done, bpf_jit_binary_pack_finalize is used to copy the program to the RO memory. bpf_jit_binary_pack_alloc reserves 16 bytes of extra space for illegal instructions, which is smaller than the 128 bytes of space reserved by bpf_jit_binary_alloc. This change is necessary for bpf_jit_binary_hdr to find the correct header.
Also, flag use_bpf_prog_pack is added to differentiate a program allocated by bpf_jit_binary_pack_alloc. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220204185742.271030-9-song@kernel.org --- include/linux/bpf.h | 1 + include/linux/filter.h | 21 +++++----- kernel/bpf/core.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 120 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ea0d7fd4a410..2fc7e5c5ef41 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -953,6 +953,7 @@ struct bpf_prog_aux { bool sleepable; bool tail_call_reachable; bool xdp_has_frags; + bool use_bpf_prog_pack; struct hlist_node tramp_hlist; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; diff --git a/include/linux/filter.h b/include/linux/filter.h index 5855eb474c62..1cb1af917617 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -890,15 +890,6 @@ static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) set_memory_x((unsigned long)hdr, hdr->size >> PAGE_SHIFT); } -static inline struct bpf_binary_header * -bpf_jit_binary_hdr(const struct bpf_prog *fp) -{ - unsigned long real_start = (unsigned long)fp->bpf_func; - unsigned long addr = real_start & PAGE_MASK; - - return (void *)addr; -} - int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); static inline int sk_filter(struct sock *sk, struct sk_buff *skb) { @@ -1068,6 +1059,18 @@ void *bpf_jit_alloc_exec(unsigned long size); void bpf_jit_free_exec(void *addr); void bpf_jit_free(struct bpf_prog *fp); +struct bpf_binary_header * +bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **ro_image, + unsigned int alignment, + struct bpf_binary_header **rw_hdr, + u8 **rw_image, + bpf_jit_fill_hole_t bpf_fill_ill_insns); +int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, + struct bpf_binary_header *ro_header, 
+ struct bpf_binary_header *rw_header); +void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, + struct bpf_binary_header *rw_header); + int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, struct bpf_jit_poke_descriptor *poke); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7ae590897b73..306aa63fa58e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1031,6 +1031,109 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr) bpf_jit_uncharge_modmem(size); } +/* Allocate jit binary from bpf_prog_pack allocator. + * Since the allocated memory is RO+X, the JIT engine cannot write directly + * to the memory. To solve this problem, a RW buffer is also allocated at + * as the same time. The JIT engine should calculate offsets based on the + * RO memory address, but write JITed program to the RW buffer. Once the + * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies + * the JITed program to the RO memory. + */ +struct bpf_binary_header * +bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, + unsigned int alignment, + struct bpf_binary_header **rw_header, + u8 **rw_image, + bpf_jit_fill_hole_t bpf_fill_ill_insns) +{ + struct bpf_binary_header *ro_header; + u32 size, hole, start; + + WARN_ON_ONCE(!is_power_of_2(alignment) || + alignment > BPF_IMAGE_ALIGNMENT); + + /* add 16 bytes for a random section of illegal instructions */ + size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE); + + if (bpf_jit_charge_modmem(size)) + return NULL; + ro_header = bpf_prog_pack_alloc(size); + if (!ro_header) { + bpf_jit_uncharge_modmem(size); + return NULL; + } + + *rw_header = kvmalloc(size, GFP_KERNEL); + if (!*rw_header) { + bpf_prog_pack_free(ro_header); + bpf_jit_uncharge_modmem(size); + return NULL; + } + + /* Fill space with illegal/arch-dep instructions. 
*/ + bpf_fill_ill_insns(*rw_header, size); + (*rw_header)->size = size; + + hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)), + BPF_PROG_CHUNK_SIZE - sizeof(*ro_header)); + start = (get_random_int() % hole) & ~(alignment - 1); + + *image_ptr = &ro_header->image[start]; + *rw_image = &(*rw_header)->image[start]; + + return ro_header; +} + +/* Copy JITed text from rw_header to its final location, the ro_header. */ +int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, + struct bpf_binary_header *ro_header, + struct bpf_binary_header *rw_header) +{ + void *ptr; + + ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size); + + kvfree(rw_header); + + if (IS_ERR(ptr)) { + bpf_prog_pack_free(ro_header); + return PTR_ERR(ptr); + } + prog->aux->use_bpf_prog_pack = true; + return 0; +} + +/* bpf_jit_binary_pack_free is called in two different scenarios: + * 1) when the program is freed after; + * 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize). + * For case 2), we need to free both the RO memory and the RW buffer. + * Also, ro_header->size in 2) is not properly set yet, so rw_header->size + * is used for uncharge. + */ +void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, + struct bpf_binary_header *rw_header) +{ + u32 size = rw_header ? rw_header->size : ro_header->size; + + bpf_prog_pack_free(ro_header); + kvfree(rw_header); + bpf_jit_uncharge_modmem(size); +} + +static inline struct bpf_binary_header * +bpf_jit_binary_hdr(const struct bpf_prog *fp) +{ + unsigned long real_start = (unsigned long)fp->bpf_func; + unsigned long addr; + + if (fp->aux->use_bpf_prog_pack) + addr = real_start & BPF_PROG_CHUNK_MASK; + else + addr = real_start & PAGE_MASK; + + return (void *)addr; +} + /* This symbol is only overridden by archs that have different * requirements than the usual eBPF JITs, f.e. when they only * implement cBPF JIT, do not set images read-only, etc. 
@@ -1040,7 +1143,10 @@ void __weak bpf_jit_free(struct bpf_prog *fp) if (fp->jited) { struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); - bpf_jit_binary_free(hdr); + if (fp->aux->use_bpf_prog_pack) + bpf_jit_binary_pack_free(hdr, NULL /* rw_buffer */); + else + bpf_jit_binary_free(hdr); WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); } -- cgit v1.2.3 From 82980b1622d97017053c6792382469d7dc26a486 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 16 Feb 2021 15:04:34 +0000 Subject: rcu: Kill rnp->ofl_seq and use only rcu_state.ofl_lock for exclusion If we allow architectures to bring APs online in parallel, then we end up requiring rcu_cpu_starting() to be reentrant. But currently, the manipulation of rnp->ofl_seq is not thread-safe. However, rnp->ofl_seq is also fairly much pointless anyway since both rcu_cpu_starting() and rcu_report_dead() hold rcu_state.ofl_lock for fairly much the whole time that rnp->ofl_seq is set to an odd number to indicate that an operation is in progress. So drop rnp->ofl_seq completely, and use only rcu_state.ofl_lock. This has a couple of minor complexities: lockdep will complain when we take rcu_state.ofl_lock, and currently accepts the 'excuse' of having an odd value in rnp->ofl_seq. So switch it to an arch_spinlock_t to avoid that false positive complaint. Since we're killing rnp->ofl_seq of course that 'excuse' has to be changed too, so make it check for arch_spin_is_locked(rcu_state.ofl_lock). There's no arch_spin_lock_irqsave() so we have to manually save and restore local interrupts around the locking. At Paul's request based on Neeraj's analysis, make rcu_gp_init not just wait but *exclude* any CPU online/offline activity, which was fairly much true already by virtue of it holding rcu_state.ofl_lock. Signed-off-by: David Woodhouse Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 71 ++++++++++++++++++++++++++++--------------------------- kernel/rcu/tree.h | 4 +--- 2 files changed, 37 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4c25a6283b0..73a4c9d07b86 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -91,7 +91,7 @@ static struct rcu_state rcu_state = { .abbr = RCU_ABBR, .exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex), .exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex), - .ofl_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.ofl_lock), + .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED, }; /* Dump rcu_node combining tree at boot to verify correct setup. */ @@ -1175,7 +1175,15 @@ bool rcu_lockdep_current_cpu_online(void) preempt_disable_notrace(); rdp = this_cpu_ptr(&rcu_data); rnp = rdp->mynode; - if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || READ_ONCE(rnp->ofl_seq) & 0x1) + /* + * Strictly, we care here about the case where the current CPU is + * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask + * not being up to date. So arch_spin_is_locked() might have a + * false positive if it's held by some *other* CPU, but that's + * OK because that just means a false *negative* on the warning. + */ + if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || + arch_spin_is_locked(&rcu_state.ofl_lock)) ret = true; preempt_enable_notrace(); return ret; @@ -1739,7 +1747,6 @@ static void rcu_strict_gp_boundary(void *unused) */ static noinline_for_stack bool rcu_gp_init(void) { - unsigned long firstseq; unsigned long flags; unsigned long oldmask; unsigned long mask; @@ -1782,22 +1789,17 @@ static noinline_for_stack bool rcu_gp_init(void) * of RCU's Requirements documentation. */ WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF); + /* Exclude CPU hotplug operations. */ rcu_for_each_leaf_node(rnp) { - // Wait for CPU-hotplug operations that might have - // started before this grace period did. 
- smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values. - firstseq = READ_ONCE(rnp->ofl_seq); - if (firstseq & 0x1) - while (firstseq == READ_ONCE(rnp->ofl_seq)) - schedule_timeout_idle(1); // Can't wake unless RCU is watching. - smp_mb(); // Pair with barriers used when updating ->ofl_seq to even values. - raw_spin_lock(&rcu_state.ofl_lock); - raw_spin_lock_irq_rcu_node(rnp); + local_irq_save(flags); + arch_spin_lock(&rcu_state.ofl_lock); + raw_spin_lock_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && !rnp->wait_blkd_tasks) { /* Nothing to do on this leaf rcu_node structure. */ - raw_spin_unlock_irq_rcu_node(rnp); - raw_spin_unlock(&rcu_state.ofl_lock); + raw_spin_unlock_rcu_node(rnp); + arch_spin_unlock(&rcu_state.ofl_lock); + local_irq_restore(flags); continue; } @@ -1832,8 +1834,9 @@ static noinline_for_stack bool rcu_gp_init(void) rcu_cleanup_dead_rnp(rnp); } - raw_spin_unlock_irq_rcu_node(rnp); - raw_spin_unlock(&rcu_state.ofl_lock); + raw_spin_unlock_rcu_node(rnp); + arch_spin_unlock(&rcu_state.ofl_lock); + local_irq_restore(flags); } rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */ @@ -4287,11 +4290,10 @@ void rcu_cpu_starting(unsigned int cpu) rnp = rdp->mynode; mask = rdp->grpmask; - WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); - WARN_ON_ONCE(!(rnp->ofl_seq & 0x1)); + local_irq_save(flags); + arch_spin_lock(&rcu_state.ofl_lock); rcu_dynticks_eqs_online(); - smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). - raw_spin_lock_irqsave_rcu_node(rnp, flags); + raw_spin_lock_rcu_node(rnp); WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); newcpu = !(rnp->expmaskinitnext & mask); rnp->expmaskinitnext |= mask; @@ -4304,15 +4306,18 @@ void rcu_cpu_starting(unsigned int cpu) /* An incoming CPU should never be blocking a grace period. */ if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? 
*/ + /* rcu_report_qs_rnp() *really* wants some flags to restore */ + unsigned long flags2; + + local_irq_save(flags2); rcu_disable_urgency_upon_qs(rdp); /* Report QS -after- changing ->qsmaskinitnext! */ - rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); + rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags2); } else { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + raw_spin_unlock_rcu_node(rnp); } - smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). - WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); - WARN_ON_ONCE(rnp->ofl_seq & 0x1); + arch_spin_unlock(&rcu_state.ofl_lock); + local_irq_restore(flags); smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } @@ -4326,7 +4331,7 @@ void rcu_cpu_starting(unsigned int cpu) */ void rcu_report_dead(unsigned int cpu) { - unsigned long flags; + unsigned long flags, seq_flags; unsigned long mask; struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ @@ -4340,10 +4345,8 @@ void rcu_report_dead(unsigned int cpu) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; - WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); - WARN_ON_ONCE(!(rnp->ofl_seq & 0x1)); - smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). - raw_spin_lock(&rcu_state.ofl_lock); + local_irq_save(seq_flags); + arch_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq); rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags); @@ -4354,10 +4357,8 @@ void rcu_report_dead(unsigned int cpu) } WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - raw_spin_unlock(&rcu_state.ofl_lock); - smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). 
- WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); - WARN_ON_ONCE(rnp->ofl_seq & 0x1); + arch_spin_unlock(&rcu_state.ofl_lock); + local_irq_restore(seq_flags); rdp->cpu_started = false; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 486fc901bd08..4b4bcef8a974 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -56,8 +56,6 @@ struct rcu_node { /* Initialized from ->qsmaskinitnext at the */ /* beginning of each grace period. */ unsigned long qsmaskinitnext; - unsigned long ofl_seq; /* CPU-hotplug operation sequence count. */ - /* Online CPUs for next grace period. */ unsigned long expmask; /* CPUs or groups that need to check in */ /* to allow the current expedited GP */ /* to complete. */ @@ -355,7 +353,7 @@ struct rcu_state { const char *name; /* Name of structure. */ char abbr; /* Abbreviated name. */ - raw_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; + arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; /* Synchronize offline with */ /* GP pre-initialization. */ }; -- cgit v1.2.3 From 0cabb47af3cfaeb6007ba3868379bbd4daee64cc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Dec 2021 16:25:20 -0800 Subject: rcu: Refactor rcu_barrier() empty-list handling This commit saves a few lines by checking first for an empty callback list. If the callback list is empty, then that CPU is taken care of, regardless of its online or nocb state. Also simplify tracing accordingly and fold a few lines together. Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 9 ++++----- kernel/rcu/tree.c | 25 ++++++++----------------- 2 files changed, 12 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 670e41783edd..90b2fb0292cb 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -794,16 +794,15 @@ TRACE_EVENT_RCU(rcu_torture_read, * Tracepoint for rcu_barrier() execution. 
The string "s" describes * the rcu_barrier phase: * "Begin": rcu_barrier() started. + * "CB": An rcu_barrier_callback() invoked a callback, not the last. * "EarlyExit": rcu_barrier() piggybacked, thus early exit. * "Inc1": rcu_barrier() piggyback check counter incremented. - * "OfflineNoCBQ": rcu_barrier() found offline no-CBs CPU with callbacks. - * "OnlineQ": rcu_barrier() found online CPU with callbacks. - * "OnlineNQ": rcu_barrier() found online CPU, no callbacks. + * "Inc2": rcu_barrier() piggyback check counter incremented. * "IRQ": An rcu_barrier_callback() callback posted on remote CPU. * "IRQNQ": An rcu_barrier_callback() callback found no callbacks. - * "CB": An rcu_barrier_callback() invoked a callback, not the last. * "LastCB": An rcu_barrier_callback() invoked the last callback. - * "Inc2": rcu_barrier() piggyback check counter incremented. + * "NQ": rcu_barrier() found a CPU with no callbacks. + * "OnlineQ": rcu_barrier() found online CPU with callbacks. * The "cpu" argument is the CPU or -1 if meaningless, the "cnt" argument * is the count of remaining callbacks, and "done" is the piggybacking count. */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 73a4c9d07b86..57a7a0065750 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4030,8 +4030,7 @@ void rcu_barrier(void) /* Did someone else do our work for us? */ if (rcu_seq_done(&rcu_state.barrier_sequence, s)) { - rcu_barrier_trace(TPS("EarlyExit"), -1, - rcu_state.barrier_sequence); + rcu_barrier_trace(TPS("EarlyExit"), -1, rcu_state.barrier_sequence); smp_mb(); /* caller's subsequent code after above check. 
*/ mutex_unlock(&rcu_state.barrier_mutex); return; @@ -4059,26 +4058,18 @@ void rcu_barrier(void) */ for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); - if (cpu_is_offline(cpu) && - !rcu_rdp_is_offloaded(rdp)) + if (!rcu_segcblist_n_cbs(&rdp->cblist)) { + rcu_barrier_trace(TPS("NQ"), cpu, rcu_state.barrier_sequence); continue; - if (rcu_segcblist_n_cbs(&rdp->cblist) && cpu_online(cpu)) { - rcu_barrier_trace(TPS("OnlineQ"), cpu, - rcu_state.barrier_sequence); + } + if (cpu_online(cpu)) { + rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence); smp_call_function_single(cpu, rcu_barrier_func, (void *)cpu, 1); - } else if (rcu_segcblist_n_cbs(&rdp->cblist) && - cpu_is_offline(cpu)) { - rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, - rcu_state.barrier_sequence); + } else { + rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, rcu_state.barrier_sequence); local_irq_disable(); rcu_barrier_func((void *)cpu); local_irq_enable(); - } else if (cpu_is_offline(cpu)) { - rcu_barrier_trace(TPS("OfflineNoCBNoQ"), cpu, - rcu_state.barrier_sequence); - } else { - rcu_barrier_trace(TPS("OnlineNQ"), cpu, - rcu_state.barrier_sequence); } } cpus_read_unlock(); -- cgit v1.2.3 From a16578dd5e3a44b53ca0699ac2971679dab97484 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Dec 2021 13:15:18 -0800 Subject: rcu: Rework rcu_barrier() and callback-migration logic This commit reworks rcu_barrier() and callback-migration logic to permit allowing rcu_barrier() to run concurrently with CPU-hotplug operations. The key trick is for callback migration to check to see if an rcu_barrier() is in flight, and, if so, enqueue the ->barrier_head callback on its behalf. This commit adds synchronization with RCU's CPU-hotplug notifiers. Taken together, this will permit a later commit to remove the cpus_read_lock() and cpus_read_unlock() calls from rcu_barrier(). [ paulmck: Updated per kbuild test robot feedback. 
] [ paulmck: Updated per reviews session with Neeraj, Frederic, Uladzislau, and Boqun. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 77 +++++++++++++++++++++++++++++++++++++++++++------------ kernel/rcu/tree.h | 2 ++ 2 files changed, 63 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 57a7a0065750..004ff1c0d192 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3987,13 +3987,16 @@ static void rcu_barrier_callback(struct rcu_head *rhp) } /* - * Called with preemption disabled, and from cross-cpu IRQ context. + * If needed, entrain an rcu_barrier() callback on rdp->cblist. */ -static void rcu_barrier_func(void *cpu_in) +static void rcu_barrier_entrain(struct rcu_data *rdp) { - uintptr_t cpu = (uintptr_t)cpu_in; - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence); + unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap); + lockdep_assert_held(&rdp->barrier_lock); + if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq)) + return; rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence); rdp->barrier_head.func = rcu_barrier_callback; debug_rcu_head_queue(&rdp->barrier_head); @@ -4003,10 +4006,26 @@ static void rcu_barrier_func(void *cpu_in) atomic_inc(&rcu_state.barrier_cpu_count); } else { debug_rcu_head_unqueue(&rdp->barrier_head); - rcu_barrier_trace(TPS("IRQNQ"), -1, - rcu_state.barrier_sequence); + rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence); } rcu_nocb_unlock(rdp); + smp_store_release(&rdp->barrier_seq_snap, gseq); +} + +/* + * Called with preemption disabled, and from cross-cpu IRQ context. 
+ */ +static void rcu_barrier_handler(void *cpu_in) +{ + uintptr_t cpu = (uintptr_t)cpu_in; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + lockdep_assert_irqs_disabled(); + WARN_ON_ONCE(cpu != rdp->cpu); + WARN_ON_ONCE(cpu != smp_processor_id()); + raw_spin_lock(&rdp->barrier_lock); + rcu_barrier_entrain(rdp); + raw_spin_unlock(&rdp->barrier_lock); } /** @@ -4020,6 +4039,8 @@ static void rcu_barrier_func(void *cpu_in) void rcu_barrier(void) { uintptr_t cpu; + unsigned long flags; + unsigned long gseq; struct rcu_data *rdp; unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence); @@ -4038,6 +4059,7 @@ void rcu_barrier(void) /* Mark the start of the barrier operation. */ rcu_seq_start(&rcu_state.barrier_sequence); + gseq = rcu_state.barrier_sequence; rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence); /* @@ -4058,19 +4080,30 @@ void rcu_barrier(void) */ for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); +retry: + if (smp_load_acquire(&rdp->barrier_seq_snap) == gseq) + continue; + raw_spin_lock_irqsave(&rdp->barrier_lock, flags); if (!rcu_segcblist_n_cbs(&rdp->cblist)) { + WRITE_ONCE(rdp->barrier_seq_snap, gseq); + raw_spin_unlock_irqrestore(&rdp->barrier_lock, flags); rcu_barrier_trace(TPS("NQ"), cpu, rcu_state.barrier_sequence); continue; } - if (cpu_online(cpu)) { - rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence); - smp_call_function_single(cpu, rcu_barrier_func, (void *)cpu, 1); - } else { + if (!rcu_rdp_cpu_online(rdp)) { + rcu_barrier_entrain(rdp); + WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq); + raw_spin_unlock_irqrestore(&rdp->barrier_lock, flags); rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, rcu_state.barrier_sequence); - local_irq_disable(); - rcu_barrier_func((void *)cpu); - local_irq_enable(); + continue; } + raw_spin_unlock_irqrestore(&rdp->barrier_lock, flags); + if (smp_call_function_single(cpu, rcu_barrier_handler, (void *)cpu, 1)) { + schedule_timeout_uninterruptible(1); + goto 
retry; + } + WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq); + rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence); } cpus_read_unlock(); @@ -4087,6 +4120,12 @@ void rcu_barrier(void) /* Mark the end of the barrier operation. */ rcu_barrier_trace(TPS("Inc2"), -1, rcu_state.barrier_sequence); rcu_seq_end(&rcu_state.barrier_sequence); + gseq = rcu_state.barrier_sequence; + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(&rcu_data, cpu); + + WRITE_ONCE(rdp->barrier_seq_snap, gseq); + } /* Other rcu_barrier() invocations can now safely proceed. */ mutex_unlock(&rcu_state.barrier_mutex); @@ -4134,6 +4173,8 @@ rcu_boot_init_percpu_data(int cpu) INIT_WORK(&rdp->strict_work, strict_work_handler); WARN_ON_ONCE(rdp->dynticks_nesting != 1); WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp))); + raw_spin_lock_init(&rdp->barrier_lock); + rdp->barrier_seq_snap = rcu_state.barrier_sequence; rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED; rdp->rcu_onl_gp_seq = rcu_state.gp_seq; @@ -4284,8 +4325,10 @@ void rcu_cpu_starting(unsigned int cpu) local_irq_save(flags); arch_spin_lock(&rcu_state.ofl_lock); rcu_dynticks_eqs_online(); + raw_spin_lock(&rdp->barrier_lock); raw_spin_lock_rcu_node(rnp); WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); + raw_spin_unlock(&rdp->barrier_lock); newcpu = !(rnp->expmaskinitnext & mask); rnp->expmaskinitnext |= mask; /* Allow lockless access for expedited grace periods. */ @@ -4372,7 +4415,9 @@ void rcutree_migrate_callbacks(int cpu) rcu_segcblist_empty(&rdp->cblist)) return; /* No callbacks to migrate. */ - local_irq_save(flags); + raw_spin_lock_irqsave(&rdp->barrier_lock, flags); + WARN_ON_ONCE(rcu_rdp_cpu_online(rdp)); + rcu_barrier_entrain(rdp); my_rdp = this_cpu_ptr(&rcu_data); my_rnp = my_rdp->mynode; rcu_nocb_lock(my_rdp); /* irqs already disabled. 
*/ @@ -4382,10 +4427,10 @@ void rcutree_migrate_callbacks(int cpu) needwake = rcu_advance_cbs(my_rnp, rdp) || rcu_advance_cbs(my_rnp, my_rdp); rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); + raw_spin_unlock(&rdp->barrier_lock); /* irqs remain disabled. */ needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp); rcu_segcblist_disable(&rdp->cblist); - WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != - !rcu_segcblist_n_cbs(&my_rdp->cblist)); + WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); if (rcu_rdp_is_offloaded(my_rdp)) { raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */ __call_rcu_nocb_wake(my_rdp, true, flags); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 4b4bcef8a974..84362951ed9e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -188,6 +188,8 @@ struct rcu_data { bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */ /* 4) rcu_barrier(), OOM callbacks, and expediting. */ + raw_spinlock_t barrier_lock; /* Protects ->barrier_seq_snap. */ + unsigned long barrier_seq_snap; /* Snap of rcu_state.barrier_sequence. */ struct rcu_head barrier_head; int exp_dynticks_snap; /* Double-check need for IPI. */ -- cgit v1.2.3 From 80b3fd474c91b3ecfd845b4a0bfb58706b877ba5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Dec 2021 13:35:17 -0800 Subject: rcu: Make rcu_barrier() no longer block CPU-hotplug operations This commit removes the cpus_read_lock() and cpus_read_unlock() calls from rcu_barrier(), thus allowing CPUs to come and go during the course of rcu_barrier() execution. Posting of the ->barrier_head callbacks does synchronize with portions of RCU's CPU-hotplug notifiers, but these locks are held for short time periods on both sides. Thus, full CPU-hotplug operations could both start and finish during the execution of a given rcu_barrier() invocation. Additional synchronization is provided by a global ->barrier_lock. 
Since the ->barrier_lock is only used during rcu_barrier() execution and during onlining/offlining a CPU, the contention for this lock should be low. It might be tempting to make use of a per-CPU lock just on general principles, but straightforward attempts to do this have the problems shown below. Initial state: 3 CPUs present, CPU 0 and CPU1 do not have any callback and CPU2 has callbacks. 1. CPU0 calls rcu_barrier(). 2. CPU1 starts offlining for CPU2. CPU1 calls rcutree_migrate_callbacks(). rcu_barrier_entrain() is called from rcutree_migrate_callbacks(), with CPU2's rdp->barrier_lock. It does not entrain ->barrier_head for CPU2, as rcu_barrier() on CPU0 hasn't started the barrier sequence (by calling rcu_seq_start(&rcu_state.barrier_sequence)) yet. 3. CPU0 starts new barrier sequence. It iterates over CPU0 and CPU1, after acquiring their per-cpu ->barrier_lock and finds 0 segcblist length. It updates ->barrier_seq_snap for CPU0 and CPU1 and continues loop iteration to CPU2. for_each_possible_cpu(cpu) { raw_spin_lock_irqsave(&rdp->barrier_lock, flags); if (!rcu_segcblist_n_cbs(&rdp->cblist)) { WRITE_ONCE(rdp->barrier_seq_snap, gseq); raw_spin_unlock_irqrestore(&rdp->barrier_lock, flags); rcu_barrier_trace(TPS("NQ"), cpu, rcu_state.barrier_sequence); continue; } 4. rcutree_migrate_callbacks() completes execution on CPU1. Segcblist len for CPU2 becomes 0. 5. The loop iteration on CPU0, checks rcu_segcblist_n_cbs(&rdp->cblist) for CPU2 and completes the loop iteration after setting ->barrier_seq_snap. 6. As there isn't any ->barrier_head callback entrained; at this point, rcu_barrier() in CPU0 returns. 7. The callbacks, which migrated from CPU2 to CPU1, execute. Straightforward per-CPU locking is also subject to the following race condition noted by Boqun Feng: 1. CPU0 calls rcu_barrier(), starting a new barrier sequence by invoking rcu_seq_start() and init_completion(), but does not yet initialize rcu_state.barrier_cpu_count. 2. 
CPU1 starts offlining for CPU2, calling rcutree_migrate_callbacks(), which in turn calls rcu_barrier_entrain() holding CPU2's rdp->barrier_lock. It then entrains ->barrier_head for CPU2 and atomically increments rcu_state.barrier_cpu_count, which is unfortunately not yet initialized to the value 2. 3. The just-entrained RCU callback is invoked. It atomically decrements rcu_state.barrier_cpu_count and sees that it is now zero. This callback therefore invokes complete(). 4. CPU0 continues executing rcu_barrier(), but is not blocked by its call to wait_for_completion(). This results in rcu_barrier() returning before all pre-existing callbacks have been invoked, which is a bug. Therefore, synchronization is provided by rcu_state.barrier_lock, which is also held across the initialization sequence, especially the rcu_seq_start() and the atomic_set() that sets rcu_state.barrier_cpu_count to the value 2. In addition, this lock is held when entraining the rcu_barrier() callback, when deciding whether or not a CPU has callbacks that rcu_barrier() must wait on, when setting the ->qsmaskinitnext for incoming CPUs, and when migrating callbacks from a CPU that is going offline. Reviewed-by: Frederic Weisbecker Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E.
McKenney --- kernel/rcu/tree.c | 28 ++++++++++++++-------------- kernel/rcu/tree.h | 3 ++- 2 files changed, 16 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 004ff1c0d192..2d70b91e3fbc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -87,6 +87,7 @@ static struct rcu_state rcu_state = { .gp_state = RCU_GP_IDLE, .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT, .barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex), + .barrier_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.barrier_lock), .name = RCU_NAME, .abbr = RCU_ABBR, .exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex), @@ -3994,7 +3995,7 @@ static void rcu_barrier_entrain(struct rcu_data *rdp) unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence); unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap); - lockdep_assert_held(&rdp->barrier_lock); + lockdep_assert_held(&rcu_state.barrier_lock); if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq)) return; rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence); @@ -4023,9 +4024,9 @@ static void rcu_barrier_handler(void *cpu_in) lockdep_assert_irqs_disabled(); WARN_ON_ONCE(cpu != rdp->cpu); WARN_ON_ONCE(cpu != smp_processor_id()); - raw_spin_lock(&rdp->barrier_lock); + raw_spin_lock(&rcu_state.barrier_lock); rcu_barrier_entrain(rdp); - raw_spin_unlock(&rdp->barrier_lock); + raw_spin_unlock(&rcu_state.barrier_lock); } /** @@ -4058,6 +4059,7 @@ void rcu_barrier(void) } /* Mark the start of the barrier operation. 
*/ + raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags); rcu_seq_start(&rcu_state.barrier_sequence); gseq = rcu_state.barrier_sequence; rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence); @@ -4071,7 +4073,7 @@ void rcu_barrier(void) */ init_completion(&rcu_state.barrier_completion); atomic_set(&rcu_state.barrier_cpu_count, 2); - cpus_read_lock(); + raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags); /* * Force each CPU with callbacks to register a new callback. @@ -4083,21 +4085,21 @@ void rcu_barrier(void) retry: if (smp_load_acquire(&rdp->barrier_seq_snap) == gseq) continue; - raw_spin_lock_irqsave(&rdp->barrier_lock, flags); + raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags); if (!rcu_segcblist_n_cbs(&rdp->cblist)) { WRITE_ONCE(rdp->barrier_seq_snap, gseq); - raw_spin_unlock_irqrestore(&rdp->barrier_lock, flags); + raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags); rcu_barrier_trace(TPS("NQ"), cpu, rcu_state.barrier_sequence); continue; } if (!rcu_rdp_cpu_online(rdp)) { rcu_barrier_entrain(rdp); WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq); - raw_spin_unlock_irqrestore(&rdp->barrier_lock, flags); + raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags); rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, rcu_state.barrier_sequence); continue; } - raw_spin_unlock_irqrestore(&rdp->barrier_lock, flags); + raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags); if (smp_call_function_single(cpu, rcu_barrier_handler, (void *)cpu, 1)) { schedule_timeout_uninterruptible(1); goto retry; @@ -4105,7 +4107,6 @@ retry: WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq); rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence); } - cpus_read_unlock(); /* * Now that we have an rcu_barrier_callback() callback on each @@ -4173,7 +4174,6 @@ rcu_boot_init_percpu_data(int cpu) INIT_WORK(&rdp->strict_work, strict_work_handler); WARN_ON_ONCE(rdp->dynticks_nesting != 1); 
WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp))); - raw_spin_lock_init(&rdp->barrier_lock); rdp->barrier_seq_snap = rcu_state.barrier_sequence; rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED; @@ -4325,10 +4325,10 @@ void rcu_cpu_starting(unsigned int cpu) local_irq_save(flags); arch_spin_lock(&rcu_state.ofl_lock); rcu_dynticks_eqs_online(); - raw_spin_lock(&rdp->barrier_lock); + raw_spin_lock(&rcu_state.barrier_lock); raw_spin_lock_rcu_node(rnp); WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); - raw_spin_unlock(&rdp->barrier_lock); + raw_spin_unlock(&rcu_state.barrier_lock); newcpu = !(rnp->expmaskinitnext & mask); rnp->expmaskinitnext |= mask; /* Allow lockless access for expedited grace periods. */ @@ -4415,7 +4415,7 @@ void rcutree_migrate_callbacks(int cpu) rcu_segcblist_empty(&rdp->cblist)) return; /* No callbacks to migrate. */ - raw_spin_lock_irqsave(&rdp->barrier_lock, flags); + raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags); WARN_ON_ONCE(rcu_rdp_cpu_online(rdp)); rcu_barrier_entrain(rdp); my_rdp = this_cpu_ptr(&rcu_data); @@ -4427,7 +4427,7 @@ void rcutree_migrate_callbacks(int cpu) needwake = rcu_advance_cbs(my_rnp, rdp) || rcu_advance_cbs(my_rnp, my_rdp); rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); - raw_spin_unlock(&rdp->barrier_lock); /* irqs remain disabled. */ + raw_spin_unlock(&rcu_state.barrier_lock); /* irqs remain disabled. */ needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp); rcu_segcblist_disable(&rdp->cblist); WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 84362951ed9e..a2d7ffd634cc 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -188,7 +188,6 @@ struct rcu_data { bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */ /* 4) rcu_barrier(), OOM callbacks, and expediting. */ - raw_spinlock_t barrier_lock; /* Protects ->barrier_seq_snap. 
*/ unsigned long barrier_seq_snap; /* Snap of rcu_state.barrier_sequence. */ struct rcu_head barrier_head; int exp_dynticks_snap; /* Double-check need for IPI. */ @@ -323,6 +322,8 @@ struct rcu_state { /* rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ + raw_spinlock_t barrier_lock; /* Protects ->barrier_seq_snap. */ + struct mutex exp_mutex; /* Serialize expedited GP. */ struct mutex exp_wake_mutex; /* Serialize wakeup. */ unsigned long expedited_sequence; /* Take a ticket. */ -- cgit v1.2.3 From 5ae0f1b58b28b53f4ab3708ef9337a2665e79664 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Dec 2021 13:44:17 -0800 Subject: rcu: Create and use an rcu_rdp_cpu_online() The pattern "rdp->grpmask & rcu_rnp_online_cpus(rnp)" occurs frequently in RCU code in order to determine whether rdp->cpu is online from an RCU perspective. This commit therefore creates an rcu_rdp_cpu_online() function to replace it. [ paulmck: Apply kernel test robot unused-variable feedback. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 21 +++++++++++++-------- kernel/rcu/tree_plugin.h | 6 ++---- 2 files changed, 15 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2d70b91e3fbc..1d3507d563db 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -222,6 +222,16 @@ static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) return READ_ONCE(rnp->qsmaskinitnext); } +/* + * Is the CPU corresponding to the specified rcu_data structure online + * from RCU's perspective? This perspective is given by that structure's + * ->qsmaskinitnext field rather than by the global cpu_online_mask. + */ +static bool rcu_rdp_cpu_online(struct rcu_data *rdp) +{ + return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode)); +} + /* * Return true if an RCU grace period is in progress. 
The READ_ONCE()s * permit this function to be invoked without holding the root rcu_node @@ -1168,14 +1178,12 @@ void rcu_request_urgent_qs_task(struct task_struct *t) bool rcu_lockdep_current_cpu_online(void) { struct rcu_data *rdp; - struct rcu_node *rnp; bool ret = false; if (in_nmi() || !rcu_scheduler_fully_active) return true; preempt_disable_notrace(); rdp = this_cpu_ptr(&rcu_data); - rnp = rdp->mynode; /* * Strictly, we care here about the case where the current CPU is * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask @@ -1183,8 +1191,7 @@ bool rcu_lockdep_current_cpu_online(void) * false positive if it's held by some *other* CPU, but that's * OK because that just means a false *negative* on the warning. */ - if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || - arch_spin_is_locked(&rcu_state.ofl_lock)) + if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock)) ret = true; preempt_enable_notrace(); return ret; @@ -1269,8 +1276,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * For more detail, please refer to the "Hotplug CPU" section * of RCU's Requirements documentation. 
*/ - if (WARN_ON_ONCE(!(rdp->grpmask & rcu_rnp_online_cpus(rnp)))) { - bool onl; + if (WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp))) { struct rcu_node *rnp1; pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", @@ -1279,9 +1285,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent) pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n", __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask); - onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp)); pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n", - __func__, rdp->cpu, ".o"[onl], + __func__, rdp->cpu, ".o"[rcu_rdp_cpu_online(rdp)], (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags, (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags); return 1; /* Break things loose after complaining. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c5b45c2f68a1..d3db2168598e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -330,7 +330,7 @@ void rcu_note_context_switch(bool preempt) * then queue the task as required based on the states * of any ongoing and expedited grace periods. 
*/ - WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); + WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp)); WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); trace_rcu_preempt_task(rcu_state.name, t->pid, @@ -773,7 +773,6 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) int cpu; int i; struct list_head *lhp; - bool onl; struct rcu_data *rdp; struct rcu_node *rnp1; @@ -797,9 +796,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) pr_cont("\n"); for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { rdp = per_cpu_ptr(&rcu_data, cpu); - onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp)); pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n", - cpu, ".o"[onl], + cpu, ".o"[rcu_rdp_cpu_online(rdp)], (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags, (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags); } -- cgit v1.2.3 From 2bcd18e041fc3c2ae58f41eb5e18790c7c82c674 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Feb 2022 15:42:36 -0800 Subject: rcu-tasks: Use order_base_2() instead of ilog2() The ilog2() function can be used to generate a shift count, but it will generate the same count for a power of two as for one greater than a power of two. This results in shift counts that are larger than necessary for systems with a power-of-two number of CPUs because the CPUs are numbered from zero, so that the maximum CPU number is one less than that power of two. This commit therefore substitutes order_base_2(), which appears to have been designed for exactly this use case. Suggested-by: Mark Rutland Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tasks.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d64f0b1d8cd3..670c75cbcb98 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -123,7 +123,7 @@ static struct rcu_tasks rt_name = \ .call_func = call, \ .rtpcpu = &rt_name ## __percpu, \ .name = n, \ - .percpu_enqueue_shift = ilog2(CONFIG_NR_CPUS) + 1, \ + .percpu_enqueue_shift = order_base_2(CONFIG_NR_CPUS), \ .percpu_enqueue_lim = 1, \ .percpu_dequeue_lim = 1, \ .barrier_q_mutex = __MUTEX_INITIALIZER(rt_name.barrier_q_mutex), \ @@ -302,7 +302,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, if (unlikely(needadjust)) { raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rtp->percpu_enqueue_lim != nr_cpu_ids) { - WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids) + 1); + WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids)); WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids); smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids); pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name); @@ -417,7 +417,7 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) { raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rtp->percpu_enqueue_lim > 1) { - WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids) + 1); + WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids)); smp_store_release(&rtp->percpu_enqueue_lim, 1); rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu(); pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name); -- cgit v1.2.3 From 00a8b4b54cd69d9f7ba1730d3b266469a778b1d7 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 2 Feb 2022 16:34:40 -0800 Subject: rcu-tasks: Set ->percpu_enqueue_shift to zero upon contention Currently, call_rcu_tasks_generic() sets ->percpu_enqueue_shift to order_base_2(nr_cpu_ids) upon encountering sufficient contention. This does not shift to use of non-CPU-0 callback queues as intended, but rather continues using only CPU 0's queue. Although this does provide some decrease in contention due to spreading work over multiple locks, it is not the dramatic decrease that was intended. This commit therefore makes call_rcu_tasks_generic() set ->percpu_enqueue_shift to 0. Reported-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 670c75cbcb98..ac17348187e4 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -302,7 +302,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, if (unlikely(needadjust)) { raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rtp->percpu_enqueue_lim != nr_cpu_ids) { - WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids)); + WRITE_ONCE(rtp->percpu_enqueue_shift, 0); WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids); smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids); pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name); -- cgit v1.2.3 From 3203ce39ac0b2a57a84382ec184c7d4a0bede175 Mon Sep 17 00:00:00 2001 From: JaeSang Yoo Date: Wed, 9 Feb 2022 04:54:22 +0900 Subject: tracing: Fix tp_printk option related with tp_printk_stop_on_boot The kernel parameter "tp_printk_stop_on_boot" starts with "tp_printk" which is the same as another kernel parameter "tp_printk". If "tp_printk" setup is called before the "tp_printk_stop_on_boot", it will override the latter and keep it from being set. 
This is similar to other kernel parameter issues, such as: Commit 745a600cf1a6 ("um: console: Ignore console= option") or init/do_mounts.c:45 (setup function of "ro" kernel param) Fix it by checking for a "_" right after the "tp_printk" and if that exists do not process the parameter. Link: https://lkml.kernel.org/r/20220208195421.969326-1-jsyoo5b@gmail.com Signed-off-by: JaeSang Yoo [ Fixed up change log and added space after if condition ] Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c860f582b078..7c2578efde26 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -252,6 +252,10 @@ __setup("trace_clock=", set_trace_boot_clock); static int __init set_tracepoint_printk(char *str) { + /* Ignore the "tp_printk_stop_on_boot" param */ + if (*str == '_') + return 0; + if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) tracepoint_printk = 1; return 1; -- cgit v1.2.3 From c1b13a9451ab9d46eefb80a2cc4b8b3206460829 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 8 Feb 2022 14:05:09 -0800 Subject: bpf: Fix bpf_prog_pack build HPAGE_PMD_SIZE Fix build with CONFIG_TRANSPARENT_HUGEPAGE=n with BPF_PROG_PACK_SIZE as PAGE_SIZE. Fixes: 57631054fae6 ("bpf: Introduce bpf_prog_pack allocator") Reported-by: kernel test robot Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220208220509.4180389-3-song@kernel.org --- kernel/bpf/core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 306aa63fa58e..42d96549a804 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -814,7 +814,11 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86) * to host BPF programs. 
*/ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE #define BPF_PROG_PACK_SIZE HPAGE_PMD_SIZE +#else +#define BPF_PROG_PACK_SIZE PAGE_SIZE +#endif #define BPF_PROG_CHUNK_SHIFT 6 #define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT) #define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1)) @@ -826,7 +830,7 @@ struct bpf_prog_pack { unsigned long bitmap[BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)]; }; -#define BPF_PROG_MAX_PACK_PROG_SIZE HPAGE_PMD_SIZE +#define BPF_PROG_MAX_PACK_PROG_SIZE BPF_PROG_PACK_SIZE #define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE) static DEFINE_MUTEX(pack_mutex); -- cgit v1.2.3 From 1f8863bfb5ca500ea1c7669b16b1931ba27fce20 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 1 Feb 2022 12:02:59 +0000 Subject: genirq: Allow the PM device to originate from irq domain As a preparation to moving the reference to the device used for runtime power management, add a new 'dev' field to the irqdomain structure for that exact purpose. The irq_chip_pm_{get,put}() helpers are made aware of the dual location via a new private helper. No functional change intended. Signed-off-by: Marc Zyngier Reviewed-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Tested-by: Tony Lindgren Acked-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20220201120310.878267-2-maz@kernel.org --- include/linux/irqdomain.h | 10 ++++++++++ kernel/irq/chip.c | 23 ++++++++++++++++++----- 2 files changed, 28 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index d476405802e9..be25a33293e5 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -151,6 +151,8 @@ struct irq_domain_chip_generic; * @gc: Pointer to a list of generic chips. There is a helper function for * setting up one or more generic chips for interrupt controllers * drivers using the generic chip library which uses this pointer. 
+ * @dev: Pointer to a device that the domain represent, and that will be + * used for power management purposes. * @parent: Pointer to parent irq_domain to support hierarchy irq_domains * * Revmap data, used internally by irq_domain @@ -171,6 +173,7 @@ struct irq_domain { struct fwnode_handle *fwnode; enum irq_domain_bus_token bus_token; struct irq_domain_chip_generic *gc; + struct device *dev; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY struct irq_domain *parent; #endif @@ -226,6 +229,13 @@ static inline struct device_node *irq_domain_get_of_node(struct irq_domain *d) return to_of_node(d->fwnode); } +static inline void irq_domain_set_pm_device(struct irq_domain *d, + struct device *dev) +{ + if (d) + d->dev = dev; +} + #ifdef CONFIG_IRQ_DOMAIN struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id, const char *name, phys_addr_t *pa); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c09324663088..a2a12cdbe872 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1558,6 +1558,17 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) return 0; } +static struct device *irq_get_parent_device(struct irq_data *data) +{ + if (data->chip->parent_device) + return data->chip->parent_device; + + if (data->domain) + return data->domain->dev; + + return NULL; +} + /** * irq_chip_pm_get - Enable power for an IRQ chip * @data: Pointer to interrupt specific data @@ -1567,12 +1578,13 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) */ int irq_chip_pm_get(struct irq_data *data) { + struct device *dev = irq_get_parent_device(data); int retval; - if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) { - retval = pm_runtime_get_sync(data->chip->parent_device); + if (IS_ENABLED(CONFIG_PM) && dev) { + retval = pm_runtime_get_sync(dev); if (retval < 0) { - pm_runtime_put_noidle(data->chip->parent_device); + pm_runtime_put_noidle(dev); return retval; } } @@ -1590,10 +1602,11 @@ int irq_chip_pm_get(struct irq_data 
*data) */ int irq_chip_pm_put(struct irq_data *data) { + struct device *dev = irq_get_parent_device(data); int retval = 0; - if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) - retval = pm_runtime_put(data->chip->parent_device); + if (IS_ENABLED(CONFIG_PM) && dev) + retval = pm_runtime_put(dev); return (retval < 0) ? retval : 0; } -- cgit v1.2.3 From 7a82f89de92aac5a244d3735b2bd162c1147620c Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 9 Feb 2022 14:49:38 -0500 Subject: audit: don't deref the syscall args when checking the openat2 open_how::flags As reported by Jeff, dereferencing the openat2 syscall argument in audit_match_perm() to obtain the open_how::flags can result in an oops/page-fault. This patch fixes this by using the open_how struct that we store in the audit_context with audit_openat2_how(). Independent of this patch, Richard Guy Briggs posted a similar patch to the audit mailing list roughly 40 minutes after this patch was posted. Cc: stable@vger.kernel.org Fixes: 1c30e3af8a79 ("audit: add support for the openat2 syscall") Reported-by: Jeff Mahoney Signed-off-by: Paul Moore --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fce5d43a933f..a83928cbdcb7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -185,7 +185,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask) case AUDITSC_EXECVE: return mask & AUDIT_PERM_EXEC; case AUDITSC_OPENAT2: - return mask & ACC_MODE((u32)((struct open_how *)ctx->argv[2])->flags); + return mask & ACC_MODE((u32)ctx->openat2.flags); default: return 0; } -- cgit v1.2.3 From beb0622138cd2848dec06b0651a988c39d099574 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 1 Feb 2022 12:03:10 +0000 Subject: genirq: Kill irq_chip::parent_device Now that no one is using irq_chip::parent_device in the tree, get rid of it.
Signed-off-by: Marc Zyngier Acked-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20220201120310.878267-13-maz@kernel.org --- include/linux/irq.h | 2 -- kernel/irq/chip.c | 3 --- 2 files changed, 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 848e1e12c5c6..2cb2e2ac2703 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -456,7 +456,6 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) /** * struct irq_chip - hardware interrupt chip descriptor * - * @parent_device: pointer to parent device for irqchip * @name: name for /proc/interrupts * @irq_startup: start up the interrupt (defaults to ->enable if NULL) * @irq_shutdown: shut down the interrupt (defaults to ->disable if NULL) @@ -503,7 +502,6 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) * @flags: chip specific flags */ struct irq_chip { - struct device *parent_device; const char *name; unsigned int (*irq_startup)(struct irq_data *data); void (*irq_shutdown)(struct irq_data *data); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a2a12cdbe872..24b6f2b40e5e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1560,9 +1560,6 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) static struct device *irq_get_parent_device(struct irq_data *data) { - if (data->chip->parent_device) - return data->chip->parent_device; - if (data->domain) return data->domain->dev; -- cgit v1.2.3 From b1d18a7574d0df5eb4117c14742baf8bc2b9bb74 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 9 Feb 2022 15:19:57 -0800 Subject: bpf: Extend sys_bpf commands for bpf_syscall programs. bpf_syscall programs can be used directly by the kernel modules to load programs and create maps via kernel skeleton. . Export bpf_sys_bpf syscall wrapper to be used in kernel skeleton. . Export bpf_map_get to be used in kernel skeleton. . Allow prog_run cmd for bpf_syscall programs with recursion check. .
Enable link_create and raw_tp_open cmds. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220209232001.27490-2-alexei.starovoitov@gmail.com --- kernel/bpf/syscall.c | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 72ce1edde950..49f88b30662a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -985,6 +985,7 @@ struct bpf_map *bpf_map_get(u32 ufd) return map; } +EXPORT_SYMBOL(bpf_map_get); struct bpf_map *bpf_map_get_with_uref(u32 ufd) { @@ -4756,23 +4757,52 @@ static bool syscall_prog_is_valid_access(int off, int size, return true; } -BPF_CALL_3(bpf_sys_bpf, int, cmd, void *, attr, u32, attr_size) +BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) { + struct bpf_prog * __maybe_unused prog; + switch (cmd) { case BPF_MAP_CREATE: case BPF_MAP_UPDATE_ELEM: case BPF_MAP_FREEZE: case BPF_PROG_LOAD: case BPF_BTF_LOAD: + case BPF_LINK_CREATE: + case BPF_RAW_TRACEPOINT_OPEN: break; - /* case BPF_PROG_TEST_RUN: - * is not part of this list to prevent recursive test_run - */ +#ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */ + case BPF_PROG_TEST_RUN: + if (attr->test.data_in || attr->test.data_out || + attr->test.ctx_out || attr->test.duration || + attr->test.repeat || attr->test.flags) + return -EINVAL; + + prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (attr->test.ctx_size_in < prog->aux->max_ctx_offset || + attr->test.ctx_size_in > U16_MAX) { + bpf_prog_put(prog); + return -EINVAL; + } + + if (!__bpf_prog_enter_sleepable(prog)) { + /* recursion detected */ + bpf_prog_put(prog); + return -EBUSY; + } + attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in); + __bpf_prog_exit_sleepable(prog, 0 
/* bpf_prog_run does runtime stats */); + bpf_prog_put(prog); + return 0; +#endif default: return -EINVAL; } return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size); } +EXPORT_SYMBOL(bpf_sys_bpf); static const struct bpf_func_proto bpf_sys_bpf_proto = { .func = bpf_sys_bpf, -- cgit v1.2.3 From d7beb3d6aba39480590b30c502fbaa2cc1e5e30b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 9 Feb 2022 15:20:00 -0800 Subject: bpf: Update iterators.lskel.h. Light skeleton and skel_internal.h have changed. Update iterators.lskel.h. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220209232001.27490-5-alexei.starovoitov@gmail.com --- kernel/bpf/preload/iterators/iterators.lskel.h | 141 ++++++++++++------------- 1 file changed, 69 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/preload/iterators/iterators.lskel.h b/kernel/bpf/preload/iterators/iterators.lskel.h index d90562d672d2..70f236a82fe1 100644 --- a/kernel/bpf/preload/iterators/iterators.lskel.h +++ b/kernel/bpf/preload/iterators/iterators.lskel.h @@ -3,8 +3,6 @@ #ifndef __ITERATORS_BPF_SKEL_H__ #define __ITERATORS_BPF_SKEL_H__ -#include -#include #include struct iterators_bpf { @@ -70,31 +68,28 @@ iterators_bpf__destroy(struct iterators_bpf *skel) iterators_bpf__detach(skel); skel_closenz(skel->progs.dump_bpf_map.prog_fd); skel_closenz(skel->progs.dump_bpf_prog.prog_fd); - munmap(skel->rodata, 4096); + skel_free_map_data(skel->rodata, skel->maps.rodata.initial_value, 4096); skel_closenz(skel->maps.rodata.map_fd); - free(skel); + skel_free(skel); } static inline struct iterators_bpf * iterators_bpf__open(void) { struct iterators_bpf *skel; - skel = calloc(sizeof(*skel), 1); + skel = skel_alloc(sizeof(*skel)); if (!skel) goto cleanup; skel->ctx.sz = (void *)&skel->links - (void *)skel; - skel->rodata = - mmap(NULL, 4096, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, -1, 0); - 
if (skel->rodata == (void *) -1) - goto cleanup; - memcpy(skel->rodata, (void *)"\ + skel->rodata = skel_prep_map_data((void *)"\ \x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ \x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\ \x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\ \x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\ -\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0", 98); - skel->maps.rodata.initial_value = (__u64)(long)skel->rodata; +\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0", 4096, 98); + if (!skel->rodata) + goto cleanup; + skel->maps.rodata.initial_value = (__u64) (long) skel->rodata; return skel; cleanup: iterators_bpf__destroy(skel); @@ -326,7 +321,7 @@ iterators_bpf__load(struct iterators_bpf *skel) \0\0\x01\0\0\0\0\0\0\0\x13\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\ \0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\ \0\0\0\0"; - opts.insns_sz = 2184; + opts.insns_sz = 2216; opts.insns = (void *)"\ \xbf\x16\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\x78\xff\xff\xff\xb7\x02\0\ \0\x88\0\0\0\xb7\x03\0\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x05\0\x14\0\0\0\0\0\x61\ @@ -343,70 +338,72 @@ iterators_bpf__load(struct iterators_bpf *skel) \0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\xb7\x03\0\0\x1c\0\0\0\x85\0\0\0\ \xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\xd4\xff\0\0\0\0\x63\x7a\x78\xff\0\0\0\0\ \x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x0e\0\0\x63\x01\0\0\0\ -\0\0\0\x61\x60\x20\0\0\0\0\0\x15\0\x03\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\x61\x60\x1c\0\0\0\0\0\x15\0\x03\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ \x5c\x0e\0\0\x63\x01\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\ \0\x50\x0e\0\0\xb7\x03\0\0\x48\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\ 
\xc5\x07\xc3\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x71\0\0\0\0\0\ -\0\x79\x63\x18\0\0\0\0\0\x15\x03\x04\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x98\ -\x0e\0\0\xb7\x02\0\0\x62\0\0\0\x85\0\0\0\x94\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\x63\x01\0\ -\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ -\x10\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x0e\0\0\x18\ -\x61\0\0\0\0\0\0\0\0\0\0\x18\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\0\0\ -\x18\x62\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\xa6\0\ -\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\xa3\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\x63\x01\0\0\ -\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\xb7\x03\ -\0\0\x04\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x96\xff\0\0\0\0\ -\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x78\x11\0\ -\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\x18\x61\0\0\0\0\ -\0\0\0\0\0\0\x70\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x40\ -\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\ -\0\0\0\0\0\0\0\0\0\x48\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc8\x11\0\0\x7b\x01\ -\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\ -\0\xe8\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x18\x61\ -\0\0\0\0\0\0\0\0\0\0\xe0\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\ -\x61\0\0\0\0\0\0\0\0\0\0\x80\x11\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\ -\x18\x61\0\0\0\0\0\0\0\0\0\0\x84\x11\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\ -\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\xa0\x78\ -\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x11\0\0\x63\x01\0\0\0\0\0\0\x18\ 
-\x61\0\0\0\0\0\0\0\0\0\0\xf8\x11\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\x0c\0\0\ -\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x60\xff\ -\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x68\x11\0\0\x63\x70\x6c\0\0\0\0\0\x77\x07\ -\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\0\0\0\0\ -\0\0\0\0\x68\x11\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\ -\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd8\x11\0\0\x61\x01\0\0\0\0\0\0\xd5\x01\x02\0\ -\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x4e\xff\0\0\0\0\x63\ -\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x18\x61\0\0\0\0\0\ -\0\0\0\0\0\x10\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x18\x12\ -\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\ -\0\0\0\0\0\0\0\x28\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x50\x17\0\0\x7b\x01\0\0\ -\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ -\x60\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x15\0\0\x18\ -\x61\0\0\0\0\0\0\0\0\0\0\x80\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x78\x17\0\0\x7b\x01\0\0\0\0\0\0\x61\ -\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x17\0\0\x63\x01\0\0\0\0\0\0\ -\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x1c\x17\0\0\x63\x01\0\0\0\0\ -\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\x7b\x01\0\0\ -\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x48\x17\0\0\x63\ -\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x90\x17\0\0\xb7\x02\0\0\x12\0\0\0\ -\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\ -\0\0\xc5\x07\x17\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\x63\x70\x6c\ -\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\0\0\0\ -\x18\x62\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\ 
-\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x70\x17\0\0\x61\x01\0\0\0\0\ -\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x05\ -\xff\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\xd5\x01\x02\0\0\0\ -\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\0\0\0\0\x63\x06\ -\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\x18\x61\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\xb7\0\0\0\0\0\0\0\ -\x95\0\0\0\0\0\0\0"; +\0\x79\x63\x20\0\0\0\0\0\x15\x03\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x98\ +\x0e\0\0\xb7\x02\0\0\x62\0\0\0\x61\x60\x04\0\0\0\0\0\x45\0\x02\0\x01\0\0\0\x85\ +\0\0\0\x94\0\0\0\x05\0\x01\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x18\x62\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\x63\ +\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\ +\0\0\x10\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x0e\0\0\ +\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\ +\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\ +\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x9f\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\x63\ +\x01\0\0\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\ +\xb7\x03\0\0\x04\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x92\xff\ +\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ +\x78\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\x18\ +\x61\0\0\0\0\0\0\0\0\0\0\x70\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\ +\0\0\0\x40\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x11\0\0\x7b\x01\0\0\0\0\0\0\ +\x18\x60\0\0\0\0\0\0\0\0\0\0\x48\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc8\x11\0\ +\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x10\0\0\x18\x61\0\0\0\0\ 
+\0\0\0\0\0\0\xe8\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\ +\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x11\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\ +\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x84\x11\0\0\x63\x01\0\0\0\0\0\0\x79\x60\ +\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\ +\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x11\0\0\x63\x01\0\0\0\0\0\ +\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xf8\x11\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\ +\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\ +\x5c\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x68\x11\0\0\x63\x70\x6c\0\0\0\0\0\ +\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\ +\0\0\0\0\0\0\0\0\x68\x11\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\ +\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd8\x11\0\0\x61\x01\0\0\0\0\0\0\xd5\ +\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x4a\xff\0\0\ +\0\0\x63\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x18\x61\0\ +\0\0\0\0\0\0\0\0\0\x10\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\ +\x18\x12\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\ +\x60\0\0\0\0\0\0\0\0\0\0\x28\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x50\x17\0\0\ +\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x14\0\0\x18\x61\0\0\0\0\0\ +\0\0\0\0\0\x60\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x15\ +\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x78\x17\0\0\x7b\x01\0\0\0\0\ +\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x17\0\0\x63\x01\0\0\ +\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x1c\x17\0\0\x63\x01\ +\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\x7b\ 
+\x01\0\0\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x48\x17\0\ +\0\x63\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x90\x17\0\0\xb7\x02\0\0\x12\ +\0\0\0\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\ +\0\0\0\0\0\xc5\x07\x13\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\x63\ +\x70\x6c\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\ +\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\ +\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x70\x17\0\0\x61\x01\ +\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\ +\x07\x01\xff\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\xd5\x01\ +\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\0\0\0\0\ +\x63\x06\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\x18\x61\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\xb7\0\0\0\ +\0\0\0\0\x95\0\0\0\0\0\0\0"; err = bpf_load_and_run(&opts); if (err < 0) return err; - skel->rodata = - mmap(skel->rodata, 4096, PROT_READ, MAP_SHARED | MAP_FIXED, - skel->maps.rodata.map_fd, 0); + skel->rodata = skel_finalize_map_data(&skel->maps.rodata.initial_value, + 4096, PROT_READ, skel->maps.rodata.map_fd); + if (!skel->rodata) + return -ENOMEM; return 0; } -- cgit v1.2.3 From cb80ddc67152e72f28ff6ea8517acdf875d7381d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 9 Feb 2022 15:20:01 -0800 Subject: bpf: Convert bpf_preload.ko to use light skeleton. The main change is a move of the single line #include "iterators.lskel.h" from iterators/iterators.c to bpf_preload_kern.c. Which means that generated light skeleton can be used from user space or user mode driver like iterators.c or from the kernel module or the kernel itself. The direct use of light skeleton from the kernel module simplifies the code, since UMD is no longer necessary. The libbpf.a required user space and UMD. 
The CO-RE in the kernel and generated "loader bpf program" used by the light skeleton are capable of performing complex loading operations traditionally provided by libbpf. In addition, the UMD approach was launching a UMD process every time bpffs had to be mounted. With the light skeleton in the kernel, the bpf_preload kernel module loads bpf iterators once and pins them multiple times into different bpffs mounts. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220209232001.27490-6-alexei.starovoitov@gmail.com --- kernel/bpf/inode.c | 39 ++----- kernel/bpf/preload/Kconfig | 7 +- kernel/bpf/preload/Makefile | 14 +-- kernel/bpf/preload/bpf_preload.h | 8 +- kernel/bpf/preload/bpf_preload_kern.c | 119 +++++++++------------- kernel/bpf/preload/bpf_preload_umd_blob.S | 7 -- kernel/bpf/preload/iterators/bpf_preload_common.h | 13 --- kernel/bpf/preload/iterators/iterators.c | 108 -------------------- kernel/bpf/syscall.c | 2 + 9 files changed, 70 insertions(+), 247 deletions(-) delete mode 100644 kernel/bpf/preload/bpf_preload_umd_blob.S delete mode 100644 kernel/bpf/preload/iterators/bpf_preload_common.h delete mode 100644 kernel/bpf/preload/iterators/iterators.c (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5a8d9f7467bf..4f841e16779e 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -710,11 +710,10 @@ static DEFINE_MUTEX(bpf_preload_lock); static int populate_bpffs(struct dentry *parent) { struct bpf_preload_info objs[BPF_PRELOAD_LINKS] = {}; - struct bpf_link *links[BPF_PRELOAD_LINKS] = {}; int err = 0, i; /* grab the mutex to make sure the kernel interactions with bpf_preload - * UMD are serialized + * are serialized */ mutex_lock(&bpf_preload_lock); @@ -722,40 +721,22 @@ static int populate_bpffs(struct dentry *parent) if (!bpf_preload_mod_get()) goto out; - if (!bpf_preload_ops->info.tgid) { - /* preload() will start UMD that will load
BPF iterator programs */ - err = bpf_preload_ops->preload(objs); - if (err) + err = bpf_preload_ops->preload(objs); + if (err) + goto out_put; + for (i = 0; i < BPF_PRELOAD_LINKS; i++) { + bpf_link_inc(objs[i].link); + err = bpf_iter_link_pin_kernel(parent, + objs[i].link_name, objs[i].link); + if (err) { + bpf_link_put(objs[i].link); goto out_put; - for (i = 0; i < BPF_PRELOAD_LINKS; i++) { - links[i] = bpf_link_by_id(objs[i].link_id); - if (IS_ERR(links[i])) { - err = PTR_ERR(links[i]); - goto out_put; - } } - for (i = 0; i < BPF_PRELOAD_LINKS; i++) { - err = bpf_iter_link_pin_kernel(parent, - objs[i].link_name, links[i]); - if (err) - goto out_put; - /* do not unlink successfully pinned links even - * if later link fails to pin - */ - links[i] = NULL; - } - /* finish() will tell UMD process to exit */ - err = bpf_preload_ops->finish(); - if (err) - goto out_put; } out_put: bpf_preload_mod_put(); out: mutex_unlock(&bpf_preload_lock); - for (i = 0; i < BPF_PRELOAD_LINKS && err; i++) - if (!IS_ERR_OR_NULL(links[i])) - bpf_link_put(links[i]); return err; } diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig index 26bced262473..c9d45c9d6918 100644 --- a/kernel/bpf/preload/Kconfig +++ b/kernel/bpf/preload/Kconfig @@ -18,10 +18,9 @@ menuconfig BPF_PRELOAD if BPF_PRELOAD config BPF_PRELOAD_UMD - tristate "bpf_preload kernel module with user mode driver" - depends on CC_CAN_LINK - depends on m || CC_CAN_LINK_STATIC + tristate "bpf_preload kernel module" default m help - This builds bpf_preload kernel module with embedded user mode driver. + This builds bpf_preload kernel module with embedded BPF programs for + introspection in bpffs. endif diff --git a/kernel/bpf/preload/Makefile b/kernel/bpf/preload/Makefile index baf47d9c7557..167534e3b0b4 100644 --- a/kernel/bpf/preload/Makefile +++ b/kernel/bpf/preload/Makefile @@ -3,16 +3,6 @@ LIBBPF_SRCS = $(srctree)/tools/lib/bpf/ LIBBPF_INCLUDE = $(LIBBPF_SRCS)/.. 
-userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \ - -I $(LIBBPF_INCLUDE) -Wno-unused-result - -userprogs := bpf_preload_umd - -bpf_preload_umd-objs := iterators/iterators.o - -$(obj)/bpf_preload_umd: - -$(obj)/bpf_preload_umd_blob.o: $(obj)/bpf_preload_umd - obj-$(CONFIG_BPF_PRELOAD_UMD) += bpf_preload.o -bpf_preload-objs += bpf_preload_kern.o bpf_preload_umd_blob.o +CFLAGS_bpf_preload_kern.o += -I $(LIBBPF_INCLUDE) +bpf_preload-objs += bpf_preload_kern.o diff --git a/kernel/bpf/preload/bpf_preload.h b/kernel/bpf/preload/bpf_preload.h index 2f9932276f2e..f065c91213a0 100644 --- a/kernel/bpf/preload/bpf_preload.h +++ b/kernel/bpf/preload/bpf_preload.h @@ -2,13 +2,13 @@ #ifndef _BPF_PRELOAD_H #define _BPF_PRELOAD_H -#include -#include "iterators/bpf_preload_common.h" +struct bpf_preload_info { + char link_name[16]; + struct bpf_link *link; +}; struct bpf_preload_ops { - struct umd_info info; int (*preload)(struct bpf_preload_info *); - int (*finish)(void); struct module *owner; }; extern struct bpf_preload_ops *bpf_preload_ops; diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c index 53736e52c1df..30207c048d36 100644 --- a/kernel/bpf/preload/bpf_preload_kern.c +++ b/kernel/bpf/preload/bpf_preload_kern.c @@ -2,101 +2,80 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include -#include -#include -#include #include "bpf_preload.h" +#include "iterators/iterators.lskel.h" -extern char bpf_preload_umd_start; -extern char bpf_preload_umd_end; +static struct bpf_link *maps_link, *progs_link; +static struct iterators_bpf *skel; -static int preload(struct bpf_preload_info *obj); -static int finish(void); +static void free_links_and_skel(void) +{ + if (!IS_ERR_OR_NULL(maps_link)) + bpf_link_put(maps_link); + if (!IS_ERR_OR_NULL(progs_link)) + bpf_link_put(progs_link); + iterators_bpf__destroy(skel); +} + +static int preload(struct bpf_preload_info *obj) +{ + strlcpy(obj[0].link_name, "maps.debug", 
sizeof(obj[0].link_name)); + obj[0].link = maps_link; + strlcpy(obj[1].link_name, "progs.debug", sizeof(obj[1].link_name)); + obj[1].link = progs_link; + return 0; +} -static struct bpf_preload_ops umd_ops = { - .info.driver_name = "bpf_preload", +static struct bpf_preload_ops ops = { .preload = preload, - .finish = finish, .owner = THIS_MODULE, }; -static int preload(struct bpf_preload_info *obj) +static int load_skel(void) { - int magic = BPF_PRELOAD_START; - loff_t pos = 0; - int i, err; - ssize_t n; + int err; - err = fork_usermode_driver(&umd_ops.info); + skel = iterators_bpf__open(); + if (!skel) + return -ENOMEM; + err = iterators_bpf__load(skel); if (err) - return err; - - /* send the start magic to let UMD proceed with loading BPF progs */ - n = kernel_write(umd_ops.info.pipe_to_umh, - &magic, sizeof(magic), &pos); - if (n != sizeof(magic)) - return -EPIPE; - - /* receive bpf_link IDs and names from UMD */ - pos = 0; - for (i = 0; i < BPF_PRELOAD_LINKS; i++) { - n = kernel_read(umd_ops.info.pipe_from_umh, - &obj[i], sizeof(*obj), &pos); - if (n != sizeof(*obj)) - return -EPIPE; + goto out; + err = iterators_bpf__attach(skel); + if (err) + goto out; + maps_link = bpf_link_get_from_fd(skel->links.dump_bpf_map_fd); + if (IS_ERR(maps_link)) { + err = PTR_ERR(maps_link); + goto out; } - return 0; -} - -static int finish(void) -{ - int magic = BPF_PRELOAD_END; - struct pid *tgid; - loff_t pos = 0; - ssize_t n; - - /* send the last magic to UMD. It will do a normal exit. 
*/ - n = kernel_write(umd_ops.info.pipe_to_umh, - &magic, sizeof(magic), &pos); - if (n != sizeof(magic)) - return -EPIPE; - - tgid = umd_ops.info.tgid; - if (tgid) { - wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); - umd_cleanup_helper(&umd_ops.info); + progs_link = bpf_link_get_from_fd(skel->links.dump_bpf_prog_fd); + if (IS_ERR(progs_link)) { + err = PTR_ERR(progs_link); + goto out; } return 0; +out: + free_links_and_skel(); + return err; } -static int __init load_umd(void) +static int __init load(void) { int err; - err = umd_load_blob(&umd_ops.info, &bpf_preload_umd_start, - &bpf_preload_umd_end - &bpf_preload_umd_start); + err = load_skel(); if (err) return err; - bpf_preload_ops = &umd_ops; + bpf_preload_ops = &ops; return err; } -static void __exit fini_umd(void) +static void __exit fini(void) { - struct pid *tgid; - bpf_preload_ops = NULL; - - /* kill UMD in case it's still there due to earlier error */ - tgid = umd_ops.info.tgid; - if (tgid) { - kill_pid(tgid, SIGKILL, 1); - - wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); - umd_cleanup_helper(&umd_ops.info); - } - umd_unload_blob(&umd_ops.info); + free_links_and_skel(); } -late_initcall(load_umd); -module_exit(fini_umd); +late_initcall(load); +module_exit(fini); MODULE_LICENSE("GPL"); diff --git a/kernel/bpf/preload/bpf_preload_umd_blob.S b/kernel/bpf/preload/bpf_preload_umd_blob.S deleted file mode 100644 index f1f40223b5c3..000000000000 --- a/kernel/bpf/preload/bpf_preload_umd_blob.S +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - .section .init.rodata, "a" - .global bpf_preload_umd_start -bpf_preload_umd_start: - .incbin "kernel/bpf/preload/bpf_preload_umd" - .global bpf_preload_umd_end -bpf_preload_umd_end: diff --git a/kernel/bpf/preload/iterators/bpf_preload_common.h b/kernel/bpf/preload/iterators/bpf_preload_common.h deleted file mode 100644 index 8464d1a48c05..000000000000 --- a/kernel/bpf/preload/iterators/bpf_preload_common.h +++ /dev/null @@ -1,13 
+0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BPF_PRELOAD_COMMON_H -#define _BPF_PRELOAD_COMMON_H - -#define BPF_PRELOAD_START 0x5555 -#define BPF_PRELOAD_END 0xAAAA - -struct bpf_preload_info { - char link_name[16]; - int link_id; -}; - -#endif diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c deleted file mode 100644 index 4dafe0f4f2b2..000000000000 --- a/kernel/bpf/preload/iterators/iterators.c +++ /dev/null @@ -1,108 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2020 Facebook */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "iterators.lskel.h" -#include "bpf_preload_common.h" - -int to_kernel = -1; -int from_kernel = 0; - -static int __bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len) -{ - union bpf_attr attr; - int err; - - memset(&attr, 0, sizeof(attr)); - attr.info.bpf_fd = bpf_fd; - attr.info.info_len = *info_len; - attr.info.info = (long) info; - - err = skel_sys_bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)); - if (!err) - *info_len = attr.info.info_len; - return err; -} - -static int send_link_to_kernel(int link_fd, const char *link_name) -{ - struct bpf_preload_info obj = {}; - struct bpf_link_info info = {}; - __u32 info_len = sizeof(info); - int err; - - err = __bpf_obj_get_info_by_fd(link_fd, &info, &info_len); - if (err) - return err; - obj.link_id = info.id; - if (strlen(link_name) >= sizeof(obj.link_name)) - return -E2BIG; - strcpy(obj.link_name, link_name); - if (write(to_kernel, &obj, sizeof(obj)) != sizeof(obj)) - return -EPIPE; - return 0; -} - -int main(int argc, char **argv) -{ - struct iterators_bpf *skel; - int err, magic; - int debug_fd; - - debug_fd = open("/dev/console", O_WRONLY | O_NOCTTY | O_CLOEXEC); - if (debug_fd < 0) - return 1; - to_kernel = dup(1); - close(1); - dup(debug_fd); - /* now stdin and stderr point to /dev/console */ - - read(from_kernel, &magic, sizeof(magic)); - 
if (magic != BPF_PRELOAD_START) { - printf("bad start magic %d\n", magic); - return 1; - } - /* libbpf opens BPF object and loads it into the kernel */ - skel = iterators_bpf__open_and_load(); - if (!skel) { - /* iterators.skel.h is little endian. - * libbpf doesn't support automatic little->big conversion - * of BPF bytecode yet. - * The program load will fail in such case. - */ - printf("Failed load could be due to wrong endianness\n"); - return 1; - } - err = iterators_bpf__attach(skel); - if (err) - goto cleanup; - - /* send two bpf_link IDs with names to the kernel */ - err = send_link_to_kernel(skel->links.dump_bpf_map_fd, "maps.debug"); - if (err) - goto cleanup; - err = send_link_to_kernel(skel->links.dump_bpf_prog_fd, "progs.debug"); - if (err) - goto cleanup; - - /* The kernel will proceed with pinnging the links in bpffs. - * UMD will wait on read from pipe. - */ - read(from_kernel, &magic, sizeof(magic)); - if (magic != BPF_PRELOAD_END) { - printf("bad final magic %d\n", magic); - err = -EINVAL; - } -cleanup: - iterators_bpf__destroy(skel); - - return err != 0; -} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 49f88b30662a..35646db3d950 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2490,6 +2490,7 @@ void bpf_link_put(struct bpf_link *link) bpf_link_free(link); } } +EXPORT_SYMBOL(bpf_link_put); static int bpf_link_release(struct inode *inode, struct file *filp) { @@ -2632,6 +2633,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) return link; } +EXPORT_SYMBOL(bpf_link_get_from_fd); struct bpf_tracing_link { struct bpf_link link; -- cgit v1.2.3 From 4cc0991abd3954609a6929234bbb8c0fe7a0298d Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 10 Feb 2022 18:49:39 -0800 Subject: bpf: Fix bpf_prog_pack build for ppc64_defconfig bpf_prog_pack causes build error with powerpc ppc64_defconfig: kernel/bpf/core.c:830:23: error: variably modified 'bitmap' at file scope 830 | unsigned long 
bitmap[BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)]; | ^~~~~~ This is because the macro expands as: unsigned long bitmap[((((((1UL) << (16 + __pte_index_size)) / (1 << 6))) \ + ((sizeof(long) * 8)) - 1) / ((sizeof(long) * 8)))]; where __pte_index_size is a global variable. Fix it by turning bitmap into a 0-length array. Fixes: 57631054fae6 ("bpf: Introduce bpf_prog_pack allocator") Reported-by: Stephen Rothwell Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220211024939.2962537-1-song@kernel.org --- kernel/bpf/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 42d96549a804..44623c9b5bb1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -827,7 +827,7 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, struct bpf_prog_pack { struct list_head list; void *ptr; - unsigned long bitmap[BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)]; + unsigned long bitmap[]; }; #define BPF_PROG_MAX_PACK_PROG_SIZE BPF_PROG_PACK_SIZE @@ -840,7 +840,7 @@ static struct bpf_prog_pack *alloc_new_pack(void) { struct bpf_prog_pack *pack; - pack = kzalloc(sizeof(*pack), GFP_KERNEL); + pack = kzalloc(sizeof(*pack) + BITS_TO_BYTES(BPF_PROG_CHUNK_COUNT), GFP_KERNEL); if (!pack) return NULL; pack->ptr = module_alloc(BPF_PROG_PACK_SIZE); -- cgit v1.2.3 From 5c72263ef2fbe99596848f03758ae2dc593adf2c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 8 Feb 2022 00:57:17 -0800 Subject: signal: HANDLER_EXIT should clear SIGNAL_UNKILLABLE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fatal SIGSYS signals (i.e. seccomp RET_KILL_* syscall filter actions) were not being delivered to ptraced pid namespace init processes. Make sure the SIGNAL_UNKILLABLE doesn't get set for these cases. Reported-by: Robert Święcki Suggested-by: "Eric W.
Biederman" Fixes: 00b06da29cf9 ("signal: Add SA_IMMUTABLE to ensure forced siganls do not get changed") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: "Eric W. Biederman" Link: https://lore.kernel.org/lkml/878rui8u4a.fsf@email.froward.int.ebiederm.org --- kernel/signal.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 38602738866e..9b04631acde8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1342,9 +1342,10 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, } /* * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect - * debugging to leave init killable. + * debugging to leave init killable. But HANDLER_EXIT is always fatal. */ - if (action->sa.sa_handler == SIG_DFL && !t->ptrace) + if (action->sa.sa_handler == SIG_DFL && + (!t->ptrace || (handler == HANDLER_EXIT))) t->signal->flags &= ~SIGNAL_UNKILLABLE; ret = send_signal(sig, info, t, PIDTYPE_PID); spin_unlock_irqrestore(&t->sighand->siglock, flags); -- cgit v1.2.3 From 495ac3069a6235bfdf516812a2a9b256671bbdf9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 7 Feb 2022 20:21:13 -0800 Subject: seccomp: Invalidate seccomp mode to catch death failures If seccomp tries to kill a process, it should never see that process again. To enforce this proactively, switch the mode to something impossible. If encountered: WARN, reject all syscalls, and attempt to kill the process again even harder. Cc: Andy Lutomirski Cc: Will Drewry Fixes: 8112c4f140fa ("seccomp: remove 2-phase API") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- kernel/seccomp.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 4d8f44a17727..db10e73d06e0 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -29,6 +29,9 @@ #include #include +/* Not exposed in headers: strictly internal use only. 
*/ +#define SECCOMP_MODE_DEAD (SECCOMP_MODE_FILTER + 1) + #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER #include #endif @@ -1010,6 +1013,7 @@ static void __secure_computing_strict(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif + current->seccomp.mode = SECCOMP_MODE_DEAD; seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true); do_exit(SIGKILL); } @@ -1261,6 +1265,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, case SECCOMP_RET_KILL_THREAD: case SECCOMP_RET_KILL_PROCESS: default: + current->seccomp.mode = SECCOMP_MODE_DEAD; seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. */ if (action != SECCOMP_RET_KILL_THREAD || @@ -1309,6 +1314,11 @@ int __secure_computing(const struct seccomp_data *sd) return 0; case SECCOMP_MODE_FILTER: return __seccomp_filter(this_syscall, sd, false); + /* Surviving SECCOMP_RET_KILL_* must be proactively impossible. */ + case SECCOMP_MODE_DEAD: + WARN_ON_ONCE(1); + do_exit(SIGKILL); + return -1; default: BUG(); } -- cgit v1.2.3 From 1581a884b7ca5592270caa010a910f2ed4f7b5f5 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 27 Jan 2022 15:44:19 -0600 Subject: tracing: Remove size restriction on tracing_log_err cmd strings Currently, tracing_log_err.cmd strings are restricted to a length of MAX_FILTER_STR_VAL (256), which is too short for some commands already seen in the wild (with cmd strings longer than that showing up truncated). Remove the restriction so that no command string is ever truncated. 
Link: https://lkml.kernel.org/r/ca965f23256b350ebd94b3dc1a319f28e8267f5f.1643319703.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 55 +++++++++++++++++++++++++++++++++++++++------------- kernel/trace/trace.h | 2 +- 2 files changed, 43 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7c2578efde26..7c85ce9ffdc3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7723,7 +7723,7 @@ const struct file_operations trace_min_max_fops = { struct err_info { const char **errs; /* ptr to loc-specific array of err strings */ u8 type; /* index into errs -> specific err string */ - u8 pos; /* MAX_FILTER_STR_VAL = 256 */ + u16 pos; /* caret position */ u64 ts; }; @@ -7731,26 +7731,52 @@ struct tracing_log_err { struct list_head list; struct err_info info; char loc[TRACING_LOG_LOC_MAX]; /* err location */ - char cmd[MAX_FILTER_STR_VAL]; /* what caused err */ + char *cmd; /* what caused err */ }; static DEFINE_MUTEX(tracing_err_log_lock); -static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr) +static struct tracing_log_err *alloc_tracing_log_err(int len) +{ + struct tracing_log_err *err; + + err = kzalloc(sizeof(*err), GFP_KERNEL); + if (!err) + return ERR_PTR(-ENOMEM); + + err->cmd = kzalloc(len, GFP_KERNEL); + if (!err->cmd) { + kfree(err); + return ERR_PTR(-ENOMEM); + } + + return err; +} + +static void free_tracing_log_err(struct tracing_log_err *err) +{ + kfree(err->cmd); + kfree(err); +} + +static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr, + int len) { struct tracing_log_err *err; if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) { - err = kzalloc(sizeof(*err), GFP_KERNEL); - if (!err) - err = ERR_PTR(-ENOMEM); - else + err = alloc_tracing_log_err(len); + if (PTR_ERR(err) != -ENOMEM) tr->n_err_log_entries++; return err; } err = list_first_entry(&tr->err_log, struct tracing_log_err, 
list); + kfree(err->cmd); + err->cmd = kzalloc(len, GFP_KERNEL); + if (!err->cmd) + return ERR_PTR(-ENOMEM); list_del(&err->list); return err; @@ -7811,22 +7837,25 @@ unsigned int err_pos(char *cmd, const char *str) */ void tracing_log_err(struct trace_array *tr, const char *loc, const char *cmd, - const char **errs, u8 type, u8 pos) + const char **errs, u8 type, u16 pos) { struct tracing_log_err *err; + int len = 0; if (!tr) tr = &global_trace; + len += sizeof(CMD_PREFIX) + 2 * sizeof("\n") + strlen(cmd) + 1; + mutex_lock(&tracing_err_log_lock); - err = get_tracing_log_err(tr); + err = get_tracing_log_err(tr, len); if (PTR_ERR(err) == -ENOMEM) { mutex_unlock(&tracing_err_log_lock); return; } snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc); - snprintf(err->cmd, MAX_FILTER_STR_VAL,"\n" CMD_PREFIX "%s\n", cmd); + snprintf(err->cmd, len, "\n" CMD_PREFIX "%s\n", cmd); err->info.errs = errs; err->info.type = type; @@ -7844,7 +7873,7 @@ static void clear_tracing_err_log(struct trace_array *tr) mutex_lock(&tracing_err_log_lock); list_for_each_entry_safe(err, next, &tr->err_log, list) { list_del(&err->list); - kfree(err); + free_tracing_log_err(err); } tr->n_err_log_entries = 0; @@ -7872,9 +7901,9 @@ static void tracing_err_log_seq_stop(struct seq_file *m, void *v) mutex_unlock(&tracing_err_log_lock); } -static void tracing_err_log_show_pos(struct seq_file *m, u8 pos) +static void tracing_err_log_show_pos(struct seq_file *m, u16 pos) { - u8 i; + u16 i; for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++) seq_putc(m, ' '); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d038ddbf1bea..0f5e22238cd2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1878,7 +1878,7 @@ extern ssize_t trace_parse_run_command(struct file *file, extern unsigned int err_pos(char *cmd, const char *str); extern void tracing_log_err(struct trace_array *tr, const char *loc, const char *cmd, - const char **errs, u8 type, u8 pos); + const char **errs, u8 type, u16 pos); /* 
* Normal trace_printk() and friends allocates special buffers -- cgit v1.2.3 From edfeed318d59ff242e895bf906223fc0b915117c Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 28 Jan 2022 14:08:26 -0600 Subject: tracing: Remove size restriction on hist trigger cmd error logging Currently, hist trigger command error strings are restricted to a length of MAX_FILTER_STR_VAL (256), which is too short for some commands already seen in the wild (with cmd strings longer than that showing up truncated in err_log). Remove the restriction so that no hist trigger command error string is ever truncated. Link: https://lkml.kernel.org/r/0f9d46407222eaf6632cd3b417bc50a11f401b71.1643399022.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index ada87bfb5bb8..5e8970624bce 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -727,11 +727,16 @@ static struct track_data *track_data_alloc(unsigned int key_len, return data; } -static char last_cmd[MAX_FILTER_STR_VAL]; +#define HIST_PREFIX "hist:" + +static char *last_cmd; static char last_cmd_loc[MAX_FILTER_STR_VAL]; static int errpos(char *str) { + if (!str || !last_cmd) + return 0; + return err_pos(last_cmd, str); } @@ -739,12 +744,19 @@ static void last_cmd_set(struct trace_event_file *file, char *str) { const char *system = NULL, *name = NULL; struct trace_event_call *call; + int len = 0; if (!str) return; - strcpy(last_cmd, "hist:"); - strncat(last_cmd, str, MAX_FILTER_STR_VAL - 1 - sizeof("hist:")); + len += sizeof(HIST_PREFIX) + strlen(str) + 1; + kfree(last_cmd); + last_cmd = kzalloc(len, GFP_KERNEL); + if (!last_cmd) + return; + + strcpy(last_cmd, HIST_PREFIX); + strncat(last_cmd, str, len - sizeof(HIST_PREFIX)); if (file) { call = 
file->event_call; @@ -757,18 +769,22 @@ static void last_cmd_set(struct trace_event_file *file, char *str) } if (system) - snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name); + snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, HIST_PREFIX "%s:%s", system, name); } -static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos) +static void hist_err(struct trace_array *tr, u8 err_type, u16 err_pos) { + if (!last_cmd) + return; + tracing_log_err(tr, last_cmd_loc, last_cmd, err_text, err_type, err_pos); } static void hist_err_clear(void) { - last_cmd[0] = '\0'; + if (last_cmd) + last_cmd[0] = '\0'; last_cmd_loc[0] = '\0'; } @@ -5610,7 +5626,7 @@ static int event_hist_trigger_print(struct seq_file *m, bool have_var = false; unsigned int i; - seq_puts(m, "hist:"); + seq_puts(m, HIST_PREFIX); if (data->name) seq_printf(m, "%s:", data->name); -- cgit v1.2.3 From 27c888da9867725784bad3d6455d6e53b425fa2b Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 28 Jan 2022 14:08:27 -0600 Subject: tracing: Remove size restriction on synthetic event cmd error logging Currently, synthetic event command error strings are restricted to a length of MAX_FILTER_STR_VAL (256), which is too short for some commands already seen in the wild (with cmd strings longer than that showing up truncated in err_log). Remove the restriction so that no synthetic event command error string is ever truncated. 
Link: https://lkml.kernel.org/r/0376692396a81d0b795127c66ea92ca5bf60f481.1643399022.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_synth.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 154db74dadbc..fdd79e07e2fc 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -42,10 +42,13 @@ enum { ERRORS }; static const char *err_text[] = { ERRORS }; -static char last_cmd[MAX_FILTER_STR_VAL]; +static char *last_cmd; static int errpos(const char *str) { + if (!str || !last_cmd) + return 0; + return err_pos(last_cmd, str); } @@ -54,11 +57,19 @@ static void last_cmd_set(const char *str) if (!str) return; - strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1); + kfree(last_cmd); + last_cmd = kzalloc(strlen(str) + 1, GFP_KERNEL); + if (!last_cmd) + return; + + strncpy(last_cmd, str, strlen(str) + 1); } -static void synth_err(u8 err_type, u8 err_pos) +static void synth_err(u8 err_type, u16 err_pos) { + if (!last_cmd) + return; + tracing_log_err(NULL, "synthetic_events", last_cmd, err_text, err_type, err_pos); } -- cgit v1.2.3 From 55bc8384d3deadce48923d8329c1434494c52273 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 10 Feb 2022 19:36:05 -0500 Subject: tracing: Save both wakee and current on wakeup events Use the sched_switch function to save both the wakee and the waker comms in the saved cmdlines list when sched_wakeup is done. 
Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_sched_switch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index e304196d7c28..5cd33be2031b 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -44,7 +44,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee) if (!flags) return; - tracing_record_taskinfo(current, flags); + tracing_record_taskinfo_sched_switch(current, wakee, flags); } static int tracing_sched_register(void) -- cgit v1.2.3 From 7f5a08c79df35e68f1a43033450c5050f12bc155 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Tue, 18 Jan 2022 12:43:15 -0800 Subject: user_events: Add minimal support for trace_event into ftrace Minimal support for interacting with dynamic events, trace_event and ftrace. Core outline of flow between user process, ioctl and trace_event APIs. User mode processes that wish to use trace events to get data into ftrace, perf, eBPF, etc are limited to uprobes today. The user events features enables an ABI for user mode processes to create and write to trace events that are isolated from kernel level trace events. This enables a faster path for tracing from user mode data as well as opens managed code to participate in trace events, where stub locations are dynamic. User processes often want to trace only when it's useful. To enable this a set of pages are mapped into the user process space that indicate the current state of the user events that have been registered. User processes can check if their event is hooked to a trace/probe, and if it is, emit the event data out via the write() syscall. Two new files are introduced into tracefs to accomplish this: user_events_status - This file is mmap'd into participating user mode processes to indicate event status. 
user_events_data - This file is opened and register/delete ioctl's are issued to create/open/delete trace events that can be used for tracing. The typical scenario is on process start to mmap user_events_status. Processes then register the events they plan to use via the REG ioctl. The ioctl reads and updates the passed in user_reg struct. The status_index of the struct is used to know the byte in the status page to check for that event. The write_index of the struct is used to describe that event when writing out to the fd that was used for the ioctl call. The data must always include this index first when writing out data for an event. Data can be written either by write() or by writev(). For example, in memory: int index; char data[]; Pseudo code example of typical usage: struct user_reg reg; int page_fd = open("user_events_status", O_RDWR); char *page_data = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED, page_fd, 0); close(page_fd); int data_fd = open("user_events_data", O_RDWR); reg.size = sizeof(reg); reg.name_args = (__u64)"test"; ioctl(data_fd, DIAG_IOCSREG, &reg); int status_id = reg.status_index; int write_id = reg.write_index; struct iovec io[2]; io[0].iov_base = &write_id; io[0].iov_len = sizeof(write_id); io[1].iov_base = payload; io[1].iov_len = sizeof(payload); if (page_data[status_id]) writev(data_fd, io, 2); User events are also exposed via the dynamic_events tracefs file for both create and delete. Current status is exposed via the user_events_status tracefs file.
Simple example to register a user event via dynamic_events: echo u:test >> dynamic_events cat dynamic_events u:test If an event is hooked to a probe, the probe hooked shows up: echo 1 > events/user_events/test/enable cat user_events_status 1:test # Used by ftrace Active: 1 Busy: 1 Max: 4096 If an event is not hooked to a probe, no probe status shows up: echo 0 > events/user_events/test/enable cat user_events_status 1:test Active: 1 Busy: 0 Max: 4096 Users can describe the trace event format via the following format: name[:FLAG1[,FLAG2...]] [field1[;field2...]] Each field has the following format: type name Example for char array with a size of 20 named msg: echo 'u:detailed char[20] msg' >> dynamic_events cat dynamic_events u:detailed char[20] msg Data offsets are based on the data written out via write() and will be updated to reflect the correct offset in the trace_event fields. For dynamic data it is recommended to use the new __rel_loc data type. This type will be the same as __data_loc, but the offset is relative to this entry. This allows user_events to not worry about what common fields are being inserted before the data. The above format is valid for both the ioctl and the dynamic_events file.
Link: https://lkml.kernel.org/r/20220118204326.2169-2-beaub@linux.microsoft.com Acked-by: Masami Hiramatsu Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- include/uapi/linux/user_events.h | 116 ++++ kernel/trace/Kconfig | 14 + kernel/trace/Makefile | 1 + kernel/trace/trace_events_user.c | 1187 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 1318 insertions(+) create mode 100644 include/uapi/linux/user_events.h create mode 100644 kernel/trace/trace_events_user.c (limited to 'kernel') diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h new file mode 100644 index 000000000000..e570840571e1 --- /dev/null +++ b/include/uapi/linux/user_events.h @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (c) 2021, Microsoft Corporation. + * + * Authors: + * Beau Belgrave + */ +#ifndef _UAPI_LINUX_USER_EVENTS_H +#define _UAPI_LINUX_USER_EVENTS_H + +#include +#include + +#ifdef __KERNEL__ +#include +#else +#include +#endif + +#define USER_EVENTS_SYSTEM "user_events" +#define USER_EVENTS_PREFIX "u:" + +/* Bits 0-6 are for known probe types, Bit 7 is for unknown probes */ +#define EVENT_BIT_FTRACE 0 +#define EVENT_BIT_PERF 1 +#define EVENT_BIT_OTHER 7 + +#define EVENT_STATUS_FTRACE (1 << EVENT_BIT_FTRACE) +#define EVENT_STATUS_PERF (1 << EVENT_BIT_PERF) +#define EVENT_STATUS_OTHER (1 << EVENT_BIT_OTHER) + +/* Create dynamic location entry within a 32-bit value */ +#define DYN_LOC(offset, size) ((size) << 16 | (offset)) + +/* Use raw iterator for attached BPF program(s), no affect on ftrace/perf */ +#define FLAG_BPF_ITER (1 << 0) + +/* + * Describes an event registration and stores the results of the registration. + * This structure is passed to the DIAG_IOCSREG ioctl, callers at a minimum + * must set the size and name_args before invocation. 
+ */ +struct user_reg { + + /* Input: Size of the user_reg structure being used */ + __u32 size; + + /* Input: Pointer to string with event name, description and flags */ + __u64 name_args; + + /* Output: Byte index of the event within the status page */ + __u32 status_index; + + /* Output: Index of the event to use when writing data */ + __u32 write_index; +}; + +#define DIAG_IOC_MAGIC '*' + +/* Requests to register a user_event */ +#define DIAG_IOCSREG _IOWR(DIAG_IOC_MAGIC, 0, struct user_reg*) + +/* Requests to delete a user_event */ +#define DIAG_IOCSDEL _IOW(DIAG_IOC_MAGIC, 1, char*) + +/* Data type that was passed to the BPF program */ +enum { + /* Data resides in kernel space */ + USER_BPF_DATA_KERNEL, + + /* Data resides in user space */ + USER_BPF_DATA_USER, + + /* Data is a pointer to a user_bpf_iter structure */ + USER_BPF_DATA_ITER, +}; + +/* + * Describes an iovec iterator that BPF programs can use to access data for + * a given user_event write() / writev() call. + */ +struct user_bpf_iter { + + /* Offset of the data within the first iovec */ + __u32 iov_offset; + + /* Number of iovec structures */ + __u32 nr_segs; + + /* Pointer to iovec structures */ + const struct iovec *iov; +}; + +/* Context that BPF programs receive when attached to a user_event */ +struct user_bpf_context { + + /* Data type being passed (see union below) */ + __u32 data_type; + + /* Length of the data */ + __u32 data_len; + + /* Pointer to data, varies by data type */ + union { + /* Kernel data (data_type == USER_BPF_DATA_KERNEL) */ + void *kdata; + + /* User data (data_type == USER_BPF_DATA_USER) */ + void *udata; + + /* Direct iovec (data_type == USER_BPF_DATA_ITER) */ + struct user_bpf_iter *iter; + }; +}; + +#endif /* _UAPI_LINUX_USER_EVENTS_H */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a5eb5e7fd624..16a52a71732d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -737,6 +737,20 @@ config SYNTH_EVENTS If in doubt, say N. 
+config USER_EVENTS + bool "User trace events" + select TRACING + select DYNAMIC_EVENTS + help + User trace events are user-defined trace events that + can be used like an existing kernel trace event. User trace + events are generated by writing to a tracefs file. User + processes can determine if their tracing events should be + generated by memory mapping a tracefs file and checking for + an associated byte being non-zero. + + If in doubt, say N. + config HIST_TRIGGERS bool "Histogram triggers" depends on ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index bedc5caceec7..19ef3758da95 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -82,6 +82,7 @@ obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o +obj-$(CONFIG_USER_EVENTS) += trace_events_user.o obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += error_report-traces.o diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c new file mode 100644 index 000000000000..77105233115e --- /dev/null +++ b/kernel/trace/trace_events_user.c @@ -0,0 +1,1187 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021, Microsoft Corporation. + * + * Authors: + * Beau Belgrave + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "trace.h" +#include "trace_dynevent.h" + +#define USER_EVENTS_PREFIX_LEN (sizeof(USER_EVENTS_PREFIX)-1) + +#define FIELD_DEPTH_TYPE 0 +#define FIELD_DEPTH_NAME 1 +#define FIELD_DEPTH_SIZE 2 + +/* + * Limits how many trace_event calls user processes can create: + * Must be multiple of PAGE_SIZE. 
+ */ +#define MAX_PAGES 1 +#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE) + +/* Limit how long of an event name plus args within the subsystem. */ +#define MAX_EVENT_DESC 512 +#define EVENT_NAME(user_event) ((user_event)->tracepoint.name) +#define MAX_FIELD_ARRAY_SIZE 1024 + +static char *register_page_data; + +static DEFINE_MUTEX(reg_mutex); +static DEFINE_HASHTABLE(register_table, 4); +static DECLARE_BITMAP(page_bitmap, MAX_EVENTS); + +/* + * Stores per-event properties, as users register events + * within a file a user_event might be created if it does not + * already exist. These are globally used and their lifetime + * is tied to the refcnt member. These cannot go away until the + * refcnt reaches zero. + */ +struct user_event { + struct tracepoint tracepoint; + struct trace_event_call call; + struct trace_event_class class; + struct dyn_event devent; + struct hlist_node node; + struct list_head fields; + atomic_t refcnt; + int index; + int flags; +}; + +/* + * Stores per-file events references, as users register events + * within a file this structure is modified and freed via RCU. + * The lifetime of this struct is tied to the lifetime of the file. + * These are not shared and only accessible by the file that created it. 
+ */ +struct user_event_refs { + struct rcu_head rcu; + int count; + struct user_event *events[]; +}; + +typedef void (*user_event_func_t) (struct user_event *user, + void *data, u32 datalen, + void *tpdata); + +static int user_event_parse(char *name, char *args, char *flags, + struct user_event **newuser); + +static u32 user_event_key(char *name) +{ + return jhash(name, strlen(name), 0); +} + +static struct list_head *user_event_get_fields(struct trace_event_call *call) +{ + struct user_event *user = (struct user_event *)call->data; + + return &user->fields; +} + +/* + * Parses a register command for user_events + * Format: event_name[:FLAG1[,FLAG2...]] [field1[;field2...]] + * + * Example event named 'test' with a 20 char 'msg' field with an unsigned int + * 'id' field after: + * test char[20] msg;unsigned int id + * + * NOTE: Offsets are from the user data perspective, they are not from the + * trace_entry/buffer perspective. We automatically add the common properties + * sizes to the offset for the user. 
+ */ +static int user_event_parse_cmd(char *raw_command, struct user_event **newuser) +{ + char *name = raw_command; + char *args = strpbrk(name, " "); + char *flags; + + if (args) + *args++ = '\0'; + + flags = strpbrk(name, ":"); + + if (flags) + *flags++ = '\0'; + + return user_event_parse(name, args, flags, newuser); +} + +static int user_field_array_size(const char *type) +{ + const char *start = strchr(type, '['); + char val[8]; + char *bracket; + int size = 0; + + if (start == NULL) + return -EINVAL; + + if (strscpy(val, start + 1, sizeof(val)) <= 0) + return -EINVAL; + + bracket = strchr(val, ']'); + + if (!bracket) + return -EINVAL; + + *bracket = '\0'; + + if (kstrtouint(val, 0, &size)) + return -EINVAL; + + if (size > MAX_FIELD_ARRAY_SIZE) + return -EINVAL; + + return size; +} + +static int user_field_size(const char *type) +{ + /* long is not allowed from a user, since it's ambigious in size */ + if (strcmp(type, "s64") == 0) + return sizeof(s64); + if (strcmp(type, "u64") == 0) + return sizeof(u64); + if (strcmp(type, "s32") == 0) + return sizeof(s32); + if (strcmp(type, "u32") == 0) + return sizeof(u32); + if (strcmp(type, "int") == 0) + return sizeof(int); + if (strcmp(type, "unsigned int") == 0) + return sizeof(unsigned int); + if (strcmp(type, "s16") == 0) + return sizeof(s16); + if (strcmp(type, "u16") == 0) + return sizeof(u16); + if (strcmp(type, "short") == 0) + return sizeof(short); + if (strcmp(type, "unsigned short") == 0) + return sizeof(unsigned short); + if (strcmp(type, "s8") == 0) + return sizeof(s8); + if (strcmp(type, "u8") == 0) + return sizeof(u8); + if (strcmp(type, "char") == 0) + return sizeof(char); + if (strcmp(type, "unsigned char") == 0) + return sizeof(unsigned char); + if (str_has_prefix(type, "char[")) + return user_field_array_size(type); + if (str_has_prefix(type, "unsigned char[")) + return user_field_array_size(type); + if (str_has_prefix(type, "__data_loc ")) + return sizeof(u32); + if (str_has_prefix(type, "__rel_loc 
")) + return sizeof(u32); + + /* Uknown basic type, error */ + return -EINVAL; +} + +static void user_event_destroy_fields(struct user_event *user) +{ + struct ftrace_event_field *field, *next; + struct list_head *head = &user->fields; + + list_for_each_entry_safe(field, next, head, link) { + list_del(&field->link); + kfree(field); + } +} + +static int user_event_add_field(struct user_event *user, const char *type, + const char *name, int offset, int size, + int is_signed, int filter_type) +{ + struct ftrace_event_field *field; + + field = kmalloc(sizeof(*field), GFP_KERNEL); + + if (!field) + return -ENOMEM; + + field->type = type; + field->name = name; + field->offset = offset; + field->size = size; + field->is_signed = is_signed; + field->filter_type = filter_type; + + list_add(&field->link, &user->fields); + + return 0; +} + +/* + * Parses the values of a field within the description + * Format: type name [size] + */ +static int user_event_parse_field(char *field, struct user_event *user, + u32 *offset) +{ + char *part, *type, *name; + u32 depth = 0, saved_offset = *offset; + int len, size = -EINVAL; + bool is_struct = false; + + field = skip_spaces(field); + + if (*field == '\0') + return 0; + + /* Handle types that have a space within */ + len = str_has_prefix(field, "unsigned "); + if (len) + goto skip_next; + + len = str_has_prefix(field, "struct "); + if (len) { + is_struct = true; + goto skip_next; + } + + len = str_has_prefix(field, "__data_loc unsigned "); + if (len) + goto skip_next; + + len = str_has_prefix(field, "__data_loc "); + if (len) + goto skip_next; + + len = str_has_prefix(field, "__rel_loc unsigned "); + if (len) + goto skip_next; + + len = str_has_prefix(field, "__rel_loc "); + if (len) + goto skip_next; + + goto parse; +skip_next: + type = field; + field = strpbrk(field + len, " "); + + if (field == NULL) + return -EINVAL; + + *field++ = '\0'; + depth++; +parse: + while ((part = strsep(&field, " ")) != NULL) { + switch (depth++) { + case 
FIELD_DEPTH_TYPE: + type = part; + break; + case FIELD_DEPTH_NAME: + name = part; + break; + case FIELD_DEPTH_SIZE: + if (!is_struct) + return -EINVAL; + + if (kstrtou32(part, 10, &size)) + return -EINVAL; + break; + default: + return -EINVAL; + } + } + + if (depth < FIELD_DEPTH_SIZE) + return -EINVAL; + + if (depth == FIELD_DEPTH_SIZE) + size = user_field_size(type); + + if (size == 0) + return -EINVAL; + + if (size < 0) + return size; + + *offset = saved_offset + size; + + return user_event_add_field(user, type, name, saved_offset, size, + type[0] != 'u', FILTER_OTHER); +} + +static void user_event_parse_flags(struct user_event *user, char *flags) +{ + char *flag; + + if (flags == NULL) + return; + + while ((flag = strsep(&flags, ",")) != NULL) { + if (strcmp(flag, "BPF_ITER") == 0) + user->flags |= FLAG_BPF_ITER; + } +} + +static int user_event_parse_fields(struct user_event *user, char *args) +{ + char *field; + u32 offset = sizeof(struct trace_entry); + int ret = -EINVAL; + + if (args == NULL) + return 0; + + while ((field = strsep(&args, ";")) != NULL) { + ret = user_event_parse_field(field, user, &offset); + + if (ret) + break; + } + + return ret; +} + +static struct trace_event_fields user_event_fields_array[1]; + +static enum print_line_t user_event_print_trace(struct trace_iterator *iter, + int flags, + struct trace_event *event) +{ + /* Unsafe to try to decode user provided print_fmt, use hex */ + trace_print_hex_dump_seq(&iter->seq, "", DUMP_PREFIX_OFFSET, 16, + 1, iter->ent, iter->ent_size, true); + + return trace_handle_return(&iter->seq); +} + +static struct trace_event_functions user_event_funcs = { + .trace = user_event_print_trace, +}; + +static int destroy_user_event(struct user_event *user) +{ + int ret = 0; + + /* Must destroy fields before call removal */ + user_event_destroy_fields(user); + + ret = trace_remove_event_call(&user->call); + + if (ret) + return ret; + + dyn_event_remove(&user->devent); + + register_page_data[user->index] = 0; + 
clear_bit(user->index, page_bitmap); + hash_del(&user->node); + + kfree(EVENT_NAME(user)); + kfree(user); + + return ret; +} + +static struct user_event *find_user_event(char *name, u32 *outkey) +{ + struct user_event *user; + u32 key = user_event_key(name); + + *outkey = key; + + hash_for_each_possible(register_table, user, node, key) + if (!strcmp(EVENT_NAME(user), name)) + return user; + + return NULL; +} + +/* + * Writes the user supplied payload out to a trace file. + */ +static void user_event_ftrace(struct user_event *user, void *data, u32 datalen, + void *tpdata) +{ + struct trace_event_file *file; + struct trace_entry *entry; + struct trace_event_buffer event_buffer; + + file = (struct trace_event_file *)tpdata; + + if (!file || + !(file->flags & EVENT_FILE_FL_ENABLED) || + trace_trigger_soft_disabled(file)) + return; + + /* Allocates and fills trace_entry, + 1 of this is data payload */ + entry = trace_event_buffer_reserve(&event_buffer, file, + sizeof(*entry) + datalen); + + if (unlikely(!entry)) + return; + + memcpy(entry + 1, data, datalen); + + trace_event_buffer_commit(&event_buffer); +} + +/* + * Update the register page that is shared between user processes. + */ +static void update_reg_page_for(struct user_event *user) +{ + struct tracepoint *tp = &user->tracepoint; + char status = 0; + + if (atomic_read(&tp->key.enabled) > 0) { + struct tracepoint_func *probe_func_ptr; + user_event_func_t probe_func; + + rcu_read_lock_sched(); + + probe_func_ptr = rcu_dereference_sched(tp->funcs); + + if (probe_func_ptr) { + do { + probe_func = probe_func_ptr->func; + + if (probe_func == user_event_ftrace) + status |= EVENT_STATUS_FTRACE; + else + status |= EVENT_STATUS_OTHER; + } while ((++probe_func_ptr)->func); + } + + rcu_read_unlock_sched(); + } + + register_page_data[user->index] = status; +} + +/* + * Register callback for our events from tracing sub-systems. 
+ */ +static int user_event_reg(struct trace_event_call *call, + enum trace_reg type, + void *data) +{ + struct user_event *user = (struct user_event *)call->data; + int ret = 0; + + if (!user) + return -ENOENT; + + switch (type) { + case TRACE_REG_REGISTER: + ret = tracepoint_probe_register(call->tp, + call->class->probe, + data); + if (!ret) + goto inc; + break; + + case TRACE_REG_UNREGISTER: + tracepoint_probe_unregister(call->tp, + call->class->probe, + data); + goto dec; + + default: + break; + } + + return ret; +inc: + atomic_inc(&user->refcnt); + update_reg_page_for(user); + return 0; +dec: + update_reg_page_for(user); + atomic_dec(&user->refcnt); + return 0; +} + +static int user_event_create(const char *raw_command) +{ + struct user_event *user; + char *name; + int ret; + + if (!str_has_prefix(raw_command, USER_EVENTS_PREFIX)) + return -ECANCELED; + + raw_command += USER_EVENTS_PREFIX_LEN; + raw_command = skip_spaces(raw_command); + + name = kstrdup(raw_command, GFP_KERNEL); + + if (!name) + return -ENOMEM; + + mutex_lock(®_mutex); + ret = user_event_parse_cmd(name, &user); + mutex_unlock(®_mutex); + + if (ret) + kfree(name); + + return ret; +} + +static int user_event_show(struct seq_file *m, struct dyn_event *ev) +{ + struct user_event *user = container_of(ev, struct user_event, devent); + struct ftrace_event_field *field, *next; + struct list_head *head; + int depth = 0; + + seq_printf(m, "%s%s", USER_EVENTS_PREFIX, EVENT_NAME(user)); + + head = trace_get_fields(&user->call); + + list_for_each_entry_safe_reverse(field, next, head, link) { + if (depth == 0) + seq_puts(m, " "); + else + seq_puts(m, "; "); + + seq_printf(m, "%s %s", field->type, field->name); + + if (str_has_prefix(field->type, "struct ")) + seq_printf(m, " %d", field->size); + + depth++; + } + + seq_puts(m, "\n"); + + return 0; +} + +static bool user_event_is_busy(struct dyn_event *ev) +{ + struct user_event *user = container_of(ev, struct user_event, devent); + + return 
atomic_read(&user->refcnt) != 0; +} + +static int user_event_free(struct dyn_event *ev) +{ + struct user_event *user = container_of(ev, struct user_event, devent); + + if (atomic_read(&user->refcnt) != 0) + return -EBUSY; + + return destroy_user_event(user); +} + +static bool user_event_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev) +{ + struct user_event *user = container_of(ev, struct user_event, devent); + + return strcmp(EVENT_NAME(user), event) == 0 && + (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0); +} + +static struct dyn_event_operations user_event_dops = { + .create = user_event_create, + .show = user_event_show, + .is_busy = user_event_is_busy, + .free = user_event_free, + .match = user_event_match, +}; + +static int user_event_trace_register(struct user_event *user) +{ + int ret; + + ret = register_trace_event(&user->call.event); + + if (!ret) + return -ENODEV; + + ret = trace_add_event_call(&user->call); + + if (ret) + unregister_trace_event(&user->call.event); + + return ret; +} + +/* + * Parses the event name, arguments and flags then registers if successful. + * The name buffer lifetime is owned by this method for success cases only. + */ +static int user_event_parse(char *name, char *args, char *flags, + struct user_event **newuser) +{ + int ret; + int index; + u32 key; + struct user_event *user = find_user_event(name, &key); + + if (user) { + *newuser = user; + /* + * Name is allocated by caller, free it since it already exists. + * Caller only worries about failure cases for freeing. 
+ */ + kfree(name); + return 0; + } + + index = find_first_zero_bit(page_bitmap, MAX_EVENTS); + + if (index == MAX_EVENTS) + return -EMFILE; + + user = kzalloc(sizeof(*user), GFP_KERNEL); + + if (!user) + return -ENOMEM; + + INIT_LIST_HEAD(&user->class.fields); + INIT_LIST_HEAD(&user->fields); + + user->tracepoint.name = name; + + user_event_parse_flags(user, flags); + + ret = user_event_parse_fields(user, args); + + if (ret) + goto put_user; + + /* Minimal print format */ + user->call.print_fmt = "\"\""; + + user->call.data = user; + user->call.class = &user->class; + user->call.name = name; + user->call.flags = TRACE_EVENT_FL_TRACEPOINT; + user->call.tp = &user->tracepoint; + user->call.event.funcs = &user_event_funcs; + + user->class.system = USER_EVENTS_SYSTEM; + user->class.fields_array = user_event_fields_array; + user->class.get_fields = user_event_get_fields; + user->class.reg = user_event_reg; + user->class.probe = user_event_ftrace; + + mutex_lock(&event_mutex); + ret = user_event_trace_register(user); + mutex_unlock(&event_mutex); + + if (ret) + goto put_user; + + user->index = index; + dyn_event_init(&user->devent, &user_event_dops); + dyn_event_add(&user->devent, &user->call); + set_bit(user->index, page_bitmap); + hash_add(register_table, &user->node, key); + + *newuser = user; + return 0; +put_user: + user_event_destroy_fields(user); + kfree(user); + return ret; +} + +/* + * Deletes a previously created event if it is no longer being used. + */ +static int delete_user_event(char *name) +{ + u32 key; + int ret; + struct user_event *user = find_user_event(name, &key); + + if (!user) + return -ENOENT; + + if (atomic_read(&user->refcnt) != 0) + return -EBUSY; + + mutex_lock(&event_mutex); + ret = destroy_user_event(user); + mutex_unlock(&event_mutex); + + return ret; +} + +/* + * Validates the user payload and writes via iterator. 
+ */ +static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) +{ + struct user_event_refs *refs; + struct user_event *user = NULL; + struct tracepoint *tp; + ssize_t ret = i->count; + int idx; + + if (unlikely(copy_from_iter(&idx, sizeof(idx), i) != sizeof(idx))) + return -EFAULT; + + rcu_read_lock_sched(); + + refs = rcu_dereference_sched(file->private_data); + + /* + * The refs->events array is protected by RCU, and new items may be + * added. But the user retrieved from indexing into the events array + * shall be immutable while the file is opened. + */ + if (likely(refs && idx < refs->count)) + user = refs->events[idx]; + + rcu_read_unlock_sched(); + + if (unlikely(user == NULL)) + return -ENOENT; + + tp = &user->tracepoint; + + /* + * It's possible key.enabled disables after this check, however + * we don't mind if a few events are included in this condition. + */ + if (likely(atomic_read(&tp->key.enabled) > 0)) { + struct tracepoint_func *probe_func_ptr; + user_event_func_t probe_func; + void *tpdata; + void *kdata; + u32 datalen; + + kdata = kmalloc(i->count, GFP_KERNEL); + + if (unlikely(!kdata)) + return -ENOMEM; + + datalen = copy_from_iter(kdata, i->count, i); + + rcu_read_lock_sched(); + + probe_func_ptr = rcu_dereference_sched(tp->funcs); + + if (probe_func_ptr) { + do { + probe_func = probe_func_ptr->func; + tpdata = probe_func_ptr->data; + probe_func(user, kdata, datalen, tpdata); + } while ((++probe_func_ptr)->func); + } + + rcu_read_unlock_sched(); + + kfree(kdata); + } + + return ret; +} + +static ssize_t user_events_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct iovec iov; + struct iov_iter i; + + if (unlikely(*ppos != 0)) + return -EFAULT; + + if (unlikely(import_single_range(READ, (char *)ubuf, count, &iov, &i))) + return -EFAULT; + + return user_events_write_core(file, &i); +} + +static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i) +{ + return 
user_events_write_core(kp->ki_filp, i); +} + +static int user_events_ref_add(struct file *file, struct user_event *user) +{ + struct user_event_refs *refs, *new_refs; + int i, size, count = 0; + + refs = rcu_dereference_protected(file->private_data, + lockdep_is_held(®_mutex)); + + if (refs) { + count = refs->count; + + for (i = 0; i < count; ++i) + if (refs->events[i] == user) + return i; + } + + size = struct_size(refs, events, count + 1); + + new_refs = kzalloc(size, GFP_KERNEL); + + if (!new_refs) + return -ENOMEM; + + new_refs->count = count + 1; + + for (i = 0; i < count; ++i) + new_refs->events[i] = refs->events[i]; + + new_refs->events[i] = user; + + atomic_inc(&user->refcnt); + + rcu_assign_pointer(file->private_data, new_refs); + + if (refs) + kfree_rcu(refs, rcu); + + return i; +} + +static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg) +{ + u32 size; + long ret; + + ret = get_user(size, &ureg->size); + + if (ret) + return ret; + + if (size > PAGE_SIZE) + return -E2BIG; + + return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size); +} + +/* + * Registers a user_event on behalf of a user process. + */ +static long user_events_ioctl_reg(struct file *file, unsigned long uarg) +{ + struct user_reg __user *ureg = (struct user_reg __user *)uarg; + struct user_reg reg; + struct user_event *user; + char *name; + long ret; + + ret = user_reg_get(ureg, ®); + + if (ret) + return ret; + + name = strndup_user((const char __user *)(uintptr_t)reg.name_args, + MAX_EVENT_DESC); + + if (IS_ERR(name)) { + ret = PTR_ERR(name); + return ret; + } + + ret = user_event_parse_cmd(name, &user); + + if (ret) { + kfree(name); + return ret; + } + + ret = user_events_ref_add(file, user); + + /* Positive number is index and valid */ + if (ret < 0) + return ret; + + put_user((u32)ret, &ureg->write_index); + put_user(user->index, &ureg->status_index); + + return 0; +} + +/* + * Deletes a user_event on behalf of a user process. 
+ */ +static long user_events_ioctl_del(struct file *file, unsigned long uarg) +{ + void __user *ubuf = (void __user *)uarg; + char *name; + long ret; + + name = strndup_user(ubuf, MAX_EVENT_DESC); + + if (IS_ERR(name)) + return PTR_ERR(name); + + ret = delete_user_event(name); + + kfree(name); + + return ret; +} + +/* + * Handles the ioctl from user mode to register or alter operations. + */ +static long user_events_ioctl(struct file *file, unsigned int cmd, + unsigned long uarg) +{ + long ret = -ENOTTY; + + switch (cmd) { + case DIAG_IOCSREG: + mutex_lock(®_mutex); + ret = user_events_ioctl_reg(file, uarg); + mutex_unlock(®_mutex); + break; + + case DIAG_IOCSDEL: + mutex_lock(®_mutex); + ret = user_events_ioctl_del(file, uarg); + mutex_unlock(®_mutex); + break; + } + + return ret; +} + +/* + * Handles the final close of the file from user mode. + */ +static int user_events_release(struct inode *node, struct file *file) +{ + struct user_event_refs *refs; + struct user_event *user; + int i; + + /* + * Ensure refs cannot change under any situation by taking the + * register mutex during the final freeing of the references. + */ + mutex_lock(®_mutex); + + refs = file->private_data; + + if (!refs) + goto out; + + /* + * The lifetime of refs has reached an end, it's tied to this file. + * The underlying user_events are ref counted, and cannot be freed. + * After this decrement, the user_events may be freed elsewhere. + */ + for (i = 0; i < refs->count; ++i) { + user = refs->events[i]; + + if (user) + atomic_dec(&user->refcnt); + } +out: + file->private_data = NULL; + + mutex_unlock(®_mutex); + + kfree(refs); + + return 0; +} + +static const struct file_operations user_data_fops = { + .write = user_events_write, + .write_iter = user_events_write_iter, + .unlocked_ioctl = user_events_ioctl, + .release = user_events_release, +}; + +/* + * Maps the shared page into the user process for checking if event is enabled. 
+ */ +static int user_status_mmap(struct file *file, struct vm_area_struct *vma) +{ + unsigned long size = vma->vm_end - vma->vm_start; + + if (size != MAX_EVENTS) + return -EINVAL; + + return remap_pfn_range(vma, vma->vm_start, + virt_to_phys(register_page_data) >> PAGE_SHIFT, + size, vm_get_page_prot(VM_READ)); +} + +static void *user_seq_start(struct seq_file *m, loff_t *pos) +{ + if (*pos) + return NULL; + + return (void *)1; +} + +static void *user_seq_next(struct seq_file *m, void *p, loff_t *pos) +{ + ++*pos; + return NULL; +} + +static void user_seq_stop(struct seq_file *m, void *p) +{ +} + +static int user_seq_show(struct seq_file *m, void *p) +{ + struct user_event *user; + char status; + int i, active = 0, busy = 0, flags; + + mutex_lock(®_mutex); + + hash_for_each(register_table, i, user, node) { + status = register_page_data[user->index]; + flags = user->flags; + + seq_printf(m, "%d:%s", user->index, EVENT_NAME(user)); + + if (flags != 0 || status != 0) + seq_puts(m, " #"); + + if (status != 0) { + seq_puts(m, " Used by"); + if (status & EVENT_STATUS_FTRACE) + seq_puts(m, " ftrace"); + if (status & EVENT_STATUS_PERF) + seq_puts(m, " perf"); + if (status & EVENT_STATUS_OTHER) + seq_puts(m, " other"); + busy++; + } + + if (flags & FLAG_BPF_ITER) + seq_puts(m, " FLAG:BPF_ITER"); + + seq_puts(m, "\n"); + active++; + } + + mutex_unlock(®_mutex); + + seq_puts(m, "\n"); + seq_printf(m, "Active: %d\n", active); + seq_printf(m, "Busy: %d\n", busy); + seq_printf(m, "Max: %ld\n", MAX_EVENTS); + + return 0; +} + +static const struct seq_operations user_seq_ops = { + .start = user_seq_start, + .next = user_seq_next, + .stop = user_seq_stop, + .show = user_seq_show, +}; + +static int user_status_open(struct inode *node, struct file *file) +{ + return seq_open(file, &user_seq_ops); +} + +static const struct file_operations user_status_fops = { + .open = user_status_open, + .mmap = user_status_mmap, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, 
+}; + +/* + * Creates a set of tracefs files to allow user mode interactions. + */ +static int create_user_tracefs(void) +{ + struct dentry *edata, *emmap; + + edata = tracefs_create_file("user_events_data", TRACE_MODE_WRITE, + NULL, NULL, &user_data_fops); + + if (!edata) { + pr_warn("Could not create tracefs 'user_events_data' entry\n"); + goto err; + } + + /* mmap with MAP_SHARED requires writable fd */ + emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE, + NULL, NULL, &user_status_fops); + + if (!emmap) { + tracefs_remove(edata); + pr_warn("Could not create tracefs 'user_events_mmap' entry\n"); + goto err; + } + + return 0; +err: + return -ENODEV; +} + +static void set_page_reservations(bool set) +{ + int page; + + for (page = 0; page < MAX_PAGES; ++page) { + void *addr = register_page_data + (PAGE_SIZE * page); + + if (set) + SetPageReserved(virt_to_page(addr)); + else + ClearPageReserved(virt_to_page(addr)); + } +} + +static int __init trace_events_user_init(void) +{ + int ret; + + /* Zero all bits beside 0 (which is reserved for failures) */ + bitmap_zero(page_bitmap, MAX_EVENTS); + set_bit(0, page_bitmap); + + register_page_data = kzalloc(MAX_EVENTS, GFP_KERNEL); + + if (!register_page_data) + return -ENOMEM; + + set_page_reservations(true); + + ret = create_user_tracefs(); + + if (ret) { + pr_warn("user_events could not register with tracefs\n"); + set_page_reservations(false); + kfree(register_page_data); + return ret; + } + + if (dyn_event_register(&user_event_dops)) + pr_warn("user_events could not register with dyn_events\n"); + + return 0; +} + +fs_initcall(trace_events_user_init); -- cgit v1.2.3 From aa3b2b4c669205200615dd8a2cc4af4f81fd0335 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Tue, 18 Jan 2022 12:43:16 -0800 Subject: user_events: Add print_fmt generation support for basic types Addes print_fmt format generation for basic types that are supported for user processes. Only supports sizes that are the same on 32 and 64 bit. 
Link: https://lkml.kernel.org/r/20220118204326.2169-3-beaub@linux.microsoft.com Acked-by: Masami Hiramatsu Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 115 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 113 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 77105233115e..ddc5c3cf1bf8 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -359,6 +359,114 @@ static int user_event_parse_fields(struct user_event *user, char *args) static struct trace_event_fields user_event_fields_array[1]; +static const char *user_field_format(const char *type) +{ + if (strcmp(type, "s64") == 0) + return "%lld"; + if (strcmp(type, "u64") == 0) + return "%llu"; + if (strcmp(type, "s32") == 0) + return "%d"; + if (strcmp(type, "u32") == 0) + return "%u"; + if (strcmp(type, "int") == 0) + return "%d"; + if (strcmp(type, "unsigned int") == 0) + return "%u"; + if (strcmp(type, "s16") == 0) + return "%d"; + if (strcmp(type, "u16") == 0) + return "%u"; + if (strcmp(type, "short") == 0) + return "%d"; + if (strcmp(type, "unsigned short") == 0) + return "%u"; + if (strcmp(type, "s8") == 0) + return "%d"; + if (strcmp(type, "u8") == 0) + return "%u"; + if (strcmp(type, "char") == 0) + return "%d"; + if (strcmp(type, "unsigned char") == 0) + return "%u"; + if (strstr(type, "char[") != 0) + return "%s"; + + /* Unknown, likely struct, allowed treat as 64-bit */ + return "%llu"; +} + +static bool user_field_is_dyn_string(const char *type, const char **str_func) +{ + if (str_has_prefix(type, "__data_loc ")) { + *str_func = "__get_str"; + goto check; + } + + if (str_has_prefix(type, "__rel_loc ")) { + *str_func = "__get_rel_str"; + goto check; + } + + return false; +check: + return strstr(type, "char") != 0; +} + +#define LEN_OR_ZERO (len ? 
len - pos : 0) +static int user_event_set_print_fmt(struct user_event *user, char *buf, int len) +{ + struct ftrace_event_field *field, *next; + struct list_head *head = &user->fields; + int pos = 0, depth = 0; + const char *str_func; + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + + list_for_each_entry_safe_reverse(field, next, head, link) { + if (depth != 0) + pos += snprintf(buf + pos, LEN_OR_ZERO, " "); + + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s", + field->name, user_field_format(field->type)); + + depth++; + } + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + + list_for_each_entry_safe_reverse(field, next, head, link) { + if (user_field_is_dyn_string(field->type, &str_func)) + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", %s(%s)", str_func, field->name); + else + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", REC->%s", field->name); + } + + return pos + 1; +} +#undef LEN_OR_ZERO + +static int user_event_create_print_fmt(struct user_event *user) +{ + char *print_fmt; + int len; + + len = user_event_set_print_fmt(user, NULL, 0); + + print_fmt = kmalloc(len, GFP_KERNEL); + + if (!print_fmt) + return -ENOMEM; + + user_event_set_print_fmt(user, print_fmt, len); + + user->call.print_fmt = print_fmt; + + return 0; +} + static enum print_line_t user_event_print_trace(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -392,6 +500,7 @@ static int destroy_user_event(struct user_event *user) clear_bit(user->index, page_bitmap); hash_del(&user->node); + kfree(user->call.print_fmt); kfree(EVENT_NAME(user)); kfree(user); @@ -669,8 +778,10 @@ static int user_event_parse(char *name, char *args, char *flags, if (ret) goto put_user; - /* Minimal print format */ - user->call.print_fmt = "\"\""; + ret = user_event_create_print_fmt(user); + + if (ret) + goto put_user; user->call.data = user; user->call.class = &user->class; -- cgit v1.2.3 From 9aed4e157d1ffe4aeebc005b4eceede1ed5a403a Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Tue, 18 Jan 2022 
12:43:17 -0800 Subject: user_events: Handle matching arguments from dyn_events Ensures that when dynamic events requests a match with arguments that they match what is in the user_event. Link: https://lkml.kernel.org/r/20220118204326.2169-4-beaub@linux.microsoft.com Acked-by: Masami Hiramatsu Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 77 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index ddc5c3cf1bf8..a6794cb1f586 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -39,6 +39,7 @@ #define MAX_EVENT_DESC 512 #define EVENT_NAME(user_event) ((user_event)->tracepoint.name) #define MAX_FIELD_ARRAY_SIZE 1024 +#define MAX_FIELD_ARG_NAME 256 static char *register_page_data; @@ -700,13 +701,87 @@ static int user_event_free(struct dyn_event *ev) return destroy_user_event(user); } +static bool user_field_match(struct ftrace_event_field *field, int argc, + const char **argv, int *iout) +{ + char *field_name, *arg_name; + int len, pos, i = *iout; + bool colon = false, match = false; + + if (i >= argc) + return false; + + len = MAX_FIELD_ARG_NAME; + field_name = kmalloc(len, GFP_KERNEL); + arg_name = kmalloc(len, GFP_KERNEL); + + if (!arg_name || !field_name) + goto out; + + pos = 0; + + for (; i < argc; ++i) { + if (i != *iout) + pos += snprintf(arg_name + pos, len - pos, " "); + + pos += snprintf(arg_name + pos, len - pos, argv[i]); + + if (strchr(argv[i], ';')) { + ++i; + colon = true; + break; + } + } + + pos = 0; + + pos += snprintf(field_name + pos, len - pos, field->type); + pos += snprintf(field_name + pos, len - pos, " "); + pos += snprintf(field_name + pos, len - pos, field->name); + + if (colon) + pos += snprintf(field_name + pos, len - pos, ";"); + + *iout = i; + + match = strcmp(arg_name, field_name) == 0; +out: + 
kfree(arg_name); + kfree(field_name); + + return match; +} + +static bool user_fields_match(struct user_event *user, int argc, + const char **argv) +{ + struct ftrace_event_field *field, *next; + struct list_head *head = &user->fields; + int i = 0; + + list_for_each_entry_safe_reverse(field, next, head, link) + if (!user_field_match(field, argc, argv, &i)) + return false; + + if (i != argc) + return false; + + return true; +} + static bool user_event_match(const char *system, const char *event, int argc, const char **argv, struct dyn_event *ev) { struct user_event *user = container_of(ev, struct user_event, devent); + bool match; - return strcmp(EVENT_NAME(user), event) == 0 && + match = strcmp(EVENT_NAME(user), event) == 0 && (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0); + + if (match && argc > 0) + match = user_fields_match(user, argc, argv); + + return match; } static struct dyn_event_operations user_event_dops = { -- cgit v1.2.3 From 3207d0459ef3789c7efa801b57123c8a79d05694 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Tue, 18 Jan 2022 12:43:18 -0800 Subject: user_events: Add basic perf and eBPF support Adds support to write out user_event data to perf_probe/perf files as well as to any attached eBPF program. 
Link: https://lkml.kernel.org/r/20220118204326.2169-5-beaub@linux.microsoft.com Acked-by: Masami Hiramatsu Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 72 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index a6794cb1f586..371f31472156 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -551,6 +551,50 @@ static void user_event_ftrace(struct user_event *user, void *data, u32 datalen, trace_event_buffer_commit(&event_buffer); } +#ifdef CONFIG_PERF_EVENTS +/* + * Writes the user supplied payload out to perf ring buffer or eBPF program. + */ +static void user_event_perf(struct user_event *user, void *data, u32 datalen, + void *tpdata) +{ + struct hlist_head *perf_head; + + if (bpf_prog_array_valid(&user->call)) { + struct user_bpf_context context = {0}; + + context.data_len = datalen; + context.data_type = USER_BPF_DATA_KERNEL; + context.kdata = data; + + trace_call_bpf(&user->call, &context); + } + + perf_head = this_cpu_ptr(user->call.perf_events); + + if (perf_head && !hlist_empty(perf_head)) { + struct trace_entry *perf_entry; + struct pt_regs *regs; + size_t size = sizeof(*perf_entry) + datalen; + int context; + + perf_entry = perf_trace_buf_alloc(ALIGN(size, 8), + &regs, &context); + + if (unlikely(!perf_entry)) + return; + + perf_fetch_caller_regs(regs); + + memcpy(perf_entry + 1, data, datalen); + + perf_trace_buf_submit(perf_entry, size, context, + user->call.event.type, 1, regs, + perf_head, NULL); + } +} +#endif + /* * Update the register page that is shared between user processes. 
*/ @@ -573,6 +617,10 @@ static void update_reg_page_for(struct user_event *user) if (probe_func == user_event_ftrace) status |= EVENT_STATUS_FTRACE; +#ifdef CONFIG_PERF_EVENTS + else if (probe_func == user_event_perf) + status |= EVENT_STATUS_PERF; +#endif else status |= EVENT_STATUS_OTHER; } while ((++probe_func_ptr)->func); @@ -612,8 +660,27 @@ static int user_event_reg(struct trace_event_call *call, data); goto dec; - default: +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + ret = tracepoint_probe_register(call->tp, + call->class->perf_probe, + data); + if (!ret) + goto inc; + break; + + case TRACE_REG_PERF_UNREGISTER: + tracepoint_probe_unregister(call->tp, + call->class->perf_probe, + data); + goto dec; + + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: break; +#endif } return ret; @@ -870,6 +937,9 @@ static int user_event_parse(char *name, char *args, char *flags, user->class.get_fields = user_event_get_fields; user->class.reg = user_event_reg; user->class.probe = user_event_ftrace; +#ifdef CONFIG_PERF_EVENTS + user->class.perf_probe = user_event_perf; +#endif mutex_lock(&event_mutex); ret = user_event_trace_register(user); -- cgit v1.2.3 From 0279400ad38d858ed68f5d787385f6122d4170b2 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Tue, 18 Jan 2022 12:43:19 -0800 Subject: user_events: Optimize writing events by only copying data once Pass iterator through to probes to allow copying data directly to the probe buffers instead of taking multiple copies. Enables eBPF user and raw iterator types out to programs for no-copy scenarios. 
Link: https://lkml.kernel.org/r/20220118204326.2169-6-beaub@linux.microsoft.com Acked-by: Masami Hiramatsu Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 115 +++++++++++++++++++++++++++++---------- 1 file changed, 85 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 371f31472156..78b6b96c4cfa 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -41,6 +41,9 @@ #define MAX_FIELD_ARRAY_SIZE 1024 #define MAX_FIELD_ARG_NAME 256 +#define MAX_BPF_COPY_SIZE PAGE_SIZE +#define MAX_STACK_BPF_DATA 512 + static char *register_page_data; static DEFINE_MUTEX(reg_mutex); @@ -78,8 +81,7 @@ struct user_event_refs { struct user_event *events[]; }; -typedef void (*user_event_func_t) (struct user_event *user, - void *data, u32 datalen, +typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i, void *tpdata); static int user_event_parse(char *name, char *args, char *flags, @@ -90,6 +92,20 @@ static u32 user_event_key(char *name) return jhash(name, strlen(name), 0); } +static __always_inline __must_check +size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i) +{ + size_t ret; + + pagefault_disable(); + + ret = copy_from_iter_nocache(addr, bytes, i); + + pagefault_enable(); + + return ret; +} + static struct list_head *user_event_get_fields(struct trace_event_call *call) { struct user_event *user = (struct user_event *)call->data; @@ -525,7 +541,7 @@ static struct user_event *find_user_event(char *name, u32 *outkey) /* * Writes the user supplied payload out to a trace file. 
*/ -static void user_event_ftrace(struct user_event *user, void *data, u32 datalen, +static void user_event_ftrace(struct user_event *user, struct iov_iter *i, void *tpdata) { struct trace_event_file *file; @@ -541,41 +557,83 @@ static void user_event_ftrace(struct user_event *user, void *data, u32 datalen, /* Allocates and fills trace_entry, + 1 of this is data payload */ entry = trace_event_buffer_reserve(&event_buffer, file, - sizeof(*entry) + datalen); + sizeof(*entry) + i->count); if (unlikely(!entry)) return; - memcpy(entry + 1, data, datalen); - - trace_event_buffer_commit(&event_buffer); + if (unlikely(!copy_nofault(entry + 1, i->count, i))) + __trace_event_discard_commit(event_buffer.buffer, + event_buffer.event); + else + trace_event_buffer_commit(&event_buffer); } #ifdef CONFIG_PERF_EVENTS +static void user_event_bpf(struct user_event *user, struct iov_iter *i) +{ + struct user_bpf_context context; + struct user_bpf_iter bpf_i; + char fast_data[MAX_STACK_BPF_DATA]; + void *temp = NULL; + + if ((user->flags & FLAG_BPF_ITER) && iter_is_iovec(i)) { + /* Raw iterator */ + context.data_type = USER_BPF_DATA_ITER; + context.data_len = i->count; + context.iter = &bpf_i; + + bpf_i.iov_offset = i->iov_offset; + bpf_i.iov = i->iov; + bpf_i.nr_segs = i->nr_segs; + } else if (i->nr_segs == 1 && iter_is_iovec(i)) { + /* Single buffer from user */ + context.data_type = USER_BPF_DATA_USER; + context.data_len = i->count; + context.udata = i->iov->iov_base + i->iov_offset; + } else { + /* Multi buffer from user */ + struct iov_iter copy = *i; + size_t copy_size = min_t(size_t, i->count, MAX_BPF_COPY_SIZE); + + context.data_type = USER_BPF_DATA_KERNEL; + context.kdata = fast_data; + + if (unlikely(copy_size > sizeof(fast_data))) { + temp = kmalloc(copy_size, GFP_NOWAIT); + + if (temp) + context.kdata = temp; + else + copy_size = sizeof(fast_data); + } + + context.data_len = copy_nofault(context.kdata, + copy_size, &copy); + } + + trace_call_bpf(&user->call, &context); + + 
kfree(temp); +} + /* * Writes the user supplied payload out to perf ring buffer or eBPF program. */ -static void user_event_perf(struct user_event *user, void *data, u32 datalen, +static void user_event_perf(struct user_event *user, struct iov_iter *i, void *tpdata) { struct hlist_head *perf_head; - if (bpf_prog_array_valid(&user->call)) { - struct user_bpf_context context = {0}; - - context.data_len = datalen; - context.data_type = USER_BPF_DATA_KERNEL; - context.kdata = data; - - trace_call_bpf(&user->call, &context); - } + if (bpf_prog_array_valid(&user->call)) + user_event_bpf(user, i); perf_head = this_cpu_ptr(user->call.perf_events); if (perf_head && !hlist_empty(perf_head)) { struct trace_entry *perf_entry; struct pt_regs *regs; - size_t size = sizeof(*perf_entry) + datalen; + size_t size = sizeof(*perf_entry) + i->count; int context; perf_entry = perf_trace_buf_alloc(ALIGN(size, 8), @@ -586,7 +644,10 @@ static void user_event_perf(struct user_event *user, void *data, u32 datalen, perf_fetch_caller_regs(regs); - memcpy(perf_entry + 1, data, datalen); + if (unlikely(!copy_nofault(perf_entry + 1, i->count, i))) { + perf_swevent_put_recursion_context(context); + return; + } perf_trace_buf_submit(perf_entry, size, context, user->call.event.type, 1, regs, @@ -1024,16 +1085,11 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) if (likely(atomic_read(&tp->key.enabled) > 0)) { struct tracepoint_func *probe_func_ptr; user_event_func_t probe_func; + struct iov_iter copy; void *tpdata; - void *kdata; - u32 datalen; - kdata = kmalloc(i->count, GFP_KERNEL); - - if (unlikely(!kdata)) - return -ENOMEM; - - datalen = copy_from_iter(kdata, i->count, i); + if (unlikely(fault_in_iov_iter_readable(i, i->count))) + return -EFAULT; rcu_read_lock_sched(); @@ -1041,15 +1097,14 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) if (probe_func_ptr) { do { + copy = *i; probe_func = probe_func_ptr->func; tpdata = 
probe_func_ptr->data; - probe_func(user, kdata, datalen, tpdata); + probe_func(user, &copy, tpdata); } while ((++probe_func_ptr)->func); } rcu_read_unlock_sched(); - - kfree(kdata); } return ret; -- cgit v1.2.3 From 2467cda1b5c97a58776a8aebfa5d76543e47479d Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Tue, 18 Jan 2022 12:43:20 -0800 Subject: user_events: Validate user payloads for size and null termination Add validation to ensure data is at or greater than the min size for the fields of the event. If a dynamic array is used and is a type of char, ensure null termination of the array exists. Link: https://lkml.kernel.org/r/20220118204326.2169-7-beaub@linux.microsoft.com Acked-by: Masami Hiramatsu Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 147 +++++++++++++++++++++++++++++++++++---- 1 file changed, 133 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 78b6b96c4cfa..2b5e9fdb63a0 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -64,9 +64,11 @@ struct user_event { struct dyn_event devent; struct hlist_node node; struct list_head fields; + struct list_head validators; atomic_t refcnt; int index; int flags; + int min_size; }; /* @@ -81,8 +83,17 @@ struct user_event_refs { struct user_event *events[]; }; +#define VALIDATOR_ENSURE_NULL (1 << 0) +#define VALIDATOR_REL (1 << 1) + +struct user_event_validator { + struct list_head link; + int offset; + int flags; +}; + typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i, - void *tpdata); + void *tpdata, bool *faulted); static int user_event_parse(char *name, char *args, char *flags, struct user_event **newuser); @@ -215,6 +226,17 @@ static int user_field_size(const char *type) return -EINVAL; } +static void user_event_destroy_validators(struct user_event *user) +{ + struct user_event_validator *validator, 
*next; + struct list_head *head = &user->validators; + + list_for_each_entry_safe(validator, next, head, link) { + list_del(&validator->link); + kfree(validator); + } +} + static void user_event_destroy_fields(struct user_event *user) { struct ftrace_event_field *field, *next; @@ -230,13 +252,43 @@ static int user_event_add_field(struct user_event *user, const char *type, const char *name, int offset, int size, int is_signed, int filter_type) { + struct user_event_validator *validator; struct ftrace_event_field *field; + int validator_flags = 0; field = kmalloc(sizeof(*field), GFP_KERNEL); if (!field) return -ENOMEM; + if (str_has_prefix(type, "__data_loc ")) + goto add_validator; + + if (str_has_prefix(type, "__rel_loc ")) { + validator_flags |= VALIDATOR_REL; + goto add_validator; + } + + goto add_field; + +add_validator: + if (strstr(type, "char") != 0) + validator_flags |= VALIDATOR_ENSURE_NULL; + + validator = kmalloc(sizeof(*validator), GFP_KERNEL); + + if (!validator) { + kfree(field); + return -ENOMEM; + } + + validator->flags = validator_flags; + validator->offset = offset; + + /* Want sequential access when validating */ + list_add_tail(&validator->link, &user->validators); + +add_field: field->type = type; field->name = name; field->offset = offset; @@ -246,6 +298,12 @@ static int user_event_add_field(struct user_event *user, const char *type, list_add(&field->link, &user->fields); + /* + * Min size from user writes that are required, this does not include + * the size of trace_entry (common fields). 
+ */ + user->min_size = (offset + size) - sizeof(struct trace_entry); + return 0; } @@ -517,6 +575,7 @@ static int destroy_user_event(struct user_event *user) clear_bit(user->index, page_bitmap); hash_del(&user->node); + user_event_destroy_validators(user); kfree(user->call.print_fmt); kfree(EVENT_NAME(user)); kfree(user); @@ -538,15 +597,49 @@ static struct user_event *find_user_event(char *name, u32 *outkey) return NULL; } +static int user_event_validate(struct user_event *user, void *data, int len) +{ + struct list_head *head = &user->validators; + struct user_event_validator *validator; + void *pos, *end = data + len; + u32 loc, offset, size; + + list_for_each_entry(validator, head, link) { + pos = data + validator->offset; + + /* Already done min_size check, no bounds check here */ + loc = *(u32 *)pos; + offset = loc & 0xffff; + size = loc >> 16; + + if (likely(validator->flags & VALIDATOR_REL)) + pos += offset + sizeof(loc); + else + pos = data + offset; + + pos += size; + + if (unlikely(pos > end)) + return -EFAULT; + + if (likely(validator->flags & VALIDATOR_ENSURE_NULL)) + if (unlikely(*(char *)(pos - 1) != '\0')) + return -EFAULT; + } + + return 0; +} + /* * Writes the user supplied payload out to a trace file. 
*/ static void user_event_ftrace(struct user_event *user, struct iov_iter *i, - void *tpdata) + void *tpdata, bool *faulted) { struct trace_event_file *file; struct trace_entry *entry; struct trace_event_buffer event_buffer; + size_t size = sizeof(*entry) + i->count; file = (struct trace_event_file *)tpdata; @@ -556,17 +649,25 @@ static void user_event_ftrace(struct user_event *user, struct iov_iter *i, return; /* Allocates and fills trace_entry, + 1 of this is data payload */ - entry = trace_event_buffer_reserve(&event_buffer, file, - sizeof(*entry) + i->count); + entry = trace_event_buffer_reserve(&event_buffer, file, size); if (unlikely(!entry)) return; if (unlikely(!copy_nofault(entry + 1, i->count, i))) - __trace_event_discard_commit(event_buffer.buffer, - event_buffer.event); - else - trace_event_buffer_commit(&event_buffer); + goto discard; + + if (!list_empty(&user->validators) && + unlikely(user_event_validate(user, entry, size))) + goto discard; + + trace_event_buffer_commit(&event_buffer); + + return; +discard: + *faulted = true; + __trace_event_discard_commit(event_buffer.buffer, + event_buffer.event); } #ifdef CONFIG_PERF_EVENTS @@ -621,7 +722,7 @@ static void user_event_bpf(struct user_event *user, struct iov_iter *i) * Writes the user supplied payload out to perf ring buffer or eBPF program. 
*/ static void user_event_perf(struct user_event *user, struct iov_iter *i, - void *tpdata) + void *tpdata, bool *faulted) { struct hlist_head *perf_head; @@ -644,14 +745,21 @@ static void user_event_perf(struct user_event *user, struct iov_iter *i, perf_fetch_caller_regs(regs); - if (unlikely(!copy_nofault(perf_entry + 1, i->count, i))) { - perf_swevent_put_recursion_context(context); - return; - } + if (unlikely(!copy_nofault(perf_entry + 1, i->count, i))) + goto discard; + + if (!list_empty(&user->validators) && + unlikely(user_event_validate(user, perf_entry, size))) + goto discard; perf_trace_buf_submit(perf_entry, size, context, user->call.event.type, 1, regs, perf_head, NULL); + + return; +discard: + *faulted = true; + perf_swevent_put_recursion_context(context); } } #endif @@ -971,6 +1079,7 @@ static int user_event_parse(char *name, char *args, char *flags, INIT_LIST_HEAD(&user->class.fields); INIT_LIST_HEAD(&user->fields); + INIT_LIST_HEAD(&user->validators); user->tracepoint.name = name; @@ -1019,6 +1128,7 @@ static int user_event_parse(char *name, char *args, char *flags, return 0; put_user: user_event_destroy_fields(user); + user_event_destroy_validators(user); kfree(user); return ret; } @@ -1076,6 +1186,9 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) if (unlikely(user == NULL)) return -ENOENT; + if (unlikely(i->count < user->min_size)) + return -EINVAL; + tp = &user->tracepoint; /* @@ -1087,10 +1200,13 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) user_event_func_t probe_func; struct iov_iter copy; void *tpdata; + bool faulted; if (unlikely(fault_in_iov_iter_readable(i, i->count))) return -EFAULT; + faulted = false; + rcu_read_lock_sched(); probe_func_ptr = rcu_dereference_sched(tp->funcs); @@ -1100,11 +1216,14 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) copy = *i; probe_func = probe_func_ptr->func; tpdata = probe_func_ptr->data; - probe_func(user, 
&copy, tpdata); + probe_func(user, &copy, tpdata, &faulted); } while ((++probe_func_ptr)->func); } rcu_read_unlock_sched(); + + if (unlikely(faulted)) + return -EFAULT; } return ret; -- cgit v1.2.3 From ddc204b517e60ae64db34f9832dc41dafa77c751 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 8 Feb 2022 11:39:12 -0500 Subject: copy_process(): Move fd_install() out of sighand->siglock critical section I was made aware of the following lockdep splat: [ 2516.308763] ===================================================== [ 2516.309085] WARNING: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected [ 2516.309433] 5.14.0-51.el9.aarch64+debug #1 Not tainted [ 2516.309703] ----------------------------------------------------- [ 2516.310149] stress-ng/153663 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire: [ 2516.310512] ffff0000e422b198 (&newf->file_lock){+.+.}-{2:2}, at: fd_install+0x368/0x4f0 [ 2516.310944] and this task is already holding: [ 2516.311248] ffff0000c08140d8 (&sighand->siglock){-.-.}-{2:2}, at: copy_process+0x1e2c/0x3e80 [ 2516.311804] which would create a new lock dependency: [ 2516.312066] (&sighand->siglock){-.-.}-{2:2} -> (&newf->file_lock){+.+.}-{2:2} [ 2516.312446] but this new dependency connects a HARDIRQ-irq-safe lock: [ 2516.312983] (&sighand->siglock){-.-.}-{2:2} : [ 2516.330700] Possible interrupt unsafe locking scenario: [ 2516.331075] CPU0 CPU1 [ 2516.331328] ---- ---- [ 2516.331580] lock(&newf->file_lock); [ 2516.331790] local_irq_disable(); [ 2516.332231] lock(&sighand->siglock); [ 2516.332579] lock(&newf->file_lock); [ 2516.332922] [ 2516.333069] lock(&sighand->siglock); [ 2516.333291] *** DEADLOCK *** [ 2516.389845] stack backtrace: [ 2516.390101] CPU: 3 PID: 153663 Comm: stress-ng Kdump: loaded Not tainted 5.14.0-51.el9.aarch64+debug #1 [ 2516.390756] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 [ 2516.391155] Call trace: [ 2516.391302] dump_backtrace+0x0/0x3e0 [ 2516.391518] show_stack+0x24/0x30 [ 2516.391717] 
dump_stack_lvl+0x9c/0xd8 [ 2516.391938] dump_stack+0x1c/0x38 [ 2516.392247] print_bad_irq_dependency+0x620/0x710 [ 2516.392525] check_irq_usage+0x4fc/0x86c [ 2516.392756] check_prev_add+0x180/0x1d90 [ 2516.392988] validate_chain+0x8e0/0xee0 [ 2516.393215] __lock_acquire+0x97c/0x1e40 [ 2516.393449] lock_acquire.part.0+0x240/0x570 [ 2516.393814] lock_acquire+0x90/0xb4 [ 2516.394021] _raw_spin_lock+0xe8/0x154 [ 2516.394244] fd_install+0x368/0x4f0 [ 2516.394451] copy_process+0x1f5c/0x3e80 [ 2516.394678] kernel_clone+0x134/0x660 [ 2516.394895] __do_sys_clone3+0x130/0x1f4 [ 2516.395128] __arm64_sys_clone3+0x5c/0x7c [ 2516.395478] invoke_syscall.constprop.0+0x78/0x1f0 [ 2516.395762] el0_svc_common.constprop.0+0x22c/0x2c4 [ 2516.396050] do_el0_svc+0xb0/0x10c [ 2516.396252] el0_svc+0x24/0x34 [ 2516.396436] el0t_64_sync_handler+0xa4/0x12c [ 2516.396688] el0t_64_sync+0x198/0x19c [ 2517.491197] NET: Registered PF_ATMPVC protocol family [ 2517.491524] NET: Registered PF_ATMSVC protocol family [ 2591.991877] sched: RT throttling activated One way to solve this problem is to move the fd_install() call out of the sighand->siglock critical section. Before commit 6fd2fe494b17 ("copy_process(): don't use ksys_close() on cleanups"), the pidfd installation was done without holding both the task_list lock and the sighand->siglock. Obviously, holding these two locks are not really needed to protect the fd_install() call. So move the fd_install() call down to after the releases of both locks. Link: https://lore.kernel.org/r/20220208163912.1084752-1-longman@redhat.com Fixes: 6fd2fe494b17 ("copy_process(): don't use ksys_close() on cleanups") Reviewed-by: "Eric W. 
Biederman" Signed-off-by: Waiman Long Signed-off-by: Christian Brauner --- kernel/fork.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index d75a528f7b21..007af7fb47c7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2323,10 +2323,6 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cancel_cgroup; } - /* past the last point of failure */ - if (pidfile) - fd_install(pidfd, pidfile); - init_task_pid_links(p); if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); @@ -2375,6 +2371,9 @@ static __latent_entropy struct task_struct *copy_process( syscall_tracepoint_update(p); write_unlock_irq(&tasklist_lock); + if (pidfile) + fd_install(pidfd, pidfile); + proc_fork_connector(p); sched_post_fork(p, args); cgroup_post_fork(p, args); -- cgit v1.2.3 From c441e934b604a3b5f350a9104124cf6a3ba07a34 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Sat, 15 Jan 2022 15:16:57 -0800 Subject: locking: Add missing __sched attributes This patch adds __sched attributes to a few missing places to show blocked function rather than locking function in get_wchan. 
Signed-off-by: Minchan Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220115231657.84828-1-minchan@kernel.org --- kernel/locking/percpu-rwsem.c | 5 +++-- kernel/locking/rwsem.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 70a32a576f3f..c9fdae94e098 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -7,6 +7,7 @@ #include #include #include +#include #include int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, @@ -162,7 +163,7 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) __set_current_state(TASK_RUNNING); } -bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) +bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) { if (__percpu_down_read_trylock(sem)) return true; @@ -211,7 +212,7 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem) return true; } -void percpu_down_write(struct percpu_rw_semaphore *sem) +void __sched percpu_down_write(struct percpu_rw_semaphore *sem) { might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 69aba4abe104..acde5d6f1254 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1048,7 +1048,7 @@ out_nolock: /* * Wait until we successfully acquire the write lock */ -static struct rw_semaphore * +static struct rw_semaphore __sched * rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { long count; -- cgit v1.2.3 From 3bd916ee0ecbbdd902fc24845f2fef332b2a310c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 11 Feb 2022 11:49:48 -0800 Subject: bpf: Emit bpf_timer in vmlinux BTF Currently the following code in check_and_init_map_value() *(struct bpf_timer *)(dst + map->timer_off) = (struct bpf_timer){}; can help generate bpf_timer definition in vmlinuxBTF. 
But the code above may not zero the whole structure due to anonymous members and that code will be replaced by memset in the subsequent patch and bpf_timer definition will disappear from vmlinux BTF. Let us emit the type explicitly so bpf program can continue to use it from vmlinux.h. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220211194948.3141529-1-yhs@fb.com --- kernel/bpf/helpers.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 01cfdf40c838..55c084251fab 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2,6 +2,7 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #include +#include #include #include #include @@ -1075,6 +1076,7 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) void *key; u32 idx; + BTF_TYPE_EMIT(struct bpf_timer); callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held()); if (!callback_fn) goto out; -- cgit v1.2.3 From 28df029d53a2fd80c1b8674d47895648ad26dcfb Mon Sep 17 00:00:00 2001 From: Cheng Jui Wang Date: Thu, 10 Feb 2022 18:50:11 +0800 Subject: lockdep: Correct lock_classes index mapping A kernel exception was hit when trying to dump /proc/lockdep_chains after lockdep report "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!": Unable to handle kernel paging request at virtual address 00054005450e05c3 ... 00054005450e05c3] address between user and kernel address ranges ... pc : [0xffffffece769b3a8] string+0x50/0x10c lr : [0xffffffece769ac88] vsnprintf+0x468/0x69c ... 
Call trace: string+0x50/0x10c vsnprintf+0x468/0x69c seq_printf+0x8c/0xd8 print_name+0x64/0xf4 lc_show+0xb8/0x128 seq_read_iter+0x3cc/0x5fc proc_reg_read_iter+0xdc/0x1d4 The cause of the problem is that the function lock_chain_get_class() will shift lock_classes index by 1, but the index doesn't need to be shifted anymore since commit 01bb6f0af992 ("locking/lockdep: Change the range of class_idx in held_lock struct") already changed the index to start from 0. lock_classes[-1] is located in the chain_hlocks array. When printing lock_classes[-1] after the chain_hlocks entries are modified, the exception happened. The output of lockdep_chains is incorrect due to this problem too. Fixes: f611e8cf98ec ("lockdep: Take read/write status in consideration when generate chainkey") Signed-off-by: Cheng Jui Wang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Boqun Feng Link: https://lore.kernel.org/r/20220210105011.21712-1-cheng-jui.wang@mediatek.com --- kernel/locking/lockdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4a882f83aeb9..f8a0212189ca 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3462,7 +3462,7 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) u16 chain_hlock = chain_hlocks[chain->base + i]; unsigned int class_idx = chain_hlock_class_idx(chain_hlock); - return lock_classes + class_idx - 1; + return lock_classes + class_idx; } /* @@ -3530,7 +3530,7 @@ static void print_chain_keys_chain(struct lock_chain *chain) hlock_id = chain_hlocks[chain->base + i]; chain_key = print_chain_key_iteration(hlock_id, chain_key); - print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id) - 1); + print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id)); printk("\n"); } } -- cgit v1.2.3 From 2cfb7a1b031b0e816af7a6ee0c6ab83b0acdf05a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 8 Feb 2022 09:43:33 +0000 Subject: 
sched/fair: Improve consistency of allowed NUMA balance calculations There are inconsistencies when determining if a NUMA imbalance is allowed that should be corrected. o allow_numa_imbalance changes types and is not always examining the destination group so both the type should be corrected as well as the naming. o find_idlest_group uses the sched_domain's weight instead of the group weight which is different to find_busiest_group o find_busiest_group uses the source group instead of the destination which is different to task_numa_find_cpu o Both find_idlest_group and find_busiest_group should account for the number of running tasks if a move was allowed to be consistent with task_numa_find_cpu Fixes: 7d2b5dd0bcc4 ("sched/numa: Allow a floating imbalance between NUMA nodes") Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20220208094334.16379-2-mgorman@techsingularity.net --- kernel/sched/fair.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5dca13ff89f2..ea710168ae91 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9005,9 +9005,10 @@ static bool update_pick_idlest(struct sched_group *idlest, * This is an approximation as the number of running tasks may not be * related to the number of busy CPUs due to sched_setaffinity. */ -static inline bool allow_numa_imbalance(int dst_running, int dst_weight) +static inline bool +allow_numa_imbalance(unsigned int running, unsigned int weight) { - return (dst_running < (dst_weight >> 2)); + return (running < (weight >> 2)); } /* @@ -9141,12 +9142,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) return idlest; #endif /* - * Otherwise, keep the task on this node to stay close - * its wakeup source and improve locality. 
If there is - * a real need of migration, periodic load balance will - * take care of it. + * Otherwise, keep the task close to the wakeup source + * and improve locality if the number of running tasks + * would remain below threshold where an imbalance is + * allowed. If there is a real need of migration, + * periodic load balance will take care of it. */ - if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight)) + if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, local_sgs.group_weight)) return NULL; } @@ -9352,7 +9354,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* Consider allowing a small imbalance between NUMA groups */ if (env->sd->flags & SD_NUMA) { env->imbalance = adjust_numa_imbalance(env->imbalance, - busiest->sum_nr_running, busiest->group_weight); + local->sum_nr_running + 1, local->group_weight); } return; -- cgit v1.2.3 From e496132ebedd870b67f1f6d2428f9bb9d7ae27fd Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 8 Feb 2022 09:43:34 +0000 Subject: sched/fair: Adjust the allowed NUMA imbalance when SD_NUMA spans multiple LLCs Commit 7d2b5dd0bcc4 ("sched/numa: Allow a floating imbalance between NUMA nodes") allowed an imbalance between NUMA nodes such that communicating tasks would not be pulled apart by the load balancer. This works fine when there is a 1:1 relationship between LLC and node but can be suboptimal for multiple LLCs if independent tasks prematurely use CPUs sharing cache. Zen* has multiple LLCs per node with local memory channels and due to the allowed imbalance, it's far harder to tune some workloads to run optimally than it is on hardware that has 1 LLC per node. This patch allows an imbalance to exist up to the point where LLCs should be balanced between nodes. 
On a Zen3 machine running STREAM parallelised with OMP to have one instance per LLC and without binding, the results are 5.17.0-rc0 5.17.0-rc0 vanilla sched-numaimb-v6 MB/sec copy-16 162596.94 ( 0.00%) 580559.74 ( 257.05%) MB/sec scale-16 136901.28 ( 0.00%) 374450.52 ( 173.52%) MB/sec add-16 157300.70 ( 0.00%) 564113.76 ( 258.62%) MB/sec triad-16 151446.88 ( 0.00%) 564304.24 ( 272.61%) STREAM can use directives to force the spread if the OpenMP is new enough but that doesn't help if an application uses threads and it's not known in advance how many threads will be created. Coremark is a CPU and cache intensive benchmark parallelised with threads. When running with 1 thread per core, the vanilla kernel allows threads to contend on cache. With the patch; 5.17.0-rc0 5.17.0-rc0 vanilla sched-numaimb-v5 Min Score-16 368239.36 ( 0.00%) 389816.06 ( 5.86%) Hmean Score-16 388607.33 ( 0.00%) 427877.08 * 10.11%* Max Score-16 408945.69 ( 0.00%) 481022.17 ( 17.62%) Stddev Score-16 15247.04 ( 0.00%) 24966.82 ( -63.75%) CoeffVar Score-16 3.92 ( 0.00%) 5.82 ( -48.48%) It can also make a big difference for semi-realistic workloads like specjbb which can execute arbitrary numbers of threads without advance knowledge of how they should be placed. Even in cases where the average performance is neutral, the results are more stable. 
5.17.0-rc0 5.17.0-rc0 vanilla sched-numaimb-v6 Hmean tput-1 71631.55 ( 0.00%) 73065.57 ( 2.00%) Hmean tput-8 582758.78 ( 0.00%) 556777.23 ( -4.46%) Hmean tput-16 1020372.75 ( 0.00%) 1009995.26 ( -1.02%) Hmean tput-24 1416430.67 ( 0.00%) 1398700.11 ( -1.25%) Hmean tput-32 1687702.72 ( 0.00%) 1671357.04 ( -0.97%) Hmean tput-40 1798094.90 ( 0.00%) 2015616.46 * 12.10%* Hmean tput-48 1972731.77 ( 0.00%) 2333233.72 ( 18.27%) Hmean tput-56 2386872.38 ( 0.00%) 2759483.38 ( 15.61%) Hmean tput-64 2909475.33 ( 0.00%) 2925074.69 ( 0.54%) Hmean tput-72 2585071.36 ( 0.00%) 2962443.97 ( 14.60%) Hmean tput-80 2994387.24 ( 0.00%) 3015980.59 ( 0.72%) Hmean tput-88 3061408.57 ( 0.00%) 3010296.16 ( -1.67%) Hmean tput-96 3052394.82 ( 0.00%) 2784743.41 ( -8.77%) Hmean tput-104 2997814.76 ( 0.00%) 2758184.50 ( -7.99%) Hmean tput-112 2955353.29 ( 0.00%) 2859705.09 ( -3.24%) Hmean tput-120 2889770.71 ( 0.00%) 2764478.46 ( -4.34%) Hmean tput-128 2871713.84 ( 0.00%) 2750136.73 ( -4.23%) Stddev tput-1 5325.93 ( 0.00%) 2002.53 ( 62.40%) Stddev tput-8 6630.54 ( 0.00%) 10905.00 ( -64.47%) Stddev tput-16 25608.58 ( 0.00%) 6851.16 ( 73.25%) Stddev tput-24 12117.69 ( 0.00%) 4227.79 ( 65.11%) Stddev tput-32 27577.16 ( 0.00%) 8761.05 ( 68.23%) Stddev tput-40 59505.86 ( 0.00%) 2048.49 ( 96.56%) Stddev tput-48 168330.30 ( 0.00%) 93058.08 ( 44.72%) Stddev tput-56 219540.39 ( 0.00%) 30687.02 ( 86.02%) Stddev tput-64 121750.35 ( 0.00%) 9617.36 ( 92.10%) Stddev tput-72 223387.05 ( 0.00%) 34081.13 ( 84.74%) Stddev tput-80 128198.46 ( 0.00%) 22565.19 ( 82.40%) Stddev tput-88 136665.36 ( 0.00%) 27905.97 ( 79.58%) Stddev tput-96 111925.81 ( 0.00%) 99615.79 ( 11.00%) Stddev tput-104 146455.96 ( 0.00%) 28861.98 ( 80.29%) Stddev tput-112 88740.49 ( 0.00%) 58288.23 ( 34.32%) Stddev tput-120 186384.86 ( 0.00%) 45812.03 ( 75.42%) Stddev tput-128 78761.09 ( 0.00%) 57418.48 ( 27.10%) Similarly, for embarrassingly parallel problems like NPB-ep, there are improvements due to better spreading across LLC when the machine 
is not fully utilised. vanilla sched-numaimb-v6 Min ep.D 31.79 ( 0.00%) 26.11 ( 17.87%) Amean ep.D 31.86 ( 0.00%) 26.17 * 17.86%* Stddev ep.D 0.07 ( 0.00%) 0.05 ( 24.41%) CoeffVar ep.D 0.22 ( 0.00%) 0.20 ( 7.97%) Max ep.D 31.93 ( 0.00%) 26.21 ( 17.91%) Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Gautham R. Shenoy Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20220208094334.16379-3-mgorman@techsingularity.net --- include/linux/sched/topology.h | 1 + kernel/sched/fair.c | 22 ++++++++++-------- kernel/sched/topology.c | 53 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 8054641c0a7b..56cffe42abbc 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -93,6 +93,7 @@ struct sched_domain { unsigned int busy_factor; /* less balancing by factor if busy */ unsigned int imbalance_pct; /* No balance until over watermark */ unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ + unsigned int imb_numa_nr; /* Nr running tasks that allows a NUMA imbalance */ int nohz_idle; /* NOHZ IDLE status */ int flags; /* See SD_* */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ea710168ae91..5c4bfffe8c2c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1489,6 +1489,7 @@ struct task_numa_env { int src_cpu, src_nid; int dst_cpu, dst_nid; + int imb_numa_nr; struct numa_stats src_stats, dst_stats; @@ -1503,7 +1504,7 @@ struct task_numa_env { static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq); static inline long adjust_numa_imbalance(int imbalance, - int dst_running, int dst_weight); + int dst_running, int imb_numa_nr); static inline enum numa_type numa_classify(unsigned int imbalance_pct, @@ -1884,7 +1885,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, dst_running = 
env->dst_stats.nr_running + 1; imbalance = max(0, dst_running - src_running); imbalance = adjust_numa_imbalance(imbalance, dst_running, - env->dst_stats.weight); + env->imb_numa_nr); /* Use idle CPU if there is no imbalance */ if (!imbalance) { @@ -1949,8 +1950,10 @@ static int task_numa_migrate(struct task_struct *p) */ rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); - if (sd) + if (sd) { env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; + env.imb_numa_nr = sd->imb_numa_nr; + } rcu_read_unlock(); /* @@ -9005,10 +9008,9 @@ static bool update_pick_idlest(struct sched_group *idlest, * This is an approximation as the number of running tasks may not be * related to the number of busy CPUs due to sched_setaffinity. */ -static inline bool -allow_numa_imbalance(unsigned int running, unsigned int weight) +static inline bool allow_numa_imbalance(int running, int imb_numa_nr) { - return (running < (weight >> 2)); + return running <= imb_numa_nr; } /* @@ -9148,7 +9150,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) * allowed. If there is a real need of migration, * periodic load balance will take care of it. 
*/ - if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, local_sgs.group_weight)) + if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr)) return NULL; } @@ -9240,9 +9242,9 @@ next_group: #define NUMA_IMBALANCE_MIN 2 static inline long adjust_numa_imbalance(int imbalance, - int dst_running, int dst_weight) + int dst_running, int imb_numa_nr) { - if (!allow_numa_imbalance(dst_running, dst_weight)) + if (!allow_numa_imbalance(dst_running, imb_numa_nr)) return imbalance; /* @@ -9354,7 +9356,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* Consider allowing a small imbalance between NUMA groups */ if (env->sd->flags & SD_NUMA) { env->imbalance = adjust_numa_imbalance(env->imbalance, - local->sum_nr_running + 1, local->group_weight); + local->sum_nr_running + 1, env->sd->imb_numa_nr); } return; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index d201a7052a29..e6cd55951304 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2242,6 +2242,59 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } } + /* + * Calculate an allowed NUMA imbalance such that LLCs do not get + * imbalanced. + */ + for_each_cpu(i, cpu_map) { + unsigned int imb = 0; + unsigned int imb_span = 1; + + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + struct sched_domain *child = sd->child; + + if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child && + (child->flags & SD_SHARE_PKG_RESOURCES)) { + struct sched_domain *top, *top_p; + unsigned int nr_llcs; + + /* + * For a single LLC per node, allow an + * imbalance up to 25% of the node. This is an + * arbitrary cutoff based on SMT-2 to balance + * between memory bandwidth and avoiding + * premature sharing of HT resources and SMT-4 + * or SMT-8 *may* benefit from a different + * cutoff. 
+ * + * For multiple LLCs, allow an imbalance + * until multiple tasks would share an LLC + * on one node while LLCs on another node + * remain idle. + */ + nr_llcs = sd->span_weight / child->span_weight; + if (nr_llcs == 1) + imb = sd->span_weight >> 2; + else + imb = nr_llcs; + sd->imb_numa_nr = imb; + + /* Set span based on the first NUMA domain. */ + top = sd; + top_p = top->parent; + while (top_p && !(top_p->flags & SD_NUMA)) { + top = top->parent; + top_p = top->parent; + } + imb_span = top_p ? top_p->span_weight : sd->span_weight; + } else { + int factor = max(1U, (sd->span_weight / imb_span)); + + sd->imb_numa_nr = imb * factor; + } + } + } + /* Calculate CPU capacity for physical packages and nodes */ for (i = nr_cpumask_bits-1; i >= 0; i--) { if (!cpumask_test_cpu(i, cpu_map)) -- cgit v1.2.3 From 00ba9357d18947859b7ef03a82c7f4185567ff0b Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 11 Feb 2022 23:32:36 +0100 Subject: ARM: ixp4xx: Drop custom DMA coherency and bouncing The new PCI driver does not need any of this stuff, so just drop it. 
Cc: iommu@lists.linux-foundation.org Reviewed-by: Christoph Hellwig Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20220211223238.648934-12-linus.walleij@linaro.org Signed-off-by: Linus Walleij --- arch/arm/Kconfig | 5 ---- arch/arm/mach-ixp4xx/common.c | 57 ------------------------------------------- kernel/dma/mapping.c | 2 -- 3 files changed, 64 deletions(-) (limited to 'kernel') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 3a95203236d2..ec0dbaf73a81 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -217,9 +217,6 @@ config ARCH_MAY_HAVE_PC_FDC config ARCH_SUPPORTS_UPROBES def_bool y -config ARCH_HAS_DMA_SET_COHERENT_MASK - bool - config GENERIC_ISA_DMA bool @@ -381,10 +378,8 @@ config ARCH_IOP32X config ARCH_IXP4XX bool "IXP4xx-based" depends on MMU - select ARCH_HAS_DMA_SET_COHERENT_MASK select ARCH_SUPPORTS_BIG_ENDIAN select CPU_XSCALE - select DMABOUNCE if PCI select GENERIC_IRQ_MULTI_HANDLER select GPIO_IXP4XX select GPIOLIB diff --git a/arch/arm/mach-ixp4xx/common.c b/arch/arm/mach-ixp4xx/common.c index 4e51514ace6d..310e1602fbfc 100644 --- a/arch/arm/mach-ixp4xx/common.c +++ b/arch/arm/mach-ixp4xx/common.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -330,59 +329,3 @@ void ixp4xx_restart(enum reboot_mode mode, const char *cmd) *IXP4XX_OSWE = IXP4XX_WDT_RESET_ENABLE | IXP4XX_WDT_COUNT_ENABLE; } } - -#ifdef CONFIG_PCI -static int ixp4xx_needs_bounce(struct device *dev, dma_addr_t dma_addr, size_t size) -{ - return (dma_addr + size) > SZ_64M; -} - -static int ixp4xx_platform_notify_remove(struct device *dev) -{ - if (dev_is_pci(dev)) - dmabounce_unregister_dev(dev); - - return 0; -} -#endif - -/* - * Setup DMA mask to 64MB on PCI devices and 4 GB on all other things. 
- */ -static int ixp4xx_platform_notify(struct device *dev) -{ - dev->dma_mask = &dev->coherent_dma_mask; - -#ifdef CONFIG_PCI - if (dev_is_pci(dev)) { - dev->coherent_dma_mask = DMA_BIT_MASK(28); /* 64 MB */ - dmabounce_register_dev(dev, 2048, 4096, ixp4xx_needs_bounce); - return 0; - } -#endif - - dev->coherent_dma_mask = DMA_BIT_MASK(32); - return 0; -} - -int dma_set_coherent_mask(struct device *dev, u64 mask) -{ - if (dev_is_pci(dev)) - mask &= DMA_BIT_MASK(28); /* 64 MB */ - - if ((mask & DMA_BIT_MASK(28)) == DMA_BIT_MASK(28)) { - dev->coherent_dma_mask = mask; - return 0; - } - - return -EIO; /* device wanted sub-64MB mask */ -} -EXPORT_SYMBOL(dma_set_coherent_mask); - -void __init ixp4xx_init_early(void) -{ - platform_notify = ixp4xx_platform_notify; -#ifdef CONFIG_PCI - platform_notify_remove = ixp4xx_platform_notify_remove; -#endif -} diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 9478eccd1c8e..559461a826ba 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -745,7 +745,6 @@ int dma_set_mask(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_set_mask); -#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK int dma_set_coherent_mask(struct device *dev, u64 mask) { /* @@ -761,7 +760,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask) return 0; } EXPORT_SYMBOL(dma_set_coherent_mask); -#endif size_t dma_max_mapping_size(struct device *dev) { -- cgit v1.2.3 From ddbd89deb7d32b1fbb879f48d68fda1a8ac58e8e Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Fri, 11 Feb 2022 02:12:52 +0100 Subject: swiotlb: fix info leak with DMA_FROM_DEVICE The problem I'm addressing was discovered by the LTP test covering cve-2018-1000204. A short description of what happens follows: 1) The test case issues a command code 00 (TEST UNIT READY) via the SG_IO interface with: dxfer_len == 524288, dxdfer_dir == SG_DXFER_FROM_DEV and a corresponding dxferp. The peculiar thing about this is that TUR is not reading from the device. 
2) In sg_start_req() the invocation of blk_rq_map_user() effectively bounces the user-space buffer. As if the device was to transfer into it. Since commit a45b599ad808 ("scsi: sg: allocate with __GFP_ZERO in sg_build_indirect()") we make sure this first bounce buffer is allocated with GFP_ZERO. 3) For the rest of the story we keep ignoring that we have a TUR, so the device won't touch the buffer we prepare as if we had a DMA_FROM_DEVICE type of situation. My setup uses a virtio-scsi device and the buffer allocated by SG is mapped by the function virtqueue_add_split() which uses DMA_FROM_DEVICE for the "in" sgs (here scatter-gather and not scsi generics). This mapping involves bouncing via the swiotlb (we need swiotlb to do virtio in protected guest like s390 Secure Execution, or AMD SEV). 4) When the SCSI TUR is done, we first copy back the content of the second (that is swiotlb) bounce buffer (which most likely contains some previous IO data), to the first bounce buffer, which contains all zeros. Then we copy back the content of the first bounce buffer to the user-space buffer. 5) The test case detects that the buffer, which it zero-initialized, ain't all zeros and fails. One can argue that this is an swiotlb problem, because without swiotlb we leak all zeros, and the swiotlb should be transparent in a sense that it does not affect the outcome (if all other participants are well behaved). Copying the content of the original buffer into the swiotlb buffer is the only way I can think of to make swiotlb transparent in such scenarios. So let's do just that if in doubt, but allow the driver to tell us that the whole mapped buffer is going to be overwritten, in which case we can preserve the old behavior and avoid the performance impact of the extra bounce. 
Signed-off-by: Halil Pasic Signed-off-by: Christoph Hellwig --- Documentation/core-api/dma-attributes.rst | 8 ++++++++ include/linux/dma-mapping.h | 8 ++++++++ kernel/dma/swiotlb.c | 3 ++- 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst index 1887d92e8e92..17706dc91ec9 100644 --- a/Documentation/core-api/dma-attributes.rst +++ b/Documentation/core-api/dma-attributes.rst @@ -130,3 +130,11 @@ accesses to DMA buffers in both privileged "supervisor" and unprivileged subsystem that the buffer is fully accessible at the elevated privilege level (and ideally inaccessible or at least read-only at the lesser-privileged levels). + +DMA_ATTR_OVERWRITE +------------------ + +This is a hint to the DMA-mapping subsystem that the device is expected to +overwrite the entire mapped size, thus the caller does not require any of the +previous buffer contents to be preserved. This allows bounce-buffering +implementations to optimise DMA_FROM_DEVICE transfers. diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index dca2b1355bb1..6150d11a607e 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -61,6 +61,14 @@ */ #define DMA_ATTR_PRIVILEGED (1UL << 9) +/* + * This is a hint to the DMA-mapping subsystem that the device is expected + * to overwrite the entire mapped size, thus the caller does not require any + * of the previous buffer contents to be preserved. This allows + * bounce-buffering implementations to optimise DMA_FROM_DEVICE transfers. + */ +#define DMA_ATTR_OVERWRITE (1UL << 10) + /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. 
It is specific to a diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index f1e7ea160b43..bfc56cb21705 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -628,7 +628,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, mem->slots[index + i].orig_addr = slot_addr(orig_addr, i); tlb_addr = slot_addr(mem->start, index) + offset; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) + (!(attrs & DMA_ATTR_OVERWRITE) || dir == DMA_TO_DEVICE || + dir == DMA_BIDIRECTIONAL)) swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; } -- cgit v1.2.3 From 77498617857f68496b360081dde1a492d40c28b2 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Wed, 2 Feb 2022 09:18:18 -0800 Subject: printk: Add panic_in_progress helper This will be used to help avoid deadlocks during panics. Although it would be better to include this in linux/panic.h, it would require that header to include linux/atomic.h as well. On some architectures, this results in a circular dependency as well. So instead add the helper directly to printk.c. Suggested-by: Petr Mladek Signed-off-by: Stephen Brennan Reviewed-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220202171821.179394-2-stephen.s.brennan@oracle.com --- kernel/printk/printk.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 155229f0cf0f..f04bbed0aa79 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -256,6 +256,11 @@ static void __up_console_sem(unsigned long ip) } #define up_console_sem() __up_console_sem(_RET_IP_) +static bool panic_in_progress(void) +{ + return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); +} + /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. 
It's -- cgit v1.2.3 From d51507098ff91e863b6e0a8047507741d59b8175 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Wed, 2 Feb 2022 09:18:19 -0800 Subject: printk: disable optimistic spin during panic A CPU executing with console lock spinning enabled might be halted during a panic. Before the panicking CPU calls console_flush_on_panic(), it may call console_trylock(), which attempts to optimistically spin, deadlocking the panic CPU: CPU 0 (panic CPU) CPU 1 ----------------- ------ printk() { vprintk_func() { vprintk_default() { vprintk_emit() { console_unlock() { console_lock_spinning_enable(); ... printing to console ... panic() { crash_smp_send_stop() { NMI -------------------> HALT } atomic_notifier_call_chain() { printk() { ... console_trylock_spinning() { // optimistic spin infinitely This hang during panic can be induced when a kdump kernel is loaded, and crash_kexec_post_notifiers=1 is present on the kernel command line. The following script which concurrently writes to /dev/kmsg, and triggers a panic, can result in this hang: #!/bin/bash date # 991 chars (based on log buffer size): chars="$(printf 'a%.0s' {1..991})" while :; do echo $chars > /dev/kmsg done & echo c > /proc/sysrq-trigger & date exit To avoid this deadlock, ensure that console_trylock_spinning() does not allow spinning once a panic has begun. 
Fixes: dbdda842fe96 ("printk: Add console owner and waiter logic to load balance console writes") Suggested-by: Petr Mladek Signed-off-by: Stephen Brennan Reviewed-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220202171821.179394-3-stephen.s.brennan@oracle.com --- kernel/printk/printk.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f04bbed0aa79..e83c12770104 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1847,6 +1847,16 @@ static int console_trylock_spinning(void) if (console_trylock()) return 1; + /* + * It's unsafe to spin once a panic has begun. If we are the + * panic CPU, we may have already halted the owner of the + * console_sem. If we are not the panic CPU, then we should + * avoid taking console_sem, so the panic CPU has a better + * chance of cleanly acquiring it later. + */ + if (panic_in_progress()) + return 0; + printk_safe_enter_irqsave(flags); raw_spin_lock(&console_owner_lock); -- cgit v1.2.3 From 13fb0f74d7029df3b8137f11ef955e578a4a4a60 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Wed, 2 Feb 2022 09:18:20 -0800 Subject: printk: Avoid livelock with heavy printk during panic During panic(), if another CPU is writing heavily the kernel log (e.g. via /dev/kmsg), then the panic CPU may livelock writing out its messages to the console. Note when too many messages are dropped during panic and suppress further printk, except from the panic CPU. This could result in some important messages being dropped. However, messages are already being dropped, so this approach at least prevents a livelock. 
Reviewed-by: Petr Mladek Signed-off-by: Stephen Brennan Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220202171821.179394-4-stephen.s.brennan@oracle.com --- kernel/printk/printk.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index e83c12770104..2ec6b547cda6 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -93,6 +93,12 @@ EXPORT_SYMBOL_GPL(console_drivers); */ int __read_mostly suppress_printk; +/* + * During panic, heavy printk by other CPUs can delay the + * panic and risk deadlock on console resources. + */ +int __read_mostly suppress_panic_printk; + #ifdef CONFIG_LOCKDEP static struct lockdep_map console_lock_dep_map = { .name = "console_lock" @@ -2232,6 +2238,10 @@ asmlinkage int vprintk_emit(int facility, int level, if (unlikely(suppress_printk)) return 0; + if (unlikely(suppress_panic_printk) && + atomic_read(&panic_cpu) != raw_smp_processor_id()) + return 0; + if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; in_sched = true; @@ -2617,6 +2627,7 @@ void console_unlock(void) { static char ext_text[CONSOLE_EXT_LOG_MAX]; static char text[CONSOLE_LOG_MAX]; + static int panic_console_dropped; unsigned long flags; bool do_cond_resched, retry; struct printk_info info; @@ -2671,6 +2682,10 @@ skip: if (console_seq != r.info->seq) { console_dropped += r.info->seq - console_seq; console_seq = r.info->seq; + if (panic_in_progress() && panic_console_dropped++ > 10) { + suppress_panic_printk = 1; + pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n"); + } } if (suppress_message_printing(r.info->level)) { -- cgit v1.2.3 From 8ebc476fd51e6c0fd3174ec1959a20ba99d4c5e5 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Wed, 2 Feb 2022 09:18:21 -0800 Subject: printk: Drop console_sem during panic If another CPU is in panic, we are about to be halted. 
Try to gracefully abandon the console_sem, leaving it free for the panic CPU to grab. Suggested-by: Petr Mladek Signed-off-by: Stephen Brennan Reviewed-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220202171821.179394-5-stephen.s.brennan@oracle.com --- kernel/printk/printk.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 2ec6b547cda6..6a51907a33b9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2597,6 +2597,25 @@ static int have_callable_console(void) return 0; } +/* + * Return true when this CPU should unlock console_sem without pushing all + * messages to the console. This reduces the chance that the console is + * locked when the panic CPU tries to use it. + */ +static bool abandon_console_lock_in_panic(void) +{ + if (!panic_in_progress()) + return false; + + /* + * We can use raw_smp_processor_id() here because it is impossible for + * the task to be migrated to the panic_cpu, or away from it. If + * panic_cpu has already been set, and we're not currently executing on + * that CPU, then we never will be. + */ + return atomic_read(&panic_cpu) != raw_smp_processor_id(); +} + /* * Can we actually use the console at this time on this cpu? * @@ -2745,6 +2764,10 @@ skip: if (handover) return; + /* Allow panic_cpu to take over the consoles safely */ + if (abandon_console_lock_in_panic()) + break; + if (do_cond_resched) cond_resched(); } @@ -2762,7 +2785,7 @@ skip: * flush, no worries. 
*/ retry = prb_read_valid(prb, next_seq, NULL); - if (retry && console_trylock()) + if (retry && !abandon_console_lock_in_panic() && console_trylock()) goto again; } EXPORT_SYMBOL(console_unlock); -- cgit v1.2.3 From 7a853c2d5951419fdf3c1c9d2b6f5a38f6a6857d Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 7 Feb 2022 15:02:45 -0800 Subject: mm: Change CONFIG option for mm->pasid field This currently depends on CONFIG_IOMMU_SUPPORT. But it is only needed when CONFIG_IOMMU_SVA option is enabled. Change the CONFIG guards around definition and initialization of mm->pasid field. Suggested-by: Jacob Pan Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Reviewed-by: Thomas Gleixner Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20220207230254.3342514-3-fenghua.yu@intel.com --- include/linux/mm_types.h | 2 +- kernel/fork.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5140e5feb486..c5cbfd7915ad 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -631,7 +631,7 @@ struct mm_struct { #endif struct work_struct async_put_work; -#ifdef CONFIG_IOMMU_SUPPORT +#ifdef CONFIG_IOMMU_SVA u32 pasid; #endif } __randomize_layout; diff --git a/kernel/fork.c b/kernel/fork.c index d75a528f7b21..6ee7551d3bd2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1021,7 +1021,7 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) static void mm_init_pasid(struct mm_struct *mm) { -#ifdef CONFIG_IOMMU_SUPPORT +#ifdef CONFIG_IOMMU_SVA mm->pasid = INIT_PASID; #endif } -- cgit v1.2.3 From 218b957a6959a2fb5b3967fc824072bb89ac2611 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 8 Dec 2021 23:41:53 +0000 Subject: rcu: Add mutex for rcu boost kthread spawning and affinity setting As we handle parallel CPU bringup, we will need to take care to avoid spawning multiple boost threads, or race conditions when setting 
their affinity. Spotted by Paul McKenney. Signed-off-by: David Woodhouse Reviewed-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + kernel/rcu/tree.h | 3 +++ kernel/rcu/tree_plugin.h | 10 ++++++++-- 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4c25a6283b0..d1d1a8c51223 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4570,6 +4570,7 @@ static void __init rcu_init_one(void) init_waitqueue_head(&rnp->exp_wq[2]); init_waitqueue_head(&rnp->exp_wq[3]); spin_lock_init(&rnp->exp_lock); + mutex_init(&rnp->boost_kthread_mutex); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 486fc901bd08..3b8b60de07c3 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -110,6 +110,9 @@ struct rcu_node { /* side effect, not as a lock. */ unsigned long boost_time; /* When to start boosting (jiffies). */ + struct mutex boost_kthread_mutex; + /* Exclusion for thread spawning and affinity */ + /* manipulation. */ struct task_struct *boost_kthread_task; /* kthread that takes care of priority */ /* boosting for this rcu_node structure. 
*/ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c5b45c2f68a1..07845dcd33c5 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1172,15 +1172,16 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) struct sched_param sp; struct task_struct *t; + mutex_lock(&rnp->boost_kthread_mutex); if (rnp->boost_kthread_task || !rcu_scheduler_fully_active) - return; + goto out; rcu_state.boost = 1; t = kthread_create(rcu_boost_kthread, (void *)rnp, "rcub/%d", rnp_index); if (WARN_ON_ONCE(IS_ERR(t))) - return; + goto out; raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; @@ -1188,6 +1189,9 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ + + out: + mutex_unlock(&rnp->boost_kthread_mutex); } /* @@ -1210,6 +1214,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) return; if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) return; + mutex_lock(&rnp->boost_kthread_mutex); for_each_leaf_node_possible_cpu(rnp, cpu) if ((mask & leaf_node_cpu_bit(rnp, cpu)) && cpu != outgoingcpu) @@ -1218,6 +1223,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) if (cpumask_weight(cm) == 0) cpumask_copy(cm, housekeeping_cpumask(HK_FLAG_RCU)); set_cpus_allowed_ptr(t, cm); + mutex_unlock(&rnp->boost_kthread_mutex); free_cpumask_var(cm); } -- cgit v1.2.3 From 1fe09ebe7a9c9907f516779fbe4954165dd01529 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 18 Dec 2021 09:30:33 -0800 Subject: rcu: Inline __call_rcu() into call_rcu() Because __call_rcu() is invoked only by call_rcu(), this commit inlines the former into the latter. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 91 +++++++++++++++++++++++++------------------------------ 1 file changed, 42 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d1d1a8c51223..f1bb7ccc0084 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2995,9 +2995,47 @@ static void check_cb_ovld(struct rcu_data *rdp) raw_spin_unlock_rcu_node(rnp); } -/* Helper function for call_rcu() and friends. */ -static void -__call_rcu(struct rcu_head *head, rcu_callback_t func) +/** + * call_rcu() - Queue an RCU callback for invocation after a grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all pre-existing RCU read-side + * critical sections have completed. However, the callback function + * might well execute concurrently with RCU read-side critical sections + * that started after call_rcu() was invoked. + * + * RCU read-side critical sections are delimited by rcu_read_lock() + * and rcu_read_unlock(), and may be nested. In addition, but only in + * v5.0 and later, regions of code across which interrupts, preemption, + * or softirqs have been disabled also serve as RCU read-side critical + * sections. This includes hardware interrupt handlers, softirq handlers, + * and NMI handlers. + * + * Note that all CPUs must agree that the grace period extended beyond + * all pre-existing RCU read-side critical section. On systems with more + * than one CPU, this means that when "func()" is invoked, each CPU is + * guaranteed to have executed a full memory barrier since the end of its + * last RCU read-side critical section whose beginning preceded the call + * to call_rcu(). 
It also means that each CPU executing an RCU read-side + * critical section that continues beyond the start of "func()" must have + * executed a memory barrier after the call_rcu() but before the beginning + * of that RCU read-side critical section. Note that these guarantees + * include CPUs that are offline, idle, or executing in user mode, as + * well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * resulting RCU callback function "func()", then both CPU A and CPU B are + * guaranteed to execute a full memory barrier during the time interval + * between the call to call_rcu() and the invocation of "func()" -- even + * if CPU A and CPU B are the same CPU (but again only if the system has + * more than one CPU). + * + * Implementation of these memory-ordering guarantees is described here: + * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. + */ +void call_rcu(struct rcu_head *head, rcu_callback_t func) { static atomic_t doublefrees; unsigned long flags; @@ -3011,7 +3049,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) /* * Probable double call_rcu(), so leak the callback. * Use rcu:rcu_callback trace event to find the previous - * time callback was passed to __call_rcu(). + * time callback was passed to call_rcu(). */ if (atomic_inc_return(&doublefrees) < 4) { pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func); @@ -3060,51 +3098,6 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) local_irq_restore(flags); } } - -/** - * call_rcu() - Queue an RCU callback for invocation after a grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all pre-existing RCU read-side - * critical sections have completed. 
However, the callback function - * might well execute concurrently with RCU read-side critical sections - * that started after call_rcu() was invoked. - * - * RCU read-side critical sections are delimited by rcu_read_lock() - * and rcu_read_unlock(), and may be nested. In addition, but only in - * v5.0 and later, regions of code across which interrupts, preemption, - * or softirqs have been disabled also serve as RCU read-side critical - * sections. This includes hardware interrupt handlers, softirq handlers, - * and NMI handlers. - * - * Note that all CPUs must agree that the grace period extended beyond - * all pre-existing RCU read-side critical section. On systems with more - * than one CPU, this means that when "func()" is invoked, each CPU is - * guaranteed to have executed a full memory barrier since the end of its - * last RCU read-side critical section whose beginning preceded the call - * to call_rcu(). It also means that each CPU executing an RCU read-side - * critical section that continues beyond the start of "func()" must have - * executed a memory barrier after the call_rcu() but before the beginning - * of that RCU read-side critical section. Note that these guarantees - * include CPUs that are offline, idle, or executing in user mode, as - * well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the - * resulting RCU callback function "func()", then both CPU A and CPU B are - * guaranteed to execute a full memory barrier during the time interval - * between the call to call_rcu() and the invocation of "func()" -- even - * if CPU A and CPU B are the same CPU (but again only if the system has - * more than one CPU). - * - * Implementation of these memory-ordering guarantees is described here: - * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. 
- */ -void call_rcu(struct rcu_head *head, rcu_callback_t func) -{ - __call_rcu(head, func); -} EXPORT_SYMBOL_GPL(call_rcu); -- cgit v1.2.3 From d818cc76e2b4d5f6cebf8c7ce1160d652d7e572b Mon Sep 17 00:00:00 2001 From: Zqiang Date: Sun, 26 Dec 2021 08:52:04 +0800 Subject: kasan: Record work creation stack trace with interrupts enabled Recording the work creation stack trace for KASAN reports in call_rcu() is expensive, due to unwinding the stack, but also due to acquiring depot_lock inside stackdepot (which may be contended). Because calling kasan_record_aux_stack_noalloc() does not require interrupts to already be disabled, this may unnecessarily extend the time with interrupts disabled. Therefore, move the call to kasan_record_aux_stack_noalloc() before the section with interrupts disabled. Acked-by: Marco Elver Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f1bb7ccc0084..ca8d7dd026ee 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3060,8 +3060,8 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) } head->func = func; head->next = NULL; - local_irq_save(flags); kasan_record_aux_stack_noalloc(head); + local_irq_save(flags); rdp = this_cpu_ptr(&rcu_data); /* Add the callback to our list. */ -- cgit v1.2.3 From c09929031018913b5783872a8b8cdddef4a543c7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 4 Jan 2022 10:34:34 -0800 Subject: rcu: Mark writes to the rcu_segcblist structure's ->flags field KCSAN reports data races between the rcu_segcblist_clear_flags() and rcu_segcblist_set_flags() functions, though misreporting the latter as a call to rcu_segcblist_is_enabled() from call_rcu(). This commit converts the updates of this field to WRITE_ONCE(), relying on the resulting unmarked reads to continue to detect buggy concurrent writes to this field.
Reported-by: Zhouyi Zhou Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker --- kernel/rcu/rcu_segcblist.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index e373fbe44da5..431cee212467 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -56,13 +56,13 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) static inline void rcu_segcblist_set_flags(struct rcu_segcblist *rsclp, int flags) { - rsclp->flags |= flags; + WRITE_ONCE(rsclp->flags, rsclp->flags | flags); } static inline void rcu_segcblist_clear_flags(struct rcu_segcblist *rsclp, int flags) { - rsclp->flags &= ~flags; + WRITE_ONCE(rsclp->flags, rsclp->flags & ~flags); } static inline bool rcu_segcblist_test_flags(struct rcu_segcblist *rsclp, -- cgit v1.2.3 From 58d4292bd037b01fbb940a5170817f7d40caa9d5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 14 Jan 2022 16:07:28 -0800 Subject: rcu: Uninline multi-use function: finish_rcuwait() This is a rarely used function, so uninlining its 3 instructions is probably a win or a wash - but the main motivation is to make <linux/rcuwait.h> independent of task_struct details. Signed-off-by: Ingo Molnar Signed-off-by: Paul E.
McKenney --- include/linux/rcuwait.h | 6 +----- kernel/rcu/update.c | 7 +++++++ 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h index 61c56cca95c4..8052d34da782 100644 --- a/include/linux/rcuwait.h +++ b/include/linux/rcuwait.h @@ -47,11 +47,7 @@ static inline void prepare_to_rcuwait(struct rcuwait *w) rcu_assign_pointer(w->task, current); } -static inline void finish_rcuwait(struct rcuwait *w) -{ - rcu_assign_pointer(w->task, NULL); - __set_current_state(TASK_RUNNING); -} +extern void finish_rcuwait(struct rcuwait *w); #define rcuwait_wait_event(w, condition, state) \ ({ \ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 156892c22bb5..180ff9c41fa8 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -407,6 +407,13 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, } EXPORT_SYMBOL_GPL(__wait_rcu_gp); +void finish_rcuwait(struct rcuwait *w) +{ + rcu_assign_pointer(w->task, NULL); + __set_current_state(TASK_RUNNING); +} +EXPORT_SYMBOL_GPL(finish_rcuwait); + #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD void init_rcu_head(struct rcu_head *head) { -- cgit v1.2.3 From 6a2c1d450a6a328027280a854019c55de989e14e Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sun, 23 Jan 2022 10:38:53 -0800 Subject: rcu: Replace cpumask_weight with cpumask_empty where appropriate In some places, RCU code calls cpumask_weight() to check if any bit of a given cpumask is set. We can do it more efficiently with cpumask_empty() because cpumask_empty() stops traversing the cpumask as soon as it finds first set bit, while cpumask_weight() counts all bits unconditionally. Signed-off-by: Yury Norov Acked-by: Frederic Weisbecker Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_nocb.h | 4 ++-- kernel/rcu/tree_plugin.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index eeafb546a7a0..f83c7b1d6110 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1169,7 +1169,7 @@ void __init rcu_init_nohz(void) struct rcu_data *rdp; #if defined(CONFIG_NO_HZ_FULL) - if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) + if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) need_rcu_nocb_mask = true; #endif /* #if defined(CONFIG_NO_HZ_FULL) */ @@ -1348,7 +1348,7 @@ static void __init rcu_organize_nocb_kthreads(void) */ void rcu_bind_current_to_nocb(void) { - if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask)) + if (cpumask_available(rcu_nocb_mask) && !cpumask_empty(rcu_nocb_mask)) WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask)); } EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 07845dcd33c5..efd0c87d2ffa 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1220,7 +1220,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); cpumask_and(cm, cm, housekeeping_cpumask(HK_FLAG_RCU)); - if (cpumask_weight(cm) == 0) + if (cpumask_empty(cm)) cpumask_copy(cm, housekeeping_cpumask(HK_FLAG_RCU)); set_cpus_allowed_ptr(t, cm); mutex_unlock(&rnp->boost_kthread_mutex); -- cgit v1.2.3 From a6cbd44093ef305b02ad5f80ed54abf0148a696c Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 7 Feb 2022 15:02:47 -0800 Subject: kernel/fork: Initialize mm's PASID A new mm doesn't have a PASID yet when it's created. Initialize the mm's PASID on fork() or for init_mm to INVALID_IOASID (-1). INIT_PASID (0) is reserved for kernel legacy DMA PASID. It cannot be allocated to a user process. 
Initializing the process's PASID to 0 may cause confusion as to why the process uses the reserved kernel legacy DMA PASID. Initializing the PASID to INVALID_IOASID (-1) explicitly indicates that the process doesn't have a valid PASID yet. Even though the only user of mm_pasid_init() is in fork.c, define it in <linux/sched/mm.h> as the first of three mm/pasid life cycle functions (init/set/drop) to keep these all together. Suggested-by: Dave Hansen Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220207230254.3342514-5-fenghua.yu@intel.com --- include/linux/sched/mm.h | 10 ++++++++++ kernel/fork.c | 10 ++-------- mm/init-mm.c | 4 ++++ 3 files changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index aa5f09ca5bcf..c74d1edbac2f 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -8,6 +8,7 @@ #include #include #include +#include /* * Routines for handling mm_structs @@ -433,4 +434,13 @@ static inline void membarrier_update_current_mm(struct mm_struct *next_mm) } #endif +#ifdef CONFIG_IOMMU_SVA +static inline void mm_pasid_init(struct mm_struct *mm) +{ + mm->pasid = INVALID_IOASID; +} +#else +static inline void mm_pasid_init(struct mm_struct *mm) {} +#endif + #endif /* _LINUX_SCHED_MM_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 6ee7551d3bd2..deacd2c17a7f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -97,6 +97,7 @@ #include #include #include +#include #include #include @@ -1019,13 +1020,6 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) #endif } -static void mm_init_pasid(struct mm_struct *mm) -{ -#ifdef CONFIG_IOMMU_SVA - mm->pasid = INIT_PASID; -#endif -} - static void mm_init_uprobes_state(struct mm_struct *mm) { #ifdef CONFIG_UPROBES @@ -1054,7 +1048,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_cpumask(mm);
mm_init_aio(mm); mm_init_owner(mm, p); - mm_init_pasid(mm); + mm_pasid_init(mm); RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_subscriptions_init(mm); init_tlb_flush_pending(mm); diff --git a/mm/init-mm.c b/mm/init-mm.c index b4a6f38fb51d..fbe7844d0912 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -10,6 +10,7 @@ #include #include +#include #include #ifndef INIT_MM_CONTEXT @@ -38,6 +39,9 @@ struct mm_struct init_mm = { .mmlist = LIST_HEAD_INIT(init_mm.mmlist), .user_ns = &init_user_ns, .cpu_bitmap = CPU_BITS_NONE, +#ifdef CONFIG_IOMMU_SVA + .pasid = INVALID_IOASID, +#endif INIT_MM_CONTEXT(init_mm) }; -- cgit v1.2.3 From 05c7b7a92cc87ff8d7fde189d0fade250697573c Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Fri, 21 Jan 2022 18:12:10 +0800 Subject: cgroup/cpuset: Fix a race between cpuset_attach() and cpu hotplug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As previously discussed (https://lkml.org/lkml/2022/1/20/51), cpuset_attach() is affected by a similar cpu hotplug race, as in the following scenario:

     cpuset_attach()                            cpu hotplug
    ---------------------------                ----------------------
    down_write(cpuset_rwsem)
    guarantee_online_cpus() // (load cpus_attach)
                                               sched_cpu_deactivate
                                                 set_cpu_active()
                                                 // will change cpu_active_mask
    set_cpus_allowed_ptr(cpus_attach)
      __set_cpus_allowed_ptr_locked()
       // (if the intersection of cpus_attach and
         cpu_active_mask is empty, will return -EINVAL)
    up_write(cpuset_rwsem)

To avoid races such as described above, protect the cpuset_attach() call with cpu_hotplug_lock.
Fixes: be367d099270 ("cgroups: let ss->can_attach and ss->attach do whole threadgroups at a time") Cc: stable@vger.kernel.org # v2.6.32+ Reported-by: Zhao Gongyi Signed-off-by: Zhang Qiao Acked-by: Waiman Long Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4c7254e8f49a..97c53f3cc917 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2289,6 +2289,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); cs = css_cs(css); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); guarantee_online_mems(cs, &cpuset_attach_nodemask_to); @@ -2342,6 +2343,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) wake_up(&cpuset_attach_wq); percpu_up_write(&cpuset_rwsem); + cpus_read_unlock(); } /* The various types of files and directories in a cpuset file system */ -- cgit v1.2.3 From 701fac40384f07197b106136012804c3cae0b3de Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 7 Feb 2022 15:02:48 -0800 Subject: iommu/sva: Assign a PASID to mm on PASID allocation and free it on mm exit PASIDs are process-wide. It was attempted to use refcounted PASIDs to free them when the last thread drops the refcount. This turned out to be complex and error prone. Given the fact that the PASID space is 20 bits, which allows up to 1M processes to have a PASID associated concurrently, PASID resource exhaustion is not a realistic concern. Therefore, it was decided to simplify the approach and stick with lazy on demand PASID allocation, but drop the eager free approach and make an allocated PASID's lifetime bound to the lifetime of the process. Get rid of the refcounting mechanisms and replace/rename the interfaces to reflect this new approach. [ bp: Massage commit message. 
] Suggested-by: Dave Hansen Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Reviewed-by: Lu Baolu Reviewed-by: Jacob Pan Reviewed-by: Thomas Gleixner Acked-by: Joerg Roedel Link: https://lore.kernel.org/r/20220207230254.3342514-6-fenghua.yu@intel.com --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 5 +--- drivers/iommu/intel/iommu.c | 4 +-- drivers/iommu/intel/svm.c | 9 ------ drivers/iommu/ioasid.c | 39 +++---------------------- drivers/iommu/iommu-sva-lib.c | 39 ++++++++----------------- drivers/iommu/iommu-sva-lib.h | 1 - include/linux/ioasid.h | 12 ++------ include/linux/sched/mm.h | 16 ++++++++++ kernel/fork.c | 1 + 9 files changed, 38 insertions(+), 88 deletions(-) (limited to 'kernel') diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index a737ba5f727e..22ddd05bbdcd 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -340,14 +340,12 @@ __arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm) bond->smmu_mn = arm_smmu_mmu_notifier_get(smmu_domain, mm); if (IS_ERR(bond->smmu_mn)) { ret = PTR_ERR(bond->smmu_mn); - goto err_free_pasid; + goto err_free_bond; } list_add(&bond->list, &master->bonds); return &bond->sva; -err_free_pasid: - iommu_sva_free_pasid(mm); err_free_bond: kfree(bond); return ERR_PTR(ret); @@ -377,7 +375,6 @@ void arm_smmu_sva_unbind(struct iommu_sva *handle) if (refcount_dec_and_test(&bond->refs)) { list_del(&bond->list); arm_smmu_mmu_notifier_put(bond->smmu_mn); - iommu_sva_free_pasid(bond->mm); kfree(bond); } mutex_unlock(&sva_lock); diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 92fea3fbbb11..ef03b2176bbd 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4781,7 +4781,7 @@ attach_failed: link_failed: spin_unlock_irqrestore(&device_domain_lock, flags); if (list_empty(&domain->subdevices) && domain->default_pasid > 0) 
- ioasid_put(domain->default_pasid); + ioasid_free(domain->default_pasid); return ret; } @@ -4811,7 +4811,7 @@ static void aux_domain_remove_dev(struct dmar_domain *domain, spin_unlock_irqrestore(&device_domain_lock, flags); if (list_empty(&domain->subdevices) && domain->default_pasid > 0) - ioasid_put(domain->default_pasid); + ioasid_free(domain->default_pasid); } static int prepare_domain_attach_device(struct iommu_domain *domain, diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 5b5d69b04fcc..51ac2096b3da 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -514,11 +514,6 @@ static int intel_svm_alloc_pasid(struct device *dev, struct mm_struct *mm, return iommu_sva_alloc_pasid(mm, PASID_MIN, max_pasid - 1); } -static void intel_svm_free_pasid(struct mm_struct *mm) -{ - iommu_sva_free_pasid(mm); -} - static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, struct device *dev, struct mm_struct *mm, @@ -662,8 +657,6 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) kfree(svm); } } - /* Drop a PASID reference and free it if no reference. */ - intel_svm_free_pasid(mm); } out: return ret; @@ -1047,8 +1040,6 @@ struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void } sva = intel_svm_bind_mm(iommu, dev, mm, flags); - if (IS_ERR_OR_NULL(sva)) - intel_svm_free_pasid(mm); mutex_unlock(&pasid_mutex); return sva; diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c index 06fee7416816..a786c034907c 100644 --- a/drivers/iommu/ioasid.c +++ b/drivers/iommu/ioasid.c @@ -2,7 +2,7 @@ /* * I/O Address Space ID allocator. There is one global IOASID space, split into * subsets. Users create a subset with DECLARE_IOASID_SET, then allocate and - * free IOASIDs with ioasid_alloc and ioasid_put. + * free IOASIDs with ioasid_alloc() and ioasid_free(). 
*/ #include #include @@ -15,7 +15,6 @@ struct ioasid_data { struct ioasid_set *set; void *private; struct rcu_head rcu; - refcount_t refs; }; /* @@ -315,7 +314,6 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, data->set = set; data->private = private; - refcount_set(&data->refs, 1); /* * Custom allocator needs allocator data to perform platform specific @@ -348,35 +346,11 @@ exit_free: EXPORT_SYMBOL_GPL(ioasid_alloc); /** - * ioasid_get - obtain a reference to the IOASID - * @ioasid: the ID to get - */ -void ioasid_get(ioasid_t ioasid) -{ - struct ioasid_data *ioasid_data; - - spin_lock(&ioasid_allocator_lock); - ioasid_data = xa_load(&active_allocator->xa, ioasid); - if (ioasid_data) - refcount_inc(&ioasid_data->refs); - else - WARN_ON(1); - spin_unlock(&ioasid_allocator_lock); -} -EXPORT_SYMBOL_GPL(ioasid_get); - -/** - * ioasid_put - Release a reference to an ioasid + * ioasid_free - Free an ioasid * @ioasid: the ID to remove - * - * Put a reference to the IOASID, free it when the number of references drops to - * zero. - * - * Return: %true if the IOASID was freed, %false otherwise. 
*/ -bool ioasid_put(ioasid_t ioasid) +void ioasid_free(ioasid_t ioasid) { - bool free = false; struct ioasid_data *ioasid_data; spin_lock(&ioasid_allocator_lock); @@ -386,10 +360,6 @@ bool ioasid_put(ioasid_t ioasid) goto exit_unlock; } - free = refcount_dec_and_test(&ioasid_data->refs); - if (!free) - goto exit_unlock; - active_allocator->ops->free(ioasid, active_allocator->ops->pdata); /* Custom allocator needs additional steps to free the xa element */ if (active_allocator->flags & IOASID_ALLOCATOR_CUSTOM) { @@ -399,9 +369,8 @@ bool ioasid_put(ioasid_t ioasid) exit_unlock: spin_unlock(&ioasid_allocator_lock); - return free; } -EXPORT_SYMBOL_GPL(ioasid_put); +EXPORT_SYMBOL_GPL(ioasid_free); /** * ioasid_find - Find IOASID data diff --git a/drivers/iommu/iommu-sva-lib.c b/drivers/iommu/iommu-sva-lib.c index bd41405d34e9..106506143896 100644 --- a/drivers/iommu/iommu-sva-lib.c +++ b/drivers/iommu/iommu-sva-lib.c @@ -18,8 +18,7 @@ static DECLARE_IOASID_SET(iommu_sva_pasid); * * Try to allocate a PASID for this mm, or take a reference to the existing one * provided it fits within the [@min, @max] range. On success the PASID is - * available in mm->pasid, and must be released with iommu_sva_free_pasid(). - * @min must be greater than 0, because 0 indicates an unused mm->pasid. + * available in mm->pasid and will be available for the lifetime of the mm. * * Returns 0 on success and < 0 on error. */ @@ -33,38 +32,24 @@ int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max) return -EINVAL; mutex_lock(&iommu_sva_lock); - if (mm->pasid) { - if (mm->pasid >= min && mm->pasid <= max) - ioasid_get(mm->pasid); - else + /* Is a PASID already associated with this mm? 
*/ + if (pasid_valid(mm->pasid)) { + if (mm->pasid < min || mm->pasid >= max) ret = -EOVERFLOW; - } else { - pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm); - if (pasid == INVALID_IOASID) - ret = -ENOMEM; - else - mm->pasid = pasid; + goto out; } + + pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm); + if (!pasid_valid(pasid)) + ret = -ENOMEM; + else + mm_pasid_set(mm, pasid); +out: mutex_unlock(&iommu_sva_lock); return ret; } EXPORT_SYMBOL_GPL(iommu_sva_alloc_pasid); -/** - * iommu_sva_free_pasid - Release the mm's PASID - * @mm: the mm - * - * Drop one reference to a PASID allocated with iommu_sva_alloc_pasid() - */ -void iommu_sva_free_pasid(struct mm_struct *mm) -{ - mutex_lock(&iommu_sva_lock); - if (ioasid_put(mm->pasid)) - mm->pasid = 0; - mutex_unlock(&iommu_sva_lock); -} -EXPORT_SYMBOL_GPL(iommu_sva_free_pasid); - /* ioasid_find getter() requires a void * argument */ static bool __mmget_not_zero(void *mm) { diff --git a/drivers/iommu/iommu-sva-lib.h b/drivers/iommu/iommu-sva-lib.h index 95dc3ebc1928..8909ea1094e3 100644 --- a/drivers/iommu/iommu-sva-lib.h +++ b/drivers/iommu/iommu-sva-lib.h @@ -9,7 +9,6 @@ #include int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max); -void iommu_sva_free_pasid(struct mm_struct *mm); struct mm_struct *iommu_sva_find(ioasid_t pasid); /* I/O Page fault */ diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h index 2237f64dbaae..af1c9d62e642 100644 --- a/include/linux/ioasid.h +++ b/include/linux/ioasid.h @@ -34,8 +34,7 @@ struct ioasid_allocator_ops { #if IS_ENABLED(CONFIG_IOASID) ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max, void *private); -void ioasid_get(ioasid_t ioasid); -bool ioasid_put(ioasid_t ioasid); +void ioasid_free(ioasid_t ioasid); void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, bool (*getter)(void *)); int ioasid_register_allocator(struct ioasid_allocator_ops *allocator); @@ -53,14 +52,7 @@ static inline ioasid_t ioasid_alloc(struct 
ioasid_set *set, ioasid_t min, return INVALID_IOASID; } -static inline void ioasid_get(ioasid_t ioasid) -{ -} - -static inline bool ioasid_put(ioasid_t ioasid) -{ - return false; -} +static inline void ioasid_free(ioasid_t ioasid) { } static inline void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid, bool (*getter)(void *)) diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index c74d1edbac2f..a80356e9dc69 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -439,8 +439,24 @@ static inline void mm_pasid_init(struct mm_struct *mm) { mm->pasid = INVALID_IOASID; } + +/* Associate a PASID with an mm_struct: */ +static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid) +{ + mm->pasid = pasid; +} + +static inline void mm_pasid_drop(struct mm_struct *mm) +{ + if (pasid_valid(mm->pasid)) { + ioasid_free(mm->pasid); + mm->pasid = INVALID_IOASID; + } +} #else static inline void mm_pasid_init(struct mm_struct *mm) {} +static inline void mm_pasid_set(struct mm_struct *mm, u32 pasid) {} +static inline void mm_pasid_drop(struct mm_struct *mm) {} #endif #endif /* _LINUX_SCHED_MM_H */ diff --git a/kernel/fork.c b/kernel/fork.c index deacd2c17a7f..c03c6682464c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1115,6 +1115,7 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); + mm_pasid_drop(mm); mmdrop(mm); } -- cgit v1.2.3 From a3d29e8291b622780eb6e4e3eeaf2b24ec78fd43 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Feb 2022 15:02:50 -0800 Subject: sched: Define and initialize a flag to identify valid PASID in the task Add a new single bit field to the task structure to track whether this task has initialized the IA32_PASID MSR to the mm's PASID. Initialize the field to zero when creating a new task with fork/clone. 
Signed-off-by: Peter Zijlstra Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220207230254.3342514-8-fenghua.yu@intel.com --- include/linux/sched.h | 3 +++ kernel/fork.c | 4 ++++ 2 files changed, 7 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 75ba8aa60248..4e5de3aed410 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -938,6 +938,9 @@ struct task_struct { /* Recursion prevention for eventfd_signal() */ unsigned in_eventfd_signal:1; #endif +#ifdef CONFIG_IOMMU_SVA + unsigned pasid_activated:1; +#endif unsigned long atomic_flags; /* Flags requiring atomic access. */ diff --git a/kernel/fork.c b/kernel/fork.c index c03c6682464c..51fd1df994b7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -968,6 +968,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->use_memdelay = 0; #endif +#ifdef CONFIG_IOMMU_SVA + tsk->pasid_activated = 0; +#endif + #ifdef CONFIG_MEMCG tsk->active_memcg = NULL; #endif -- cgit v1.2.3 From 45ec846c1cd11835a29c85645065115dd791aa45 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 9 Feb 2022 16:25:58 +0000 Subject: irqdomain: Let irq_domain_set_{info,hwirq_and_chip} take a const irq_chip In order to let a const irqchip be fed to the irqchip layer, adjust the various prototypes. An extra cast in irq_domain_set_hwirq_and_chip() is required to avoid a warning. 
Signed-off-by: Marc Zyngier Acked-by: Linus Walleij Link: https://lore.kernel.org/r/20220209162607.1118325-2-maz@kernel.org --- include/linux/irqdomain.h | 5 +++-- kernel/irq/irqdomain.c | 9 +++++---- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index be25a33293e5..00d577f90883 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -479,7 +479,8 @@ int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest); extern struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, unsigned int virq); extern void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq, struct irq_chip *chip, + irq_hw_number_t hwirq, + const struct irq_chip *chip, void *chip_data, irq_flow_handler_t handler, void *handler_data, const char *handler_name); extern void irq_domain_reset_irq_data(struct irq_data *irq_data); @@ -522,7 +523,7 @@ extern int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, extern int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq, - struct irq_chip *chip, + const struct irq_chip *chip, void *chip_data); extern void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq, diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index bf38c546aa25..d5ce96510549 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1319,7 +1319,8 @@ EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); * @chip_data: The associated chip data */ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq, struct irq_chip *chip, + irq_hw_number_t hwirq, + const struct irq_chip *chip, void *chip_data) { struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); @@ -1328,7 +1329,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, return -ENOENT; irq_data->hwirq = 
hwirq; - irq_data->chip = chip ? chip : &no_irq_chip; + irq_data->chip = (struct irq_chip *)(chip ? chip : &no_irq_chip); irq_data->chip_data = chip_data; return 0; @@ -1347,7 +1348,7 @@ EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip); * @handler_name: The interrupt handler name */ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq, struct irq_chip *chip, + irq_hw_number_t hwirq, const struct irq_chip *chip, void *chip_data, irq_flow_handler_t handler, void *handler_data, const char *handler_name) { @@ -1853,7 +1854,7 @@ EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); * @handler_name: The interrupt handler name */ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq, struct irq_chip *chip, + irq_hw_number_t hwirq, const struct irq_chip *chip, void *chip_data, irq_flow_handler_t handler, void *handler_data, const char *handler_name) { -- cgit v1.2.3 From 393e1280f765661cf39785e967676a4e57324126 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 9 Feb 2022 16:25:59 +0000 Subject: genirq: Allow irq_chip registration functions to take a const irq_chip In order to let a const irqchip be fed to the irqchip layer, adjust the various prototypes. An extra cast in irq_set_chip() is required to avoid a warning.
Signed-off-by: Marc Zyngier Acked-by: Linus Walleij Link: https://lore.kernel.org/r/20220209162607.1118325-3-maz@kernel.org --- include/linux/irq.h | 7 ++++--- kernel/irq/chip.c | 9 +++------ 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 2cb2e2ac2703..f92788ccdba2 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -710,10 +710,11 @@ extern struct irq_chip no_irq_chip; extern struct irq_chip dummy_irq_chip; extern void -irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, +irq_set_chip_and_handler_name(unsigned int irq, const struct irq_chip *chip, irq_flow_handler_t handle, const char *name); -static inline void irq_set_chip_and_handler(unsigned int irq, struct irq_chip *chip, +static inline void irq_set_chip_and_handler(unsigned int irq, + const struct irq_chip *chip, irq_flow_handler_t handle) { irq_set_chip_and_handler_name(irq, chip, handle, NULL); @@ -803,7 +804,7 @@ static inline void irq_set_percpu_devid_flags(unsigned int irq) } /* Set/get chip/data for an IRQ: */ -extern int irq_set_chip(unsigned int irq, struct irq_chip *chip); +extern int irq_set_chip(unsigned int irq, const struct irq_chip *chip); extern int irq_set_handler_data(unsigned int irq, void *data); extern int irq_set_chip_data(unsigned int irq, void *data); extern int irq_set_irq_type(unsigned int irq, unsigned int type); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 24b6f2b40e5e..54af0deb239b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -38,7 +38,7 @@ struct irqaction chained_action = { * @irq: irq number * @chip: pointer to irq chip description structure */ -int irq_set_chip(unsigned int irq, struct irq_chip *chip) +int irq_set_chip(unsigned int irq, const struct irq_chip *chip) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); @@ -46,10 +46,7 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip) if (!desc) return 
-EINVAL; - if (!chip) - chip = &no_irq_chip; - - desc->irq_data.chip = chip; + desc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip); irq_put_desc_unlock(desc, flags); /* * For !CONFIG_SPARSE_IRQ make the irq show up in @@ -1073,7 +1070,7 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data); void -irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, +irq_set_chip_and_handler_name(unsigned int irq, const struct irq_chip *chip, irq_flow_handler_t handle, const char *name) { irq_set_chip(irq, chip); -- cgit v1.2.3 From 0a25cb5544f4f01d2e7c06164555fd9cd6eb64fd Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 10 Feb 2022 09:27:21 +0000 Subject: genirq/debugfs: Use irq_print_chip() when provided by irqchip Since irqchips have the option to implement irq_print_chip, use this when available to output the irqchip name in debugfs. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220215112154.1360040-1-maz@kernel.org --- kernel/irq/debugfs.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index e4cff358b437..2b43f5f5033d 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -69,8 +69,12 @@ irq_debug_show_chip(struct seq_file *m, struct irq_data *data, int ind) seq_printf(m, "chip: None\n"); return; } - seq_printf(m, "%*schip: %s\n", ind, "", chip->name); - seq_printf(m, "%*sflags: 0x%lx\n", ind + 1, "", chip->flags); + seq_printf(m, "%*schip: ", ind, ""); + if (chip->irq_print_chip) + chip->irq_print_chip(data, m); + else + seq_printf(m, "%s", chip->name); + seq_printf(m, "\n%*sflags: 0x%lx\n", ind + 1, "", chip->flags); irq_debug_show_bits(m, ind, chip->flags, irqchip_flags, ARRAY_SIZE(irqchip_flags)); } -- cgit v1.2.3 From 2ba3673d70178bf07fb75ff25c54bc478add4021 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Fri, 11 Feb 2022 12:29:37 +0106 
Subject: printk: use atomic updates for klogd work The per-cpu @printk_pending variable can be updated from sleepable contexts, such as: get_random_bytes() warn_unseeded_randomness() printk_deferred() defer_console_output() and can be updated from interrupt contexts, such as: handle_irq_event_percpu() __irq_wake_thread() wake_up_process() try_to_wake_up() select_task_rq() select_fallback_rq() printk_deferred() defer_console_output() and can be updated from NMI contexts, such as: vprintk() if (in_nmi()) defer_console_output() Therefore the atomic variant of the updating functions must be used. Replace __this_cpu_xchg() with this_cpu_xchg(). Replace __this_cpu_or() with this_cpu_or(). Reported-by: Sebastian Andrzej Siewior Signed-off-by: John Ogness Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/87iltld4ue.fsf@jogness.linutronix.de --- kernel/printk/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 155229f0cf0f..25dce8b74791 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3226,7 +3226,7 @@ static DEFINE_PER_CPU(int, printk_pending); static void wake_up_klogd_work_func(struct irq_work *irq_work) { - int pending = __this_cpu_xchg(printk_pending, 0); + int pending = this_cpu_xchg(printk_pending, 0); if (pending & PRINTK_PENDING_OUTPUT) { /* If trylock fails, someone else is doing the printing */ @@ -3260,7 +3260,7 @@ void defer_console_output(void) return; preempt_disable(); - __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); + this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); preempt_enable(); } -- cgit v1.2.3 From 8cbf062a250ed52148badf6f3ffd03657dd4a3f0 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 15 Feb 2022 14:57:32 +0800 Subject: bpf: Reject kfunc calls that overflow insn->imm Now kfunc call uses s32 to represent the offset between the address of kfunc and 
__bpf_call_base, but it doesn't check whether or not the s32 overflows. The overflow is possible when the kfunc is in a module and the offset between the module and the kernel is greater than 2GB. Take arm64 as an example: before commit b2eed9b58811 ("arm64/kernel: kaslr: reduce module randomization range to 2 GB"), the offset between a module symbol and __bpf_call_base could be anywhere in a 4GB range due to KASLR and may overflow s32. So add an extra check to reject these invalid kfunc calls. Signed-off-by: Hou Tao Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220215065732.3179408-1-houtao1@huawei.com --- kernel/bpf/verifier.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bbef86cb4e72..d7473fee247c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1842,6 +1842,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) struct bpf_kfunc_desc *desc; const char *func_name; struct btf *desc_btf; + unsigned long call_imm; unsigned long addr; int err; @@ -1926,9 +1927,17 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return -EINVAL; } + call_imm = BPF_CALL_IMM(addr); + /* Check whether or not the relative offset overflows desc->imm */ + if ((unsigned long)(s32)call_imm != call_imm) { + verbose(env, "address of kernel function %s is out of range\n", + func_name); + return -EINVAL; + } + desc = &tab->descs[tab->nr_descs++]; desc->func_id = func_id; - desc->imm = BPF_CALL_IMM(addr); + desc->imm = call_imm; desc->offset = offset; err = btf_distill_func_proto(&env->log, desc_btf, func_proto, func_name, -- cgit v1.2.3 From e1478d8eaf27704db17a44dee4c53696ed01fc9c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 15 Feb 2022 13:41:04 +0100 Subject: asm-generic: Refactor dereference_[kernel]_function_descriptor() dereference_function_descriptor() and
dereference_kernel_function_descriptor() are identical on the three architectures implementing them. Make them common and put them out-of-line in kernel/extable.c which is one of the users and has similar type of functions. Signed-off-by: Christophe Leroy Reviewed-by: Kees Cook Reviewed-by: Arnd Bergmann Acked-by: Helge Deller Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/449db09b2eba57f4ab05f80102a67d8675bc8bcd.1644928018.git.christophe.leroy@csgroup.eu --- arch/ia64/include/asm/sections.h | 19 ------------------- arch/parisc/include/asm/sections.h | 9 --------- arch/parisc/kernel/process.c | 21 --------------------- arch/powerpc/include/asm/sections.h | 23 ----------------------- include/asm-generic/sections.h | 2 ++ kernel/extable.c | 23 ++++++++++++++++++++++- 6 files changed, 24 insertions(+), 73 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/include/asm/sections.h b/arch/ia64/include/asm/sections.h index 3abe0562b01a..8e0875cf6071 100644 --- a/arch/ia64/include/asm/sections.h +++ b/arch/ia64/include/asm/sections.h @@ -30,23 +30,4 @@ extern char __start_gate_brl_fsys_bubble_down_patchlist[], __end_gate_brl_fsys_b extern char __start_unwind[], __end_unwind[]; extern char __start_ivt_text[], __end_ivt_text[]; -#undef dereference_function_descriptor -static inline void *dereference_function_descriptor(void *ptr) -{ - struct fdesc *desc = ptr; - void *p; - - if (!get_kernel_nofault(p, (void *)&desc->addr)) - ptr = p; - return ptr; -} - -#undef dereference_kernel_function_descriptor -static inline void *dereference_kernel_function_descriptor(void *ptr) -{ - if (ptr < (void *)__start_opd || ptr >= (void *)__end_opd) - return ptr; - return dereference_function_descriptor(ptr); -} - #endif /* _ASM_IA64_SECTIONS_H */ diff --git a/arch/parisc/include/asm/sections.h b/arch/parisc/include/asm/sections.h index ace1d4047a0b..33df42b5cc6d 100644 --- a/arch/parisc/include/asm/sections.h +++ b/arch/parisc/include/asm/sections.h @@ -12,13 +12,4 @@ 
typedef Elf64_Fdesc func_desc_t; extern char __alt_instructions[], __alt_instructions_end[]; -#ifdef CONFIG_64BIT - -#undef dereference_function_descriptor -void *dereference_function_descriptor(void *); - -#undef dereference_kernel_function_descriptor -void *dereference_kernel_function_descriptor(void *); -#endif - #endif diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index ea3d83b6fb62..2030c77592d3 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -263,27 +263,6 @@ __get_wchan(struct task_struct *p) return 0; } -#ifdef CONFIG_64BIT -void *dereference_function_descriptor(void *ptr) -{ - Elf64_Fdesc *desc = ptr; - void *p; - - if (!get_kernel_nofault(p, (void *)&desc->addr)) - ptr = p; - return ptr; -} - -void *dereference_kernel_function_descriptor(void *ptr) -{ - if (ptr < (void *)__start_opd || - ptr >= (void *)__end_opd) - return ptr; - - return dereference_function_descriptor(ptr); -} -#endif - static inline unsigned long brk_rnd(void) { return (get_random_int() & BRK_RND_MASK) << PAGE_SHIFT; diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h index fddfb3937868..8be2c491c733 100644 --- a/arch/powerpc/include/asm/sections.h +++ b/arch/powerpc/include/asm/sections.h @@ -58,29 +58,6 @@ static inline int overlaps_kernel_text(unsigned long start, unsigned long end) (unsigned long)_stext < end; } -#ifdef PPC64_ELF_ABI_v1 - -#undef dereference_function_descriptor -static inline void *dereference_function_descriptor(void *ptr) -{ - struct func_desc *desc = ptr; - void *p; - - if (!get_kernel_nofault(p, (void *)&desc->addr)) - ptr = p; - return ptr; -} - -#undef dereference_kernel_function_descriptor -static inline void *dereference_kernel_function_descriptor(void *ptr) -{ - if (ptr < (void *)__start_opd || ptr >= (void *)__end_opd) - return ptr; - - return dereference_function_descriptor(ptr); -} -#endif /* PPC64_ELF_ABI_v1 */ - #endif #endif /* __KERNEL__ */ diff --git 
a/include/asm-generic/sections.h b/include/asm-generic/sections.h index bbf97502470c..d0f7bdd2fdf2 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -60,6 +60,8 @@ extern __visible const void __nosave_begin, __nosave_end; /* Function descriptor handling (if any). Override in asm/sections.h */ #ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS +void *dereference_function_descriptor(void *ptr); +void *dereference_kernel_function_descriptor(void *ptr); #else #define dereference_function_descriptor(p) ((void *)(p)) #define dereference_kernel_function_descriptor(p) ((void *)(p)) diff --git a/kernel/extable.c b/kernel/extable.c index b6f330f0fe74..394c39b86e38 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -3,6 +3,7 @@ Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. */ +#include #include #include #include @@ -132,12 +133,32 @@ out: } /* - * On some architectures (PPC64, IA64) function pointers + * On some architectures (PPC64, IA64, PARISC) function pointers * are actually only tokens to some data that then holds the * real function address. As a result, to find if a function * pointer is part of the kernel text, we need to do some * special dereferencing first. */ +#ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS +void *dereference_function_descriptor(void *ptr) +{ + func_desc_t *desc = ptr; + void *p; + + if (!get_kernel_nofault(p, (void *)&desc->addr)) + ptr = p; + return ptr; +} + +void *dereference_kernel_function_descriptor(void *ptr) +{ + if (ptr < (void *)__start_opd || ptr >= (void *)__end_opd) + return ptr; + + return dereference_function_descriptor(ptr); +} +#endif + int func_ptr_is_kernel_text(void *ptr) { unsigned long addr; -- cgit v1.2.3 From b64913394f123e819bffabc79a0e48f98e78dc5d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 15 Feb 2022 13:41:06 +0100 Subject: lkdtm: Really write into kernel text in WRITE_KERN WRITE_KERN is supposed to overwrite some kernel text, namely do_overwritten() function. 
But for the time being it overwrites the do_overwritten() function descriptor, not the function text. Fix it by dereferencing the function descriptor to obtain the function text pointer. Export dereference_function_descriptor() for when LKDTM is built as a module. And make do_overwritten() noinline so that it is really do_overwritten() which is called by lkdtm_WRITE_KERN(). Signed-off-by: Christophe Leroy Acked-by: Kees Cook Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/31e58eaffb5bc51c07d8d4891d1982100ade8cfc.1644928018.git.christophe.leroy@csgroup.eu --- drivers/misc/lkdtm/perms.c | 8 +++++--- kernel/extable.c | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/drivers/misc/lkdtm/perms.c b/drivers/misc/lkdtm/perms.c index 60b3b2fe929d..035fcca441f0 100644 --- a/drivers/misc/lkdtm/perms.c +++ b/drivers/misc/lkdtm/perms.c @@ -10,6 +10,7 @@ #include #include #include +#include /* Whether or not to fill the target memory area with do_nothing(). */ #define CODE_WRITE true @@ -37,7 +38,7 @@ static noinline void do_nothing(void) } /* Must immediately follow do_nothing for size calculuations to work out.
*/ -static void do_overwritten(void) +static noinline void do_overwritten(void) { pr_info("do_overwritten wasn't overwritten!\n"); return; @@ -113,8 +114,9 @@ void lkdtm_WRITE_KERN(void) size_t size; volatile unsigned char *ptr; - size = (unsigned long)do_overwritten - (unsigned long)do_nothing; - ptr = (unsigned char *)do_overwritten; + size = (unsigned long)dereference_function_descriptor(do_overwritten) - + (unsigned long)dereference_function_descriptor(do_nothing); + ptr = dereference_function_descriptor(do_overwritten); pr_info("attempting bad %zu byte write at %px\n", size, ptr); memcpy((void *)ptr, (unsigned char *)do_nothing, size); diff --git a/kernel/extable.c b/kernel/extable.c index 394c39b86e38..bda5e9761541 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -149,6 +149,7 @@ void *dereference_function_descriptor(void *ptr) ptr = p; return ptr; } +EXPORT_SYMBOL_GPL(dereference_function_descriptor); void *dereference_kernel_function_descriptor(void *ptr) { -- cgit v1.2.3 From 1087ad4e3f88c474b8134a482720782922bf3fdf Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 10 Feb 2022 14:49:06 -0800 Subject: sched: replace cpumask_weight with cpumask_empty where appropriate In some places, kernel/sched code calls cpumask_weight() to check if any bit of a given cpumask is set. We can do it more efficiently with cpumask_empty() because cpumask_empty() stops traversing the cpumask as soon as it finds first set bit, while cpumask_weight() counts all bits unconditionally. 
Signed-off-by: Yury Norov Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220210224933.379149-23-yury.norov@gmail.com --- kernel/sched/core.c | 2 +- kernel/sched/topology.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1d863d7f6ad7..c620aab27acf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8708,7 +8708,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur, { int ret = 1; - if (!cpumask_weight(cur)) + if (cpumask_empty(cur)) return ret; ret = dl_cpuset_cpumask_can_shrink(cur, trial); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index e6cd55951304..1c84b48def20 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -74,7 +74,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!cpumask_weight(sched_group_span(group))) { + if (cpumask_empty(sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); break; -- cgit v1.2.3 From 0fb3978b0aac3a5c08637aed03cc2d65f793508f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 14 Feb 2022 20:15:52 +0800 Subject: sched/numa: Fix NUMA topology for systems with CPU-less nodes The NUMA topology parameters (sched_numa_topology_type, sched_domains_numa_levels, and sched_max_numa_distance, etc.) identified by scheduler may be wrong for systems with CPU-less nodes. 
For example, the ACPI SLIT of a system with CPU-less persistent memory (Intel Optane DCPMM) nodes is as follows, [000h 0000 4] Signature : "SLIT" [System Locality Information Table] [004h 0004 4] Table Length : 0000042C [008h 0008 1] Revision : 01 [009h 0009 1] Checksum : 59 [00Ah 0010 6] Oem ID : "XXXX" [010h 0016 8] Oem Table ID : "XXXXXXX" [018h 0024 4] Oem Revision : 00000001 [01Ch 0028 4] Asl Compiler ID : "INTL" [020h 0032 4] Asl Compiler Revision : 20091013 [024h 0036 8] Localities : 0000000000000004 [02Ch 0044 4] Locality 0 : 0A 15 11 1C [030h 0048 4] Locality 1 : 15 0A 1C 11 [034h 0052 4] Locality 2 : 11 1C 0A 1C [038h 0056 4] Locality 3 : 1C 11 1C 0A While the `numactl -H` output is as follows, available: 4 nodes (0-3) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 0 size: 64136 MB node 0 free: 5981 MB node 1 cpus: 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 node 1 size: 64466 MB node 1 free: 10415 MB node 2 cpus: node 2 size: 253952 MB node 2 free: 253920 MB node 3 cpus: node 3 size: 253952 MB node 3 free: 253951 MB node distances: node 0 1 2 3 0: 10 21 17 28 1: 21 10 28 17 2: 17 28 10 28 3: 28 17 28 10 In this system, there are only 2 sockets. In each memory controller, both DRAM and PMEM DIMMs are installed. Although the physical NUMA topology is simple, the logical NUMA topology becomes a little complex. Because both the distance(0, 1) and distance (1, 3) are less than the distance (0, 3), it appears that node 1 sits between node 0 and node 3. And the whole system appears to be a glueless mesh NUMA topology type. But it's definitely not, there is even no CPU in node 3. This isn't a practical problem now yet. Because the PMEM nodes (node 2 and node 3 in example system) are offlined by default during system boot. 
So init_numa_topology_type() called during system boot will ignore them and set sched_numa_topology_type to NUMA_DIRECT. And init_numa_topology_type() is only called at runtime when a CPU of a never-onlined-before node gets plugged in. And there's no CPU in the PMEM nodes. But it appears better to fix this to make the code more robust. To test the potential problem, we used a debug patch to call init_numa_topology_type() when the PMEM node is onlined (in __set_migration_target_nodes()). With that, the NUMA parameters identified by the scheduler are as follows, sched_numa_topology_type: NUMA_GLUELESS_MESH sched_domains_numa_levels: 4 sched_max_numa_distance: 28 To fix the issue, the CPU-less nodes are ignored when the NUMA topology parameters are identified. Because a node may become CPU-less or not at run time because of CPU hotplug, the NUMA topology parameters need to be re-initialized at runtime for CPU hotplug too. With the patch, the NUMA parameters identified for the example system above are as follows, sched_numa_topology_type: NUMA_DIRECT sched_domains_numa_levels: 2 sched_max_numa_distance: 21 Suggested-by: Peter Zijlstra Signed-off-by: "Huang, Ying" Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220214121553.582248-1-ying.huang@intel.com --- kernel/sched/core.c | 5 +- kernel/sched/fair.c | 15 ++-- kernel/sched/sched.h | 6 +- kernel/sched/topology.c | 206 ++++++++++++++++++++++++++++-------------------- 4 files changed, 137 insertions(+), 95 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c620aab27acf..b2226922206d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9052,6 +9052,7 @@ int sched_cpu_activate(unsigned int cpu) set_cpu_active(cpu, true); if (sched_smp_initialized) { + sched_update_numa(cpu, true); sched_domains_numa_masks_set(cpu); cpuset_cpu_active(); } @@ -9130,10 +9131,12 @@ int sched_cpu_deactivate(unsigned int cpu) if (!sched_smp_initialized) return 0; +
sched_update_numa(cpu, false); ret = cpuset_cpu_inactive(cpu); if (ret) { balance_push_set(cpu, false); set_cpu_active(cpu, true); + sched_update_numa(cpu, true); return ret; } sched_domains_numa_masks_clear(cpu); @@ -9236,7 +9239,7 @@ int sched_cpu_dying(unsigned int cpu) void __init sched_init_smp(void) { - sched_init_numa(); + sched_init_numa(NUMA_NO_NODE); /* * There's no userspace yet to cause hotplug operations; hence all the diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5c4bfffe8c2c..da3230b84250 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1259,10 +1259,10 @@ static bool numa_is_active_node(int nid, struct numa_group *ng) /* Handle placement on systems where not all nodes are directly connected. */ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, - int maxdist, bool task) + int lim_dist, bool task) { unsigned long score = 0; - int node; + int node, max_dist; /* * All nodes are directly connected, and the same distance @@ -1271,6 +1271,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, if (sched_numa_topology_type == NUMA_DIRECT) return 0; + /* sched_max_numa_distance may be changed in parallel. */ + max_dist = READ_ONCE(sched_max_numa_distance); /* * This code is called for each node, introducing N^2 complexity, * which should be ok given the number of nodes rarely exceeds 8. @@ -1283,7 +1285,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * The furthest away nodes in the system are not interesting * for placement; nid was already counted. */ - if (dist == sched_max_numa_distance || node == nid) + if (dist >= max_dist || node == nid) continue; /* @@ -1293,8 +1295,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * "hoplimit", only nodes closer by than "hoplimit" are part * of each group. Skip other nodes. 
*/ - if (sched_numa_topology_type == NUMA_BACKPLANE && - dist >= maxdist) + if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist) continue; /* Add up the faults from nearby nodes. */ @@ -1312,8 +1313,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * This seems to result in good task placement. */ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { - faults *= (sched_max_numa_distance - dist); - faults /= (sched_max_numa_distance - LOCAL_DISTANCE); + faults *= (max_dist - dist); + faults /= (max_dist - LOCAL_DISTANCE); } score += faults; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9b33ba9c3c42..3da5718cd641 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1662,12 +1662,14 @@ enum numa_topology_type { extern enum numa_topology_type sched_numa_topology_type; extern int sched_max_numa_distance; extern bool find_numa_distance(int distance); -extern void sched_init_numa(void); +extern void sched_init_numa(int offline_node); +extern void sched_update_numa(int cpu, bool online); extern void sched_domains_numa_masks_set(unsigned int cpu); extern void sched_domains_numa_masks_clear(unsigned int cpu); extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); #else -static inline void sched_init_numa(void) { } +static inline void sched_init_numa(int offline_node) { } +static inline void sched_update_numa(int cpu, bool online) { } static inline void sched_domains_numa_masks_set(unsigned int cpu) { } static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 1c84b48def20..5db322c9cb3f 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1492,8 +1492,6 @@ static int sched_domains_curr_level; int sched_max_numa_distance; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; 
- -static unsigned long __read_mostly *sched_numa_onlined_nodes; #endif /* @@ -1651,6 +1649,7 @@ static struct sched_domain_topology_level default_topology[] = { static struct sched_domain_topology_level *sched_domain_topology = default_topology; +static struct sched_domain_topology_level *sched_domain_topology_saved; #define for_each_sd_topology(tl) \ for (tl = sched_domain_topology; tl->mask; tl++) @@ -1661,6 +1660,7 @@ void set_sched_topology(struct sched_domain_topology_level *tl) return; sched_domain_topology = tl; + sched_domain_topology_saved = NULL; } #ifdef CONFIG_NUMA @@ -1684,8 +1684,12 @@ static void sched_numa_warn(const char *str) for (i = 0; i < nr_node_ids; i++) { printk(KERN_WARNING " "); - for (j = 0; j < nr_node_ids; j++) - printk(KERN_CONT "%02d ", node_distance(i,j)); + for (j = 0; j < nr_node_ids; j++) { + if (!node_state(i, N_CPU) || !node_state(j, N_CPU)) + printk(KERN_CONT "(%02d) ", node_distance(i,j)); + else + printk(KERN_CONT " %02d ", node_distance(i,j)); + } printk(KERN_CONT "\n"); } printk(KERN_WARNING "\n"); @@ -1693,19 +1697,34 @@ static void sched_numa_warn(const char *str) bool find_numa_distance(int distance) { - int i; + bool found = false; + int i, *distances; if (distance == node_distance(0, 0)) return true; + rcu_read_lock(); + distances = rcu_dereference(sched_domains_numa_distance); + if (!distances) + goto unlock; for (i = 0; i < sched_domains_numa_levels; i++) { - if (sched_domains_numa_distance[i] == distance) - return true; + if (distances[i] == distance) { + found = true; + break; + } } +unlock: + rcu_read_unlock(); - return false; + return found; } +#define for_each_cpu_node_but(n, nbut) \ + for_each_node_state(n, N_CPU) \ + if (n == nbut) \ + continue; \ + else + /* * A system can have three types of NUMA topology: * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system @@ -1725,7 +1744,7 @@ bool find_numa_distance(int distance) * there is an intermediary node C, which is < N hops away from both * 
nodes A and B, the system is a glueless mesh. */ -static void init_numa_topology_type(void) +static void init_numa_topology_type(int offline_node) { int a, b, c, n; @@ -1736,14 +1755,14 @@ static void init_numa_topology_type(void) return; } - for_each_online_node(a) { - for_each_online_node(b) { + for_each_cpu_node_but(a, offline_node) { + for_each_cpu_node_but(b, offline_node) { /* Find two nodes furthest removed from each other. */ if (node_distance(a, b) < n) continue; /* Is there an intermediary node between a and b? */ - for_each_online_node(c) { + for_each_cpu_node_but(c, offline_node) { if (node_distance(a, c) < n && node_distance(b, c) < n) { sched_numa_topology_type = @@ -1756,17 +1775,22 @@ static void init_numa_topology_type(void) return; } } + + pr_err("Failed to find a NUMA topology type, defaulting to DIRECT\n"); + sched_numa_topology_type = NUMA_DIRECT; } #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) -void sched_init_numa(void) +void sched_init_numa(int offline_node) { struct sched_domain_topology_level *tl; unsigned long *distance_map; int nr_levels = 0; int i, j; + int *distances; + struct cpumask ***masks; /* * O(nr_nodes^2) deduplicating selection sort -- in order to find the @@ -1777,12 +1801,13 @@ void sched_init_numa(void) return; bitmap_zero(distance_map, NR_DISTANCE_VALUES); - for (i = 0; i < nr_node_ids; i++) { - for (j = 0; j < nr_node_ids; j++) { + for_each_cpu_node_but(i, offline_node) { + for_each_cpu_node_but(j, offline_node) { int distance = node_distance(i, j); if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) { sched_numa_warn("Invalid distance value range"); + bitmap_free(distance_map); return; } @@ -1795,16 +1820,17 @@ void sched_init_numa(void) */ nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES); - sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); - if (!sched_domains_numa_distance) { + distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); + if (!distances) { 
bitmap_free(distance_map); return; } for (i = 0, j = 0; i < nr_levels; i++, j++) { j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j); - sched_domains_numa_distance[i] = j; + distances[i] = j; } + rcu_assign_pointer(sched_domains_numa_distance, distances); bitmap_free(distance_map); @@ -1826,8 +1852,8 @@ void sched_init_numa(void) */ sched_domains_numa_levels = 0; - sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); - if (!sched_domains_numa_masks) + masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); + if (!masks) return; /* @@ -1835,31 +1861,20 @@ void sched_init_numa(void) * CPUs of nodes that are that many hops away from us. */ for (i = 0; i < nr_levels; i++) { - sched_domains_numa_masks[i] = - kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); - if (!sched_domains_numa_masks[i]) + masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); + if (!masks[i]) return; - for (j = 0; j < nr_node_ids; j++) { + for_each_cpu_node_but(j, offline_node) { struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); int k; if (!mask) return; - sched_domains_numa_masks[i][j] = mask; - - for_each_node(k) { - /* - * Distance information can be unreliable for - * offline nodes, defer building the node - * masks to its bringup. - * This relies on all unique distance values - * still being visible at init time. 
- */ - if (!node_online(j)) - continue; + masks[i][j] = mask; + for_each_cpu_node_but(k, offline_node) { if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) sched_numa_warn("Node-distance not symmetric"); @@ -1870,6 +1885,7 @@ void sched_init_numa(void) } } } + rcu_assign_pointer(sched_domains_numa_masks, masks); /* Compute default topology size */ for (i = 0; sched_domain_topology[i].mask; i++); @@ -1907,59 +1923,67 @@ void sched_init_numa(void) }; } + sched_domain_topology_saved = sched_domain_topology; sched_domain_topology = tl; sched_domains_numa_levels = nr_levels; - sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1]; - - init_numa_topology_type(); - - sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL); - if (!sched_numa_onlined_nodes) - return; + WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]); - bitmap_zero(sched_numa_onlined_nodes, nr_node_ids); - for_each_online_node(i) - bitmap_set(sched_numa_onlined_nodes, i, 1); + init_numa_topology_type(offline_node); } -static void __sched_domains_numa_masks_set(unsigned int node) -{ - int i, j; - - /* - * NUMA masks are not built for offline nodes in sched_init_numa(). - * Thus, when a CPU of a never-onlined-before node gets plugged in, - * adding that new CPU to the right NUMA masks is not sufficient: the - * masks of that CPU's node must also be updated. 
- */ - if (test_bit(node, sched_numa_onlined_nodes)) - return; - bitmap_set(sched_numa_onlined_nodes, node, 1); - - for (i = 0; i < sched_domains_numa_levels; i++) { - for (j = 0; j < nr_node_ids; j++) { - if (!node_online(j) || node == j) - continue; +static void sched_reset_numa(void) +{ + int nr_levels, *distances; + struct cpumask ***masks; - if (node_distance(j, node) > sched_domains_numa_distance[i]) + nr_levels = sched_domains_numa_levels; + sched_domains_numa_levels = 0; + sched_max_numa_distance = 0; + sched_numa_topology_type = NUMA_DIRECT; + distances = sched_domains_numa_distance; + rcu_assign_pointer(sched_domains_numa_distance, NULL); + masks = sched_domains_numa_masks; + rcu_assign_pointer(sched_domains_numa_masks, NULL); + if (distances || masks) { + int i, j; + + synchronize_rcu(); + kfree(distances); + for (i = 0; i < nr_levels && masks; i++) { + if (!masks[i]) continue; - - /* Add remote nodes in our masks */ - cpumask_or(sched_domains_numa_masks[i][node], - sched_domains_numa_masks[i][node], - sched_domains_numa_masks[0][j]); + for_each_node(j) + kfree(masks[i][j]); + kfree(masks[i]); } + kfree(masks); } + if (sched_domain_topology_saved) { + kfree(sched_domain_topology); + sched_domain_topology = sched_domain_topology_saved; + sched_domain_topology_saved = NULL; + } +} + +/* + * Call with hotplug lock held + */ +void sched_update_numa(int cpu, bool online) +{ + int node; + node = cpu_to_node(cpu); /* - * A new node has been brought up, potentially changing the topology - * classification. - * - * Note that this is racy vs any use of sched_numa_topology_type :/ + * Scheduler NUMA topology is updated when the first CPU of a + * node is onlined or the last CPU of a node is offlined. */ - init_numa_topology_type(); + if (cpumask_weight(cpumask_of_node(node)) != 1) + return; + + sched_reset_numa(); + sched_init_numa(online ? 
NUMA_NO_NODE : node); } void sched_domains_numa_masks_set(unsigned int cpu) @@ -1967,11 +1991,9 @@ void sched_domains_numa_masks_set(unsigned int cpu) int node = cpu_to_node(cpu); int i, j; - __sched_domains_numa_masks_set(node); - for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) { - if (!node_online(j)) + if (!node_state(j, N_CPU)) continue; /* Set ourselves in the remote node's masks */ @@ -1986,8 +2008,10 @@ void sched_domains_numa_masks_clear(unsigned int cpu) int i, j; for (i = 0; i < sched_domains_numa_levels; i++) { - for (j = 0; j < nr_node_ids; j++) - cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); + for (j = 0; j < nr_node_ids; j++) { + if (sched_domains_numa_masks[i][j]) + cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); + } } } @@ -2001,14 +2025,26 @@ void sched_domains_numa_masks_clear(unsigned int cpu) */ int sched_numa_find_closest(const struct cpumask *cpus, int cpu) { - int i, j = cpu_to_node(cpu); + int i, j = cpu_to_node(cpu), found = nr_cpu_ids; + struct cpumask ***masks; + rcu_read_lock(); + masks = rcu_dereference(sched_domains_numa_masks); + if (!masks) + goto unlock; for (i = 0; i < sched_domains_numa_levels; i++) { - cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]); - if (cpu < nr_cpu_ids) - return cpu; + if (!masks[i][j]) + break; + cpu = cpumask_any_and(cpus, masks[i][j]); + if (cpu < nr_cpu_ids) { + found = cpu; + break; + } } - return nr_cpu_ids; +unlock: + rcu_read_unlock(); + + return found; } #endif /* CONFIG_NUMA */ -- cgit v1.2.3 From 5c7b1aaf139dab5072311853bacc40fc3457d1f9 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 14 Feb 2022 20:15:53 +0800 Subject: sched/numa: Avoid migrating task to CPU-less node In a typical memory tiering system, there's no CPU in slow (PMEM) NUMA nodes. But if the number of the hint page faults on a PMEM node is the max for a task, the current NUMA balancing policy may try to place the task on the PMEM node instead of a DRAM node.
This is unreasonable, because there's no CPU in PMEM NUMA nodes. To fix this, CPU-less nodes are ignored when searching the migration target node for a task in this patch. To test the patch, we run a workload that accesses more memory in the PMEM node than in the DRAM node. Without the patch, the PMEM node will be chosen as the preferred node in task_numa_placement(), while with the patch the DRAM node will be chosen instead. Signed-off-by: "Huang, Ying" Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220214121553.582248-2-ying.huang@intel.com --- kernel/sched/fair.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index da3230b84250..11a72e1b3b2c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1989,7 +1989,7 @@ static int task_numa_migrate(struct task_struct *p) */ ng = deref_curr_numa_group(p); if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) { - for_each_online_node(nid) { + for_each_node_state(nid, N_CPU) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; @@ -2087,13 +2087,13 @@ static void numa_group_count_active_nodes(struct numa_group *numa_group) unsigned long faults, max_faults = 0; int nid, active_nodes = 0; - for_each_online_node(nid) { + for_each_node_state(nid, N_CPU) { faults = group_faults_cpu(numa_group, nid); if (faults > max_faults) max_faults = faults; } - for_each_online_node(nid) { + for_each_node_state(nid, N_CPU) { faults = group_faults_cpu(numa_group, nid); if (faults * ACTIVE_NODE_FRACTION > max_faults) active_nodes++; @@ -2247,7 +2247,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) dist = sched_max_numa_distance; - for_each_online_node(node) { + for_each_node_state(node, N_CPU) { score = group_weight(p, node, dist); if (score > max_score) { max_score = score; @@ -2266,7 +2266,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) * inside the
highest scoring group of nodes. The nodemask tricks * keep the complexity of the search down. */ - nodes = node_online_map; + nodes = node_states[N_CPU]; for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { unsigned long max_faults = 0; nodemask_t max_group = NODE_MASK_NONE; @@ -2405,6 +2405,21 @@ static void task_numa_placement(struct task_struct *p) } } + /* Cannot migrate task to CPU-less node */ + if (!node_state(max_nid, N_CPU)) { + int near_nid = max_nid; + int distance, near_distance = INT_MAX; + + for_each_node_state(nid, N_CPU) { + distance = node_distance(max_nid, nid); + if (distance < near_distance) { + near_nid = nid; + near_distance = distance; + } + } + max_nid = near_nid; + } + if (ng) { numa_group_count_active_nodes(ng); spin_unlock_irq(group_lock); -- cgit v1.2.3 From e6df4ead85d9da1b07dd40bd4c6d2182f3e210c4 Mon Sep 17 00:00:00 2001 From: Zhaoyang Huang Date: Tue, 25 Jan 2022 14:56:58 +0800 Subject: psi: fix possible trigger missing in the window When a new threshold breaching stall happens after a psi event was generated and within the window duration, the new event is not generated because the events are rate-limited to one per window. If after that no new stall is recorded then the event will not be generated even after rate-limiting duration has passed. This is happening because with no new stall, window_update will not be called even though threshold was previously breached. To fix this, record threshold breaching occurrence and generate the event once window duration is passed. 
Suggested-by: Suren Baghdasaryan Signed-off-by: Zhaoyang Huang Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Acked-by: Suren Baghdasaryan Link: https://lore.kernel.org/r/1643093818-19835-1-git-send-email-huangzhaoyang@gmail.com --- include/linux/psi_types.h | 3 +++ kernel/sched/psi.c | 46 ++++++++++++++++++++++++++++++---------------- 2 files changed, 33 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 516c0fe836fd..dc3ec5e4b9ee 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -144,6 +144,9 @@ struct psi_trigger { /* Refcounting to prevent premature destruction */ struct kref refcount; + + /* Deferred event(s) from previous ratelimit window */ + bool pending_event; }; struct psi_group { diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index cfe76f704d8a..e9d623cb8d1b 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -523,7 +523,7 @@ static void init_triggers(struct psi_group *group, u64 now) static u64 update_triggers(struct psi_group *group, u64 now) { struct psi_trigger *t; - bool new_stall = false; + bool update_total = false; u64 *total = group->total[PSI_POLL]; /* @@ -532,24 +532,35 @@ static u64 update_triggers(struct psi_group *group, u64 now) */ list_for_each_entry(t, &group->triggers, node) { u64 growth; + bool new_stall; - /* Check for stall activity */ - if (group->polling_total[t->state] == total[t->state]) - continue; + new_stall = group->polling_total[t->state] != total[t->state]; + /* Check for stall activity or a previous threshold breach */ + if (!new_stall && !t->pending_event) + continue; /* - * Multiple triggers might be looking at the same state, - * remember to update group->polling_total[] once we've - * been through all of them. Also remember to extend the - * polling time if we see new stall activity. 
+ * Check for new stall activity, as well as deferred + * events that occurred in the last window after the + * trigger had already fired (we want to ratelimit + * events without dropping any). */ - new_stall = true; - - /* Calculate growth since last update */ - growth = window_update(&t->win, now, total[t->state]); - if (growth < t->threshold) - continue; - + if (new_stall) { + /* + * Multiple triggers might be looking at the same state, + * remember to update group->polling_total[] once we've + * been through all of them. Also remember to extend the + * polling time if we see new stall activity. + */ + update_total = true; + + /* Calculate growth since last update */ + growth = window_update(&t->win, now, total[t->state]); + if (growth < t->threshold) + continue; + + t->pending_event = true; + } /* Limit event signaling to once per window */ if (now < t->last_event_time + t->win.size) continue; @@ -558,9 +569,11 @@ static u64 update_triggers(struct psi_group *group, u64 now) if (cmpxchg(&t->event, 0, 1) == 0) wake_up_interruptible(&t->event_wait); t->last_event_time = now; + /* Reset threshold breach flag once event got generated */ + t->pending_event = false; } - if (new_stall) + if (update_total) memcpy(group->polling_total, total, sizeof(group->polling_total)); @@ -1125,6 +1138,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->last_event_time = 0; init_waitqueue_head(&t->event_wait); kref_init(&t->refcount); + t->pending_event = false; mutex_lock(&group->trigger_lock); -- cgit v1.2.3 From 7b45b51e778021cd7817b8f0d743a2c73205c011 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 7 Feb 2022 16:59:04 +0100 Subject: workqueue: Decouple HK_FLAG_WQ and HK_FLAG_DOMAIN cpumask fetch To prepare for supporting each feature of the housekeeping cpumask toward cpuset, prepare each of the HK_FLAG_* entries to move to their own cpumask with enforcing to fetch them individually. 
The new constraint is that multiple HK_FLAG_* entries can't be mixed together anymore in a single call to housekeeping_cpumask(). This will later allow, for example, runtime modification of the cpulist passed through "isolcpus=", "nohz_full=" and "rcu_nocbs=" kernel boot parameters. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Reviewed-by: Phil Auld Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220207155910.527133-3-frederic@kernel.org --- kernel/workqueue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 33f1106b4f99..61ed310621ea 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -6006,13 +6006,13 @@ static void __init wq_numa_init(void) void __init workqueue_init_early(void) { int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; - int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; int i, cpu; BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags)); + cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_WQ)); + cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); -- cgit v1.2.3 From 04d4e665a60902cf36e7ad39af1179cb5df542ad Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 7 Feb 2022 16:59:06 +0100 Subject: sched/isolation: Use single feature type while referring to housekeeping cpumask Refer to housekeeping APIs using single feature types instead of flags. This prevents passing multiple isolation features at once to housekeeping interfaces, which soon won't be possible anymore as each isolation feature will have its own cpumask.
Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Reviewed-by: Phil Auld Link: https://lore.kernel.org/r/20220207155910.527133-5-frederic@kernel.org --- arch/x86/kernel/cpu/aperfmperf.c | 6 +++--- arch/x86/kvm/x86.c | 2 +- drivers/base/cpu.c | 2 +- drivers/pci/pci-driver.c | 4 ++-- include/linux/sched/isolation.h | 43 ++++++++++++++++++++-------------------- kernel/cgroup/cpuset.c | 6 +++--- kernel/cpu.c | 4 ++-- kernel/irq/cpuhotplug.c | 4 ++-- kernel/irq/manage.c | 4 ++-- kernel/kthread.c | 4 ++-- kernel/rcu/tasks.h | 2 +- kernel/rcu/tree_plugin.h | 6 +++--- kernel/sched/core.c | 12 +++++------ kernel/sched/fair.c | 10 +++++----- kernel/sched/isolation.c | 32 ++++++++++++++++++++---------- kernel/sched/topology.c | 8 ++++---- kernel/watchdog.c | 2 +- kernel/workqueue.c | 4 ++-- net/core/net-sysfs.c | 4 ++-- 19 files changed, 86 insertions(+), 73 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 22911deacb6e..9ca008f9e9b1 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -91,7 +91,7 @@ unsigned int aperfmperf_get_khz(int cpu) if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return 0; - if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) + if (!housekeeping_cpu(cpu, HK_TYPE_MISC)) return 0; if (rcu_is_idle_cpu(cpu)) @@ -114,7 +114,7 @@ void arch_freq_prepare_all(void) return; for_each_online_cpu(cpu) { - if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) + if (!housekeeping_cpu(cpu, HK_TYPE_MISC)) continue; if (rcu_is_idle_cpu(cpu)) continue; /* Idle CPUs are completely uninteresting. 
*/ @@ -136,7 +136,7 @@ unsigned int arch_freq_get_on_cpu(int cpu) if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return 0; - if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) + if (!housekeeping_cpu(cpu, HK_TYPE_MISC)) return 0; if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true)) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9e43d756312f..02a7ac1b6bb2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8769,7 +8769,7 @@ int kvm_arch_init(void *opaque) } if (pi_inject_timer == -1) - pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER); + pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER); #ifdef CONFIG_X86_64 pvclock_gtod_register_notifier(&pvclock_gtod_notifier); diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 5fc258073bc7..2ef23fce0860 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -275,7 +275,7 @@ static ssize_t print_cpus_isolated(struct device *dev, return -ENOMEM; cpumask_andnot(isolated, cpu_possible_mask, - housekeeping_cpumask(HK_FLAG_DOMAIN)); + housekeeping_cpumask(HK_TYPE_DOMAIN)); len = sysfs_emit(buf, "%*pbl\n", cpumask_pr_args(isolated)); free_cpumask_var(isolated); diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 4a5792c82d08..f61c40a47891 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -377,8 +377,8 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev, goto out; } cpumask_and(wq_domain_mask, - housekeeping_cpumask(HK_FLAG_WQ), - housekeeping_cpumask(HK_FLAG_DOMAIN)); + housekeeping_cpumask(HK_TYPE_WQ), + housekeeping_cpumask(HK_TYPE_DOMAIN)); cpu = cpumask_any_and(cpumask_of_node(node), wq_domain_mask); diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index cc9f393e2a70..8c15abd67aed 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -5,54 +5,55 @@ #include #include -enum hk_flags { - HK_FLAG_TIMER = 1, - HK_FLAG_RCU = (1 << 1), - HK_FLAG_MISC = (1 << 2), - HK_FLAG_SCHED = (1 << 3), 
- HK_FLAG_TICK = (1 << 4), - HK_FLAG_DOMAIN = (1 << 5), - HK_FLAG_WQ = (1 << 6), - HK_FLAG_MANAGED_IRQ = (1 << 7), - HK_FLAG_KTHREAD = (1 << 8), +enum hk_type { + HK_TYPE_TIMER, + HK_TYPE_RCU, + HK_TYPE_MISC, + HK_TYPE_SCHED, + HK_TYPE_TICK, + HK_TYPE_DOMAIN, + HK_TYPE_WQ, + HK_TYPE_MANAGED_IRQ, + HK_TYPE_KTHREAD, + HK_TYPE_MAX }; #ifdef CONFIG_CPU_ISOLATION DECLARE_STATIC_KEY_FALSE(housekeeping_overridden); -extern int housekeeping_any_cpu(enum hk_flags flags); -extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags); -extern bool housekeeping_enabled(enum hk_flags flags); -extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags); -extern bool housekeeping_test_cpu(int cpu, enum hk_flags flags); +extern int housekeeping_any_cpu(enum hk_type type); +extern const struct cpumask *housekeeping_cpumask(enum hk_type type); +extern bool housekeeping_enabled(enum hk_type type); +extern void housekeeping_affine(struct task_struct *t, enum hk_type type); +extern bool housekeeping_test_cpu(int cpu, enum hk_type type); extern void __init housekeeping_init(void); #else -static inline int housekeeping_any_cpu(enum hk_flags flags) +static inline int housekeeping_any_cpu(enum hk_type type) { return smp_processor_id(); } -static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags) +static inline const struct cpumask *housekeeping_cpumask(enum hk_type type) { return cpu_possible_mask; } -static inline bool housekeeping_enabled(enum hk_flags flags) +static inline bool housekeeping_enabled(enum hk_type type) { return false; } static inline void housekeeping_affine(struct task_struct *t, - enum hk_flags flags) { } + enum hk_type type) { } static inline void housekeeping_init(void) { } #endif /* CONFIG_CPU_ISOLATION */ -static inline bool housekeeping_cpu(int cpu, enum hk_flags flags) +static inline bool housekeeping_cpu(int cpu, enum hk_type type) { #ifdef CONFIG_CPU_ISOLATION if (static_branch_unlikely(&housekeeping_overridden)) - 
return housekeeping_test_cpu(cpu, flags); + return housekeeping_test_cpu(cpu, type); #endif return true; } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index dc653ab26e50..e4e18a2cb404 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -803,7 +803,7 @@ static int generate_sched_domains(cpumask_var_t **domains, update_domain_attr_tree(dattr, &top_cpuset); } cpumask_and(doms[0], top_cpuset.effective_cpus, - housekeeping_cpumask(HK_FLAG_DOMAIN)); + housekeeping_cpumask(HK_TYPE_DOMAIN)); goto done; } @@ -833,7 +833,7 @@ static int generate_sched_domains(cpumask_var_t **domains, if (!cpumask_empty(cp->cpus_allowed) && !(is_sched_load_balance(cp) && cpumask_intersects(cp->cpus_allowed, - housekeeping_cpumask(HK_FLAG_DOMAIN)))) + housekeeping_cpumask(HK_TYPE_DOMAIN)))) continue; if (root_load_balance && @@ -922,7 +922,7 @@ restart: if (apn == b->pn) { cpumask_or(dp, dp, b->effective_cpus); - cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN)); + cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); if (dattr) update_domain_attr_tree(dattr + nslot, b); diff --git a/kernel/cpu.c b/kernel/cpu.c index 407a2568f35e..f39eb0b52dfe 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1488,8 +1488,8 @@ int freeze_secondary_cpus(int primary) cpu_maps_update_begin(); if (primary == -1) { primary = cpumask_first(cpu_online_mask); - if (!housekeeping_cpu(primary, HK_FLAG_TIMER)) - primary = housekeeping_any_cpu(HK_FLAG_TIMER); + if (!housekeeping_cpu(primary, HK_TYPE_TIMER)) + primary = housekeeping_any_cpu(HK_TYPE_TIMER); } else { if (!cpu_online(primary)) primary = cpumask_first(cpu_online_mask); diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 39a41c56ad4f..1ed2b1739363 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -176,10 +176,10 @@ static bool hk_should_isolate(struct irq_data *data, unsigned int cpu) { const struct cpumask *hk_mask; - if (!housekeeping_enabled(HK_FLAG_MANAGED_IRQ)) + if 
(!housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) return false; - hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ); + hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ); if (cpumask_subset(irq_data_get_effective_affinity_mask(data), hk_mask)) return false; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f23ffd30385b..c03f71d5ec10 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -247,13 +247,13 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, * online. */ if (irqd_affinity_is_managed(data) && - housekeeping_enabled(HK_FLAG_MANAGED_IRQ)) { + housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) { const struct cpumask *hk_mask, *prog_mask; static DEFINE_RAW_SPINLOCK(tmp_mask_lock); static struct cpumask tmp_mask; - hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ); + hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ); raw_spin_lock(&tmp_mask_lock); cpumask_and(&tmp_mask, mask, hk_mask); diff --git a/kernel/kthread.c b/kernel/kthread.c index 38c6dd822da8..d100d5a15b38 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -356,7 +356,7 @@ static int kthread(void *_create) * back to default in case they have been changed. */ sched_setscheduler_nocheck(current, SCHED_NORMAL, &param); - set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD)); + set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD)); /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); @@ -722,7 +722,7 @@ int kthreadd(void *unused) /* Setup a clean context for our children to inherit.
*/ set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); - set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_FLAG_KTHREAD)); + set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 84f1d91604cc..6093b200dff7 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -492,7 +492,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) struct rcu_tasks *rtp = arg; /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ - housekeeping_affine(current, HK_FLAG_RCU); + housekeeping_affine(current, HK_TYPE_RCU); WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start! /* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c5b45c2f68a1..65f25a32f6d7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1214,9 +1214,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) if ((mask & leaf_node_cpu_bit(rnp, cpu)) && cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); - cpumask_and(cm, cm, housekeeping_cpumask(HK_FLAG_RCU)); + cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU)); if (cpumask_weight(cm) == 0) - cpumask_copy(cm, housekeeping_cpumask(HK_FLAG_RCU)); + cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU)); set_cpus_allowed_ptr(t, cm); free_cpumask_var(cm); } @@ -1291,7 +1291,7 @@ static void rcu_bind_gp_kthread(void) { if (!tick_nohz_full_enabled()) return; - housekeeping_affine(current, HK_FLAG_RCU); + housekeeping_affine(current, HK_TYPE_RCU); } /* Record the current task on dyntick-idle entry. 
*/ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b2226922206d..1e08b02e0cd5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1025,13 +1025,13 @@ int get_nohz_timer_target(void) struct sched_domain *sd; const struct cpumask *hk_mask; - if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { + if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) { if (!idle_cpu(cpu)) return cpu; default_cpu = cpu; } - hk_mask = housekeeping_cpumask(HK_FLAG_TIMER); + hk_mask = housekeeping_cpumask(HK_TYPE_TIMER); rcu_read_lock(); for_each_domain(cpu, sd) { @@ -1047,7 +1047,7 @@ int get_nohz_timer_target(void) } if (default_cpu == -1) - default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); + default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER); cpu = default_cpu; unlock: rcu_read_unlock(); @@ -5371,7 +5371,7 @@ static void sched_tick_start(int cpu) int os; struct tick_work *twork; - if (housekeeping_cpu(cpu, HK_FLAG_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_TICK)) return; WARN_ON_ONCE(!tick_work_cpu); @@ -5392,7 +5392,7 @@ static void sched_tick_stop(int cpu) struct tick_work *twork; int os; - if (housekeeping_cpu(cpu, HK_FLAG_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_TICK)) return; WARN_ON_ONCE(!tick_work_cpu); @@ -9251,7 +9251,7 @@ void __init sched_init_smp(void) mutex_unlock(&sched_domains_mutex); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) + if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0) BUG(); current->flags &= ~PF_NO_SETAFFINITY; sched_init_granularity(); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 11a72e1b3b2c..dcbd3110c687 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10337,7 +10337,7 @@ static inline int on_null_domain(struct rq *rq) * - When one of the busy CPUs notice that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. 
- * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set + * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED not set * anywhere yet. */ @@ -10346,7 +10346,7 @@ static inline int find_new_ilb(void) int ilb; const struct cpumask *hk_mask; - hk_mask = housekeeping_cpumask(HK_FLAG_MISC); + hk_mask = housekeeping_cpumask(HK_TYPE_MISC); for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) { @@ -10362,7 +10362,7 @@ static inline int find_new_ilb(void) /* * Kick a CPU to do the nohz balancing, if it is time for it. We pick any - * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one). + * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one). */ static void kick_ilb(unsigned int flags) { @@ -10575,7 +10575,7 @@ void nohz_balance_enter_idle(int cpu) return; /* Spare idle load balancing on CPUs that don't want to be disturbed: */ - if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) + if (!housekeeping_cpu(cpu, HK_TYPE_SCHED)) return; /* @@ -10791,7 +10791,7 @@ static void nohz_newidle_balance(struct rq *this_rq) * This CPU doesn't want to be disturbed by scheduler * housekeeping */ - if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED)) + if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED)) return; /* Will wake up very soon. 
No time for doing anything else*/ diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 7f06eaf12818..a735d9e229dd 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -9,23 +9,35 @@ */ #include "sched.h" +enum hk_flags { + HK_FLAG_TIMER = BIT(HK_TYPE_TIMER), + HK_FLAG_RCU = BIT(HK_TYPE_RCU), + HK_FLAG_MISC = BIT(HK_TYPE_MISC), + HK_FLAG_SCHED = BIT(HK_TYPE_SCHED), + HK_FLAG_TICK = BIT(HK_TYPE_TICK), + HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN), + HK_FLAG_WQ = BIT(HK_TYPE_WQ), + HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ), + HK_FLAG_KTHREAD = BIT(HK_TYPE_KTHREAD), +}; + DEFINE_STATIC_KEY_FALSE(housekeeping_overridden); EXPORT_SYMBOL_GPL(housekeeping_overridden); static cpumask_var_t housekeeping_mask; static unsigned int housekeeping_flags; -bool housekeeping_enabled(enum hk_flags flags) +bool housekeeping_enabled(enum hk_type type) { - return !!(housekeeping_flags & flags); + return !!(housekeeping_flags & BIT(type)); } EXPORT_SYMBOL_GPL(housekeeping_enabled); -int housekeeping_any_cpu(enum hk_flags flags) +int housekeeping_any_cpu(enum hk_type type) { int cpu; if (static_branch_unlikely(&housekeeping_overridden)) { - if (housekeeping_flags & flags) { + if (housekeeping_flags & BIT(type)) { cpu = sched_numa_find_closest(housekeeping_mask, smp_processor_id()); if (cpu < nr_cpu_ids) return cpu; @@ -37,27 +49,27 @@ int housekeeping_any_cpu(enum hk_flags flags) } EXPORT_SYMBOL_GPL(housekeeping_any_cpu); -const struct cpumask *housekeeping_cpumask(enum hk_flags flags) +const struct cpumask *housekeeping_cpumask(enum hk_type type) { if (static_branch_unlikely(&housekeeping_overridden)) - if (housekeeping_flags & flags) + if (housekeeping_flags & BIT(type)) return housekeeping_mask; return cpu_possible_mask; } EXPORT_SYMBOL_GPL(housekeeping_cpumask); -void housekeeping_affine(struct task_struct *t, enum hk_flags flags) +void housekeeping_affine(struct task_struct *t, enum hk_type type) { if 
(static_branch_unlikely(&housekeeping_overridden)) - if (housekeeping_flags & flags) + if (housekeeping_flags & BIT(type)) set_cpus_allowed_ptr(t, housekeeping_mask); } EXPORT_SYMBOL_GPL(housekeeping_affine); -bool housekeeping_test_cpu(int cpu, enum hk_flags flags) +bool housekeeping_test_cpu(int cpu, enum hk_type type) { if (static_branch_unlikely(&housekeeping_overridden)) - if (housekeeping_flags & flags) + if (housekeeping_flags & BIT(type)) return cpumask_test_cpu(cpu, housekeeping_mask); return true; } diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 5db322c9cb3f..32841c6741d1 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1366,7 +1366,7 @@ static void asym_cpu_capacity_scan(void) list_for_each_entry(entry, &asym_cap_list, link) cpumask_clear(cpu_capacity_span(entry)); - for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_FLAG_DOMAIN)) + for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) asym_cpu_capacity_update_data(cpu); list_for_each_entry_safe(entry, next, &asym_cap_list, link) { @@ -2440,7 +2440,7 @@ int sched_init_domains(const struct cpumask *cpu_map) doms_cur = alloc_sched_domains(ndoms_cur); if (!doms_cur) doms_cur = &fallback_doms; - cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN)); + cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_TYPE_DOMAIN)); err = build_sched_domains(doms_cur[0], NULL); return err; @@ -2529,7 +2529,7 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], if (doms_new) { n = 1; cpumask_and(doms_new[0], cpu_active_mask, - housekeeping_cpumask(HK_FLAG_DOMAIN)); + housekeeping_cpumask(HK_TYPE_DOMAIN)); } } else { n = ndoms_new; @@ -2564,7 +2564,7 @@ match1: n = 0; doms_new = &fallback_doms; cpumask_and(doms_new[0], cpu_active_mask, - housekeeping_cpumask(HK_FLAG_DOMAIN)); + housekeeping_cpumask(HK_TYPE_DOMAIN)); } /* Build new domains: */ diff --git a/kernel/watchdog.c b/kernel/watchdog.c 
index 99afb88d2e85..9166220457bc 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -848,7 +848,7 @@ void __init lockup_detector_init(void) pr_info("Disabling watchdog on nohz_full cores by default\n"); cpumask_copy(&watchdog_cpumask, - housekeeping_cpumask(HK_FLAG_TIMER)); + housekeeping_cpumask(HK_TYPE_TIMER)); if (!watchdog_nmi_probe()) nmi_watchdog_available = true; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 61ed310621ea..52e9abbb7759 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -6011,8 +6011,8 @@ void __init workqueue_init_early(void) BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_WQ)); - cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); + cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ)); + cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index ed8da7b8d35b..7ceb3460161b 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -839,8 +839,8 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, } if (!cpumask_empty(mask)) { - cpumask_and(mask, mask, housekeeping_cpumask(HK_FLAG_DOMAIN)); - cpumask_and(mask, mask, housekeeping_cpumask(HK_FLAG_WQ)); + cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); + cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ)); if (cpumask_empty(mask)) { free_cpumask_var(mask); return -EINVAL; -- cgit v1.2.3 From 6367b600e31c6b211eadee7bf7f4ed8c755e9176 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 7 Feb 2022 16:59:07 +0100 Subject: sched/isolation: Consolidate check for housekeeping minimum service There can be two subsequent calls to housekeeping_setup() due to "nohz_full=" and "isolcpus=" 
that can mix up. The two passes each have their own way to deal with an empty housekeeping set of CPUs. Consolidate this part and remove the awful "tmp" based naming. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Reviewed-by: Phil Auld Link: https://lore.kernel.org/r/20220207155910.527133-6-frederic@kernel.org --- kernel/sched/isolation.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index a735d9e229dd..23085e665faa 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -91,8 +91,7 @@ void __init housekeeping_init(void) static int __init housekeeping_setup(char *str, enum hk_flags flags) { - cpumask_var_t non_housekeeping_mask; - cpumask_var_t tmp; + cpumask_var_t non_housekeeping_mask, housekeeping_staging; alloc_bootmem_cpumask_var(&non_housekeeping_mask); if (cpulist_parse(str, non_housekeeping_mask) < 0) { @@ -101,32 +100,32 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags) return 0; } - alloc_bootmem_cpumask_var(&tmp); - if (!housekeeping_flags) { - alloc_bootmem_cpumask_var(&housekeeping_mask); - cpumask_andnot(housekeeping_mask, - cpu_possible_mask, non_housekeeping_mask); + alloc_bootmem_cpumask_var(&housekeeping_staging); + cpumask_andnot(housekeeping_staging, + cpu_possible_mask, non_housekeeping_mask); - cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); - if (cpumask_empty(tmp)) { + if (!cpumask_intersects(cpu_present_mask, housekeeping_staging)) { + __cpumask_set_cpu(smp_processor_id(), housekeeping_staging); + __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); + if (!housekeeping_flags) { pr_warn("Housekeeping: must include one present CPU, " "using boot CPU:%d\n", smp_processor_id()); - __cpumask_set_cpu(smp_processor_id(), housekeeping_mask); - __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); 
} + } + + if (!housekeeping_flags) { + alloc_bootmem_cpumask_var(&housekeeping_mask); + cpumask_copy(housekeeping_mask, housekeeping_staging); } else { - cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); - if (cpumask_empty(tmp)) - __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); - cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask); - if (!cpumask_equal(tmp, housekeeping_mask)) { + if (!cpumask_equal(housekeeping_staging, housekeeping_mask)) { pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); - free_bootmem_cpumask_var(tmp); + free_bootmem_cpumask_var(housekeeping_staging); free_bootmem_cpumask_var(non_housekeeping_mask); return 0; } } - free_bootmem_cpumask_var(tmp); + + free_bootmem_cpumask_var(housekeeping_staging); if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) { if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { -- cgit v1.2.3 From 0cd3e59de1f53978873669c7c8225ec13e88c3ae Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 7 Feb 2022 16:59:08 +0100 Subject: sched/isolation: Consolidate error handling Centralize the mask freeing and return value for the error path. This makes potential leaks more visible. 
Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Reviewed-by: Phil Auld Link: https://lore.kernel.org/r/20220207155910.527133-7-frederic@kernel.org --- kernel/sched/isolation.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 23085e665faa..828dacec483e 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -92,12 +92,12 @@ void __init housekeeping_init(void) static int __init housekeeping_setup(char *str, enum hk_flags flags) { cpumask_var_t non_housekeeping_mask, housekeeping_staging; + int err = 0; alloc_bootmem_cpumask_var(&non_housekeeping_mask); if (cpulist_parse(str, non_housekeeping_mask) < 0) { pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n"); - free_bootmem_cpumask_var(non_housekeeping_mask); - return 0; + goto free_non_housekeeping_mask; } alloc_bootmem_cpumask_var(&housekeeping_staging); @@ -119,30 +119,29 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags) } else { if (!cpumask_equal(housekeeping_staging, housekeeping_mask)) { pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); - free_bootmem_cpumask_var(housekeeping_staging); - free_bootmem_cpumask_var(non_housekeeping_mask); - return 0; + goto free_housekeeping_staging; } } - free_bootmem_cpumask_var(housekeeping_staging); - if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) { if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { tick_nohz_full_setup(non_housekeeping_mask); } else { pr_warn("Housekeeping: nohz unsupported." 
" Build with CONFIG_NO_HZ_FULL\n"); - free_bootmem_cpumask_var(non_housekeeping_mask); - return 0; + goto free_housekeeping_staging; } } housekeeping_flags |= flags; + err = 1; +free_housekeeping_staging: + free_bootmem_cpumask_var(housekeeping_staging); +free_non_housekeeping_mask: free_bootmem_cpumask_var(non_housekeeping_mask); - return 1; + return err; } static int __init housekeeping_nohz_full_setup(char *str) -- cgit v1.2.3 From 65e53f869e9f92a23593c66214b88e54fb190a13 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 7 Feb 2022 16:59:09 +0100 Subject: sched/isolation: Fix housekeeping_mask memory leak If "nohz_full=" or "isolcpus=nohz" are called with CONFIG_NO_HZ_FULL=n, housekeeping_mask doesn't get freed despite it being unused if housekeeping_setup() is called for the first time. Check this scenario first to fix this, so that no useless allocation is performed. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Reviewed-by: Paul E. McKenney Reviewed-by: Phil Auld Link: https://lore.kernel.org/r/20220207155910.527133-8-frederic@kernel.org --- kernel/sched/isolation.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 828dacec483e..883eee9fae22 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -94,6 +94,14 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags) cpumask_var_t non_housekeeping_mask, housekeeping_staging; int err = 0; + if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) { + if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) { + pr_warn("Housekeeping: nohz unsupported." 
+ " Build with CONFIG_NO_HZ_FULL\n"); + return 0; + } + } + alloc_bootmem_cpumask_var(&non_housekeeping_mask); if (cpulist_parse(str, non_housekeeping_mask) < 0) { pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n"); @@ -123,15 +131,8 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags) } } - if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) { - if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { - tick_nohz_full_setup(non_housekeeping_mask); - } else { - pr_warn("Housekeeping: nohz unsupported." - " Build with CONFIG_NO_HZ_FULL\n"); - goto free_housekeeping_staging; - } - } + if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) + tick_nohz_full_setup(non_housekeeping_mask); housekeeping_flags |= flags; err = 1; -- cgit v1.2.3 From ed3b362d54f0038cafc985248350d301af7af686 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 7 Feb 2022 16:59:10 +0100 Subject: sched/isolation: Split housekeeping cpumask per isolation features To prepare for supporting each housekeeping feature toward cpuset, split the global housekeeping cpumask per HK_TYPE_* entry. This will later allow, for example, to runtime modify the cpulist passed through "isolcpus=", "nohz_full=" and "rcu_nocbs=" kernel boot parameters. 
Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Reviewed-by: Phil Auld Link: https://lore.kernel.org/r/20220207155910.527133-9-frederic@kernel.org --- kernel/sched/isolation.c | 91 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 62 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 883eee9fae22..b4d10815c45a 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -23,12 +23,17 @@ enum hk_flags { DEFINE_STATIC_KEY_FALSE(housekeeping_overridden); EXPORT_SYMBOL_GPL(housekeeping_overridden); -static cpumask_var_t housekeeping_mask; -static unsigned int housekeeping_flags; + +struct housekeeping { + cpumask_var_t cpumasks[HK_TYPE_MAX]; + unsigned long flags; +}; + +static struct housekeeping housekeeping; bool housekeeping_enabled(enum hk_type type) { - return !!(housekeeping_flags & BIT(type)); + return !!(housekeeping.flags & BIT(type)); } EXPORT_SYMBOL_GPL(housekeeping_enabled); @@ -37,12 +42,12 @@ int housekeeping_any_cpu(enum hk_type type) int cpu; if (static_branch_unlikely(&housekeeping_overridden)) { - if (housekeeping_flags & BIT(type)) { - cpu = sched_numa_find_closest(housekeeping_mask, smp_processor_id()); + if (housekeeping.flags & BIT(type)) { + cpu = sched_numa_find_closest(housekeeping.cpumasks[type], smp_processor_id()); if (cpu < nr_cpu_ids) return cpu; - return cpumask_any_and(housekeeping_mask, cpu_online_mask); + return cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask); } } return smp_processor_id(); @@ -52,8 +57,8 @@ EXPORT_SYMBOL_GPL(housekeeping_any_cpu); const struct cpumask *housekeeping_cpumask(enum hk_type type) { if (static_branch_unlikely(&housekeeping_overridden)) - if (housekeeping_flags & BIT(type)) - return housekeeping_mask; + if (housekeeping.flags & BIT(type)) + return housekeeping.cpumasks[type]; return cpu_possible_mask; } EXPORT_SYMBOL_GPL(housekeeping_cpumask); @@ 
-61,40 +66,53 @@ EXPORT_SYMBOL_GPL(housekeeping_cpumask); void housekeeping_affine(struct task_struct *t, enum hk_type type) { if (static_branch_unlikely(&housekeeping_overridden)) - if (housekeeping_flags & BIT(type)) - set_cpus_allowed_ptr(t, housekeeping_mask); + if (housekeeping.flags & BIT(type)) + set_cpus_allowed_ptr(t, housekeeping.cpumasks[type]); } EXPORT_SYMBOL_GPL(housekeeping_affine); bool housekeeping_test_cpu(int cpu, enum hk_type type) { if (static_branch_unlikely(&housekeeping_overridden)) - if (housekeeping_flags & BIT(type)) - return cpumask_test_cpu(cpu, housekeeping_mask); + if (housekeeping.flags & BIT(type)) + return cpumask_test_cpu(cpu, housekeeping.cpumasks[type]); return true; } EXPORT_SYMBOL_GPL(housekeeping_test_cpu); void __init housekeeping_init(void) { - if (!housekeeping_flags) + enum hk_type type; + + if (!housekeeping.flags) return; static_branch_enable(&housekeeping_overridden); - if (housekeeping_flags & HK_FLAG_TICK) + if (housekeeping.flags & HK_FLAG_TICK) sched_tick_offload_init(); - /* We need at least one CPU to handle housekeeping work */ - WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); + for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) { + /* We need at least one CPU to handle housekeeping work */ + WARN_ON_ONCE(cpumask_empty(housekeeping.cpumasks[type])); + } +} + +static void __init housekeeping_setup_type(enum hk_type type, + cpumask_var_t housekeeping_staging) +{ + + alloc_bootmem_cpumask_var(&housekeeping.cpumasks[type]); + cpumask_copy(housekeeping.cpumasks[type], + housekeeping_staging); } -static int __init housekeeping_setup(char *str, enum hk_flags flags) +static int __init housekeeping_setup(char *str, unsigned long flags) { cpumask_var_t non_housekeeping_mask, housekeeping_staging; int err = 0; - if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) { + if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) { if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) { pr_warn("Housekeeping: nohz 
unsupported." " Build with CONFIG_NO_HZ_FULL\n"); @@ -115,26 +133,41 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags) if (!cpumask_intersects(cpu_present_mask, housekeeping_staging)) { __cpumask_set_cpu(smp_processor_id(), housekeeping_staging); __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); - if (!housekeeping_flags) { + if (!housekeeping.flags) { pr_warn("Housekeeping: must include one present CPU, " "using boot CPU:%d\n", smp_processor_id()); } } - if (!housekeeping_flags) { - alloc_bootmem_cpumask_var(&housekeeping_mask); - cpumask_copy(housekeeping_mask, housekeeping_staging); + if (!housekeeping.flags) { + /* First setup call ("nohz_full=" or "isolcpus=") */ + enum hk_type type; + + for_each_set_bit(type, &flags, HK_TYPE_MAX) + housekeeping_setup_type(type, housekeeping_staging); } else { - if (!cpumask_equal(housekeeping_staging, housekeeping_mask)) { - pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); - goto free_housekeeping_staging; + /* Second setup call ("nohz_full=" after "isolcpus=" or the reverse) */ + enum hk_type type; + unsigned long iter_flags = flags & housekeeping.flags; + + for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) { + if (!cpumask_equal(housekeeping_staging, + housekeeping.cpumasks[type])) { + pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); + goto free_housekeeping_staging; + } } + + iter_flags = flags & ~housekeeping.flags; + + for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) + housekeeping_setup_type(type, housekeeping_staging); } - if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) + if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) tick_nohz_full_setup(non_housekeeping_mask); - housekeeping_flags |= flags; + housekeeping.flags |= flags; err = 1; free_housekeeping_staging: @@ -147,7 +180,7 @@ free_non_housekeeping_mask: static int __init housekeeping_nohz_full_setup(char *str) { - unsigned int flags; + unsigned long flags; flags = 
HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC | HK_FLAG_KTHREAD; @@ -158,7 +191,7 @@ __setup("nohz_full=", housekeeping_nohz_full_setup); static int __init housekeeping_isolcpus_setup(char *str) { - unsigned int flags = 0; + unsigned long flags = 0; bool illegal = false; char *par; int len; -- cgit v1.2.3 From fb7275acd6fb988313dddd8d3d19efa70d9015ad Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 10 Feb 2022 22:55:26 -0500 Subject: locking/lockdep: Iterate lock_classes directly when reading lockdep files When dumping lock_classes information via /proc/lockdep, we can't take the lockdep lock as the lock hold time is indeterminate. Iterating over all_lock_classes without holding the lock can be dangerous as there is a slight chance that it may branch off to other lists, leading to an infinite loop or even an invalid memory access if changes are made to the all_lock_classes list in parallel. To avoid this problem, iteration of lock classes is now done directly on the lock_classes array itself. The lock_classes_in_use bitmap is checked to see if the lock class is being used. To avoid iterating the full array all the time, a new max_lock_class_idx value is added to track the maximum lock_class index that is currently being used. We can theoretically take the lockdep lock for iterating all_lock_classes when other lockdep files (lockdep_stats and lock_stat) are accessed as the lock hold time will be shorter for them. For consistency, they are also modified to iterate the lock_classes array directly.
Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220211035526.1329503-2-longman@redhat.com --- kernel/locking/lockdep.c | 14 +++++++---- kernel/locking/lockdep_internals.h | 6 +++-- kernel/locking/lockdep_proc.c | 51 ++++++++++++++++++++++++++++++++------ 3 files changed, 56 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 2e6892ec3756..50036c10b518 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -183,11 +183,9 @@ static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES); static struct hlist_head lock_keys_hash[KEYHASH_SIZE]; unsigned long nr_lock_classes; unsigned long nr_zapped_classes; -#ifndef CONFIG_DEBUG_LOCKDEP -static -#endif +unsigned long max_lock_class_idx; struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; -static DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS); +DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS); static inline struct lock_class *hlock_class(struct held_lock *hlock) { @@ -338,7 +336,7 @@ static inline void lock_release_holdtime(struct held_lock *hlock) * elements. These elements are linked together by the lock_entry member in * struct lock_class. */ -LIST_HEAD(all_lock_classes); +static LIST_HEAD(all_lock_classes); static LIST_HEAD(free_lock_classes); /** @@ -1252,6 +1250,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) struct lockdep_subclass_key *key; struct hlist_head *hash_head; struct lock_class *class; + int idx; DEBUG_LOCKS_WARN_ON(!irqs_disabled()); @@ -1317,6 +1316,9 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * of classes. 
*/ list_move_tail(&class->lock_entry, &all_lock_classes); + idx = class - lock_classes; + if (idx > max_lock_class_idx) + max_lock_class_idx = idx; if (verbose(class)) { graph_unlock(); @@ -6000,6 +6002,8 @@ static void zap_class(struct pending_free *pf, struct lock_class *class) WRITE_ONCE(class->name, NULL); nr_lock_classes--; __clear_bit(class - lock_classes, lock_classes_in_use); + if (class - lock_classes == max_lock_class_idx) + max_lock_class_idx--; } else { WARN_ONCE(true, "%s() failed for class %s\n", __func__, class->name); diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index ecb8662e7a4e..bbe9000260d0 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -121,7 +121,6 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) -extern struct list_head all_lock_classes; extern struct lock_chain lock_chains[]; #define LOCK_USAGE_CHARS (2*XXX_LOCK_USAGE_STATES + 1) @@ -151,6 +150,10 @@ extern unsigned int nr_large_chain_blocks; extern unsigned int max_lockdep_depth; extern unsigned int max_bfs_queue_depth; +extern unsigned long max_lock_class_idx; + +extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +extern unsigned long lock_classes_in_use[]; #ifdef CONFIG_PROVE_LOCKING extern unsigned long lockdep_count_forward_deps(struct lock_class *); @@ -205,7 +208,6 @@ struct lockdep_stats { }; DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); -extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; #define __debug_atomic_inc(ptr) \ this_cpu_inc(lockdep_stats.ptr); diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index b8d9a050c337..15fdc7fa5c68 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -24,14 +24,33 @@ #include "lockdep_internals.h" +/* + * Since iteration of lock_classes is done without holding the lockdep lock, + * it is not safe to iterate all_lock_classes list 
directly as the iteration + * may branch off to free_lock_classes or the zapped list. Iteration is done + * directly on the lock_classes array by checking the lock_classes_in_use + * bitmap and max_lock_class_idx. + */ +#define iterate_lock_classes(idx, class) \ + for (idx = 0, class = lock_classes; idx <= max_lock_class_idx; \ + idx++, class++) + static void *l_next(struct seq_file *m, void *v, loff_t *pos) { - return seq_list_next(v, &all_lock_classes, pos); + struct lock_class *class = v; + + ++class; + *pos = class - lock_classes; + return (*pos > max_lock_class_idx) ? NULL : class; } static void *l_start(struct seq_file *m, loff_t *pos) { - return seq_list_start_head(&all_lock_classes, *pos); + unsigned long idx = *pos; + + if (idx > max_lock_class_idx) + return NULL; + return lock_classes + idx; } static void l_stop(struct seq_file *m, void *v) @@ -57,14 +76,16 @@ static void print_name(struct seq_file *m, struct lock_class *class) static int l_show(struct seq_file *m, void *v) { - struct lock_class *class = list_entry(v, struct lock_class, lock_entry); + struct lock_class *class = v; struct lock_list *entry; char usage[LOCK_USAGE_CHARS]; + int idx = class - lock_classes; - if (v == &all_lock_classes) { + if (v == lock_classes) seq_printf(m, "all lock classes:\n"); + + if (!test_bit(idx, lock_classes_in_use)) return 0; - } seq_printf(m, "%p", class->key); #ifdef CONFIG_DEBUG_LOCKDEP @@ -220,8 +241,11 @@ static int lockdep_stats_show(struct seq_file *m, void *v) #ifdef CONFIG_PROVE_LOCKING struct lock_class *class; + unsigned long idx; - list_for_each_entry(class, &all_lock_classes, lock_entry) { + iterate_lock_classes(idx, class) { + if (!test_bit(idx, lock_classes_in_use)) + continue; if (class->usage_mask == 0) nr_unused++; @@ -254,6 +278,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) sum_forward_deps += lockdep_count_forward_deps(class); } + #ifdef CONFIG_DEBUG_LOCKDEP DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused); 
#endif @@ -345,6 +370,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v) seq_printf(m, " max bfs queue depth: %11u\n", max_bfs_queue_depth); #endif + seq_printf(m, " max lock class index: %11lu\n", + max_lock_class_idx); lockdep_stats_debug_show(m); seq_printf(m, " debug_locks: %11u\n", debug_locks); @@ -622,12 +649,16 @@ static int lock_stat_open(struct inode *inode, struct file *file) if (!res) { struct lock_stat_data *iter = data->stats; struct seq_file *m = file->private_data; + unsigned long idx; - list_for_each_entry(class, &all_lock_classes, lock_entry) { + iterate_lock_classes(idx, class) { + if (!test_bit(idx, lock_classes_in_use)) + continue; iter->class = class; iter->stats = lock_stats(class); iter++; } + data->iter_end = iter; sort(data->stats, data->iter_end - data->stats, @@ -645,6 +676,7 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct lock_class *class; + unsigned long idx; char c; if (count) { @@ -654,8 +686,11 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf, if (c != '0') return count; - list_for_each_entry(class, &all_lock_classes, lock_entry) + iterate_lock_classes(idx, class) { + if (!test_bit(idx, lock_classes_in_use)) + continue; clear_lock_stats(class); + } } return count; } -- cgit v1.2.3 From adb8fa195efdfaac5852aaac24810b456ce43b04 Mon Sep 17 00:00:00 2001 From: Mauricio Vásquez Date: Tue, 15 Feb 2022 17:58:50 -0500 Subject: libbpf: Split bpf_core_apply_relo() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BTFGen needs to run the core relocation logic in order to understand what are the types involved in a given relocation. Currently bpf_core_apply_relo() calculates and **applies** a relocation to an instruction. Having both operations in the same function makes it difficult to only calculate the relocation without patching the instruction. 
This commit splits that logic into two distinct phases: (1) calculate the relocation and (2) patch the instruction. For the first phase, bpf_core_apply_relo_insn() is renamed to bpf_core_calc_relo_insn(), which is now only in charge of calculating the relocation; the second phase uses the already existing bpf_core_patch_insn(). bpf_object__relocate_core() uses both of them and the BTFGen will use only bpf_core_calc_relo_insn(). Signed-off-by: Mauricio Vásquez Signed-off-by: Rafael David Tinoco Signed-off-by: Lorenzo Fontana Signed-off-by: Leonardo Di Donato Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220215225856.671072-2-mauricio@kinvolk.io --- kernel/bpf/btf.c | 13 ++++++-- tools/lib/bpf/libbpf.c | 71 +++++++++++++++++++++++------------------- tools/lib/bpf/relo_core.c | 79 ++++++++++++++--------------------------------- tools/lib/bpf/relo_core.h | 42 +++++++++++++++++++++---- 4 files changed, 109 insertions(+), 96 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 11740b300de9..f1d3d2a2f5f6 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7225,6 +7225,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, { bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL; struct bpf_core_cand_list cands = {}; + struct bpf_core_relo_res targ_res; struct bpf_core_spec *specs; int err; @@ -7264,13 +7265,19 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, cands.len = cc->cnt; /* cand_cache_mutex needs to span the cache lookup and * copy of btf pointer into bpf_core_cand_list, - * since module can be unloaded while bpf_core_apply_relo_insn + * since module can be unloaded while bpf_core_calc_relo_insn * is working with module's btf.
*/ } - err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8, - relo, relo_idx, ctx->btf, &cands, specs); + err = bpf_core_calc_relo_insn((void *)ctx->log, relo, relo_idx, ctx->btf, &cands, specs, + &targ_res); + if (err) + goto out; + + err = bpf_core_patch_insn((void *)ctx->log, insn, relo->insn_off / 8, relo, relo_idx, + &targ_res); + out: kfree(specs); if (need_cands) { diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 2262bcdfee92..d3c457fb045e 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5530,11 +5530,12 @@ static int record_relo_core(struct bpf_program *prog, return 0; } -static int bpf_core_apply_relo(struct bpf_program *prog, - const struct bpf_core_relo *relo, - int relo_idx, - const struct btf *local_btf, - struct hashmap *cand_cache) +static int bpf_core_resolve_relo(struct bpf_program *prog, + const struct bpf_core_relo *relo, + int relo_idx, + const struct btf *local_btf, + struct hashmap *cand_cache, + struct bpf_core_relo_res *targ_res) { struct bpf_core_spec specs_scratch[3] = {}; const void *type_key = u32_as_hash_key(relo->type_id); @@ -5543,20 +5544,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, const struct btf_type *local_type; const char *local_name; __u32 local_id = relo->type_id; - struct bpf_insn *insn; - int insn_idx, err; - - if (relo->insn_off % BPF_INSN_SZ) - return -EINVAL; - insn_idx = relo->insn_off / BPF_INSN_SZ; - /* adjust insn_idx from section frame of reference to the local - * program's frame of reference; (sub-)program code is not yet - * relocated, so it's enough to just subtract in-section offset - */ - insn_idx = insn_idx - prog->sec_insn_off; - if (insn_idx >= prog->insns_cnt) - return -EINVAL; - insn = &prog->insns[insn_idx]; + int err; local_type = btf__type_by_id(local_btf, local_id); if (!local_type) @@ -5566,15 +5554,6 @@ static int bpf_core_apply_relo(struct bpf_program *prog, if (!local_name) return -EINVAL; - if (prog->obj->gen_loader) { - 
const char *spec_str = btf__name_by_offset(local_btf, relo->access_str_off); - - pr_debug("record_relo_core: prog %td insn[%d] %s %s %s final insn_idx %d\n", - prog - prog->obj->programs, relo->insn_off / 8, - btf_kind_str(local_type), local_name, spec_str, insn_idx); - return record_relo_core(prog, relo, insn_idx); - } - if (relo->kind != BPF_CORE_TYPE_ID_LOCAL && !hashmap__find(cand_cache, type_key, (void **)&cands)) { cands = bpf_core_find_cands(prog->obj, local_btf, local_id); @@ -5591,19 +5570,21 @@ static int bpf_core_apply_relo(struct bpf_program *prog, } } - return bpf_core_apply_relo_insn(prog_name, insn, insn_idx, relo, - relo_idx, local_btf, cands, specs_scratch); + return bpf_core_calc_relo_insn(prog_name, relo, relo_idx, local_btf, cands, specs_scratch, + targ_res); } static int bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path) { const struct btf_ext_info_sec *sec; + struct bpf_core_relo_res targ_res; const struct bpf_core_relo *rec; const struct btf_ext_info *seg; struct hashmap_entry *entry; struct hashmap *cand_cache = NULL; struct bpf_program *prog; + struct bpf_insn *insn; const char *sec_name; int i, err = 0, insn_idx, sec_idx; @@ -5654,6 +5635,8 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path) sec_name, sec->num_info); for_each_btf_ext_rec(seg, sec, i, rec) { + if (rec->insn_off % BPF_INSN_SZ) + return -EINVAL; insn_idx = rec->insn_off / BPF_INSN_SZ; prog = find_prog_by_sec_insn(obj, sec_idx, insn_idx); if (!prog) { @@ -5668,12 +5651,38 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path) if (!prog->load) continue; - err = bpf_core_apply_relo(prog, rec, i, obj->btf, cand_cache); + /* adjust insn_idx from section frame of reference to the local + * program's frame of reference; (sub-)program code is not yet + * relocated, so it's enough to just subtract in-section offset + */ + insn_idx = insn_idx - prog->sec_insn_off; + if (insn_idx >= prog->insns_cnt) + return 
-EINVAL; + insn = &prog->insns[insn_idx]; + + if (prog->obj->gen_loader) { + err = record_relo_core(prog, rec, insn_idx); + if (err) { + pr_warn("prog '%s': relo #%d: failed to record relocation: %d\n", + prog->name, i, err); + goto out; + } + continue; + } + + err = bpf_core_resolve_relo(prog, rec, i, obj->btf, cand_cache, &targ_res); if (err) { pr_warn("prog '%s': relo #%d: failed to relocate: %d\n", prog->name, i, err); goto out; } + + err = bpf_core_patch_insn(prog->name, insn, insn_idx, rec, i, &targ_res); + if (err) { + pr_warn("prog '%s': relo #%d: failed to patch insn #%u: %d\n", + prog->name, i, insn_idx, err); + goto out; + } } } diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c index 910865e29edc..f946f23eab20 100644 --- a/tools/lib/bpf/relo_core.c +++ b/tools/lib/bpf/relo_core.c @@ -775,31 +775,6 @@ static int bpf_core_calc_enumval_relo(const struct bpf_core_relo *relo, return 0; } -struct bpf_core_relo_res -{ - /* expected value in the instruction, unless validate == false */ - __u32 orig_val; - /* new value that needs to be patched up to */ - __u32 new_val; - /* relocation unsuccessful, poison instruction, but don't fail load */ - bool poison; - /* some relocations can't be validated against orig_val */ - bool validate; - /* for field byte offset relocations or the forms: - * *(T *)(rX + ) = rY - * rX = *(T *)(rY + ), - * we remember original and resolved field size to adjust direct - * memory loads of pointers and integers; this is necessary for 32-bit - * host kernel architectures, but also allows to automatically - * relocate fields that were resized from, e.g., u32 to u64, etc. - */ - bool fail_memsz_adjust; - __u32 orig_sz; - __u32 orig_type_id; - __u32 new_sz; - __u32 new_type_id; -}; - /* Calculate original and target relocation values, given local and target * specs and relocation kind. These values are calculated for each candidate. 
* If there are multiple candidates, resulting values should all be consistent @@ -951,9 +926,9 @@ static int insn_bytes_to_bpf_size(__u32 sz) * 5. *(T *)(rX + ) = rY, where T is one of {u8, u16, u32, u64}; * 6. *(T *)(rX + ) = , where T is one of {u8, u16, u32, u64}. */ -static int bpf_core_patch_insn(const char *prog_name, struct bpf_insn *insn, - int insn_idx, const struct bpf_core_relo *relo, - int relo_idx, const struct bpf_core_relo_res *res) +int bpf_core_patch_insn(const char *prog_name, struct bpf_insn *insn, + int insn_idx, const struct bpf_core_relo *relo, + int relo_idx, const struct bpf_core_relo_res *res) { __u32 orig_val, new_val; __u8 class; @@ -1128,7 +1103,7 @@ static void bpf_core_dump_spec(const char *prog_name, int level, const struct bp } /* - * CO-RE relocate single instruction. + * Calculate CO-RE relocation target result. * * The outline and important points of the algorithm: * 1. For given local type, find corresponding candidate target types. @@ -1177,18 +1152,18 @@ static void bpf_core_dump_spec(const char *prog_name, int level, const struct bp * between multiple relocations for the same type ID and is updated as some * of the candidates are pruned due to structural incompatibility. 
*/ -int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, - int insn_idx, - const struct bpf_core_relo *relo, - int relo_idx, - const struct btf *local_btf, - struct bpf_core_cand_list *cands, - struct bpf_core_spec *specs_scratch) +int bpf_core_calc_relo_insn(const char *prog_name, + const struct bpf_core_relo *relo, + int relo_idx, + const struct btf *local_btf, + struct bpf_core_cand_list *cands, + struct bpf_core_spec *specs_scratch, + struct bpf_core_relo_res *targ_res) { struct bpf_core_spec *local_spec = &specs_scratch[0]; struct bpf_core_spec *cand_spec = &specs_scratch[1]; struct bpf_core_spec *targ_spec = &specs_scratch[2]; - struct bpf_core_relo_res cand_res, targ_res; + struct bpf_core_relo_res cand_res; const struct btf_type *local_type; const char *local_name; __u32 local_id; @@ -1223,12 +1198,12 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, /* TYPE_ID_LOCAL relo is special and doesn't need candidate search */ if (relo->kind == BPF_CORE_TYPE_ID_LOCAL) { /* bpf_insn's imm value could get out of sync during linking */ - memset(&targ_res, 0, sizeof(targ_res)); - targ_res.validate = false; - targ_res.poison = false; - targ_res.orig_val = local_spec->root_type_id; - targ_res.new_val = local_spec->root_type_id; - goto patch_insn; + memset(targ_res, 0, sizeof(*targ_res)); + targ_res->validate = false; + targ_res->poison = false; + targ_res->orig_val = local_spec->root_type_id; + targ_res->new_val = local_spec->root_type_id; + return 0; } /* libbpf doesn't support candidate search for anonymous types */ @@ -1262,7 +1237,7 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, return err; if (j == 0) { - targ_res = cand_res; + *targ_res = cand_res; *targ_spec = *cand_spec; } else if (cand_spec->bit_offset != targ_spec->bit_offset) { /* if there are many field relo candidates, they @@ -1272,7 +1247,8 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, 
prog_name, relo_idx, cand_spec->bit_offset, targ_spec->bit_offset); return -EINVAL; - } else if (cand_res.poison != targ_res.poison || cand_res.new_val != targ_res.new_val) { + } else if (cand_res.poison != targ_res->poison || + cand_res.new_val != targ_res->new_val) { /* all candidates should result in the same relocation * decision and value, otherwise it's dangerous to * proceed due to ambiguity @@ -1280,7 +1256,7 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, pr_warn("prog '%s': relo #%d: relocation decision ambiguity: %s %u != %s %u\n", prog_name, relo_idx, cand_res.poison ? "failure" : "success", cand_res.new_val, - targ_res.poison ? "failure" : "success", targ_res.new_val); + targ_res->poison ? "failure" : "success", targ_res->new_val); return -EINVAL; } @@ -1314,19 +1290,10 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, prog_name, relo_idx); /* calculate single target relo result explicitly */ - err = bpf_core_calc_relo(prog_name, relo, relo_idx, local_spec, NULL, &targ_res); + err = bpf_core_calc_relo(prog_name, relo, relo_idx, local_spec, NULL, targ_res); if (err) return err; } -patch_insn: - /* bpf_core_patch_insn() should know how to handle missing targ_spec */ - err = bpf_core_patch_insn(prog_name, insn, insn_idx, relo, relo_idx, &targ_res); - if (err) { - pr_warn("prog '%s': relo #%d: failed to patch insn #%u: %d\n", - prog_name, relo_idx, relo->insn_off / 8, err); - return -EINVAL; - } - return 0; } diff --git a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h index 17799819ad7c..a28bf3711ce2 100644 --- a/tools/lib/bpf/relo_core.h +++ b/tools/lib/bpf/relo_core.h @@ -44,14 +44,44 @@ struct bpf_core_spec { __u32 bit_offset; }; -int bpf_core_apply_relo_insn(const char *prog_name, - struct bpf_insn *insn, int insn_idx, - const struct bpf_core_relo *relo, int relo_idx, - const struct btf *local_btf, - struct bpf_core_cand_list *cands, - struct bpf_core_spec *specs_scratch); +struct 
bpf_core_relo_res { + /* expected value in the instruction, unless validate == false */ + __u32 orig_val; + /* new value that needs to be patched up to */ + __u32 new_val; + /* relocation unsuccessful, poison instruction, but don't fail load */ + bool poison; + /* some relocations can't be validated against orig_val */ + bool validate; + /* for field byte offset relocations or the forms: + * *(T *)(rX + ) = rY + * rX = *(T *)(rY + ), + * we remember original and resolved field size to adjust direct + * memory loads of pointers and integers; this is necessary for 32-bit + * host kernel architectures, but also allows to automatically + * relocate fields that were resized from, e.g., u32 to u64, etc. + */ + bool fail_memsz_adjust; + __u32 orig_sz; + __u32 orig_type_id; + __u32 new_sz; + __u32 new_type_id; +}; + int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id); size_t bpf_core_essential_name_len(const char *name); + +int bpf_core_calc_relo_insn(const char *prog_name, + const struct bpf_core_relo *relo, int relo_idx, + const struct btf *local_btf, + struct bpf_core_cand_list *cands, + struct bpf_core_spec *specs_scratch, + struct bpf_core_relo_res *targ_res); + +int bpf_core_patch_insn(const char *prog_name, struct bpf_insn *insn, + int insn_idx, const struct bpf_core_relo *relo, + int relo_idx, const struct bpf_core_relo_res *res); + #endif -- cgit v1.2.3 From 3f51aa9e296fe4af785d5761bb12556fb2494761 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Wed, 9 Feb 2022 19:29:51 +0800 Subject: PM: hibernate: fix load_image_and_restore() error path As 'swsusp_check' open 'hib_resume_bdev', if call 'create_basic_memory_bitmaps' failed, we need to close 'hib_resume_bdev' in 'load_image_and_restore' function. Signed-off-by: Ye Bin [ rjw: Subject ] Signed-off-by: Rafael J. 
Wysocki --- kernel/power/hibernate.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e6af502c2fd7..49d1df0218cb 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -689,8 +689,10 @@ static int load_image_and_restore(void) lock_device_hotplug(); error = create_basic_memory_bitmaps(); - if (error) + if (error) { + swsusp_close(FMODE_READ | FMODE_EXCL); goto Unlock; + } error = swsusp_read(&flags); swsusp_close(FMODE_READ | FMODE_EXCL); -- cgit v1.2.3 From 45ce4b4f9009102cd9f581196d480a59208690c1 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 17 Feb 2022 01:49:43 +0530 Subject: bpf: Fix crash due to out of bounds access into reg2btf_ids. When commit e6ac2450d6de ("bpf: Support bpf program calling kernel function") added kfunc support, it defined reg2btf_ids as a cheap way to translate the verifier reg type to the appropriate btf_vmlinux BTF ID, however commit c25b2ae13603 ("bpf: Replace PTR_TO_XXX_OR_NULL with PTR_TO_XXX | PTR_MAYBE_NULL") moved the __BPF_REG_TYPE_MAX from the last member of bpf_reg_type enum to after the base register types, and defined other variants using type flag composition. However, now, the direct usage of reg->type to index into reg2btf_ids may no longer fall into __BPF_REG_TYPE_MAX range, and hence lead to out of bounds access and kernel crash on dereference of bad pointer. 
Fixes: c25b2ae13603 ("bpf: Replace PTR_TO_XXX_OR_NULL with PTR_TO_XXX | PTR_MAYBE_NULL") Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220216201943.624869-1-memxor@gmail.com --- kernel/bpf/btf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index e16dafeb2450..3e23b3fa79ff 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5688,7 +5688,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, } if (check_ptr_off_reg(env, reg, regno)) return -EINVAL; - } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID || reg2btf_ids[reg->type])) { + } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID || + (reg2btf_ids[base_type(reg->type)] && !type_flag(reg->type)))) { const struct btf_type *reg_ref_t; const struct btf *reg_btf; const char *reg_ref_tname; @@ -5706,7 +5707,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, reg_ref_id = reg->btf_id; } else { reg_btf = btf_vmlinux; - reg_ref_id = *reg2btf_ids[reg->type]; + reg_ref_id = *reg2btf_ids[base_type(reg->type)]; } reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, -- cgit v1.2.3 From a8e8f851e8299703a005cf23dfb9ec854a2611e5 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 15 Feb 2022 13:11:42 -0800 Subject: module: fix building with sysfs disabled Sysfs support might be disabled so we need to guard the code that instantiates "compression" attribute with an #ifdef. 
Fixes: b1ae6dc41eaa ("module: add in-kernel support for decompressing") Reported-by: kernel test robot Signed-off-by: Dmitry Torokhov Signed-off-by: Luis Chamberlain --- kernel/module_decompress.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/module_decompress.c b/kernel/module_decompress.c index b01c69c2ff99..ffef98a20320 100644 --- a/kernel/module_decompress.c +++ b/kernel/module_decompress.c @@ -250,6 +250,7 @@ void module_decompress_cleanup(struct load_info *info) info->max_pages = info->used_pages = 0; } +#ifdef CONFIG_SYSFS static ssize_t compression_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -269,3 +270,4 @@ static int __init module_decompress_sysfs_init(void) return 0; } late_initcall(module_decompress_sysfs_init); +#endif -- cgit v1.2.3 From c16bdeb5a39ffa3f32b32f812831a2092d2a3061 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 11 Feb 2022 13:57:44 -0600 Subject: rlimit: Fix RLIMIT_NPROC enforcement failure caused by capability calls in set_user Solar Designer wrote: > I'm not aware of anyone actually running into this issue and reporting > it. The systems that I personally know use suexec along with rlimits > still run older/distro kernels, so would not yet be affected. > > So my mention was based on my understanding of how suexec works, and > code review. Specifically, Apache httpd has the setting RLimitNPROC, > which makes it set RLIMIT_NPROC: > > https://httpd.apache.org/docs/2.4/mod/core.html#rlimitnproc > > The above documentation for it includes: > > "This applies to processes forked from Apache httpd children servicing > requests, not the Apache httpd children themselves. This includes CGI > scripts and SSI exec commands, but not any processes forked from the > Apache httpd parent, such as piped logs." 
> > In code, there are: > > ./modules/generators/mod_cgid.c: ( (cgid_req.limits.limit_nproc_set) && ((rc = apr_procattr_limit_set(procattr, APR_LIMIT_NPROC, > ./modules/generators/mod_cgi.c: ((rc = apr_procattr_limit_set(procattr, APR_LIMIT_NPROC, > ./modules/filters/mod_ext_filter.c: rv = apr_procattr_limit_set(procattr, APR_LIMIT_NPROC, conf->limit_nproc); > > For example, in mod_cgi.c this is in run_cgi_child(). > > I think this means an httpd child sets RLIMIT_NPROC shortly before it > execs suexec, which is a SUID root program. suexec then switches to the > target user and execs the CGI script. > > Before 2863643fb8b9, the setuid() in suexec would set the flag, and the > target user's process count would be checked against RLIMIT_NPROC on > execve(). After 2863643fb8b9, the setuid() in suexec wouldn't set the > flag because setuid() is (naturally) called when the process is still > running as root (thus, has those limits bypass capabilities), and > accordingly execve() would not check the target user's process count > against RLIMIT_NPROC. In commit 2863643fb8b9 ("set_user: add capability check when rlimit(RLIMIT_NPROC) exceeds") capable calls were added to set_user to make it more consistent with fork. Unfortunately because of call site differences those capable calls were checking the credentials of the user before set*id() instead of after set*id(). This breaks enforcement of RLIMIT_NPROC for applications that set the rlimit and then call set*id() while holding a full set of capabilities. The capabilities are only changed in the new credential in security_task_fix_setuid(). The code in apache suexec appears to follow this pattern. Commit 909cc4ae86f3 ("[PATCH] Fix two bugs with process limits (RLIMIT_NPROC)") where this check was added describes the targets of this capability check as: 2/ When a root-owned process (e.g. 
cgiwrap) sets up process limits and then calls setuid, the setuid should fail if the user would then be running more than rlim_cur[RLIMIT_NPROC] processes, but it doesn't. This patch adds an appropriate test. With this patch, and per-user process limit imposed in cgiwrap really works. So the original use case of this check also appears to match the broken pattern. Restore the enforcement of RLIMIT_NPROC by removing the bad capable checks added in set_user. This unfortunately restores the inconsistent state the code has been in for the last 11 years, but dealing with the inconsistencies looks like a larger problem. Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20210907213042.GA22626@openwall.com/ Link: https://lkml.kernel.org/r/20220212221412.GA29214@openwall.com Link: https://lkml.kernel.org/r/20220216155832.680775-1-ebiederm@xmission.com Fixes: 2863643fb8b9 ("set_user: add capability check when rlimit(RLIMIT_NPROC) exceeds") History-Tree: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git Reviewed-by: Solar Designer Signed-off-by: "Eric W. Biederman" --- kernel/sys.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index ecc4cf019242..8dd938a3d2bf 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -480,8 +480,7 @@ static int set_user(struct cred *new) * failure to the execve() stage. */ if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) && - new_user != INIT_USER && - !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) + new_user != INIT_USER) current->flags |= PF_NPROC_EXCEEDED; else current->flags &= ~PF_NPROC_EXCEEDED; -- cgit v1.2.3 From 8f2f9c4d82f24f172ae439e5035fc1e0e4c229dd Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Wed, 9 Feb 2022 20:03:19 -0600 Subject: ucounts: Enforce RLIMIT_NPROC not RLIMIT_NPROC+1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Michal Koutný wrote: > It was reported that v5.14 behaves differently when enforcing > RLIMIT_NPROC limit, namely, it allows one more task than previously. > This is consequence of the commit 21d1c5e386bc ("Reimplement > RLIMIT_NPROC on top of ucounts") that missed the sharpness of > equality in the forking path. This can be fixed either by fixing the test or by moving the increment to be before the test. Fix it by moving copy_creds which contains the increment before is_ucounts_overlimit. In the case of CLONE_NEWUSER the ucounts in the task_cred changes. The function is_ucounts_overlimit needs to use the final version of the ucounts for the new process. Which means moving the is_ucounts_overlimit test after copy_creds is necessary. Both the test in fork and the test in set_user were semantically changed when the code moved to ucounts. The change of the test in fork was bad because it was before the increment. The test in set_user was wrong and the change to ucounts fixed it. So this fix only restores the old behavior in one location not two. Link: https://lkml.kernel.org/r/20220204181144.24462-1-mkoutny@suse.com Link: https://lkml.kernel.org/r/20220216155832.680775-2-ebiederm@xmission.com Cc: stable@vger.kernel.org Reported-by: Michal Koutný Reviewed-by: Michal Koutný Fixes: 21d1c5e386bc ("Reimplement RLIMIT_NPROC on top of ucounts") Signed-off-by: "Eric W. 
Biederman" --- kernel/fork.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index d75a528f7b21..17d8a8c85e3b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2021,18 +2021,18 @@ static __latent_entropy struct task_struct *copy_process( #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif + retval = copy_creds(p, clone_flags); + if (retval < 0) + goto bad_fork_free; + retval = -EAGAIN; if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { if (p->real_cred->user != INIT_USER && !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) - goto bad_fork_free; + goto bad_fork_cleanup_count; } current->flags &= ~PF_NPROC_EXCEEDED; - retval = copy_creds(p, clone_flags); - if (retval < 0) - goto bad_fork_free; - /* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there -- cgit v1.2.3 From a55d07294f1e9b576093bdfa95422f8119941e83 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 9 Feb 2022 16:22:20 -0600 Subject: ucounts: Base set_cred_ucounts changes on the real user MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Michal Koutný wrote: > Tasks are associated to multiple users at once. Historically and as per > setrlimit(2) RLIMIT_NPROC is enforce based on real user ID. > > The commit 21d1c5e386bc ("Reimplement RLIMIT_NPROC on top of ucounts") > made the accounting structure "indexed" by euid and hence potentially > account tasks differently. > > The effective user ID may be different e.g. for setuid programs but > those are exec'd into already existing task (i.e. below limit), so > different accounting is moot. > > Some special setresuid(2) users may notice the difference, justifying > this fix. I looked at cred->ucount and it is only used for rlimit operations that were previously stored in cred->user. 
Making the fact cred->ucount can refer to a different user from cred->user a bug, affecting all uses of cred->ulimit not just RLIMIT_NPROC. Fix set_cred_ucounts to always use the real uid not the effective uid. Further simplify set_cred_ucounts by noticing that set_cred_ucounts somehow retained a draft version of the check to see if alloc_ucounts was needed that checks the new->user and new->user_ns against the current_real_cred(). Remove that draft version of the check. All that matters for setting the cred->ucounts are the user_ns and uid fields in the cred. Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20220207121800.5079-4-mkoutny@suse.com Link: https://lkml.kernel.org/r/20220216155832.680775-3-ebiederm@xmission.com Reported-by: Michal Koutný Reviewed-by: Michal Koutný Fixes: 21d1c5e386bc ("Reimplement RLIMIT_NPROC on top of ucounts") Signed-off-by: "Eric W. Biederman" --- kernel/cred.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 473d17c431f3..933155c96922 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -665,21 +665,16 @@ EXPORT_SYMBOL(cred_fscmp); int set_cred_ucounts(struct cred *new) { - struct task_struct *task = current; - const struct cred *old = task->real_cred; struct ucounts *new_ucounts, *old_ucounts = new->ucounts; - if (new->user == old->user && new->user_ns == old->user_ns) - return 0; - /* * This optimization is needed because alloc_ucounts() uses locks * for table lookups. */ - if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid)) + if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->uid)) return 0; - if (!(new_ucounts = alloc_ucounts(new->user_ns, new->euid))) + if (!(new_ucounts = alloc_ucounts(new->user_ns, new->uid))) return -EAGAIN; new->ucounts = new_ucounts; -- cgit v1.2.3 From c923a8e7edb010da67424077cbf1a6f1396ebd2e Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Mon, 14 Feb 2022 09:40:25 -0600 Subject: ucounts: Move RLIMIT_NPROC handling after set_user During set*id() which cred->ucounts to charge the current process to is not known until after set_cred_ucounts. So move the RLIMIT_NPROC checking into a new helper flag_nproc_exceeded and call flag_nproc_exceeded after set_cred_ucounts. This is very much an arbitrary subset of the places where we currently change the RLIMIT_NPROC accounting, designed to preserve the existing logic. Fixing the existing logic will be the subject of another series of changes. Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20220216155832.680775-4-ebiederm@xmission.com Fixes: 21d1c5e386bc ("Reimplement RLIMIT_NPROC on top of ucounts") Signed-off-by: "Eric W. Biederman" --- kernel/sys.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 8dd938a3d2bf..97dc9e5d6bf9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -472,6 +472,16 @@ static int set_user(struct cred *new) if (!new_user) return -EAGAIN; + free_uid(new->user); + new->user = new_user; + return 0; +} + +static void flag_nproc_exceeded(struct cred *new) +{ + if (new->ucounts == current_ucounts()) + return; + /* * We don't fail in case of NPROC limit excess here because too many * poorly written programs don't check set*uid() return code, assuming @@ -480,14 +490,10 @@ static int set_user(struct cred *new) * failure to the execve() stage. 
*/ if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) && - new_user != INIT_USER) + new->user != INIT_USER) current->flags |= PF_NPROC_EXCEEDED; else current->flags &= ~PF_NPROC_EXCEEDED; - - free_uid(new->user); - new->user = new_user; - return 0; } /* @@ -562,6 +568,7 @@ long __sys_setreuid(uid_t ruid, uid_t euid) if (retval < 0) goto error; + flag_nproc_exceeded(new); return commit_creds(new); error: @@ -624,6 +631,7 @@ long __sys_setuid(uid_t uid) if (retval < 0) goto error; + flag_nproc_exceeded(new); return commit_creds(new); error: @@ -703,6 +711,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) if (retval < 0) goto error; + flag_nproc_exceeded(new); return commit_creds(new); error: -- cgit v1.2.3 From 0cbae9e24fa7d6c6e9f828562f084da82217a0c5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 9 Feb 2022 18:09:41 -0600 Subject: ucounts: Handle wrapping in is_ucounts_overlimit While examining is_ucounts_overlimit and reading the various messages I realized that is_ucounts_overlimit fails to deal with counts that may have wrapped. Being wrapped should be a transitory state for counts and they should never be wrapped for long, but it can happen so handle it. Cc: stable@vger.kernel.org Fixes: 21d1c5e386bc ("Reimplement RLIMIT_NPROC on top of ucounts") Link: https://lkml.kernel.org/r/20220216155832.680775-5-ebiederm@xmission.com Reviewed-by: Shuah Khan Signed-off-by: "Eric W. 
Biederman" --- kernel/ucount.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ucount.c b/kernel/ucount.c index 65b597431c86..06ea04d44685 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -350,7 +350,8 @@ bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsign if (rlimit > LONG_MAX) max = LONG_MAX; for (iter = ucounts; iter; iter = iter->ns->ucounts) { - if (get_ucounts_value(iter, type) > max) + long val = get_ucounts_value(iter, type); + if (val < 0 || val > max) return true; max = READ_ONCE(iter->ns->ucount_max[type]); } -- cgit v1.2.3 From cea86fe246b694a191804b47378eb9d77aefabec Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Feb 2022 18:26:39 -0800 Subject: mm/munlock: rmap call mlock_vma_page() munlock_vma_page() Add vma argument to mlock_vma_page() and munlock_vma_page(), make them inline functions which check (vma->vm_flags & VM_LOCKED) before calling mlock_page() and munlock_page() in mm/mlock.c. Add bool compound to mlock_vma_page() and munlock_vma_page(): this is because we have understandable difficulty in accounting pte maps of THPs, and if passed a PageHead page, mlock_page() and munlock_page() cannot tell whether it's a pmd map to be counted or a pte map to be ignored. Add vma arg to page_add_file_rmap() and page_remove_rmap(), like the others, and use that to call mlock_vma_page() at the end of the page adds, and munlock_vma_page() at the end of page_remove_rmap() (end or beginning? unimportant, but end was easier for assertions in testing). No page lock is required (although almost all adds happen to hold it): delete the "Serialize with page migration" BUG_ON(!PageLocked(page))s. Certainly page lock did serialize with page migration, but I'm having difficulty explaining why that was ever important. 
Mlock accounting on THPs has been hard to define, differed between anon and file, involved PageDoubleMap in some places and not others, required clear_page_mlock() at some points. Keep it simple now: just count the pmds and ignore the ptes, there is no reason for ptes to undo pmd mlocks. page_add_new_anon_rmap() callers unchanged: they have long been calling lru_cache_add_inactive_or_unevictable(), which does its own VM_LOCKED handling (it also checks for not VM_SPECIAL: I think that's overcautious, and inconsistent with other checks, that mmap_region() already prevents VM_LOCKED on VM_SPECIAL; but haven't quite convinced myself to change it). Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/rmap.h | 17 ++++++++------- kernel/events/uprobes.c | 7 ++----- mm/huge_memory.c | 17 +++++++-------- mm/hugetlb.c | 4 ++-- mm/internal.h | 36 ++++++++++++++++++++++++++----- mm/khugepaged.c | 4 ++-- mm/ksm.c | 12 +---------- mm/memory.c | 45 +++++++++++++-------------------------- mm/migrate.c | 9 ++------ mm/mlock.c | 21 +++++++------------ mm/rmap.c | 56 +++++++++++++++++++++++-------------------------- mm/userfaultfd.c | 14 +++++++------ 12 files changed, 113 insertions(+), 129 deletions(-) (limited to 'kernel') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index dc48aa8c2c94..ac29b076082b 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -167,18 +167,19 @@ struct anon_vma *page_get_anon_vma(struct page *page); */ void page_move_anon_rmap(struct page *, struct vm_area_struct *); void page_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long, bool); + unsigned long address, bool compound); void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long, int); + unsigned long address, int flags); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long, bool); -void page_add_file_rmap(struct page *, bool); -void 
page_remove_rmap(struct page *, bool); - + unsigned long address, bool compound); +void page_add_file_rmap(struct page *, struct vm_area_struct *, + bool compound); +void page_remove_rmap(struct page *, struct vm_area_struct *, + bool compound); void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long); + unsigned long address); void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long); + unsigned long address); static inline void page_dup_rmap(struct page *page, bool compound) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6357c3580d07..eed2f7437d96 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -173,7 +173,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, return err; } - /* For try_to_free_swap() and munlock_vma_page() below */ + /* For try_to_free_swap() below */ lock_page(old_page); mmu_notifier_invalidate_range_start(&range); @@ -201,13 +201,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, set_pte_at_notify(mm, addr, pvmw.pte, mk_pte(new_page, vma->vm_page_prot)); - page_remove_rmap(old_page, false); + page_remove_rmap(old_page, vma, false); if (!page_mapped(old_page)) try_to_free_swap(old_page); page_vma_mapped_walk_done(&pvmw); - - if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page)) - munlock_vma_page(old_page); put_page(old_page); err = 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9a34b85ebcf8..d6477f48a27e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1577,7 +1577,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (pmd_present(orig_pmd)) { page = pmd_page(orig_pmd); - page_remove_rmap(page, true); + page_remove_rmap(page, vma, true); VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); VM_BUG_ON_PAGE(!PageHead(page), page); } else if (thp_migration_supported()) { @@ -1962,7 +1962,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct 
*vma, pmd_t *pmd, set_page_dirty(page); if (!PageReferenced(page) && pmd_young(old_pmd)) SetPageReferenced(page); - page_remove_rmap(page, true); + page_remove_rmap(page, vma, true); put_page(page); } add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); @@ -2096,6 +2096,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } } unlock_page_memcg(page); + + /* Above is effectively page_remove_rmap(page, vma, true) */ + munlock_vma_page(page, vma, true); } smp_wmb(); /* make pte visible before pmd */ @@ -2103,7 +2106,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (freeze) { for (i = 0; i < HPAGE_PMD_NR; i++) { - page_remove_rmap(page + i, false); + page_remove_rmap(page + i, vma, false); put_page(page + i); } } @@ -2163,8 +2166,6 @@ repeat: do_unlock_page = true; } } - if (PageMlocked(page)) - clear_page_mlock(page); } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) goto out; __split_huge_pmd_locked(vma, pmd, range.start, freeze); @@ -3138,7 +3139,7 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, if (pmd_soft_dirty(pmdval)) pmdswp = pmd_swp_mksoft_dirty(pmdswp); set_pmd_at(mm, address, pvmw->pmd, pmdswp); - page_remove_rmap(page, true); + page_remove_rmap(page, vma, true); put_page(page); } @@ -3168,10 +3169,8 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) if (PageAnon(new)) page_add_anon_rmap(new, vma, mmun_start, true); else - page_add_file_rmap(new, true); + page_add_file_rmap(new, vma, true); set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); - if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new)) - mlock_vma_page(new); update_mmu_cache_pmd(vma, address, pvmw->pmd); } #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 61895cc01d09..43fb3155298e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5014,7 +5014,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct set_page_dirty(page); 
hugetlb_count_sub(pages_per_huge_page(h), mm); - page_remove_rmap(page, true); + page_remove_rmap(page, vma, true); spin_unlock(ptl); tlb_remove_page_size(tlb, page, huge_page_size(h)); @@ -5259,7 +5259,7 @@ retry_avoidcopy: /* Break COW */ huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); - page_remove_rmap(old_page, true); + page_remove_rmap(old_page, vma, true); hugepage_add_new_anon_rmap(new_page, vma, haddr); set_huge_pte_at(mm, haddr, ptep, make_huge_pte(vma, new_page, 1)); diff --git a/mm/internal.h b/mm/internal.h index f235aa92e564..3d7dfc8bc471 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -395,12 +395,35 @@ extern long faultin_vma_page_range(struct vm_area_struct *vma, bool write, int *locked); extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, unsigned long len); - /* - * must be called with vma's mmap_lock held for read or write, and page locked. + * mlock_vma_page() and munlock_vma_page(): + * should be called with vma's mmap_lock held for read or write, + * under page table lock for the pte/pmd being added or removed. + * + * mlock is usually called at the end of page_add_*_rmap(), + * munlock at the end of page_remove_rmap(); but new anon + * pages are managed in lru_cache_add_inactive_or_unevictable(). + * + * @compound is used to include pmd mappings of THPs, but filter out + * pte mappings of THPs, which cannot be consistently counted: a pte + * mapping of the THP head cannot be distinguished by the page alone. 
*/ -extern void mlock_vma_page(struct page *page); -extern void munlock_vma_page(struct page *page); +void mlock_page(struct page *page); +static inline void mlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) +{ + if (unlikely(vma->vm_flags & VM_LOCKED) && + (compound || !PageTransCompound(page))) + mlock_page(page); +} +void munlock_page(struct page *page); +static inline void munlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) +{ + if (unlikely(vma->vm_flags & VM_LOCKED) && + (compound || !PageTransCompound(page))) + munlock_page(page); +} /* * Clear the page's PageMlocked(). This can be useful in a situation where @@ -487,7 +510,10 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } static inline void clear_page_mlock(struct page *page) { } -static inline void mlock_vma_page(struct page *page) { } +static inline void mlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) { } +static inline void munlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 131492fd1148..52add1825525 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -774,7 +774,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, */ spin_lock(ptl); ptep_clear(vma->vm_mm, address, _pte); - page_remove_rmap(src_page, false); + page_remove_rmap(src_page, vma, false); spin_unlock(ptl); free_page_and_swap_cache(src_page); } @@ -1513,7 +1513,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) if (pte_none(*pte)) continue; page = vm_normal_page(vma, addr, *pte); - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); } pte_unmap_unlock(start_pte, ptl); diff --git a/mm/ksm.c b/mm/ksm.c index 
c20bd4d9a0d9..c5a4403b5dc9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1177,7 +1177,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); if (!page_mapped(page)) try_to_free_swap(page); put_page(page); @@ -1252,16 +1252,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, err = replace_page(vma, page, kpage, orig_pte); } - if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { - munlock_vma_page(page); - if (!PageMlocked(kpage)) { - unlock_page(page); - lock_page(kpage); - mlock_vma_page(kpage); - page = kpage; /* for final unlock */ - } - } - out_unlock: unlock_page(page); out: diff --git a/mm/memory.c b/mm/memory.c index c125c4969913..53bd9e5f2e33 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -735,9 +735,6 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, set_pte_at(vma->vm_mm, address, ptep, pte); - if (vma->vm_flags & VM_LOCKED) - mlock_vma_page(page); - /* * No need to invalidate - it was non-present before. However * secondary CPUs may have mappings that need invalidating. 
@@ -1377,7 +1374,7 @@ again: mark_page_accessed(page); } rss[mm_counter(page)]--; - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); if (unlikely(__tlb_remove_page(tlb, page))) { @@ -1397,10 +1394,8 @@ again: continue; pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); rss[mm_counter(page)]--; - if (is_device_private_entry(entry)) - page_remove_rmap(page, false); - + page_remove_rmap(page, vma, false); put_page(page); continue; } @@ -1753,16 +1748,16 @@ static int validate_page_before_insert(struct page *page) return 0; } -static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte, +static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { if (!pte_none(*pte)) return -EBUSY; /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter_fast(mm, mm_counter_file(page)); - page_add_file_rmap(page, false); - set_pte_at(mm, addr, pte, mk_pte(page, prot)); + inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); + page_add_file_rmap(page, vma, false); + set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot)); return 0; } @@ -1776,7 +1771,6 @@ static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte, static int insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot) { - struct mm_struct *mm = vma->vm_mm; int retval; pte_t *pte; spinlock_t *ptl; @@ -1785,17 +1779,17 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, if (retval) goto out; retval = -ENOMEM; - pte = get_locked_pte(mm, addr, &ptl); + pte = get_locked_pte(vma->vm_mm, addr, &ptl); if (!pte) goto out; - retval = insert_page_into_pte_locked(mm, pte, addr, page, prot); + retval = insert_page_into_pte_locked(vma, pte, addr, page, prot); pte_unmap_unlock(pte, ptl); out: return retval; } #ifdef pte_index -static int 
insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte, +static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { int err; @@ -1805,7 +1799,7 @@ static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte, err = validate_page_before_insert(page); if (err) return err; - return insert_page_into_pte_locked(mm, pte, addr, page, prot); + return insert_page_into_pte_locked(vma, pte, addr, page, prot); } /* insert_pages() amortizes the cost of spinlock operations @@ -1842,7 +1836,7 @@ more: start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock); for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) { - int err = insert_page_in_batch_locked(mm, pte, + int err = insert_page_in_batch_locked(vma, pte, addr, pages[curr_page_idx], prot); if (unlikely(err)) { pte_unmap_unlock(start_pte, pte_lock); @@ -3098,7 +3092,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ - page_remove_rmap(old_page, false); + page_remove_rmap(old_page, vma, false); } /* Free the old page.. */ @@ -3118,16 +3112,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) */ mmu_notifier_invalidate_range_only_end(&range); if (old_page) { - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page. 
- */ - if (page_copied && (vma->vm_flags & VM_LOCKED)) { - lock_page(old_page); /* LRU manipulation */ - if (PageMlocked(old_page)) - munlock_vma_page(old_page); - unlock_page(old_page); - } if (page_copied) free_swap_cache(old_page); put_page(old_page); @@ -3947,7 +3931,8 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR); - page_add_file_rmap(page, true); + page_add_file_rmap(page, vma, true); + /* * deposit and withdraw with pmd lock held */ @@ -3996,7 +3981,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) lru_cache_add_inactive_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); - page_add_file_rmap(page, false); + page_add_file_rmap(page, vma, false); } set_pte_at(vma->vm_mm, addr, vmf->pte, entry); } diff --git a/mm/migrate.c b/mm/migrate.c index c7da064b4781..7c4223ce2500 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -248,14 +248,9 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (PageAnon(new)) page_add_anon_rmap(new, vma, pvmw.address, false); else - page_add_file_rmap(new, false); + page_add_file_rmap(new, vma, false); set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } - if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) - mlock_vma_page(new); - - if (PageTransHuge(page) && PageMlocked(page)) - clear_page_mlock(page); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, pvmw.address, pvmw.pte); @@ -2331,7 +2326,7 @@ again: * drop page refcount. Page won't be freed, as we took * a reference just above. 
*/ - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); put_page(page); if (pte_present(pte)) diff --git a/mm/mlock.c b/mm/mlock.c index 5d7ced8303be..92f28258b4ae 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -78,17 +78,13 @@ void clear_page_mlock(struct page *page) } } -/* - * Mark page as mlocked if not already. - * If page on LRU, isolate and putback to move to unevictable list. +/** + * mlock_page - mlock a page + * @page: page to be mlocked, either a normal page or a THP head. */ -void mlock_vma_page(struct page *page) +void mlock_page(struct page *page) { - /* Serialize with page migration */ - BUG_ON(!PageLocked(page)); - VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); if (!TestSetPageMlocked(page)) { int nr_pages = thp_nr_pages(page); @@ -101,14 +97,11 @@ void mlock_vma_page(struct page *page) } /** - * munlock_vma_page - munlock a vma page - * @page: page to be unlocked, either a normal page or THP page head + * munlock_page - munlock a page + * @page: page to be munlocked, either a normal page or a THP head. 
*/ -void munlock_vma_page(struct page *page) +void munlock_page(struct page *page) { - /* Serialize with page migration */ - BUG_ON(!PageLocked(page)); - VM_BUG_ON_PAGE(PageTail(page), page); if (TestClearPageMlocked(page)) { diff --git a/mm/rmap.c b/mm/rmap.c index 7ce7f1946cff..6cc8bf129f18 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1181,17 +1181,17 @@ void do_page_add_anon_rmap(struct page *page, __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); } - if (unlikely(PageKsm(page))) { + if (unlikely(PageKsm(page))) unlock_page_memcg(page); - return; - } /* address might be in next vma when migration races vma_adjust */ - if (first) + else if (first) __page_set_anon_rmap(page, vma, address, flags & RMAP_EXCLUSIVE); else __page_check_anon_rmap(page, vma, address); + + mlock_vma_page(page, vma, compound); } /** @@ -1232,12 +1232,14 @@ void page_add_new_anon_rmap(struct page *page, /** * page_add_file_rmap - add pte mapping to a file page - * @page: the page to add the mapping to - * @compound: charge the page as compound or small page + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @compound: charge the page as compound or small page * * The caller needs to hold the pte lock. 
*/ -void page_add_file_rmap(struct page *page, bool compound) +void page_add_file_rmap(struct page *page, + struct vm_area_struct *vma, bool compound) { int i, nr = 1; @@ -1260,13 +1262,8 @@ void page_add_file_rmap(struct page *page, bool compound) nr_pages); } else { if (PageTransCompound(page) && page_mapping(page)) { - struct page *head = compound_head(page); - VM_WARN_ON_ONCE(!PageLocked(page)); - - SetPageDoubleMap(head); - if (PageMlocked(page)) - clear_page_mlock(head); + SetPageDoubleMap(compound_head(page)); } if (!atomic_inc_and_test(&page->_mapcount)) goto out; @@ -1274,6 +1271,8 @@ void page_add_file_rmap(struct page *page, bool compound) __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); out: unlock_page_memcg(page); + + mlock_vma_page(page, vma, compound); } static void page_remove_file_rmap(struct page *page, bool compound) @@ -1368,11 +1367,13 @@ static void page_remove_anon_compound_rmap(struct page *page) /** * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from + * @vma: the vm area from which the mapping is removed * @compound: uncharge the page as compound or small page * * The caller needs to hold the pte lock. */ -void page_remove_rmap(struct page *page, bool compound) +void page_remove_rmap(struct page *page, + struct vm_area_struct *vma, bool compound) { lock_page_memcg(page); @@ -1414,6 +1415,8 @@ void page_remove_rmap(struct page *page, bool compound) */ out: unlock_page_memcg(page); + + munlock_vma_page(page, vma, compound); } /* @@ -1469,28 +1472,21 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { + /* Unexpected PMD-mapped THP? */ + VM_BUG_ON_PAGE(!pvmw.pte, page); + /* - * If the page is mlock()d, we cannot swap it out. + * If the page is in an mlock()d vma, we must not swap it out. 
*/ if (!(flags & TTU_IGNORE_MLOCK) && (vma->vm_flags & VM_LOCKED)) { - /* - * PTE-mapped THP are never marked as mlocked: so do - * not set it on a DoubleMap THP, nor on an Anon THP - * (which may still be PTE-mapped after DoubleMap was - * cleared). But stop unmapping even in those cases. - */ - if (!PageTransCompound(page) || (PageHead(page) && - !PageDoubleMap(page) && !PageAnon(page))) - mlock_vma_page(page); + /* Restore the mlock which got missed */ + mlock_vma_page(page, vma, false); page_vma_mapped_walk_done(&pvmw); ret = false; break; } - /* Unexpected PMD-mapped THP? */ - VM_BUG_ON_PAGE(!pvmw.pte, page); - subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); address = pvmw.address; @@ -1668,7 +1664,7 @@ discard: * * See Documentation/vm/mmu_notifier.rst */ - page_remove_rmap(subpage, PageHuge(page)); + page_remove_rmap(subpage, vma, PageHuge(page)); put_page(page); } @@ -1942,7 +1938,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, * * See Documentation/vm/mmu_notifier.rst */ - page_remove_rmap(subpage, PageHuge(page)); + page_remove_rmap(subpage, vma, PageHuge(page)); put_page(page); } @@ -2078,7 +2074,7 @@ static bool page_make_device_exclusive_one(struct page *page, * There is a reference on the page for the swap entry which has * been removed, so shouldn't take another. 
*/ - page_remove_rmap(subpage, false); + page_remove_rmap(subpage, vma, false); } mmu_notifier_invalidate_range_end(&range); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 0780c2a57ff1..15d3e97a6e04 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -95,10 +95,15 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, if (!pte_none(*dst_pte)) goto out_unlock; - if (page_in_cache) - page_add_file_rmap(page, false); - else + if (page_in_cache) { + /* Usually, cache pages are already added to LRU */ + if (newly_allocated) + lru_cache_add(page); + page_add_file_rmap(page, dst_vma, false); + } else { page_add_new_anon_rmap(page, dst_vma, dst_addr, false); + lru_cache_add_inactive_or_unevictable(page, dst_vma); + } /* * Must happen after rmap, as mm_counter() checks mapping (via @@ -106,9 +111,6 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, */ inc_mm_counter(dst_mm, mm_counter(page)); - if (newly_allocated) - lru_cache_add_inactive_or_unevictable(page, dst_vma); - set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); /* No need to invalidate - it was non-present before */ -- cgit v1.2.3 From 75134f16e7dd0007aa474b281935c5f42e79f2c8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 Feb 2022 10:19:02 -0800 Subject: bpf: Add schedule points in batch ops syzbot reported various soft lockups caused by bpf batch operations. INFO: task kworker/1:1:27 blocked for more than 140 seconds. INFO: task hung in rcu_barrier Nothing prevents batch ops from processing a huge amount of data, so we need to add schedule points in them. Note that the maybe_wait_bpf_programs(map) calls in generic_map_delete_batch() can be factored out by moving the call after the loop. This will be done later in the -next tree once this fix is merged, unless there is a strong opinion in favor of doing this optimization sooner.
Fixes: aa2e93b8e58e ("bpf: Add generic support for update and delete batch ops") Fixes: cb4d03ab499d ("bpf: Add generic support for lookup batch op") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Reviewed-by: Stanislav Fomichev Acked-by: Brian Vazquez Link: https://lore.kernel.org/bpf/20220217181902.808742-1-eric.dumazet@gmail.com --- kernel/bpf/syscall.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index fa4505f9b611..ca70fe6fba38 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1355,6 +1355,7 @@ int generic_map_delete_batch(struct bpf_map *map, maybe_wait_bpf_programs(map); if (err) break; + cond_resched(); } if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) err = -EFAULT; @@ -1412,6 +1413,7 @@ int generic_map_update_batch(struct bpf_map *map, if (err) break; + cond_resched(); } if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) @@ -1509,6 +1511,7 @@ int generic_map_lookup_batch(struct bpf_map *map, swap(prev_key, key); retry = MAP_LOOKUP_RETRIES; cp++; + cond_resched(); } if (err == -EFAULT) -- cgit v1.2.3 From d24d2a2b0a81dd5e9bb99aeb4559ec9734e1416f Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 17 Feb 2022 10:30:01 -0800 Subject: bpf: bpf_prog_pack: Set proper size before freeing ro_header bpf_prog_pack_free() uses header->size to decide whether the header should be freed with module_memfree() or the bpf_prog_pack logic. However, in kvmalloc() failure path of bpf_jit_binary_pack_alloc(), header->size is not set yet. As a result, bpf_prog_pack_free() may treat a slice of a pack as a standalone kvmalloc'd header and call module_memfree() on the whole pack. This in turn causes use-after-free by other users of the pack. Fix this by setting ro_header->size before freeing ro_header. 
Fixes: 33c9805860e5 ("bpf: Introduce bpf_jit_binary_pack_[alloc|finalize|free]") Reported-by: syzbot+2f649ec6d2eea1495a8f@syzkaller.appspotmail.com Reported-by: syzbot+ecb1e7e51c52f68f7481@syzkaller.appspotmail.com Reported-by: syzbot+87f65c75f4a72db05445@syzkaller.appspotmail.com Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220217183001.1876034-1-song@kernel.org --- kernel/bpf/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 44623c9b5bb1..ebb0193d07f0 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1069,6 +1069,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, *rw_header = kvmalloc(size, GFP_KERNEL); if (!*rw_header) { + bpf_arch_text_copy(&ro_header->size, &size, sizeof(size)); bpf_prog_pack_free(ro_header); bpf_jit_uncharge_modmem(size); return NULL; -- cgit v1.2.3 From 5be2226f417d5b06d17e6c52d6e341cf43c29e48 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 15 Feb 2022 13:07:10 -0500 Subject: KVM: x86: allow defining return-0 static calls A few vendor callbacks are only used by VMX, but they return an integer or bool value. Introduce KVM_X86_OP_OPTIONAL_RET0 for them: if a func is NULL in struct kvm_x86_ops, it will be changed to __static_call_return0 when updating static calls. 
Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 15 +++++++++------ arch/x86/include/asm/kvm_host.h | 4 ++++ arch/x86/kvm/svm/avic.c | 5 ----- arch/x86/kvm/svm/svm.c | 20 -------------------- arch/x86/kvm/x86.c | 4 ++-- kernel/static_call.c | 1 + 6 files changed, 16 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index c0ec066a8599..29affccb353c 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -10,7 +10,9 @@ BUILD_BUG_ON(1) * * KVM_X86_OP_OPTIONAL() can be used for those functions that can have * a NULL definition, for example if "static_call_cond()" will be used - * at the call sites. + * at the call sites. KVM_X86_OP_OPTIONAL_RET0() can be used likewise + * to make a definition optional, but in this case the default will + * be __static_call_return0. */ KVM_X86_OP(hardware_enable) KVM_X86_OP(hardware_disable) @@ -77,15 +79,15 @@ KVM_X86_OP(check_apicv_inhibit_reasons) KVM_X86_OP(refresh_apicv_exec_ctrl) KVM_X86_OP_OPTIONAL(hwapic_irr_update) KVM_X86_OP_OPTIONAL(hwapic_isr_update) -KVM_X86_OP_OPTIONAL(guest_apic_has_interrupt) +KVM_X86_OP_OPTIONAL_RET0(guest_apic_has_interrupt) KVM_X86_OP_OPTIONAL(load_eoi_exitmap) KVM_X86_OP_OPTIONAL(set_virtual_apic_mode) KVM_X86_OP_OPTIONAL(set_apic_access_page_addr) KVM_X86_OP(deliver_interrupt) KVM_X86_OP_OPTIONAL(sync_pir_to_irr) -KVM_X86_OP(set_tss_addr) -KVM_X86_OP(set_identity_map_addr) -KVM_X86_OP(get_mt_mask) +KVM_X86_OP_OPTIONAL_RET0(set_tss_addr) +KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr) +KVM_X86_OP_OPTIONAL_RET0(get_mt_mask) KVM_X86_OP(load_mmu_pgd) KVM_X86_OP(has_wbinvd_exit) KVM_X86_OP(get_l2_tsc_offset) @@ -103,7 +105,7 @@ KVM_X86_OP_OPTIONAL(vcpu_unblocking) KVM_X86_OP_OPTIONAL(pi_update_irte) KVM_X86_OP_OPTIONAL(pi_start_assignment) KVM_X86_OP_OPTIONAL(apicv_post_state_restore) 
-KVM_X86_OP_OPTIONAL(dy_apicv_has_pending_interrupt) +KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt) KVM_X86_OP_OPTIONAL(set_hv_timer) KVM_X86_OP_OPTIONAL(cancel_hv_timer) KVM_X86_OP(setup_mce) @@ -127,3 +129,4 @@ KVM_X86_OP(vcpu_deliver_sipi_vector) #undef KVM_X86_OP #undef KVM_X86_OP_OPTIONAL +#undef KVM_X86_OP_OPTIONAL_RET0 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a7e82fc1f1f3..8e512f25a930 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1541,6 +1541,7 @@ extern struct kvm_x86_ops kvm_x86_ops; #define KVM_X86_OP(func) \ DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func)); #define KVM_X86_OP_OPTIONAL KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP #include static inline void kvm_ops_static_call_update(void) @@ -1550,6 +1551,9 @@ static inline void kvm_ops_static_call_update(void) #define KVM_X86_OP(func) \ WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) #define KVM_X86_OP_OPTIONAL __KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0(func) \ + static_call_update(kvm_x86_##func, kvm_x86_ops.func ? 
: \ + (void *) __static_call_return0); #include #undef __KVM_X86_OP } diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 4245cb99b497..d4fa8c4f3a9a 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -650,11 +650,6 @@ void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) avic_set_pi_irte_mode(vcpu, activated); } -bool avic_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) -{ - return false; -} - static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) { unsigned long flags; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 3daca34020fa..7038c76fa841 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3528,16 +3528,6 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); } -static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) -{ - return 0; -} - -static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) -{ - return 0; -} - static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3934,11 +3924,6 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) return true; } -static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) -{ - return 0; -} - static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4593,10 +4578,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons, .apicv_post_state_restore = avic_apicv_post_state_restore, - .set_tss_addr = svm_set_tss_addr, - .set_identity_map_addr = svm_set_identity_map_addr, - .get_mt_mask = svm_get_mt_mask, - .get_exit_info = svm_get_exit_info, .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid, @@ -4621,7 +4602,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .nested_ops = &svm_nested_ops, .deliver_interrupt = svm_deliver_interrupt, - 
.dy_apicv_has_pending_interrupt = avic_dy_apicv_has_pending_interrupt, .pi_update_irte = avic_pi_update_irte, .setup_mce = svm_setup_mce, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ab1c4778824a..16d29d41908f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -131,6 +131,7 @@ struct kvm_x86_ops kvm_x86_ops __read_mostly; DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \ *(((struct kvm_x86_ops *)0)->func)); #define KVM_X86_OP_OPTIONAL KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP #include EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits); EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg); @@ -12016,8 +12017,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) { return (is_guest_mode(vcpu) && - kvm_x86_ops.guest_apic_has_interrupt && - static_call(kvm_x86_guest_apic_has_interrupt)(vcpu)); + static_call(kvm_x86_guest_apic_has_interrupt)(vcpu)); } static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) diff --git a/kernel/static_call.c b/kernel/static_call.c index 43ba0b1e0edb..f2b8baea35d2 100644 --- a/kernel/static_call.c +++ b/kernel/static_call.c @@ -503,6 +503,7 @@ long __static_call_return0(void) { return 0; } +EXPORT_SYMBOL_GPL(__static_call_return0); #ifdef CONFIG_STATIC_CALL_SELFTEST -- cgit v1.2.3 From 9087c6ff8dfe0a070e4e05a434399080603c29de Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 18 Feb 2022 10:18:01 -0800 Subject: bpf: Call maybe_wait_bpf_programs() only once from generic_map_delete_batch() As stated in the comment found in maybe_wait_bpf_programs(), the synchronize_rcu() barrier is only needed before returning to userspace, not after each deletion in the batch. 
Signed-off-by: Eric Dumazet Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20220218181801.2971275-1-eric.dumazet@gmail.com --- kernel/bpf/syscall.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a72f63d5a7da..9c7a72b65eee 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1352,7 +1352,6 @@ int generic_map_delete_batch(struct bpf_map *map, err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); bpf_enable_instrumentation(); - maybe_wait_bpf_programs(map); if (err) break; cond_resched(); @@ -1361,6 +1360,8 @@ int generic_map_delete_batch(struct bpf_map *map, err = -EFAULT; kvfree(key); + + maybe_wait_bpf_programs(map); return err; } -- cgit v1.2.3 From b1e8206582f9d680cff7d04828708c8b6ab32957 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Feb 2022 10:16:57 +0100 Subject: sched: Fix yet more sched_fork() races Where commit 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group") fixed a fork race vs cgroup, it opened up a race vs syscalls by not placing the task on the runqueue before it gets exposed through the pidhash. Commit 13765de8148f ("sched/fair: Fix fault in reweight_entity") tries to fix a single instance of this; instead, fix the whole class of issues, effectively reverting that commit.
Fixes: 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group") Reported-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Tested-by: Tadeusz Struk Tested-by: Zhang Qiao Tested-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/YgoeCbwj5mbCR0qA@hirez.programming.kicks-ass.net --- include/linux/sched/task.h | 4 ++-- kernel/fork.c | 13 ++++++++++++- kernel/sched/core.c | 34 +++++++++++++++++++++------------- 3 files changed, 35 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index b9198a1b3a84..e84e54d1b490 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -54,8 +54,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); -extern void sched_post_fork(struct task_struct *p, - struct kernel_clone_args *kargs); +extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); +extern void sched_post_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void); diff --git a/kernel/fork.c b/kernel/fork.c index d75a528f7b21..c607d238fc23 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2266,6 +2266,17 @@ static __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_put_pidfd; + /* + * Now that the cgroups are pinned, re-clone the parent cgroup and put + * the new task on the correct runqueue. All this *before* the task + * becomes visible. + * + * This isn't part of ->can_fork() because while the re-cloning is + * cgroup specific, it unconditionally needs to place the task on a + * runqueue. + */ + sched_cgroup_fork(p, args); + /* * From this point on we must avoid any synchronous user-space * communication until we take the tasklist-lock. 
In particular, we do @@ -2376,7 +2387,7 @@ static __latent_entropy struct task_struct *copy_process( write_unlock_irq(&tasklist_lock); proc_fork_connector(p); - sched_post_fork(p, args); + sched_post_fork(p); cgroup_post_fork(p, args); perf_event_fork(p); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fcf0c180617c..9745613d531c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1214,9 +1214,8 @@ int tg_nop(struct task_group *tg, void *data) } #endif -static void set_load_weight(struct task_struct *p) +static void set_load_weight(struct task_struct *p, bool update_load) { - bool update_load = !(READ_ONCE(p->__state) & TASK_NEW); int prio = p->static_prio - MAX_RT_PRIO; struct load_weight *load = &p->se.load; @@ -4407,7 +4406,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->static_prio = NICE_TO_PRIO(0); p->prio = p->normal_prio = p->static_prio; - set_load_weight(p); + set_load_weight(p, false); /* * We don't need the reset flag anymore after the fork. It has @@ -4425,6 +4424,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) init_entity_runnable_average(&p->se); + #ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -4440,18 +4440,23 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } -void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) +void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) { unsigned long flags; -#ifdef CONFIG_CGROUP_SCHED - struct task_group *tg; -#endif + /* + * Because we're not yet on the pid-hash, p->pi_lock isn't strictly + * required yet, but lockdep gets upset if rules are violated. 
+ */ raw_spin_lock_irqsave(&p->pi_lock, flags); #ifdef CONFIG_CGROUP_SCHED - tg = container_of(kargs->cset->subsys[cpu_cgrp_id], - struct task_group, css); - p->sched_task_group = autogroup_task_group(p, tg); + if (1) { + struct task_group *tg; + tg = container_of(kargs->cset->subsys[cpu_cgrp_id], + struct task_group, css); + tg = autogroup_task_group(p, tg); + p->sched_task_group = tg; + } #endif rseq_migrate(p); /* @@ -4462,7 +4467,10 @@ void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) if (p->sched_class->task_fork) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); +} +void sched_post_fork(struct task_struct *p) +{ uclamp_post_fork(p); } @@ -6922,7 +6930,7 @@ void set_user_nice(struct task_struct *p, long nice) put_prev_task(rq, p); p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p); + set_load_weight(p, true); old_prio = p->prio; p->prio = effective_prio(p); @@ -7213,7 +7221,7 @@ static void __setscheduler_params(struct task_struct *p, */ p->rt_priority = attr->sched_priority; p->normal_prio = normal_prio(p); - set_load_weight(p); + set_load_weight(p, true); } /* @@ -9446,7 +9454,7 @@ void __init sched_init(void) #endif } - set_load_weight(&init_task); + set_load_weight(&init_task, false); /* * The boot idle thread does lazy MMU switching as well: -- cgit v1.2.3 From 4c7485584d48f60b1e742c7c6a3a1fa503d48d97 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 14 Feb 2022 16:52:10 +0000 Subject: sched/preempt: Move PREEMPT_DYNAMIC logic later The PREEMPT_DYNAMIC logic in kernel/sched/core.c patches static calls for a bunch of preemption functions. While most are defined prior to this, the definition of cond_resched() is later in the file, and so we only have its declarations from include/linux/sched.h. In subsequent patches we'd like to define some macros alongside the definition of each of the preemption functions, which we can use within sched_dynamic_update(). 
For this to be possible, the PREEMPT_DYNAMIC logic needs to be placed after the various preemption functions. As a preparatory step, this patch moves the PREEMPT_DYNAMIC logic after the various preemption functions, with no other changes -- this is purely a move. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Acked-by: Ard Biesheuvel Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20220214165216.2231574-2-mark.rutland@arm.com --- kernel/sched/core.c | 272 ++++++++++++++++++++++++++-------------------------- 1 file changed, 136 insertions(+), 136 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1e08b02e0cd5..a123ffa8e21c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6555,142 +6555,6 @@ EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); #endif /* CONFIG_PREEMPTION */ -#ifdef CONFIG_PREEMPT_DYNAMIC - -#include - -/* - * SC:cond_resched - * SC:might_resched - * SC:preempt_schedule - * SC:preempt_schedule_notrace - * SC:irqentry_exit_cond_resched - * - * - * NONE: - * cond_resched <- __cond_resched - * might_resched <- RET0 - * preempt_schedule <- NOP - * preempt_schedule_notrace <- NOP - * irqentry_exit_cond_resched <- NOP - * - * VOLUNTARY: - * cond_resched <- __cond_resched - * might_resched <- __cond_resched - * preempt_schedule <- NOP - * preempt_schedule_notrace <- NOP - * irqentry_exit_cond_resched <- NOP - * - * FULL: - * cond_resched <- RET0 - * might_resched <- RET0 - * preempt_schedule <- preempt_schedule - * preempt_schedule_notrace <- preempt_schedule_notrace - * irqentry_exit_cond_resched <- irqentry_exit_cond_resched - */ - -enum { - preempt_dynamic_undefined = -1, - preempt_dynamic_none, - preempt_dynamic_voluntary, - preempt_dynamic_full, -}; - -int preempt_dynamic_mode = preempt_dynamic_undefined; - -int sched_dynamic_mode(const char *str) -{ - if (!strcmp(str, "none")) - return 
preempt_dynamic_none; - - if (!strcmp(str, "voluntary")) - return preempt_dynamic_voluntary; - - if (!strcmp(str, "full")) - return preempt_dynamic_full; - - return -EINVAL; -} - -void sched_dynamic_update(int mode) -{ - /* - * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in - * the ZERO state, which is invalid. - */ - static_call_update(cond_resched, __cond_resched); - static_call_update(might_resched, __cond_resched); - static_call_update(preempt_schedule, __preempt_schedule_func); - static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func); - static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched); - - switch (mode) { - case preempt_dynamic_none: - static_call_update(cond_resched, __cond_resched); - static_call_update(might_resched, (void *)&__static_call_return0); - static_call_update(preempt_schedule, NULL); - static_call_update(preempt_schedule_notrace, NULL); - static_call_update(irqentry_exit_cond_resched, NULL); - pr_info("Dynamic Preempt: none\n"); - break; - - case preempt_dynamic_voluntary: - static_call_update(cond_resched, __cond_resched); - static_call_update(might_resched, __cond_resched); - static_call_update(preempt_schedule, NULL); - static_call_update(preempt_schedule_notrace, NULL); - static_call_update(irqentry_exit_cond_resched, NULL); - pr_info("Dynamic Preempt: voluntary\n"); - break; - - case preempt_dynamic_full: - static_call_update(cond_resched, (void *)&__static_call_return0); - static_call_update(might_resched, (void *)&__static_call_return0); - static_call_update(preempt_schedule, __preempt_schedule_func); - static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func); - static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched); - pr_info("Dynamic Preempt: full\n"); - break; - } - - preempt_dynamic_mode = mode; -} - -static int __init setup_preempt_mode(char *str) -{ - int mode = sched_dynamic_mode(str); - if (mode < 0) { - pr_warn("Dynamic 
Preempt: unsupported mode: %s\n", str); - return 0; - } - - sched_dynamic_update(mode); - return 1; -} -__setup("preempt=", setup_preempt_mode); - -static void __init preempt_dynamic_init(void) -{ - if (preempt_dynamic_mode == preempt_dynamic_undefined) { - if (IS_ENABLED(CONFIG_PREEMPT_NONE)) { - sched_dynamic_update(preempt_dynamic_none); - } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { - sched_dynamic_update(preempt_dynamic_voluntary); - } else { - /* Default static call setting, nothing to do */ - WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); - preempt_dynamic_mode = preempt_dynamic_full; - pr_info("Dynamic Preempt: full\n"); - } - } -} - -#else /* !CONFIG_PREEMPT_DYNAMIC */ - -static inline void preempt_dynamic_init(void) { } - -#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */ - /* * This is the entry point to schedule() from kernel preemption * off of irq context. @@ -8271,6 +8135,142 @@ int __cond_resched_rwlock_write(rwlock_t *lock) } EXPORT_SYMBOL(__cond_resched_rwlock_write); +#ifdef CONFIG_PREEMPT_DYNAMIC + +#include + +/* + * SC:cond_resched + * SC:might_resched + * SC:preempt_schedule + * SC:preempt_schedule_notrace + * SC:irqentry_exit_cond_resched + * + * + * NONE: + * cond_resched <- __cond_resched + * might_resched <- RET0 + * preempt_schedule <- NOP + * preempt_schedule_notrace <- NOP + * irqentry_exit_cond_resched <- NOP + * + * VOLUNTARY: + * cond_resched <- __cond_resched + * might_resched <- __cond_resched + * preempt_schedule <- NOP + * preempt_schedule_notrace <- NOP + * irqentry_exit_cond_resched <- NOP + * + * FULL: + * cond_resched <- RET0 + * might_resched <- RET0 + * preempt_schedule <- preempt_schedule + * preempt_schedule_notrace <- preempt_schedule_notrace + * irqentry_exit_cond_resched <- irqentry_exit_cond_resched + */ + +enum { + preempt_dynamic_undefined = -1, + preempt_dynamic_none, + preempt_dynamic_voluntary, + preempt_dynamic_full, +}; + +int preempt_dynamic_mode = preempt_dynamic_undefined; + +int sched_dynamic_mode(const char 
*str) +{ + if (!strcmp(str, "none")) + return preempt_dynamic_none; + + if (!strcmp(str, "voluntary")) + return preempt_dynamic_voluntary; + + if (!strcmp(str, "full")) + return preempt_dynamic_full; + + return -EINVAL; +} + +void sched_dynamic_update(int mode) +{ + /* + * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in + * the ZERO state, which is invalid. + */ + static_call_update(cond_resched, __cond_resched); + static_call_update(might_resched, __cond_resched); + static_call_update(preempt_schedule, __preempt_schedule_func); + static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func); + static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched); + + switch (mode) { + case preempt_dynamic_none: + static_call_update(cond_resched, __cond_resched); + static_call_update(might_resched, (void *)&__static_call_return0); + static_call_update(preempt_schedule, NULL); + static_call_update(preempt_schedule_notrace, NULL); + static_call_update(irqentry_exit_cond_resched, NULL); + pr_info("Dynamic Preempt: none\n"); + break; + + case preempt_dynamic_voluntary: + static_call_update(cond_resched, __cond_resched); + static_call_update(might_resched, __cond_resched); + static_call_update(preempt_schedule, NULL); + static_call_update(preempt_schedule_notrace, NULL); + static_call_update(irqentry_exit_cond_resched, NULL); + pr_info("Dynamic Preempt: voluntary\n"); + break; + + case preempt_dynamic_full: + static_call_update(cond_resched, (void *)&__static_call_return0); + static_call_update(might_resched, (void *)&__static_call_return0); + static_call_update(preempt_schedule, __preempt_schedule_func); + static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func); + static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched); + pr_info("Dynamic Preempt: full\n"); + break; + } + + preempt_dynamic_mode = mode; +} + +static int __init setup_preempt_mode(char *str) +{ + int mode = 
sched_dynamic_mode(str); + if (mode < 0) { + pr_warn("Dynamic Preempt: unsupported mode: %s\n", str); + return 0; + } + + sched_dynamic_update(mode); + return 1; +} +__setup("preempt=", setup_preempt_mode); + +static void __init preempt_dynamic_init(void) +{ + if (preempt_dynamic_mode == preempt_dynamic_undefined) { + if (IS_ENABLED(CONFIG_PREEMPT_NONE)) { + sched_dynamic_update(preempt_dynamic_none); + } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { + sched_dynamic_update(preempt_dynamic_voluntary); + } else { + /* Default static call setting, nothing to do */ + WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); + preempt_dynamic_mode = preempt_dynamic_full; + pr_info("Dynamic Preempt: full\n"); + } + } +} + +#else /* !CONFIG_PREEMPT_DYNAMIC */ + +static inline void preempt_dynamic_init(void) { } + +#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */ + /** * yield - yield the current processor to other threads. * -- cgit v1.2.3 From 8a69fe0be143b0a1af829f85f0e9a1ae7d6a04db Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 14 Feb 2022 16:52:11 +0000 Subject: sched/preempt: Refactor sched_dynamic_update() Currently sched_dynamic_update needs to open-code the enabled/disabled function names for each preemption model it supports, when in practice this is a boolean enabled/disabled state for each function. Make this clearer and avoid repetition by defining the enabled/disabled states at the function definition, and using helper macros to perform the static_call_update(). Where x86 currently overrides the enabled function, it is made to provide both the enabled and disabled states for consistency, with defaults provided by the core code otherwise. In subsequent patches this will allow us to support PREEMPT_DYNAMIC without static calls. There should be no functional change as a result of this patch. 
Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Acked-by: Ard Biesheuvel Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20220214165216.2231574-3-mark.rutland@arm.com --- arch/x86/include/asm/preempt.h | 10 ++++--- include/linux/entry-common.h | 2 ++ kernel/sched/core.c | 59 ++++++++++++++++++++++++++---------------- 3 files changed, 45 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index fe5efbcba824..5f6daea1ee24 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -108,16 +108,18 @@ static __always_inline bool should_resched(int preempt_offset) extern asmlinkage void preempt_schedule(void); extern asmlinkage void preempt_schedule_thunk(void); -#define __preempt_schedule_func preempt_schedule_thunk +#define preempt_schedule_dynamic_enabled preempt_schedule_thunk +#define preempt_schedule_dynamic_disabled NULL extern asmlinkage void preempt_schedule_notrace(void); extern asmlinkage void preempt_schedule_notrace_thunk(void); -#define __preempt_schedule_notrace_func preempt_schedule_notrace_thunk +#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace_thunk +#define preempt_schedule_notrace_dynamic_disabled NULL #ifdef CONFIG_PREEMPT_DYNAMIC -DECLARE_STATIC_CALL(preempt_schedule, __preempt_schedule_func); +DECLARE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled); #define __preempt_schedule() \ do { \ @@ -125,7 +127,7 @@ do { \ asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule) : ASM_CALL_CONSTRAINT); \ } while (0) -DECLARE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func); +DECLARE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled); #define __preempt_schedule_notrace() \ do { \ diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 2e2b8d6140ed..a01ac1a0a292 100644 --- a/include/linux/entry-common.h +++ 
b/include/linux/entry-common.h @@ -456,6 +456,8 @@ irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs); */ void irqentry_exit_cond_resched(void); #ifdef CONFIG_PREEMPT_DYNAMIC +#define irqentry_exit_cond_resched_dynamic_enabled irqentry_exit_cond_resched +#define irqentry_exit_cond_resched_dynamic_disabled NULL DECLARE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched); #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a123ffa8e21c..bf3a97f48c1d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6491,7 +6491,11 @@ NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); #ifdef CONFIG_PREEMPT_DYNAMIC -DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func); +#ifndef preempt_schedule_dynamic_enabled +#define preempt_schedule_dynamic_enabled preempt_schedule +#define preempt_schedule_dynamic_disabled NULL +#endif +DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled); EXPORT_STATIC_CALL_TRAMP(preempt_schedule); #endif @@ -6549,7 +6553,11 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) EXPORT_SYMBOL_GPL(preempt_schedule_notrace); #ifdef CONFIG_PREEMPT_DYNAMIC -DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func); +#ifndef preempt_schedule_notrace_dynamic_enabled +#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace +#define preempt_schedule_notrace_dynamic_disabled NULL +#endif +DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled); EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); #endif @@ -8060,9 +8068,13 @@ EXPORT_SYMBOL(__cond_resched); #endif #ifdef CONFIG_PREEMPT_DYNAMIC +#define cond_resched_dynamic_enabled __cond_resched +#define cond_resched_dynamic_disabled ((void *)&__static_call_return0) DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched); EXPORT_STATIC_CALL_TRAMP(cond_resched); +#define might_resched_dynamic_enabled __cond_resched +#define 
might_resched_dynamic_disabled ((void *)&__static_call_return0) DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched); EXPORT_STATIC_CALL_TRAMP(might_resched); #endif @@ -8192,43 +8204,46 @@ int sched_dynamic_mode(const char *str) return -EINVAL; } +#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) +#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) + void sched_dynamic_update(int mode) { /* * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in * the ZERO state, which is invalid. */ - static_call_update(cond_resched, __cond_resched); - static_call_update(might_resched, __cond_resched); - static_call_update(preempt_schedule, __preempt_schedule_func); - static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func); - static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched); + preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(might_resched); + preempt_dynamic_enable(preempt_schedule); + preempt_dynamic_enable(preempt_schedule_notrace); + preempt_dynamic_enable(irqentry_exit_cond_resched); switch (mode) { case preempt_dynamic_none: - static_call_update(cond_resched, __cond_resched); - static_call_update(might_resched, (void *)&__static_call_return0); - static_call_update(preempt_schedule, NULL); - static_call_update(preempt_schedule_notrace, NULL); - static_call_update(irqentry_exit_cond_resched, NULL); + preempt_dynamic_enable(cond_resched); + preempt_dynamic_disable(might_resched); + preempt_dynamic_disable(preempt_schedule); + preempt_dynamic_disable(preempt_schedule_notrace); + preempt_dynamic_disable(irqentry_exit_cond_resched); pr_info("Dynamic Preempt: none\n"); break; case preempt_dynamic_voluntary: - static_call_update(cond_resched, __cond_resched); - static_call_update(might_resched, __cond_resched); - static_call_update(preempt_schedule, NULL); - static_call_update(preempt_schedule_notrace, NULL); - 
static_call_update(irqentry_exit_cond_resched, NULL); + preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(might_resched); + preempt_dynamic_disable(preempt_schedule); + preempt_dynamic_disable(preempt_schedule_notrace); + preempt_dynamic_disable(irqentry_exit_cond_resched); pr_info("Dynamic Preempt: voluntary\n"); break; case preempt_dynamic_full: - static_call_update(cond_resched, (void *)&__static_call_return0); - static_call_update(might_resched, (void *)&__static_call_return0); - static_call_update(preempt_schedule, __preempt_schedule_func); - static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func); - static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched); + preempt_dynamic_disable(cond_resched); + preempt_dynamic_disable(might_resched); + preempt_dynamic_enable(preempt_schedule); + preempt_dynamic_enable(preempt_schedule_notrace); + preempt_dynamic_enable(irqentry_exit_cond_resched); pr_info("Dynamic Preempt: full\n"); break; } -- cgit v1.2.3 From 4624a14f4daa8ab4578d274555fd8847254ce339 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 14 Feb 2022 16:52:12 +0000 Subject: sched/preempt: Simplify irqentry_exit_cond_resched() callers Currently callers of irqentry_exit_cond_resched() need to be aware of whether the function should be indirected via a static call, leading to ugly ifdeffery in callers. Save them the hassle with a static inline wrapper that does the right thing. The raw_irqentry_exit_cond_resched() will also be useful in subsequent patches which will add conditional wrappers for preemption functions. Note: in arch/x86/entry/common.c, xen_pv_evtchn_do_upcall() always calls irqentry_exit_cond_resched() directly, even when PREEMPT_DYNAMIC is in use. I believe this is a latent bug (which this patch corrects), but I'm not entirely certain this wasn't deliberate. 
Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Acked-by: Ard Biesheuvel Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20220214165216.2231574-4-mark.rutland@arm.com --- include/linux/entry-common.h | 9 ++++++--- kernel/entry/common.c | 12 ++++-------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index a01ac1a0a292..dfd84c59b144 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -454,11 +454,14 @@ irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs); * * Conditional reschedule with additional sanity checks. */ -void irqentry_exit_cond_resched(void); +void raw_irqentry_exit_cond_resched(void); #ifdef CONFIG_PREEMPT_DYNAMIC -#define irqentry_exit_cond_resched_dynamic_enabled irqentry_exit_cond_resched +#define irqentry_exit_cond_resched_dynamic_enabled raw_irqentry_exit_cond_resched #define irqentry_exit_cond_resched_dynamic_disabled NULL -DECLARE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched); +DECLARE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched); +#define irqentry_exit_cond_resched() static_call(irqentry_exit_cond_resched)() +#else +#define irqentry_exit_cond_resched() raw_irqentry_exit_cond_resched() #endif /** diff --git a/kernel/entry/common.c b/kernel/entry/common.c index bad713684c2e..1739ca79613b 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -380,7 +380,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) return ret; } -void irqentry_exit_cond_resched(void) +void raw_irqentry_exit_cond_resched(void) { if (!preempt_count()) { /* Sanity check RCU and thread stack */ @@ -392,7 +392,7 @@ void irqentry_exit_cond_resched(void) } } #ifdef CONFIG_PREEMPT_DYNAMIC -DEFINE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched); +DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched); 
#endif noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) @@ -420,13 +420,9 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) } instrumentation_begin(); - if (IS_ENABLED(CONFIG_PREEMPTION)) { -#ifdef CONFIG_PREEMPT_DYNAMIC - static_call(irqentry_exit_cond_resched)(); -#else + if (IS_ENABLED(CONFIG_PREEMPTION)) irqentry_exit_cond_resched(); -#endif - } + /* Covers both tracing and lockdep */ trace_hardirqs_on(); instrumentation_end(); -- cgit v1.2.3 From 33c64734be3461222a8aa27d3dadc477ebca62de Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 14 Feb 2022 16:52:13 +0000 Subject: sched/preempt: Decouple HAVE_PREEMPT_DYNAMIC from GENERIC_ENTRY Now that the enabled/disabled states for the preemption functions are declared alongside their definitions, the core PREEMPT_DYNAMIC logic is no longer tied to GENERIC_ENTRY, and can safely be selected so long as an architecture provides enabled/disabled states for irqentry_exit_cond_resched(). Make it possible to select HAVE_PREEMPT_DYNAMIC without GENERIC_ENTRY. For existing users of HAVE_PREEMPT_DYNAMIC there should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Acked-by: Ard Biesheuvel Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20220214165216.2231574-5-mark.rutland@arm.com --- arch/Kconfig | 1 - kernel/sched/core.c | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 678a80713b21..601691f1570f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1279,7 +1279,6 @@ config HAVE_STATIC_CALL_INLINE config HAVE_PREEMPT_DYNAMIC bool depends on HAVE_STATIC_CALL - depends on GENERIC_ENTRY help Select this if the architecture support boot time preempt setting on top of static calls. 
It is strongly advised to support inline diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bf3a97f48c1d..300c0454a2b8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8149,7 +8149,9 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); #ifdef CONFIG_PREEMPT_DYNAMIC +#ifdef CONFIG_GENERIC_ENTRY #include +#endif /* * SC:cond_resched -- cgit v1.2.3 From 99cf983cc8bca4adb461b519664c939a565cfd4d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 14 Feb 2022 16:52:14 +0000 Subject: sched/preempt: Add PREEMPT_DYNAMIC using static keys Where an architecture selects HAVE_STATIC_CALL but not HAVE_STATIC_CALL_INLINE, each static call has an out-of-line trampoline which will either branch to a callee or return to the caller. On such architectures, a number of constraints can conspire to make those trampolines more complicated and potentially less useful than we'd like. For example: * Hardware and software control flow integrity schemes can require the addition of "landing pad" instructions (e.g. `BTI` for arm64), which will also be present at the "real" callee. * Limited branch ranges can require that trampolines generate or load an address into a register and perform an indirect branch (or at least have a slow path that does so). This loses some of the benefits of having a direct branch. * Interaction with SW CFI schemes can be complicated and fragile, e.g. requiring that we can recognise idiomatic codegen and remove indirections we understand, at least until clang provides more helpful mechanisms for dealing with this. For PREEMPT_DYNAMIC, we don't need the full power of static calls, as we really only need to enable/disable specific preemption functions. We can achieve the same effect without a number of the pain points above by using static keys to fold early returns into the preemption functions themselves rather than in an out-of-line trampoline, effectively inlining the trampoline into the start of the function.
For arm64, this results in good code generation. For example, the dynamic_cond_resched() wrapper looks as follows when enabled. When disabled, the first `B` is replaced with a `NOP`, resulting in an early return. | <dynamic_cond_resched>: | bti c | b <dynamic_cond_resched+0x10> // or `nop` | mov w0, #0x0 | ret | mrs x0, sp_el0 | ldr x0, [x0, #8] | cbnz x0, <dynamic_cond_resched+0x8> | paciasp | stp x29, x30, [sp, #-16]! | mov x29, sp | bl <preempt_schedule_common> | mov w0, #0x1 | ldp x29, x30, [sp], #16 | autiasp | ret ... compared to the regular form of the function: | <__cond_resched>: | bti c | mrs x0, sp_el0 | ldr x1, [x0, #8] | cbz x1, <__cond_resched+0x18> | mov w0, #0x0 | ret | paciasp | stp x29, x30, [sp, #-16]! | mov x29, sp | bl <preempt_schedule_common> | mov w0, #0x1 | ldp x29, x30, [sp], #16 | autiasp | ret Any architecture which implements static keys should be able to use this to implement PREEMPT_DYNAMIC with similar cost to non-inlined static calls. Since this is likely to have greater overhead than (inlined) static calls, PREEMPT_DYNAMIC is only defaulted to enabled when HAVE_PREEMPT_DYNAMIC_CALL is selected.
Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Acked-by: Ard Biesheuvel Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20220214165216.2231574-6-mark.rutland@arm.com --- arch/Kconfig | 36 ++++++++++++++++++++++++++--- arch/x86/Kconfig | 2 +- include/linux/entry-common.h | 10 ++++++-- include/linux/kernel.h | 7 +++++- include/linux/sched.h | 10 +++++++- kernel/Kconfig.preempt | 3 ++- kernel/entry/common.c | 11 +++++++++ kernel/sched/core.c | 54 ++++++++++++++++++++++++++++++++++++++++++-- 8 files changed, 122 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 601691f1570f..d544abd14c01 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1278,11 +1278,41 @@ config HAVE_STATIC_CALL_INLINE config HAVE_PREEMPT_DYNAMIC bool + +config HAVE_PREEMPT_DYNAMIC_CALL + bool depends on HAVE_STATIC_CALL + select HAVE_PREEMPT_DYNAMIC + help + An architecture should select this if it can handle the preemption + model being selected at boot time using static calls. + + Where an architecture selects HAVE_STATIC_CALL_INLINE, any call to a + preemption function will be patched directly. + + Where an architecture does not select HAVE_STATIC_CALL_INLINE, any + call to a preemption function will go through a trampoline, and the + trampoline will be patched. + + It is strongly advised to support inline static call to avoid any + overhead. + +config HAVE_PREEMPT_DYNAMIC_KEY + bool + depends on HAVE_ARCH_JUMP_LABEL && CC_HAS_ASM_GOTO + select HAVE_PREEMPT_DYNAMIC help - Select this if the architecture support boot time preempt setting - on top of static calls. It is strongly advised to support inline - static call to avoid any overhead. + An architecture should select this if it can handle the preemption + model being selected at boot time using static keys. + + Each preemption function will be given an early return based on a + static key. 
This should have slightly lower overhead than non-inline + static calls, as this effectively inlines each trampoline into the + start of its callee. This may avoid redundant work, and may + integrate better with CFI schemes. + + This will have greater overhead than using inline static calls as + the call to the preemption function cannot be entirely elided. config ARCH_WANT_LD_ORPHAN_WARN bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ebe8fc76949a..f13cfdfb30ce 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -245,7 +245,7 @@ config X86 select HAVE_STACK_VALIDATION if X86_64 select HAVE_STATIC_CALL select HAVE_STATIC_CALL_INLINE if HAVE_STACK_VALIDATION - select HAVE_PREEMPT_DYNAMIC + select HAVE_PREEMPT_DYNAMIC_CALL select HAVE_RSEQ select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index dfd84c59b144..141952f4fee8 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -456,13 +456,19 @@ irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs); */ void raw_irqentry_exit_cond_resched(void); #ifdef CONFIG_PREEMPT_DYNAMIC +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #define irqentry_exit_cond_resched_dynamic_enabled raw_irqentry_exit_cond_resched #define irqentry_exit_cond_resched_dynamic_disabled NULL DECLARE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched); #define irqentry_exit_cond_resched() static_call(irqentry_exit_cond_resched)() -#else -#define irqentry_exit_cond_resched() raw_irqentry_exit_cond_resched() +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); +void dynamic_irqentry_exit_cond_resched(void); +#define irqentry_exit_cond_resched() dynamic_irqentry_exit_cond_resched() #endif +#else /* CONFIG_PREEMPT_DYNAMIC */ +#define irqentry_exit_cond_resched() raw_irqentry_exit_cond_resched() +#endif /* CONFIG_PREEMPT_DYNAMIC */ /** * 
irqentry_exit - Handle return from exception that used irqentry_enter() diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 33f47a996513..a890428bcc1a 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -99,7 +99,7 @@ struct user; extern int __cond_resched(void); # define might_resched() __cond_resched() -#elif defined(CONFIG_PREEMPT_DYNAMIC) +#elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) extern int __cond_resched(void); @@ -110,6 +110,11 @@ static __always_inline void might_resched(void) static_call_mod(might_resched)(); } +#elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) + +extern int dynamic_might_resched(void); +# define might_resched() dynamic_might_resched() + #else # define might_resched() do { } while (0) diff --git a/include/linux/sched.h b/include/linux/sched.h index 508b91d57470..de03ddeb064b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2020,7 +2020,7 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) extern int __cond_resched(void); -#ifdef CONFIG_PREEMPT_DYNAMIC +#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) DECLARE_STATIC_CALL(cond_resched, __cond_resched); @@ -2029,6 +2029,14 @@ static __always_inline int _cond_resched(void) return static_call_mod(cond_resched)(); } +#elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +extern int dynamic_cond_resched(void); + +static __always_inline int _cond_resched(void) +{ + return dynamic_cond_resched(); +} + #else static inline int _cond_resched(void) diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index ce77f0265660..c2f1fd95a821 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -96,8 +96,9 @@ config PREEMPTION config PREEMPT_DYNAMIC bool "Preemption behaviour defined on boot" depends on HAVE_PREEMPT_DYNAMIC && 
!PREEMPT_RT + select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY select PREEMPT_BUILD - default y + default y if HAVE_PREEMPT_DYNAMIC_CALL help This option allows to define the preemption model on the kernel command line parameter and thus override the default preemption diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 1739ca79613b..b145249ad91a 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -392,7 +393,17 @@ void raw_irqentry_exit_cond_resched(void) } } #ifdef CONFIG_PREEMPT_DYNAMIC +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched); +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); +void dynamic_irqentry_exit_cond_resched(void) +{ + if (!static_key_unlikely(&sk_dynamic_irqentry_exit_cond_resched)) + return; + raw_irqentry_exit_cond_resched(); +} +#endif #endif noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 300c0454a2b8..9e65028189f4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -6484,21 +6485,31 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) */ if (likely(!preemptible())) return; - preempt_schedule_common(); } NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); #ifdef CONFIG_PREEMPT_DYNAMIC +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #ifndef preempt_schedule_dynamic_enabled #define preempt_schedule_dynamic_enabled preempt_schedule #define preempt_schedule_dynamic_disabled NULL #endif DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled); EXPORT_STATIC_CALL_TRAMP(preempt_schedule); +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule); +void __sched 
notrace dynamic_preempt_schedule(void) +{ + if (!static_branch_unlikely(&sk_dynamic_preempt_schedule)) + return; + preempt_schedule(); +} +NOKPROBE_SYMBOL(dynamic_preempt_schedule); +EXPORT_SYMBOL(dynamic_preempt_schedule); +#endif #endif - /** * preempt_schedule_notrace - preempt_schedule called by tracing @@ -6553,12 +6564,24 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) EXPORT_SYMBOL_GPL(preempt_schedule_notrace); #ifdef CONFIG_PREEMPT_DYNAMIC +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #ifndef preempt_schedule_notrace_dynamic_enabled #define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace #define preempt_schedule_notrace_dynamic_disabled NULL #endif DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled); EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace); +void __sched notrace dynamic_preempt_schedule_notrace(void) +{ + if (!static_branch_unlikely(&sk_dynamic_preempt_schedule_notrace)) + return; + preempt_schedule_notrace(); +} +NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace); +EXPORT_SYMBOL(dynamic_preempt_schedule_notrace); +#endif #endif #endif /* CONFIG_PREEMPTION */ @@ -8068,6 +8091,7 @@ EXPORT_SYMBOL(__cond_resched); #endif #ifdef CONFIG_PREEMPT_DYNAMIC +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #define cond_resched_dynamic_enabled __cond_resched #define cond_resched_dynamic_disabled ((void *)&__static_call_return0) DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched); @@ -8077,6 +8101,25 @@ EXPORT_STATIC_CALL_TRAMP(cond_resched); #define might_resched_dynamic_disabled ((void *)&__static_call_return0) DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched); EXPORT_STATIC_CALL_TRAMP(might_resched); +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); +int __sched dynamic_cond_resched(void) +{ + if 
(!static_branch_unlikely(&sk_dynamic_cond_resched)) + return 0; + return __cond_resched(); +} +EXPORT_SYMBOL(dynamic_cond_resched); + +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched); +int __sched dynamic_might_resched(void) +{ + if (!static_branch_unlikely(&sk_dynamic_might_resched)) + return 0; + return __cond_resched(); +} +EXPORT_SYMBOL(dynamic_might_resched); +#endif #endif /* @@ -8206,8 +8249,15 @@ int sched_dynamic_mode(const char *str) return -EINVAL; } +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) #define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key) +#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key) +#else +#error "Unsupported PREEMPT_DYNAMIC mechanism" +#endif void sched_dynamic_update(int mode) { -- cgit v1.2.3 From d0b3822902b6af45f2c75706d7eb2a35aacab223 Mon Sep 17 00:00:00 2001 From: "Souptick Joarder (HPE)" Date: Sat, 19 Feb 2022 22:09:15 +0530 Subject: bpf: Initialize ret to 0 inside btf_populate_kfunc_set() Kernel test robot reported below error -> kernel/bpf/btf.c:6718 btf_populate_kfunc_set() error: uninitialized symbol 'ret'. Initialize ret to 0. 
Fixes: dee872e124e8 ("bpf: Populate kfunc BTF ID sets in struct btf") Reported-by: kernel test robot Signed-off-by: Souptick Joarder (HPE) Signed-off-by: Alexei Starovoitov Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20220219163915.125770-1-jrdr.linux@gmail.com --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 02d7014417a0..2c4c5dbe2abe 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6706,7 +6706,7 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, const struct btf_kfunc_id_set *kset) { bool vmlinux_set = !btf_is_module(btf); - int type, ret; + int type, ret = 0; for (type = 0; type < ARRAY_SIZE(kset->sets); type++) { if (!kset->sets[type]) -- cgit v1.2.3 From 44a3918c8245ab10c6c9719dd12e7a8d291980d8 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 18 Feb 2022 11:49:08 -0800 Subject: x86/speculation: Include unprivileged eBPF status in Spectre v2 mitigation reporting With unprivileged eBPF enabled, eIBRS (without retpoline) is vulnerable to Spectre v2 BHB-based attacks. When both are enabled, print a warning message and report it in the 'spectre_v2' sysfs vulnerabilities file. 
Signed-off-by: Josh Poimboeuf Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner --- arch/x86/kernel/cpu/bugs.c | 35 +++++++++++++++++++++++++++++------ include/linux/bpf.h | 11 +++++++++++ kernel/sysctl.c | 7 +++++++ 3 files changed, 47 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 79c52dd6c597..0a4267c63d3b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -650,6 +651,16 @@ static inline const char *spectre_v2_module_string(void) static inline const char *spectre_v2_module_string(void) { return ""; } #endif +#define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" + +#ifdef CONFIG_BPF_SYSCALL +void unpriv_ebpf_notify(int new_state) +{ + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && !new_state) + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); +} +#endif + static inline bool match_option(const char *arg, int arglen, const char *opt) { int len = strlen(opt); @@ -994,6 +1005,9 @@ static void __init spectre_v2_select_mitigation(void) break; } + if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); + if (spectre_v2_in_eibrs_mode(mode)) { /* Force it so VMEXIT will restore correctly */ x86_spec_ctrl_base |= SPEC_CTRL_IBRS; @@ -1780,6 +1794,20 @@ static char *ibpb_state(void) return ""; } +static ssize_t spectre_v2_show_state(char *buf) +{ + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n"); + + return sprintf(buf, "%s%s%s%s%s%s\n", + spectre_v2_strings[spectre_v2_enabled], + ibpb_state(), + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + stibp_state(), + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? 
", RSB filling" : "", + spectre_v2_module_string()); +} + static ssize_t srbds_show_state(char *buf) { return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); @@ -1805,12 +1833,7 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]); case X86_BUG_SPECTRE_V2: - return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], - ibpb_state(), - boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", - stibp_state(), - boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", - spectre_v2_module_string()); + return spectre_v2_show_state(buf); case X86_BUG_SPEC_STORE_BYPASS: return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fa517ae604ad..1f56806d8eb9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1793,6 +1793,11 @@ struct bpf_core_ctx { int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, int relo_idx, void *insn); +static inline bool unprivileged_ebpf_enabled(void) +{ + return !sysctl_unprivileged_bpf_disabled; +} + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -2012,6 +2017,12 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog, { return NULL; } + +static inline bool unprivileged_ebpf_enabled(void) +{ + return false; +} + #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5ae443b2882e..730ab56d9e92 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -180,6 +180,10 @@ static int bpf_stats_handler(struct ctl_table *table, int write, return ret; } +void __weak unpriv_ebpf_notify(int new_state) +{ +} + static int bpf_unpriv_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -197,6 +201,9 @@ static int bpf_unpriv_handler(struct ctl_table *table, int write, return -EPERM; 
*(int *)table->data = unpriv_enable; } + + unpriv_ebpf_notify(unpriv_enable); + return ret; } #endif /* CONFIG_BPF_SYSCALL && CONFIG_SYSCTL */ -- cgit v1.2.3 From 509853f9e1e7b1490dc79f735a5dbafc9298f40d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 11 Feb 2022 19:14:54 +0100 Subject: genirq: Provide generic_handle_irq_safe() Provide generic_handle_irq_safe() which can be used from any context. Suggested-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Reviewed-by: Hans de Goede Reviewed-by: Oleksandr Natalenko Reviewed-by: Wolfram Sang Link: https://lore.kernel.org/r/20220211181500.1856198-2-bigeasy@linutronix.de --- include/linux/irqdesc.h | 1 + kernel/irq/irqdesc.c | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'kernel') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 93d270ca0c56..a77584593f7d 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -160,6 +160,7 @@ static inline void generic_handle_irq_desc(struct irq_desc *desc) int handle_irq_desc(struct irq_desc *desc); int generic_handle_irq(unsigned int irq); +int generic_handle_irq_safe(unsigned int irq); #ifdef CONFIG_IRQ_DOMAIN /* diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 2267e6527db3..346d283d2da1 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -662,6 +662,29 @@ int generic_handle_irq(unsigned int irq) } EXPORT_SYMBOL_GPL(generic_handle_irq); +/** + * generic_handle_irq_safe - Invoke the handler for a particular irq from any + * context. + * @irq: The irq number to handle + * + * Returns: 0 on success, a negative value on error. + * + * This function can be called from any context (IRQ or process context). It + * will report an error if not invoked from IRQ context and the irq has been + * marked to enforce IRQ-context only.
+ */ +int generic_handle_irq_safe(unsigned int irq) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = handle_irq_desc(irq_to_desc(irq)); + local_irq_restore(flags); + return ret; +} +EXPORT_SYMBOL_GPL(generic_handle_irq_safe); + #ifdef CONFIG_IRQ_DOMAIN /** * generic_handle_domain_irq - Invoke the handler for a HW irq belonging -- cgit v1.2.3 From a5a763b2b26678f1e01b2d031819b175d8f14555 Mon Sep 17 00:00:00 2001 From: Andre Kalb Date: Wed, 16 Feb 2022 11:41:38 +0100 Subject: printk: Set console_set_on_cmdline=1 when __add_preferred_console() is called with user_specified == true In case of using console="" or console=null set console_set_on_cmdline=1 to disable "stdout-path" node from DT. We basically need to set it every time when __add_preferred_console() is called with parameter 'user_specified' set. Therefore we can move setting it into a helper function that is called from __add_preferred_console(). Suggested-by: Petr Mladek Signed-off-by: Andre Kalb Reviewed-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/YgzU4ho8l6XapyG2@pc6682 --- kernel/printk/printk.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 25dce8b74791..266cc974b0e3 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2323,6 +2323,20 @@ asmlinkage __visible void early_printk(const char *fmt, ...) } #endif +static void set_user_specified(struct console_cmdline *c, bool user_specified) +{ + if (!user_specified) + return; + + /* + * @c console was defined by the user on the command line. + * Do not clear when added twice also by SPCR or the device tree. + */ + c->user_specified = true; + /* At least one console defined by the user on the command line. 
*/ + console_set_on_cmdline = 1; +} + static int __add_preferred_console(char *name, int idx, char *options, char *brl_options, bool user_specified) { @@ -2339,8 +2353,7 @@ static int __add_preferred_console(char *name, int idx, char *options, if (strcmp(c->name, name) == 0 && c->index == idx) { if (!brl_options) preferred_console = i; - if (user_specified) - c->user_specified = true; + set_user_specified(c, user_specified); return 0; } } @@ -2350,7 +2363,7 @@ static int __add_preferred_console(char *name, int idx, char *options, preferred_console = i; strlcpy(c->name, name, sizeof(c->name)); c->options = options; - c->user_specified = user_specified; + set_user_specified(c, user_specified); braille_set_options(c, brl_options); c->index = idx; @@ -2416,7 +2429,6 @@ static int __init console_setup(char *str) *s = 0; __add_preferred_console(buf, idx, options, brl_options, true); - console_set_on_cmdline = 1; return 1; } __setup("console=", console_setup); -- cgit v1.2.3 From ce06e863f36f16cdc3b84c7206cd13d5f597d623 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 16 Feb 2022 11:19:57 +0800 Subject: printk: make suppress_panic_printk static This symbol is not used outside of printk.c, so marks it static. Fix the following sparse warning: kernel/printk/printk.c:100:19: warning: symbol 'suppress_panic_printk' was not declared. Should it be static? 
Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Reviewed-by: Sergey Senozhatsky Reviewed-by: Miguel Ojeda Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220216031957.9761-1-jiapeng.chong@linux.alibaba.com --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 6a51907a33b9..f9430ac4caca 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -97,7 +97,7 @@ int __read_mostly suppress_printk; * During panic, heavy printk by other CPUs can delay the * panic and risk deadlock on console resources. */ -int __read_mostly suppress_panic_printk; +static int __read_mostly suppress_panic_printk; #ifdef CONFIG_LOCKDEP static struct lockdep_map console_lock_dep_map = { -- cgit v1.2.3 From 3191dd5a1179ef0fad5a050a1702ae98b6251e8f Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 13 Feb 2022 22:48:04 +0100 Subject: random: clear fast pool, crng, and batches in cpuhp bring up For the irq randomness fast pool, rather than having to use expensive atomics, which were visibly the most expensive thing in the entire irq handler, simply take care of the extreme edge case of resetting count to zero in the cpuhp online handler, just after workqueues have been reenabled. This simplifies the code a bit and lets us use vanilla variables rather than atomics, and performance should be improved. As well, very early on when the CPU comes up, while interrupts are still disabled, we clear out the per-cpu crng and its batches, so that it always starts with fresh randomness. Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Theodore Ts'o Cc: Sultan Alsawaf Cc: Dominik Brodowski Acked-by: Sebastian Andrzej Siewior Signed-off-by: Jason A. 
Donenfeld --- drivers/char/random.c | 62 +++++++++++++++++++++++++++++++++++----------- include/linux/cpuhotplug.h | 2 ++ include/linux/random.h | 5 ++++ kernel/cpu.c | 11 ++++++++ 4 files changed, 65 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/drivers/char/random.c b/drivers/char/random.c index bca4467e540f..d73a75cbe82d 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -698,6 +698,25 @@ u32 get_random_u32(void) } EXPORT_SYMBOL(get_random_u32); +#ifdef CONFIG_SMP +/* + * This function is called when the CPU is coming up, with entry + * CPUHP_RANDOM_PREPARE, which comes before CPUHP_WORKQUEUE_PREP. + */ +int random_prepare_cpu(unsigned int cpu) +{ + /* + * When the cpu comes back online, immediately invalidate both + * the per-cpu crng and all batches, so that we serve fresh + * randomness. + */ + per_cpu_ptr(&crngs, cpu)->generation = ULONG_MAX; + per_cpu_ptr(&batched_entropy_u32, cpu)->position = UINT_MAX; + per_cpu_ptr(&batched_entropy_u64, cpu)->position = UINT_MAX; + return 0; +} +#endif + /** * randomize_page - Generate a random, page aligned address * @start: The smallest acceptable address the caller will take. @@ -1183,7 +1202,7 @@ struct fast_pool { }; struct work_struct mix; unsigned long last; - atomic_t count; + unsigned int count; u16 reg_idx; }; @@ -1219,6 +1238,29 @@ static void fast_mix(u32 pool[4]) static DEFINE_PER_CPU(struct fast_pool, irq_randomness); +#ifdef CONFIG_SMP +/* + * This function is called when the CPU has just come online, with + * entry CPUHP_AP_RANDOM_ONLINE, just after CPUHP_AP_WORKQUEUE_ONLINE. + */ +int random_online_cpu(unsigned int cpu) +{ + /* + * During CPU shutdown and before CPU onlining, add_interrupt_ + * randomness() may schedule mix_interrupt_randomness(), and + * set the MIX_INFLIGHT flag. However, because the worker can + * be scheduled on a different CPU during this period, that + * flag will never be cleared. 
For that reason, we zero out + * the flag here, which runs just after workqueues are onlined + * for the CPU again. This also has the effect of setting the + * irq randomness count to zero so that new accumulated irqs + * are fresh. + */ + per_cpu_ptr(&irq_randomness, cpu)->count = 0; + return 0; +} +#endif + static u32 get_reg(struct fast_pool *f, struct pt_regs *regs) { u32 *ptr = (u32 *)regs; @@ -1243,15 +1285,6 @@ static void mix_interrupt_randomness(struct work_struct *work) local_irq_disable(); if (fast_pool != this_cpu_ptr(&irq_randomness)) { local_irq_enable(); - /* - * If we are unlucky enough to have been moved to another CPU, - * during CPU hotplug while the CPU was shutdown then we set - * our count to zero atomically so that when the CPU comes - * back online, it can enqueue work again. The _release here - * pairs with the atomic_inc_return_acquire in - * add_interrupt_randomness(). - */ - atomic_set_release(&fast_pool->count, 0); return; } @@ -1260,7 +1293,7 @@ static void mix_interrupt_randomness(struct work_struct *work) * consistent view, before we reenable irqs again. */ memcpy(pool, fast_pool->pool32, sizeof(pool)); - atomic_set(&fast_pool->count, 0); + fast_pool->count = 0; fast_pool->last = jiffies; local_irq_enable(); @@ -1296,14 +1329,13 @@ void add_interrupt_randomness(int irq) } fast_mix(fast_pool->pool32); - /* The _acquire here pairs with the atomic_set_release in mix_interrupt_randomness(). 
*/ - new_count = (unsigned int)atomic_inc_return_acquire(&fast_pool->count); + new_count = ++fast_pool->count; if (unlikely(crng_init == 0)) { if (new_count >= 64 && crng_pre_init_inject(fast_pool->pool32, sizeof(fast_pool->pool32), true, true) > 0) { - atomic_set(&fast_pool->count, 0); + fast_pool->count = 0; fast_pool->last = now; if (spin_trylock(&input_pool.lock)) { _mix_pool_bytes(&fast_pool->pool32, sizeof(fast_pool->pool32)); @@ -1321,7 +1353,7 @@ void add_interrupt_randomness(int irq) if (unlikely(!fast_pool->mix.func)) INIT_WORK(&fast_pool->mix, mix_interrupt_randomness); - atomic_or(MIX_INFLIGHT, &fast_pool->count); + fast_pool->count |= MIX_INFLIGHT; queue_work_on(raw_smp_processor_id(), system_highpri_wq, &fast_pool->mix); } EXPORT_SYMBOL_GPL(add_interrupt_randomness); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 411a428ace4d..481e565cc5c4 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -100,6 +100,7 @@ enum cpuhp_state { CPUHP_AP_ARM_CACHE_B15_RAC_DEAD, CPUHP_PADATA_DEAD, CPUHP_AP_DTPM_CPU_DEAD, + CPUHP_RANDOM_PREPARE, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, @@ -240,6 +241,7 @@ enum cpuhp_state { CPUHP_AP_PERF_CSKY_ONLINE, CPUHP_AP_WATCHDOG_ONLINE, CPUHP_AP_WORKQUEUE_ONLINE, + CPUHP_AP_RANDOM_ONLINE, CPUHP_AP_RCUTREE_ONLINE, CPUHP_AP_BASE_CACHEINFO_ONLINE, CPUHP_AP_ONLINE_DYN, diff --git a/include/linux/random.h b/include/linux/random.h index d7354de9351e..6148b8d1ccf3 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -156,4 +156,9 @@ static inline bool __init arch_get_random_long_early(unsigned long *v) } #endif +#ifdef CONFIG_SMP +extern int random_prepare_cpu(unsigned int cpu); +extern int random_online_cpu(unsigned int cpu); +#endif + #endif /* _LINUX_RANDOM_H */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 407a2568f35e..238cba15449f 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -34,6 +34,7 @@ #include #include #include +#include #include 
#define CREATE_TRACE_POINTS @@ -1659,6 +1660,11 @@ static struct cpuhp_step cpuhp_hp_states[] = { .startup.single = perf_event_init_cpu, .teardown.single = perf_event_exit_cpu, }, + [CPUHP_RANDOM_PREPARE] = { + .name = "random:prepare", + .startup.single = random_prepare_cpu, + .teardown.single = NULL, + }, [CPUHP_WORKQUEUE_PREP] = { .name = "workqueue:prepare", .startup.single = workqueue_prepare_cpu, @@ -1782,6 +1788,11 @@ static struct cpuhp_step cpuhp_hp_states[] = { .startup.single = workqueue_online_cpu, .teardown.single = workqueue_offline_cpu, }, + [CPUHP_AP_RANDOM_ONLINE] = { + .name = "random:online", + .startup.single = random_online_cpu, + .teardown.single = NULL, + }, [CPUHP_AP_RCUTREE_ONLINE] = { .name = "RCU/tree:online", .startup.single = rcutree_online_cpu, -- cgit v1.2.3 From 6d3971dab239e7db1691690a02ce6becf30689cb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 21 Feb 2022 16:16:39 +0100 Subject: cgroup: clarify cgroup_css_set_fork() With recent fixes for the permission checking when moving a task into a cgroup using a file descriptor to a cgroup's cgroup.procs file and calling write() it seems a good idea to clarify CLONE_INTO_CGROUP permission checking with a comment. Cc: Tejun Heo Cc: Signed-off-by: Christian Brauner Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b31e1465868a..77702e089d6a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6161,6 +6161,20 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) if (ret) goto err; + /* + * Spawning a task directly into a cgroup works by passing a file + * descriptor to the target cgroup directory. This can even be an O_PATH + * file descriptor. But it can never be a cgroup.procs file descriptor. 
+ * This was done on purpose so spawning into a cgroup could be + * conceptualized as an atomic + * + * fd = openat(dfd_cgroup, "cgroup.procs", ...); + * write(fd, <child-pid>, ...); + * + * sequence, i.e. it's a shorthand for the caller opening and writing + * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us + * to always use the caller's credentials. + */ ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, !(kargs->flags & CLONE_THREAD), current->nsproxy->cgroup_ns); -- cgit v1.2.3 From 467a726b754f474936980da793b4ff2ec3e382a7 Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Thu, 17 Feb 2022 17:11:28 +0100 Subject: cgroup-v1: Correct privileges check in release_agent writes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The idea is to check: a) the owning user_ns of cgroup_ns, b) capabilities in init_user_ns. The commit 24f600856418 ("cgroup-v1: Require capabilities to set release_agent") got this wrong in the write handler of release_agent since it checked user_ns of the opener (may be different from the owning user_ns of cgroup_ns). Secondly, to avoid possibly confused deputy, the capability of the opener must be checked.
Fixes: 24f600856418 ("cgroup-v1: Require capabilities to set release_agent") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/stable/20220216121142.GB30035@blackbody.suse.cz/ Signed-off-by: Michal Koutný Reviewed-by: Masami Ichikawa(CIP) Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 0e877dbcfeea..afc6c0e9c966 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -546,6 +546,7 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; + struct cgroup_file_ctx *ctx; BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); @@ -553,8 +554,9 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, * Release agent gets called with all capabilities, * require capabilities to set release agent. */ - if ((of->file->f_cred->user_ns != &init_user_ns) || - !capable(CAP_SYS_ADMIN)) + ctx = of->priv; + if ((ctx->ns->user_ns != &init_user_ns) || + !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN)) return -EPERM; cgrp = cgroup_kn_lock_live(of->kn, false); -- cgit v1.2.3 From 272ceeaea355214b301530e262a0df8600bfca95 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 22 Feb 2022 11:44:51 -0500 Subject: audit: log AUDIT_TIME_* records only from rules AUDIT_TIME_* events are generated when there are syscall rules present that are not related to time keeping. This will produce noisy log entries that could flood the logs and hide events we really care about. Rather than immediately produce the AUDIT_TIME_* records, store the data in the context and log it at syscall exit time respecting the filter rules. Note: This eats the audit_buffer, unlike any others in show_special(). 
Please see https://bugzilla.redhat.com/show_bug.cgi?id=1991919 Fixes: 7e8eda734d30 ("ntp: Audit NTP parameters adjustment") Fixes: 2d87a0674bd6 ("timekeeping: Audit clock adjustments") Signed-off-by: Richard Guy Briggs [PM: fixed style/whitespace issues] Signed-off-by: Paul Moore --- kernel/audit.h | 4 +++ kernel/auditsc.c | 87 +++++++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 71 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index c4498090a5bd..58b66543b4d5 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -201,6 +201,10 @@ struct audit_context { struct { char *name; } module; + struct { + struct audit_ntp_data ntp_data; + struct timespec64 tk_injoffset; + } time; }; int fds[2]; struct audit_proctitle proctitle; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fce5d43a933f..0efd75e4730f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1340,6 +1340,53 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) from_kuid(&init_user_ns, name->fcap.rootid)); } +static void audit_log_time(struct audit_context *context, struct audit_buffer **ab) +{ + const struct audit_ntp_data *ntp = &context->time.ntp_data; + const struct timespec64 *tk = &context->time.tk_injoffset; + static const char * const ntp_name[] = { + "offset", + "freq", + "status", + "tai", + "tick", + "adjust", + }; + int type; + + if (context->type == AUDIT_TIME_ADJNTPVAL) { + for (type = 0; type < AUDIT_NTP_NVALS; type++) { + if (ntp->vals[type].newval != ntp->vals[type].oldval) { + if (!*ab) { + *ab = audit_log_start(context, + GFP_KERNEL, + AUDIT_TIME_ADJNTPVAL); + if (!*ab) + return; + } + audit_log_format(*ab, "op=%s old=%lli new=%lli", + ntp_name[type], + ntp->vals[type].oldval, + ntp->vals[type].newval); + audit_log_end(*ab); + *ab = NULL; + } + } + } + if (tk->tv_sec != 0 || tk->tv_nsec != 0) { + if (!*ab) { + *ab = audit_log_start(context, GFP_KERNEL, + AUDIT_TIME_INJOFFSET); + if 
(!*ab) + return; + } + audit_log_format(*ab, "sec=%lli nsec=%li", + (long long)tk->tv_sec, tk->tv_nsec); + audit_log_end(*ab); + *ab = NULL; + } +} + static void show_special(struct audit_context *context, int *call_panic) { struct audit_buffer *ab; @@ -1454,6 +1501,11 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_format(ab, "(null)"); break; + case AUDIT_TIME_ADJNTPVAL: + case AUDIT_TIME_INJOFFSET: + /* this call deviates from the rest, eating the buffer */ + audit_log_time(context, &ab); + break; } audit_log_end(ab); } @@ -2849,31 +2901,26 @@ void __audit_fanotify(unsigned int response) void __audit_tk_injoffset(struct timespec64 offset) { - audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_INJOFFSET, - "sec=%lli nsec=%li", - (long long)offset.tv_sec, offset.tv_nsec); -} - -static void audit_log_ntp_val(const struct audit_ntp_data *ad, - const char *op, enum audit_ntp_type type) -{ - const struct audit_ntp_val *val = &ad->vals[type]; - - if (val->newval == val->oldval) - return; + struct audit_context *context = audit_context(); - audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_ADJNTPVAL, - "op=%s old=%lli new=%lli", op, val->oldval, val->newval); + /* only set type if not already set by NTP */ + if (!context->type) + context->type = AUDIT_TIME_INJOFFSET; + memcpy(&context->time.tk_injoffset, &offset, sizeof(offset)); } void __audit_ntp_log(const struct audit_ntp_data *ad) { - audit_log_ntp_val(ad, "offset", AUDIT_NTP_OFFSET); - audit_log_ntp_val(ad, "freq", AUDIT_NTP_FREQ); - audit_log_ntp_val(ad, "status", AUDIT_NTP_STATUS); - audit_log_ntp_val(ad, "tai", AUDIT_NTP_TAI); - audit_log_ntp_val(ad, "tick", AUDIT_NTP_TICK); - audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST); + struct audit_context *context = audit_context(); + int type; + + for (type = 0; type < AUDIT_NTP_NVALS; type++) + if (ad->vals[type].newval != ad->vals[type].oldval) { + /* unconditionally set type, overwriting TK */ + context->type = 
AUDIT_TIME_ADJNTPVAL; + memcpy(&context->time.ntp_data, ad, sizeof(*ad)); + break; + } } void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, -- cgit v1.2.3 From c70cd039f1d779126347a896a58876782dcc5284 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 16 Feb 2022 11:17:53 +0800 Subject: cpuset: Fix kernel-doc Fix the following W=1 kernel warnings: kernel/cgroup/cpuset.c:3718: warning: expecting prototype for cpuset_memory_pressure_bump(). Prototype was for __cpuset_memory_pressure_bump() instead. kernel/cgroup/cpuset.c:3568: warning: expecting prototype for cpuset_node_allowed(). Prototype was for __cpuset_node_allowed() instead. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 97c53f3cc917..5de18448016c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3524,8 +3524,8 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) return cs; } -/** - * cpuset_node_allowed - Can we allocate on a memory node? +/* + * __cpuset_node_allowed - Can we allocate on a memory node? * @node: is this an allowed node? * @gfp_mask: memory allocation flags * @@ -3696,8 +3696,8 @@ void cpuset_print_current_mems_allowed(void) int cpuset_memory_pressure_enabled __read_mostly; -/** - * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. +/* + * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. * * Keep a running average of the rate of synchronous (direct) * page reclaim efforts initiated by tasks in each cpuset. @@ -3712,7 +3712,7 @@ int cpuset_memory_pressure_enabled __read_mostly; * "memory_pressure". Value displayed is an integer * representing the recent rate of entry into the synchronous * (direct) page reclaim by any task attached to the cpuset. 
- **/ + */ void __cpuset_memory_pressure_bump(void) { -- cgit v1.2.3 From be9a2277cafd318976d59c41a7f45a934ec43b26 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 17 Feb 2022 11:23:59 +0100 Subject: fork: Redo ifdefs around task stack handling The use of ifdef CONFIG_VMAP_STACK is confusing in terms of what is actually happening and what can happen. For instance from reading free_thread_stack() it appears that in the CONFIG_VMAP_STACK case it may receive a non-NULL vm pointer but it may also be NULL in which case __free_pages() is used to free the stack. This is however not the case because in the VMAP case a non-NULL pointer is always returned here. Since it looks like this might happen, the compiler creates the correct dead code with the invocation to __free_pages() and everything around it. Twice. Add spaces between the ifdef and the identifier to recognize the ifdef level which is currently in scope. Add the current identifier as a comment behind #else and #endif. Move the code within free_thread_stack() and alloc_thread_stack_node() into the relevant ifdef blocks. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20220217102406.3697941-2-bigeasy@linutronix.de --- kernel/fork.c | 74 +++++++++++++++++++++++++++++++---------------------------- 1 file changed, 39 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a024bf6254df..f5cc10164334 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -185,7 +185,7 @@ static inline void free_task_struct(struct task_struct *tsk) */ # if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) -#ifdef CONFIG_VMAP_STACK +# ifdef CONFIG_VMAP_STACK /* * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB * flush. Try to minimize the number of calls by caching stacks.
@@ -210,11 +210,9 @@ static int free_vm_stack_cache(unsigned int cpu) return 0; } -#endif static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { -#ifdef CONFIG_VMAP_STACK void *stack; int i; @@ -258,45 +256,53 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) tsk->stack = stack; } return stack; -#else - struct page *page = alloc_pages_node(node, THREADINFO_GFP, - THREAD_SIZE_ORDER); - - if (likely(page)) { - tsk->stack = kasan_reset_tag(page_address(page)); - return tsk->stack; - } - return NULL; -#endif } -static inline void free_thread_stack(struct task_struct *tsk) +static void free_thread_stack(struct task_struct *tsk) { -#ifdef CONFIG_VMAP_STACK struct vm_struct *vm = task_stack_vm_area(tsk); + int i; - if (vm) { - int i; + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) + memcg_kmem_uncharge_page(vm->pages[i], 0); - for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) - memcg_kmem_uncharge_page(vm->pages[i], 0); + for (i = 0; i < NR_CACHED_STACKS; i++) { + if (this_cpu_cmpxchg(cached_stacks[i], NULL, + tsk->stack_vm_area) != NULL) + continue; - for (i = 0; i < NR_CACHED_STACKS; i++) { - if (this_cpu_cmpxchg(cached_stacks[i], - NULL, tsk->stack_vm_area) != NULL) - continue; + tsk->stack = NULL; + tsk->stack_vm_area = NULL; + return; + } + vfree_atomic(tsk->stack); + tsk->stack = NULL; + tsk->stack_vm_area = NULL; +} - return; - } +# else /* !CONFIG_VMAP_STACK */ - vfree_atomic(tsk->stack); - return; +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) +{ + struct page *page = alloc_pages_node(node, THREADINFO_GFP, + THREAD_SIZE_ORDER); + + if (likely(page)) { + tsk->stack = kasan_reset_tag(page_address(page)); + return tsk->stack; } -#endif + return NULL; +} +static void free_thread_stack(struct task_struct *tsk) +{ __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER); + tsk->stack = NULL; } -# else + +# endif /* CONFIG_VMAP_STACK */ +# else /* !(THREAD_SIZE >= PAGE_SIZE || 
defined(CONFIG_VMAP_STACK)) */ + static struct kmem_cache *thread_stack_cache; static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, @@ -312,6 +318,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, static void free_thread_stack(struct task_struct *tsk) { kmem_cache_free(thread_stack_cache, tsk->stack); + tsk->stack = NULL; } void thread_stack_cache_init(void) @@ -321,8 +328,9 @@ void thread_stack_cache_init(void) THREAD_SIZE, NULL); BUG_ON(thread_stack_cache == NULL); } -# endif -#endif + +# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */ +#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */ /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; @@ -432,10 +440,6 @@ static void release_task_stack(struct task_struct *tsk) account_kernel_stack(tsk, -1); free_thread_stack(tsk); - tsk->stack = NULL; -#ifdef CONFIG_VMAP_STACK - tsk->stack_vm_area = NULL; -#endif } #ifdef CONFIG_THREAD_INFO_IN_TASK -- cgit v1.2.3 From 546c42b2c5c161619736dd730d3df709181999d0 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 17 Feb 2022 11:24:00 +0100 Subject: fork: Duplicate task_struct before stack allocation alloc_thread_stack_node() already populates the task_struct::stack member except on IA64. The stack pointer is saved and populated again because IA64 needs it and arch_dup_task_struct() overwrites it. Allocate thread's stack after task_struct has been duplicated as a preparation for further changes. 
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20220217102406.3697941-3-bigeasy@linutronix.de --- kernel/fork.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index f5cc10164334..30c01ce2ae57 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -888,6 +888,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!tsk) return NULL; + err = arch_dup_task_struct(tsk, orig); + if (err) + goto free_tsk; + stack = alloc_thread_stack_node(tsk, node); if (!stack) goto free_tsk; @@ -897,8 +901,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) stack_vm_area = task_stack_vm_area(tsk); - err = arch_dup_task_struct(tsk, orig); - /* * arch_dup_task_struct() clobbers the stack-related fields. Make * sure they're properly initialized before using any stack-related @@ -912,9 +914,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) refcount_set(&tsk->stack_refcount, 1); #endif - if (err) - goto free_stack; - err = scs_prepare(tsk, node); if (err) goto free_stack; -- cgit v1.2.3 From 2bb0529c0bc0698f3baf3e88ffd61a18eef252a7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 17 Feb 2022 11:24:01 +0100 Subject: fork, IA64: Provide alloc_thread_stack_node() for IA64 Provide a generic alloc_thread_stack_node() for IA64 and CONFIG_ARCH_THREAD_STACK_ALLOCATOR which returns stack pointer and sets task_struct::stack so it behaves exactly like the other implementations. Rename IA64's alloc_thread_stack_node() and add the generic version to the fork code so it is in one place _and_ to drastically lower the chances of fat fingering the IA64 code. Do the same for free_thread_stack(). 
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20220217102406.3697941-4-bigeasy@linutronix.de --- arch/ia64/include/asm/thread_info.h | 6 +++--- kernel/fork.c | 17 +++++++++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index 51d20cb37706..1684716f0820 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -55,15 +55,15 @@ struct thread_info { #ifndef ASM_OFFSETS_C /* how to get the thread information struct from C */ #define current_thread_info() ((struct thread_info *) ((char *) current + IA64_TASK_SIZE)) -#define alloc_thread_stack_node(tsk, node) \ +#define arch_alloc_thread_stack_node(tsk, node) \ ((unsigned long *) ((char *) (tsk) + IA64_TASK_SIZE)) #define task_thread_info(tsk) ((struct thread_info *) ((char *) (tsk) + IA64_TASK_SIZE)) #else #define current_thread_info() ((struct thread_info *) 0) -#define alloc_thread_stack_node(tsk, node) ((unsigned long *) 0) +#define arch_alloc_thread_stack_node(tsk, node) ((unsigned long *) 0) #define task_thread_info(tsk) ((struct thread_info *) 0) #endif -#define free_thread_stack(tsk) /* nothing */ +#define arch_free_thread_stack(tsk) /* nothing */ #define task_stack_page(tsk) ((void *)(tsk)) #define __HAVE_THREAD_FUNCTIONS diff --git a/kernel/fork.c b/kernel/fork.c index 30c01ce2ae57..7b70c4741072 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -330,6 +330,23 @@ void thread_stack_cache_init(void) } # endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */ +#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */ + +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) +{ + unsigned long *stack; + + stack = arch_alloc_thread_stack_node(tsk, node); + tsk->stack = stack; + return stack; +} + +static void free_thread_stack(struct task_struct *tsk) +{ + 
arch_free_thread_stack(tsk); + tsk->stack = NULL; +} + #endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */ /* SLAB cache for signal_struct structures (tsk->signal) */ -- cgit v1.2.3 From 7865aba3ade4cf30f0ac08e015550084a50d9afb Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 17 Feb 2022 11:24:02 +0100 Subject: fork: Don't assign the stack pointer in dup_task_struct() All four versions of alloc_thread_stack_node() assign now task_struct::stack in case the allocation was successful. Let alloc_thread_stack_node() return an error code instead of the stack pointer and remove the stack assignment in dup_task_struct(). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20220217102406.3697941-5-bigeasy@linutronix.de --- kernel/fork.c | 47 ++++++++++++++++------------------------------- 1 file changed, 16 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 7b70c4741072..875bd43f02ca 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -211,7 +211,7 @@ static int free_vm_stack_cache(unsigned int cpu) return 0; } -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) +static int alloc_thread_stack_node(struct task_struct *tsk, int node) { void *stack; int i; @@ -232,7 +232,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) tsk->stack_vm_area = s; tsk->stack = s->addr; - return s->addr; + return 0; } /* @@ -245,17 +245,16 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) THREADINFO_GFP & ~__GFP_ACCOUNT, PAGE_KERNEL, 0, node, __builtin_return_address(0)); - + if (!stack) + return -ENOMEM; /* * We can't call find_vm_area() in interrupt context, and * free_thread_stack() can be called in interrupt context, * so cache the vm_struct. 
*/ - if (stack) { - tsk->stack_vm_area = find_vm_area(stack); - tsk->stack = stack; - } - return stack; + tsk->stack_vm_area = find_vm_area(stack); + tsk->stack = stack; + return 0; } static void free_thread_stack(struct task_struct *tsk) @@ -282,16 +281,16 @@ static void free_thread_stack(struct task_struct *tsk) # else /* !CONFIG_VMAP_STACK */ -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) +static int alloc_thread_stack_node(struct task_struct *tsk, int node) { struct page *page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); if (likely(page)) { tsk->stack = kasan_reset_tag(page_address(page)); - return tsk->stack; + return 0; } - return NULL; + return -ENOMEM; } static void free_thread_stack(struct task_struct *tsk) @@ -305,14 +304,13 @@ static void free_thread_stack(struct task_struct *tsk) static struct kmem_cache *thread_stack_cache; -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, - int node) +static int alloc_thread_stack_node(struct task_struct *tsk, int node) { unsigned long *stack; stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); stack = kasan_reset_tag(stack); tsk->stack = stack; - return stack; + return stack ? 0 : -ENOMEM; } static void free_thread_stack(struct task_struct *tsk) @@ -332,13 +330,13 @@ void thread_stack_cache_init(void) # endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */ #else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */ -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) +static int alloc_thread_stack_node(struct task_struct *tsk, int node) { unsigned long *stack; stack = arch_alloc_thread_stack_node(tsk, node); tsk->stack = stack; - return stack; + return stack ? 
0 : -ENOMEM; } static void free_thread_stack(struct task_struct *tsk) @@ -895,8 +893,6 @@ void set_task_stack_end_magic(struct task_struct *tsk) static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; - unsigned long *stack; - struct vm_struct *stack_vm_area __maybe_unused; int err; if (node == NUMA_NO_NODE) @@ -909,24 +905,13 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (err) goto free_tsk; - stack = alloc_thread_stack_node(tsk, node); - if (!stack) + err = alloc_thread_stack_node(tsk, node); + if (err) goto free_tsk; if (memcg_charge_kernel_stack(tsk)) goto free_stack; - stack_vm_area = task_stack_vm_area(tsk); - - /* - * arch_dup_task_struct() clobbers the stack-related fields. Make - * sure they're properly initialized before using any stack-related - * functions again. - */ - tsk->stack = stack; -#ifdef CONFIG_VMAP_STACK - tsk->stack_vm_area = stack_vm_area; -#endif #ifdef CONFIG_THREAD_INFO_IN_TASK refcount_set(&tsk->stack_refcount, 1); #endif -- cgit v1.2.3 From f1c1a9ee00e4c53c9ccc03ec1aff4792948a25eb Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 17 Feb 2022 11:24:03 +0100 Subject: fork: Move memcg_charge_kernel_stack() into CONFIG_VMAP_STACK memcg_charge_kernel_stack() is only used in the CONFIG_VMAP_STACK case. Move memcg_charge_kernel_stack() into the CONFIG_VMAP_STACK block and invoke it from within alloc_thread_stack_node(). 
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20220217102406.3697941-6-bigeasy@linutronix.de --- kernel/fork.c | 69 +++++++++++++++++++++++++++++++---------------------------- 1 file changed, 36 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 875bd43f02ca..ac63e7fa8816 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -211,6 +211,32 @@ static int free_vm_stack_cache(unsigned int cpu) return 0; } +static int memcg_charge_kernel_stack(struct task_struct *tsk) +{ + struct vm_struct *vm = task_stack_vm_area(tsk); + int i; + int ret; + + BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); + BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); + + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { + ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); + if (ret) + goto err; + } + return 0; +err: + /* + * If memcg_kmem_charge_page() fails, page's memory cgroup pointer is + * NULL, and memcg_kmem_uncharge_page() in free_thread_stack() will + * ignore this page. + */ + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) + memcg_kmem_uncharge_page(vm->pages[i], 0); + return ret; +} + static int alloc_thread_stack_node(struct task_struct *tsk, int node) { void *stack; @@ -230,6 +256,11 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) /* Clear stale pointers from reused stack. 
*/ memset(s->addr, 0, THREAD_SIZE); + if (memcg_charge_kernel_stack(tsk)) { + vfree(s->addr); + return -ENOMEM; + } + tsk->stack_vm_area = s; tsk->stack = s->addr; return 0; @@ -247,6 +278,11 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) 0, node, __builtin_return_address(0)); if (!stack) return -ENOMEM; + + if (memcg_charge_kernel_stack(tsk)) { + vfree(stack); + return -ENOMEM; + } /* * We can't call find_vm_area() in interrupt context, and * free_thread_stack() can be called in interrupt context, @@ -418,36 +454,6 @@ static void account_kernel_stack(struct task_struct *tsk, int account) } } -static int memcg_charge_kernel_stack(struct task_struct *tsk) -{ -#ifdef CONFIG_VMAP_STACK - struct vm_struct *vm = task_stack_vm_area(tsk); - int ret; - - BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); - - if (vm) { - int i; - - BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); - - for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { - /* - * If memcg_kmem_charge_page() fails, page's - * memory cgroup pointer is NULL, and - * memcg_kmem_uncharge_page() in free_thread_stack() - * will ignore this page. 
- */ - ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, - 0); - if (ret) - return ret; - } - } -#endif - return 0; -} - static void release_task_stack(struct task_struct *tsk) { if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD)) @@ -909,9 +915,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (err) goto free_tsk; - if (memcg_charge_kernel_stack(tsk)) - goto free_stack; - #ifdef CONFIG_THREAD_INFO_IN_TASK refcount_set(&tsk->stack_refcount, 1); #endif -- cgit v1.2.3 From 1a03d3f13ffe5dd24142d6db629e72c11b704d99 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 17 Feb 2022 11:24:04 +0100 Subject: fork: Move task stack accounting to do_exit() There is no need to perform the stack accounting of the outgoing task in its final schedule() invocation which happens with preemption disabled. The task is leaving, the resources will be freed and the accounting can happen in do_exit() before the actual schedule invocation which frees the stack memory. Move the accounting of the stack memory from release_task_stack() to exit_task_stack_account() which then can be invoked from do_exit(). 
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20220217102406.3697941-7-bigeasy@linutronix.de --- include/linux/sched/task_stack.h | 2 ++ kernel/exit.c | 1 + kernel/fork.c | 35 +++++++++++++++++++++++------------ 3 files changed, 26 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h index d10150587d81..892562ebbd3a 100644 --- a/include/linux/sched/task_stack.h +++ b/include/linux/sched/task_stack.h @@ -79,6 +79,8 @@ static inline void *try_get_task_stack(struct task_struct *tsk) static inline void put_task_stack(struct task_struct *tsk) {} #endif +void exit_task_stack_account(struct task_struct *tsk); + #define task_stack_end_corrupted(task) \ (*(end_of_stack(task)) != STACK_END_MAGIC) diff --git a/kernel/exit.c b/kernel/exit.c index b00a25bb4ab9..c303cffe7fdb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -845,6 +845,7 @@ void __noreturn do_exit(long code) put_page(tsk->task_frag.page); validate_creds_for_do_exit(tsk); + exit_task_stack_account(tsk); check_stack_usage(); preempt_disable(); diff --git a/kernel/fork.c b/kernel/fork.c index ac63e7fa8816..25828127db8d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -211,9 +211,8 @@ static int free_vm_stack_cache(unsigned int cpu) return 0; } -static int memcg_charge_kernel_stack(struct task_struct *tsk) +static int memcg_charge_kernel_stack(struct vm_struct *vm) { - struct vm_struct *vm = task_stack_vm_area(tsk); int i; int ret; @@ -239,6 +238,7 @@ err: static int alloc_thread_stack_node(struct task_struct *tsk, int node) { + struct vm_struct *vm; void *stack; int i; @@ -256,7 +256,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) /* Clear stale pointers from reused stack. 
*/ memset(s->addr, 0, THREAD_SIZE); - if (memcg_charge_kernel_stack(tsk)) { + if (memcg_charge_kernel_stack(s)) { vfree(s->addr); return -ENOMEM; } @@ -279,7 +279,8 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) if (!stack) return -ENOMEM; - if (memcg_charge_kernel_stack(tsk)) { + vm = find_vm_area(stack); + if (memcg_charge_kernel_stack(vm)) { vfree(stack); return -ENOMEM; } @@ -288,19 +289,15 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) * free_thread_stack() can be called in interrupt context, * so cache the vm_struct. */ - tsk->stack_vm_area = find_vm_area(stack); + tsk->stack_vm_area = vm; tsk->stack = stack; return 0; } static void free_thread_stack(struct task_struct *tsk) { - struct vm_struct *vm = task_stack_vm_area(tsk); int i; - for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) - memcg_kmem_uncharge_page(vm->pages[i], 0); - for (i = 0; i < NR_CACHED_STACKS; i++) { if (this_cpu_cmpxchg(cached_stacks[i], NULL, tsk->stack_vm_area) != NULL) @@ -454,12 +451,25 @@ static void account_kernel_stack(struct task_struct *tsk, int account) } } +void exit_task_stack_account(struct task_struct *tsk) +{ + account_kernel_stack(tsk, -1); + + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + struct vm_struct *vm; + int i; + + vm = task_stack_vm_area(tsk); + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) + memcg_kmem_uncharge_page(vm->pages[i], 0); + } +} + static void release_task_stack(struct task_struct *tsk) { if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD)) return; /* Better to leak the stack than to free prematurely */ - account_kernel_stack(tsk, -1); free_thread_stack(tsk); } @@ -918,6 +928,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #ifdef CONFIG_THREAD_INFO_IN_TASK refcount_set(&tsk->stack_refcount, 1); #endif + account_kernel_stack(tsk, 1); err = scs_prepare(tsk, node); if (err) @@ -961,8 +972,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) 
tsk->wake_q.next = NULL; tsk->worker_private = NULL; - account_kernel_stack(tsk, 1); - kcov_task_init(tsk); kmap_local_fork(tsk); @@ -981,6 +990,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) return tsk; free_stack: + exit_task_stack_account(tsk); free_thread_stack(tsk); free_tsk: free_task_struct(tsk); @@ -2459,6 +2469,7 @@ bad_fork_cleanup_count: exit_creds(p); bad_fork_free: WRITE_ONCE(p->__state, TASK_DEAD); + exit_task_stack_account(p); put_task_stack(p); delayed_free_task(p); fork_out: -- cgit v1.2.3 From e540bf3162e822d7a1f07e69e3bb1b4f925ca368 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 17 Feb 2022 11:24:05 +0100 Subject: fork: Only cache the VMAP stack in finish_task_switch() The task stack could be deallocated later, but for fork()/exec() kind of workloads (say a shell script executing several commands) it is important that the stack is released in finish_task_switch() so that in VMAP_STACK case it can be cached and reused in the new task. For PREEMPT_RT it would be good if the wake-up in vfree_atomic() could be avoided in the scheduling path. Far worse are the other free_thread_stack() implementations which invoke __free_pages()/ kmem_cache_free() with disabled preemption. Cache the stack in free_thread_stack() in the VMAP_STACK case and RCU-delay the free path otherwise. Free the stack in the RCU callback. In the VMAP_STACK case this is another opportunity to fill the cache. 
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20220217102406.3697941-8-bigeasy@linutronix.de --- kernel/fork.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 63 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 25828127db8d..177bc64078cd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -193,6 +193,41 @@ static inline void free_task_struct(struct task_struct *tsk) #define NR_CACHED_STACKS 2 static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); +struct vm_stack { + struct rcu_head rcu; + struct vm_struct *stack_vm_area; +}; + +static bool try_release_thread_stack_to_cache(struct vm_struct *vm) +{ + unsigned int i; + + for (i = 0; i < NR_CACHED_STACKS; i++) { + if (this_cpu_cmpxchg(cached_stacks[i], NULL, vm) != NULL) + continue; + return true; + } + return false; +} + +static void thread_stack_free_rcu(struct rcu_head *rh) +{ + struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu); + + if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area)) + return; + + vfree(vm_stack); +} + +static void thread_stack_delayed_free(struct task_struct *tsk) +{ + struct vm_stack *vm_stack = tsk->stack; + + vm_stack->stack_vm_area = tsk->stack_vm_area; + call_rcu(&vm_stack->rcu, thread_stack_free_rcu); +} + static int free_vm_stack_cache(unsigned int cpu) { struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu); @@ -296,24 +331,27 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) static void free_thread_stack(struct task_struct *tsk) { - int i; + if (!try_release_thread_stack_to_cache(tsk->stack_vm_area)) + thread_stack_delayed_free(tsk); - for (i = 0; i < NR_CACHED_STACKS; i++) { - if (this_cpu_cmpxchg(cached_stacks[i], NULL, - tsk->stack_vm_area) != NULL) - continue; - - tsk->stack = NULL; - tsk->stack_vm_area = NULL; - return; - } - 
vfree_atomic(tsk->stack); tsk->stack = NULL; tsk->stack_vm_area = NULL; } # else /* !CONFIG_VMAP_STACK */ +static void thread_stack_free_rcu(struct rcu_head *rh) +{ + __free_pages(virt_to_page(rh), THREAD_SIZE_ORDER); +} + +static void thread_stack_delayed_free(struct task_struct *tsk) +{ + struct rcu_head *rh = tsk->stack; + + call_rcu(rh, thread_stack_free_rcu); +} + static int alloc_thread_stack_node(struct task_struct *tsk, int node) { struct page *page = alloc_pages_node(node, THREADINFO_GFP, @@ -328,7 +366,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) static void free_thread_stack(struct task_struct *tsk) { - __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER); + thread_stack_delayed_free(tsk); tsk->stack = NULL; } @@ -337,6 +375,18 @@ static void free_thread_stack(struct task_struct *tsk) static struct kmem_cache *thread_stack_cache; +static void thread_stack_free_rcu(struct rcu_head *rh) +{ + kmem_cache_free(thread_stack_cache, rh); +} + +static void thread_stack_delayed_free(struct task_struct *tsk) +{ + struct rcu_head *rh = tsk->stack; + + call_rcu(rh, thread_stack_free_rcu); +} + static int alloc_thread_stack_node(struct task_struct *tsk, int node) { unsigned long *stack; @@ -348,7 +398,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) static void free_thread_stack(struct task_struct *tsk) { - kmem_cache_free(thread_stack_cache, tsk->stack); + thread_stack_delayed_free(tsk); tsk->stack = NULL; } -- cgit v1.2.3 From 0ce055f85335e48bc571114d61a70ae217039362 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 17 Feb 2022 11:24:06 +0100 Subject: fork: Use IS_ENABLED() in account_kernel_stack() Not strictly needed but checking CONFIG_VMAP_STACK instead of task_stack_vm_area()'s result allows the compiler to remove the else path in the CONFIG_VMAP_STACK case where the pointer can't be NULL. Check for CONFIG_VMAP_STACK in order to use the proper path.
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20220217102406.3697941-9-bigeasy@linutronix.de --- kernel/fork.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 177bc64078cd..1279b57c4ad9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -485,16 +485,16 @@ void vm_area_free(struct vm_area_struct *vma) static void account_kernel_stack(struct task_struct *tsk, int account) { - void *stack = task_stack_page(tsk); - struct vm_struct *vm = task_stack_vm_area(tsk); - - if (vm) { + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + struct vm_struct *vm = task_stack_vm_area(tsk); int i; for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB, account * (PAGE_SIZE / 1024)); } else { + void *stack = task_stack_page(tsk); + /* All stack pages are in the same node. */ mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB, account * (THREAD_SIZE / 1024)); -- cgit v1.2.3 From 73bd66d9c834220579c881a3eb020fd8917075d8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 9 Feb 2022 09:28:28 +0100 Subject: scsi: block: Remove REQ_OP_WRITE_SAME support No more users of REQ_OP_WRITE_SAME or drivers implementing it are left, so remove the infrastructure. [mkp: fold in and tweak sysfs reporting fix] Link: https://lore.kernel.org/r/20220209082828.2629273-8-hch@lst.de Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. 
Petersen --- block/blk-core.c | 13 +------ block/blk-lib.c | 93 ----------------------------------------------- block/blk-merge.c | 40 -------------------- block/blk-settings.c | 16 -------- block/blk-sysfs.c | 3 +- block/blk-zoned.c | 1 - block/blk.h | 1 - block/bounce.c | 3 -- include/linux/bio.h | 3 -- include/linux/blk_types.h | 2 - include/linux/blkdev.h | 19 ---------- kernel/trace/blktrace.c | 1 - 12 files changed, 2 insertions(+), 193 deletions(-) (limited to 'kernel') diff --git a/block/blk-core.c b/block/blk-core.c index 97f8bc8d3a79..535dbaa78ffe 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -122,7 +122,6 @@ static const char *const blk_op_name[] = { REQ_OP_NAME(ZONE_CLOSE), REQ_OP_NAME(ZONE_FINISH), REQ_OP_NAME(ZONE_APPEND), - REQ_OP_NAME(WRITE_SAME), REQ_OP_NAME(WRITE_ZEROES), REQ_OP_NAME(DRV_IN), REQ_OP_NAME(DRV_OUT), @@ -734,10 +733,6 @@ noinline_for_stack bool submit_bio_checks(struct bio *bio) if (!blk_queue_secure_erase(q)) goto not_supported; break; - case REQ_OP_WRITE_SAME: - if (!q->limits.max_write_same_sectors) - goto not_supported; - break; case REQ_OP_ZONE_APPEND: status = blk_check_zone_append(q, bio); if (status != BLK_STS_OK) @@ -933,13 +928,7 @@ void submit_bio(struct bio *bio) * go through the normal accounting stuff before submission. 
*/ if (bio_has_data(bio)) { - unsigned int count; - - if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) - count = queue_logical_block_size( - bdev_get_queue(bio->bi_bdev)) >> 9; - else - count = bio_sectors(bio); + unsigned int count = bio_sectors(bio); if (op_is_write(bio_op(bio))) { count_vm_events(PGPGOUT, count); diff --git a/block/blk-lib.c b/block/blk-lib.c index 9f09beadcbe3..bf5254ccdb5f 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -151,99 +151,6 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL(blkdev_issue_discard); -/** - * __blkdev_issue_write_same - generate number of bios with same page - * @bdev: target blockdev - * @sector: start sector - * @nr_sects: number of sectors to write - * @gfp_mask: memory allocation flags (for bio_alloc) - * @page: page containing data to write - * @biop: pointer to anchor bio - * - * Description: - * Generate and issue number of bios(REQ_OP_WRITE_SAME) with same page. - */ -static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, struct page *page, - struct bio **biop) -{ - struct request_queue *q = bdev_get_queue(bdev); - unsigned int max_write_same_sectors; - struct bio *bio = *biop; - sector_t bs_mask; - - if (!q) - return -ENXIO; - - if (bdev_read_only(bdev)) - return -EPERM; - - bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; - if ((sector | nr_sects) & bs_mask) - return -EINVAL; - - if (!bdev_write_same(bdev)) - return -EOPNOTSUPP; - - /* Ensure that max_write_same_sectors doesn't overflow bi_size */ - max_write_same_sectors = bio_allowed_max_sectors(q); - - while (nr_sects) { - bio = blk_next_bio(bio, 1, gfp_mask); - bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, bdev); - bio->bi_vcnt = 1; - bio->bi_io_vec->bv_page = page; - bio->bi_io_vec->bv_offset = 0; - bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev); - bio_set_op_attrs(bio, REQ_OP_WRITE_SAME, 0); - - if (nr_sects > max_write_same_sectors) 
{ - bio->bi_iter.bi_size = max_write_same_sectors << 9; - nr_sects -= max_write_same_sectors; - sector += max_write_same_sectors; - } else { - bio->bi_iter.bi_size = nr_sects << 9; - nr_sects = 0; - } - cond_resched(); - } - - *biop = bio; - return 0; -} - -/** - * blkdev_issue_write_same - queue a write same operation - * @bdev: target blockdev - * @sector: start sector - * @nr_sects: number of sectors to write - * @gfp_mask: memory allocation flags (for bio_alloc) - * @page: page containing data - * - * Description: - * Issue a write same request for the sectors in question. - */ -int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, - struct page *page) -{ - struct bio *bio = NULL; - struct blk_plug plug; - int ret; - - blk_start_plug(&plug); - ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, page, - &bio); - if (ret == 0 && bio) { - ret = submit_bio_wait(bio); - bio_put(bio); - } - blk_finish_plug(&plug); - return ret; -} -EXPORT_SYMBOL(blkdev_issue_write_same); - static int __blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, unsigned flags) diff --git a/block/blk-merge.c b/block/blk-merge.c index 4de34a332c9f..87cee7e82ae1 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -152,22 +152,6 @@ static struct bio *blk_bio_write_zeroes_split(struct request_queue *q, return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs); } -static struct bio *blk_bio_write_same_split(struct request_queue *q, - struct bio *bio, - struct bio_set *bs, - unsigned *nsegs) -{ - *nsegs = 1; - - if (!q->limits.max_write_same_sectors) - return NULL; - - if (bio_sectors(bio) <= q->limits.max_write_same_sectors) - return NULL; - - return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs); -} - /* * Return the maximum number of sectors from the start of a bio that may be * submitted as a single request to a block 
device. If enough sectors remain, @@ -351,10 +335,6 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio, split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split, nr_segs); break; - case REQ_OP_WRITE_SAME: - split = blk_bio_write_same_split(q, *bio, &q->bio_split, - nr_segs); - break; default: split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs); break; @@ -416,8 +396,6 @@ unsigned int blk_recalc_rq_segments(struct request *rq) return 1; case REQ_OP_WRITE_ZEROES: return 0; - case REQ_OP_WRITE_SAME: - return 1; } rq_for_each_bvec(bv, rq, iter) @@ -555,8 +533,6 @@ int __blk_rq_map_sg(struct request_queue *q, struct request *rq, if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, last_sg); - else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME) - nsegs = __blk_bvec_map_sg(bio_iovec(rq->bio), sglist, last_sg); else if (rq->bio) nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg); @@ -757,13 +733,6 @@ static enum elv_merge blk_try_req_merge(struct request *req, return ELEVATOR_NO_MERGE; } -static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) -{ - if (bio_page(a) == bio_page(b) && bio_offset(a) == bio_offset(b)) - return true; - return false; -} - /* * For non-mq, this has to be called with the request spinlock acquired. * For mq with scheduling, the appropriate queue wide lock should be held. @@ -780,10 +749,6 @@ static struct request *attempt_merge(struct request_queue *q, if (rq_data_dir(req) != rq_data_dir(next)) return NULL; - if (req_op(req) == REQ_OP_WRITE_SAME && - !blk_write_same_mergeable(req->bio, next->bio)) - return NULL; - /* * Don't allow merge of different write hints, or for a hint with * non-hint IO. 
@@ -912,11 +877,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (!bio_crypt_rq_ctx_compatible(rq, bio)) return false; - /* must be using the same buffer */ - if (req_op(rq) == REQ_OP_WRITE_SAME && - !blk_write_same_mergeable(rq->bio, bio)) - return false; - /* * Don't allow merge of different write hints, or for a hint with * non-hint IO. diff --git a/block/blk-settings.c b/block/blk-settings.c index b880c70e22e4..b83df3d2eebc 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -42,7 +42,6 @@ void blk_set_default_limits(struct queue_limits *lim) lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; lim->max_dev_sectors = 0; lim->chunk_sectors = 0; - lim->max_write_same_sectors = 0; lim->max_write_zeroes_sectors = 0; lim->max_zone_append_sectors = 0; lim->max_discard_sectors = 0; @@ -79,7 +78,6 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_segment_size = UINT_MAX; lim->max_sectors = UINT_MAX; lim->max_dev_sectors = UINT_MAX; - lim->max_write_same_sectors = UINT_MAX; lim->max_write_zeroes_sectors = UINT_MAX; lim->max_zone_append_sectors = UINT_MAX; } @@ -178,18 +176,6 @@ void blk_queue_max_discard_sectors(struct request_queue *q, } EXPORT_SYMBOL(blk_queue_max_discard_sectors); -/** - * blk_queue_max_write_same_sectors - set max sectors for a single write same - * @q: the request queue for the device - * @max_write_same_sectors: maximum number of sectors to write per command - **/ -void blk_queue_max_write_same_sectors(struct request_queue *q, - unsigned int max_write_same_sectors) -{ - q->limits.max_write_same_sectors = max_write_same_sectors; -} -EXPORT_SYMBOL(blk_queue_max_write_same_sectors); - /** * blk_queue_max_write_zeroes_sectors - set max sectors for a single * write zeroes @@ -519,8 +505,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); 
t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); - t->max_write_same_sectors = min(t->max_write_same_sectors, - b->max_write_same_sectors); t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, b->max_write_zeroes_sectors); t->max_zone_append_sectors = min(t->max_zone_append_sectors, diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 9f32882ceb2f..5e81e65574a0 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -214,8 +214,7 @@ static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *pag static ssize_t queue_write_same_max_show(struct request_queue *q, char *page) { - return sprintf(page, "%llu\n", - (unsigned long long)q->limits.max_write_same_sectors << 9); + return queue_var_show(0, page); } static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 774ecc598bee..61fb6c52f4f3 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -65,7 +65,6 @@ bool blk_req_needs_zone_write_lock(struct request *rq) switch (req_op(rq)) { case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE_SAME: case REQ_OP_WRITE: return blk_rq_zone_is_seq(rq); default: diff --git a/block/blk.h b/block/blk.h index 8bd43b3ad33d..c6f8c0ca569f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -286,7 +286,6 @@ static inline bool blk_may_split(struct request_queue *q, struct bio *bio) case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE_SAME: return true; /* non-trivial splitting decisions */ default: break; diff --git a/block/bounce.c b/block/bounce.c index 7af1a72835b9..510360559276 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -181,9 +181,6 @@ static struct bio *bounce_clone_bio(struct bio *bio_src) case REQ_OP_SECURE_ERASE: case REQ_OP_WRITE_ZEROES: break; - case REQ_OP_WRITE_SAME: - bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; - break; default: bio_for_each_segment(bv, bio_src, iter) 
bio->bi_io_vec[bio->bi_vcnt++] = bv; diff --git a/include/linux/bio.h b/include/linux/bio.h index 117d7f248ac9..eb402afa370a 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -65,7 +65,6 @@ static inline bool bio_no_advance_iter(const struct bio *bio) { return bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE || - bio_op(bio) == REQ_OP_WRITE_SAME || bio_op(bio) == REQ_OP_WRITE_ZEROES; } @@ -186,8 +185,6 @@ static inline unsigned bio_segments(struct bio *bio) case REQ_OP_SECURE_ERASE: case REQ_OP_WRITE_ZEROES: return 0; - case REQ_OP_WRITE_SAME: - return 1; default: break; } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index fe065c394fff..077adf4b6e73 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -354,8 +354,6 @@ enum req_opf { REQ_OP_DISCARD = 3, /* securely erase sectors */ REQ_OP_SECURE_ERASE = 5, - /* write the same sector many times */ - REQ_OP_WRITE_SAME = 7, /* write the zero filled sector many times */ REQ_OP_WRITE_ZEROES = 9, /* Open a zone */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9c95df26fc26..b10470d5e986 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -97,7 +97,6 @@ struct queue_limits { unsigned int io_opt; unsigned int max_discard_sectors; unsigned int max_hw_discard_sectors; - unsigned int max_write_same_sectors; unsigned int max_write_zeroes_sectors; unsigned int max_zone_append_sectors; unsigned int discard_granularity; @@ -651,9 +650,6 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, return min(q->limits.max_discard_sectors, UINT_MAX >> SECTOR_SHIFT); - if (unlikely(op == REQ_OP_WRITE_SAME)) - return q->limits.max_write_same_sectors; - if (unlikely(op == REQ_OP_WRITE_ZEROES)) return q->limits.max_write_zeroes_sectors; @@ -696,8 +692,6 @@ extern void blk_queue_max_discard_segments(struct request_queue *, extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); extern 
void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors); -extern void blk_queue_max_write_same_sectors(struct request_queue *q, - unsigned int max_write_same_sectors); extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q, unsigned int max_write_same_sectors); extern void blk_queue_logical_block_size(struct request_queue *, unsigned int); @@ -842,9 +836,6 @@ static inline long nr_blockdev_pages(void) extern void blk_io_schedule(void); -extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, struct page *page); - #define BLKDEV_DISCARD_SECURE (1 << 0) /* issue a secure erase */ extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, @@ -1071,16 +1062,6 @@ static inline int bdev_discard_alignment(struct block_device *bdev) return q->limits.discard_alignment; } -static inline unsigned int bdev_write_same(struct block_device *bdev) -{ - struct request_queue *q = bdev_get_queue(bdev); - - if (q) - return q->limits.max_write_same_sectors; - - return 0; -} - static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index af68a67179b4..19514edc44f7 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1892,7 +1892,6 @@ void blk_fill_rwbs(char *rwbs, unsigned int op) switch (op & REQ_OP_MASK) { case REQ_OP_WRITE: - case REQ_OP_WRITE_SAME: rwbs[i++] = 'W'; break; case REQ_OP_DISCARD: -- cgit v1.2.3 From 95458477f5b2dc436e3aa6aa25c0f84bb83e6195 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 22 Feb 2022 14:50:43 +0100 Subject: sched/headers: Add header guard to kernel/sched/sched.h Use the canonical header guard naming of the full path to the header. 
Signed-off-by: Ingo Molnar Reviewed-by: Peter Zijlstra --- kernel/sched/sched.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3da5718cd641..eab4a18f71c2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2,6 +2,9 @@ /* * Scheduler internal types and methods: */ +#ifndef _KERNEL_SCHED_SCHED_H +#define _KERNEL_SCHED_SCHED_H + #include #include @@ -3137,3 +3140,4 @@ extern int sched_dynamic_mode(const char *str); extern void sched_dynamic_update(int mode); #endif +#endif /* _KERNEL_SCHED_SCHED_H */ -- cgit v1.2.3 From d90a2f160a1cd9a1745896c381afdf8d2812fd6b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 20 Nov 2021 10:39:20 +0100 Subject: sched/headers: Add header guard to kernel/sched/stats.h and kernel/sched/autogroup.h Protect against multiple inclusion. Also include "sched.h" in "stats.h", as it relies on it. Signed-off-by: Ingo Molnar Reviewed-by: Peter Zijlstra --- kernel/sched/autogroup.h | 5 +++++ kernel/sched/stats.h | 6 ++++++ 2 files changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index 90fcbfdd70c3..90d69f2c5eaf 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _KERNEL_SCHED_AUTOGROUP_H +#define _KERNEL_SCHED_AUTOGROUP_H + #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup { @@ -59,3 +62,5 @@ static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) } #endif /* CONFIG_SCHED_AUTOGROUP */ + +#endif /* _KERNEL_SCHED_AUTOGROUP_H */ diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 3a3c826dd83a..edc0d13fc61b 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -1,7 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _KERNEL_STATS_H +#define _KERNEL_STATS_H #ifdef CONFIG_SCHEDSTATS +#include "sched.h" + extern struct static_key_false sched_schedstats; /* @@ -298,3 +302,5
@@ sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *n # define sched_info_dequeue(rq, t) do { } while (0) # define sched_info_switch(rq, t, next) do { } while (0) #endif /* CONFIG_SCHED_INFO */ + +#endif /* _KERNEL_STATS_H */ -- cgit v1.2.3 From fa28abed7a84f5c9902fe28b3bb58f7900583e83 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 21 Jun 2021 08:41:43 +0200 Subject: sched/headers: sched/clock: Mark all functions 'notrace', remove CC_FLAGS_FTRACE build asymmetry Mark all non-init functions in kernel/sched/clock.c as 'notrace', instead of turning them all off via CC_FLAGS_FTRACE. This is going to allow the treatment of this file as any other scheduler file, and it can be #include-ed in compound compilation units as well. Signed-off-by: Ingo Molnar Reviewed-by: Peter Zijlstra --- kernel/sched/Makefile | 3 --- kernel/sched/clock.c | 42 +++++++++++++++++++++--------------------- 2 files changed, 21 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index c83b37af155b..c0c52026ad0d 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -1,7 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE) -endif # The compilers are complaining about unused variables inside an if(0) scope # block. This is daft, shut them up. diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c2b2859ddd82..540d0e50e31c 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -61,7 +61,7 @@ * This is default implementation. * Architectures and sub-architectures can override this.
*/ -unsigned long long __weak sched_clock(void) +notrace unsigned long long __weak sched_clock(void) { return (unsigned long long)(jiffies - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ); @@ -95,28 +95,28 @@ struct sched_clock_data { static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); -static inline struct sched_clock_data *this_scd(void) +notrace static inline struct sched_clock_data *this_scd(void) { return this_cpu_ptr(&sched_clock_data); } -static inline struct sched_clock_data *cpu_sdc(int cpu) +notrace static inline struct sched_clock_data *cpu_sdc(int cpu) { return &per_cpu(sched_clock_data, cpu); } -int sched_clock_stable(void) +notrace int sched_clock_stable(void) { return static_branch_likely(&__sched_clock_stable); } -static void __scd_stamp(struct sched_clock_data *scd) +notrace static void __scd_stamp(struct sched_clock_data *scd) { scd->tick_gtod = ktime_get_ns(); scd->tick_raw = sched_clock(); } -static void __set_sched_clock_stable(void) +notrace static void __set_sched_clock_stable(void) { struct sched_clock_data *scd; @@ -151,7 +151,7 @@ static void __set_sched_clock_stable(void) * The only way to fully avoid random clock jumps is to boot with: * "tsc=unstable". 
*/ -static void __sched_clock_work(struct work_struct *work) +notrace static void __sched_clock_work(struct work_struct *work) { struct sched_clock_data *scd; int cpu; @@ -177,7 +177,7 @@ static void __sched_clock_work(struct work_struct *work) static DECLARE_WORK(sched_clock_work, __sched_clock_work); -static void __clear_sched_clock_stable(void) +notrace static void __clear_sched_clock_stable(void) { if (!sched_clock_stable()) return; @@ -186,7 +186,7 @@ static void __clear_sched_clock_stable(void) schedule_work(&sched_clock_work); } -void clear_sched_clock_stable(void) +notrace void clear_sched_clock_stable(void) { __sched_clock_stable_early = 0; @@ -196,7 +196,7 @@ void clear_sched_clock_stable(void) __clear_sched_clock_stable(); } -static void __sched_clock_gtod_offset(void) +notrace static void __sched_clock_gtod_offset(void) { struct sched_clock_data *scd = this_scd(); @@ -246,12 +246,12 @@ late_initcall(sched_clock_init_late); * min, max except they take wrapping into account */ -static inline u64 wrap_min(u64 x, u64 y) +notrace static inline u64 wrap_min(u64 x, u64 y) { return (s64)(x - y) < 0 ? x : y; } -static inline u64 wrap_max(u64 x, u64 y) +notrace static inline u64 wrap_max(u64 x, u64 y) { return (s64)(x - y) > 0 ? x : y; } @@ -262,7 +262,7 @@ static inline u64 wrap_max(u64 x, u64 y) * - filter out backward motion * - use the GTOD tick value to create a window to filter crazy TSC values */ -static u64 sched_clock_local(struct sched_clock_data *scd) +notrace static u64 sched_clock_local(struct sched_clock_data *scd) { u64 now, clock, old_clock, min_clock, max_clock, gtod; s64 delta; @@ -295,7 +295,7 @@ again: return clock; } -static u64 sched_clock_remote(struct sched_clock_data *scd) +notrace static u64 sched_clock_remote(struct sched_clock_data *scd) { struct sched_clock_data *my_scd = this_scd(); u64 this_clock, remote_clock; @@ -362,7 +362,7 @@ again: * * See cpu_clock(). 
*/ -u64 sched_clock_cpu(int cpu) +notrace u64 sched_clock_cpu(int cpu) { struct sched_clock_data *scd; u64 clock; @@ -386,7 +386,7 @@ u64 sched_clock_cpu(int cpu) } EXPORT_SYMBOL_GPL(sched_clock_cpu); -void sched_clock_tick(void) +notrace void sched_clock_tick(void) { struct sched_clock_data *scd; @@ -403,7 +403,7 @@ void sched_clock_tick(void) sched_clock_local(scd); } -void sched_clock_tick_stable(void) +notrace void sched_clock_tick_stable(void) { if (!sched_clock_stable()) return; @@ -423,7 +423,7 @@ void sched_clock_tick_stable(void) /* * We are going deep-idle (irqs are disabled): */ -void sched_clock_idle_sleep_event(void) +notrace void sched_clock_idle_sleep_event(void) { sched_clock_cpu(smp_processor_id()); } @@ -432,7 +432,7 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); /* * We just idled; resync with ktime. */ -void sched_clock_idle_wakeup_event(void) +notrace void sched_clock_idle_wakeup_event(void) { unsigned long flags; @@ -458,7 +458,7 @@ void __init sched_clock_init(void) local_irq_enable(); } -u64 sched_clock_cpu(int cpu) +notrace u64 sched_clock_cpu(int cpu) { if (!static_branch_likely(&sched_clock_running)) return 0; @@ -476,7 +476,7 @@ u64 sched_clock_cpu(int cpu) * On bare metal this function should return the same as local_clock. * Architectures and sub-architectures can override this. */ -u64 __weak running_clock(void) +notrace u64 __weak running_clock(void) { return local_clock(); } -- cgit v1.2.3 From 81de6572fe980a98a1c6c5eacdfd2a9137894f32 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 21 Jun 2021 08:50:48 +0200 Subject: sched/headers: Fix comment typo in kernel/sched/cpudeadline.c File name changed. 
Signed-off-by: Ingo Molnar Reviewed-by: Peter Zijlstra --- kernel/sched/cpudeadline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index ceb03d76c0cc..0e196f0de492 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * kernel/sched/cpudl.c + * kernel/sched/cpudeadline.c * * Global CPU deadline management * -- cgit v1.2.3 From 801c141955108fb7cf1244dda76e6de8b16fd3ae Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 22 Feb 2022 13:23:24 +0100 Subject: sched/headers: Introduce kernel/sched/build_utility.c and build multiple .c files there Collect all utility functionality source code files into a single kernel/sched/build_utility.c file, via #include-ing the .c files: kernel/sched/clock.c kernel/sched/completion.c kernel/sched/loadavg.c kernel/sched/swait.c kernel/sched/wait_bit.c kernel/sched/wait.c CONFIG_CPU_FREQ: kernel/sched/cpufreq.c CONFIG_CPU_FREQ_GOV_SCHEDUTIL: kernel/sched/cpufreq_schedutil.c CONFIG_CGROUP_CPUACCT: kernel/sched/cpuacct.c CONFIG_SCHED_DEBUG: kernel/sched/debug.c CONFIG_SCHEDSTATS: kernel/sched/stats.c CONFIG_SMP: kernel/sched/cpupri.c kernel/sched/stop_task.c kernel/sched/topology.c CONFIG_SCHED_CORE: kernel/sched/core_sched.c CONFIG_PSI: kernel/sched/psi.c CONFIG_MEMBARRIER: kernel/sched/membarrier.c CONFIG_CPU_ISOLATION: kernel/sched/isolation.c CONFIG_SCHED_AUTOGROUP: kernel/sched/autogroup.c The goal is to amortize the 60+ KLOC header bloat from over a dozen build units into a single build unit. The build time of build_utility.c also roughly matches the build time of core.c and fair.c - allowing better load-balancing of scheduler-only rebuilds. 
Signed-off-by: Ingo Molnar Reviewed-by: Peter Zijlstra --- kernel/sched/Makefile | 25 ++++++-------- kernel/sched/autogroup.c | 3 +- kernel/sched/build_utility.c | 70 ++++++++++++++++++++++++++++++++++++++++ kernel/sched/clock.c | 2 -- kernel/sched/completion.c | 2 +- kernel/sched/core_sched.c | 3 -- kernel/sched/cpuacct.c | 3 +- kernel/sched/cpufreq.c | 3 -- kernel/sched/cpufreq_schedutil.c | 7 ---- kernel/sched/cpupri.c | 1 - kernel/sched/debug.c | 1 - kernel/sched/isolation.c | 1 - kernel/sched/loadavg.c | 1 - kernel/sched/membarrier.c | 1 - kernel/sched/psi.c | 15 --------- kernel/sched/sched.h | 57 ++++++++++++++++++++++++++++++-- kernel/sched/stats.c | 1 - kernel/sched/stop_task.c | 1 - kernel/sched/swait.c | 1 - kernel/sched/topology.c | 1 - kernel/sched/wait.c | 1 - kernel/sched/wait_bit.c | 2 +- 22 files changed, 139 insertions(+), 63 deletions(-) create mode 100644 kernel/sched/build_utility.c (limited to 'kernel') diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index c0c52026ad0d..4a4785cb3cd2 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -22,18 +22,13 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -obj-y += core.o loadavg.o clock.o cputime.o -obj-y += idle.o fair.o rt.o deadline.o -obj-y += wait.o wait_bit.o swait.o completion.o - -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o -obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o -obj-$(CONFIG_SCHEDSTATS) += stats.o -obj-$(CONFIG_SCHED_DEBUG) += debug.o -obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o -obj-$(CONFIG_CPU_FREQ) += cpufreq.o -obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o -obj-$(CONFIG_MEMBARRIER) += membarrier.o -obj-$(CONFIG_CPU_ISOLATION) += isolation.o -obj-$(CONFIG_PSI) += psi.o -obj-$(CONFIG_SCHED_CORE) += core_sched.o +# +# Build efficiency: +# +# These compilation units have roughly the same size and complexity - so their +# build parallelizes well and finishes 
roughly at once: +# +obj-y += core.o +obj-y += fair.o +obj-y += build_utility.o +obj-y += idle.o rt.o deadline.o cputime.o cpudeadline.o pelt.o diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 31dd2593145e..16092b49ff6a 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -1,9 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 + /* * Auto-group scheduling implementation: */ -#include -#include "sched.h" unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; static struct autogroup autogroup_default; diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c new file mode 100644 index 000000000000..31216ce0b4b3 --- /dev/null +++ b/kernel/sched/build_utility.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * These are various utility functions of the scheduler, + * built in a single compilation unit for build efficiency reasons. + * + * ( Incidentally, the size of the compilation unit is roughly + * comparable to core.c, fair.c, smp.c and policy.c, the other + * big compilation units. This helps balance build time, while + * coalescing source files to amortize header inclusion + * cost. 
) + */ + +#include "sched.h" +#include "sched-pelt.h" + +#include + +#include "clock.c" + +#ifdef CONFIG_CGROUP_CPUACCT +# include "cpuacct.c" +#endif + +#ifdef CONFIG_CPU_FREQ +# include "cpufreq.c" +#endif + +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL +# include "cpufreq_schedutil.c" +#endif + +#ifdef CONFIG_SCHED_DEBUG +# include "debug.c" +#endif + +#ifdef CONFIG_SCHEDSTATS +# include "stats.c" +#endif + +#include "loadavg.c" +#include "completion.c" +#include "swait.c" +#include "wait_bit.c" +#include "wait.c" + +#ifdef CONFIG_SMP +# include "cpupri.c" +# include "stop_task.c" +# include "topology.c" +#endif + +#ifdef CONFIG_SCHED_CORE +# include "core_sched.c" +#endif + +#ifdef CONFIG_PSI +# include "psi.c" +#endif + +#ifdef CONFIG_MEMBARRIER +# include "membarrier.c" +#endif + +#ifdef CONFIG_CPU_ISOLATION +# include "isolation.c" +#endif + +#ifdef CONFIG_SCHED_AUTOGROUP +# include "autogroup.c" +#endif diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 540d0e50e31c..d9272d9061a3 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -53,8 +53,6 @@ * that is otherwise invisible (TSC gets stopped). * */ -#include "sched.h" -#include /* * Scheduler clock - returns current time in nanosec units. diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index a778554f9dad..35f15c26ed54 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 + /* * Generic wait-for-completion handler; * @@ -11,7 +12,6 @@ * typically be used for exclusion which gives rise to priority inversion. * Waiting for completion is a typically sync point, but not an exclusion point. 
*/ -#include "sched.h" /** * complete: - signals a single thread waiting on this completion diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index c8746a9a7ada..38a2cec21014 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -1,8 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -#include -#include "sched.h" - /* * A simple wrapper around refcount. An allocated sched_core_cookie's * address is used to compute the cookie of the task. diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 3d06c5e4220d..2c505cf800aa 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -1,12 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 + /* * CPU accounting code for task groups. * * Based on the work by Paul Menage (menage@google.com) and Balbir Singh * (balbir@in.ibm.com). */ -#include -#include "sched.h" /* Time spent by the tasks of the CPU accounting group executing in ... */ enum cpuacct_stat_index { diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 7c2fe50fd76d..5252fb191fae 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -5,9 +5,6 @@ * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki */ -#include - -#include "sched.h" DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 6d65ab6e484e..f68885d049f2 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -6,13 +6,6 @@ * Author: Rafael J. 
Wysocki */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include "sched.h" - -#include -#include - #define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) struct sugov_tunables { diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index d583f2aa744e..fa9ce9d83683 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -22,7 +22,6 @@ * worst case complexity of O(min(101, nr_domcpus)), though the scenario that * yields the worst case search is fairly contrived. */ -#include "sched.h" /* * p->rt_priority p->prio newpri cpupri diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 102d6f70e84d..bb3d63bdf4ae 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -6,7 +6,6 @@ * * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar */ -#include "sched.h" /* * This allows printing both to /proc/sched_debug and diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index b4d10815c45a..373d42c707bc 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -7,7 +7,6 @@ * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker * */ -#include "sched.h" enum hk_flags { HK_FLAG_TIMER = BIT(HK_TYPE_TIMER), diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 954b229868d9..52c8f8226b0d 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -6,7 +6,6 @@ * figure. Its a silly number but people think its important. We go through * great pains to make it work on big machines and tickless kernels. 
*/ -#include "sched.h" /* * Global load-average calculations diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 3d2825408e3a..0c5be7ebb1dc 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -4,7 +4,6 @@ * * membarrier system call */ -#include "sched.h" /* * For documentation purposes, here are some membarrier ordering diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 8fb08a12f602..a4fa3aadfcba 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -137,21 +137,6 @@ * sampling of the aggregate task states would be. */ -#include "../workqueue_internal.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "sched.h" - static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index eab4a18f71c2..79c7a8a2be65 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -6,7 +6,25 @@ #define _KERNEL_SCHED_SCHED_H #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include #include #include #include @@ -24,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -31,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -69,8 +89,7 @@ #include #include #include - -#include +#include #ifdef CONFIG_PARAVIRT # include @@ -87,6 +106,40 @@ # define SCHED_WARN_ON(x) ({ (void)(x), 0; }) #endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "../workqueue_internal.h" + struct rq; struct cpuidle_state; diff --git a/kernel/sched/stats.c 
b/kernel/sched/stats.c index 07dde2928c79..857f837f52cb 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -2,7 +2,6 @@ /* * /proc/schedstat implementation */ -#include "sched.h" void __update_stats_wait_start(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 0b165a25f22f..d04073a93eb4 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -7,7 +7,6 @@ * * See kernel/stop_machine.c */ -#include "sched.h" #ifdef CONFIG_SMP static int diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index e1c655f928c7..76b9b796e695 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -2,7 +2,6 @@ /* * (simple wait queues ) implementation: */ -#include "sched.h" void __init_swait_queue_head(struct swait_queue_head *q, const char *name, struct lock_class_key *key) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 32841c6741d1..e8af72fd70bd 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2,7 +2,6 @@ /* * Scheduler topology setup/handling methods */ -#include "sched.h" DEFINE_MUTEX(sched_domains_mutex); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index eca38107b32f..9860bb9a847c 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -4,7 +4,6 @@ * * (C) 2004 Nadia Yvette Chambers, Oracle */ -#include "sched.h" void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) { diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 02ce292b9bc0..d4788f810b55 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only + /* * The implementation of the wait_bit*() and related waiting APIs: */ -#include "sched.h" #define WAIT_TABLE_BITS 8 #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) -- cgit v1.2.3 From f96eca432015ddc1b621632488ebc345bca06791 Mon Sep 17 00:00:00 2001 From: Ingo 
Molnar Date: Tue, 22 Feb 2022 13:46:03 +0100 Subject: sched/headers: Introduce kernel/sched/build_policy.c and build multiple .c files there Similarly to kernel/sched/build_utility.c, collect all 'scheduling policy' related source code files into kernel/sched/build_policy.c: kernel/sched/idle.c kernel/sched/rt.c kernel/sched/cpudeadline.c kernel/sched/pelt.c kernel/sched/cputime.c kernel/sched/deadline.c With the exception of fair.c, which we continue to build as a separate file for build efficiency and parallelism reasons. Signed-off-by: Ingo Molnar Reviewed-by: Peter Zijlstra --- kernel/sched/Makefile | 2 +- kernel/sched/build_policy.c | 29 +++++++++++++++++++++++++++++ kernel/sched/cpudeadline.c | 1 - kernel/sched/cputime.c | 1 - kernel/sched/deadline.c | 2 -- kernel/sched/idle.c | 3 --- kernel/sched/pelt.c | 4 ---- kernel/sched/rt.c | 3 --- kernel/sched/sched.h | 8 ++++++++ 9 files changed, 38 insertions(+), 15 deletions(-) create mode 100644 kernel/sched/build_policy.c (limited to 'kernel') diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 4a4785cb3cd2..976092b7bd45 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -30,5 +30,5 @@ endif # obj-y += core.o obj-y += fair.o +obj-y += build_policy.o obj-y += build_utility.o -obj-y += idle.o rt.o deadline.o cputime.o cpudeadline.o pelt.o diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c new file mode 100644 index 000000000000..6bb384ddcfab --- /dev/null +++ b/kernel/sched/build_policy.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * These are the scheduling policy related scheduler files, built + * in a single compilation unit for build efficiency reasons. + * + * ( Incidentally, the size of the compilation unit is roughly + * comparable to core.c and fair.c, the other two big + * compilation units. This helps balance build time, while + * coalescing source files to amortize header inclusion + * cost. 
) + * + * core.c and fair.c are built separately. + */ + +#include "sched.h" +#include "pelt.h" + +#include "idle.c" + +#include "rt.c" + +#ifdef CONFIG_SMP +# include "cpudeadline.c" +# include "pelt.c" +#endif + +#include "cputime.c" +#include "deadline.c" + diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 0e196f0de492..02d970a879ed 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -6,7 +6,6 @@ * * Author: Juri Lelli */ -#include "sched.h" static inline int parent(int i) { diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index b7ec42732b28..78a233d43757 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -2,7 +2,6 @@ /* * Simple CPU accounting cgroup controller */ -#include "sched.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d2c072b0ef01..fca2d7de4d3d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -15,8 +15,6 @@ * Michael Trimarchi , * Fabio Checconi */ -#include "sched.h" -#include "pelt.h" struct dl_bandwidth def_dl_bandwidth; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index d17b0a5ce6ac..8f8b5020e76a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -6,9 +6,6 @@ * (NOTE: these are not related to SCHED_IDLE batch scheduled * tasks which are handled in sched/fair.c ) */ -#include "sched.h" - -#include /* Linker adds these: start and end of __cpuidle functions */ extern char __cpuidle_text_start[], __cpuidle_text_end[]; diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index a554e3bbab2b..0f310768260c 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -24,10 +24,6 @@ * Author: Vincent Guittot */ -#include -#include "sched.h" -#include "pelt.h" - /* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7b4f4fbbb404..ff4c044aed12 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -3,9 
+3,6 @@ * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR * policies) */ -#include "sched.h" - -#include "pelt.h" int sched_rr_timeslice = RR_TIMESLICE; int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 79c7a8a2be65..f7d10b98e911 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -113,8 +114,11 @@ #include #include #include +#include +#include #include #include +#include #include #include #include @@ -126,12 +130,16 @@ #include #include #include +#include #include #include +#include +#include #include #include #include #include +#include #include #include #include -- cgit v1.2.3 From b9e9c6ca6e54b5d58a57663f76c5cb33c12ea98f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 13 Feb 2022 08:19:43 +0100 Subject: sched/headers: Standardize kernel/sched/sched.h header dependencies kernel/sched/sched.h is a weird mix of ad-hoc headers included in the middle of the header. Two of them rely on being included in the middle of kernel/sched/sched.h, due to definitions they require: - "stats.h" needs the rq definitions. - "autogroup.h" needs the task_group definition. Move the inclusion of these two files out of kernel/sched/sched.h, and include them in all files that require them. Move the rest of the header dependencies to the top of the kernel/sched/sched.h file.
Signed-off-by: Ingo Molnar Reviewed-by: Peter Zijlstra --- kernel/sched/build_policy.c | 3 +++ kernel/sched/build_utility.c | 2 ++ kernel/sched/core.c | 2 ++ kernel/sched/fair.c | 2 ++ kernel/sched/sched.h | 20 +++++++++++--------- kernel/sched/stats.h | 4 ++-- 6 files changed, 22 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index 6bb384ddcfab..9a169b2f97e6 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -13,6 +13,9 @@ */ #include "sched.h" + +#include "autogroup.h" +#include "stats.h" #include "pelt.h" #include "idle.c" diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index 31216ce0b4b3..8b2b199983bf 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -12,6 +12,8 @@ #include "sched.h" #include "sched-pelt.h" +#include "stats.h" +#include "autogroup.h" #include diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ef946123e9af..22de53710ee9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11,6 +11,8 @@ #undef CREATE_TRACE_POINTS #include "sched.h" +#include "stats.h" +#include "autogroup.h" #include #include diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 16874e112fe6..7391c9287503 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -21,6 +21,8 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ #include "sched.h" +#include "stats.h" +#include "autogroup.h" /* * Targeted preemption latency for CPU-bound tasks: diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f7d10b98e911..f255ec2afeca 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -96,11 +96,20 @@ # include #endif +#include + +#ifdef CONFIG_CGROUP_SCHED +#include +#include +#endif + +#ifdef CONFIG_SCHED_DEBUG +# include +#endif + #include "cpupri.h" #include "cpudeadline.h" -#include - #ifdef CONFIG_SCHED_DEBUG # define SCHED_WARN_ON(x) WARN_ONCE(x, #x) #else @@ -417,9 +426,6 @@ 
extern bool dl_cpu_busy(unsigned int cpu); #ifdef CONFIG_CGROUP_SCHED -#include -#include - struct cfs_rq; struct rt_rq; @@ -1919,9 +1925,6 @@ extern void flush_smp_call_function_from_idle(void); static inline void flush_smp_call_function_from_idle(void) { } #endif -#include "stats.h" -#include "autogroup.h" - #if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) extern void __sched_core_account_forceidle(struct rq *rq); @@ -2016,7 +2019,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ #ifdef CONFIG_SCHED_DEBUG -# include # define const_debug __read_mostly #else # define const_debug const diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index edc0d13fc61b..a0df79e6038c 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -2,10 +2,10 @@ #ifndef _KERNEL_STATS_H #define _KERNEL_STATS_H -#ifdef CONFIG_SCHEDSTATS - #include "sched.h" +#ifdef CONFIG_SCHEDSTATS + extern struct static_key_false sched_schedstats; /* -- cgit v1.2.3 From e66f6481a8c748ce2d4b37a3d5e10c4dd0d65e80 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 23 Feb 2022 08:17:15 +0100 Subject: sched/headers: Reorganize, clean up and optimize kernel/sched/core.c dependencies Use all generic headers from kernel/sched/sched.h that are required for it to build. Sort the sections alphabetically. 
Signed-off-by: Ingo Molnar Reviewed-by: Peter Zijlstra --- kernel/sched/core.c | 81 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 70 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 22de53710ee9..5eaa4211d93c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6,7 +6,73 @@ * * Copyright (C) 1991-2002 Linus Torvalds */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_PREEMPT_DYNAMIC +# include +#endif + +#include + +#include +#include + #define CREATE_TRACE_POINTS +#include #include #undef CREATE_TRACE_POINTS @@ -14,22 +80,15 @@ #include "stats.h" #include "autogroup.h" -#include -#include -#include -#include -#include - -#include -#include +#include "autogroup.h" +#include "pelt.h" +#include "smp.h" +#include "stats.h" #include "../workqueue_internal.h" #include "../../fs/io-wq.h" #include "../smpboot.h" -#include "pelt.h" -#include "smp.h" - /* * Export tracepoints that act as a bare tracehook (ie: have no trace event * associated with them) to allow external modules to probe them. -- cgit v1.2.3 From c4ad6fcb67c42d65481c85733c8009c8afdfdf4e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 22 Feb 2022 13:50:01 +0100 Subject: sched/headers: Reorganize, clean up and optimize kernel/sched/fair.c dependencies Use all generic headers from kernel/sched/sched.h that are required for it to build. Sort the sections alphabetically. 
Signed-off-by: Ingo Molnar Reviewed-by: Peter Zijlstra --- kernel/sched/fair.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7391c9287503..89d21fda106c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,6 +20,35 @@ * Adaptive scheduling granularity, math enhancements by Peter Zijlstra * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + #include "sched.h" #include "stats.h" #include "autogroup.h" -- cgit v1.2.3 From 0dda4eeb484962ad574fcea77a78a4fbd7bc06ba Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 21 Jun 2021 10:33:56 +0200 Subject: sched/headers: Reorganize, clean up and optimize kernel/sched/build_policy.c dependencies Use all generic headers from kernel/sched/sched.h that are required for it to build. Sort the sections alphabetically. Signed-off-by: Ingo Molnar Reviewed-by: Peter Zijlstra --- kernel/sched/build_policy.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index 9a169b2f97e6..a2e4023771a0 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -12,12 +12,36 @@ * core.c and fair.c are built separately. 
*/ +/* Headers: */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_PARAVIRT +# include +#endif + +#include + #include "sched.h" #include "autogroup.h" #include "stats.h" #include "pelt.h" +/* Source code modules: */ + #include "idle.c" #include "rt.c" -- cgit v1.2.3 From e81daa7b6489e9810fa699c5104c6fd500e64fb8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 19 Jul 2021 12:43:51 +0200 Subject: sched/headers: Reorganize, clean up and optimize kernel/sched/build_utility.c dependencies Use all generic headers from kernel/sched/sched.h that are required for it to build. Sort the sections alphabetically. Signed-off-by: Ingo Molnar Reviewed-by: Peter Zijlstra --- kernel/sched/build_utility.c | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index 8b2b199983bf..bc2f4d648209 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -9,14 +9,53 @@ * coalescing source files to amortize header inclusion * cost. 
) */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_PARAVIRT +# include +#endif + +#include +#include #include "sched.h" #include "sched-pelt.h" #include "stats.h" #include "autogroup.h" -#include - #include "clock.c" #ifdef CONFIG_CGROUP_CPUACCT -- cgit v1.2.3 From 4ff8f2ca6ccd9e0cc5665d09f86d631b3ae3a14c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 22 Feb 2022 14:51:58 +0100 Subject: sched/headers: Reorganize, clean up and optimize kernel/sched/sched.h dependencies Remove all headers, except the ones required to make this header build standalone. Also include stats.h in sched.h explicitly - dependencies already require this. Summary of the build speedup gained through the last ~15 scheduler build & header dependency patches: Cumulative scheduler (kernel/sched/) build time speedup on a Linux distribution's config, which enables all scheduler features, compared to the vanilla kernel: _____________________________________________________________________________ | | Vanilla kernel (v5.13-rc7): |_____________________________________________________________________________ | | Performance counter stats for 'make -j96 kernel/sched/' (3 runs): | | 126,975,564,374 instructions # 1.45 insn per cycle ( +- 0.00% ) | 87,637,847,671 cycles # 3.959 GHz ( +- 0.30% ) | 22,136.96 msec cpu-clock # 7.499 CPUs utilized ( +- 0.29% ) | | 2.9520 +- 0.0169 seconds time elapsed ( +- 0.57% ) |_____________________________________________________________________________ | | Patched kernel: |_____________________________________________________________________________ | | Performance counter stats for 'make -j96 kernel/sched/' (3 runs): | | 50,420,496,914 instructions # 1.47 insn per 
cycle ( +- 0.00% ) | 34,234,322,038 cycles # 3.946 GHz ( +- 0.31% ) | 8,675.81 msec cpu-clock # 3.053 CPUs utilized ( +- 0.45% ) | | 2.8420 +- 0.0181 seconds time elapsed ( +- 0.64% ) |_____________________________________________________________________________ Summary: - CPU time used to build the scheduler dropped by -60.9%, a reduction from 22.1 clock-seconds to 8.7 clock-seconds. - Wall-clock time to build the scheduler dropped by -3.9%, a reduction from 2.95 seconds to 2.84 seconds. Signed-off-by: Ingo Molnar Reviewed-by: Peter Zijlstra --- kernel/sched/build_policy.c | 4 -- kernel/sched/build_utility.c | 6 +- kernel/sched/sched.h | 147 +++++++++++++------------------------------ kernel/sched/stats.h | 2 - 4 files changed, 46 insertions(+), 113 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index a2e4023771a0..e0104b45029a 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -28,10 +28,6 @@ #include #include -#ifdef CONFIG_PARAVIRT -# include -#endif - #include #include "sched.h" diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index bc2f4d648209..eec0849b2aae 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -44,13 +44,11 @@ #include #include -#ifdef CONFIG_PARAVIRT -# include -#endif - #include #include +#include + #include "sched.h" #include "sched-pelt.h" #include "stats.h" diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f255ec2afeca..0d4217965561 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -5,158 +5,97 @@ #ifndef _KERNEL_SCHED_SCHED_H #define _KERNEL_SCHED_SCHED_H -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - #include #include -#include -#include #include -#include #include -#include -#include -#include -#include -#include -#include +#include 
#include #include -#include -#include -#include #include -#include #include #include #include #include +#include #include -#include -#include #include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_PARAVIRT -# include -#endif - -#include - -#ifdef CONFIG_CGROUP_SCHED -#include -#include -#endif - -#ifdef CONFIG_SCHED_DEBUG -# include -#endif - -#include "cpupri.h" -#include "cpudeadline.h" - -#ifdef CONFIG_SCHED_DEBUG -# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) -#else -# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) -#endif +#include #include +#include #include +#include #include #include #include #include #include +#include #include #include +#include #include #include +#include #include #include +#include +#include +#include #include #include +#include #include #include +#include #include -#include -#include -#include -#include -#include -#include +#include #include #include #include #include +#include +#include #include #include +#include #include #include #include #include #include +#include #include #include +#include #include "../workqueue_internal.h" +#ifdef CONFIG_CGROUP_SCHED +#include +#include +#endif + +#ifdef CONFIG_SCHED_DEBUG +# include +#endif + +#ifdef CONFIG_PARAVIRT +# include +# include +#endif + +#include "cpupri.h" +#include "cpudeadline.h" + +#ifdef CONFIG_SCHED_DEBUG +# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) +#else +# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) +#endif + struct rq; struct cpuidle_state; @@ -1925,6 +1864,8 @@ extern void flush_smp_call_function_from_idle(void); static inline void flush_smp_call_function_from_idle(void) { } #endif +#include "stats.h" + #if defined(CONFIG_SCHED_CORE) && 
defined(CONFIG_SCHEDSTATS) extern void __sched_core_account_forceidle(struct rq *rq); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index a0df79e6038c..baa839c1ba96 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -2,8 +2,6 @@ #ifndef _KERNEL_STATS_H #define _KERNEL_STATS_H -#include "sched.h" - #ifdef CONFIG_SCHEDSTATS extern struct static_key_false sched_schedstats; -- cgit v1.2.3 From f2eb478f2f322217aa642e11c1cc011f99c797e6 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 22 Feb 2022 08:07:13 +0100 Subject: kernfs: move struct kernfs_root out of the public view. There is no need to have struct kernfs_root be part of kernfs.h for the whole kernel to see and poke around it. Move it internal to kernfs code and provide a helper function, kernfs_root_to_node(), to handle the one field that kernfs users were directly accessing from the structure. Cc: Imran Khan Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220222070713.3517679-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 4 ++-- fs/kernfs/dir.c | 9 +++++++++ fs/kernfs/kernfs-internal.h | 18 ++++++++++++++++++ fs/sysfs/mount.c | 2 +- include/linux/kernfs.h | 4 ++++ kernel/cgroup/cgroup.c | 4 ++-- 6 files changed, 36 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index b57b3db9a6a7..83f901e2c2df 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -3221,13 +3221,13 @@ static int __init rdtgroup_setup_root(void) list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); - ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE); + ret = rdtgroup_add_files(kernfs_root_to_node(rdt_root), RF_CTRL_BASE); if (ret) { kernfs_destroy_root(rdt_root); goto out; } - rdtgroup_default.kn = rdt_root->kn; + rdtgroup_default.kn = kernfs_root_to_node(rdt_root); 
kernfs_activate(rdtgroup_default.kn); out: diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index e6d9772ddb4c..61a8edc4ba8b 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -970,6 +970,15 @@ void kernfs_destroy_root(struct kernfs_root *root) kernfs_put(root->kn); /* will also free @root */ } +/** + * kernfs_root_to_node - return the kernfs_node associated with a kernfs_root + * @root: root to use to lookup + */ +struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root) +{ + return root->kn; +} + /** * kernfs_create_dir_ns - create a directory * @parent: parent in which to create a new directory diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index fc3b32f82a60..eeaa779b929c 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -31,6 +31,24 @@ struct kernfs_iattrs { atomic_t user_xattr_size; }; +struct kernfs_root { + /* published fields */ + struct kernfs_node *kn; + unsigned int flags; /* KERNFS_ROOT_* flags */ + + /* private fields, do not use outside kernfs proper */ + struct idr ino_idr; + u32 last_id_lowbits; + u32 id_highbits; + struct kernfs_syscall_ops *syscall_ops; + + /* list of kernfs_super_info of this root, protected by kernfs_rwsem */ + struct list_head supers; + + wait_queue_head_t deactivate_waitq; + struct rw_semaphore kernfs_rwsem; +}; + /* +1 to avoid triggering overflow warning when negating it */ #define KN_DEACTIVATED_BIAS (INT_MIN + 1) diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index e747c135c1d1..98467bb76737 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -103,7 +103,7 @@ int __init sysfs_init(void) if (IS_ERR(sysfs_root)) return PTR_ERR(sysfs_root); - sysfs_root_kn = sysfs_root->kn; + sysfs_root_kn = kernfs_root_to_node(sysfs_root); err = register_filesystem(&sysfs_fs_type); if (err) { diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 861c4f0f8a29..62aff082dc3f 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -185,6 +185,7 @@ struct 
kernfs_syscall_ops { struct kernfs_root *root); }; +#if 0 struct kernfs_root { /* published fields */ struct kernfs_node *kn; @@ -202,6 +203,9 @@ struct kernfs_root { wait_queue_head_t deactivate_waitq; struct rw_semaphore kernfs_rwsem; }; +#endif + +struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root); struct kernfs_open_file { /* published fields */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b31e1465868a..a800c3b1b795 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1302,7 +1302,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { - struct cgroup *root_cgrp = kf_root->kn->priv; + struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv; return root_cgrp->root; } @@ -2025,7 +2025,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) ret = PTR_ERR(root->kf_root); goto exit_root_id; } - root_cgrp->kn = root->kf_root->kn; + root_cgrp->kn = kernfs_root_to_node(root->kf_root); WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1); root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp); -- cgit v1.2.3 From c561d11063009323a0e57c528cb1d77b7d2c41e0 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 20 Feb 2022 10:40:55 -0800 Subject: bpf: Cleanup comments Add leading space to spdx tag Use // for spdx c file comment Replacements resereved to reserved inbetween to in between everytime to every time intutivie to intuitive currenct to current encontered to encountered referenceing to referencing upto to up to exectuted to executed Signed-off-by: Tom Rix Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20220220184055.3608317-1-trix@redhat.com --- kernel/bpf/bpf_local_storage.c | 2 +- kernel/bpf/btf.c | 6 +++--- kernel/bpf/cgroup.c | 8 ++++---- kernel/bpf/hashtab.c | 2 +- kernel/bpf/helpers.c | 2 +- kernel/bpf/local_storage.c | 2 +- kernel/bpf/reuseport_array.c | 2 +- kernel/bpf/syscall.c | 2 +- 
kernel/bpf/trampoline.c | 2 +- 9 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 71de2a89869c..092a1ac772d7 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -136,7 +136,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, * will be done by the caller. * * Although the unlock will be done under - * rcu_read_lock(), it is more intutivie to + * rcu_read_lock(), it is more intuitive to * read if the freeing of the storage is done * after the raw_spin_unlock_bh(&local_storage->lock). * diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2c4c5dbe2abe..0890e56e8b08 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2018 Facebook */ #include @@ -2547,7 +2547,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, * * We now need to continue from the last-resolved-ptr to * ensure the last-resolved-ptr will not referring back to - * the currenct ptr (t). + * the current ptr (t). */ if (btf_type_is_modifier(next_type)) { const struct btf_type *resolved_type; @@ -6148,7 +6148,7 @@ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj, btf_type_show(btf, type_id, obj, (struct btf_show *)&ssnprintf); - /* If we encontered an error, return it. */ + /* If we encountered an error, return it. 
*/ if (ssnprintf.show.state.status) return ssnprintf.show.state.status; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 098632fdbc45..128028efda64 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1031,7 +1031,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic * @skb: The skb that is being sent or received - * @type: The type of program to be exectuted + * @type: The type of program to be executed * * If no socket is passed, or the socket is not of type INET or INET6, * this function does nothing and returns 0. @@ -1094,7 +1094,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); /** * __cgroup_bpf_run_filter_sk() - Run a program on a sock * @sk: sock structure to manipulate - * @type: The type of program to be exectuted + * @type: The type of program to be executed * * socket is passed is expected to be of type INET or INET6. * @@ -1119,7 +1119,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); * provided by user sockaddr * @sk: sock struct that will use sockaddr * @uaddr: sockaddr struct provided by user - * @type: The type of program to be exectuted + * @type: The type of program to be executed * @t_ctx: Pointer to attach type specific context * @flags: Pointer to u32 which contains higher bits of BPF program * return value (OR'ed together). @@ -1166,7 +1166,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains * sk with connection information (IP addresses, etc.) May not contain * cgroup info if it is a req sock. - * @type: The type of program to be exectuted + * @type: The type of program to be executed * * socket passed is expected to be of type INET or INET6. 
* diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d29af9988f37..65877967f414 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1636,7 +1636,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, value_size = size * num_possible_cpus(); total = 0; /* while experimenting with hash tables with sizes ranging from 10 to - * 1000, it was observed that a bucket can have upto 5 entries. + * 1000, it was observed that a bucket can have up to 5 entries. */ bucket_size = 5; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 49817755b8c3..ae64110a98b5 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1093,7 +1093,7 @@ struct bpf_hrtimer { struct bpf_timer_kern { struct bpf_hrtimer *timer; /* bpf_spin_lock is used here instead of spinlock_t to make - * sure that it always fits into space resereved by struct bpf_timer + * sure that it always fits into space reserved by struct bpf_timer * regardless of LOCKDEP and spinlock debug flags. */ struct bpf_spin_lock lock; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 23f7f9d08a62..497916060ac7 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -1,4 +1,4 @@ -//SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0 #include #include #include diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 556a769b5b80..8251243022a2 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -143,7 +143,7 @@ static void reuseport_array_free(struct bpf_map *map) /* * Once reaching here, all sk->sk_user_data is not - * referenceing this "array". "array" can be freed now. + * referencing this "array". "array" can be freed now. 
*/ bpf_map_area_free(array); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9c7a72b65eee..db402ebc5570 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2566,7 +2566,7 @@ static int bpf_link_alloc_id(struct bpf_link *link) * pre-allocated resources are to be freed with bpf_cleanup() call. All the * transient state is passed around in struct bpf_link_primer. * This is preferred way to create and initialize bpf_link, especially when - * there are complicated and expensive operations inbetween creating bpf_link + * there are complicated and expensive operations in between creating bpf_link * itself and attaching it to BPF hook. By using bpf_link_prime() and * bpf_link_settle() kernel code using bpf_link doesn't have to perform * expensive (and potentially failing) roll back operations in a rare case diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 7224691df2ec..0b41fa993825 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -45,7 +45,7 @@ void *bpf_jit_alloc_exec_page(void) set_vm_flush_reset_perms(image); /* Keep image as writeable. The alternative is to keep flipping ro/rw - * everytime new program is attached or detached. + * every time new program is attached or detached. */ set_memory_x((long)image, 1); return image; -- cgit v1.2.3 From 9f8e5aee93ed2482638d577a56806b455084b595 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Mon, 14 Feb 2022 12:00:59 -0500 Subject: tracing: Fix allocation of last_cmd in last_cmd_set() The length passed to the strncat() used in last_cmd_set() includes the nul byte of the string being copied in, when it should only hold the size of the string being copied (not the nul byte). Change it to subtract the size of HIST_PREFIX and the nul byte from the length of the allocated space, and pass that into the strncat(). Also, assign "len" directly instead of initializing it to zero and having its first update be a "+=".
Link: https://lore.kernel.org/all/202202140628.fj6e4w4v-lkp@intel.com/ Reported-by: kernel test robot Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5e8970624bce..78788049f3d3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -744,19 +744,20 @@ static void last_cmd_set(struct trace_event_file *file, char *str) { const char *system = NULL, *name = NULL; struct trace_event_call *call; - int len = 0; + int len; if (!str) return; - len += sizeof(HIST_PREFIX) + strlen(str) + 1; + len = sizeof(HIST_PREFIX) + strlen(str) + 1; kfree(last_cmd); last_cmd = kzalloc(len, GFP_KERNEL); if (!last_cmd) return; strcpy(last_cmd, HIST_PREFIX); - strncat(last_cmd, str, len - sizeof(HIST_PREFIX)); + len -= sizeof(HIST_PREFIX) + 1; + strncat(last_cmd, str, len); if (file) { call = file->event_call; -- cgit v1.2.3 From ce33c845b030c9cf768370c951bc699470b09fa7 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Sun, 20 Feb 2022 23:49:57 +0100 Subject: tracing: Dump stacktrace trigger to the corresponding instance The stacktrace event trigger is not dumping the stacktrace to the instance where it was enabled, but to the global "instance." Use the private_data, pointing to the trigger file, to figure out the corresponding trace instance, and use it in the trigger action, like snapshot_trigger does. 
Link: https://lkml.kernel.org/r/afbb0b4f18ba92c276865bc97204d438473f4ebc.1645396236.git.bristot@kernel.org Cc: stable@vger.kernel.org Fixes: ae63b31e4d0e2 ("tracing: Separate out trace events from global variables") Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_trigger.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d00fee705f9c..e0d50c9577f3 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1540,7 +1540,12 @@ stacktrace_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { - trace_dump_stack(STACK_SKIP); + struct trace_event_file *file = data->private_data; + + if (file) + __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP); + else + trace_dump_stack(STACK_SKIP); } static void -- cgit v1.2.3 From 967747bbc084b93b54e66f9047d342232314cd25 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 11 Feb 2022 21:42:45 +0100 Subject: uaccess: remove CONFIG_SET_FS There are no remaining callers of set_fs(), so CONFIG_SET_FS can be removed globally, along with the thread_info field and any references to it. This turns access_ok() into a cheaper check against TASK_SIZE_MAX. As CONFIG_SET_FS is now gone, drop all remaining references to set_fs()/get_fs(), mm_segment_t, user_addr_max() and uaccess_kernel(). Acked-by: Sam Ravnborg # for sparc32 changes Acked-by: "Eric W. 
Biederman" Tested-by: Sergey Matyukevich # for arc changes Acked-by: Stafford Horne # [openrisc, asm-generic] Acked-by: Dinh Nguyen Signed-off-by: Arnd Bergmann --- arch/Kconfig | 3 -- arch/alpha/Kconfig | 1 - arch/alpha/include/asm/processor.h | 4 -- arch/alpha/include/asm/thread_info.h | 2 - arch/alpha/include/asm/uaccess.h | 19 --------- arch/arc/Kconfig | 1 - arch/arc/include/asm/segment.h | 20 --------- arch/arc/include/asm/thread_info.h | 3 -- arch/arc/include/asm/uaccess.h | 1 - arch/arm/lib/uaccess_with_memcpy.c | 10 ----- arch/arm64/kernel/traps.c | 2 +- arch/csky/Kconfig | 1 - arch/csky/include/asm/processor.h | 2 - arch/csky/include/asm/segment.h | 10 ----- arch/csky/include/asm/thread_info.h | 2 - arch/csky/include/asm/uaccess.h | 3 -- arch/csky/kernel/asm-offsets.c | 1 - arch/h8300/Kconfig | 1 - arch/h8300/include/asm/processor.h | 1 - arch/h8300/include/asm/segment.h | 40 ----------------- arch/h8300/include/asm/thread_info.h | 3 -- arch/h8300/kernel/entry.S | 1 - arch/h8300/kernel/head_ram.S | 1 - arch/h8300/mm/init.c | 6 --- arch/h8300/mm/memory.c | 1 - arch/hexagon/Kconfig | 1 - arch/hexagon/include/asm/thread_info.h | 6 --- arch/hexagon/kernel/process.c | 1 - arch/microblaze/Kconfig | 1 - arch/microblaze/include/asm/thread_info.h | 6 --- arch/microblaze/include/asm/uaccess.h | 24 ----------- arch/microblaze/kernel/asm-offsets.c | 1 - arch/microblaze/kernel/process.c | 1 - arch/nds32/Kconfig | 1 - arch/nds32/include/asm/thread_info.h | 4 -- arch/nds32/include/asm/uaccess.h | 15 +------ arch/nds32/kernel/process.c | 5 +-- arch/nds32/mm/alignment.c | 3 -- arch/nios2/Kconfig | 1 - arch/nios2/include/asm/thread_info.h | 9 ---- arch/nios2/include/asm/uaccess.h | 12 ------ arch/openrisc/Kconfig | 1 - arch/openrisc/include/asm/thread_info.h | 7 --- arch/openrisc/include/asm/uaccess.h | 23 ---------- arch/parisc/include/asm/futex.h | 6 --- arch/parisc/kernel/signal.c | 4 +- arch/parisc/lib/memcpy.c | 2 +- arch/sparc/Kconfig | 1 - 
arch/sparc/include/asm/processor_32.h | 6 --- arch/sparc/include/asm/uaccess_32.h | 13 ------ arch/sparc/kernel/process_32.c | 2 - arch/xtensa/Kconfig | 1 - arch/xtensa/include/asm/asm-uaccess.h | 71 ------------------------------- arch/xtensa/include/asm/processor.h | 7 --- arch/xtensa/include/asm/thread_info.h | 3 -- arch/xtensa/include/asm/uaccess.h | 16 ------- arch/xtensa/kernel/asm-offsets.c | 3 -- drivers/hid/uhid.c | 2 +- drivers/scsi/sg.c | 5 --- fs/exec.c | 6 --- include/asm-generic/access_ok.h | 14 +----- include/asm-generic/uaccess.h | 25 +---------- include/linux/syscalls.h | 4 -- include/linux/uaccess.h | 33 -------------- include/rdma/ib.h | 2 +- kernel/events/callchain.c | 4 -- kernel/events/core.c | 3 -- kernel/exit.c | 14 ------ kernel/kthread.c | 5 --- kernel/stacktrace.c | 3 -- kernel/trace/bpf_trace.c | 4 -- lib/strncpy_from_user.c | 2 +- lib/strnlen_user.c | 2 +- mm/maccess.c | 11 ----- mm/memory.c | 8 ---- net/bpfilter/bpfilter_kern.c | 2 +- 76 files changed, 14 insertions(+), 531 deletions(-) delete mode 100644 arch/arc/include/asm/segment.h delete mode 100644 arch/csky/include/asm/segment.h delete mode 100644 arch/h8300/include/asm/segment.h (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index fa5db36bda67..99349547afed 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -24,9 +24,6 @@ config KEXEC_ELF config HAVE_IMA_KEXEC bool -config SET_FS - bool - config HOTPLUG_SMT bool diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 4e87783c90ad..eee8b5b0a58b 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -35,7 +35,6 @@ config ALPHA select OLD_SIGSUSPEND select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67 select MMU_GATHER_NO_RANGE - select SET_FS select SPARSEMEM_EXTREME if SPARSEMEM select ZONE_DMA help diff --git a/arch/alpha/include/asm/processor.h b/arch/alpha/include/asm/processor.h index 090499c99c1c..43e234c518b1 100644 --- a/arch/alpha/include/asm/processor.h +++ b/arch/alpha/include/asm/processor.h @@ -26,10 +26,6 
@@ #define TASK_UNMAPPED_BASE \ ((current->personality & ADDR_LIMIT_32BIT) ? 0x40000000 : TASK_SIZE / 2) -typedef struct { - unsigned long seg; -} mm_segment_t; - /* This is dead. Everything has been moved to thread_info. */ struct thread_struct { }; #define INIT_THREAD { } diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h index 2592356e3215..fdc485d7787a 100644 --- a/arch/alpha/include/asm/thread_info.h +++ b/arch/alpha/include/asm/thread_info.h @@ -19,7 +19,6 @@ struct thread_info { unsigned int flags; /* low level flags */ unsigned int ieee_state; /* see fpu.h */ - mm_segment_t addr_limit; /* thread address space */ unsigned cpu; /* current CPU */ int preempt_count; /* 0 => preemptable, <0 => BUG */ unsigned int status; /* thread-synchronous flags */ @@ -35,7 +34,6 @@ struct thread_info { #define INIT_THREAD_INFO(tsk) \ { \ .task = &tsk, \ - .addr_limit = KERNEL_DS, \ .preempt_count = INIT_PREEMPT_COUNT, \ } diff --git a/arch/alpha/include/asm/uaccess.h b/arch/alpha/include/asm/uaccess.h index 82c5743fc9cd..c32c2584c0b7 100644 --- a/arch/alpha/include/asm/uaccess.h +++ b/arch/alpha/include/asm/uaccess.h @@ -2,26 +2,7 @@ #ifndef __ALPHA_UACCESS_H #define __ALPHA_UACCESS_H -/* - * The fs value determines whether argument validity checking should be - * performed or not. If get_fs() == USER_DS, checking is performed, with - * get_fs() == KERNEL_DS, checking is bypassed. - * - * Or at least it did once upon a time. Nowadays it is a mask that - * defines which bits of the address space are off limits. This is a - * wee bit faster than the above. - * - * For historical reasons, these macros are grossly misnamed. - */ - -#define KERNEL_DS ((mm_segment_t) { 0UL }) -#define USER_DS ((mm_segment_t) { -0x40000000000UL }) - -#define get_fs() (current_thread_info()->addr_limit) -#define set_fs(x) (current_thread_info()->addr_limit = (x)) - #include - /* * These are the main single-value transfer routines. 
They automatically * use the right size if we just have the right pointer type. diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 3c2a4753d09b..e0a60a27e14d 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -45,7 +45,6 @@ config ARC select PCI_SYSCALL if PCI select PERF_USE_VMALLOC if ARC_CACHE_VIPT_ALIASING select HAVE_ARCH_JUMP_LABEL if ISA_ARCV2 && !CPU_ENDIAN_BE32 - select SET_FS select TRACE_IRQFLAGS_SUPPORT config LOCKDEP_SUPPORT diff --git a/arch/arc/include/asm/segment.h b/arch/arc/include/asm/segment.h deleted file mode 100644 index 871f8ab11bfd..000000000000 --- a/arch/arc/include/asm/segment.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) - */ - -#ifndef __ASMARC_SEGMENT_H -#define __ASMARC_SEGMENT_H - -#ifndef __ASSEMBLY__ - -typedef unsigned long mm_segment_t; - -#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) - -#define KERNEL_DS MAKE_MM_SEG(0) -#define USER_DS MAKE_MM_SEG(TASK_SIZE) -#define uaccess_kernel() (get_fs() == KERNEL_DS) - -#endif /* __ASSEMBLY__ */ -#endif /* __ASMARC_SEGMENT_H */ diff --git a/arch/arc/include/asm/thread_info.h b/arch/arc/include/asm/thread_info.h index d36863e34bfc..1e0b2e3914d5 100644 --- a/arch/arc/include/asm/thread_info.h +++ b/arch/arc/include/asm/thread_info.h @@ -27,7 +27,6 @@ #ifndef __ASSEMBLY__ #include -#include /* * low level task data that entry.S needs immediate access to @@ -40,7 +39,6 @@ struct thread_info { unsigned long flags; /* low level flags */ int preempt_count; /* 0 => preemptable, <0 => BUG */ struct task_struct *task; /* main task structure */ - mm_segment_t addr_limit; /* thread address space */ __u32 cpu; /* current CPU */ unsigned long thr_ptr; /* TLS ptr */ }; @@ -56,7 +54,6 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .addr_limit = KERNEL_DS, \ } static inline __attribute_const__ struct thread_info 
*current_thread_info(void) diff --git a/arch/arc/include/asm/uaccess.h b/arch/arc/include/asm/uaccess.h index 30f80b4be2ab..99712471c96a 100644 --- a/arch/arc/include/asm/uaccess.h +++ b/arch/arc/include/asm/uaccess.h @@ -638,7 +638,6 @@ extern unsigned long arc_clear_user_noinline(void __user *to, #define __clear_user(d, n) arc_clear_user_noinline(d, n) #endif -#include #include #endif diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c index 106f83a5ea6d..c30b689bec2e 100644 --- a/arch/arm/lib/uaccess_with_memcpy.c +++ b/arch/arm/lib/uaccess_with_memcpy.c @@ -92,11 +92,6 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n) unsigned long ua_flags; int atomic; - if (uaccess_kernel()) { - memcpy((void *)to, from, n); - return 0; - } - /* the mmap semaphore is taken only if not in an atomic context */ atomic = faulthandler_disabled(); @@ -165,11 +160,6 @@ __clear_user_memset(void __user *addr, unsigned long n) { unsigned long ua_flags; - if (uaccess_kernel()) { - memset((void *)addr, 0, n); - return 0; - } - mmap_read_lock(current->mm); while (n) { pte_t *pte; diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 70fc42470f13..48dcdbdf9a36 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -519,7 +519,7 @@ void do_ptrauth_fault(struct pt_regs *regs, unsigned int esr) NOKPROBE_SYMBOL(do_ptrauth_fault); #define __user_cache_maint(insn, address, res) \ - if (address >= user_addr_max()) { \ + if (address >= TASK_SIZE_MAX) { \ res = -EFAULT; \ } else { \ uaccess_ttbr0_enable(); \ diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 132f43f12dd8..75ef86605d69 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -79,7 +79,6 @@ config CSKY select PCI_DOMAINS_GENERIC if PCI select PCI_SYSCALL if PCI select PCI_MSI if PCI - select SET_FS select TRACE_IRQFLAGS_SUPPORT config LOCKDEP_SUPPORT diff --git a/arch/csky/include/asm/processor.h b/arch/csky/include/asm/processor.h 
index 817dd60ff152..688c7548b559 100644 --- a/arch/csky/include/asm/processor.h +++ b/arch/csky/include/asm/processor.h @@ -4,7 +4,6 @@ #define __ASM_CSKY_PROCESSOR_H #include -#include #include #include #include @@ -59,7 +58,6 @@ struct thread_struct { */ #define start_thread(_regs, _pc, _usp) \ do { \ - set_fs(USER_DS); /* reads from user space */ \ (_regs)->pc = (_pc); \ (_regs)->regs[1] = 0; /* ABIV1 is R7, uClibc_main rtdl arg */ \ (_regs)->regs[2] = 0; \ diff --git a/arch/csky/include/asm/segment.h b/arch/csky/include/asm/segment.h deleted file mode 100644 index 5bc1cc62b87f..000000000000 --- a/arch/csky/include/asm/segment.h +++ /dev/null @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef __ASM_CSKY_SEGMENT_H -#define __ASM_CSKY_SEGMENT_H - -typedef struct { - unsigned long seg; -} mm_segment_t; - -#endif /* __ASM_CSKY_SEGMENT_H */ diff --git a/arch/csky/include/asm/thread_info.h b/arch/csky/include/asm/thread_info.h index 8c349a8f904d..b5ed788f0c68 100644 --- a/arch/csky/include/asm/thread_info.h +++ b/arch/csky/include/asm/thread_info.h @@ -16,7 +16,6 @@ struct thread_info { unsigned long flags; int preempt_count; unsigned long tp_value; - mm_segment_t addr_limit; struct restart_block restart_block; struct pt_regs *regs; unsigned int cpu; @@ -26,7 +25,6 @@ struct thread_info { { \ .task = &tsk, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .addr_limit = KERNEL_DS, \ .cpu = 0, \ .restart_block = { \ .fn = do_no_restart_syscall, \ diff --git a/arch/csky/include/asm/uaccess.h b/arch/csky/include/asm/uaccess.h index fec8f77ffc99..2e927c21d8a1 100644 --- a/arch/csky/include/asm/uaccess.h +++ b/arch/csky/include/asm/uaccess.h @@ -3,8 +3,6 @@ #ifndef __ASM_CSKY_UACCESS_H #define __ASM_CSKY_UACCESS_H -#define user_addr_max() (current_thread_info()->addr_limit.seg) - /* * __put_user_fn */ @@ -200,7 +198,6 @@ unsigned long raw_copy_to_user(void *to, const void *from, unsigned long n); unsigned long __clear_user(void __user *to, unsigned long n); 
#define __clear_user __clear_user -#include #include #endif /* __ASM_CSKY_UACCESS_H */ diff --git a/arch/csky/kernel/asm-offsets.c b/arch/csky/kernel/asm-offsets.c index 1cbcba4b0dd1..d1e903579473 100644 --- a/arch/csky/kernel/asm-offsets.c +++ b/arch/csky/kernel/asm-offsets.c @@ -25,7 +25,6 @@ int main(void) /* offsets into the thread_info struct */ DEFINE(TINFO_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TINFO_PREEMPT, offsetof(struct thread_info, preempt_count)); - DEFINE(TINFO_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); DEFINE(TINFO_TP_VALUE, offsetof(struct thread_info, tp_value)); DEFINE(TINFO_TASK, offsetof(struct thread_info, task)); diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index 3e3e0f16f7e0..fe48c4f26cc8 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -24,7 +24,6 @@ config H8300 select HAVE_ARCH_KGDB select HAVE_ARCH_HASH select CPU_NO_EFFICIENT_FFS - select SET_FS select UACCESS_MEMCPY config CPU_BIG_ENDIAN diff --git a/arch/h8300/include/asm/processor.h b/arch/h8300/include/asm/processor.h index 141a23eb62b7..ba171aa4dacb 100644 --- a/arch/h8300/include/asm/processor.h +++ b/arch/h8300/include/asm/processor.h @@ -13,7 +13,6 @@ #define __ASM_H8300_PROCESSOR_H #include -#include #include #include diff --git a/arch/h8300/include/asm/segment.h b/arch/h8300/include/asm/segment.h deleted file mode 100644 index 37950725d9b9..000000000000 --- a/arch/h8300/include/asm/segment.h +++ /dev/null @@ -1,40 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _H8300_SEGMENT_H -#define _H8300_SEGMENT_H - -/* define constants */ -#define USER_DATA (1) -#ifndef __USER_DS -#define __USER_DS (USER_DATA) -#endif -#define USER_PROGRAM (2) -#define SUPER_DATA (3) -#ifndef __KERNEL_DS -#define __KERNEL_DS (SUPER_DATA) -#endif -#define SUPER_PROGRAM (4) - -#ifndef __ASSEMBLY__ - -typedef struct { - unsigned long seg; -} mm_segment_t; - -#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) -#define USER_DS MAKE_MM_SEG(__USER_DS) -#define 
KERNEL_DS MAKE_MM_SEG(__KERNEL_DS) - -/* - * Get/set the SFC/DFC registers for MOVES instructions - */ - -static inline mm_segment_t get_fs(void) -{ - return USER_DS; -} - -#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) - -#endif /* __ASSEMBLY__ */ - -#endif /* _H8300_SEGMENT_H */ diff --git a/arch/h8300/include/asm/thread_info.h b/arch/h8300/include/asm/thread_info.h index a518214d4ddd..ff2d873749a4 100644 --- a/arch/h8300/include/asm/thread_info.h +++ b/arch/h8300/include/asm/thread_info.h @@ -10,7 +10,6 @@ #define _ASM_THREAD_INFO_H #include -#include #ifdef __KERNEL__ @@ -31,7 +30,6 @@ struct thread_info { unsigned long flags; /* low level flags */ int cpu; /* cpu we're on */ int preempt_count; /* 0 => preemptable, <0 => BUG */ - mm_segment_t addr_limit; }; /* @@ -43,7 +41,6 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .addr_limit = KERNEL_DS, \ } /* how to get the thread information struct from C */ diff --git a/arch/h8300/kernel/entry.S b/arch/h8300/kernel/entry.S index c6e289b5f1f2..42db87c17917 100644 --- a/arch/h8300/kernel/entry.S +++ b/arch/h8300/kernel/entry.S @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/h8300/kernel/head_ram.S b/arch/h8300/kernel/head_ram.S index dbf8429f5fab..489462f0ee57 100644 --- a/arch/h8300/kernel/head_ram.S +++ b/arch/h8300/kernel/head_ram.S @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index f7bf4693e3b2..9fa13312720a 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -34,7 +34,6 @@ #include #include -#include #include #include @@ -71,11 +70,6 @@ void __init paging_init(void) panic("%s: Failed to allocate %lu bytes align=0x%lx\n", __func__, PAGE_SIZE, PAGE_SIZE); - /* - * Set up SFC/DFC registers (user data space). 
- */ - set_fs(USER_DS); - pr_debug("before free_area_init\n"); pr_debug("free_area_init -> start_mem is %#lx\nvirtual_end is %#lx\n", diff --git a/arch/h8300/mm/memory.c b/arch/h8300/mm/memory.c index 4a60e2b5eb96..c950571064d2 100644 --- a/arch/h8300/mm/memory.c +++ b/arch/h8300/mm/memory.c @@ -24,7 +24,6 @@ #include #include -#include #include #include #include diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 15dd8f38b698..54eadf265178 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -30,7 +30,6 @@ config HEXAGON select GENERIC_CLOCKEVENTS_BROADCAST select MODULES_USE_ELF_RELA select GENERIC_CPU_DEVICES - select SET_FS select ARCH_WANT_LD_ORPHAN_WARN select TRACE_IRQFLAGS_SUPPORT help diff --git a/arch/hexagon/include/asm/thread_info.h b/arch/hexagon/include/asm/thread_info.h index 535976665bf0..e90f280b9ce3 100644 --- a/arch/hexagon/include/asm/thread_info.h +++ b/arch/hexagon/include/asm/thread_info.h @@ -22,10 +22,6 @@ #ifndef __ASSEMBLY__ -typedef struct { - unsigned long seg; -} mm_segment_t; - /* * This is union'd with the "bottom" of the kernel stack. 
* It keeps track of thread info which is handy for routines @@ -37,7 +33,6 @@ struct thread_info { unsigned long flags; /* low level flags */ __u32 cpu; /* current cpu */ int preempt_count; /* 0=>preemptible,<0=>BUG */ - mm_segment_t addr_limit; /* segmentation sux */ /* * used for syscalls somehow; * seems to have a function pointer and four arguments @@ -66,7 +61,6 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .preempt_count = 1, \ - .addr_limit = KERNEL_DS, \ .sp = 0, \ .regs = NULL, \ } diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index 232dfd8956aa..dfa6b2757c05 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -105,7 +105,6 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, /* * Parent sees new pid -- not necessary, not even possible at * this point in the fork process - * Might also want to set things like ti->addr_limit */ return 0; diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 59798e43cdb0..1fb1cec087b7 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -42,7 +42,6 @@ config MICROBLAZE select CPU_NO_EFFICIENT_FFS select MMU_GATHER_NO_RANGE select SPARSE_IRQ - select SET_FS select ZONE_DMA select TRACE_IRQFLAGS_SUPPORT diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h index 44f5ca331862..a0ddd2a36fb9 100644 --- a/arch/microblaze/include/asm/thread_info.h +++ b/arch/microblaze/include/asm/thread_info.h @@ -56,17 +56,12 @@ struct cpu_context { __u32 fsr; }; -typedef struct { - unsigned long seg; -} mm_segment_t; - struct thread_info { struct task_struct *task; /* main task structure */ unsigned long flags; /* low level flags */ unsigned long status; /* thread-synchronous flags */ __u32 cpu; /* current CPU */ __s32 preempt_count; /* 0 => preemptable,< 0 => BUG*/ - mm_segment_t addr_limit; /* thread address space */ struct cpu_context cpu_context; }; @@ -80,7 +75,6 @@ struct 
thread_info { .flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .addr_limit = KERNEL_DS, \ } /* how to get the thread information struct from C */ diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h index bf9b7657a65a..3aab2f17e046 100644 --- a/arch/microblaze/include/asm/uaccess.h +++ b/arch/microblaze/include/asm/uaccess.h @@ -15,30 +15,6 @@ #include #include #include - -/* - * On Microblaze the fs value is actually the top of the corresponding - * address space. - * - * The fs value determines whether argument validity checking should be - * performed or not. If get_fs() == USER_DS, checking is performed, with - * get_fs() == KERNEL_DS, checking is bypassed. - * - * For historical reasons, these macros are grossly misnamed. - * - * For non-MMU arch like Microblaze, KERNEL_DS and USER_DS is equal. - */ -# define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) - -# define KERNEL_DS MAKE_MM_SEG(0xFFFFFFFF) -# define USER_DS MAKE_MM_SEG(TASK_SIZE - 1) - -# define get_fs() (current_thread_info()->addr_limit) -# define set_fs(val) (current_thread_info()->addr_limit = (val)) -# define user_addr_max() get_fs().seg - -# define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) - #include # define __FIXUP_SECTION ".section .fixup,\"ax\"\n" diff --git a/arch/microblaze/kernel/asm-offsets.c b/arch/microblaze/kernel/asm-offsets.c index b77dd188dec4..47ee409508b1 100644 --- a/arch/microblaze/kernel/asm-offsets.c +++ b/arch/microblaze/kernel/asm-offsets.c @@ -86,7 +86,6 @@ int main(int argc, char *argv[]) /* struct thread_info */ DEFINE(TI_TASK, offsetof(struct thread_info, task)); DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); - DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); DEFINE(TI_CPU_CONTEXT, offsetof(struct thread_info, cpu_context)); DEFINE(TI_PREEMPT_COUNT, offsetof(struct thread_info, preempt_count)); BLANK(); diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index 
5e2b91c1e8ce..1b944d319d73 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -18,7 +18,6 @@ #include #include #include -#include /* for USER_DS macros */ #include void show_regs(struct pt_regs *regs) diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig index 4d1421b18734..013249430fa3 100644 --- a/arch/nds32/Kconfig +++ b/arch/nds32/Kconfig @@ -44,7 +44,6 @@ config NDS32 select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE - select SET_FS select TRACE_IRQFLAGS_SUPPORT help Andes(nds32) Linux support. diff --git a/arch/nds32/include/asm/thread_info.h b/arch/nds32/include/asm/thread_info.h index d3967ad184f0..bd8f81cf2ce5 100644 --- a/arch/nds32/include/asm/thread_info.h +++ b/arch/nds32/include/asm/thread_info.h @@ -16,8 +16,6 @@ struct task_struct; #include #include -typedef unsigned long mm_segment_t; - /* * low level task data that entry.S needs immediate access to. * __switch_to() assumes cpu_context follows immediately after cpu_domain. 
@@ -25,12 +23,10 @@ typedef unsigned long mm_segment_t; struct thread_info { unsigned long flags; /* low level flags */ __s32 preempt_count; /* 0 => preemptable, <0 => bug */ - mm_segment_t addr_limit; /* address limit */ }; #define INIT_THREAD_INFO(tsk) \ { \ .preempt_count = INIT_PREEMPT_COUNT, \ - .addr_limit = KERNEL_DS, \ } #define thread_saved_pc(tsk) ((unsigned long)(tsk->thread.cpu_context.pc)) #define thread_saved_fp(tsk) ((unsigned long)(tsk->thread.cpu_context.fp)) diff --git a/arch/nds32/include/asm/uaccess.h b/arch/nds32/include/asm/uaccess.h index 832d642a4068..377548d4451a 100644 --- a/arch/nds32/include/asm/uaccess.h +++ b/arch/nds32/include/asm/uaccess.h @@ -11,6 +11,7 @@ #include #include #include +#include #define __asmeq(x, y) ".ifnc " x "," y " ; .err ; .endif\n\t" @@ -33,20 +34,6 @@ struct exception_table_entry { extern int fixup_exception(struct pt_regs *regs); -#define KERNEL_DS ((mm_segment_t) { ~0UL }) -#define USER_DS ((mm_segment_t) {TASK_SIZE - 1}) - -#define get_fs() (current_thread_info()->addr_limit) -#define user_addr_max get_fs -#define uaccess_kernel() (get_fs() == KERNEL_DS) - -static inline void set_fs(mm_segment_t fs) -{ - current_thread_info()->addr_limit = fs; -} - -#include - /* * Single-value transfer routines. They automatically use the right * size if we just have the right pointer type. Note that the functions diff --git a/arch/nds32/kernel/process.c b/arch/nds32/kernel/process.c index 49fab9e39cbf..d35c1f63fa11 100644 --- a/arch/nds32/kernel/process.c +++ b/arch/nds32/kernel/process.c @@ -119,9 +119,8 @@ void show_regs(struct pt_regs *regs) regs->uregs[7], regs->uregs[6], regs->uregs[5], regs->uregs[4]); pr_info("r3 : %08lx r2 : %08lx r1 : %08lx r0 : %08lx\n", regs->uregs[3], regs->uregs[2], regs->uregs[1], regs->uregs[0]); - pr_info(" IRQs o%s Segment %s\n", - interrupts_enabled(regs) ? "n" : "ff", - uaccess_kernel() ? "kernel" : "user"); + pr_info(" IRQs o%s Segment user\n", + interrupts_enabled(regs) ? 
"n" : "ff"); } EXPORT_SYMBOL(show_regs); diff --git a/arch/nds32/mm/alignment.c b/arch/nds32/mm/alignment.c index 1eb7ded6992b..9c2c0a454da8 100644 --- a/arch/nds32/mm/alignment.c +++ b/arch/nds32/mm/alignment.c @@ -512,7 +512,6 @@ int do_unaligned_access(unsigned long addr, struct pt_regs *regs) { unsigned long inst; int ret = -EFAULT; - mm_segment_t seg; inst = get_inst(regs->ipc); @@ -520,12 +519,10 @@ int do_unaligned_access(unsigned long addr, struct pt_regs *regs) "Faulting addr: 0x%08lx, pc: 0x%08lx [inst: 0x%08lx ]\n", addr, regs->ipc, inst); - seg = force_uaccess_begin(); if (inst & NDS32_16BIT_INSTRUCTION) ret = do_16((inst >> 16) & 0xffff, regs); else ret = do_32(inst, regs); - force_uaccess_end(seg); return ret; } diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index 33fd06f5fa41..4167f1eb4cd8 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -24,7 +24,6 @@ config NIOS2 select USB_ARCH_HAS_HCD if USB_SUPPORT select CPU_NO_EFFICIENT_FFS select MMU_GATHER_NO_RANGE if MMU - select SET_FS config GENERIC_CSUM def_bool y diff --git a/arch/nios2/include/asm/thread_info.h b/arch/nios2/include/asm/thread_info.h index 272d2c72a727..bcc0e9915ebd 100644 --- a/arch/nios2/include/asm/thread_info.h +++ b/arch/nios2/include/asm/thread_info.h @@ -26,10 +26,6 @@ #ifndef __ASSEMBLY__ -typedef struct { - unsigned long seg; -} mm_segment_t; - /* * low level task data that entry.S needs immediate access to * - this struct should fit entirely inside of one cache line @@ -42,10 +38,6 @@ struct thread_info { unsigned long flags; /* low level flags */ __u32 cpu; /* current CPU */ int preempt_count; /* 0 => preemptable,<0 => BUG */ - mm_segment_t addr_limit; /* thread address space: - 0-0x7FFFFFFF for user-thead - 0-0xFFFFFFFF for kernel-thread - */ struct pt_regs *regs; }; @@ -60,7 +52,6 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .addr_limit = KERNEL_DS, \ } /* how to get the thread information struct from C */ diff 
--git a/arch/nios2/include/asm/uaccess.h b/arch/nios2/include/asm/uaccess.h index 6664ddc0e8e5..b8299082adbe 100644 --- a/arch/nios2/include/asm/uaccess.h +++ b/arch/nios2/include/asm/uaccess.h @@ -18,18 +18,6 @@ #include #include - -/* - * Segment stuff - */ -#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) -#define USER_DS MAKE_MM_SEG(0x80000000UL) -#define KERNEL_DS MAKE_MM_SEG(0) - - -#define get_fs() (current_thread_info()->addr_limit) -#define set_fs(seg) (current_thread_info()->addr_limit = (seg)) - #include # define __EX_TABLE_SECTION ".section __ex_table,\"a\"\n" diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index f724b3f1aeed..0d68adf6e02b 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -36,7 +36,6 @@ config OPENRISC select ARCH_WANT_FRAME_POINTERS select GENERIC_IRQ_MULTI_HANDLER select MMU_GATHER_NO_RANGE if MMU - select SET_FS select TRACE_IRQFLAGS_SUPPORT config CPU_BIG_ENDIAN diff --git a/arch/openrisc/include/asm/thread_info.h b/arch/openrisc/include/asm/thread_info.h index 659834ab87fa..4af3049c34c2 100644 --- a/arch/openrisc/include/asm/thread_info.h +++ b/arch/openrisc/include/asm/thread_info.h @@ -40,18 +40,12 @@ */ #ifndef __ASSEMBLY__ -typedef unsigned long mm_segment_t; - struct thread_info { struct task_struct *task; /* main task structure */ unsigned long flags; /* low level flags */ __u32 cpu; /* current CPU */ __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ - mm_segment_t addr_limit; /* thread address space: - 0-0x7FFFFFFF for user-thead - 0-0xFFFFFFFF for kernel-thread - */ __u8 supervisor_stack[0]; /* saved context data */ @@ -71,7 +65,6 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .addr_limit = KERNEL_DS, \ .ksp = 0, \ } diff --git a/arch/openrisc/include/asm/uaccess.h b/arch/openrisc/include/asm/uaccess.h index 8f049ec99b3e..d6500a374e18 100644 --- a/arch/openrisc/include/asm/uaccess.h +++ b/arch/openrisc/include/asm/uaccess.h @@ -22,29 +22,6 @@ 
#include #include #include - -/* - * The fs value determines whether argument validity checking should be - * performed or not. If get_fs() == USER_DS, checking is performed, with - * get_fs() == KERNEL_DS, checking is bypassed. - * - * For historical reasons, these macros are grossly misnamed. - */ - -/* addr_limit is the maximum accessible address for the task. we misuse - * the KERNEL_DS and USER_DS values to both assign and compare the - * addr_limit values through the equally misnamed get/set_fs macros. - * (see above) - */ - -#define KERNEL_DS (~0UL) - -#define USER_DS (TASK_SIZE) -#define get_fs() (current_thread_info()->addr_limit) -#define set_fs(x) (current_thread_info()->addr_limit = (x)) - -#define uaccess_kernel() (get_fs() == KERNEL_DS) - #include /* diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h index b5835325d44b..3222206cb3ea 100644 --- a/arch/parisc/include/asm/futex.h +++ b/arch/parisc/include/asm/futex.h @@ -96,12 +96,6 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 val; unsigned long flags; - /* futex.c wants to do a cmpxchg_inatomic on kernel NULL, which is - * our gateway page, and causes no end of trouble... 
- */ - if (uaccess_kernel() && !uaddr) - return -EFAULT; - if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index 46b1050640b8..cc07bcabf336 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -251,7 +251,7 @@ setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs, DBG(1,"setup_rt_frame: frame %p info %p\n", frame, ksig->info); start = (unsigned long) frame; - if (start >= user_addr_max() - sigframe_size) + if (start >= TASK_SIZE_MAX - sigframe_size) return -EFAULT; #ifdef CONFIG_64BIT @@ -518,7 +518,7 @@ insert_restart_trampoline(struct pt_regs *regs) long err = 0; /* check that we don't exceed the stack */ - if (A(&usp[0]) >= user_addr_max() - 5 * sizeof(int)) + if (A(&usp[0]) >= TASK_SIZE_MAX - 5 * sizeof(int)) return; /* Setup a trampoline to restart the syscall diff --git a/arch/parisc/lib/memcpy.c b/arch/parisc/lib/memcpy.c index ea70a0e08321..468704ce8a1c 100644 --- a/arch/parisc/lib/memcpy.c +++ b/arch/parisc/lib/memcpy.c @@ -13,7 +13,7 @@ #include #include -#define get_user_space() (uaccess_kernel() ? 0 : mfsp(3)) +#define get_user_space() (mfsp(3)) #define get_kernel_space() (0) /* Returns 0 for success, otherwise, returns number of bytes not transferred. */ diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 602149f3957f..9200bc04701c 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -58,7 +58,6 @@ config SPARC32 select HAVE_UID16 select OLD_SIGACTION select ZONE_DMA - select SET_FS config SPARC64 def_bool 64BIT diff --git a/arch/sparc/include/asm/processor_32.h b/arch/sparc/include/asm/processor_32.h index 647bf0ac7beb..b26c35336b51 100644 --- a/arch/sparc/include/asm/processor_32.h +++ b/arch/sparc/include/asm/processor_32.h @@ -32,10 +32,6 @@ struct fpq { }; #endif -typedef struct { - int seg; -} mm_segment_t; - /* The Sparc processor specific thread struct. 
*/ struct thread_struct { struct pt_regs *kregs; @@ -50,11 +46,9 @@ struct thread_struct { unsigned long fsr; unsigned long fpqdepth; struct fpq fpqueue[16]; - mm_segment_t current_ds; }; #define INIT_THREAD { \ - .current_ds = KERNEL_DS, \ .kregs = (struct pt_regs *)(init_stack+THREAD_SIZE)-1 \ } diff --git a/arch/sparc/include/asm/uaccess_32.h b/arch/sparc/include/asm/uaccess_32.h index 367747116260..9fd6c53644b6 100644 --- a/arch/sparc/include/asm/uaccess_32.h +++ b/arch/sparc/include/asm/uaccess_32.h @@ -12,19 +12,6 @@ #include #include - -/* Sparc is not segmented, however we need to be able to fool access_ok() - * when doing system calls from kernel mode legitimately. - * - * "For historical reasons, these macros are grossly misnamed." -Linus - */ - -#define KERNEL_DS ((mm_segment_t) { 0 }) -#define USER_DS ((mm_segment_t) { -1 }) - -#define get_fs() (current->thread.current_ds) -#define set_fs(val) ((current->thread.current_ds) = (val)) - #include /* Uh, these should become the main single-value transfer routines.. 
diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 2dc0bf9fe62e..88c0c14aaff0 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -300,7 +300,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, extern int nwindows; unsigned long psr; memset(new_stack, 0, STACKFRAME_SZ + TRACEREG_SZ); - p->thread.current_ds = KERNEL_DS; ti->kpc = (((unsigned long) ret_from_kernel_thread) - 0x8); childregs->u_regs[UREG_G1] = sp; /* function */ childregs->u_regs[UREG_G2] = arg; @@ -311,7 +310,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, } memcpy(new_stack, (char *)regs - STACKFRAME_SZ, STACKFRAME_SZ + TRACEREG_SZ); childregs->u_regs[UREG_FP] = sp; - p->thread.current_ds = USER_DS; ti->kpc = (((unsigned long) ret_from_fork) - 0x8); ti->kpsr = current->thread.fork_kpsr | PSR_PIL; ti->kwim = current->thread.fork_kwim; diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 8ac599aa6d99..09f7616a0b46 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -40,7 +40,6 @@ config XTENSA select IRQ_DOMAIN select MODULES_USE_ELF_RELA select PERF_USE_VMALLOC - select SET_FS select TRACE_IRQFLAGS_SUPPORT select VIRT_TO_BUS help diff --git a/arch/xtensa/include/asm/asm-uaccess.h b/arch/xtensa/include/asm/asm-uaccess.h index 7f6cf4151843..7cec869136e3 100644 --- a/arch/xtensa/include/asm/asm-uaccess.h +++ b/arch/xtensa/include/asm/asm-uaccess.h @@ -23,76 +23,6 @@ #include #include -/* - * These assembly macros mirror the C macros in asm/uaccess.h. They - * should always have identical functionality. See - * arch/xtensa/kernel/sys.S for usage. - */ - -#define KERNEL_DS 0 -#define USER_DS 1 - -/* - * get_fs reads current->thread.current_ds into a register. 
- * On Entry: - * anything - * stack - * On Exit: - * contains current->thread.current_ds - */ - .macro get_fs ad, sp - GET_CURRENT(\ad,\sp) -#if THREAD_CURRENT_DS > 1020 - addi \ad, \ad, TASK_THREAD - l32i \ad, \ad, THREAD_CURRENT_DS - TASK_THREAD -#else - l32i \ad, \ad, THREAD_CURRENT_DS -#endif - .endm - -/* - * set_fs sets current->thread.current_ds to some value. - * On Entry: - * anything (temp register) - * value to write - * stack - * On Exit: - * destroyed (actually, current) - * preserved, value to write - */ - .macro set_fs at, av, sp - GET_CURRENT(\at,\sp) - s32i \av, \at, THREAD_CURRENT_DS - .endm - -/* - * kernel_ok determines whether we should bypass addr/size checking. - * See the equivalent C-macro version below for clarity. - * On success, kernel_ok branches to a label indicated by parameter - * . This implies that the macro falls through to the next - * insruction on an error. - * - * Note that while this macro can be used independently, we designed - * in for optimal use in the access_ok macro below (i.e., we fall - * through on error). - * - * On Entry: - * anything (temp register) - * label to branch to on success; implies - * fall-through macro on error - * stack pointer - * On Exit: - * destroyed (actually, current->thread.current_ds) - */ - -#if ((KERNEL_DS != 0) || (USER_DS == 0)) -# error Assembly macro kernel_ok fails -#endif - .macro kernel_ok at, sp, success - get_fs \at, \sp - beqz \at, \success - .endm - /* * user_ok determines whether the access to user-space memory is allowed. * See the equivalent C-macro version below for clarity. 
@@ -147,7 +77,6 @@ * destroyed */ .macro access_ok aa, as, at, sp, error - kernel_ok \at, \sp, .Laccess_ok_\@ user_ok \aa, \as, \at, \error .Laccess_ok_\@: .endm diff --git a/arch/xtensa/include/asm/processor.h b/arch/xtensa/include/asm/processor.h index 37d3e9887fe7..abad7c3df46f 100644 --- a/arch/xtensa/include/asm/processor.h +++ b/arch/xtensa/include/asm/processor.h @@ -152,18 +152,12 @@ */ #define SPILL_SLOT_CALL12(sp, reg) (*(((unsigned long *)(sp)) - 16 + (reg))) -typedef struct { - unsigned long seg; -} mm_segment_t; - struct thread_struct { /* kernel's return address and stack pointer for context switching */ unsigned long ra; /* kernel's a0: return address and window call size */ unsigned long sp; /* kernel's a1: stack pointer */ - mm_segment_t current_ds; /* see uaccess.h for example uses */ - /* struct xtensa_cpuinfo info; */ unsigned long bad_vaddr; /* last user fault */ @@ -186,7 +180,6 @@ struct thread_struct { { \ ra: 0, \ sp: sizeof(init_stack) + (long) &init_stack, \ - current_ds: {0}, \ /*info: {0}, */ \ bad_vaddr: 0, \ bad_uaddr: 0, \ diff --git a/arch/xtensa/include/asm/thread_info.h b/arch/xtensa/include/asm/thread_info.h index a312333a9add..f6fcbba1d02f 100644 --- a/arch/xtensa/include/asm/thread_info.h +++ b/arch/xtensa/include/asm/thread_info.h @@ -52,8 +52,6 @@ struct thread_info { __u32 cpu; /* current CPU */ __s32 preempt_count; /* 0 => preemptable,< 0 => BUG*/ - mm_segment_t addr_limit; /* thread address space */ - unsigned long cpenable; #if XCHAL_HAVE_EXCLUSIVE /* result of the most recent exclusive store */ @@ -81,7 +79,6 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .addr_limit = KERNEL_DS, \ } /* how to get the thread information struct from C */ diff --git a/arch/xtensa/include/asm/uaccess.h b/arch/xtensa/include/asm/uaccess.h index 0edd9e4b23d0..56aec6d504fe 100644 --- a/arch/xtensa/include/asm/uaccess.h +++ b/arch/xtensa/include/asm/uaccess.h @@ -19,22 +19,6 @@ #include #include 
#include - -/* - * The fs value determines whether argument validity checking should - * be performed or not. If get_fs() == USER_DS, checking is - * performed, with get_fs() == KERNEL_DS, checking is bypassed. - * - * For historical reasons (Data Segment Register?), these macros are - * grossly misnamed. - */ - -#define KERNEL_DS ((mm_segment_t) { 0 }) -#define USER_DS ((mm_segment_t) { 1 }) - -#define get_fs() (current->thread.current_ds) -#define set_fs(val) (current->thread.current_ds = (val)) - #include /* diff --git a/arch/xtensa/kernel/asm-offsets.c b/arch/xtensa/kernel/asm-offsets.c index dc5c83cad9be..f1fd1390d069 100644 --- a/arch/xtensa/kernel/asm-offsets.c +++ b/arch/xtensa/kernel/asm-offsets.c @@ -87,7 +87,6 @@ int main(void) OFFSET(TI_STSTUS, thread_info, status); OFFSET(TI_CPU, thread_info, cpu); OFFSET(TI_PRE_COUNT, thread_info, preempt_count); - OFFSET(TI_ADDR_LIMIT, thread_info, addr_limit); /* struct thread_info (offset from start_struct) */ DEFINE(THREAD_RA, offsetof (struct task_struct, thread.ra)); @@ -108,8 +107,6 @@ int main(void) #endif DEFINE(THREAD_XTREGS_USER, offsetof (struct thread_info, xtregs_user)); DEFINE(XTREGS_USER_SIZE, sizeof(xtregs_user_t)); - DEFINE(THREAD_CURRENT_DS, offsetof (struct task_struct, \ - thread.current_ds)); /* struct mm_struct */ DEFINE(MM_USERS, offsetof(struct mm_struct, mm_users)); diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c index 614adb510dbd..2a918aeb0af1 100644 --- a/drivers/hid/uhid.c +++ b/drivers/hid/uhid.c @@ -747,7 +747,7 @@ static ssize_t uhid_char_write(struct file *file, const char __user *buffer, * copied from, so it's unsafe to allow this with elevated * privileges (e.g. from a setuid binary) or via kernel_write(). 
*/ - if (file->f_cred != current_cred() || uaccess_kernel()) { + if (file->f_cred != current_cred()) { pr_err_once("UHID_CREATE from different security context by process %d (%s), this is not allowed.\n", task_tgid_vnr(current), current->comm); ret = -EACCES; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 6b43e97bd417..aaa2376b9d34 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -224,11 +224,6 @@ static int sg_check_file_access(struct file *filp, const char *caller) caller, task_tgid_vnr(current), current->comm); return -EPERM; } - if (uaccess_kernel()) { - pr_err_once("%s: process %d (%s) called from kernel context, this is not allowed.\n", - caller, task_tgid_vnr(current), current->comm); - return -EACCES; - } return 0; } diff --git a/fs/exec.c b/fs/exec.c index 79f2c9483302..bc68a0c089ac 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1303,12 +1303,6 @@ int begin_new_exec(struct linux_binprm * bprm) if (retval) goto out_unlock; - /* - * Ensure that the uaccess routines can actually operate on userspace - * pointers: - */ - force_uaccess_begin(); - if (me->flags & PF_KTHREAD) free_kthread_struct(me); me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | diff --git a/include/asm-generic/access_ok.h b/include/asm-generic/access_ok.h index d38cc5dad65b..2866ae61b1cd 100644 --- a/include/asm-generic/access_ok.h +++ b/include/asm-generic/access_ok.h @@ -16,18 +16,6 @@ #define TASK_SIZE_MAX TASK_SIZE #endif -#ifndef uaccess_kernel -#ifdef CONFIG_SET_FS -#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) -#else -#define uaccess_kernel() (0) -#endif -#endif - -#ifndef user_addr_max -#define user_addr_max() (uaccess_kernel() ? 
~0UL : TASK_SIZE_MAX) -#endif - #ifndef __access_ok /* * 'size' is a compile-time constant for most callers, so optimize for @@ -42,7 +30,7 @@ */ static inline int __access_ok(const void __user *ptr, unsigned long size) { - unsigned long limit = user_addr_max(); + unsigned long limit = TASK_SIZE_MAX; unsigned long addr = (unsigned long)ptr; if (IS_ENABLED(CONFIG_ALTERNATE_USER_ADDRESS_SPACE) || diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h index ebc685dc8d74..a5be9e61a2a2 100644 --- a/include/asm-generic/uaccess.h +++ b/include/asm-generic/uaccess.h @@ -8,6 +8,7 @@ * address space, e.g. all NOMMU machines. */ #include +#include #ifdef CONFIG_UACCESS_MEMCPY #include @@ -94,30 +95,6 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n) #define INLINE_COPY_TO_USER #endif /* CONFIG_UACCESS_MEMCPY */ -#ifdef CONFIG_SET_FS -#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) - -#ifndef KERNEL_DS -#define KERNEL_DS MAKE_MM_SEG(~0UL) -#endif - -#ifndef USER_DS -#define USER_DS MAKE_MM_SEG(TASK_SIZE - 1) -#endif - -#ifndef get_fs -#define get_fs() (current_thread_info()->addr_limit) - -static inline void set_fs(mm_segment_t fs) -{ - current_thread_info()->addr_limit = fs; -} -#endif - -#endif /* CONFIG_SET_FS */ - -#include - /* * These are the main single-value transfer routines. They automatically * use the right size if we just have the right pointer type. 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 819c0cb00b6d..a34b0f9a9972 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -290,10 +290,6 @@ static inline void addr_limit_user_check(void) return; #endif - if (CHECK_DATA_CORRUPTION(uaccess_kernel(), - "Invalid address limit on user-mode return")) - force_sig(SIGKILL); - #ifdef TIF_FSCHECK clear_thread_flag(TIF_FSCHECK); #endif diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 2c31667e62e0..2421a41f3a8e 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -10,39 +10,6 @@ #include -#ifdef CONFIG_SET_FS -/* - * Force the uaccess routines to be wired up for actual userspace access, - * overriding any possible set_fs(KERNEL_DS) still lingering around. Undone - * using force_uaccess_end below. - */ -static inline mm_segment_t force_uaccess_begin(void) -{ - mm_segment_t fs = get_fs(); - - set_fs(USER_DS); - return fs; -} - -static inline void force_uaccess_end(mm_segment_t oldfs) -{ - set_fs(oldfs); -} -#else /* CONFIG_SET_FS */ -typedef struct { - /* empty dummy */ -} mm_segment_t; - -static inline mm_segment_t force_uaccess_begin(void) -{ - return (mm_segment_t) { }; -} - -static inline void force_uaccess_end(mm_segment_t oldfs) -{ -} -#endif /* CONFIG_SET_FS */ - /* * Architectures should provide two primitives (raw_copy_{to,from}_user()) * and get rid of their private instances of copy_{to,from}_user() and diff --git a/include/rdma/ib.h b/include/rdma/ib.h index 83139b9ce409..f7c185ff7a11 100644 --- a/include/rdma/ib.h +++ b/include/rdma/ib.h @@ -75,7 +75,7 @@ struct sockaddr_ib { */ static inline bool ib_safe_file_access(struct file *filp) { - return filp->f_cred == current_cred() && !uaccess_kernel(); + return filp->f_cred == current_cred(); } #endif /* _RDMA_IB_H */ diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 58cbe357fb2b..1273be84392c 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c 
@@ -209,17 +209,13 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, } if (regs) { - mm_segment_t fs; - if (crosstask) goto exit_put; if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); - fs = force_uaccess_begin(); perf_callchain_user(&ctx, regs); - force_uaccess_end(fs); } } diff --git a/kernel/events/core.c b/kernel/events/core.c index 57c7197838db..11ca7303d6df 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6746,7 +6746,6 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, unsigned long sp; unsigned int rem; u64 dyn_size; - mm_segment_t fs; /* * We dump: @@ -6764,9 +6763,7 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, /* Data. */ sp = perf_user_stack_pointer(regs); - fs = force_uaccess_begin(); rem = __output_copy_user(handle, (void *) sp, dump_size); - force_uaccess_end(fs); dyn_size = dump_size - rem; perf_output_skip(handle, rem); diff --git a/kernel/exit.c b/kernel/exit.c index b00a25bb4ab9..0884a75bc2f8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -737,20 +737,6 @@ void __noreturn do_exit(long code) WARN_ON(blk_needs_flush_plug(tsk)); - /* - * If do_dead is called because this processes oopsed, it's possible - * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before - * continuing. Amongst other possible reasons, this is to prevent - * mm_release()->clear_child_tid() from writing to a user-controlled - * kernel address. - * - * On uptodate architectures force_uaccess_begin is a noop. On - * architectures that still have set_fs/get_fs in addition to handling - * oopses handles kernel threads that run as set_fs(KERNEL_DS) by - * default. 
- */ - force_uaccess_begin(); - kcov_task_exit(tsk); coredump_task_exit(tsk); diff --git a/kernel/kthread.c b/kernel/kthread.c index 38c6dd822da8..16c2275d4b50 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -55,7 +55,6 @@ struct kthread { int result; int (*threadfn)(void *); void *data; - mm_segment_t oldfs; struct completion parked; struct completion exited; #ifdef CONFIG_BLK_CGROUP @@ -1441,8 +1440,6 @@ void kthread_use_mm(struct mm_struct *mm) mmdrop(active_mm); else smp_mb(); - - to_kthread(tsk)->oldfs = force_uaccess_begin(); } EXPORT_SYMBOL_GPL(kthread_use_mm); @@ -1457,8 +1454,6 @@ void kthread_unuse_mm(struct mm_struct *mm) WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); - force_uaccess_end(to_kthread(tsk)->oldfs); - task_lock(tsk); /* * When a kthread stops operating on an address space, the loop diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 9c625257023d..9ed5ce989415 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -226,15 +226,12 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) .store = store, .size = size, }; - mm_segment_t fs; /* Trace user stack if not a kernel thread */ if (current->flags & PF_KTHREAD) return 0; - fs = force_uaccess_begin(); arch_stack_walk_user(consume_entry, &c, task_pt_regs(current)); - force_uaccess_end(fs); return c.len; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 21aa30644219..8115fff17018 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -332,8 +332,6 @@ BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src, if (unlikely(in_interrupt() || current->flags & (PF_KTHREAD | PF_EXITING))) return -EPERM; - if (unlikely(uaccess_kernel())) - return -EPERM; if (unlikely(!nmi_uaccess_okay())) return -EPERM; @@ -835,8 +833,6 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type) */ if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING))) return -EPERM; - if 
(unlikely(uaccess_kernel())) - return -EPERM; if (unlikely(!nmi_uaccess_okay())) return -EPERM; diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 122d8d0e253c..08fc72d3ed16 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -120,7 +120,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count) if (unlikely(count <= 0)) return 0; - max_addr = user_addr_max(); + max_addr = TASK_SIZE_MAX; src_addr = (unsigned long)untagged_addr(src); if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 1616710b8a82..bffa0ebf9f8b 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -96,7 +96,7 @@ long strnlen_user(const char __user *str, long count) if (unlikely(count <= 0)) return 0; - max_addr = user_addr_max(); + max_addr = TASK_SIZE_MAX; src_addr = (unsigned long)untagged_addr(str); if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; diff --git a/mm/maccess.c b/mm/maccess.c index cbd1b3959af2..106820b33a2b 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -113,14 +113,11 @@ Efault: long copy_from_user_nofault(void *dst, const void __user *src, size_t size) { long ret = -EFAULT; - mm_segment_t old_fs = force_uaccess_begin(); - if (access_ok(src, size)) { pagefault_disable(); ret = __copy_from_user_inatomic(dst, src, size); pagefault_enable(); } - force_uaccess_end(old_fs); if (ret) return -EFAULT; @@ -140,14 +137,12 @@ EXPORT_SYMBOL_GPL(copy_from_user_nofault); long copy_to_user_nofault(void __user *dst, const void *src, size_t size) { long ret = -EFAULT; - mm_segment_t old_fs = force_uaccess_begin(); if (access_ok(dst, size)) { pagefault_disable(); ret = __copy_to_user_inatomic(dst, src, size); pagefault_enable(); } - force_uaccess_end(old_fs); if (ret) return -EFAULT; @@ -176,17 +171,14 @@ EXPORT_SYMBOL_GPL(copy_to_user_nofault); long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count) { - 
mm_segment_t old_fs; long ret; if (unlikely(count <= 0)) return 0; - old_fs = force_uaccess_begin(); pagefault_disable(); ret = strncpy_from_user(dst, unsafe_addr, count); pagefault_enable(); - force_uaccess_end(old_fs); if (ret >= count) { ret = count; @@ -216,14 +208,11 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, */ long strnlen_user_nofault(const void __user *unsafe_addr, long count) { - mm_segment_t old_fs; int ret; - old_fs = force_uaccess_begin(); pagefault_disable(); ret = strnlen_user(unsafe_addr, count); pagefault_enable(); - force_uaccess_end(old_fs); return ret; } diff --git a/mm/memory.c b/mm/memory.c index c125c4969913..9a6ebf68a846 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5256,14 +5256,6 @@ void print_vma_addr(char *prefix, unsigned long ip) #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP) void __might_fault(const char *file, int line) { - /* - * Some code (nfs/sunrpc) uses socket ops on kernel memory while - * holding the mmap_lock, this is safe because kernel memory doesn't - * get paged out, therefore we'll never actually fault, and the - * below annotations will generate false positives. 
- */ - if (uaccess_kernel()) - return; if (pagefault_disabled()) return; __might_sleep(file, line); diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index 51a941b56ec3..422ec6e7ccff 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -70,7 +70,7 @@ static int bpfilter_process_sockopt(struct sock *sk, int optname, .addr = (uintptr_t)optval.user, .len = optlen, }; - if (uaccess_kernel() || sockptr_is_kernel(optval)) { + if (sockptr_is_kernel(optval)) { pr_err("kernel access not supported\n"); return -EFAULT; } -- cgit v1.2.3 From 4c4559b43c5fbe1af5b55212c3f411781da2093d Mon Sep 17 00:00:00 2001 From: Marijn Suijten Date: Wed, 2 Feb 2022 11:05:27 +0100 Subject: config: android-recommended: Don't explicitly disable CONFIG_AIO Android nowadays (for a couple of years already) requires AIO for at least its `adb` "Android Debug Bridge" [1]. Without this config option (`default y`) it simply refuses to start, making users unable to connect to their phone for debugging purposes when using these kernel fragments. 
[1]: https://cs.android.com/android/_/android/platform/packages/modules/adb/+/a2cb8de5e68067a5e1d002886d5f3b42d91371e1 Cc: Amit Pundir Cc: Greg Kroah-Hartman Cc: John Stultz Signed-off-by: Marijn Suijten Link: https://lore.kernel.org/r/20220202100528.190794-1-marijn.suijten@somainline.org Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-recommended.config | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index eb0029c9a6a6..22bd76e43aca 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -1,5 +1,4 @@ # KEEP ALPHABETICALLY SORTED -# CONFIG_AIO is not set # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_LEGACY_PTYS is not set -- cgit v1.2.3 From 3bdd6d5ad5f8f6719cc761127df4af64ff163c79 Mon Sep 17 00:00:00 2001 From: Marijn Suijten Date: Wed, 2 Feb 2022 11:05:28 +0100 Subject: config: android-recommended: Disable BPF_UNPRIV_DEFAULT_OFF for netd AOSP's `netd` process fails to start on Android S: E ClatdController: getClatEgress4MapFd() failure: Operation not permitted I netd : Initializing ClatdController: 410us E netd : Failed to start trafficcontroller: (Status[code: 1, msg: "Pinned map not accessible or does not exist: (/sys/fs/bpf/map_netd_cookie_tag_map): Operation not permitted"]) E netd : CRITICAL: sleeping 60 seconds, netd exiting with failure, crash loop likely! And on Android R: I ClatdController: 4.9+ kernel and device shipped with P - clat ebpf might work. 
E ClatdController: getClatEgressMapFd() failure: Operation not permitted I netd : Initializing ClatdController: 1409us E netd : Failed to start trafficcontroller: (Status[code: 1, msg: "Pinned map not accessible or does not exist: (/sys/fs/bpf/map_netd_cookie_tag_map): Operation not permitted"]) These permission issues are caused by 08389d888287 ("bpf: Add kconfig knob for disabling unpriv bpf by default") because AOSP does not provide netd the `SYS_ADMIN` capability, and also has no userspace support for the `BPF` capability yet. Cc: Amit Pundir Cc: Greg Kroah-Hartman Suggested-by: John Stultz [John suggested this in https://linaro.atlassian.net/browse/ACK-107?focusedCommentId=117382] Signed-off-by: Marijn Suijten Link: https://lore.kernel.org/r/20220202100528.190794-2-marijn.suijten@somainline.org Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-recommended.config | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index 22bd76e43aca..e400fbbc8aba 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -1,4 +1,5 @@ # KEEP ALPHABETICALLY SORTED +# CONFIG_BPF_UNPRIV_DEFAULT_OFF is not set # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_LEGACY_PTYS is not set -- cgit v1.2.3 From 0ac983f512033cb7b5e210c9589768ad25b1e36b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 24 Feb 2022 08:32:28 -0600 Subject: ucounts: Fix systemd LimitNPROC with private users regression Long story short recursively enforcing RLIMIT_NPROC when it is not enforced on the process that creates a new user namespace, causes currently working code to fail. There is no reason to enforce RLIMIT_NPROC recursively when we don't enforce it normally so update the code to detect this case. 
I would like to simply use capable(CAP_SYS_RESOURCE) to detect when RLIMIT_NPROC is not enforced upon the caller. Unfortunately because RLIMIT_NPROC is charged and checked for enforcement based upon the real uid, using capable() which is euid based is inconsistent with reality. Come as close as possible to testing for capable(CAP_SYS_RESOURCE) by testing for when the real uid would match the conditions when CAP_SYS_RESOURCE would be present if the real uid was the effective uid. Reported-by: Etienne Dechamps Link: https://bugzilla.kernel.org/show_bug.cgi?id=215596 Link: https://lkml.kernel.org/r/e9589141-cfeb-90cd-2d0e-83a62787239a@edechamps.fr Link: https://lkml.kernel.org/r/87sfs8jmpz.fsf_-_@email.froward.int.ebiederm.org Cc: stable@vger.kernel.org Fixes: 21d1c5e386bc ("Reimplement RLIMIT_NPROC on top of ucounts") Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- kernel/user_namespace.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 6b2e3ca7ee99..5481ba44a8d6 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -58,6 +58,18 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) cred->user_ns = user_ns; } +static unsigned long enforced_nproc_rlimit(void) +{ + unsigned long limit = RLIM_INFINITY; + + /* Is RLIMIT_NPROC currently enforced? 
*/ + if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) || + (current_user_ns() != &init_user_ns)) + limit = rlimit(RLIMIT_NPROC); + + return limit; +} + /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the @@ -122,7 +134,7 @@ int create_user_ns(struct cred *new) for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) { ns->ucount_max[i] = INT_MAX; } - set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)); + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit()); set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); -- cgit v1.2.3 From 302e9edd54985f584cfc180098f3554774126969 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 23 Feb 2022 22:38:37 -0500 Subject: tracing: Have traceon and traceoff trigger honor the instance If a trigger is set on an event to disable or enable tracing within an instance, then tracing should be disabled or enabled in the instance and not at the top level, which is confusing to users. 
Link: https://lkml.kernel.org/r/20220223223837.14f94ec3@rorschach.local.home Cc: stable@vger.kernel.org Fixes: ae63b31e4d0e2 ("tracing: Separate out trace events from global variables") Tested-by: Daniel Bristot de Oliveira Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_trigger.c | 52 ++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index e0d50c9577f3..efe563140f27 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1295,6 +1295,16 @@ traceon_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { + struct trace_event_file *file = data->private_data; + + if (file) { + if (tracer_tracing_is_on(file->tr)) + return; + + tracer_tracing_on(file->tr); + return; + } + if (tracing_is_on()) return; @@ -1306,8 +1316,15 @@ traceon_count_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { - if (tracing_is_on()) - return; + struct trace_event_file *file = data->private_data; + + if (file) { + if (tracer_tracing_is_on(file->tr)) + return; + } else { + if (tracing_is_on()) + return; + } if (!data->count) return; @@ -1315,7 +1332,10 @@ traceon_count_trigger(struct event_trigger_data *data, if (data->count != -1) (data->count)--; - tracing_on(); + if (file) + tracer_tracing_on(file->tr); + else + tracing_on(); } static void @@ -1323,6 +1343,16 @@ traceoff_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { + struct trace_event_file *file = data->private_data; + + if (file) { + if (!tracer_tracing_is_on(file->tr)) + return; + + tracer_tracing_off(file->tr); + return; + } + if (!tracing_is_on()) return; @@ -1334,8 +1364,15 @@ traceoff_count_trigger(struct 
event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { - if (!tracing_is_on()) - return; + struct trace_event_file *file = data->private_data; + + if (file) { + if (!tracer_tracing_is_on(file->tr)) + return; + } else { + if (!tracing_is_on()) + return; + } if (!data->count) return; @@ -1343,7 +1380,10 @@ traceoff_count_trigger(struct event_trigger_data *data, if (data->count != -1) (data->count)--; - tracing_off(); + if (file) + tracer_tracing_off(file->tr); + else + tracing_off(); } static int -- cgit v1.2.3 From b61edd57740de5895f44f2ea417b164d9e1708bb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 18 Feb 2022 19:00:57 -0500 Subject: eprobes: Remove redundant event type information Currently, the event probes save the type of the event they are attached to when recording the event. For example: # echo 'e:switch sched/sched_switch prev_state=$prev_state prev_prio=$prev_prio next_pid=$next_pid next_prio=$next_prio' > dynamic_events # cat events/eprobes/switch/format name: switch ID: 1717 format: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1; signed:0; field:int common_pid; offset:4; size:4; signed:1; field:unsigned int __probe_type; offset:8; size:4; signed:0; field:u64 prev_state; offset:12; size:8; signed:0; field:u64 prev_prio; offset:20; size:8; signed:0; field:u64 next_pid; offset:28; size:8; signed:0; field:u64 next_prio; offset:36; size:8; signed:0; print fmt: "(%u) prev_state=0x%Lx prev_prio=0x%Lx next_pid=0x%Lx next_prio=0x%Lx", REC->__probe_type, REC->prev_state, REC->prev_prio, REC->next_pid, REC->next_prio The __probe_type adds 4 bytes to every event. One of the reasons for creating eprobes is to limit what is traced in an event to be able to limit what is written into the ring buffer. Having this redundant 4 bytes to every event takes away from this. 
The event that is recorded can be retrieved from the event probe itself, that is available when the trace is happening. For user space tools, it could simply read the dynamic_event file to find the event they are for. So there is really no reason to write this information into the ring buffer for every event. Link: https://lkml.kernel.org/r/20220218190057.2f5a19a8@gandalf.local.home Acked-by: Masami Hiramatsu Reviewed-by: Joel Fernandes Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 1 - kernel/trace/trace_eprobe.c | 16 +++++++--------- kernel/trace/trace_probe.c | 10 +++++----- kernel/trace/trace_probe.h | 1 - 4 files changed, 12 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d038ddbf1bea..c5b09c31e077 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -136,7 +136,6 @@ struct kprobe_trace_entry_head { struct eprobe_trace_entry_head { struct trace_entry ent; - unsigned int type; }; struct kretprobe_trace_entry_head { diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 191db32dec46..541aa13581b9 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -242,7 +242,6 @@ static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i) static int eprobe_event_define_fields(struct trace_event_call *event_call) { - int ret; struct eprobe_trace_entry_head field; struct trace_probe *tp; @@ -250,8 +249,6 @@ static int eprobe_event_define_fields(struct trace_event_call *event_call) if (WARN_ON_ONCE(!tp)) return -ENOENT; - DEFINE_FIELD(unsigned int, type, FIELD_STRING_TYPE, 0); - return traceprobe_define_arg_fields(event_call, sizeof(field), tp); } @@ -270,7 +267,9 @@ print_eprobe_event(struct trace_iterator *iter, int flags, struct trace_event_call *pevent; struct trace_event *probed_event; struct trace_seq *s = &iter->seq; + struct trace_eprobe *ep; struct trace_probe *tp; + unsigned int type; field = (struct 
eprobe_trace_entry_head *)iter->ent; tp = trace_probe_primary_from_call( @@ -278,15 +277,18 @@ print_eprobe_event(struct trace_iterator *iter, int flags, if (WARN_ON_ONCE(!tp)) goto out; + ep = container_of(tp, struct trace_eprobe, tp); + type = ep->event->event.type; + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); - probed_event = ftrace_find_event(field->type); + probed_event = ftrace_find_event(type); if (probed_event) { pevent = container_of(probed_event, struct trace_event_call, event); trace_seq_printf(s, "%s.%s", pevent->class->system, trace_event_name(pevent)); } else { - trace_seq_printf(s, "%u", field->type); + trace_seq_printf(s, "%u", type); } trace_seq_putc(s, ')'); @@ -498,10 +500,6 @@ __eprobe_trace_func(struct eprobe_data *edata, void *rec) return; entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); - if (edata->ep->event) - entry->type = edata->ep->event->event.type; - else - entry->type = 0; store_trace_args(&entry[1], &edata->ep->tp, rec, sizeof(*entry), dsize); trace_event_buffer_commit(&fbuffer); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 73d90179b51b..80863c6508e5 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -871,15 +871,15 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, switch (ptype) { case PROBE_PRINT_NORMAL: fmt = "(%lx)"; - arg = "REC->" FIELD_STRING_IP; + arg = ", REC->" FIELD_STRING_IP; break; case PROBE_PRINT_RETURN: fmt = "(%lx <- %lx)"; - arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; + arg = ", REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; break; case PROBE_PRINT_EVENT: - fmt = "(%u)"; - arg = "REC->" FIELD_STRING_TYPE; + fmt = ""; + arg = ""; break; default: WARN_ON_ONCE(1); @@ -903,7 +903,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, parg->type->fmt); } - pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); + pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", arg); for (i = 
0; i < tp->nr_args; i++) { parg = tp->args + i; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 99e7a5df025e..92cc149af0fd 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -38,7 +38,6 @@ #define FIELD_STRING_IP "__probe_ip" #define FIELD_STRING_RETIP "__probe_ret_ip" #define FIELD_STRING_FUNC "__probe_func" -#define FIELD_STRING_TYPE "__probe_type" #undef DEFINE_FIELD #define DEFINE_FIELD(type, item, name, is_signed) \ -- cgit v1.2.3 From bc82c38a6933aab308387d4aca47e0a05de7b553 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 11 Feb 2022 08:10:18 +0100 Subject: tracing: Uninline trace_trigger_soft_disabled() partly On a powerpc32 build with CONFIG_CC_OPTIMISE_FOR_SIZE, the inline keyword is not honored and trace_trigger_soft_disabled() appears approx 50 times in vmlinux. Adding -Winline to the build, the following message appears: ./include/linux/trace_events.h:712:1: error: inlining failed in call to 'trace_trigger_soft_disabled': call is unlikely and code size would grow [-Werror=inline] That function is rather big for an inlined function: c003df60 : c003df60: 94 21 ff f0 stwu r1,-16(r1) c003df64: 7c 08 02 a6 mflr r0 c003df68: 90 01 00 14 stw r0,20(r1) c003df6c: bf c1 00 08 stmw r30,8(r1) c003df70: 83 e3 00 24 lwz r31,36(r3) c003df74: 73 e9 01 00 andi. r9,r31,256 c003df78: 41 82 00 10 beq c003df88 c003df7c: 38 60 00 00 li r3,0 c003df80: 39 61 00 10 addi r11,r1,16 c003df84: 4b fd 60 ac b c0014030 <_rest32gpr_30_x> c003df88: 73 e9 00 80 andi. r9,r31,128 c003df8c: 7c 7e 1b 78 mr r30,r3 c003df90: 41 a2 00 14 beq c003dfa4 c003df94: 38 c0 00 00 li r6,0 c003df98: 38 a0 00 00 li r5,0 c003df9c: 38 80 00 00 li r4,0 c003dfa0: 48 05 c5 f1 bl c009a590 c003dfa4: 73 e9 00 40 andi. r9,r31,64 c003dfa8: 40 82 00 28 bne c003dfd0 c003dfac: 73 ff 02 00 andi. 
r31,r31,512 c003dfb0: 41 82 ff cc beq c003df7c c003dfb4: 80 01 00 14 lwz r0,20(r1) c003dfb8: 83 e1 00 0c lwz r31,12(r1) c003dfbc: 7f c3 f3 78 mr r3,r30 c003dfc0: 83 c1 00 08 lwz r30,8(r1) c003dfc4: 7c 08 03 a6 mtlr r0 c003dfc8: 38 21 00 10 addi r1,r1,16 c003dfcc: 48 05 6f 6c b c0094f38 c003dfd0: 38 60 00 01 li r3,1 c003dfd4: 4b ff ff ac b c003df80 However it is located in a hot path so inlining it is important. But forcing inlining of the entire function by using __always_inline leads to increasing the text size by approx 20 kbytes. Instead, split the fonction in two parts, one part with the likely fast path, flagged __always_inline, and a second part out of line. With this change, on a powerpc32 with CONFIG_CC_OPTIMISE_FOR_SIZE vmlinux text increases by only 1,4 kbytes, which is partly compensated by a decrease of vmlinux data by 7 kbytes. On ppc64_defconfig which has CONFIG_CC_OPTIMISE_FOR_SPEED, this change reduces vmlinux text by more than 30 kbytes. Link: https://lkml.kernel.org/r/69ce0986a52d026d381d612801d978aa4f977460.1644563295.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 22 ++++++++++++---------- kernel/trace/trace_events_trigger.c | 14 ++++++++++++++ 2 files changed, 26 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 70c069aef02c..dcea51fb60e2 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -699,6 +699,8 @@ event_triggers_post_call(struct trace_event_file *file, bool trace_event_ignore_this_pid(struct trace_event_file *trace_file); +bool __trace_trigger_soft_disabled(struct trace_event_file *file); + /** * trace_trigger_soft_disabled - do triggers and test if soft disabled * @file: The file pointer of the event to test @@ -708,20 +710,20 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file); * triggers that require testing the 
fields, it will return true, * otherwise false. */ -static inline bool +static __always_inline bool trace_trigger_soft_disabled(struct trace_event_file *file) { unsigned long eflags = file->flags; - if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) { - if (eflags & EVENT_FILE_FL_TRIGGER_MODE) - event_triggers_call(file, NULL, NULL, NULL); - if (eflags & EVENT_FILE_FL_SOFT_DISABLED) - return true; - if (eflags & EVENT_FILE_FL_PID_FILTER) - return trace_event_ignore_this_pid(file); - } - return false; + if (likely(!(eflags & (EVENT_FILE_FL_TRIGGER_MODE | + EVENT_FILE_FL_SOFT_DISABLED | + EVENT_FILE_FL_PID_FILTER)))) + return false; + + if (likely(eflags & EVENT_FILE_FL_TRIGGER_COND)) + return false; + + return __trace_trigger_soft_disabled(file); } #ifdef CONFIG_BPF_EVENTS diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index efe563140f27..7eb9d04f1c2e 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -84,6 +84,20 @@ event_triggers_call(struct trace_event_file *file, } EXPORT_SYMBOL_GPL(event_triggers_call); +bool __trace_trigger_soft_disabled(struct trace_event_file *file) +{ + unsigned long eflags = file->flags; + + if (eflags & EVENT_FILE_FL_TRIGGER_MODE) + event_triggers_call(file, NULL, NULL, NULL); + if (eflags & EVENT_FILE_FL_SOFT_DISABLED) + return true; + if (eflags & EVENT_FILE_FL_PID_FILTER) + return trace_event_ignore_this_pid(file); + return false; +} +EXPORT_SYMBOL_GPL(__trace_trigger_soft_disabled); + /** * event_triggers_post_call - Call 'post_triggers' for a trace event * @file: The trace_event_file associated with the event -- cgit v1.2.3 From 7acf3a127bb7c65ff39099afd78960e77b2ca5de Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 14 Feb 2022 14:44:56 +0100 Subject: tracing: Ensure trace buffer is at least 4096 bytes large Booting the kernel with 'trace_buf_size=1' give a warning at boot during the ftrace selftests: [ 0.892809] Running postponed tracer tests: [ 
0.892893] Testing tracer function: [ 0.901899] Callback from call_rcu_tasks_trace() invoked. [ 0.983829] Callback from call_rcu_tasks_rude() invoked. [ 1.072003] .. bad ring buffer .. corrupted trace buffer .. [ 1.091944] Callback from call_rcu_tasks() invoked. [ 1.097695] PASSED [ 1.097701] Testing dynamic ftrace: .. filter failed count=0 ..FAILED! [ 1.353474] ------------[ cut here ]------------ [ 1.353478] WARNING: CPU: 0 PID: 1 at kernel/trace/trace.c:1951 run_tracer_selftest+0x13c/0x1b0 Therefore enforce a minimum of 4096 bytes to make the selftest pass. Link: https://lkml.kernel.org/r/20220214134456.1751749-1-svens@linux.ibm.com Signed-off-by: Sven Schnelle Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7c2578efde26..3050892d1812 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1474,10 +1474,12 @@ static int __init set_buf_size(char *str) if (!str) return 0; buf_size = memparse(str, &str); - /* nr_entries can not be zero */ - if (buf_size == 0) - return 0; - trace_buf_size = buf_size; + /* + * nr_entries can not be zero and the startup + * tests require some buffer space. Therefore + * ensure we have at least 4096 bytes of buffer. + */ + trace_buf_size = max(4096UL, buf_size); return 1; } __setup("trace_buf_size=", set_buf_size); -- cgit v1.2.3 From ab2f993c01f261aa3eeb8842842ff38bff7806b6 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 14 Feb 2022 12:28:47 -0700 Subject: ftrace: Remove unused ftrace_startup_enable() stub When building with clang + CONFIG_DYNAMIC_FTRACE=n + W=1, there is a warning: kernel/trace/ftrace.c:7194:20: error: unused function 'ftrace_startup_enable' [-Werror,-Wunused-function] static inline void ftrace_startup_enable(int command) { } ^ 1 error generated. 
Clang warns on instances of static inline functions in .c files with W=1 after commit 6863f5643dd7 ("kbuild: allow Clang to find unused static inline functions for W=1 build"). The ftrace_startup_enable() stub has been unused since commit e1effa0144a1 ("ftrace: Annotate the ops operation on update"), where its use outside of the CONFIG_DYNAMIC_TRACE section was replaced by ftrace_startup_all(). Remove it to resolve the warning. Link: https://lkml.kernel.org/r/20220214192847.488166-1-nathan@kernel.org Reported-by: kernel test robot Signed-off-by: Nathan Chancellor Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f9feb197b2da..a4b462b6f944 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7191,7 +7191,6 @@ static int __init ftrace_nodyn_init(void) core_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; } -static inline void ftrace_startup_enable(int command) { } static inline void ftrace_startup_all(int command) { } # define ftrace_startup_sysctl() do { } while (0) -- cgit v1.2.3 From dd990352f01ee9a6c6eee152e5d11c021caccfe4 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 18 Feb 2022 16:17:38 +0100 Subject: tracing/osnoise: Make osnoise_main to sleep for microseconds osnoise's runtime and period are in the microseconds scale, but it is currently sleeping in the millisecond's scale. This behavior roots in the usage of hwlat as the skeleton for osnoise. Make osnoise to sleep in the microseconds scale. Also, move the sleep to a specialized function. 
Link: https://lkml.kernel.org/r/302aa6c7bdf2d131719b22901905e9da122a11b2.1645197336.git.bristot@kernel.org Cc: Ingo Molnar Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 53 ++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 870a08da5b48..cfddb30e65ab 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1436,6 +1436,37 @@ out: static struct cpumask osnoise_cpumask; static struct cpumask save_cpumask; +/* + * osnoise_sleep - sleep until the next period + */ +static void osnoise_sleep(void) +{ + u64 interval; + ktime_t wake_time; + + mutex_lock(&interface_lock); + interval = osnoise_data.sample_period - osnoise_data.sample_runtime; + mutex_unlock(&interface_lock); + + /* + * differently from hwlat_detector, the osnoise tracer can run + * without a pause because preemption is on. + */ + if (!interval) { + /* Let synchronize_rcu_tasks() make progress */ + cond_resched_tasks_rcu_qs(); + return; + } + + wake_time = ktime_add_us(ktime_get(), interval); + __set_current_state(TASK_INTERRUPTIBLE); + + while (schedule_hrtimeout_range(&wake_time, 0, HRTIMER_MODE_ABS)) { + if (kthread_should_stop()) + break; + } +} + /* * osnoise_main - The osnoise detection kernel thread * @@ -1444,30 +1475,10 @@ static struct cpumask save_cpumask; */ static int osnoise_main(void *data) { - u64 interval; while (!kthread_should_stop()) { - run_osnoise(); - - mutex_lock(&interface_lock); - interval = osnoise_data.sample_period - osnoise_data.sample_runtime; - mutex_unlock(&interface_lock); - - do_div(interval, USEC_PER_MSEC); - - /* - * differently from hwlat_detector, the osnoise tracer can run - * without a pause because preemption is on. 
- */ - if (interval < 1) { - /* Let synchronize_rcu_tasks() make progress */ - cond_resched_tasks_rcu_qs(); - continue; - } - - if (msleep_interruptible(interval)) - break; + osnoise_sleep(); } return 0; -- cgit v1.2.3 From 80bebebdac935473568c27d4f1349dc8f9809bf7 Mon Sep 17 00:00:00 2001 From: Yucong Sun Date: Fri, 25 Feb 2022 10:59:24 -0800 Subject: bpf: Fix issue with bpf preload module taking over stdout/stdin of kernel. In cb80ddc67152 ("bpf: Convert bpf_preload.ko to use light skeleton.") BPF preload was switched from user mode process to use in-kernel light skeleton instead. However, in the kernel context, early in the boot sequence, the first available FD can start from 0, instead of normally 3 for user mode process. So FDs 0 and 1 are then used for loaded BPF programs and prevent init process from setting up stdin/stdout/stderr on FD 0, 1, and 2 as expected. Before the fix: ls -lah /proc/1/fd/* lrwx------1 root root 64 Feb 23 17:20 /proc/1/fd/0 -> /dev/null lrwx------ 1 root root 64 Feb 23 17:20 /proc/1/fd/1 -> /dev/null lrwx------ 1 root root 64 Feb 23 17:20 /proc/1/fd/2 -> /dev/console lrwx------ 1 root root 64 Feb 23 17:20 /proc/1/fd/6 -> /dev/console lrwx------ 1 root root 64 Feb 23 17:20 /proc/1/fd/7 -> /dev/console After the fix: ls -lah /proc/1/fd/* lrwx------ 1 root root 64 Feb 24 21:23 /proc/1/fd/0 -> /dev/console lrwx------ 1 root root 64 Feb 24 21:23 /proc/1/fd/1 -> /dev/console lrwx------ 1 root root 64 Feb 24 21:23 /proc/1/fd/2 -> /dev/console Fix by closing prog FDs after initialization. struct bpf_prog's themselves are kept alive through direct kernel references taken with bpf_link_get_from_fd(). 
Fixes: cb80ddc67152 ("bpf: Convert bpf_preload.ko to use light skeleton.") Signed-off-by: Yucong Sun Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220225185923.2535519-1-fallentree@fb.com --- kernel/bpf/preload/bpf_preload_kern.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c index 30207c048d36..5106b5372f0c 100644 --- a/kernel/bpf/preload/bpf_preload_kern.c +++ b/kernel/bpf/preload/bpf_preload_kern.c @@ -54,6 +54,13 @@ static int load_skel(void) err = PTR_ERR(progs_link); goto out; } + /* Avoid taking over stdin/stdout/stderr of init process. Zeroing out + * makes skel_closenz() a no-op later in iterators_bpf__destroy(). + */ + close_fd(skel->links.dump_bpf_map_fd); + skel->links.dump_bpf_map_fd = 0; + close_fd(skel->links.dump_bpf_prog_fd); + skel->links.dump_bpf_prog_fd = 0; return 0; out: free_links_and_skel(); -- cgit v1.2.3 From c5229a0bd47814770c895e94fbc97ad21819abfe Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 20 Dec 2021 16:38:06 +0000 Subject: tracing: Fix selftest config check for function graph start up test CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS is required to test direct tramp. 
Link: https://lkml.kernel.org/r/bdc7e594e13b0891c1d61bc8d56c94b1890eaed7.1640017960.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_selftest.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index afd937a46496..abcadbe933bb 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -784,9 +784,7 @@ static struct fgraph_ops fgraph_ops __initdata = { .retfunc = &trace_graph_return, }; -#if defined(CONFIG_DYNAMIC_FTRACE) && \ - defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) -#define TEST_DIRECT_TRAMP +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS noinline __noclone static void trace_direct_tramp(void) { } #endif @@ -849,7 +847,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, goto out; } -#ifdef TEST_DIRECT_TRAMP +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS tracing_reset_online_cpus(&tr->array_buffer); set_graph_array(tr); -- cgit v1.2.3 From 5e214f2e43e453d862ebbbd2a4f7ee3fe650f209 Mon Sep 17 00:00:00 2001 From: Connor O'Brien Date: Wed, 23 Feb 2022 01:28:14 +0000 Subject: bpf: Add config to allow loading modules with BTF mismatches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BTF mismatch can occur for a separately-built module even when the ABI is otherwise compatible and nothing else would prevent successfully loading. Add a new Kconfig to control how mismatches are handled. By default, preserve the current behavior of refusing to load the module. If MODULE_ALLOW_BTF_MISMATCH is enabled, load the module but ignore its BTF information. 
Suggested-by: Yonghong Song Suggested-by: Michal Suchánek Signed-off-by: Connor O'Brien Signed-off-by: Daniel Borkmann Acked-by: Shung-Hsi Yu Acked-by: Song Liu Link: https://lore.kernel.org/bpf/CAADnVQJ+OVPnBz8z3vNu8gKXX42jCUqfuvhWAyCQDu8N_yqqwQ@mail.gmail.com Link: https://lore.kernel.org/bpf/20220223012814.1898677-1-connoro@google.com --- kernel/bpf/btf.c | 3 ++- lib/Kconfig.debug | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 0890e56e8b08..b472cf0c8fdb 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6398,7 +6398,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, pr_warn("failed to validate module [%s] BTF: %ld\n", mod->name, PTR_ERR(btf)); kfree(btf_mod); - err = PTR_ERR(btf); + if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) + err = PTR_ERR(btf); goto out; } err = btf_alloc_id(btf); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 1555da672275..72ca4684beda 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -339,6 +339,16 @@ config DEBUG_INFO_BTF_MODULES help Generate compact split BTF type information for kernel modules. +config MODULE_ALLOW_BTF_MISMATCH + bool "Allow loading modules with non-matching BTF type info" + depends on DEBUG_INFO_BTF_MODULES + help + For modules whose split BTF does not match vmlinux, load without + BTF rather than refusing to load. The default behavior with + module BTF enabled is to reject modules with such mismatches; + this option will still load module BTF where possible but ignore + it when a mismatch is found. 
+ config GDB_SCRIPTS bool "Provide GDB scripts for kernel debugging" help -- cgit v1.2.3 From 30939293262eb433c960c4532a0d59c4073b2b84 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 28 Feb 2022 11:43:54 +0800 Subject: blktrace: fix use after free for struct blk_trace When tracing the whole disk, 'dropped' and 'msg' will be created under 'q->debugfs_dir' and 'bt->dir' is NULL, thus blk_trace_free() won't remove those files. What's worse, the following UAF can be triggered because of accessing stale 'dropped' and 'msg': ================================================================== BUG: KASAN: use-after-free in blk_dropped_read+0x89/0x100 Read of size 4 at addr ffff88816912f3d8 by task blktrace/1188 CPU: 27 PID: 1188 Comm: blktrace Not tainted 5.17.0-rc4-next-20220217+ #469 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-4 Call Trace: dump_stack_lvl+0x34/0x44 print_address_description.constprop.0.cold+0xab/0x381 ? blk_dropped_read+0x89/0x100 ? blk_dropped_read+0x89/0x100 kasan_report.cold+0x83/0xdf ? blk_dropped_read+0x89/0x100 kasan_check_range+0x140/0x1b0 blk_dropped_read+0x89/0x100 ? blk_create_buf_file_callback+0x20/0x20 ? kmem_cache_free+0xa1/0x500 ? do_sys_openat2+0x258/0x460 full_proxy_read+0x8f/0xc0 vfs_read+0xc6/0x260 ksys_read+0xb9/0x150 ? vfs_write+0x3d0/0x3d0 ? fpregs_assert_state_consistent+0x55/0x60 ? 
exit_to_user_mode_prepare+0x39/0x1e0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7fbc080d92fd Code: ce 20 00 00 75 10 b8 00 00 00 00 0f 05 48 3d 01 f0 ff ff 73 31 c3 48 83 1 RSP: 002b:00007fbb95ff9cb0 EFLAGS: 00000293 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 00007fbb95ff9dc0 RCX: 00007fbc080d92fd RDX: 0000000000000100 RSI: 00007fbb95ff9cc0 RDI: 0000000000000045 RBP: 0000000000000045 R08: 0000000000406299 R09: 00000000fffffffd R10: 000000000153afa0 R11: 0000000000000293 R12: 00007fbb780008c0 R13: 00007fbb78000938 R14: 0000000000608b30 R15: 00007fbb780029c8 Allocated by task 1050: kasan_save_stack+0x1e/0x40 __kasan_kmalloc+0x81/0xa0 do_blk_trace_setup+0xcb/0x410 __blk_trace_setup+0xac/0x130 blk_trace_ioctl+0xe9/0x1c0 blkdev_ioctl+0xf1/0x390 __x64_sys_ioctl+0xa5/0xe0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae Freed by task 1050: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 kasan_set_free_info+0x20/0x30 __kasan_slab_free+0x103/0x180 kfree+0x9a/0x4c0 __blk_trace_remove+0x53/0x70 blk_trace_ioctl+0x199/0x1c0 blkdev_common_ioctl+0x5e9/0xb30 blkdev_ioctl+0x1a5/0x390 __x64_sys_ioctl+0xa5/0xe0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae The buggy address belongs to the object at ffff88816912f380 which belongs to the cache kmalloc-96 of size 96 The buggy address is located 88 bytes inside of 96-byte region [ffff88816912f380, ffff88816912f3e0) The buggy address belongs to the page: page:000000009a1b4e7c refcount:1 mapcount:0 mapping:0000000000000000 index:0x0f flags: 0x17ffffc0000200(slab|node=0|zone=2|lastcpupid=0x1fffff) raw: 0017ffffc0000200 ffffea00044f1100 dead000000000002 ffff88810004c780 raw: 0000000000000000 0000000000200020 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88816912f280: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ffff88816912f300: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc 
>ffff88816912f380: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ^ ffff88816912f400: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ffff88816912f480: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ================================================================== Fixes: c0ea57608b69 ("blktrace: remove debugfs file dentries from struct blk_trace") Signed-off-by: Yu Kuai Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20220228034354.4047385-1-yukuai3@huawei.com Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index af68a67179b4..21dea90eaa93 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -310,10 +310,20 @@ record_it: local_irq_restore(flags); } -static void blk_trace_free(struct blk_trace *bt) +static void blk_trace_free(struct request_queue *q, struct blk_trace *bt) { relay_close(bt->rchan); - debugfs_remove(bt->dir); + + /* + * If 'bt->dir' is not set, then both 'dropped' and 'msg' are created + * under 'q->debugfs_dir', thus lookup and remove them. 
+ */ + if (!bt->dir) { + debugfs_remove(debugfs_lookup("dropped", q->debugfs_dir)); + debugfs_remove(debugfs_lookup("msg", q->debugfs_dir)); + } else { + debugfs_remove(bt->dir); + } free_percpu(bt->sequence); free_percpu(bt->msg_data); kfree(bt); @@ -335,10 +345,10 @@ static void put_probe_ref(void) mutex_unlock(&blk_probe_mutex); } -static void blk_trace_cleanup(struct blk_trace *bt) +static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt) { synchronize_rcu(); - blk_trace_free(bt); + blk_trace_free(q, bt); put_probe_ref(); } @@ -352,7 +362,7 @@ static int __blk_trace_remove(struct request_queue *q) return -EINVAL; if (bt->trace_state != Blktrace_running) - blk_trace_cleanup(bt); + blk_trace_cleanup(q, bt); return 0; } @@ -572,7 +582,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = 0; err: if (ret) - blk_trace_free(bt); + blk_trace_free(q, bt); return ret; } @@ -1616,7 +1626,7 @@ static int blk_trace_remove_queue(struct request_queue *q) put_probe_ref(); synchronize_rcu(); - blk_trace_free(bt); + blk_trace_free(q, bt); return 0; } @@ -1647,7 +1657,7 @@ static int blk_trace_setup_queue(struct request_queue *q, return 0; free_bt: - blk_trace_free(bt); + blk_trace_free(q, bt); return ret; } -- cgit v1.2.3 From c6ced22997ad56a05377221bded7bb30973a62f2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 25 Jan 2022 10:44:08 -0500 Subject: tracing: Update print fmt check to handle new __get_sockaddr() macro A helper macro was added to make reading socket addresses easier in trace events. It pairs %pISpc with __get_sockaddr() that reads the socket address from the ring buffer into a human readable format. The boot up check that makes sure that trace events do not reference pointers to memory that can later be freed when the trace event is read, incorrectly flagged this as a delayed reference. Update the check to handle "__get_sockaddr" and not report an error on it. 
Link: https://lore.kernel.org/all/20220125160505.068dbb52@canb.auug.org.au/ Reported-by: Stephen Rothwell Signed-off-by: Steven Rostedt (Google) Signed-off-by: Chuck Lever --- kernel/trace/trace_events.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3147614c1812..f527ae807e77 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -384,6 +384,12 @@ static void test_event_printk(struct trace_event_call *call) if (!(dereference_flags & (1ULL << arg))) goto next_arg; + /* Check for __get_sockaddr */; + if (str_has_prefix(fmt + i, "__get_sockaddr(")) { + dereference_flags &= ~(1ULL << arg); + goto next_arg; + } + /* Find the REC-> in the argument */ c = strchr(fmt + i, ','); r = strstr(fmt + i, "REC->"); -- cgit v1.2.3 From f49169c97fceb21ad6a0aaf671c50b0f520f15a5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 16 Feb 2022 12:31:09 -0500 Subject: NFSD: Remove svc_serv_ops::svo_module struct svc_serv_ops is about to be removed. Neil Brown says: > I suspect svo_module can go as well - I don't think the thread is > ever the thing that primarily keeps a module active. A random sample of kthread_create() callers shows sunrpc is the only one that manages module reference count in this way. 
Suggested-by: Neil Brown Signed-off-by: Chuck Lever --- fs/lockd/svc.c | 4 +--- fs/nfs/callback.c | 7 ++----- fs/nfs/nfs4state.c | 1 - fs/nfsd/nfssvc.c | 3 --- include/linux/sunrpc/svc.h | 5 ----- kernel/module.c | 2 +- net/sunrpc/svc.c | 2 -- 7 files changed, 4 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index c83ec4a375bc..bfde31124f3a 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -184,8 +184,7 @@ lockd(void *vrqstp) dprintk("lockd_down: service stopped\n"); svc_exit_thread(rqstp); - - module_put_and_kthread_exit(0); + return 0; } static int create_lockd_listener(struct svc_serv *serv, const char *name, @@ -352,7 +351,6 @@ static struct notifier_block lockd_inet6addr_notifier = { static const struct svc_serv_ops lockd_sv_ops = { .svo_function = lockd, - .svo_module = THIS_MODULE, }; static int lockd_get(void) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index c98c68513590..a494f9e7bd0a 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -92,8 +91,8 @@ nfs4_callback_svc(void *vrqstp) continue; svc_process(rqstp); } + svc_exit_thread(rqstp); - module_put_and_kthread_exit(0); return 0; } @@ -136,8 +135,8 @@ nfs41_callback_svc(void *vrqstp) finish_wait(&serv->sv_cb_waitq, &wq); } } + svc_exit_thread(rqstp); - module_put_and_kthread_exit(0); return 0; } @@ -234,12 +233,10 @@ err_bind: static const struct svc_serv_ops nfs40_cb_sv_ops = { .svo_function = nfs4_callback_svc, - .svo_module = THIS_MODULE, }; #if defined(CONFIG_NFS_V4_1) static const struct svc_serv_ops nfs41_cb_sv_ops = { .svo_function = nfs41_callback_svc, - .svo_module = THIS_MODULE, }; static const struct svc_serv_ops *nfs4_cb_sv_ops[] = { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index f5a62c0d999b..02a899e4390f 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2697,6 +2697,5 @@ static int nfs4_run_state_manager(void *ptr) 
allow_signal(SIGKILL); nfs4_state_manager(clp); nfs_put_client(clp); - module_put_and_kthread_exit(0); return 0; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index b92d272f4ba6..544187a8a22b 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -614,7 +614,6 @@ static int nfsd_get_default_max_blksize(void) static const struct svc_serv_ops nfsd_thread_sv_ops = { .svo_function = nfsd, - .svo_module = THIS_MODULE, }; void nfsd_shutdown_threads(struct net *net) @@ -1018,8 +1017,6 @@ out: msleep(20); } - /* Release module */ - module_put_and_kthread_exit(0); return 0; } diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 5603158b2aa7..dfc9283f412f 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -57,11 +57,6 @@ struct svc_serv; struct svc_serv_ops { /* function for service threads to run */ int (*svo_function)(void *); - - /* optional module to count when adding threads. - * Thread function must call module_put_and_kthread_exit() to exit. - */ - struct module *svo_module; }; /* diff --git a/kernel/module.c b/kernel/module.c index 46a5c2ed1928..6cea788fd965 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -335,7 +335,7 @@ static inline void add_taint_module(struct module *mod, unsigned flag, /* * A thread that wants to hold a reference to a module only while it - * is running can call this to safely exit. nfsd and lockd use this. + * is running can call this to safely exit. 
*/ void __noreturn __module_put_and_kthread_exit(struct module *mod, long code) { diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 08d684746452..a90d555aa163 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -736,11 +736,9 @@ svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) if (IS_ERR(rqstp)) return PTR_ERR(rqstp); - __module_get(serv->sv_ops->svo_module); task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp, node, "%s", serv->sv_name); if (IS_ERR(task)) { - module_put(serv->sv_ops->svo_module); svc_exit_thread(rqstp); return PTR_ERR(task); } -- cgit v1.2.3 From ceac059ed4fd8abc0940f799dd5133275011e244 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Wed, 23 Feb 2022 16:05:31 -0800 Subject: bpf: Cache the last valid build_id For binaries that are statically linked, consecutive stack frames are likely to be in the same VMA and therefore have the same build id. On a real-world workload, we observed that 66% of CPU cycles in __bpf_get_stackid() were spent on build_id_parse() and find_vma(). As an optimization for this case, we can cache the previous frame's VMA, if the new frame has the same VMA as the previous one, reuse the previous one's build id. We are holding the MM locks as reader across the entire loop, so we don't need to worry about VMA going away. Tested through "stacktrace_build_id" and "stacktrace_build_id_nmi" in test_progs. 
Suggested-by: Greg Thelen Signed-off-by: Hao Luo Signed-off-by: Daniel Borkmann Reviewed-by: Pasha Tatashin Acked-by: Andrii Nakryiko Acked-by: Song Liu Acked-by: Namhyung Kim Link: https://lore.kernel.org/bpf/20220224000531.1265030-1-haoluo@google.com --- kernel/bpf/stackmap.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 22c8ae94e4c1..38bdfcd06f55 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -132,7 +132,8 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, int i; struct mmap_unlock_irq_work *work = NULL; bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work); - struct vm_area_struct *vma; + struct vm_area_struct *vma, *prev_vma = NULL; + const char *prev_build_id; /* If the irq_work is in use, fall back to report ips. Same * fallback is used for kernel stack (!user) on a stackmap with @@ -150,6 +151,12 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } for (i = 0; i < trace_nr; i++) { + if (range_in_vma(prev_vma, ips[i], ips[i])) { + vma = prev_vma; + memcpy(id_offs[i].build_id, prev_build_id, + BUILD_ID_SIZE_MAX); + goto build_id_valid; + } vma = find_vma(current->mm, ips[i]); if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) { /* per entry fall back to ips */ @@ -158,9 +165,12 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); continue; } +build_id_valid: id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] - vma->vm_start; id_offs[i].status = BPF_STACK_BUILD_ID_VALID; + prev_vma = vma; + prev_build_id = id_offs[i].build_id; } bpf_mmap_unlock_mm(work, current->mm); } -- cgit v1.2.3 From b664e255ba3c655a675e4e1fe9503d6f7ada3305 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 22 Feb 2022 17:57:05 +0800 Subject: bpf: Add some description about BPF_JIT_ALWAYS_ON in Kconfig 
When CONFIG_BPF_JIT_ALWAYS_ON is enabled, /proc/sys/net/core/bpf_jit_enable is permanently set to 1 and setting any other value than that will return failure. Add the above description in the help text of config BPF_JIT_ALWAYS_ON, and then we can distinguish between BPF_JIT_ALWAYS_ON and BPF_JIT_DEFAULT_ON. Signed-off-by: Tiezhu Yang Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/1645523826-18149-2-git-send-email-yangtiezhu@loongson.cn --- kernel/bpf/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index d24d518ddd63..c3cf0b86eeb2 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -58,6 +58,10 @@ config BPF_JIT_ALWAYS_ON Enables BPF JIT and removes BPF interpreter to avoid speculative execution of BPF instructions by the interpreter. + When CONFIG_BPF_JIT_ALWAYS_ON is enabled, /proc/sys/net/core/bpf_jit_enable + is permanently set to 1 and setting any other value than that will + return failure. + config BPF_JIT_DEFAULT_ON def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON depends on HAVE_EBPF_JIT && BPF_JIT -- cgit v1.2.3 From 248cc9993d1cc12b8e9ed716cc3fc09f6c3517dd Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 20 Feb 2022 13:14:24 +0800 Subject: sched/cpuacct: Fix charge percpu cpuusage The cpuacct_account_field() is always called by the current task itself, so it's ok to use __this_cpu_add() to charge the tick time. But cpuacct_charge() may be called by update_curr() in load_balance() on a random CPU, different from the CPU on which the task is running. So __this_cpu_add() will charge that cputime to a random incorrect CPU.
Fixes: 73e6aafd9ea8 ("sched/cpuacct: Simplify the cpuacct code") Reported-by: Minye Zhu Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220220051426.5274-1-zhouchengming@bytedance.com --- kernel/sched/cpuacct.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 3d06c5e4220d..307800586ac8 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -334,12 +334,13 @@ static struct cftype files[] = { */ void cpuacct_charge(struct task_struct *tsk, u64 cputime) { + unsigned int cpu = task_cpu(tsk); struct cpuacct *ca; rcu_read_lock(); for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) - __this_cpu_add(*ca->cpuusage, cputime); + *per_cpu_ptr(ca->cpuusage, cpu) += cputime; rcu_read_unlock(); } -- cgit v1.2.3 From dc6e0818bc9a0336d9accf3ea35d146d72aa7a18 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 20 Feb 2022 13:14:25 +0800 Subject: sched/cpuacct: Optimize away RCU read lock Since cpuacct_charge() is called from the scheduler update_curr(), we must already have rq lock held, then the RCU read lock can be optimized away. And do the same thing in its wrapper cgroup_account_cputime(), but we can't use lockdep_assert_rq_held() there, which is defined in kernel/sched/sched.h.
Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220220051426.5274-2-zhouchengming@bytedance.com --- include/linux/cgroup.h | 2 -- kernel/sched/cpuacct.c | 4 +--- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 75c151413fda..9a109c6ac0e0 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -791,11 +791,9 @@ static inline void cgroup_account_cputime(struct task_struct *task, cpuacct_charge(task, delta_exec); - rcu_read_lock(); cgrp = task_dfl_cgroup(task); if (cgroup_parent(cgrp)) __cgroup_account_cputime(cgrp, delta_exec); - rcu_read_unlock(); } static inline void cgroup_account_cputime_field(struct task_struct *task, diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 307800586ac8..f79f88456d72 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -337,12 +337,10 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) unsigned int cpu = task_cpu(tsk); struct cpuacct *ca; - rcu_read_lock(); + lockdep_assert_rq_held(cpu_rq(cpu)); for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) *per_cpu_ptr(ca->cpuusage, cpu) += cputime; - - rcu_read_unlock(); } /* -- cgit v1.2.3 From 3eba0505d03a9c1eb30d40c2330c0880b22d1b3a Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 20 Feb 2022 13:14:26 +0800 Subject: sched/cpuacct: Remove redundant RCU read lock The cpuacct_account_field() and its cgroup v2 wrapper cgroup_account_cputime_field() are only called from cputime in task_group_account_field(), which is already in an RCU read-side critical section. So remove these redundant RCU read locks.
Suggested-by: Tejun Heo Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220220051426.5274-3-zhouchengming@bytedance.com --- include/linux/cgroup.h | 2 -- kernel/sched/cpuacct.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9a109c6ac0e0..1e356c222756 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -804,11 +804,9 @@ static inline void cgroup_account_cputime_field(struct task_struct *task, cpuacct_account_field(task, index, delta_exec); - rcu_read_lock(); cgrp = task_dfl_cgroup(task); if (cgroup_parent(cgrp)) __cgroup_account_cputime_field(cgrp, index, delta_exec); - rcu_read_unlock(); } #else /* CONFIG_CGROUPS */ diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index f79f88456d72..d269ede84e85 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -352,10 +352,8 @@ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) { struct cpuacct *ca; - rcu_read_lock(); for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca)) __this_cpu_add(ca->cpustat->cpustat[index], val); - rcu_read_unlock(); } struct cgroup_subsys cpuacct_cgrp_subsys = { -- cgit v1.2.3 From 49bef33e4b87b743495627a529029156c6e09530 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Thu, 27 Jan 2022 15:40:59 +0000 Subject: sched/rt: Plug rt_mutex_setprio() vs push_rt_task() race John reported that push_rt_task() can end up invoking find_lowest_rq(rq->curr) when curr is not an RT task (in this case a CFS one), which causes mayhem down convert_prio(). This can happen when current gets demoted to e.g. CFS when releasing an rt_mutex, and the local CPU gets hit with an rto_push_work irqwork before getting the chance to reschedule. 
Exactly who triggers this work isn't entirely clear to me - switched_from_rt() only invokes rt_queue_pull_task() if there are no RT tasks on the local RQ, which means the local CPU can't be in the rto_mask. My current suspected sequence is something along the lines of the below, with the demoted task being current. mark_wakeup_next_waiter() rt_mutex_adjust_prio() rt_mutex_setprio() // deboost originally-CFS task check_class_changed() switched_from_rt() // Only rt_queue_pull_task() if !rq->rt.rt_nr_running switched_to_fair() // Sets need_resched __balance_callbacks() // if pull_rt_task(), tell_cpu_to_push() can't select local CPU per the above raw_spin_rq_unlock(rq) // need_resched is set, so task_woken_rt() can't // invoke push_rt_tasks(). Best I can come up with is // local CPU has rt_nr_migratory >= 2 after the demotion, so stays // in the rto_mask, and then: push_rt_task() // breakage follows here as rq->curr is CFS Move an existing check to check rq->curr vs the next pushable task's priority before getting anywhere near find_lowest_rq(). While at it, add an explicit sched_class of rq->curr check prior to invoking find_lowest_rq(rq->curr). Align the DL logic to also reschedule regardless of next_task's migratability. 
Fixes: a7c81556ec4d ("sched: Fix migrate_disable() vs rt/dl balancing") Reported-by: John Keeping Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: John Keeping Link: https://lore.kernel.org/r/20220127154059.974729-1-valentin.schneider@arm.com --- kernel/sched/deadline.c | 12 ++++++------ kernel/sched/rt.c | 32 ++++++++++++++++++++++---------- 2 files changed, 28 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d2c072b0ef01..62f0cf842277 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2240,12 +2240,6 @@ static int push_dl_task(struct rq *rq) return 0; retry: - if (is_migration_disabled(next_task)) - return 0; - - if (WARN_ON(next_task == rq->curr)) - return 0; - /* * If next_task preempts rq->curr, and rq->curr * can move away, it makes sense to just reschedule @@ -2258,6 +2252,12 @@ retry: return 0; } + if (is_migration_disabled(next_task)) + return 0; + + if (WARN_ON(next_task == rq->curr)) + return 0; + /* We might release rq lock */ get_task_struct(next_task); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7b4f4fbbb404..14f273c29518 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2026,6 +2026,16 @@ static int push_rt_task(struct rq *rq, bool pull) return 0; retry: + /* + * It's possible that the next_task slipped in of + * higher priority than current. If that's the case + * just reschedule current. + */ + if (unlikely(next_task->prio < rq->curr->prio)) { + resched_curr(rq); + return 0; + } + if (is_migration_disabled(next_task)) { struct task_struct *push_task = NULL; int cpu; @@ -2033,6 +2043,18 @@ retry: if (!pull || rq->push_busy) return 0; + /* + * Invoking find_lowest_rq() on anything but an RT task doesn't + * make sense. Per the above priority check, curr has to + * be of higher priority than next_task, so no need to + * reschedule when bailing out. 
+ * + * Note that the stoppers are masqueraded as SCHED_FIFO + * (cf. sched_set_stop_task()), so we can't rely on rt_task(). + */ + if (rq->curr->sched_class != &rt_sched_class) + return 0; + cpu = find_lowest_rq(rq->curr); if (cpu == -1 || cpu == rq->cpu) return 0; @@ -2057,16 +2079,6 @@ retry: if (WARN_ON(next_task == rq->curr)) return 0; - /* - * It's possible that the next_task slipped in of - * higher priority than current. If that's the case - * just reschedule current. - */ - if (unlikely(next_task->prio < rq->curr->prio)) { - resched_curr(rq); - return 0; - } - /* We might release rq lock */ get_task_struct(next_task); -- cgit v1.2.3 From fa2c3254d7cfff5f7a916ab928a562d1165f17bb Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Thu, 20 Jan 2022 16:25:19 +0000 Subject: sched/tracing: Don't re-read p->state when emitting sched_switch event As of commit c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") the following sequence becomes possible: p->__state = TASK_INTERRUPTIBLE; __schedule() deactivate_task(p); ttwu() READ !p->on_rq p->__state=TASK_WAKING trace_sched_switch() __trace_sched_switch_state() task_state_index() return 0; TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in the trace event. Prevent this by pushing the value read from __schedule() down the trace event. 
Reported-by: Abhijeet Dharmapurikar Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/r/20220120162520.570782-2-valentin.schneider@arm.com --- include/linux/sched.h | 11 ++++++++--- include/trace/events/sched.h | 11 +++++++---- kernel/sched/core.c | 4 ++-- kernel/trace/fgraph.c | 4 +++- kernel/trace/ftrace.c | 4 +++- kernel/trace/trace_events.c | 8 ++++++-- kernel/trace/trace_osnoise.c | 4 +++- kernel/trace/trace_sched_switch.c | 1 + kernel/trace/trace_sched_wakeup.c | 1 + 9 files changed, 34 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index f00132e7ef6e..457c8a058b77 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1620,10 +1620,10 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) -static inline unsigned int task_state_index(struct task_struct *tsk) +static inline unsigned int __task_state_index(unsigned int tsk_state, + unsigned int tsk_exit_state) { - unsigned int tsk_state = READ_ONCE(tsk->__state); - unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; + unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT; BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); @@ -1633,6 +1633,11 @@ static inline unsigned int task_state_index(struct task_struct *tsk) return fls(state); } +static inline unsigned int task_state_index(struct task_struct *tsk) +{ + return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state); +} + static inline char task_index_to_char(unsigned int state) { static const char state_char[] = "RSDTtXZPI"; diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 94640482cfe7..65e786756321 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -187,7 +187,9 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new, 
TP_ARGS(p)); #ifdef CREATE_TRACE_POINTS -static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p) +static inline long __trace_sched_switch_state(bool preempt, + unsigned int prev_state, + struct task_struct *p) { unsigned int state; @@ -208,7 +210,7 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct * * it for left shift operation to get the correct task->state * mapping. */ - state = task_state_index(p); + state = __task_state_index(prev_state, p->exit_state); return state ? (1 << (state - 1)) : state; } @@ -220,10 +222,11 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct * TRACE_EVENT(sched_switch, TP_PROTO(bool preempt, + unsigned int prev_state, struct task_struct *prev, struct task_struct *next), - TP_ARGS(preempt, prev, next), + TP_ARGS(preempt, prev_state, prev, next), TP_STRUCT__entry( __array( char, prev_comm, TASK_COMM_LEN ) @@ -239,7 +242,7 @@ TRACE_EVENT(sched_switch, memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); __entry->prev_pid = prev->pid; __entry->prev_prio = prev->prio; - __entry->prev_state = __trace_sched_switch_state(preempt, prev); + __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev); memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); __entry->next_pid = next->pid; __entry->next_prio = next->prio; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ef946123e9af..3aafc15da24a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4836,7 +4836,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) { struct rq *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; - long prev_state; + unsigned int prev_state; /* * The previous task will have left us with a preempt_count of 2 @@ -6300,7 +6300,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) migrate_disable_switch(rq, prev); psi_sched_switch(prev, next, !task_on_rq_queued(prev)); - trace_sched_switch(sched_mode & 
SM_MASK_PREEMPT, prev, next); + trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next); /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 22061d38fc00..19028e072cdb 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -415,7 +415,9 @@ free: static void ftrace_graph_probe_sched_switch(void *ignore, bool preempt, - struct task_struct *prev, struct task_struct *next) + unsigned int prev_state, + struct task_struct *prev, + struct task_struct *next) { unsigned long long timestamp; int index; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f9feb197b2da..6762ae029fdd 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7347,7 +7347,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) static void ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, - struct task_struct *prev, struct task_struct *next) + unsigned int prev_state, + struct task_struct *prev, + struct task_struct *next) { struct trace_array *tr = data; struct trace_pid_list *pid_list; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3147614c1812..2a19ea747ff4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -759,7 +759,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable) static void event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, - struct task_struct *prev, struct task_struct *next) + unsigned int prev_state, + struct task_struct *prev, + struct task_struct *next) { struct trace_array *tr = data; struct trace_pid_list *no_pid_list; @@ -783,7 +785,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, static void event_filter_pid_sched_switch_probe_post(void *data, bool preempt, - struct task_struct *prev, struct task_struct *next) + unsigned int prev_state, + struct task_struct *prev, + struct task_struct *next) { struct trace_array *tr = 
data; struct trace_pid_list *no_pid_list; diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 870a08da5b48..1829b4cb8cc1 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1167,7 +1167,9 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t) * used to record the beginning and to report the end of a thread noise window. */ static void -trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p, +trace_sched_switch_callback(void *data, bool preempt, + unsigned int prev_state, + struct task_struct *p, struct task_struct *n) { struct osnoise_variables *osn_var = this_cpu_osn_var(); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index e304196d7c28..993b0ed10d8c 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -22,6 +22,7 @@ static DEFINE_MUTEX(sched_register_mutex); static void probe_sched_switch(void *ignore, bool preempt, + unsigned int prev_state, struct task_struct *prev, struct task_struct *next) { int flags; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 2402de520eca..46429f9a96fa 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -426,6 +426,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, static void notrace probe_wakeup_sched_switch(void *ignore, bool preempt, + unsigned int prev_state, struct task_struct *prev, struct task_struct *next) { struct trace_array_cpu *data; -- cgit v1.2.3 From 444e1154b2bf0b881b65ba1bba5bc8e691fac04a Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Tue, 15 Feb 2022 11:37:50 +0800 Subject: PM: hibernate: Clean up non-kernel-doc comments Address the following W=1 kernel build warning: kernel/power/swap.c:120: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst. 
Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index ad10359030a4..c51f5507b34f 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -89,7 +89,7 @@ struct swap_map_page_list { struct swap_map_page_list *next; }; -/** +/* * The swap_map_handle structure is used for handling swap in * a file-alike way */ @@ -117,7 +117,7 @@ struct swsusp_header { static struct swsusp_header *swsusp_header; -/** +/* * The following functions are used for tracing the allocated * swap pages, so that they can be freed in case of an error. */ @@ -171,7 +171,7 @@ static int swsusp_extents_insert(unsigned long swap_offset) return 0; } -/** +/* * alloc_swapdev_block - allocate a swap page and register that it has * been allocated, so that it can be freed in case of an error. */ @@ -190,7 +190,7 @@ sector_t alloc_swapdev_block(int swap) return 0; } -/** +/* * free_all_swap_pages - free swap pages allocated for saving image data. * It also frees the extents used to register which swap entries had been * allocated. -- cgit v1.2.3 From ba7ffcd4c4da374b0f64666354eeeda7d3827131 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 28 Feb 2022 14:05:32 -0800 Subject: PM: hibernate: fix __setup handler error handling If an invalid value is used in "resumedelay=", it is silently ignored. Add a warning message and then let the __setup handler return 1 to indicate that the kernel command line option has been handled. Fixes: 317cf7e5e85e3 ("PM / hibernate: convert simple_strtoul to kstrtoul") Signed-off-by: Randy Dunlap Reported-by: Igor Zhbanov Link: https://lore.kernel.org/r/64644a2f-4a20-bab3-1e15-3b2cdd0defe3@omprussia.ru Signed-off-by: Rafael J.
Wysocki --- kernel/power/hibernate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 49d1df0218cb..0ac805b753e5 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1330,7 +1330,7 @@ static int __init resumedelay_setup(char *str) int rc = kstrtouint(str, 0, &resume_delay); if (rc) - return rc; + pr_warn("resumedelay: bad option string '%s'\n", str); return 1; } -- cgit v1.2.3 From 7a64ca17e4dd50d5f910769167f3553902777844 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 28 Feb 2022 14:05:44 -0800 Subject: PM: suspend: fix return value of __setup handler If an invalid option is given for "test_suspend=