diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cgroup/cgroup.c | 44 | ||||
-rw-r--r-- | kernel/sched/core.c | 2 | ||||
-rw-r--r-- | kernel/sched/ext.c | 6 | ||||
-rw-r--r-- | kernel/sched/ext_idle.c | 28 | ||||
-rw-r--r-- | kernel/trace/rv/monitors/sleep/sleep.c | 4 | ||||
-rw-r--r-- | kernel/trace/rv/rv.c | 4 | ||||
-rw-r--r-- | kernel/vhost_task.c | 3 |
7 files changed, 74 insertions, 17 deletions
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 312c6a8b55bb..77d02f87f3f1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -126,8 +126,31 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); * of concurrent destructions. Use a separate workqueue so that cgroup * destruction work items don't end up filling up max_active of system_wq * which may lead to deadlock. + * + * A cgroup destruction should enqueue work sequentially to: + * cgroup_offline_wq: use for css offline work + * cgroup_release_wq: use for css release work + * cgroup_free_wq: use for free work + * + * Rationale for using separate workqueues: + * The cgroup root free work may depend on completion of other css offline + * operations. If all tasks were enqueued to a single workqueue, this could + * create a deadlock scenario where: + * - Free work waits for other css offline work to complete. + * - But other css offline work is queued after free work in the same queue. + * + * Example deadlock scenario with single workqueue (cgroup_destroy_wq): + * 1. umount net_prio + * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx) + * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx) + * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline. + * 5. net_prio root destruction blocks waiting for perf_event CSS A offline, + * which can never complete as it's behind in the same queue and + * workqueue's max_active is 1. */ -static struct workqueue_struct *cgroup_destroy_wq; +static struct workqueue_struct *cgroup_offline_wq; +static struct workqueue_struct *cgroup_release_wq; +static struct workqueue_struct *cgroup_free_wq; /* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, @@ -4159,6 +4182,7 @@ static void cgroup_file_release(struct kernfs_open_file *of) cft->release(of); put_cgroup_ns(ctx->ns); kfree(ctx); + of->priv = NULL; } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, @@ -5558,7 +5582,7 @@ static void css_release_work_fn(struct work_struct *work) cgroup_unlock(); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); - queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); + queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); } static void css_release(struct percpu_ref *ref) @@ -5567,7 +5591,7 @@ static void css_release(struct percpu_ref *ref) container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_release_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + queue_work(cgroup_release_wq, &css->destroy_work); } static void init_and_link_css(struct cgroup_subsys_state *css, @@ -5701,7 +5725,7 @@ err_list_del: list_del_rcu(&css->sibling); err_free_css: INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); - queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); + queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); return ERR_PTR(err); } @@ -5939,7 +5963,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) if (atomic_dec_and_test(&css->online_cnt)) { INIT_WORK(&css->destroy_work, css_killed_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + queue_work(cgroup_offline_wq, &css->destroy_work); } } @@ -6325,8 +6349,14 @@ static int __init cgroup_wq_init(void) * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. */ - cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); - BUG_ON(!cgroup_destroy_wq); + cgroup_offline_wq = alloc_workqueue("cgroup_offline", 0, 1); + BUG_ON(!cgroup_offline_wq); + + cgroup_release_wq = alloc_workqueue("cgroup_release", 0, 1); + BUG_ON(!cgroup_release_wq); + + cgroup_free_wq = alloc_workqueue("cgroup_free", 0, 1); + BUG_ON(!cgroup_free_wq); return 0; } core_initcall(cgroup_wq_init); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index be00629f0ba4..ccba6fc3c3fe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9551,7 +9551,7 @@ static unsigned long tg_weight(struct task_group *tg) #ifdef CONFIG_FAIR_GROUP_SCHED return scale_load_down(tg->shares); #else - return sched_weight_from_cgroup(tg->scx_weight); + return sched_weight_from_cgroup(tg->scx.weight); #endif } diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4ae32ef179dd..088ceff38c8a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6788,12 +6788,8 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void) * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to * the current local DSQ for running tasks and thus are not * visible to the BPF scheduler. - * - * Also skip re-enqueueing tasks that can only run on this - * CPU, as they would just be re-added to the same local - * DSQ without any benefit. */ - if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1) + if (p->migration_pending) continue; dispatch_dequeue(rq, p); diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 7174e1c1a392..537c6992bb63 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -856,6 +856,32 @@ static bool check_builtin_idle_enabled(void) return false; } +/* + * Determine whether @p is a migration-disabled task in the context of BPF + * code. + * + * We can't simply check whether @p->migration_disabled is set in a + * sched_ext callback, because migration is always disabled for the current + * task while running BPF code. + * + * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) respectively + * disable and re-enable migration. For this reason, the current task + * inside a sched_ext callback is always a migration-disabled task. + * + * Therefore, when @p->migration_disabled == 1, check whether @p is the + * current task or not: if it is, then migration was not disabled before + * entering the callback, otherwise migration was disabled. + * + * Returns true if @p is migration-disabled, false otherwise. + */ +static bool is_bpf_migration_disabled(const struct task_struct *p) +{ + if (p->migration_disabled == 1) + return p != current; + else + return p->migration_disabled; +} + static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags, const struct cpumask *allowed, u64 flags) { @@ -898,7 +924,7 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f * selection optimizations and simply check whether the previously * used CPU is idle and within the allowed cpumask. */ - if (p->nr_cpus_allowed == 1 || is_migration_disabled(p)) { + if (p->nr_cpus_allowed == 1 || is_bpf_migration_disabled(p)) { if (cpumask_test_cpu(prev_cpu, allowed ?: p->cpus_ptr) && scx_idle_test_and_clear_cpu(prev_cpu)) cpu = prev_cpu; diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c index eea447b06907..c1347da69e9d 100644 --- a/kernel/trace/rv/monitors/sleep/sleep.c +++ b/kernel/trace/rv/monitors/sleep/sleep.c @@ -127,7 +127,9 @@ static void handle_sys_enter(void *data, struct pt_regs *regs, long id) mon = ltl_get_monitor(current); switch (id) { +#ifdef __NR_clock_nanosleep case __NR_clock_nanosleep: +#endif #ifdef __NR_clock_nanosleep_time64 case __NR_clock_nanosleep_time64: #endif @@ -138,7 +140,9 @@ static void handle_sys_enter(void *data, struct pt_regs *regs, long id) ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, true); break; +#ifdef __NR_futex case __NR_futex: +#endif #ifdef __NR_futex_time64 case __NR_futex_time64: #endif diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 1482e91c39f4..48338520376f 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -495,7 +495,7 @@ static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos) */ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) { - struct rv_monitor *mon = p; + struct rv_monitor *mon = container_of(p, struct rv_monitor, list); (*pos)++; @@ -805,7 +805,7 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent) retval = create_monitor_dir(monitor, parent); if (retval) - return retval; + goto out_unlock; /* keep children close to the parent for easier visualisation */ if (parent) diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c index bc738fa90c1d..27107dcc1cbf 100644 --- a/kernel/vhost_task.c +++ b/kernel/vhost_task.c @@ -100,6 +100,7 @@ void vhost_task_stop(struct vhost_task *vtsk) * freeing it below. */ wait_for_completion(&vtsk->exited); + put_task_struct(vtsk->task); kfree(vtsk); } EXPORT_SYMBOL_GPL(vhost_task_stop); @@ -148,7 +149,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *), return ERR_CAST(tsk); } - vtsk->task = tsk; + vtsk->task = get_task_struct(tsk); return vtsk; } EXPORT_SYMBOL_GPL(vhost_task_create); |