summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup/cgroup.c44
-rw-r--r--kernel/sched/core.c2
-rw-r--r--kernel/sched/ext.c6
-rw-r--r--kernel/trace/rv/monitors/sleep/sleep.c4
-rw-r--r--kernel/trace/rv/rv.c4
-rw-r--r--kernel/trace/trace_kprobe.c2
6 files changed, 47 insertions, 15 deletions
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 312c6a8b55bb..77d02f87f3f1 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -126,8 +126,31 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
* of concurrent destructions. Use a separate workqueue so that cgroup
* destruction work items don't end up filling up max_active of system_wq
* which may lead to deadlock.
+ *
+ * A cgroup destruction should enqueue work sequentially to:
+ * cgroup_offline_wq: use for css offline work
+ * cgroup_release_wq: use for css release work
+ * cgroup_free_wq: use for free work
+ *
+ * Rationale for using separate workqueues:
+ * The cgroup root free work may depend on completion of other css offline
+ * operations. If all tasks were enqueued to a single workqueue, this could
+ * create a deadlock scenario where:
+ * - Free work waits for other css offline work to complete.
+ * - But other css offline work is queued after free work in the same queue.
+ *
+ * Example deadlock scenario with single workqueue (cgroup_destroy_wq):
+ * 1. umount net_prio
+ * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx)
+ * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx)
+ * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline.
+ * 5. net_prio root destruction blocks waiting for perf_event CSS A offline,
+ * which can never complete as it's behind in the same queue and
+ * workqueue's max_active is 1.
*/
-static struct workqueue_struct *cgroup_destroy_wq;
+static struct workqueue_struct *cgroup_offline_wq;
+static struct workqueue_struct *cgroup_release_wq;
+static struct workqueue_struct *cgroup_free_wq;
/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
@@ -4159,6 +4182,7 @@ static void cgroup_file_release(struct kernfs_open_file *of)
cft->release(of);
put_cgroup_ns(ctx->ns);
kfree(ctx);
+ of->priv = NULL;
}
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
@@ -5558,7 +5582,7 @@ static void css_release_work_fn(struct work_struct *work)
cgroup_unlock();
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
- queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
+ queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
}
static void css_release(struct percpu_ref *ref)
@@ -5567,7 +5591,7 @@ static void css_release(struct percpu_ref *ref)
container_of(ref, struct cgroup_subsys_state, refcnt);
INIT_WORK(&css->destroy_work, css_release_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
+ queue_work(cgroup_release_wq, &css->destroy_work);
}
static void init_and_link_css(struct cgroup_subsys_state *css,
@@ -5701,7 +5725,7 @@ err_list_del:
list_del_rcu(&css->sibling);
err_free_css:
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
- queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
+ queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
return ERR_PTR(err);
}
@@ -5939,7 +5963,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
if (atomic_dec_and_test(&css->online_cnt)) {
INIT_WORK(&css->destroy_work, css_killed_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
+ queue_work(cgroup_offline_wq, &css->destroy_work);
}
}
@@ -6325,8 +6349,14 @@ static int __init cgroup_wq_init(void)
* We would prefer to do this in cgroup_init() above, but that
* is called before init_workqueues(): so leave this until after.
*/
- cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
- BUG_ON(!cgroup_destroy_wq);
+ cgroup_offline_wq = alloc_workqueue("cgroup_offline", 0, 1);
+ BUG_ON(!cgroup_offline_wq);
+
+ cgroup_release_wq = alloc_workqueue("cgroup_release", 0, 1);
+ BUG_ON(!cgroup_release_wq);
+
+ cgroup_free_wq = alloc_workqueue("cgroup_free", 0, 1);
+ BUG_ON(!cgroup_free_wq);
return 0;
}
core_initcall(cgroup_wq_init);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index be00629f0ba4..ccba6fc3c3fe 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9551,7 +9551,7 @@ static unsigned long tg_weight(struct task_group *tg)
#ifdef CONFIG_FAIR_GROUP_SCHED
return scale_load_down(tg->shares);
#else
- return sched_weight_from_cgroup(tg->scx_weight);
+ return sched_weight_from_cgroup(tg->scx.weight);
#endif
}
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 4ae32ef179dd..088ceff38c8a 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -6788,12 +6788,8 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void)
* CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
* the current local DSQ for running tasks and thus are not
* visible to the BPF scheduler.
- *
- * Also skip re-enqueueing tasks that can only run on this
- * CPU, as they would just be re-added to the same local
- * DSQ without any benefit.
*/
- if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1)
+ if (p->migration_pending)
continue;
dispatch_dequeue(rq, p);
diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c
index eea447b06907..c1347da69e9d 100644
--- a/kernel/trace/rv/monitors/sleep/sleep.c
+++ b/kernel/trace/rv/monitors/sleep/sleep.c
@@ -127,7 +127,9 @@ static void handle_sys_enter(void *data, struct pt_regs *regs, long id)
mon = ltl_get_monitor(current);
switch (id) {
+#ifdef __NR_clock_nanosleep
case __NR_clock_nanosleep:
+#endif
#ifdef __NR_clock_nanosleep_time64
case __NR_clock_nanosleep_time64:
#endif
@@ -138,7 +140,9 @@ static void handle_sys_enter(void *data, struct pt_regs *regs, long id)
ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, true);
break;
+#ifdef __NR_futex
case __NR_futex:
+#endif
#ifdef __NR_futex_time64
case __NR_futex_time64:
#endif
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index 1482e91c39f4..48338520376f 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -495,7 +495,7 @@ static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos)
*/
static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos)
{
- struct rv_monitor *mon = p;
+ struct rv_monitor *mon = container_of(p, struct rv_monitor, list);
(*pos)++;
@@ -805,7 +805,7 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent)
retval = create_monitor_dir(monitor, parent);
if (retval)
- return retval;
+ goto out_unlock;
/* keep children close to the parent for easier visualisation */
if (parent)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index ccae62d4fb91..fa60362a3f31 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -908,6 +908,8 @@ static int trace_kprobe_create_internal(int argc, const char *argv[],
return -EINVAL;
}
buf = kmemdup(&argv[0][1], len + 1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
buf[len] = '\0';
ret = kstrtouint(buf, 0, &maxactive);
if (ret || !maxactive) {