summaryrefslogtreecommitdiff
path: root/kernel/cgroup
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup')
-rw-r--r--kernel/cgroup/cgroup-internal.h11
-rw-r--r--kernel/cgroup/cgroup-v1.c19
-rw-r--r--kernel/cgroup/cgroup.c268
-rw-r--r--kernel/cgroup/cpuset-internal.h5
-rw-r--r--kernel/cgroup/cpuset-v1.c12
-rw-r--r--kernel/cgroup/cpuset.c752
-rw-r--r--kernel/cgroup/debug.c4
-rw-r--r--kernel/cgroup/freezer.c16
-rw-r--r--kernel/cgroup/namespace.c29
9 files changed, 661 insertions, 455 deletions
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index b14e61c64a34..22051b4f1ccb 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -249,12 +249,15 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
bool threadgroup);
-void cgroup_attach_lock(bool lock_threadgroup);
-void cgroup_attach_unlock(bool lock_threadgroup);
+void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk);
+void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk);
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
- bool *locked)
+ enum cgroup_attach_lock_mode *lock_mode)
__acquires(&cgroup_threadgroup_rwsem);
-void cgroup_procs_write_finish(struct task_struct *task, bool locked)
+void cgroup_procs_write_finish(struct task_struct *task,
+ enum cgroup_attach_lock_mode lock_mode)
__releases(&cgroup_threadgroup_rwsem);
void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 2a4a387f867a..a9e029b570c8 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -10,6 +10,7 @@
#include <linux/sched/task.h>
#include <linux/magic.h>
#include <linux/slab.h>
+#include <linux/string.h>
#include <linux/vmalloc.h>
#include <linux/delayacct.h>
#include <linux/pid_namespace.h>
@@ -68,7 +69,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
int retval = 0;
cgroup_lock();
- cgroup_attach_lock(true);
+ cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
for_each_root(root) {
struct cgroup *from_cgrp;
@@ -80,7 +81,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
if (retval)
break;
}
- cgroup_attach_unlock(true);
+ cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
cgroup_unlock();
return retval;
@@ -117,7 +118,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
cgroup_lock();
- cgroup_attach_lock(true);
+ cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
/* all tasks in @from are being moved, all csets are source */
spin_lock_irq(&css_set_lock);
@@ -153,7 +154,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
} while (task && !ret);
out_err:
cgroup_migrate_finish(&mgctx);
- cgroup_attach_unlock(true);
+ cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
cgroup_unlock();
return ret;
}
@@ -502,13 +503,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
struct task_struct *task;
const struct cred *cred, *tcred;
ssize_t ret;
- bool locked;
+ enum cgroup_attach_lock_mode lock_mode;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, threadgroup, &locked);
+ task = cgroup_procs_write_start(buf, threadgroup, &lock_mode);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -531,7 +532,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
ret = cgroup_attach_task(cgrp, task, threadgroup);
out_finish:
- cgroup_procs_write_finish(task, locked);
+ cgroup_procs_write_finish(task, lock_mode);
out_unlock:
cgroup_kn_unlock(of->kn);
@@ -1133,7 +1134,7 @@ int cgroup1_reconfigure(struct fs_context *fc)
if (ctx->release_agent) {
spin_lock(&release_agent_path_lock);
- strcpy(root->release_agent_path, ctx->release_agent);
+ strscpy(root->release_agent_path, ctx->release_agent);
spin_unlock(&release_agent_path_lock);
}
@@ -1325,7 +1326,7 @@ static int __init cgroup1_wq_init(void)
* Cap @max_active to 1 too.
*/
cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
- 0, 1);
+ WQ_PERCPU, 1);
BUG_ON(!cgroup_pidlist_destroy_wq);
return 0;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 312c6a8b55bb..6ae5f48cf64e 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -59,6 +59,7 @@
#include <linux/sched/cputime.h>
#include <linux/sched/deadline.h>
#include <linux/psi.h>
+#include <linux/nstree.h>
#include <net/sock.h>
#define CREATE_TRACE_POINTS
@@ -124,10 +125,33 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
/*
* cgroup destruction makes heavy use of work items and there can be a lot
* of concurrent destructions. Use a separate workqueue so that cgroup
- * destruction work items don't end up filling up max_active of system_wq
+ * destruction work items don't end up filling up max_active of system_percpu_wq
* which may lead to deadlock.
+ *
+ * A cgroup destruction should enqueue work sequentially to:
+ * cgroup_offline_wq: use for css offline work
+ * cgroup_release_wq: use for css release work
+ * cgroup_free_wq: use for free work
+ *
+ * Rationale for using separate workqueues:
+ * The cgroup root free work may depend on completion of other css offline
+ * operations. If all tasks were enqueued to a single workqueue, this could
+ * create a deadlock scenario where:
+ * - Free work waits for other css offline work to complete.
+ * - But other css offline work is queued after free work in the same queue.
+ *
+ * Example deadlock scenario with single workqueue (cgroup_destroy_wq):
+ * 1. umount net_prio
+ * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx)
+ * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx)
+ * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline.
+ * 5. net_prio root destruction blocks waiting for perf_event CSS A offline,
+ * which can never complete as it's behind in the same queue and
+ * workqueue's max_active is 1.
*/
-static struct workqueue_struct *cgroup_destroy_wq;
+static struct workqueue_struct *cgroup_offline_wq;
+static struct workqueue_struct *cgroup_release_wq;
+static struct workqueue_struct *cgroup_free_wq;
/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
@@ -216,13 +240,22 @@ static u16 have_canfork_callback __read_mostly;
static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);
+/*
+ * Write protected by cgroup_mutex and write-lock of cgroup_threadgroup_rwsem,
+ * read protected by either.
+ *
+ * Can only be turned on, but not turned off.
+ */
+bool cgroup_enable_per_threadgroup_rwsem __read_mostly;
+
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
- .ns.count = REFCOUNT_INIT(2),
+ .ns.__ns_ref = REFCOUNT_INIT(2),
.user_ns = &init_user_ns,
.ns.ops = &cgroupns_operations,
- .ns.inum = PROC_CGROUP_INIT_INO,
+ .ns.inum = ns_init_inum(&init_cgroup_ns),
.root_cset = &init_css_set,
+ .ns.ns_type = ns_common_type(&init_cgroup_ns),
};
static struct file_system_type cgroup2_fs_type;
@@ -1302,14 +1335,30 @@ void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)
{
bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS;
- /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */
+ /*
+ * see the comment above CGRP_ROOT_FAVOR_DYNMODS definition.
+ * favordynmods can flip while task is between
+ * cgroup_threadgroup_change_begin() and end(), so down_write global
+ * cgroup_threadgroup_rwsem to synchronize them.
+ *
+ * Once cgroup_enable_per_threadgroup_rwsem is enabled, holding
+ * cgroup_threadgroup_rwsem doesn't exlude tasks between
+ * cgroup_thread_group_change_begin() and end() and thus it's unsafe to
+ * turn off. As the scenario is unlikely, simply disallow disabling once
+ * enabled and print out a warning.
+ */
+ percpu_down_write(&cgroup_threadgroup_rwsem);
if (favor && !favoring) {
+ cgroup_enable_per_threadgroup_rwsem = true;
rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
} else if (!favor && favoring) {
+ if (cgroup_enable_per_threadgroup_rwsem)
+ pr_warn_once("cgroup favordynmods: per threadgroup rwsem mechanism can't be disabled\n");
rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
}
+ percpu_up_write(&cgroup_threadgroup_rwsem);
}
static int cgroup_init_root_id(struct cgroup_root *root)
@@ -2459,7 +2508,8 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns);
/**
* cgroup_attach_lock - Lock for ->attach()
- * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
+ * @lock_mode: whether acquire and acquire which rwsem
+ * @tsk: thread group to lock
*
* cgroup migration sometimes needs to stabilize threadgroups against forks and
* exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
@@ -2479,22 +2529,55 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns);
* Resolve the situation by always acquiring cpus_read_lock() before optionally
* write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
* CPU hotplug is disabled on entry.
+ *
+ * When favordynmods is enabled, take per threadgroup rwsem to reduce overhead
+ * on dynamic cgroup modifications. see the comment above
+ * CGRP_ROOT_FAVOR_DYNMODS definition.
+ *
+ * tsk is not NULL only when writing to cgroup.procs.
*/
-void cgroup_attach_lock(bool lock_threadgroup)
+void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk)
{
cpus_read_lock();
- if (lock_threadgroup)
+
+ switch (lock_mode) {
+ case CGRP_ATTACH_LOCK_NONE:
+ break;
+ case CGRP_ATTACH_LOCK_GLOBAL:
percpu_down_write(&cgroup_threadgroup_rwsem);
+ break;
+ case CGRP_ATTACH_LOCK_PER_THREADGROUP:
+ down_write(&tsk->signal->cgroup_threadgroup_rwsem);
+ break;
+ default:
+ pr_warn("cgroup: Unexpected attach lock mode.");
+ break;
+ }
}
/**
* cgroup_attach_unlock - Undo cgroup_attach_lock()
- * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
+ * @lock_mode: whether release and release which rwsem
+ * @tsk: thread group to lock
*/
-void cgroup_attach_unlock(bool lock_threadgroup)
+void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk)
{
- if (lock_threadgroup)
+ switch (lock_mode) {
+ case CGRP_ATTACH_LOCK_NONE:
+ break;
+ case CGRP_ATTACH_LOCK_GLOBAL:
percpu_up_write(&cgroup_threadgroup_rwsem);
+ break;
+ case CGRP_ATTACH_LOCK_PER_THREADGROUP:
+ up_write(&tsk->signal->cgroup_threadgroup_rwsem);
+ break;
+ default:
+ pr_warn("cgroup: Unexpected attach lock mode.");
+ break;
+ }
+
cpus_read_unlock();
}
@@ -2944,14 +3027,12 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
/* look up all src csets */
spin_lock_irq(&css_set_lock);
- rcu_read_lock();
task = leader;
do {
cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
if (!threadgroup)
break;
} while_each_thread(leader, task);
- rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
/* prepare dst csets and commit */
@@ -2968,7 +3049,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
}
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
- bool *threadgroup_locked)
+ enum cgroup_attach_lock_mode *lock_mode)
{
struct task_struct *tsk;
pid_t pid;
@@ -2976,24 +3057,13 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
return ERR_PTR(-EINVAL);
- /*
- * If we migrate a single thread, we don't care about threadgroup
- * stability. If the thread is `current`, it won't exit(2) under our
- * hands or change PID through exec(2). We exclude
- * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
- * callers by cgroup_mutex.
- * Therefore, we can skip the global lock.
- */
- lockdep_assert_held(&cgroup_mutex);
- *threadgroup_locked = pid || threadgroup;
- cgroup_attach_lock(*threadgroup_locked);
-
+retry_find_task:
rcu_read_lock();
if (pid) {
tsk = find_task_by_vpid(pid);
if (!tsk) {
tsk = ERR_PTR(-ESRCH);
- goto out_unlock_threadgroup;
+ goto out_unlock_rcu;
}
} else {
tsk = current;
@@ -3010,33 +3080,58 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
*/
if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
tsk = ERR_PTR(-EINVAL);
- goto out_unlock_threadgroup;
+ goto out_unlock_rcu;
}
-
get_task_struct(tsk);
- goto out_unlock_rcu;
+ rcu_read_unlock();
+
+ /*
+ * If we migrate a single thread, we don't care about threadgroup
+ * stability. If the thread is `current`, it won't exit(2) under our
+ * hands or change PID through exec(2). We exclude
+ * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write callers
+ * by cgroup_mutex. Therefore, we can skip the global lock.
+ */
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (pid || threadgroup) {
+ if (cgroup_enable_per_threadgroup_rwsem)
+ *lock_mode = CGRP_ATTACH_LOCK_PER_THREADGROUP;
+ else
+ *lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
+ } else {
+ *lock_mode = CGRP_ATTACH_LOCK_NONE;
+ }
+
+ cgroup_attach_lock(*lock_mode, tsk);
+
+ if (threadgroup) {
+ if (!thread_group_leader(tsk)) {
+ /*
+ * A race with de_thread from another thread's exec()
+ * may strip us of our leadership. If this happens,
+ * throw this task away and try again.
+ */
+ cgroup_attach_unlock(*lock_mode, tsk);
+ put_task_struct(tsk);
+ goto retry_find_task;
+ }
+ }
+
+ return tsk;
-out_unlock_threadgroup:
- cgroup_attach_unlock(*threadgroup_locked);
- *threadgroup_locked = false;
out_unlock_rcu:
rcu_read_unlock();
return tsk;
}
-void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
+void cgroup_procs_write_finish(struct task_struct *task,
+ enum cgroup_attach_lock_mode lock_mode)
{
- struct cgroup_subsys *ss;
- int ssid;
+ cgroup_attach_unlock(lock_mode, task);
/* release reference from cgroup_procs_write_start() */
put_task_struct(task);
-
- cgroup_attach_unlock(threadgroup_locked);
-
- for_each_subsys(ss, ssid)
- if (ss->post_attach)
- ss->post_attach();
}
static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
@@ -3088,6 +3183,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
struct cgroup_subsys_state *d_css;
struct cgroup *dsct;
struct css_set *src_cset;
+ enum cgroup_attach_lock_mode lock_mode;
bool has_tasks;
int ret;
@@ -3119,7 +3215,13 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
* write-locking can be skipped safely.
*/
has_tasks = !list_empty(&mgctx.preloaded_src_csets);
- cgroup_attach_lock(has_tasks);
+
+ if (has_tasks)
+ lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
+ else
+ lock_mode = CGRP_ATTACH_LOCK_NONE;
+
+ cgroup_attach_lock(lock_mode, NULL);
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(&mgctx);
@@ -3140,7 +3242,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
ret = cgroup_migrate_execute(&mgctx);
out_finish:
cgroup_migrate_finish(&mgctx);
- cgroup_attach_unlock(has_tasks);
+ cgroup_attach_unlock(lock_mode, NULL);
return ret;
}
@@ -3763,6 +3865,27 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
return 0;
}
+static int cgroup_core_local_stat_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ unsigned int sequence;
+ u64 freeze_time;
+
+ do {
+ sequence = read_seqcount_begin(&cgrp->freezer.freeze_seq);
+ freeze_time = cgrp->freezer.frozen_nsec;
+ /* Add in current freezer interval if the cgroup is freezing. */
+ if (test_bit(CGRP_FREEZE, &cgrp->flags))
+ freeze_time += (ktime_get_ns() -
+ cgrp->freezer.freeze_start_nsec);
+ } while (read_seqcount_retry(&cgrp->freezer.freeze_seq, sequence));
+
+ do_div(freeze_time, NSEC_PER_USEC);
+ seq_printf(seq, "frozen_usec %llu\n", freeze_time);
+
+ return 0;
+}
+
#ifdef CONFIG_CGROUP_SCHED
/**
* cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
@@ -4159,6 +4282,7 @@ static void cgroup_file_release(struct kernfs_open_file *of)
cft->release(of);
put_cgroup_ns(ctx->ns);
kfree(ctx);
+ of->priv = NULL;
}
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
@@ -5241,13 +5365,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
struct task_struct *task;
const struct cred *saved_cred;
ssize_t ret;
- bool threadgroup_locked;
+ enum cgroup_attach_lock_mode lock_mode;
dst_cgrp = cgroup_kn_lock_live(of->kn, false);
if (!dst_cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked);
+ task = cgroup_procs_write_start(buf, threadgroup, &lock_mode);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -5273,7 +5397,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
out_finish:
- cgroup_procs_write_finish(task, threadgroup_locked);
+ cgroup_procs_write_finish(task, lock_mode);
out_unlock:
cgroup_kn_unlock(of->kn);
@@ -5355,6 +5479,11 @@ static struct cftype cgroup_base_files[] = {
.seq_show = cgroup_stat_show,
},
{
+ .name = "cgroup.stat.local",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_core_local_stat_show,
+ },
+ {
.name = "cgroup.freeze",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_freeze_show,
@@ -5558,7 +5687,7 @@ static void css_release_work_fn(struct work_struct *work)
cgroup_unlock();
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
- queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
+ queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
}
static void css_release(struct percpu_ref *ref)
@@ -5567,7 +5696,7 @@ static void css_release(struct percpu_ref *ref)
container_of(ref, struct cgroup_subsys_state, refcnt);
INIT_WORK(&css->destroy_work, css_release_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
+ queue_work(cgroup_release_wq, &css->destroy_work);
}
static void init_and_link_css(struct cgroup_subsys_state *css,
@@ -5701,7 +5830,7 @@ err_list_del:
list_del_rcu(&css->sibling);
err_free_css:
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
- queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
+ queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
return ERR_PTR(err);
}
@@ -5763,6 +5892,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
* if the parent has to be frozen, the child has too.
*/
cgrp->freezer.e_freeze = parent->freezer.e_freeze;
+ seqcount_init(&cgrp->freezer.freeze_seq);
if (cgrp->freezer.e_freeze) {
/*
* Set the CGRP_FREEZE flag, so when a process will be
@@ -5771,6 +5901,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
* consider it frozen immediately.
*/
set_bit(CGRP_FREEZE, &cgrp->flags);
+ cgrp->freezer.freeze_start_nsec = ktime_get_ns();
set_bit(CGRP_FROZEN, &cgrp->flags);
}
@@ -5939,7 +6070,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
if (atomic_dec_and_test(&css->online_cnt)) {
INIT_WORK(&css->destroy_work, css_killed_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
+ queue_work(cgroup_offline_wq, &css->destroy_work);
}
}
@@ -6312,6 +6443,7 @@ int __init cgroup_init(void)
WARN_ON(register_filesystem(&cpuset_fs_type));
#endif
+ ns_tree_add(&init_cgroup_ns);
return 0;
}
@@ -6325,8 +6457,14 @@ static int __init cgroup_wq_init(void)
* We would prefer to do this in cgroup_init() above, but that
* is called before init_workqueues(): so leave this until after.
*/
- cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
- BUG_ON(!cgroup_destroy_wq);
+ cgroup_offline_wq = alloc_workqueue("cgroup_offline", WQ_PERCPU, 1);
+ BUG_ON(!cgroup_offline_wq);
+
+ cgroup_release_wq = alloc_workqueue("cgroup_release", WQ_PERCPU, 1);
+ BUG_ON(!cgroup_release_wq);
+
+ cgroup_free_wq = alloc_workqueue("cgroup_free", WQ_PERCPU, 1);
+ BUG_ON(!cgroup_free_wq);
return 0;
}
core_initcall(cgroup_wq_init);
@@ -6343,15 +6481,15 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
}
/*
- * cgroup_get_from_id : get the cgroup associated with cgroup id
+ * __cgroup_get_from_id : get the cgroup associated with cgroup id
* @id: cgroup id
* On success return the cgrp or ERR_PTR on failure
- * Only cgroups within current task's cgroup NS are valid.
+ * There are no cgroup NS restrictions.
*/
-struct cgroup *cgroup_get_from_id(u64 id)
+struct cgroup *__cgroup_get_from_id(u64 id)
{
struct kernfs_node *kn;
- struct cgroup *cgrp, *root_cgrp;
+ struct cgroup *cgrp;
kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
if (!kn)
@@ -6373,6 +6511,22 @@ struct cgroup *cgroup_get_from_id(u64 id)
if (!cgrp)
return ERR_PTR(-ENOENT);
+ return cgrp;
+}
+
+/*
+ * cgroup_get_from_id : get the cgroup associated with cgroup id
+ * @id: cgroup id
+ * On success return the cgrp or ERR_PTR on failure
+ * Only cgroups within current task's cgroup NS are valid.
+ */
+struct cgroup *cgroup_get_from_id(u64 id)
+{
+ struct cgroup *cgrp, *root_cgrp;
+
+ cgrp = __cgroup_get_from_id(id);
+ if (IS_ERR(cgrp))
+ return cgrp;
root_cgrp = current_cgns_cgroup_dfl();
if (!cgroup_is_descendant(cgrp, root_cgrp)) {
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index 383963e28ac6..337608f408ce 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -38,7 +38,6 @@ enum prs_errcode {
/* bits in struct cpuset flags field */
typedef enum {
- CS_ONLINE,
CS_CPU_EXCLUSIVE,
CS_MEM_EXCLUSIVE,
CS_MEM_HARDWALL,
@@ -202,7 +201,7 @@ static inline struct cpuset *parent_cs(struct cpuset *cs)
/* convenient tests for these bits */
static inline bool is_cpuset_online(struct cpuset *cs)
{
- return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
+ return css_is_online(&cs->css) && !css_is_dying(&cs->css);
}
static inline int is_cpu_exclusive(const struct cpuset *cs)
@@ -277,6 +276,8 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on)
ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
int cpuset_common_seq_show(struct seq_file *sf, void *v);
+void cpuset_full_lock(void);
+void cpuset_full_unlock(void);
/*
* cpuset-v1.c
diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
index b69a7db67090..12e76774c75b 100644
--- a/kernel/cgroup/cpuset-v1.c
+++ b/kernel/cgroup/cpuset-v1.c
@@ -169,8 +169,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = -ENODEV;
- cpus_read_lock();
- cpuset_lock();
+ cpuset_full_lock();
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -184,8 +183,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
break;
}
out_unlock:
- cpuset_unlock();
- cpus_read_unlock();
+ cpuset_full_unlock();
return retval;
}
@@ -454,8 +452,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = 0;
- cpus_read_lock();
- cpuset_lock();
+ cpuset_full_lock();
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
goto out_unlock;
@@ -498,8 +495,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
break;
}
out_unlock:
- cpuset_unlock();
- cpus_read_unlock();
+ cpuset_full_unlock();
return retval;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 27adb04df675..52468d2c178a 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -40,6 +40,7 @@
#include <linux/sched/isolation.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
+#include <linux/task_work.h>
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
@@ -131,11 +132,6 @@ static bool force_sd_rebuild;
#define PRS_INVALID_ROOT -1
#define PRS_INVALID_ISOLATED -2
-static inline bool is_prs_invalid(int prs_state)
-{
- return prs_state < 0;
-}
-
/*
* Temporary cpumasks for working with partitions that are passed among
* functions to avoid memory allocation in inner functions.
@@ -159,16 +155,21 @@ void dec_dl_tasks_cs(struct task_struct *p)
cs->nr_deadline_tasks--;
}
-static inline int is_partition_valid(const struct cpuset *cs)
+static inline bool is_partition_valid(const struct cpuset *cs)
{
return cs->partition_root_state > 0;
}
-static inline int is_partition_invalid(const struct cpuset *cs)
+static inline bool is_partition_invalid(const struct cpuset *cs)
{
return cs->partition_root_state < 0;
}
+static inline bool cs_is_member(const struct cpuset *cs)
+{
+ return cs->partition_root_state == PRS_MEMBER;
+}
+
/*
* Callers should hold callback_lock to modify partition_root_state.
*/
@@ -207,7 +208,7 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs)
* parallel, we may leave an offline CPU in cpu_allowed or some other masks.
*/
static struct cpuset top_cpuset = {
- .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) |
+ .flags = BIT(CS_CPU_EXCLUSIVE) |
BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
.partition_root_state = PRS_ROOT,
.relax_domain_level = -1,
@@ -250,6 +251,12 @@ static struct cpuset top_cpuset = {
static DEFINE_MUTEX(cpuset_mutex);
+/**
+ * cpuset_lock - Acquire the global cpuset mutex
+ *
+ * This locks the global cpuset mutex to prevent modifications to cpuset
+ * hierarchy and configurations. This helper is not enough to make modification.
+ */
void cpuset_lock(void)
{
mutex_lock(&cpuset_mutex);
@@ -260,6 +267,24 @@ void cpuset_unlock(void)
mutex_unlock(&cpuset_mutex);
}
+/**
+ * cpuset_full_lock - Acquire full protection for cpuset modification
+ *
+ * Takes both CPU hotplug read lock (cpus_read_lock()) and cpuset mutex
+ * to safely modify cpuset data.
+ */
+void cpuset_full_lock(void)
+{
+ cpus_read_lock();
+ mutex_lock(&cpuset_mutex);
+}
+
+void cpuset_full_unlock(void)
+{
+ mutex_unlock(&cpuset_mutex);
+ cpus_read_unlock();
+}
+
static DEFINE_SPINLOCK(callback_lock);
void cpuset_callback_lock_irq(void)
@@ -411,94 +436,104 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
}
/**
- * alloc_cpumasks - allocate three cpumasks for cpuset
- * @cs: the cpuset that have cpumasks to be allocated.
- * @tmp: the tmpmasks structure pointer
+ * alloc_cpumasks - Allocate an array of cpumask variables
+ * @pmasks: Pointer to array of cpumask_var_t pointers
+ * @size: Number of cpumasks to allocate
* Return: 0 if successful, -ENOMEM otherwise.
*
- * Only one of the two input arguments should be non-NULL.
+ * Allocates @size cpumasks and initializes them to empty. Returns 0 on
+ * success, -ENOMEM on allocation failure. On failure, any previously
+ * allocated cpumasks are freed.
*/
-static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
+static inline int alloc_cpumasks(cpumask_var_t *pmasks[], u32 size)
{
- cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4;
+ int i;
- if (cs) {
- pmask1 = &cs->cpus_allowed;
- pmask2 = &cs->effective_cpus;
- pmask3 = &cs->effective_xcpus;
- pmask4 = &cs->exclusive_cpus;
- } else {
- pmask1 = &tmp->new_cpus;
- pmask2 = &tmp->addmask;
- pmask3 = &tmp->delmask;
- pmask4 = NULL;
+ for (i = 0; i < size; i++) {
+ if (!zalloc_cpumask_var(pmasks[i], GFP_KERNEL)) {
+ while (--i >= 0)
+ free_cpumask_var(*pmasks[i]);
+ return -ENOMEM;
+ }
}
-
- if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
- return -ENOMEM;
-
- if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
- goto free_one;
-
- if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
- goto free_two;
-
- if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL))
- goto free_three;
-
-
return 0;
+}
+
+/**
+ * alloc_tmpmasks - Allocate temporary cpumasks for cpuset operations.
+ * @tmp: Pointer to tmpmasks structure to populate
+ * Return: 0 on success, -ENOMEM on allocation failure
+ */
+static inline int alloc_tmpmasks(struct tmpmasks *tmp)
+{
+ /*
+ * Array of pointers to the three cpumask_var_t fields in tmpmasks.
+ * Note: Array size must match actual number of masks (3)
+ */
+ cpumask_var_t *pmask[3] = {
+ &tmp->new_cpus,
+ &tmp->addmask,
+ &tmp->delmask
+ };
-free_three:
- free_cpumask_var(*pmask3);
-free_two:
- free_cpumask_var(*pmask2);
-free_one:
- free_cpumask_var(*pmask1);
- return -ENOMEM;
+ return alloc_cpumasks(pmask, ARRAY_SIZE(pmask));
}
/**
- * free_cpumasks - free cpumasks in a tmpmasks structure
- * @cs: the cpuset that have cpumasks to be free.
+ * free_tmpmasks - free cpumasks in a tmpmasks structure
* @tmp: the tmpmasks structure pointer
*/
-static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
+static inline void free_tmpmasks(struct tmpmasks *tmp)
{
- if (cs) {
- free_cpumask_var(cs->cpus_allowed);
- free_cpumask_var(cs->effective_cpus);
- free_cpumask_var(cs->effective_xcpus);
- free_cpumask_var(cs->exclusive_cpus);
- }
- if (tmp) {
- free_cpumask_var(tmp->new_cpus);
- free_cpumask_var(tmp->addmask);
- free_cpumask_var(tmp->delmask);
- }
+ if (!tmp)
+ return;
+
+ free_cpumask_var(tmp->new_cpus);
+ free_cpumask_var(tmp->addmask);
+ free_cpumask_var(tmp->delmask);
}
/**
- * alloc_trial_cpuset - allocate a trial cpuset
- * @cs: the cpuset that the trial cpuset duplicates
+ * dup_or_alloc_cpuset - Duplicate or allocate a new cpuset
+ * @cs: Source cpuset to duplicate (NULL for a fresh allocation)
+ *
+ * Creates a new cpuset by either:
+ * 1. Duplicating an existing cpuset (if @cs is non-NULL), or
+ * 2. Allocating a fresh cpuset with zero-initialized masks (if @cs is NULL)
+ *
+ * Return: Pointer to newly allocated cpuset on success, NULL on failure
*/
-static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
+static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs)
{
struct cpuset *trial;
- trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
+ /* Allocate base structure */
+ trial = cs ? kmemdup(cs, sizeof(*cs), GFP_KERNEL) :
+ kzalloc(sizeof(*cs), GFP_KERNEL);
if (!trial)
return NULL;
- if (alloc_cpumasks(trial, NULL)) {
+ /* Setup cpumask pointer array */
+ cpumask_var_t *pmask[4] = {
+ &trial->cpus_allowed,
+ &trial->effective_cpus,
+ &trial->effective_xcpus,
+ &trial->exclusive_cpus
+ };
+
+ if (alloc_cpumasks(pmask, ARRAY_SIZE(pmask))) {
kfree(trial);
return NULL;
}
- cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
- cpumask_copy(trial->effective_cpus, cs->effective_cpus);
- cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
- cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);
+ /* Copy masks if duplicating */
+ if (cs) {
+ cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+ cpumask_copy(trial->effective_cpus, cs->effective_cpus);
+ cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
+ cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);
+ }
+
return trial;
}
@@ -508,7 +543,10 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
*/
static inline void free_cpuset(struct cpuset *cs)
{
- free_cpumasks(cs, NULL);
+ free_cpumask_var(cs->cpus_allowed);
+ free_cpumask_var(cs->effective_cpus);
+ free_cpumask_var(cs->effective_xcpus);
+ free_cpumask_var(cs->exclusive_cpus);
kfree(cs);
}
@@ -540,6 +578,47 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
return true;
}
+/**
+ * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts
+ * @cs1: first cpuset to check
+ * @cs2: second cpuset to check
+ *
+ * Returns: true if CPU exclusivity conflict exists, false otherwise
+ *
+ * Conflict detection rules:
+ * 1. If either cpuset is CPU exclusive, they must be mutually exclusive
+ * 2. exclusive_cpus masks cannot intersect between cpusets
+ * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs
+ */
+static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
+{
+ /* If either cpuset is exclusive, check if they are mutually exclusive */
+ if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))
+ return !cpusets_are_exclusive(cs1, cs2);
+
+ /* Exclusive_cpus cannot intersect */
+ if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus))
+ return true;
+
+ /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */
+ if (!cpumask_empty(cs1->cpus_allowed) &&
+ cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus))
+ return true;
+
+ if (!cpumask_empty(cs2->cpus_allowed) &&
+ cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus))
+ return true;
+
+ return false;
+}
+
+static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
+{
+ if ((is_mem_exclusive(cs1) || is_mem_exclusive(cs2)))
+ return nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
+ return false;
+}
+
/*
* validate_change() - Used to validate that any proposed cpuset change
* follows the structural rules for cpusets.
@@ -621,38 +700,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
*/
ret = -EINVAL;
cpuset_for_each_child(c, css, par) {
- bool txset, cxset; /* Are exclusive_cpus set? */
-
if (c == cur)
continue;
-
- txset = !cpumask_empty(trial->exclusive_cpus);
- cxset = !cpumask_empty(c->exclusive_cpus);
- if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) ||
- (txset && cxset)) {
- if (!cpusets_are_exclusive(trial, c))
- goto out;
- } else if (txset || cxset) {
- struct cpumask *xcpus, *acpus;
-
- /*
- * When just one of the exclusive_cpus's is set,
- * cpus_allowed of the other cpuset, if set, cannot be
- * a subset of it or none of those CPUs will be
- * available if these exclusive CPUs are activated.
- */
- if (txset) {
- xcpus = trial->exclusive_cpus;
- acpus = c->cpus_allowed;
- } else {
- xcpus = c->exclusive_cpus;
- acpus = trial->cpus_allowed;
- }
- if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus))
- goto out;
- }
- if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
- nodes_intersects(trial->mems_allowed, c->mems_allowed))
+ if (cpus_excl_conflict(trial, c))
+ goto out;
+ if (mems_excl_conflict(trial, c))
goto out;
}
@@ -1363,38 +1415,25 @@ bool cpuset_cpu_is_isolated(int cpu)
}
EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
-/*
- * compute_effective_exclusive_cpumask - compute effective exclusive CPUs
- * @cs: cpuset
- * @xcpus: effective exclusive CPUs value to be set
- * @real_cs: the real cpuset (can be NULL)
- * Return: 0 if there is no sibling conflict, > 0 otherwise
+/**
+ * rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets
+ * @parent: Parent cpuset containing all siblings
+ * @cs: Current cpuset (will be skipped)
+ * @excpus: exclusive effective CPU mask to modify
*
- * If exclusive_cpus isn't explicitly set or a real_cs is provided, we have to
- * scan the sibling cpusets and exclude their exclusive_cpus or effective_xcpus
- * as well. The provision of real_cs means that a cpumask is being changed and
- * the given cs is a trial one.
+ * This function ensures the given @excpus mask doesn't include any CPUs that
+ * are exclusively allocated to sibling cpusets. It walks through all siblings
+ * of @cs under @parent and removes their exclusive CPUs from @excpus.
*/
-static int compute_effective_exclusive_cpumask(struct cpuset *cs,
- struct cpumask *xcpus,
- struct cpuset *real_cs)
+static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs,
+ struct cpumask *excpus)
{
struct cgroup_subsys_state *css;
- struct cpuset *parent = parent_cs(cs);
struct cpuset *sibling;
int retval = 0;
- if (!xcpus)
- xcpus = cs->effective_xcpus;
-
- cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus);
-
- if (!real_cs) {
- if (!cpumask_empty(cs->exclusive_cpus))
- return 0;
- } else {
- cs = real_cs;
- }
+ if (cpumask_empty(excpus))
+ return retval;
/*
* Exclude exclusive CPUs from siblings
@@ -1404,20 +1443,66 @@ static int compute_effective_exclusive_cpumask(struct cpuset *cs,
if (sibling == cs)
continue;
- if (cpumask_intersects(xcpus, sibling->exclusive_cpus)) {
- cpumask_andnot(xcpus, xcpus, sibling->exclusive_cpus);
+ if (cpumask_intersects(excpus, sibling->exclusive_cpus)) {
+ cpumask_andnot(excpus, excpus, sibling->exclusive_cpus);
retval++;
continue;
}
- if (cpumask_intersects(xcpus, sibling->effective_xcpus)) {
- cpumask_andnot(xcpus, xcpus, sibling->effective_xcpus);
+ if (cpumask_intersects(excpus, sibling->effective_xcpus)) {
+ cpumask_andnot(excpus, excpus, sibling->effective_xcpus);
retval++;
}
}
rcu_read_unlock();
+
return retval;
}
+/*
+ * compute_excpus - compute effective exclusive CPUs
+ * @cs: cpuset
+ * @xcpus: effective exclusive CPUs value to be set
+ * Return: 0 if there is no sibling conflict, > 0 otherwise
+ *
+ * If exclusive_cpus isn't explicitly set , we have to scan the sibling cpusets
+ * and exclude their exclusive_cpus or effective_xcpus as well.
+ */
+static int compute_excpus(struct cpuset *cs, struct cpumask *excpus)
+{
+ struct cpuset *parent = parent_cs(cs);
+
+ cpumask_and(excpus, user_xcpus(cs), parent->effective_xcpus);
+
+ if (!cpumask_empty(cs->exclusive_cpus))
+ return 0;
+
+ return rm_siblings_excl_cpus(parent, cs, excpus);
+}
+
+/*
+ * compute_trialcs_excpus - Compute effective exclusive CPUs for a trial cpuset
+ * @trialcs: The trial cpuset containing the proposed new configuration
+ * @cs: The original cpuset that the trial configuration is based on
+ * Return: 0 if successful with no sibling conflict, >0 if a conflict is found
+ *
+ * Computes the effective_xcpus for a trial configuration. @cs is provided to represent
+ * the real cs.
+ */
+static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs)
+{
+ struct cpuset *parent = parent_cs(trialcs);
+ struct cpumask *excpus = trialcs->effective_xcpus;
+
+ /* trialcs is member, cpuset.cpus has no impact to excpus */
+ if (cs_is_member(cs))
+ cpumask_and(excpus, trialcs->exclusive_cpus,
+ parent->effective_xcpus);
+ else
+ cpumask_and(excpus, user_xcpus(trialcs), parent->effective_xcpus);
+
+ return rm_siblings_excl_cpus(parent, cs, excpus);
+}
+
static inline bool is_remote_partition(struct cpuset *cs)
{
return !list_empty(&cs->remote_sibling);
@@ -1459,7 +1544,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
* Note that creating a remote partition with any local partition root
* above it or remote partition root underneath it is not allowed.
*/
- compute_effective_exclusive_cpumask(cs, tmp->new_cpus, NULL);
+ compute_excpus(cs, tmp->new_cpus);
WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus));
if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) ||
cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
@@ -1508,7 +1593,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
cs->partition_root_state = PRS_MEMBER;
/* effective_xcpus may need to be changed */
- compute_effective_exclusive_cpumask(cs, NULL, NULL);
+ compute_excpus(cs, cs->effective_xcpus);
reset_partition_data(cs);
spin_unlock_irq(&callback_lock);
update_unbound_workqueue_cpumask(isolcpus_updated);
@@ -1677,7 +1762,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
old_prs = new_prs = cs->partition_root_state;
if (cmd == partcmd_invalidate) {
- if (is_prs_invalid(old_prs))
+ if (is_partition_invalid(cs))
return 0;
/*
@@ -1709,13 +1794,14 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
/*
- * Need to call compute_effective_exclusive_cpumask() in case
+ * Need to call compute_excpus() in case
* exclusive_cpus not set. Sibling conflict should only happen
* if exclusive_cpus isn't set.
*/
xcpus = tmp->delmask;
- if (compute_effective_exclusive_cpumask(cs, xcpus, NULL))
+ if (compute_excpus(cs, xcpus))
WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus));
+ new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
/*
* Enabling partition root is not allowed if its
@@ -1727,11 +1813,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
if (prstate_housekeeping_conflict(new_prs, xcpus))
return PERR_HKEEPING;
- /*
- * A parent can be left with no CPU as long as there is no
- * task directly associated with the parent partition.
- */
- if (nocpu)
+ if (tasks_nocpu_error(parent, cs, xcpus))
return PERR_NOCPUS;
/*
@@ -1748,7 +1830,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
deleting = true;
subparts_delta++;
- new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
} else if (cmd == partcmd_disable) {
/*
* May need to add cpus back to parent's effective_cpus
@@ -1788,7 +1869,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
* For invalid partition:
* delmask = newmask & parent->effective_xcpus
*/
- if (is_prs_invalid(old_prs)) {
+ if (is_partition_invalid(cs)) {
adding = false;
deleting = cpumask_and(tmp->delmask,
newmask, parent->effective_xcpus);
@@ -1837,7 +1918,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
* A partition error happens when parent has tasks and all
* its effective CPUs will have to be distributed out.
*/
- WARN_ON_ONCE(!is_partition_valid(parent));
if (nocpu) {
part_error = PERR_NOCPUS;
if (is_partition_valid(cs))
@@ -1996,7 +2076,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
* 2) All the effective_cpus will be used up and cp
* has tasks
*/
- compute_effective_exclusive_cpumask(cs, new_ecpus, NULL);
+ compute_excpus(cs, new_ecpus);
cpumask_and(new_ecpus, new_ecpus, cpu_active_mask);
rcu_read_lock();
@@ -2075,7 +2155,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
* its value is being processed.
*/
if (remote && (cp != cs)) {
- compute_effective_exclusive_cpumask(cp, tmp->new_cpus, NULL);
+ compute_excpus(cp, tmp->new_cpus);
if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
@@ -2177,7 +2257,7 @@ get_css:
cpumask_copy(cp->effective_cpus, tmp->new_cpus);
cp->partition_root_state = new_prs;
if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs))
- compute_effective_exclusive_cpumask(cp, NULL, NULL);
+ compute_excpus(cp, cp->effective_xcpus);
/*
* Make sure effective_xcpus is properly set for a valid
@@ -2284,82 +2364,54 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
rcu_read_unlock();
}
-/**
- * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
- * @cs: the cpuset to consider
- * @trialcs: trial cpuset
- * @buf: buffer of cpu numbers written to this cpuset
- */
-static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
- const char *buf)
+static int parse_cpuset_cpulist(const char *buf, struct cpumask *out_mask)
{
int retval;
- struct tmpmasks tmp;
- struct cpuset *parent = parent_cs(cs);
- bool invalidate = false;
- bool force = false;
- int old_prs = cs->partition_root_state;
- /* top_cpuset.cpus_allowed tracks cpu_active_mask; it's read-only */
- if (cs == &top_cpuset)
- return -EACCES;
+ retval = cpulist_parse(buf, out_mask);
+ if (retval < 0)
+ return retval;
+ if (!cpumask_subset(out_mask, top_cpuset.cpus_allowed))
+ return -EINVAL;
- /*
- * An empty cpus_allowed is ok only if the cpuset has no tasks.
- * Since cpulist_parse() fails on an empty mask, we special case
- * that parsing. The validate_change() call ensures that cpusets
- * with tasks have cpus.
- */
- if (!*buf) {
- cpumask_clear(trialcs->cpus_allowed);
- if (cpumask_empty(trialcs->exclusive_cpus))
- cpumask_clear(trialcs->effective_xcpus);
- } else {
- retval = cpulist_parse(buf, trialcs->cpus_allowed);
- if (retval < 0)
- return retval;
+ return 0;
+}
- if (!cpumask_subset(trialcs->cpus_allowed,
- top_cpuset.cpus_allowed))
- return -EINVAL;
+/**
+ * validate_partition - Validate a cpuset partition configuration
+ * @cs: The cpuset to validate
+ * @trialcs: The trial cpuset containing proposed configuration changes
+ *
+ * If any validation check fails, the appropriate error code is set in the
+ * cpuset's prs_err field.
+ *
+ * Return: PRS error code (0 if valid, non-zero error code if invalid)
+ */
+static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *trialcs)
+{
+ struct cpuset *parent = parent_cs(cs);
- /*
- * When exclusive_cpus isn't explicitly set, it is constrained
- * by cpus_allowed and parent's effective_xcpus. Otherwise,
- * trialcs->effective_xcpus is used as a temporary cpumask
- * for checking validity of the partition root.
- */
- trialcs->partition_root_state = PRS_MEMBER;
- if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs))
- compute_effective_exclusive_cpumask(trialcs, NULL, cs);
- }
+ if (cs_is_member(trialcs))
+ return PERR_NONE;
- /* Nothing to do if the cpus didn't change */
- if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
- return 0;
+ if (cpumask_empty(trialcs->effective_xcpus))
+ return PERR_INVCPUS;
- if (alloc_cpumasks(NULL, &tmp))
- return -ENOMEM;
+ if (prstate_housekeeping_conflict(trialcs->partition_root_state,
+ trialcs->effective_xcpus))
+ return PERR_HKEEPING;
- if (old_prs) {
- if (is_partition_valid(cs) &&
- cpumask_empty(trialcs->effective_xcpus)) {
- invalidate = true;
- cs->prs_err = PERR_INVCPUS;
- } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
- invalidate = true;
- cs->prs_err = PERR_HKEEPING;
- } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
- invalidate = true;
- cs->prs_err = PERR_NOCPUS;
- }
- }
+ if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus))
+ return PERR_NOCPUS;
- /*
- * Check all the descendants in update_cpumasks_hier() if
- * effective_xcpus is to be changed.
- */
- force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
+ return PERR_NONE;
+}
+
+static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs,
+ struct tmpmasks *tmp)
+{
+ int retval;
+ struct cpuset *parent = parent_cs(cs);
retval = validate_change(cs, trialcs);
@@ -2374,7 +2426,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* partition. However, any conflicting sibling partitions
* have to be marked as invalid too.
*/
- invalidate = true;
+ trialcs->prs_err = PERR_NOTEXCL;
rcu_read_lock();
cpuset_for_each_child(cp, css, parent) {
struct cpumask *xcpus = user_xcpus(trialcs);
@@ -2382,36 +2434,92 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (is_partition_valid(cp) &&
cpumask_intersects(xcpus, cp->effective_xcpus)) {
rcu_read_unlock();
- update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp);
+ update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp);
rcu_read_lock();
}
}
rcu_read_unlock();
retval = 0;
}
+ return retval;
+}
- if (retval < 0)
- goto out_free;
+/**
+ * partition_cpus_change - Handle partition state changes due to CPU mask updates
+ * @cs: The target cpuset being modified
+ * @trialcs: The trial cpuset containing proposed configuration changes
+ * @tmp: Temporary masks for intermediate calculations
+ *
+ * This function handles partition state transitions triggered by CPU mask changes.
+ * CPU modifications may cause a partition to be disabled or require state updates.
+ */
+static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs,
+ struct tmpmasks *tmp)
+{
+ enum prs_errcode prs_err;
- if (is_partition_valid(cs) ||
- (is_partition_invalid(cs) && !invalidate)) {
- struct cpumask *xcpus = trialcs->effective_xcpus;
+ if (cs_is_member(cs))
+ return;
- if (cpumask_empty(xcpus) && is_partition_invalid(cs))
- xcpus = trialcs->cpus_allowed;
+ prs_err = validate_partition(cs, trialcs);
+ if (prs_err)
+ trialcs->prs_err = cs->prs_err = prs_err;
- /*
- * Call remote_cpus_update() to handle valid remote partition
- */
- if (is_remote_partition(cs))
- remote_cpus_update(cs, NULL, xcpus, &tmp);
- else if (invalidate)
+ if (is_remote_partition(cs)) {
+ if (trialcs->prs_err)
+ remote_partition_disable(cs, tmp);
+ else
+ remote_cpus_update(cs, trialcs->exclusive_cpus,
+ trialcs->effective_xcpus, tmp);
+ } else {
+ if (trialcs->prs_err)
update_parent_effective_cpumask(cs, partcmd_invalidate,
- NULL, &tmp);
+ NULL, tmp);
else
update_parent_effective_cpumask(cs, partcmd_update,
- xcpus, &tmp);
+ trialcs->effective_xcpus, tmp);
}
+}
+
+/**
+ * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
+ * @cs: the cpuset to consider
+ * @trialcs: trial cpuset
+ * @buf: buffer of cpu numbers written to this cpuset
+ */
+static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+ const char *buf)
+{
+ int retval;
+ struct tmpmasks tmp;
+ bool force = false;
+ int old_prs = cs->partition_root_state;
+
+ retval = parse_cpuset_cpulist(buf, trialcs->cpus_allowed);
+ if (retval < 0)
+ return retval;
+
+ /* Nothing to do if the cpus didn't change */
+ if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
+ return 0;
+
+ if (alloc_tmpmasks(&tmp))
+ return -ENOMEM;
+
+ compute_trialcs_excpus(trialcs, cs);
+ trialcs->prs_err = PERR_NONE;
+
+ retval = cpus_allowed_validate_change(cs, trialcs, &tmp);
+ if (retval < 0)
+ goto out_free;
+
+ /*
+ * Check all the descendants in update_cpumasks_hier() if
+ * effective_xcpus is to be changed.
+ */
+ force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
+
+ partition_cpus_change(cs, trialcs, &tmp);
spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
@@ -2427,7 +2535,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cs->partition_root_state)
update_partition_sd_lb(cs, old_prs);
out_free:
- free_cpumasks(NULL, &tmp);
+ free_tmpmasks(&tmp);
return retval;
}
@@ -2444,33 +2552,23 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
{
int retval;
struct tmpmasks tmp;
- struct cpuset *parent = parent_cs(cs);
- bool invalidate = false;
bool force = false;
int old_prs = cs->partition_root_state;
- if (!*buf) {
- cpumask_clear(trialcs->exclusive_cpus);
- cpumask_clear(trialcs->effective_xcpus);
- } else {
- retval = cpulist_parse(buf, trialcs->exclusive_cpus);
- if (retval < 0)
- return retval;
- }
+ retval = parse_cpuset_cpulist(buf, trialcs->exclusive_cpus);
+ if (retval < 0)
+ return retval;
/* Nothing to do if the CPUs didn't change */
if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
return 0;
- if (*buf) {
- trialcs->partition_root_state = PRS_MEMBER;
- /*
- * Reject the change if there is exclusive CPUs conflict with
- * the siblings.
- */
- if (compute_effective_exclusive_cpumask(trialcs, NULL, cs))
- return -EINVAL;
- }
+ /*
+ * Reject the change if there is exclusive CPUs conflict with
+ * the siblings.
+ */
+ if (compute_trialcs_excpus(trialcs, cs))
+ return -EINVAL;
/*
* Check all the descendants in update_cpumasks_hier() if
@@ -2482,35 +2580,12 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval)
return retval;
- if (alloc_cpumasks(NULL, &tmp))
+ if (alloc_tmpmasks(&tmp))
return -ENOMEM;
- if (old_prs) {
- if (cpumask_empty(trialcs->effective_xcpus)) {
- invalidate = true;
- cs->prs_err = PERR_INVCPUS;
- } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
- invalidate = true;
- cs->prs_err = PERR_HKEEPING;
- } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
- invalidate = true;
- cs->prs_err = PERR_NOCPUS;
- }
+ trialcs->prs_err = PERR_NONE;
+ partition_cpus_change(cs, trialcs, &tmp);
- if (is_remote_partition(cs)) {
- if (invalidate)
- remote_partition_disable(cs, &tmp);
- else
- remote_cpus_update(cs, trialcs->exclusive_cpus,
- trialcs->effective_xcpus, &tmp);
- } else if (invalidate) {
- update_parent_effective_cpumask(cs, partcmd_invalidate,
- NULL, &tmp);
- } else {
- update_parent_effective_cpumask(cs, partcmd_update,
- trialcs->effective_xcpus, &tmp);
- }
- }
spin_lock_irq(&callback_lock);
cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);
cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
@@ -2530,7 +2605,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cs->partition_root_state)
update_partition_sd_lb(cs, old_prs);
- free_cpumasks(NULL, &tmp);
+ free_tmpmasks(&tmp);
return 0;
}
@@ -2582,9 +2657,24 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
}
}
-static void cpuset_post_attach(void)
+static void flush_migrate_mm_task_workfn(struct callback_head *head)
{
flush_workqueue(cpuset_migrate_mm_wq);
+ kfree(head);
+}
+
+static void schedule_flush_migrate_mm(void)
+{
+ struct callback_head *flush_cb;
+
+ flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL);
+ if (!flush_cb)
+ return;
+
+ init_task_work(flush_cb, flush_migrate_mm_task_workfn);
+
+ if (task_work_add(current, flush_cb, TWA_RESUME))
+ kfree(flush_cb);
}
/*
@@ -2750,32 +2840,17 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
int retval;
/*
- * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
- * it's read-only
- */
- if (cs == &top_cpuset) {
- retval = -EACCES;
- goto done;
- }
-
- /*
* An empty mems_allowed is ok iff there are no tasks in the cpuset.
- * Since nodelist_parse() fails on an empty mask, we special case
- * that parsing. The validate_change() call ensures that cpusets
- * with tasks have memory.
+ * The validate_change() call ensures that cpusets with tasks have memory.
*/
- if (!*buf) {
- nodes_clear(trialcs->mems_allowed);
- } else {
- retval = nodelist_parse(buf, trialcs->mems_allowed);
- if (retval < 0)
- goto done;
+ retval = nodelist_parse(buf, trialcs->mems_allowed);
+ if (retval < 0)
+ goto done;
- if (!nodes_subset(trialcs->mems_allowed,
- top_cpuset.mems_allowed)) {
- retval = -EINVAL;
- goto done;
- }
+ if (!nodes_subset(trialcs->mems_allowed,
+ top_cpuset.mems_allowed)) {
+ retval = -EINVAL;
+ goto done;
}
if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
@@ -2826,7 +2901,7 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
int spread_flag_changed;
int err;
- trialcs = alloc_trial_cpuset(cs);
+ trialcs = dup_or_alloc_cpuset(cs);
if (!trialcs)
return -ENOMEM;
@@ -2884,10 +2959,10 @@ static int update_prstate(struct cpuset *cs, int new_prs)
/*
* Treat a previously invalid partition root as if it is a "member".
*/
- if (new_prs && is_prs_invalid(old_prs))
+ if (new_prs && is_partition_invalid(cs))
old_prs = PRS_MEMBER;
- if (alloc_cpumasks(NULL, &tmpmask))
+ if (alloc_tmpmasks(&tmpmask))
return -ENOMEM;
err = update_partition_exclusive_flag(cs, new_prs);
@@ -2983,7 +3058,7 @@ out:
notify_partition_change(cs, old_prs);
if (force_sd_rebuild)
rebuild_sched_domains_locked();
- free_cpumasks(NULL, &tmpmask);
+ free_tmpmasks(&tmpmask);
return 0;
}
@@ -3141,6 +3216,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
struct cpuset *cs;
struct cpuset *oldcs = cpuset_attach_old_cs;
bool cpus_updated, mems_updated;
+ bool queue_task_work = false;
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
@@ -3191,15 +3267,18 @@ static void cpuset_attach(struct cgroup_taskset *tset)
* @old_mems_allowed is the right nodesets that we
* migrate mm from.
*/
- if (is_memory_migrate(cs))
+ if (is_memory_migrate(cs)) {
cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
&cpuset_attach_nodemask_to);
- else
+ queue_task_work = true;
+ } else
mmput(mm);
}
}
out:
+ if (queue_task_work)
+ schedule_flush_migrate_mm();
cs->old_mems_allowed = cpuset_attach_nodemask_to;
if (cs->nr_migrate_dl_tasks) {
@@ -3223,13 +3302,16 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
struct cpuset *trialcs;
int retval = -ENODEV;
+ /* root is read-only */
+ if (cs == &top_cpuset)
+ return -EACCES;
+
buf = strstrip(buf);
- cpus_read_lock();
- mutex_lock(&cpuset_mutex);
+ cpuset_full_lock();
if (!is_cpuset_online(cs))
goto out_unlock;
- trialcs = alloc_trial_cpuset(cs);
+ trialcs = dup_or_alloc_cpuset(cs);
if (!trialcs) {
retval = -ENOMEM;
goto out_unlock;
@@ -3254,9 +3336,9 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
if (force_sd_rebuild)
rebuild_sched_domains_locked();
out_unlock:
- mutex_unlock(&cpuset_mutex);
- cpus_read_unlock();
- flush_workqueue(cpuset_migrate_mm_wq);
+ cpuset_full_unlock();
+ if (of_cft(of)->private == FILE_MEMLIST)
+ schedule_flush_migrate_mm();
return retval ?: nbytes;
}
@@ -3358,12 +3440,10 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf,
else
return -EINVAL;
- cpus_read_lock();
- mutex_lock(&cpuset_mutex);
+ cpuset_full_lock();
if (is_cpuset_online(cs))
retval = update_prstate(cs, val);
- mutex_unlock(&cpuset_mutex);
- cpus_read_unlock();
+ cpuset_full_unlock();
return retval ?: nbytes;
}
@@ -3462,15 +3542,10 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
if (!parent_css)
return &top_cpuset.css;
- cs = kzalloc(sizeof(*cs), GFP_KERNEL);
+ cs = dup_or_alloc_cpuset(NULL);
if (!cs)
return ERR_PTR(-ENOMEM);
- if (alloc_cpumasks(cs, NULL)) {
- kfree(cs);
- return ERR_PTR(-ENOMEM);
- }
-
__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
@@ -3493,10 +3568,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
if (!parent)
return 0;
- cpus_read_lock();
- mutex_lock(&cpuset_mutex);
-
- set_bit(CS_ONLINE, &cs->flags);
+ cpuset_full_lock();
if (is_spread_page(parent))
set_bit(CS_SPREAD_PAGE, &cs->flags);
if (is_spread_slab(parent))
@@ -3548,8 +3620,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
spin_unlock_irq(&callback_lock);
out_unlock:
- mutex_unlock(&cpuset_mutex);
- cpus_read_unlock();
+ cpuset_full_unlock();
return 0;
}
@@ -3564,17 +3635,12 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
- cpus_read_lock();
- mutex_lock(&cpuset_mutex);
-
+ cpuset_full_lock();
if (!cpuset_v2() && is_sched_load_balance(cs))
cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
cpuset_dec();
- clear_bit(CS_ONLINE, &cs->flags);
-
- mutex_unlock(&cpuset_mutex);
- cpus_read_unlock();
+ cpuset_full_unlock();
}
/*
@@ -3586,16 +3652,11 @@ static void cpuset_css_killed(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
- cpus_read_lock();
- mutex_lock(&cpuset_mutex);
-
+ cpuset_full_lock();
/* Reset valid partition back to member */
if (is_partition_valid(cs))
update_prstate(cs, PRS_MEMBER);
-
- mutex_unlock(&cpuset_mutex);
- cpus_read_unlock();
-
+ cpuset_full_unlock();
}
static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -3724,7 +3785,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.can_attach = cpuset_can_attach,
.cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach,
- .post_attach = cpuset_post_attach,
.bind = cpuset_bind,
.can_fork = cpuset_can_fork,
.cancel_fork = cpuset_cancel_fork,
@@ -3928,7 +3988,7 @@ static void cpuset_handle_hotplug(void)
bool on_dfl = is_in_v2_mode();
struct tmpmasks tmp, *ptmp = NULL;
- if (on_dfl && !alloc_cpumasks(NULL, &tmp))
+ if (on_dfl && !alloc_tmpmasks(&tmp))
ptmp = &tmp;
lockdep_assert_cpus_held();
@@ -4008,7 +4068,7 @@ static void cpuset_handle_hotplug(void)
if (force_sd_rebuild)
rebuild_sched_domains_cpuslocked();
- free_cpumasks(NULL, ptmp);
+ free_tmpmasks(ptmp);
}
void cpuset_update_active_cpus(void)
@@ -4073,7 +4133,6 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
struct cpuset *cs;
spin_lock_irqsave(&callback_lock, flags);
- rcu_read_lock();
cs = task_cs(tsk);
if (cs != &top_cpuset)
@@ -4095,7 +4154,6 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
cpumask_copy(pmask, possible_mask);
}
- rcu_read_unlock();
spin_unlock_irqrestore(&callback_lock, flags);
}
@@ -4168,9 +4226,7 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
unsigned long flags;
spin_lock_irqsave(&callback_lock, flags);
- rcu_read_lock();
guarantee_online_mems(task_cs(tsk), &mask);
- rcu_read_unlock();
spin_unlock_irqrestore(&callback_lock, flags);
return mask;
@@ -4265,10 +4321,8 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
/* Not hardwall and node outside mems_allowed: scan up cpusets */
spin_lock_irqsave(&callback_lock, flags);
- rcu_read_lock();
cs = nearest_hardwall_ancestor(task_cs(current));
allowed = node_isset(node, cs->mems_allowed);
- rcu_read_unlock();
spin_unlock_irqrestore(&callback_lock, flags);
return allowed;
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 80aa3f027ac3..81ea38dd6f9d 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -49,7 +49,6 @@ static int current_css_set_read(struct seq_file *seq, void *v)
return -ENODEV;
spin_lock_irq(&css_set_lock);
- rcu_read_lock();
cset = task_css_set(current);
refcnt = refcount_read(&cset->refcount);
seq_printf(seq, "css_set %pK %d", cset, refcnt);
@@ -67,7 +66,6 @@ static int current_css_set_read(struct seq_file *seq, void *v)
seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name,
css, css->id);
}
- rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
cgroup_kn_unlock(of->kn);
return 0;
@@ -95,7 +93,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
return -ENOMEM;
spin_lock_irq(&css_set_lock);
- rcu_read_lock();
cset = task_css_set(current);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
@@ -104,7 +101,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
seq_printf(seq, "Root %d group %s\n",
c->root->hierarchy_id, name_buf);
}
- rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
kfree(name_buf);
return 0;
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index bf1690a167dd..6c18854bff34 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -171,7 +171,7 @@ static void cgroup_freeze_task(struct task_struct *task, bool freeze)
/*
* Freeze or unfreeze all tasks in the given cgroup.
*/
-static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze)
+static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze, u64 ts_nsec)
{
struct css_task_iter it;
struct task_struct *task;
@@ -179,10 +179,16 @@ static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze)
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- if (freeze)
+ write_seqcount_begin(&cgrp->freezer.freeze_seq);
+ if (freeze) {
set_bit(CGRP_FREEZE, &cgrp->flags);
- else
+ cgrp->freezer.freeze_start_nsec = ts_nsec;
+ } else {
clear_bit(CGRP_FREEZE, &cgrp->flags);
+ cgrp->freezer.frozen_nsec += (ts_nsec -
+ cgrp->freezer.freeze_start_nsec);
+ }
+ write_seqcount_end(&cgrp->freezer.freeze_seq);
spin_unlock_irq(&css_set_lock);
if (freeze)
@@ -260,6 +266,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze)
struct cgroup *parent;
struct cgroup *dsct;
bool applied = false;
+ u64 ts_nsec;
bool old_e;
lockdep_assert_held(&cgroup_mutex);
@@ -271,6 +278,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze)
return;
cgrp->freezer.freeze = freeze;
+ ts_nsec = ktime_get_ns();
/*
* Propagate changes downwards the cgroup tree.
@@ -298,7 +306,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze)
/*
* Do change actual state: freeze or unfreeze.
*/
- cgroup_do_freeze(dsct, freeze);
+ cgroup_do_freeze(dsct, freeze, ts_nsec);
applied = true;
}
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index 144a464e45c6..fdbe57578e68 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -5,7 +5,7 @@
#include <linux/slab.h>
#include <linux/nsproxy.h>
#include <linux/proc_ns.h>
-
+#include <linux/nstree.h>
/* cgroup namespaces */
@@ -21,33 +21,32 @@ static void dec_cgroup_namespaces(struct ucounts *ucounts)
static struct cgroup_namespace *alloc_cgroup_ns(void)
{
- struct cgroup_namespace *new_ns;
+ struct cgroup_namespace *new_ns __free(kfree) = NULL;
int ret;
new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT);
if (!new_ns)
return ERR_PTR(-ENOMEM);
- ret = ns_alloc_inum(&new_ns->ns);
- if (ret) {
- kfree(new_ns);
+ ret = ns_common_init(new_ns);
+ if (ret)
return ERR_PTR(ret);
- }
- refcount_set(&new_ns->ns.count, 1);
- new_ns->ns.ops = &cgroupns_operations;
- return new_ns;
+ ns_tree_add(new_ns);
+ return no_free_ptr(new_ns);
}
void free_cgroup_ns(struct cgroup_namespace *ns)
{
+ ns_tree_remove(ns);
put_css_set(ns->root_cset);
dec_cgroup_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
- kfree(ns);
+ ns_common_free(ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
}
EXPORT_SYMBOL(free_cgroup_ns);
-struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
+struct cgroup_namespace *copy_cgroup_ns(u64 flags,
struct user_namespace *user_ns,
struct cgroup_namespace *old_ns)
{
@@ -90,11 +89,6 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
return new_ns;
}
-static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
-{
- return container_of(ns, struct cgroup_namespace, ns);
-}
-
static int cgroupns_install(struct nsset *nsset, struct ns_common *ns)
{
struct nsproxy *nsproxy = nsset->nsproxy;
@@ -143,7 +137,6 @@ static struct user_namespace *cgroupns_owner(struct ns_common *ns)
const struct proc_ns_operations cgroupns_operations = {
.name = "cgroup",
- .type = CLONE_NEWCGROUP,
.get = cgroupns_get,
.put = cgroupns_put,
.install = cgroupns_install,