diff options
Diffstat (limited to 'kernel/cgroup')
-rw-r--r-- | kernel/cgroup/cgroup-internal.h | 11 | ||||
-rw-r--r-- | kernel/cgroup/cgroup-v1.c | 19 | ||||
-rw-r--r-- | kernel/cgroup/cgroup.c | 268 | ||||
-rw-r--r-- | kernel/cgroup/cpuset-internal.h | 5 | ||||
-rw-r--r-- | kernel/cgroup/cpuset-v1.c | 12 | ||||
-rw-r--r-- | kernel/cgroup/cpuset.c | 752 | ||||
-rw-r--r-- | kernel/cgroup/debug.c | 4 | ||||
-rw-r--r-- | kernel/cgroup/freezer.c | 16 | ||||
-rw-r--r-- | kernel/cgroup/namespace.c | 29 |
9 files changed, 661 insertions, 455 deletions
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index b14e61c64a34..22051b4f1ccb 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -249,12 +249,15 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); -void cgroup_attach_lock(bool lock_threadgroup); -void cgroup_attach_unlock(bool lock_threadgroup); +void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk); +void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk); struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, - bool *locked) + enum cgroup_attach_lock_mode *lock_mode) __acquires(&cgroup_threadgroup_rwsem); -void cgroup_procs_write_finish(struct task_struct *task, bool locked) +void cgroup_procs_write_finish(struct task_struct *task, + enum cgroup_attach_lock_mode lock_mode) __releases(&cgroup_threadgroup_rwsem); void cgroup_lock_and_drain_offline(struct cgroup *cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 2a4a387f867a..a9e029b570c8 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -10,6 +10,7 @@ #include <linux/sched/task.h> #include <linux/magic.h> #include <linux/slab.h> +#include <linux/string.h> #include <linux/vmalloc.h> #include <linux/delayacct.h> #include <linux/pid_namespace.h> @@ -68,7 +69,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; cgroup_lock(); - cgroup_attach_lock(true); + cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL); for_each_root(root) { struct cgroup *from_cgrp; @@ -80,7 +81,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (retval) break; } - cgroup_attach_unlock(true); + cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL); cgroup_unlock(); return retval; @@ -117,7 +118,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) cgroup_lock(); - cgroup_attach_lock(true); + cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL); /* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); @@ -153,7 +154,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) } while (task && !ret); out_err: cgroup_migrate_finish(&mgctx); - cgroup_attach_unlock(true); + cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL); cgroup_unlock(); return ret; } @@ -502,13 +503,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, struct task_struct *task; const struct cred *cred, *tcred; ssize_t ret; - bool locked; + enum cgroup_attach_lock_mode lock_mode; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup, &locked); + task = cgroup_procs_write_start(buf, threadgroup, &lock_mode); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -531,7 +532,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, ret = cgroup_attach_task(cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task, locked); + cgroup_procs_write_finish(task, lock_mode); out_unlock: cgroup_kn_unlock(of->kn); @@ -1133,7 +1134,7 @@ int cgroup1_reconfigure(struct fs_context *fc) if (ctx->release_agent) { spin_lock(&release_agent_path_lock); - strcpy(root->release_agent_path, ctx->release_agent); + strscpy(root->release_agent_path, ctx->release_agent); spin_unlock(&release_agent_path_lock); } @@ -1325,7 +1326,7 @@ static int __init cgroup1_wq_init(void) * Cap @max_active to 1 too. */ cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", - 0, 1); + WQ_PERCPU, 1); BUG_ON(!cgroup_pidlist_destroy_wq); return 0; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 312c6a8b55bb..6ae5f48cf64e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -59,6 +59,7 @@ #include <linux/sched/cputime.h> #include <linux/sched/deadline.h> #include <linux/psi.h> +#include <linux/nstree.h> #include <net/sock.h> #define CREATE_TRACE_POINTS @@ -124,10 +125,33 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); /* * cgroup destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup - * destruction work items don't end up filling up max_active of system_wq + * destruction work items don't end up filling up max_active of system_percpu_wq * which may lead to deadlock. + * + * A cgroup destruction should enqueue work sequentially to: + * cgroup_offline_wq: use for css offline work + * cgroup_release_wq: use for css release work + * cgroup_free_wq: use for free work + * + * Rationale for using separate workqueues: + * The cgroup root free work may depend on completion of other css offline + * operations. If all tasks were enqueued to a single workqueue, this could + * create a deadlock scenario where: + * - Free work waits for other css offline work to complete. + * - But other css offline work is queued after free work in the same queue. + * + * Example deadlock scenario with single workqueue (cgroup_destroy_wq): + * 1. umount net_prio + * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx) + * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx) + * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline. + * 5. net_prio root destruction blocks waiting for perf_event CSS A offline, + * which can never complete as it's behind in the same queue and + * workqueue's max_active is 1. */ -static struct workqueue_struct *cgroup_destroy_wq; +static struct workqueue_struct *cgroup_offline_wq; +static struct workqueue_struct *cgroup_release_wq; +static struct workqueue_struct *cgroup_free_wq; /* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, @@ -216,13 +240,22 @@ static u16 have_canfork_callback __read_mostly; static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS); +/* + * Write protected by cgroup_mutex and write-lock of cgroup_threadgroup_rwsem, + * read protected by either. + * + * Can only be turned on, but not turned off. + */ +bool cgroup_enable_per_threadgroup_rwsem __read_mostly; + /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .ns.count = REFCOUNT_INIT(2), + .ns.__ns_ref = REFCOUNT_INIT(2), .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, - .ns.inum = PROC_CGROUP_INIT_INO, + .ns.inum = ns_init_inum(&init_cgroup_ns), .root_cset = &init_css_set, + .ns.ns_type = ns_common_type(&init_cgroup_ns), }; static struct file_system_type cgroup2_fs_type; @@ -1302,14 +1335,30 @@ void cgroup_favor_dynmods(struct cgroup_root *root, bool favor) { bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS; - /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */ + /* + * see the comment above CGRP_ROOT_FAVOR_DYNMODS definition. + * favordynmods can flip while task is between + * cgroup_threadgroup_change_begin() and end(), so down_write global + * cgroup_threadgroup_rwsem to synchronize them. + * + * Once cgroup_enable_per_threadgroup_rwsem is enabled, holding + * cgroup_threadgroup_rwsem doesn't exlude tasks between + * cgroup_thread_group_change_begin() and end() and thus it's unsafe to + * turn off. As the scenario is unlikely, simply disallow disabling once + * enabled and print out a warning. + */ + percpu_down_write(&cgroup_threadgroup_rwsem); if (favor && !favoring) { + cgroup_enable_per_threadgroup_rwsem = true; rcu_sync_enter(&cgroup_threadgroup_rwsem.rss); root->flags |= CGRP_ROOT_FAVOR_DYNMODS; } else if (!favor && favoring) { + if (cgroup_enable_per_threadgroup_rwsem) + pr_warn_once("cgroup favordynmods: per threadgroup rwsem mechanism can't be disabled\n"); rcu_sync_exit(&cgroup_threadgroup_rwsem.rss); root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; } + percpu_up_write(&cgroup_threadgroup_rwsem); } static int cgroup_init_root_id(struct cgroup_root *root) @@ -2459,7 +2508,8 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); /** * cgroup_attach_lock - Lock for ->attach() - * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem + * @lock_mode: whether acquire and acquire which rwsem + * @tsk: thread group to lock * * cgroup migration sometimes needs to stabilize threadgroups against forks and * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() @@ -2479,22 +2529,55 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); * Resolve the situation by always acquiring cpus_read_lock() before optionally * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that * CPU hotplug is disabled on entry. + * + * When favordynmods is enabled, take per threadgroup rwsem to reduce overhead + * on dynamic cgroup modifications. see the comment above + * CGRP_ROOT_FAVOR_DYNMODS definition. + * + * tsk is not NULL only when writing to cgroup.procs. */ -void cgroup_attach_lock(bool lock_threadgroup) +void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk) { cpus_read_lock(); - if (lock_threadgroup) + + switch (lock_mode) { + case CGRP_ATTACH_LOCK_NONE: + break; + case CGRP_ATTACH_LOCK_GLOBAL: percpu_down_write(&cgroup_threadgroup_rwsem); + break; + case CGRP_ATTACH_LOCK_PER_THREADGROUP: + down_write(&tsk->signal->cgroup_threadgroup_rwsem); + break; + default: + pr_warn("cgroup: Unexpected attach lock mode."); + break; + } } /** * cgroup_attach_unlock - Undo cgroup_attach_lock() - * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem + * @lock_mode: whether release and release which rwsem + * @tsk: thread group to lock */ -void cgroup_attach_unlock(bool lock_threadgroup) +void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk) { - if (lock_threadgroup) + switch (lock_mode) { + case CGRP_ATTACH_LOCK_NONE: + break; + case CGRP_ATTACH_LOCK_GLOBAL: percpu_up_write(&cgroup_threadgroup_rwsem); + break; + case CGRP_ATTACH_LOCK_PER_THREADGROUP: + up_write(&tsk->signal->cgroup_threadgroup_rwsem); + break; + default: + pr_warn("cgroup: Unexpected attach lock mode."); + break; + } + cpus_read_unlock(); } @@ -2944,14 +3027,12 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, /* look up all src csets */ spin_lock_irq(&css_set_lock); - rcu_read_lock(); task = leader; do { cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx); if (!threadgroup) break; } while_each_thread(leader, task); - rcu_read_unlock(); spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ @@ -2968,7 +3049,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, } struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, - bool *threadgroup_locked) + enum cgroup_attach_lock_mode *lock_mode) { struct task_struct *tsk; pid_t pid; @@ -2976,24 +3057,13 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return ERR_PTR(-EINVAL); - /* - * If we migrate a single thread, we don't care about threadgroup - * stability. If the thread is `current`, it won't exit(2) under our - * hands or change PID through exec(2). We exclude - * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write - * callers by cgroup_mutex. - * Therefore, we can skip the global lock. - */ - lockdep_assert_held(&cgroup_mutex); - *threadgroup_locked = pid || threadgroup; - cgroup_attach_lock(*threadgroup_locked); - +retry_find_task: rcu_read_lock(); if (pid) { tsk = find_task_by_vpid(pid); if (!tsk) { tsk = ERR_PTR(-ESRCH); - goto out_unlock_threadgroup; + goto out_unlock_rcu; } } else { tsk = current; @@ -3010,33 +3080,58 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, */ if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { tsk = ERR_PTR(-EINVAL); - goto out_unlock_threadgroup; + goto out_unlock_rcu; } - get_task_struct(tsk); - goto out_unlock_rcu; + rcu_read_unlock(); + + /* + * If we migrate a single thread, we don't care about threadgroup + * stability. If the thread is `current`, it won't exit(2) under our + * hands or change PID through exec(2). We exclude + * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write callers + * by cgroup_mutex. Therefore, we can skip the global lock. + */ + lockdep_assert_held(&cgroup_mutex); + + if (pid || threadgroup) { + if (cgroup_enable_per_threadgroup_rwsem) + *lock_mode = CGRP_ATTACH_LOCK_PER_THREADGROUP; + else + *lock_mode = CGRP_ATTACH_LOCK_GLOBAL; + } else { + *lock_mode = CGRP_ATTACH_LOCK_NONE; + } + + cgroup_attach_lock(*lock_mode, tsk); + + if (threadgroup) { + if (!thread_group_leader(tsk)) { + /* + * A race with de_thread from another thread's exec() + * may strip us of our leadership. If this happens, + * throw this task away and try again. + */ + cgroup_attach_unlock(*lock_mode, tsk); + put_task_struct(tsk); + goto retry_find_task; + } + } + + return tsk; -out_unlock_threadgroup: - cgroup_attach_unlock(*threadgroup_locked); - *threadgroup_locked = false; out_unlock_rcu: rcu_read_unlock(); return tsk; } -void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked) +void cgroup_procs_write_finish(struct task_struct *task, + enum cgroup_attach_lock_mode lock_mode) { - struct cgroup_subsys *ss; - int ssid; + cgroup_attach_unlock(lock_mode, task); /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - - cgroup_attach_unlock(threadgroup_locked); - - for_each_subsys(ss, ssid) - if (ss->post_attach) - ss->post_attach(); } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) @@ -3088,6 +3183,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; + enum cgroup_attach_lock_mode lock_mode; bool has_tasks; int ret; @@ -3119,7 +3215,13 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) * write-locking can be skipped safely. */ has_tasks = !list_empty(&mgctx.preloaded_src_csets); - cgroup_attach_lock(has_tasks); + + if (has_tasks) + lock_mode = CGRP_ATTACH_LOCK_GLOBAL; + else + lock_mode = CGRP_ATTACH_LOCK_NONE; + + cgroup_attach_lock(lock_mode, NULL); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); @@ -3140,7 +3242,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); - cgroup_attach_unlock(has_tasks); + cgroup_attach_unlock(lock_mode, NULL); return ret; } @@ -3763,6 +3865,27 @@ static int cgroup_stat_show(struct seq_file *seq, void *v) return 0; } +static int cgroup_core_local_stat_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + unsigned int sequence; + u64 freeze_time; + + do { + sequence = read_seqcount_begin(&cgrp->freezer.freeze_seq); + freeze_time = cgrp->freezer.frozen_nsec; + /* Add in current freezer interval if the cgroup is freezing. */ + if (test_bit(CGRP_FREEZE, &cgrp->flags)) + freeze_time += (ktime_get_ns() - + cgrp->freezer.freeze_start_nsec); + } while (read_seqcount_retry(&cgrp->freezer.freeze_seq, sequence)); + + do_div(freeze_time, NSEC_PER_USEC); + seq_printf(seq, "frozen_usec %llu\n", freeze_time); + + return 0; +} + #ifdef CONFIG_CGROUP_SCHED /** * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem @@ -4159,6 +4282,7 @@ static void cgroup_file_release(struct kernfs_open_file *of) cft->release(of); put_cgroup_ns(ctx->ns); kfree(ctx); + of->priv = NULL; } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, @@ -5241,13 +5365,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, struct task_struct *task; const struct cred *saved_cred; ssize_t ret; - bool threadgroup_locked; + enum cgroup_attach_lock_mode lock_mode; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked); + task = cgroup_procs_write_start(buf, threadgroup, &lock_mode); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -5273,7 +5397,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, ret = cgroup_attach_task(dst_cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task, threadgroup_locked); + cgroup_procs_write_finish(task, lock_mode); out_unlock: cgroup_kn_unlock(of->kn); @@ -5355,6 +5479,11 @@ static struct cftype cgroup_base_files[] = { .seq_show = cgroup_stat_show, }, { + .name = "cgroup.stat.local", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_core_local_stat_show, + }, + { .name = "cgroup.freeze", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_freeze_show, @@ -5558,7 +5687,7 @@ static void css_release_work_fn(struct work_struct *work) cgroup_unlock(); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); - queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); + queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); } static void css_release(struct percpu_ref *ref) @@ -5567,7 +5696,7 @@ static void css_release(struct percpu_ref *ref) container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_release_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + queue_work(cgroup_release_wq, &css->destroy_work); } static void init_and_link_css(struct cgroup_subsys_state *css, @@ -5701,7 +5830,7 @@ err_list_del: list_del_rcu(&css->sibling); err_free_css: INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); - queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); + queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); return ERR_PTR(err); } @@ -5763,6 +5892,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, * if the parent has to be frozen, the child has too. */ cgrp->freezer.e_freeze = parent->freezer.e_freeze; + seqcount_init(&cgrp->freezer.freeze_seq); if (cgrp->freezer.e_freeze) { /* * Set the CGRP_FREEZE flag, so when a process will be @@ -5771,6 +5901,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, * consider it frozen immediately. */ set_bit(CGRP_FREEZE, &cgrp->flags); + cgrp->freezer.freeze_start_nsec = ktime_get_ns(); set_bit(CGRP_FROZEN, &cgrp->flags); } @@ -5939,7 +6070,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) if (atomic_dec_and_test(&css->online_cnt)) { INIT_WORK(&css->destroy_work, css_killed_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + queue_work(cgroup_offline_wq, &css->destroy_work); } } @@ -6312,6 +6443,7 @@ int __init cgroup_init(void) WARN_ON(register_filesystem(&cpuset_fs_type)); #endif + ns_tree_add(&init_cgroup_ns); return 0; } @@ -6325,8 +6457,14 @@ static int __init cgroup_wq_init(void) * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. */ - cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); - BUG_ON(!cgroup_destroy_wq); + cgroup_offline_wq = alloc_workqueue("cgroup_offline", WQ_PERCPU, 1); + BUG_ON(!cgroup_offline_wq); + + cgroup_release_wq = alloc_workqueue("cgroup_release", WQ_PERCPU, 1); + BUG_ON(!cgroup_release_wq); + + cgroup_free_wq = alloc_workqueue("cgroup_free", WQ_PERCPU, 1); + BUG_ON(!cgroup_free_wq); return 0; } core_initcall(cgroup_wq_init); @@ -6343,15 +6481,15 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) } /* - * cgroup_get_from_id : get the cgroup associated with cgroup id + * __cgroup_get_from_id : get the cgroup associated with cgroup id * @id: cgroup id * On success return the cgrp or ERR_PTR on failure - * Only cgroups within current task's cgroup NS are valid. + * There are no cgroup NS restrictions. */ -struct cgroup *cgroup_get_from_id(u64 id) +struct cgroup *__cgroup_get_from_id(u64 id) { struct kernfs_node *kn; - struct cgroup *cgrp, *root_cgrp; + struct cgroup *cgrp; kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) @@ -6373,6 +6511,22 @@ struct cgroup *cgroup_get_from_id(u64 id) if (!cgrp) return ERR_PTR(-ENOENT); + return cgrp; +} + +/* + * cgroup_get_from_id : get the cgroup associated with cgroup id + * @id: cgroup id + * On success return the cgrp or ERR_PTR on failure + * Only cgroups within current task's cgroup NS are valid. + */ +struct cgroup *cgroup_get_from_id(u64 id) +{ + struct cgroup *cgrp, *root_cgrp; + + cgrp = __cgroup_get_from_id(id); + if (IS_ERR(cgrp)) + return cgrp; root_cgrp = current_cgns_cgroup_dfl(); if (!cgroup_is_descendant(cgrp, root_cgrp)) { diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 383963e28ac6..337608f408ce 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -38,7 +38,6 @@ enum prs_errcode { /* bits in struct cpuset flags field */ typedef enum { - CS_ONLINE, CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, CS_MEM_HARDWALL, @@ -202,7 +201,7 @@ static inline struct cpuset *parent_cs(struct cpuset *cs) /* convenient tests for these bits */ static inline bool is_cpuset_online(struct cpuset *cs) { - return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); + return css_is_online(&cs->css) && !css_is_dying(&cs->css); } static inline int is_cpu_exclusive(const struct cpuset *cs) @@ -277,6 +276,8 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) ssize_t cpuset_write_resmask(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int cpuset_common_seq_show(struct seq_file *sf, void *v); +void cpuset_full_lock(void); +void cpuset_full_unlock(void); /* * cpuset-v1.c diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index b69a7db67090..12e76774c75b 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -169,8 +169,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; - cpus_read_lock(); - cpuset_lock(); + cpuset_full_lock(); if (!is_cpuset_online(cs)) goto out_unlock; @@ -184,8 +183,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - cpuset_unlock(); - cpus_read_unlock(); + cpuset_full_unlock(); return retval; } @@ -454,8 +452,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; - cpus_read_lock(); - cpuset_lock(); + cpuset_full_lock(); if (!is_cpuset_online(cs)) { retval = -ENODEV; goto out_unlock; @@ -498,8 +495,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - cpuset_unlock(); - cpus_read_unlock(); + cpuset_full_unlock(); return retval; } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 27adb04df675..52468d2c178a 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -40,6 +40,7 @@ #include <linux/sched/isolation.h> #include <linux/wait.h> #include <linux/workqueue.h> +#include <linux/task_work.h> DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); @@ -131,11 +132,6 @@ static bool force_sd_rebuild; #define PRS_INVALID_ROOT -1 #define PRS_INVALID_ISOLATED -2 -static inline bool is_prs_invalid(int prs_state) -{ - return prs_state < 0; -} - /* * Temporary cpumasks for working with partitions that are passed among * functions to avoid memory allocation in inner functions. @@ -159,16 +155,21 @@ void dec_dl_tasks_cs(struct task_struct *p) cs->nr_deadline_tasks--; } -static inline int is_partition_valid(const struct cpuset *cs) +static inline bool is_partition_valid(const struct cpuset *cs) { return cs->partition_root_state > 0; } -static inline int is_partition_invalid(const struct cpuset *cs) +static inline bool is_partition_invalid(const struct cpuset *cs) { return cs->partition_root_state < 0; } +static inline bool cs_is_member(const struct cpuset *cs) +{ + return cs->partition_root_state == PRS_MEMBER; +} + /* * Callers should hold callback_lock to modify partition_root_state. */ @@ -207,7 +208,7 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs) * parallel, we may leave an offline CPU in cpu_allowed or some other masks. */ static struct cpuset top_cpuset = { - .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) | + .flags = BIT(CS_CPU_EXCLUSIVE) | BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, .relax_domain_level = -1, @@ -250,6 +251,12 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(cpuset_mutex); +/** + * cpuset_lock - Acquire the global cpuset mutex + * + * This locks the global cpuset mutex to prevent modifications to cpuset + * hierarchy and configurations. This helper is not enough to make modification. + */ void cpuset_lock(void) { mutex_lock(&cpuset_mutex); @@ -260,6 +267,24 @@ void cpuset_unlock(void) mutex_unlock(&cpuset_mutex); } +/** + * cpuset_full_lock - Acquire full protection for cpuset modification + * + * Takes both CPU hotplug read lock (cpus_read_lock()) and cpuset mutex + * to safely modify cpuset data. + */ +void cpuset_full_lock(void) +{ + cpus_read_lock(); + mutex_lock(&cpuset_mutex); +} + +void cpuset_full_unlock(void) +{ + mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); +} + static DEFINE_SPINLOCK(callback_lock); void cpuset_callback_lock_irq(void) @@ -411,94 +436,104 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) } /** - * alloc_cpumasks - allocate three cpumasks for cpuset - * @cs: the cpuset that have cpumasks to be allocated. - * @tmp: the tmpmasks structure pointer + * alloc_cpumasks - Allocate an array of cpumask variables + * @pmasks: Pointer to array of cpumask_var_t pointers + * @size: Number of cpumasks to allocate * Return: 0 if successful, -ENOMEM otherwise. * - * Only one of the two input arguments should be non-NULL. + * Allocates @size cpumasks and initializes them to empty. Returns 0 on + * success, -ENOMEM on allocation failure. On failure, any previously + * allocated cpumasks are freed. */ -static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) +static inline int alloc_cpumasks(cpumask_var_t *pmasks[], u32 size) { - cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4; + int i; - if (cs) { - pmask1 = &cs->cpus_allowed; - pmask2 = &cs->effective_cpus; - pmask3 = &cs->effective_xcpus; - pmask4 = &cs->exclusive_cpus; - } else { - pmask1 = &tmp->new_cpus; - pmask2 = &tmp->addmask; - pmask3 = &tmp->delmask; - pmask4 = NULL; + for (i = 0; i < size; i++) { + if (!zalloc_cpumask_var(pmasks[i], GFP_KERNEL)) { + while (--i >= 0) + free_cpumask_var(*pmasks[i]); + return -ENOMEM; + } } - - if (!zalloc_cpumask_var(pmask1, GFP_KERNEL)) - return -ENOMEM; - - if (!zalloc_cpumask_var(pmask2, GFP_KERNEL)) - goto free_one; - - if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) - goto free_two; - - if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL)) - goto free_three; - - return 0; +} + +/** + * alloc_tmpmasks - Allocate temporary cpumasks for cpuset operations. + * @tmp: Pointer to tmpmasks structure to populate + * Return: 0 on success, -ENOMEM on allocation failure + */ +static inline int alloc_tmpmasks(struct tmpmasks *tmp) +{ + /* + * Array of pointers to the three cpumask_var_t fields in tmpmasks. + * Note: Array size must match actual number of masks (3) + */ + cpumask_var_t *pmask[3] = { + &tmp->new_cpus, + &tmp->addmask, + &tmp->delmask + }; -free_three: - free_cpumask_var(*pmask3); -free_two: - free_cpumask_var(*pmask2); -free_one: - free_cpumask_var(*pmask1); - return -ENOMEM; + return alloc_cpumasks(pmask, ARRAY_SIZE(pmask)); } /** - * free_cpumasks - free cpumasks in a tmpmasks structure - * @cs: the cpuset that have cpumasks to be free. + * free_tmpmasks - free cpumasks in a tmpmasks structure * @tmp: the tmpmasks structure pointer */ -static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) +static inline void free_tmpmasks(struct tmpmasks *tmp) { - if (cs) { - free_cpumask_var(cs->cpus_allowed); - free_cpumask_var(cs->effective_cpus); - free_cpumask_var(cs->effective_xcpus); - free_cpumask_var(cs->exclusive_cpus); - } - if (tmp) { - free_cpumask_var(tmp->new_cpus); - free_cpumask_var(tmp->addmask); - free_cpumask_var(tmp->delmask); - } + if (!tmp) + return; + + free_cpumask_var(tmp->new_cpus); + free_cpumask_var(tmp->addmask); + free_cpumask_var(tmp->delmask); } /** - * alloc_trial_cpuset - allocate a trial cpuset - * @cs: the cpuset that the trial cpuset duplicates + * dup_or_alloc_cpuset - Duplicate or allocate a new cpuset + * @cs: Source cpuset to duplicate (NULL for a fresh allocation) + * + * Creates a new cpuset by either: + * 1. Duplicating an existing cpuset (if @cs is non-NULL), or + * 2. Allocating a fresh cpuset with zero-initialized masks (if @cs is NULL) + * + * Return: Pointer to newly allocated cpuset on success, NULL on failure */ -static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) +static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs) { struct cpuset *trial; - trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL); + /* Allocate base structure */ + trial = cs ? kmemdup(cs, sizeof(*cs), GFP_KERNEL) : + kzalloc(sizeof(*cs), GFP_KERNEL); if (!trial) return NULL; - if (alloc_cpumasks(trial, NULL)) { + /* Setup cpumask pointer array */ + cpumask_var_t *pmask[4] = { + &trial->cpus_allowed, + &trial->effective_cpus, + &trial->effective_xcpus, + &trial->exclusive_cpus + }; + + if (alloc_cpumasks(pmask, ARRAY_SIZE(pmask))) { kfree(trial); return NULL; } - cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); - cpumask_copy(trial->effective_cpus, cs->effective_cpus); - cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); - cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); + /* Copy masks if duplicating */ + if (cs) { + cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + cpumask_copy(trial->effective_cpus, cs->effective_cpus); + cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); + cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); + } + return trial; } @@ -508,7 +543,10 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) */ static inline void free_cpuset(struct cpuset *cs) { - free_cpumasks(cs, NULL); + free_cpumask_var(cs->cpus_allowed); + free_cpumask_var(cs->effective_cpus); + free_cpumask_var(cs->effective_xcpus); + free_cpumask_var(cs->exclusive_cpus); kfree(cs); } @@ -540,6 +578,47 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) return true; } +/** + * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts + * @cs1: first cpuset to check + * @cs2: second cpuset to check + * + * Returns: true if CPU exclusivity conflict exists, false otherwise + * + * Conflict detection rules: + * 1. If either cpuset is CPU exclusive, they must be mutually exclusive + * 2. exclusive_cpus masks cannot intersect between cpusets + * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs + */ +static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +{ + /* If either cpuset is exclusive, check if they are mutually exclusive */ + if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2)) + return !cpusets_are_exclusive(cs1, cs2); + + /* Exclusive_cpus cannot intersect */ + if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus)) + return true; + + /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */ + if (!cpumask_empty(cs1->cpus_allowed) && + cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus)) + return true; + + if (!cpumask_empty(cs2->cpus_allowed) && + cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus)) + return true; + + return false; +} + +static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +{ + if ((is_mem_exclusive(cs1) || is_mem_exclusive(cs2))) + return nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); + return false; +} + /* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. @@ -621,38 +700,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) */ ret = -EINVAL; cpuset_for_each_child(c, css, par) { - bool txset, cxset; /* Are exclusive_cpus set? */ - if (c == cur) continue; - - txset = !cpumask_empty(trial->exclusive_cpus); - cxset = !cpumask_empty(c->exclusive_cpus); - if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) || - (txset && cxset)) { - if (!cpusets_are_exclusive(trial, c)) - goto out; - } else if (txset || cxset) { - struct cpumask *xcpus, *acpus; - - /* - * When just one of the exclusive_cpus's is set, - * cpus_allowed of the other cpuset, if set, cannot be - * a subset of it or none of those CPUs will be - * available if these exclusive CPUs are activated. - */ - if (txset) { - xcpus = trial->exclusive_cpus; - acpus = c->cpus_allowed; - } else { - xcpus = c->exclusive_cpus; - acpus = trial->cpus_allowed; - } - if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus)) - goto out; - } - if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && - nodes_intersects(trial->mems_allowed, c->mems_allowed)) + if (cpus_excl_conflict(trial, c)) + goto out; + if (mems_excl_conflict(trial, c)) goto out; } @@ -1363,38 +1415,25 @@ bool cpuset_cpu_is_isolated(int cpu) } EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); -/* - * compute_effective_exclusive_cpumask - compute effective exclusive CPUs - * @cs: cpuset - * @xcpus: effective exclusive CPUs value to be set - * @real_cs: the real cpuset (can be NULL) - * Return: 0 if there is no sibling conflict, > 0 otherwise +/** + * rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets + * @parent: Parent cpuset containing all siblings + * @cs: Current cpuset (will be skipped) + * @excpus: exclusive effective CPU mask to modify * - * If exclusive_cpus isn't explicitly set or a real_cs is provided, we have to - * scan the sibling cpusets and exclude their exclusive_cpus or effective_xcpus - * as well. The provision of real_cs means that a cpumask is being changed and - * the given cs is a trial one. + * This function ensures the given @excpus mask doesn't include any CPUs that + * are exclusively allocated to sibling cpusets. It walks through all siblings + * of @cs under @parent and removes their exclusive CPUs from @excpus. */ -static int compute_effective_exclusive_cpumask(struct cpuset *cs, - struct cpumask *xcpus, - struct cpuset *real_cs) +static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs, + struct cpumask *excpus) { struct cgroup_subsys_state *css; - struct cpuset *parent = parent_cs(cs); struct cpuset *sibling; int retval = 0; - if (!xcpus) - xcpus = cs->effective_xcpus; - - cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus); - - if (!real_cs) { - if (!cpumask_empty(cs->exclusive_cpus)) - return 0; - } else { - cs = real_cs; - } + if (cpumask_empty(excpus)) + return retval; /* * Exclude exclusive CPUs from siblings @@ -1404,20 +1443,66 @@ static int compute_effective_exclusive_cpumask(struct cpuset *cs, if (sibling == cs) continue; - if (cpumask_intersects(xcpus, sibling->exclusive_cpus)) { - cpumask_andnot(xcpus, xcpus, sibling->exclusive_cpus); + if (cpumask_intersects(excpus, sibling->exclusive_cpus)) { + cpumask_andnot(excpus, excpus, sibling->exclusive_cpus); retval++; continue; } - if (cpumask_intersects(xcpus, sibling->effective_xcpus)) { - cpumask_andnot(xcpus, xcpus, sibling->effective_xcpus); + if (cpumask_intersects(excpus, sibling->effective_xcpus)) { + cpumask_andnot(excpus, excpus, sibling->effective_xcpus); retval++; } } rcu_read_unlock(); + return retval; } +/* + * compute_excpus - compute effective exclusive CPUs + * @cs: cpuset + * @xcpus: effective exclusive CPUs value to be set + * Return: 0 if there is no sibling conflict, > 0 otherwise + * + * If exclusive_cpus isn't explicitly set , we have to scan the sibling cpusets + * and exclude their exclusive_cpus or effective_xcpus as well. + */ +static int compute_excpus(struct cpuset *cs, struct cpumask *excpus) +{ + struct cpuset *parent = parent_cs(cs); + + cpumask_and(excpus, user_xcpus(cs), parent->effective_xcpus); + + if (!cpumask_empty(cs->exclusive_cpus)) + return 0; + + return rm_siblings_excl_cpus(parent, cs, excpus); +} + +/* + * compute_trialcs_excpus - Compute effective exclusive CPUs for a trial cpuset + * @trialcs: The trial cpuset containing the proposed new configuration + * @cs: The original cpuset that the trial configuration is based on + * Return: 0 if successful with no sibling conflict, >0 if a conflict is found + * + * Computes the effective_xcpus for a trial configuration. @cs is provided to represent + * the real cs. + */ +static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs) +{ + struct cpuset *parent = parent_cs(trialcs); + struct cpumask *excpus = trialcs->effective_xcpus; + + /* trialcs is member, cpuset.cpus has no impact to excpus */ + if (cs_is_member(cs)) + cpumask_and(excpus, trialcs->exclusive_cpus, + parent->effective_xcpus); + else + cpumask_and(excpus, user_xcpus(trialcs), parent->effective_xcpus); + + return rm_siblings_excl_cpus(parent, cs, excpus); +} + static inline bool is_remote_partition(struct cpuset *cs) { return !list_empty(&cs->remote_sibling); @@ -1459,7 +1544,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, * Note that creating a remote partition with any local partition root * above it or remote partition root underneath it is not allowed. */ - compute_effective_exclusive_cpumask(cs, tmp->new_cpus, NULL); + compute_excpus(cs, tmp->new_cpus); WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus)); if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) || cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) @@ -1508,7 +1593,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) cs->partition_root_state = PRS_MEMBER; /* effective_xcpus may need to be changed */ - compute_effective_exclusive_cpumask(cs, NULL, NULL); + compute_excpus(cs, cs->effective_xcpus); reset_partition_data(cs); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); @@ -1677,7 +1762,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, old_prs = new_prs = cs->partition_root_state; if (cmd == partcmd_invalidate) { - if (is_prs_invalid(old_prs)) + if (is_partition_invalid(cs)) return 0; /* @@ -1709,13 +1794,14 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) { /* - * Need to call compute_effective_exclusive_cpumask() in case + * Need to call compute_excpus() in case * exclusive_cpus not set. Sibling conflict should only happen * if exclusive_cpus isn't set. */ xcpus = tmp->delmask; - if (compute_effective_exclusive_cpumask(cs, xcpus, NULL)) + if (compute_excpus(cs, xcpus)) WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus)); + new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; /* * Enabling partition root is not allowed if its @@ -1727,11 +1813,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if (prstate_housekeeping_conflict(new_prs, xcpus)) return PERR_HKEEPING; - /* - * A parent can be left with no CPU as long as there is no - * task directly associated with the parent partition. - */ - if (nocpu) + if (tasks_nocpu_error(parent, cs, xcpus)) return PERR_NOCPUS; /* @@ -1748,7 +1830,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, deleting = true; subparts_delta++; - new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; } else if (cmd == partcmd_disable) { /* * May need to add cpus back to parent's effective_cpus @@ -1788,7 +1869,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, * For invalid partition: * delmask = newmask & parent->effective_xcpus */ - if (is_prs_invalid(old_prs)) { + if (is_partition_invalid(cs)) { adding = false; deleting = cpumask_and(tmp->delmask, newmask, parent->effective_xcpus); @@ -1837,7 +1918,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, * A partition error happens when parent has tasks and all * its effective CPUs will have to be distributed out. */ - WARN_ON_ONCE(!is_partition_valid(parent)); if (nocpu) { part_error = PERR_NOCPUS; if (is_partition_valid(cs)) @@ -1996,7 +2076,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, * 2) All the effective_cpus will be used up and cp * has tasks */ - compute_effective_exclusive_cpumask(cs, new_ecpus, NULL); + compute_excpus(cs, new_ecpus); cpumask_and(new_ecpus, new_ecpus, cpu_active_mask); rcu_read_lock(); @@ -2075,7 +2155,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, * its value is being processed. */ if (remote && (cp != cs)) { - compute_effective_exclusive_cpumask(cp, tmp->new_cpus, NULL); + compute_excpus(cp, tmp->new_cpus); if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) { pos_css = css_rightmost_descendant(pos_css); continue; @@ -2177,7 +2257,7 @@ get_css: cpumask_copy(cp->effective_cpus, tmp->new_cpus); cp->partition_root_state = new_prs; if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) - compute_effective_exclusive_cpumask(cp, NULL, NULL); + compute_excpus(cp, cp->effective_xcpus); /* * Make sure effective_xcpus is properly set for a valid @@ -2284,82 +2364,54 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, rcu_read_unlock(); } -/** - * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it - * @cs: the cpuset to consider - * @trialcs: trial cpuset - * @buf: buffer of cpu numbers written to this cpuset - */ -static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, - const char *buf) +static int parse_cpuset_cpulist(const char *buf, struct cpumask *out_mask) { int retval; - struct tmpmasks tmp; - struct cpuset *parent = parent_cs(cs); - bool invalidate = false; - bool force = false; - int old_prs = cs->partition_root_state; - /* top_cpuset.cpus_allowed tracks cpu_active_mask; it's read-only */ - if (cs == &top_cpuset) - return -EACCES; + retval = cpulist_parse(buf, out_mask); + if (retval < 0) + return retval; + if (!cpumask_subset(out_mask, top_cpuset.cpus_allowed)) + return -EINVAL; - /* - * An empty cpus_allowed is ok only if the cpuset has no tasks. - * Since cpulist_parse() fails on an empty mask, we special case - * that parsing. The validate_change() call ensures that cpusets - * with tasks have cpus. - */ - if (!*buf) { - cpumask_clear(trialcs->cpus_allowed); - if (cpumask_empty(trialcs->exclusive_cpus)) - cpumask_clear(trialcs->effective_xcpus); - } else { - retval = cpulist_parse(buf, trialcs->cpus_allowed); - if (retval < 0) - return retval; + return 0; +} - if (!cpumask_subset(trialcs->cpus_allowed, - top_cpuset.cpus_allowed)) - return -EINVAL; +/** + * validate_partition - Validate a cpuset partition configuration + * @cs: The cpuset to validate + * @trialcs: The trial cpuset containing proposed configuration changes + * + * If any validation check fails, the appropriate error code is set in the + * cpuset's prs_err field. + * + * Return: PRS error code (0 if valid, non-zero error code if invalid) + */ +static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *trialcs) +{ + struct cpuset *parent = parent_cs(cs); - /* - * When exclusive_cpus isn't explicitly set, it is constrained - * by cpus_allowed and parent's effective_xcpus. Otherwise, - * trialcs->effective_xcpus is used as a temporary cpumask - * for checking validity of the partition root. - */ - trialcs->partition_root_state = PRS_MEMBER; - if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs)) - compute_effective_exclusive_cpumask(trialcs, NULL, cs); - } + if (cs_is_member(trialcs)) + return PERR_NONE; - /* Nothing to do if the cpus didn't change */ - if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) - return 0; + if (cpumask_empty(trialcs->effective_xcpus)) + return PERR_INVCPUS; - if (alloc_cpumasks(NULL, &tmp)) - return -ENOMEM; + if (prstate_housekeeping_conflict(trialcs->partition_root_state, + trialcs->effective_xcpus)) + return PERR_HKEEPING; - if (old_prs) { - if (is_partition_valid(cs) && - cpumask_empty(trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_INVCPUS; - } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_HKEEPING; - } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_NOCPUS; - } - } + if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) + return PERR_NOCPUS; - /* - * Check all the descendants in update_cpumasks_hier() if - * effective_xcpus is to be changed. - */ - force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); + return PERR_NONE; +} + +static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs, + struct tmpmasks *tmp) +{ + int retval; + struct cpuset *parent = parent_cs(cs); retval = validate_change(cs, trialcs); @@ -2374,7 +2426,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * partition. However, any conflicting sibling partitions * have to be marked as invalid too. */ - invalidate = true; + trialcs->prs_err = PERR_NOTEXCL; rcu_read_lock(); cpuset_for_each_child(cp, css, parent) { struct cpumask *xcpus = user_xcpus(trialcs); @@ -2382,36 +2434,92 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (is_partition_valid(cp) && cpumask_intersects(xcpus, cp->effective_xcpus)) { rcu_read_unlock(); - update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp); + update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp); rcu_read_lock(); } } rcu_read_unlock(); retval = 0; } + return retval; +} - if (retval < 0) - goto out_free; +/** + * partition_cpus_change - Handle partition state changes due to CPU mask updates + * @cs: The target cpuset being modified + * @trialcs: The trial cpuset containing proposed configuration changes + * @tmp: Temporary masks for intermediate calculations + * + * This function handles partition state transitions triggered by CPU mask changes. + * CPU modifications may cause a partition to be disabled or require state updates. + */ +static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs, + struct tmpmasks *tmp) +{ + enum prs_errcode prs_err; - if (is_partition_valid(cs) || - (is_partition_invalid(cs) && !invalidate)) { - struct cpumask *xcpus = trialcs->effective_xcpus; + if (cs_is_member(cs)) + return; - if (cpumask_empty(xcpus) && is_partition_invalid(cs)) - xcpus = trialcs->cpus_allowed; + prs_err = validate_partition(cs, trialcs); + if (prs_err) + trialcs->prs_err = cs->prs_err = prs_err; - /* - * Call remote_cpus_update() to handle valid remote partition - */ - if (is_remote_partition(cs)) - remote_cpus_update(cs, NULL, xcpus, &tmp); - else if (invalidate) + if (is_remote_partition(cs)) { + if (trialcs->prs_err) + remote_partition_disable(cs, tmp); + else + remote_cpus_update(cs, trialcs->exclusive_cpus, + trialcs->effective_xcpus, tmp); + } else { + if (trialcs->prs_err) update_parent_effective_cpumask(cs, partcmd_invalidate, - NULL, &tmp); + NULL, tmp); else update_parent_effective_cpumask(cs, partcmd_update, - xcpus, &tmp); + trialcs->effective_xcpus, tmp); } +} + +/** + * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it + * @cs: the cpuset to consider + * @trialcs: trial cpuset + * @buf: buffer of cpu numbers written to this cpuset + */ +static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, + const char *buf) +{ + int retval; + struct tmpmasks tmp; + bool force = false; + int old_prs = cs->partition_root_state; + + retval = parse_cpuset_cpulist(buf, trialcs->cpus_allowed); + if (retval < 0) + return retval; + + /* Nothing to do if the cpus didn't change */ + if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) + return 0; + + if (alloc_tmpmasks(&tmp)) + return -ENOMEM; + + compute_trialcs_excpus(trialcs, cs); + trialcs->prs_err = PERR_NONE; + + retval = cpus_allowed_validate_change(cs, trialcs, &tmp); + if (retval < 0) + goto out_free; + + /* + * Check all the descendants in update_cpumasks_hier() if + * effective_xcpus is to be changed. + */ + force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); + + partition_cpus_change(cs, trialcs, &tmp); spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); @@ -2427,7 +2535,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); out_free: - free_cpumasks(NULL, &tmp); + free_tmpmasks(&tmp); return retval; } @@ -2444,33 +2552,23 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, { int retval; struct tmpmasks tmp; - struct cpuset *parent = parent_cs(cs); - bool invalidate = false; bool force = false; int old_prs = cs->partition_root_state; - if (!*buf) { - cpumask_clear(trialcs->exclusive_cpus); - cpumask_clear(trialcs->effective_xcpus); - } else { - retval = cpulist_parse(buf, trialcs->exclusive_cpus); - if (retval < 0) - return retval; - } + retval = parse_cpuset_cpulist(buf, trialcs->exclusive_cpus); + if (retval < 0) + return retval; /* Nothing to do if the CPUs didn't change */ if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus)) return 0; - if (*buf) { - trialcs->partition_root_state = PRS_MEMBER; - /* - * Reject the change if there is exclusive CPUs conflict with - * the siblings. - */ - if (compute_effective_exclusive_cpumask(trialcs, NULL, cs)) - return -EINVAL; - } + /* + * Reject the change if there is exclusive CPUs conflict with + * the siblings. + */ + if (compute_trialcs_excpus(trialcs, cs)) + return -EINVAL; /* * Check all the descendants in update_cpumasks_hier() if @@ -2482,35 +2580,12 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval) return retval; - if (alloc_cpumasks(NULL, &tmp)) + if (alloc_tmpmasks(&tmp)) return -ENOMEM; - if (old_prs) { - if (cpumask_empty(trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_INVCPUS; - } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_HKEEPING; - } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { - invalidate = true; - cs->prs_err = PERR_NOCPUS; - } + trialcs->prs_err = PERR_NONE; + partition_cpus_change(cs, trialcs, &tmp); - if (is_remote_partition(cs)) { - if (invalidate) - remote_partition_disable(cs, &tmp); - else - remote_cpus_update(cs, trialcs->exclusive_cpus, - trialcs->effective_xcpus, &tmp); - } else if (invalidate) { - update_parent_effective_cpumask(cs, partcmd_invalidate, - NULL, &tmp); - } else { - update_parent_effective_cpumask(cs, partcmd_update, - trialcs->effective_xcpus, &tmp); - } - } spin_lock_irq(&callback_lock); cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus); cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); @@ -2530,7 +2605,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); - free_cpumasks(NULL, &tmp); + free_tmpmasks(&tmp); return 0; } @@ -2582,9 +2657,24 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, } } -static void cpuset_post_attach(void) +static void flush_migrate_mm_task_workfn(struct callback_head *head) { flush_workqueue(cpuset_migrate_mm_wq); + kfree(head); +} + +static void schedule_flush_migrate_mm(void) +{ + struct callback_head *flush_cb; + + flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL); + if (!flush_cb) + return; + + init_task_work(flush_cb, flush_migrate_mm_task_workfn); + + if (task_work_add(current, flush_cb, TWA_RESUME)) + kfree(flush_cb); } /* @@ -2750,32 +2840,17 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, int retval; /* - * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; - * it's read-only - */ - if (cs == &top_cpuset) { - retval = -EACCES; - goto done; - } - - /* * An empty mems_allowed is ok iff there are no tasks in the cpuset. - * Since nodelist_parse() fails on an empty mask, we special case - * that parsing. The validate_change() call ensures that cpusets - * with tasks have memory. + * The validate_change() call ensures that cpusets with tasks have memory. */ - if (!*buf) { - nodes_clear(trialcs->mems_allowed); - } else { - retval = nodelist_parse(buf, trialcs->mems_allowed); - if (retval < 0) - goto done; + retval = nodelist_parse(buf, trialcs->mems_allowed); + if (retval < 0) + goto done; - if (!nodes_subset(trialcs->mems_allowed, - top_cpuset.mems_allowed)) { - retval = -EINVAL; - goto done; - } + if (!nodes_subset(trialcs->mems_allowed, + top_cpuset.mems_allowed)) { + retval = -EINVAL; + goto done; } if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { @@ -2826,7 +2901,7 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int spread_flag_changed; int err; - trialcs = alloc_trial_cpuset(cs); + trialcs = dup_or_alloc_cpuset(cs); if (!trialcs) return -ENOMEM; @@ -2884,10 +2959,10 @@ static int update_prstate(struct cpuset *cs, int new_prs) /* * Treat a previously invalid partition root as if it is a "member". */ - if (new_prs && is_prs_invalid(old_prs)) + if (new_prs && is_partition_invalid(cs)) old_prs = PRS_MEMBER; - if (alloc_cpumasks(NULL, &tmpmask)) + if (alloc_tmpmasks(&tmpmask)) return -ENOMEM; err = update_partition_exclusive_flag(cs, new_prs); @@ -2983,7 +3058,7 @@ out: notify_partition_change(cs, old_prs); if (force_sd_rebuild) rebuild_sched_domains_locked(); - free_cpumasks(NULL, &tmpmask); + free_tmpmasks(&tmpmask); return 0; } @@ -3141,6 +3216,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) struct cpuset *cs; struct cpuset *oldcs = cpuset_attach_old_cs; bool cpus_updated, mems_updated; + bool queue_task_work = false; cgroup_taskset_first(tset, &css); cs = css_cs(css); @@ -3191,15 +3267,18 @@ static void cpuset_attach(struct cgroup_taskset *tset) * @old_mems_allowed is the right nodesets that we * migrate mm from. */ - if (is_memory_migrate(cs)) + if (is_memory_migrate(cs)) { cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); - else + queue_task_work = true; + } else mmput(mm); } } out: + if (queue_task_work) + schedule_flush_migrate_mm(); cs->old_mems_allowed = cpuset_attach_nodemask_to; if (cs->nr_migrate_dl_tasks) { @@ -3223,13 +3302,16 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, struct cpuset *trialcs; int retval = -ENODEV; + /* root is read-only */ + if (cs == &top_cpuset) + return -EACCES; + buf = strstrip(buf); - cpus_read_lock(); - mutex_lock(&cpuset_mutex); + cpuset_full_lock(); if (!is_cpuset_online(cs)) goto out_unlock; - trialcs = alloc_trial_cpuset(cs); + trialcs = dup_or_alloc_cpuset(cs); if (!trialcs) { retval = -ENOMEM; goto out_unlock; @@ -3254,9 +3336,9 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, if (force_sd_rebuild) rebuild_sched_domains_locked(); out_unlock: - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); - flush_workqueue(cpuset_migrate_mm_wq); + cpuset_full_unlock(); + if (of_cft(of)->private == FILE_MEMLIST) + schedule_flush_migrate_mm(); return retval ?: nbytes; } @@ -3358,12 +3440,10 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf, else return -EINVAL; - cpus_read_lock(); - mutex_lock(&cpuset_mutex); + cpuset_full_lock(); if (is_cpuset_online(cs)) retval = update_prstate(cs, val); - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); + cpuset_full_unlock(); return retval ?: nbytes; } @@ -3462,15 +3542,10 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) if (!parent_css) return &top_cpuset.css; - cs = kzalloc(sizeof(*cs), GFP_KERNEL); + cs = dup_or_alloc_cpuset(NULL); if (!cs) return ERR_PTR(-ENOMEM); - if (alloc_cpumasks(cs, NULL)) { - kfree(cs); - return ERR_PTR(-ENOMEM); - } - __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; @@ -3493,10 +3568,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (!parent) return 0; - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - - set_bit(CS_ONLINE, &cs->flags); + cpuset_full_lock(); if (is_spread_page(parent)) set_bit(CS_SPREAD_PAGE, &cs->flags); if (is_spread_slab(parent)) @@ -3548,8 +3620,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); + cpuset_full_unlock(); return 0; } @@ -3564,17 +3635,12 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - + cpuset_full_lock(); if (!cpuset_v2() && is_sched_load_balance(cs)) cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); cpuset_dec(); - clear_bit(CS_ONLINE, &cs->flags); - - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); + cpuset_full_unlock(); } /* @@ -3586,16 +3652,11 @@ static void cpuset_css_killed(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - + cpuset_full_lock(); /* Reset valid partition back to member */ if (is_partition_valid(cs)) update_prstate(cs, PRS_MEMBER); - - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); - + cpuset_full_unlock(); } static void cpuset_css_free(struct cgroup_subsys_state *css) @@ -3724,7 +3785,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, - .post_attach = cpuset_post_attach, .bind = cpuset_bind, .can_fork = cpuset_can_fork, .cancel_fork = cpuset_cancel_fork, @@ -3928,7 +3988,7 @@ static void cpuset_handle_hotplug(void) bool on_dfl = is_in_v2_mode(); struct tmpmasks tmp, *ptmp = NULL; - if (on_dfl && !alloc_cpumasks(NULL, &tmp)) + if (on_dfl && !alloc_tmpmasks(&tmp)) ptmp = &tmp; lockdep_assert_cpus_held(); @@ -4008,7 +4068,7 @@ static void cpuset_handle_hotplug(void) if (force_sd_rebuild) rebuild_sched_domains_cpuslocked(); - free_cpumasks(NULL, ptmp); + free_tmpmasks(ptmp); } void cpuset_update_active_cpus(void) @@ -4073,7 +4133,6 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) struct cpuset *cs; spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); cs = task_cs(tsk); if (cs != &top_cpuset) @@ -4095,7 +4154,6 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) cpumask_copy(pmask, possible_mask); } - rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); } @@ -4168,9 +4226,7 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) unsigned long flags; spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); guarantee_online_mems(task_cs(tsk), &mask); - rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); return mask; @@ -4265,10 +4321,8 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) /* Not hardwall and node outside mems_allowed: scan up cpusets */ spin_lock_irqsave(&callback_lock, flags); - rcu_read_lock(); cs = nearest_hardwall_ancestor(task_cs(current)); allowed = node_isset(node, cs->mems_allowed); - rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); return allowed; diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 80aa3f027ac3..81ea38dd6f9d 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -49,7 +49,6 @@ static int current_css_set_read(struct seq_file *seq, void *v) return -ENODEV; spin_lock_irq(&css_set_lock); - rcu_read_lock(); cset = task_css_set(current); refcnt = refcount_read(&cset->refcount); seq_printf(seq, "css_set %pK %d", cset, refcnt); @@ -67,7 +66,6 @@ static int current_css_set_read(struct seq_file *seq, void *v) seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name, css, css->id); } - rcu_read_unlock(); spin_unlock_irq(&css_set_lock); cgroup_kn_unlock(of->kn); return 0; @@ -95,7 +93,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) return -ENOMEM; spin_lock_irq(&css_set_lock); - rcu_read_lock(); cset = task_css_set(current); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -104,7 +101,6 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) seq_printf(seq, "Root %d group %s\n", c->root->hierarchy_id, name_buf); } - rcu_read_unlock(); spin_unlock_irq(&css_set_lock); kfree(name_buf); return 0; diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index bf1690a167dd..6c18854bff34 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -171,7 +171,7 @@ static void cgroup_freeze_task(struct task_struct *task, bool freeze) /* * Freeze or unfreeze all tasks in the given cgroup. */ -static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) +static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze, u64 ts_nsec) { struct css_task_iter it; struct task_struct *task; @@ -179,10 +179,16 @@ static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - if (freeze) + write_seqcount_begin(&cgrp->freezer.freeze_seq); + if (freeze) { set_bit(CGRP_FREEZE, &cgrp->flags); - else + cgrp->freezer.freeze_start_nsec = ts_nsec; + } else { clear_bit(CGRP_FREEZE, &cgrp->flags); + cgrp->freezer.frozen_nsec += (ts_nsec - + cgrp->freezer.freeze_start_nsec); + } + write_seqcount_end(&cgrp->freezer.freeze_seq); spin_unlock_irq(&css_set_lock); if (freeze) @@ -260,6 +266,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) struct cgroup *parent; struct cgroup *dsct; bool applied = false; + u64 ts_nsec; bool old_e; lockdep_assert_held(&cgroup_mutex); @@ -271,6 +278,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) return; cgrp->freezer.freeze = freeze; + ts_nsec = ktime_get_ns(); /* * Propagate changes downwards the cgroup tree. @@ -298,7 +306,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) /* * Do change actual state: freeze or unfreeze. */ - cgroup_do_freeze(dsct, freeze); + cgroup_do_freeze(dsct, freeze, ts_nsec); applied = true; } diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 144a464e45c6..fdbe57578e68 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -5,7 +5,7 @@ #include <linux/slab.h> #include <linux/nsproxy.h> #include <linux/proc_ns.h> - +#include <linux/nstree.h> /* cgroup namespaces */ @@ -21,33 +21,32 @@ static void dec_cgroup_namespaces(struct ucounts *ucounts) static struct cgroup_namespace *alloc_cgroup_ns(void) { - struct cgroup_namespace *new_ns; + struct cgroup_namespace *new_ns __free(kfree) = NULL; int ret; new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); - ret = ns_alloc_inum(&new_ns->ns); - if (ret) { - kfree(new_ns); + ret = ns_common_init(new_ns); + if (ret) return ERR_PTR(ret); - } - refcount_set(&new_ns->ns.count, 1); - new_ns->ns.ops = &cgroupns_operations; - return new_ns; + ns_tree_add(new_ns); + return no_free_ptr(new_ns); } void free_cgroup_ns(struct cgroup_namespace *ns) { + ns_tree_remove(ns); put_css_set(ns->root_cset); dec_cgroup_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); - kfree(ns); + ns_common_free(ns); + /* Concurrent nstree traversal depends on a grace period. */ + kfree_rcu(ns, ns.ns_rcu); } EXPORT_SYMBOL(free_cgroup_ns); -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, +struct cgroup_namespace *copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { @@ -90,11 +89,6 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, return new_ns; } -static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) -{ - return container_of(ns, struct cgroup_namespace, ns); -} - static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; @@ -143,7 +137,6 @@ static struct user_namespace *cgroupns_owner(struct ns_common *ns) const struct proc_ns_operations cgroupns_operations = { .name = "cgroup", - .type = CLONE_NEWCGROUP, .get = cgroupns_get, .put = cgroupns_put, .install = cgroupns_install, |