author    Linus Torvalds <torvalds@linux-foundation.org>   2026-02-11 13:20:50 -0800
committer Linus Torvalds <torvalds@linux-foundation.org>   2026-02-11 13:20:50 -0800
commit    ff661eeee26038f15ed9dd33c91809632e11d9eb
tree      6ad1a8e8a47b929ac3e0c563a8b85e7a53363403 /kernel
parent    9bdc64892dcce732d55b2c07d80b36a6c3e1b5f4
parent    8b1f3c54f930c3aeda0b5bad97bc317fc80267fd
Merge tag 'cgroup-for-6.20' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:
- cpuset changes:
- Continue separating v1 and v2 implementations by moving more
v1-specific logic into cpuset-v1.c
- Improve partition handling. Sibling partitions are no longer
invalidated on cpuset.cpus conflict, cpuset.cpus changes no longer
fail in v2, and effective_xcpus computation is made consistent
- Fix partition effective CPUs overlap that caused a warning on
cpuset removal when sibling partitions shared CPUs
- Increase the maximum cgroup subsystem count from 16 to 32 to
  accommodate future subsystem additions (see the sketch after this list)
- Misc cleanups and selftest improvements, including switching to the
  css_is_online() helper, removing dead code and stale documentation
  references, using lockdep_assert_cpuset_lock_held() consistently, and
  adding polling helpers for asynchronously updated cgroup statistics
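
The subsystem-count bump referenced above is mostly a type-widening exercise: the diff below switches the controller bitmasks from u16 to u32 and relaxes the BUILD_BUG_ON() ceiling from 16 to 32. As a rough illustration of why the old ceiling was 16, here is a minimal standalone C sketch (userspace, hypothetical names, not kernel code):

```c
/*
 * Minimal userspace sketch (not kernel code): one bit per controller in
 * a subsystem bitmask. A u16 mask tops out at subsystem ID 15, so a
 * 17th controller cannot be represented; widening to u32 doubles the
 * headroom, matching the BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 32) change.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define OLD_MAX_SUBSYS 16	/* old ceiling: bits 0..15 of a u16 */
#define NEW_MAX_SUBSYS 32	/* new ceiling: bits 0..31 of a u32 */

int main(void)
{
	uint32_t ss_mask = 0;	/* widened mask, as in the diff below */
	int ssid;

	/* Enable hypothetical subsystem IDs 0..17. */
	for (ssid = 0; ssid <= 17; ssid++) {
		if (ssid >= OLD_MAX_SUBSYS && ssid < NEW_MAX_SUBSYS)
			printf("ssid %d only fits in the u32 mask\n", ssid);
		ss_mask |= UINT32_C(1) << ssid;
	}

	printf("ss_mask = %#010" PRIx32 "\n", ss_mask);
	return 0;
}
```

Compiled with any C99 compiler, it prints the two subsystem IDs (16 and 17) that only fit once the mask is 32 bits wide.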
* tag 'cgroup-for-6.20' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (21 commits)
cpuset: fix overlap of partition effective CPUs
cgroup: increase maximum subsystem count from 16 to 32
cgroup: Remove stale cpu.rt.max reference from documentation
cpuset: replace direct lockdep_assert_held() with lockdep_assert_cpuset_lock_held()
cgroup/cpuset: Move the v1 empty cpus/mems check to cpuset1_validate_change()
cgroup/cpuset: Don't invalidate sibling partitions on cpuset.cpus conflict
cgroup/cpuset: Don't fail cpuset.cpus change in v2
cgroup/cpuset: Consistently compute effective_xcpus in update_cpumasks_hier()
cgroup/cpuset: Streamline rm_siblings_excl_cpus()
cpuset: remove dead code in cpuset-v1.c
cpuset: remove v1-specific code from generate_sched_domains
cpuset: separate generate_sched_domains for v1 and v2
cpuset: move update_domain_attr_tree to cpuset_v1.c
cpuset: add cpuset1_init helper for v1 initialization
cpuset: add cpuset1_online_css helper for v1-specific operations
cpuset: add lockdep_assert_cpuset_lock_held helper
cpuset: Remove unnecessary checks in rebuild_sched_domains_locked
cgroup: switch to css_is_online() helper
selftests: cgroup: Replace sleep with cg_read_key_long_poll() for waiting on nr_dying_descendants
selftests: cgroup: make test_memcg_sock robust against delayed sock stats
...
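
Two of the commits above ("cpuset: add lockdep_assert_cpuset_lock_held helper" and "cpuset: replace direct lockdep_assert_held() with lockdep_assert_cpuset_lock_held()") funnel the open-coded lockdep_assert_held(&cpuset_mutex) checks through one named helper. A rough userspace analogue of that pattern, using pthreads and plain assert() (hypothetical names, not the kernel implementation), might look like this:

```c
/*
 * Userspace analogue (pthread-based demo, not the kernel implementation)
 * of hiding "assert that the cpuset lock is held" behind a single named
 * helper, as the new lockdep_assert_cpuset_lock_held() does for
 * cpuset_mutex.
 */
#include <assert.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cpuset_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_t cpuset_mutex_owner;
static int cpuset_mutex_held;

static void cpuset_lock(void)
{
	pthread_mutex_lock(&cpuset_mutex);
	cpuset_mutex_owner = pthread_self();
	cpuset_mutex_held = 1;
}

static void cpuset_unlock(void)
{
	cpuset_mutex_held = 0;
	pthread_mutex_unlock(&cpuset_mutex);
}

/* One helper instead of open-coded assertions at every call site. */
static void lockdep_assert_cpuset_lock_held(void)
{
	assert(cpuset_mutex_held &&
	       pthread_equal(cpuset_mutex_owner, pthread_self()));
}

static void update_cpuset_state(void)
{
	lockdep_assert_cpuset_lock_held();
	puts("cpuset lock held, safe to modify shared state");
}

int main(void)
{
	cpuset_lock();
	update_cpuset_state();
	cpuset_unlock();
	return 0;
}
```

The point of the kernel change is the same as here: call sites assert "the cpuset lock is held" without each of them naming the underlying mutex.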
Diffstat (limited to 'kernel')
-rw-r--r--   kernel/cgroup/cgroup-internal.h |   8
-rw-r--r--   kernel/cgroup/cgroup-v1.c       |  12
-rw-r--r--   kernel/cgroup/cgroup.c          |  50
-rw-r--r--   kernel/cgroup/cpuset-internal.h |  54
-rw-r--r--   kernel/cgroup/cpuset-v1.c       | 271
-rw-r--r--   kernel/cgroup/cpuset.c          | 499
-rw-r--r--   kernel/cgroup/debug.c           |   2

7 files changed, 473 insertions(+), 423 deletions(-)
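
Much of the cpuset.c churn in the diffstat comes from the v2-only generate_sched_domains() path shown in the diff below: every valid partition root becomes one sched domain, and sibling partitions are expected to have disjoint effective CPUs (an overlap is now a WARN_ON_ONCE rather than an input to the scheduler). A toy userspace check of that disjointness invariant, using plain unsigned long masks instead of the kernel cpumask API (hypothetical data), could look like this:

```c
/*
 * Toy sketch (not kernel code, hypothetical data): check that the
 * effective CPU masks of sibling partition roots do not overlap; this is
 * the invariant the v2 sched-domain code below warns about when violated.
 */
#include <stdio.h>

struct partition {
	const char *name;
	unsigned long effective_cpus;	/* one bit per CPU, CPUs 0..63 */
};

/* Analogue of cpusets_overlap(): do two effective masks intersect? */
static int masks_overlap(unsigned long a, unsigned long b)
{
	return (a & b) != 0;
}

int main(void)
{
	/* Partitions on CPUs 0-3, 4-7, and an overlapping 6-9. */
	struct partition parts[] = {
		{ "part-a", 0x00fUL },
		{ "part-b", 0x0f0UL },
		{ "part-c", 0x3c0UL },
	};
	int n = sizeof(parts) / sizeof(parts[0]);
	int i, j;

	for (i = 0; i < n; i++)
		for (j = i + 1; j < n; j++)
			if (masks_overlap(parts[i].effective_cpus,
					  parts[j].effective_cpus))
				printf("overlap: %s and %s\n",
				       parts[i].name, parts[j].name);
	return 0;
}
```

With the sample masks above it reports the deliberately overlapping pair, which is the situation the "cpuset: fix overlap of partition effective CPUs" commit prevents from arising in the first place.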
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 22051b4f1ccb..3bfe37693d68 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -52,7 +52,7 @@ struct cgroup_fs_context { bool cpuset_clone_children; bool none; /* User explicitly requested empty subsystem */ bool all_ss; /* Seen 'all' option */ - u16 subsys_mask; /* Selected subsystems */ + u32 subsys_mask; /* Selected subsystems */ char *name; /* Hierarchy name */ char *release_agent; /* Path for release notifications */ }; @@ -146,7 +146,7 @@ struct cgroup_mgctx { struct cgroup_taskset tset; /* subsystems affected by migration */ - u16 ss_mask; + u32 ss_mask; }; #define CGROUP_TASKSET_INIT(tset) \ @@ -235,8 +235,8 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, void cgroup_favor_dynmods(struct cgroup_root *root, bool favor); void cgroup_free_root(struct cgroup_root *root); void init_cgroup_root(struct cgroup_fs_context *ctx); -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); -int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); +int cgroup_setup_root(struct cgroup_root *root, u32 ss_mask); +int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask); int cgroup_do_get_tree(struct fs_context *fc); int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index a9e029b570c8..724950c4b690 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -28,7 +28,7 @@ #define CGROUP_PIDLIST_DESTROY_DELAY HZ /* Controllers blocked by the commandline in v1 */ -static u16 cgroup_no_v1_mask; +static u32 cgroup_no_v1_mask; /* disable named v1 mounts */ static bool cgroup_no_v1_named; @@ -1037,13 +1037,13 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) static int check_cgroupfs_options(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); - u16 mask = U16_MAX; - u16 enabled = 0; + u32 mask = U32_MAX; + u32 enabled = 0; struct cgroup_subsys *ss; int i; #ifdef CONFIG_CPUSETS - mask = ~((u16)1 << cpuset_cgrp_id); + mask = ~((u32)1 << cpuset_cgrp_id); #endif for_each_subsys(ss, i) if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i) && @@ -1095,7 +1095,7 @@ int cgroup1_reconfigure(struct fs_context *fc) struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); int ret = 0; - u16 added_mask, removed_mask; + u32 added_mask, removed_mask; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); @@ -1343,7 +1343,7 @@ static int __init cgroup_no_v1(char *str) continue; if (!strcmp(token, "all")) { - cgroup_no_v1_mask = U16_MAX; + cgroup_no_v1_mask = U32_MAX; continue; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 5f0d33b04910..8af4351536cf 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -203,13 +203,13 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); bool cgrp_dfl_visible; /* some controllers are not supported in the default hierarchy */ -static u16 cgrp_dfl_inhibit_ss_mask; +static u32 cgrp_dfl_inhibit_ss_mask; /* some controllers are implicitly enabled on the default hierarchy */ -static u16 cgrp_dfl_implicit_ss_mask; +static u32 cgrp_dfl_implicit_ss_mask; /* some controllers can be threaded on the default hierarchy */ -static u16 cgrp_dfl_threaded_ss_mask; +static u32 cgrp_dfl_threaded_ss_mask; /* The list of hierarchy roots */ LIST_HEAD(cgroup_roots); @@ -231,10 +231,10 @@ static u64 css_serial_nr_next = 1; * 
These bitmasks identify subsystems with specific features to avoid * having to do iterative checks repeatedly. */ -static u16 have_fork_callback __read_mostly; -static u16 have_exit_callback __read_mostly; -static u16 have_release_callback __read_mostly; -static u16 have_canfork_callback __read_mostly; +static u32 have_fork_callback __read_mostly; +static u32 have_exit_callback __read_mostly; +static u32 have_release_callback __read_mostly; +static u32 have_canfork_callback __read_mostly; static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS); @@ -472,13 +472,13 @@ static bool cgroup_is_valid_domain(struct cgroup *cgrp) } /* subsystems visibly enabled on a cgroup */ -static u16 cgroup_control(struct cgroup *cgrp) +static u32 cgroup_control(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); - u16 root_ss_mask = cgrp->root->subsys_mask; + u32 root_ss_mask = cgrp->root->subsys_mask; if (parent) { - u16 ss_mask = parent->subtree_control; + u32 ss_mask = parent->subtree_control; /* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp)) @@ -493,12 +493,12 @@ static u16 cgroup_control(struct cgroup *cgrp) } /* subsystems enabled on a cgroup */ -static u16 cgroup_ss_mask(struct cgroup *cgrp) +static u32 cgroup_ss_mask(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); if (parent) { - u16 ss_mask = parent->subtree_ss_mask; + u32 ss_mask = parent->subtree_ss_mask; /* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp)) @@ -1633,9 +1633,9 @@ static umode_t cgroup_file_mode(const struct cftype *cft) * This function calculates which subsystems need to be enabled if * @subtree_control is to be applied while restricted to @this_ss_mask. 
*/ -static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) +static u32 cgroup_calc_subtree_ss_mask(u32 subtree_control, u32 this_ss_mask) { - u16 cur_ss_mask = subtree_control; + u32 cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; @@ -1644,7 +1644,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) cur_ss_mask |= cgrp_dfl_implicit_ss_mask; while (true) { - u16 new_ss_mask = cur_ss_mask; + u32 new_ss_mask = cur_ss_mask; do_each_subsys_mask(ss, ssid, cur_ss_mask) { new_ss_mask |= ss->depends_on; @@ -1848,12 +1848,12 @@ err: return ret; } -int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) +int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; int ssid, ret; - u16 dfl_disable_ss_mask = 0; + u32 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); @@ -2149,7 +2149,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) +int cgroup_setup_root(struct cgroup_root *root, u32 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -3131,7 +3131,7 @@ void cgroup_procs_write_finish(struct task_struct *task, put_task_struct(task); } -static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) +static void cgroup_print_ss_mask(struct seq_file *seq, u32 ss_mask) { struct cgroup_subsys *ss; bool printed = false; @@ -3496,9 +3496,9 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret) cgroup_apply_control_disable(cgrp); } -static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable) +static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u32 enable) { - u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask; + u32 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask; /* if nothing is getting enabled, nothing to worry about */ if (!enable) @@ -3541,7 +3541,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - u16 enable = 0, disable = 0; + u32 enable = 0, disable = 0; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; @@ -4945,7 +4945,7 @@ bool css_has_online_children(struct cgroup_subsys_state *css) rcu_read_lock(); css_for_each_child(child, css) { - if (child->flags & CSS_ONLINE) { + if (css_is_online(child)) { ret = true; break; } @@ -5750,7 +5750,7 @@ static void offline_css(struct cgroup_subsys_state *css) lockdep_assert_held(&cgroup_mutex); - if (!(css->flags & CSS_ONLINE)) + if (!css_is_online(css)) return; if (ss->css_offline) @@ -6347,7 +6347,7 @@ int __init cgroup_init(void) struct cgroup_subsys *ss; int ssid; - BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); + BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 32); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 01976c8e7d49..fd7d19842ded 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -9,6 +9,7 @@ #include <linux/cpuset.h> #include <linux/spinlock.h> #include <linux/union_find.h> +#include <linux/sched/isolation.h> /* See "Frequency meter" comments, below. 
*/ @@ -144,17 +145,12 @@ struct cpuset { */ nodemask_t old_mems_allowed; - struct fmeter fmeter; /* memory_pressure filter */ - /* * Tasks are being attached to this cpuset. Used to prevent * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). */ int attach_in_progress; - /* for custom sched domain */ - int relax_domain_level; - /* partition root state */ int partition_root_state; @@ -179,10 +175,19 @@ struct cpuset { /* Handle for cpuset.cpus.partition */ struct cgroup_file partition_file; +#ifdef CONFIG_CPUSETS_V1 + struct fmeter fmeter; /* memory_pressure filter */ + + /* for custom sched domain */ + int relax_domain_level; + /* Used to merge intersecting subsets for generate_sched_domains */ struct uf_node node; +#endif }; +extern struct cpuset top_cpuset; + static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) { return css ? container_of(css, struct cpuset, css) : NULL; @@ -240,6 +245,30 @@ static inline int is_spread_slab(const struct cpuset *cs) return test_bit(CS_SPREAD_SLAB, &cs->flags); } +/* + * Helper routine for generate_sched_domains(). + * Do cpusets a, b have overlapping effective cpus_allowed masks? + */ +static inline int cpusets_overlap(struct cpuset *a, struct cpuset *b) +{ + return cpumask_intersects(a->effective_cpus, b->effective_cpus); +} + +static inline int nr_cpusets(void) +{ + /* jump label reference count + the top-level cpuset */ + return static_key_count(&cpusets_enabled_key.key) + 1; +} + +static inline bool cpuset_is_populated(struct cpuset *cs) +{ + lockdep_assert_cpuset_lock_held(); + + /* Cpusets in the process of attaching should be considered as populated */ + return cgroup_is_populated(cs->css.cgroup) || + cs->attach_in_progress; +} + /** * cpuset_for_each_child - traverse online children of a cpuset * @child_cs: loop cursor pointing to the current child @@ -285,7 +314,6 @@ void cpuset_full_unlock(void); */ #ifdef CONFIG_CPUSETS_V1 extern struct cftype cpuset1_files[]; -void fmeter_init(struct fmeter *fmp); void cpuset1_update_task_spread_flags(struct cpuset *cs, struct task_struct *tsk); void cpuset1_update_tasks_flags(struct cpuset *cs); @@ -293,8 +321,13 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated); int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial); +bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2); +void cpuset1_init(struct cpuset *cs); +void cpuset1_online_css(struct cgroup_subsys_state *css); +int cpuset1_generate_sched_domains(cpumask_var_t **domains, + struct sched_domain_attr **attributes); + #else -static inline void fmeter_init(struct fmeter *fmp) {} static inline void cpuset1_update_task_spread_flags(struct cpuset *cs, struct task_struct *tsk) {} static inline void cpuset1_update_tasks_flags(struct cpuset *cs) {} @@ -303,6 +336,13 @@ static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs, bool cpus_updated, bool mems_updated) {} static inline int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) { return 0; } +static inline bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, + struct cpuset *cs2) { return false; } +static inline void cpuset1_init(struct cpuset *cs) {} +static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {} +static inline int cpuset1_generate_sched_domains(cpumask_var_t **domains, + struct sched_domain_attr **attributes) { return 0; }; + #endif /* CONFIG_CPUSETS_V1 */ #endif /* __CPUSET_INTERNAL_H */ diff --git 
a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 12e76774c75b..7a23b9e8778f 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -62,7 +62,7 @@ struct cpuset_remove_tasks_struct { #define FM_SCALE 1000 /* faux fixed point scale */ /* Initialize a frequency meter */ -void fmeter_init(struct fmeter *fmp) +static void fmeter_init(struct fmeter *fmp) { fmp->cnt = 0; fmp->val = 0; @@ -368,11 +368,44 @@ int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) if (par && !is_cpuset_subset(trial, par)) goto out; + /* + * Cpusets with tasks - existing or newly being attached - can't + * be changed to have empty cpus_allowed or mems_allowed. + */ + ret = -ENOSPC; + if (cpuset_is_populated(cur)) { + if (!cpumask_empty(cur->cpus_allowed) && + cpumask_empty(trial->cpus_allowed)) + goto out; + if (!nodes_empty(cur->mems_allowed) && + nodes_empty(trial->mems_allowed)) + goto out; + } + ret = 0; out: return ret; } +/* + * cpuset1_cpus_excl_conflict() - Check if two cpusets have exclusive CPU conflicts + * to legacy (v1) + * @cs1: first cpuset to check + * @cs2: second cpuset to check + * + * Returns: true if CPU exclusivity conflict exists, false otherwise + * + * If either cpuset is CPU exclusive, their allowed CPUs cannot intersect. + */ +bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +{ + if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2)) + return cpumask_intersects(cs1->cpus_allowed, + cs2->cpus_allowed); + + return false; +} + #ifdef CONFIG_PROC_PID_CPUSET /* * proc_cpuset_show() @@ -499,6 +532,242 @@ out_unlock: return retval; } +void cpuset1_init(struct cpuset *cs) +{ + fmeter_init(&cs->fmeter); + cs->relax_domain_level = -1; +} + +void cpuset1_online_css(struct cgroup_subsys_state *css) +{ + struct cpuset *tmp_cs; + struct cgroup_subsys_state *pos_css; + struct cpuset *cs = css_cs(css); + struct cpuset *parent = parent_cs(cs); + + lockdep_assert_cpus_held(); + lockdep_assert_cpuset_lock_held(); + + if (is_spread_page(parent)) + set_bit(CS_SPREAD_PAGE, &cs->flags); + if (is_spread_slab(parent)) + set_bit(CS_SPREAD_SLAB, &cs->flags); + + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) + return; + + /* + * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is + * set. This flag handling is implemented in cgroup core for + * historical reasons - the flag may be specified during mount. + * + * Currently, if any sibling cpusets have exclusive cpus or mem, we + * refuse to clone the configuration - thereby refusing the task to + * be entered, and as a result refusing the sys_unshare() or + * clone() which initiated it. If this becomes a problem for some + * users who wish to allow that scenario, then this could be + * changed to grant parent->cpus_allowed-sibling_cpus_exclusive + * (and likewise for mems) to the new cgroup. 
+ */ + rcu_read_lock(); + cpuset_for_each_child(tmp_cs, pos_css, parent) { + if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + + cpuset_callback_lock_irq(); + cs->mems_allowed = parent->mems_allowed; + cs->effective_mems = parent->mems_allowed; + cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); + cpumask_copy(cs->effective_cpus, parent->cpus_allowed); + cpuset_callback_unlock_irq(); +} + +static void +update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) +{ + if (dattr->relax_domain_level < c->relax_domain_level) + dattr->relax_domain_level = c->relax_domain_level; +} + +static void update_domain_attr_tree(struct sched_domain_attr *dattr, + struct cpuset *root_cs) +{ + struct cpuset *cp; + struct cgroup_subsys_state *pos_css; + + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { + /* skip the whole subtree if @cp doesn't have any CPU */ + if (cpumask_empty(cp->cpus_allowed)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + + if (is_sched_load_balance(cp)) + update_domain_attr(dattr, cp); + } + rcu_read_unlock(); +} + +/* + * cpuset1_generate_sched_domains() + * + * Finding the best partition (set of domains): + * The double nested loops below over i, j scan over the load + * balanced cpusets (using the array of cpuset pointers in csa[]) + * looking for pairs of cpusets that have overlapping cpus_allowed + * and merging them using a union-find algorithm. + * + * The union of the cpus_allowed masks from the set of all cpusets + * having the same root then form the one element of the partition + * (one sched domain) to be passed to partition_sched_domains(). + */ +int cpuset1_generate_sched_domains(cpumask_var_t **domains, + struct sched_domain_attr **attributes) +{ + struct cpuset *cp; /* top-down scan of cpusets */ + struct cpuset **csa; /* array of all cpuset ptrs */ + int csn; /* how many cpuset ptrs in csa so far */ + int i, j; /* indices for partition finding loops */ + cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ + struct sched_domain_attr *dattr; /* attributes for custom domains */ + int ndoms = 0; /* number of sched domains in result */ + int nslot; /* next empty doms[] struct cpumask slot */ + struct cgroup_subsys_state *pos_css; + int nslot_update; + + lockdep_assert_cpuset_lock_held(); + + doms = NULL; + dattr = NULL; + csa = NULL; + + /* Special case for the 99% of systems with one, full, sched domain */ + if (is_sched_load_balance(&top_cpuset)) { + ndoms = 1; + doms = alloc_sched_domains(ndoms); + if (!doms) + goto done; + + dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); + if (dattr) { + *dattr = SD_ATTR_INIT; + update_domain_attr_tree(dattr, &top_cpuset); + } + cpumask_and(doms[0], top_cpuset.effective_cpus, + housekeeping_cpumask(HK_TYPE_DOMAIN)); + + goto done; + } + + csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL); + if (!csa) + goto done; + csn = 0; + + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { + if (cp == &top_cpuset) + continue; + + /* + * Continue traversing beyond @cp iff @cp has some CPUs and + * isn't load balancing. The former is obvious. The + * latter: All child cpusets contain a subset of the + * parent's cpus, so just skip them, and then we call + * update_domain_attr_tree() to calc relax_domain_level of + * the corresponding sched domain. 
+ */ + if (!cpumask_empty(cp->cpus_allowed) && + !(is_sched_load_balance(cp) && + cpumask_intersects(cp->cpus_allowed, + housekeeping_cpumask(HK_TYPE_DOMAIN)))) + continue; + + if (is_sched_load_balance(cp) && + !cpumask_empty(cp->effective_cpus)) + csa[csn++] = cp; + + /* skip @cp's subtree */ + pos_css = css_rightmost_descendant(pos_css); + continue; + } + rcu_read_unlock(); + + for (i = 0; i < csn; i++) + uf_node_init(&csa[i]->node); + + /* Merge overlapping cpusets */ + for (i = 0; i < csn; i++) { + for (j = i + 1; j < csn; j++) { + if (cpusets_overlap(csa[i], csa[j])) + uf_union(&csa[i]->node, &csa[j]->node); + } + } + + /* Count the total number of domains */ + for (i = 0; i < csn; i++) { + if (uf_find(&csa[i]->node) == &csa[i]->node) + ndoms++; + } + + /* + * Now we know how many domains to create. + * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. + */ + doms = alloc_sched_domains(ndoms); + if (!doms) + goto done; + + /* + * The rest of the code, including the scheduler, can deal with + * dattr==NULL case. No need to abort if alloc fails. + */ + dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr), + GFP_KERNEL); + + for (nslot = 0, i = 0; i < csn; i++) { + nslot_update = 0; + for (j = i; j < csn; j++) { + if (uf_find(&csa[j]->node) == &csa[i]->node) { + struct cpumask *dp = doms[nslot]; + + if (i == j) { + nslot_update = 1; + cpumask_clear(dp); + if (dattr) + *(dattr + nslot) = SD_ATTR_INIT; + } + cpumask_or(dp, dp, csa[j]->effective_cpus); + cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); + if (dattr) + update_domain_attr_tree(dattr + nslot, csa[j]); + } + } + if (nslot_update) + nslot++; + } + BUG_ON(nslot != ndoms); + +done: + kfree(csa); + + /* + * Fallback to the default domain if kmalloc() failed. + * See comments in partition_sched_domains(). + */ + if (doms == NULL) + ndoms = 1; + + *domains = doms; + *attributes = dattr; + return ndoms; +} + /* * for the common functions, 'private' gives the type of file */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 01a553caee56..c43efef7df71 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -119,6 +119,17 @@ static bool force_sd_rebuild; * For simplicity, a local partition can be created under a local or remote * partition but a remote partition cannot have any partition root in its * ancestor chain except the cgroup root. + * + * A valid partition can be formed by setting exclusive_cpus or cpus_allowed + * if exclusive_cpus is not set. In the case of partition with empty + * exclusive_cpus, all the conflicting exclusive CPUs specified in the + * following cpumasks of sibling cpusets will be removed from its + * cpus_allowed in determining its effective_xcpus. + * - effective_xcpus + * - exclusive_cpus + * + * The "cpuset.cpus.exclusive" control file should be used for setting up + * partition if the users want to get as many CPUs as possible. */ #define PRS_MEMBER 0 #define PRS_ROOT 1 @@ -201,12 +212,10 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs) * If cpu_online_mask is used while a hotunplug operation is happening in * parallel, we may leave an offline CPU in cpu_allowed or some other masks. 
*/ -static struct cpuset top_cpuset = { +struct cpuset top_cpuset = { .flags = BIT(CS_CPU_EXCLUSIVE) | BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, - .relax_domain_level = -1, - .remote_partition = false, }; /* @@ -261,6 +270,11 @@ void cpuset_unlock(void) mutex_unlock(&cpuset_mutex); } +void lockdep_assert_cpuset_lock_held(void) +{ + lockdep_assert_held(&cpuset_mutex); +} + /** * cpuset_full_lock - Acquire full protection for cpuset modification * @@ -319,7 +333,7 @@ static inline void check_insane_mems_config(nodemask_t *nodes) */ static inline void dec_attach_in_progress_locked(struct cpuset *cs) { - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); cs->attach_in_progress--; if (!cs->attach_in_progress) @@ -353,15 +367,6 @@ static inline bool is_in_v2_mode(void) (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); } -static inline bool cpuset_is_populated(struct cpuset *cs) -{ - lockdep_assert_held(&cpuset_mutex); - - /* Cpusets in the process of attaching should be considered as populated */ - return cgroup_is_populated(cs->css.cgroup) || - cs->attach_in_progress; -} - /** * partition_is_populated - check if partition has tasks * @cs: partition root to be checked @@ -603,36 +608,32 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) /** * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts - * @cs1: first cpuset to check - * @cs2: second cpuset to check + * @trial: the trial cpuset to be checked + * @sibling: a sibling cpuset to be checked against + * @xcpus_changed: set if exclusive_cpus has been set * * Returns: true if CPU exclusivity conflict exists, false otherwise * * Conflict detection rules: - * 1. If either cpuset is CPU exclusive, they must be mutually exclusive - * 2. exclusive_cpus masks cannot intersect between cpusets - * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs + * o cgroup v1 + * See cpuset1_cpus_excl_conflict() + * o cgroup v2 + * - The exclusive_cpus values cannot overlap. + * - New exclusive_cpus cannot be a superset of a sibling's cpus_allowed. 
*/ -static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +static inline bool cpus_excl_conflict(struct cpuset *trial, struct cpuset *sibling, + bool xcpus_changed) { - /* If either cpuset is exclusive, check if they are mutually exclusive */ - if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2)) - return !cpusets_are_exclusive(cs1, cs2); - - /* Exclusive_cpus cannot intersect */ - if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus)) - return true; - - /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */ - if (!cpumask_empty(cs1->cpus_allowed) && - cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus)) - return true; + if (!cpuset_v2()) + return cpuset1_cpus_excl_conflict(trial, sibling); - if (!cpumask_empty(cs2->cpus_allowed) && - cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus)) + /* The cpus_allowed of a sibling cpuset cannot be a subset of the new exclusive_cpus */ + if (xcpus_changed && !cpumask_empty(sibling->cpus_allowed) && + cpumask_subset(sibling->cpus_allowed, trial->exclusive_cpus)) return true; - return false; + /* Exclusive_cpus cannot intersect */ + return cpumask_intersects(trial->exclusive_cpus, sibling->exclusive_cpus); } static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) @@ -666,6 +667,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) { struct cgroup_subsys_state *css; struct cpuset *c, *par; + bool xcpus_changed; int ret = 0; rcu_read_lock(); @@ -682,20 +684,6 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) par = parent_cs(cur); /* - * Cpusets with tasks - existing or newly being attached - can't - * be changed to have empty cpus_allowed or mems_allowed. - */ - ret = -ENOSPC; - if (cpuset_is_populated(cur)) { - if (!cpumask_empty(cur->cpus_allowed) && - cpumask_empty(trial->cpus_allowed)) - goto out; - if (!nodes_empty(cur->mems_allowed) && - nodes_empty(trial->mems_allowed)) - goto out; - } - - /* * We can't shrink if we won't have enough room for SCHED_DEADLINE * tasks. This check is not done when scheduling is disabled as the * users should know what they are doing. @@ -722,10 +710,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) * overlap. exclusive_cpus cannot overlap with each other if set. */ ret = -EINVAL; + xcpus_changed = !cpumask_equal(cur->exclusive_cpus, trial->exclusive_cpus); cpuset_for_each_child(c, css, par) { if (c == cur) continue; - if (cpus_excl_conflict(trial, c)) + if (cpus_excl_conflict(trial, c, xcpus_changed)) goto out; if (mems_excl_conflict(trial, c)) goto out; @@ -738,49 +727,6 @@ out: } #ifdef CONFIG_SMP -/* - * Helper routine for generate_sched_domains(). - * Do cpusets a, b have overlapping effective cpus_allowed masks? 
- */ -static int cpusets_overlap(struct cpuset *a, struct cpuset *b) -{ - return cpumask_intersects(a->effective_cpus, b->effective_cpus); -} - -static void -update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) -{ - if (dattr->relax_domain_level < c->relax_domain_level) - dattr->relax_domain_level = c->relax_domain_level; - return; -} - -static void update_domain_attr_tree(struct sched_domain_attr *dattr, - struct cpuset *root_cs) -{ - struct cpuset *cp; - struct cgroup_subsys_state *pos_css; - - rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - /* skip the whole subtree if @cp doesn't have any CPU */ - if (cpumask_empty(cp->cpus_allowed)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } - - if (is_sched_load_balance(cp)) - update_domain_attr(dattr, cp); - } - rcu_read_unlock(); -} - -/* Must be called with cpuset_mutex held. */ -static inline int nr_cpusets(void) -{ - /* jump label reference count + the top-level cpuset */ - return static_key_count(&cpusets_enabled_key.key) + 1; -} /* * generate_sched_domains() @@ -820,103 +766,46 @@ static inline int nr_cpusets(void) * convenient format, that can be easily compared to the prior * value to determine what partition elements (sched domains) * were changed (added or removed.) - * - * Finding the best partition (set of domains): - * The double nested loops below over i, j scan over the load - * balanced cpusets (using the array of cpuset pointers in csa[]) - * looking for pairs of cpusets that have overlapping cpus_allowed - * and merging them using a union-find algorithm. - * - * The union of the cpus_allowed masks from the set of all cpusets - * having the same root then form the one element of the partition - * (one sched domain) to be passed to partition_sched_domains(). - * */ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { struct cpuset *cp; /* top-down scan of cpusets */ struct cpuset **csa; /* array of all cpuset ptrs */ - int csn; /* how many cpuset ptrs in csa so far */ int i, j; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. 
sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ - int nslot; /* next empty doms[] struct cpumask slot */ struct cgroup_subsys_state *pos_css; - bool root_load_balance = is_sched_load_balance(&top_cpuset); - bool cgrpv2 = cpuset_v2(); - int nslot_update; + + if (!cpuset_v2()) + return cpuset1_generate_sched_domains(domains, attributes); doms = NULL; dattr = NULL; csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ - if (root_load_balance && cpumask_empty(subpartitions_cpus)) { -single_root_domain: + if (cpumask_empty(subpartitions_cpus)) { ndoms = 1; - doms = alloc_sched_domains(ndoms); - if (!doms) - goto done; - - dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); - if (dattr) { - *dattr = SD_ATTR_INIT; - update_domain_attr_tree(dattr, &top_cpuset); - } - cpumask_and(doms[0], top_cpuset.effective_cpus, - housekeeping_cpumask(HK_TYPE_DOMAIN)); - - goto done; + /* !csa will be checked and can be correctly handled */ + goto generate_doms; } csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL); if (!csa) goto done; - csn = 0; + /* Find how many partitions and cache them to csa[] */ rcu_read_lock(); - if (root_load_balance) - csa[csn++] = &top_cpuset; cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { - if (cp == &top_cpuset) - continue; - - if (cgrpv2) - goto v2; - - /* - * v1: - * Continue traversing beyond @cp iff @cp has some CPUs and - * isn't load balancing. The former is obvious. The - * latter: All child cpusets contain a subset of the - * parent's cpus, so just skip them, and then we call - * update_domain_attr_tree() to calc relax_domain_level of - * the corresponding sched domain. - */ - if (!cpumask_empty(cp->cpus_allowed) && - !(is_sched_load_balance(cp) && - cpumask_intersects(cp->cpus_allowed, - housekeeping_cpumask(HK_TYPE_DOMAIN)))) - continue; - - if (is_sched_load_balance(cp) && - !cpumask_empty(cp->effective_cpus)) - csa[csn++] = cp; - - /* skip @cp's subtree */ - pos_css = css_rightmost_descendant(pos_css); - continue; - -v2: /* * Only valid partition roots that are not isolated and with - * non-empty effective_cpus will be saved into csn[]. + * non-empty effective_cpus will be saved into csa[]. */ if ((cp->partition_root_state == PRS_ROOT) && !cpumask_empty(cp->effective_cpus)) - csa[csn++] = cp; + csa[ndoms++] = cp; /* * Skip @cp's subtree if not a partition root and has no @@ -927,40 +816,18 @@ v2: } rcu_read_unlock(); - /* - * If there are only isolated partitions underneath the cgroup root, - * we can optimize out unneeded sched domains scanning. - */ - if (root_load_balance && (csn == 1)) - goto single_root_domain; - - for (i = 0; i < csn; i++) - uf_node_init(&csa[i]->node); - - /* Merge overlapping cpusets */ - for (i = 0; i < csn; i++) { - for (j = i + 1; j < csn; j++) { - if (cpusets_overlap(csa[i], csa[j])) { + for (i = 0; i < ndoms; i++) { + for (j = i + 1; j < ndoms; j++) { + if (cpusets_overlap(csa[i], csa[j])) /* * Cgroup v2 shouldn't pass down overlapping * partition root cpusets. */ - WARN_ON_ONCE(cgrpv2); - uf_union(&csa[i]->node, &csa[j]->node); - } + WARN_ON_ONCE(1); } } - /* Count the total number of domains */ - for (i = 0; i < csn; i++) { - if (uf_find(&csa[i]->node) == &csa[i]->node) - ndoms++; - } - - /* - * Now we know how many domains to create. - * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 
- */ +generate_doms: doms = alloc_sched_domains(ndoms); if (!doms) goto done; @@ -977,45 +844,19 @@ v2: * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a * subset of HK_TYPE_DOMAIN housekeeping CPUs. */ - if (cgrpv2) { - for (i = 0; i < ndoms; i++) { - /* - * The top cpuset may contain some boot time isolated - * CPUs that need to be excluded from the sched domain. - */ - if (csa[i] == &top_cpuset) - cpumask_and(doms[i], csa[i]->effective_cpus, - housekeeping_cpumask(HK_TYPE_DOMAIN)); - else - cpumask_copy(doms[i], csa[i]->effective_cpus); - if (dattr) - dattr[i] = SD_ATTR_INIT; - } - goto done; - } - - for (nslot = 0, i = 0; i < csn; i++) { - nslot_update = 0; - for (j = i; j < csn; j++) { - if (uf_find(&csa[j]->node) == &csa[i]->node) { - struct cpumask *dp = doms[nslot]; - - if (i == j) { - nslot_update = 1; - cpumask_clear(dp); - if (dattr) - *(dattr + nslot) = SD_ATTR_INIT; - } - cpumask_or(dp, dp, csa[j]->effective_cpus); - cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); - if (dattr) - update_domain_attr_tree(dattr + nslot, csa[j]); - } - } - if (nslot_update) - nslot++; + for (i = 0; i < ndoms; i++) { + /* + * The top cpuset may contain some boot time isolated + * CPUs that need to be excluded from the sched domain. + */ + if (!csa || csa[i] == &top_cpuset) + cpumask_and(doms[i], top_cpuset.effective_cpus, + housekeeping_cpumask(HK_TYPE_DOMAIN)); + else + cpumask_copy(doms[i], csa[i]->effective_cpus); + if (dattr) + dattr[i] = SD_ATTR_INIT; } - BUG_ON(nslot != ndoms); done: kfree(csa); @@ -1055,7 +896,7 @@ void dl_rebuild_rd_accounting(void) int cpu; u64 cookie = ++dl_cookie; - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); lockdep_assert_cpus_held(); lockdep_assert_held(&sched_domains_mutex); @@ -1100,53 +941,33 @@ void dl_rebuild_rd_accounting(void) */ void rebuild_sched_domains_locked(void) { - struct cgroup_subsys_state *pos_css; struct sched_domain_attr *attr; cpumask_var_t *doms; - struct cpuset *cs; int ndoms; + int i; lockdep_assert_cpus_held(); - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); force_sd_rebuild = false; - /* - * If we have raced with CPU hotplug, return early to avoid - * passing doms with offlined cpu to partition_sched_domains(). - * Anyways, cpuset_handle_hotplug() will rebuild sched domains. - * - * With no CPUs in any subpartitions, top_cpuset's effective CPUs - * should be the same as the active CPUs, so checking only top_cpuset - * is enough to detect racing CPU offlines. - */ - if (cpumask_empty(subpartitions_cpus) && - !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - return; + /* Generate domain masks and attrs */ + ndoms = generate_sched_domains(&doms, &attr); /* - * With subpartition CPUs, however, the effective CPUs of a partition - * root should be only a subset of the active CPUs. Since a CPU in any - * partition root could be offlined, all must be checked. - */ - if (!cpumask_empty(subpartitions_cpus)) { - rcu_read_lock(); - cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { - if (!is_partition_valid(cs)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } - if (!cpumask_subset(cs->effective_cpus, - cpu_active_mask)) { - rcu_read_unlock(); - return; - } - } - rcu_read_unlock(); + * cpuset_hotplug_workfn is invoked synchronously now, thus this + * function should not race with CPU hotplug. And the effective CPUs + * must not include any offline CPUs. 
Passing an offline CPU in the + * doms to partition_sched_domains() will trigger a kernel panic. + * + * We perform a final check here: if the doms contains any + * offline CPUs, a warning is emitted and we return directly to + * prevent the panic. + */ + for (i = 0; i < ndoms; ++i) { + if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask))) + return; } - /* Generate domain masks and attrs */ - ndoms = generate_sched_domains(&doms, &attr); - /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); } @@ -1501,23 +1322,29 @@ static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs, int retval = 0; if (cpumask_empty(excpus)) - return retval; + return 0; /* - * Exclude exclusive CPUs from siblings + * Remove exclusive CPUs from siblings */ rcu_read_lock(); cpuset_for_each_child(sibling, css, parent) { + struct cpumask *sibling_xcpus; + if (sibling == cs) continue; - if (cpumask_intersects(excpus, sibling->exclusive_cpus)) { - cpumask_andnot(excpus, excpus, sibling->exclusive_cpus); - retval++; - continue; - } - if (cpumask_intersects(excpus, sibling->effective_xcpus)) { - cpumask_andnot(excpus, excpus, sibling->effective_xcpus); + /* + * If exclusive_cpus is defined, effective_xcpus will always + * be a subset. Otherwise, effective_xcpus will only be set + * in a valid partition root. + */ + sibling_xcpus = cpumask_empty(sibling->exclusive_cpus) + ? sibling->effective_xcpus + : sibling->exclusive_cpus; + + if (cpumask_intersects(excpus, sibling_xcpus)) { + cpumask_andnot(excpus, excpus, sibling_xcpus); retval++; } } @@ -1806,7 +1633,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, int parent_prs = parent->partition_root_state; bool nocpu; - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */ /* @@ -2315,17 +2142,13 @@ get_css: spin_lock_irq(&callback_lock); cpumask_copy(cp->effective_cpus, tmp->new_cpus); cp->partition_root_state = new_prs; - if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) - compute_excpus(cp, cp->effective_xcpus); - /* - * Make sure effective_xcpus is properly set for a valid - * partition root. + * Need to compute effective_xcpus if either exclusive_cpus + * is non-empty or it is a valid partition root. */ - if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus)) - cpumask_and(cp->effective_xcpus, - cp->cpus_allowed, parent->effective_xcpus); - else if (new_prs < 0) + if ((new_prs > 0) || !cpumask_empty(cp->exclusive_cpus)) + compute_excpus(cp, cp->effective_xcpus); + if (new_prs <= 0) reset_partition_data(cp); spin_unlock_irq(&callback_lock); @@ -2378,7 +2201,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct cpuset *sibling; struct cgroup_subsys_state *pos_css; - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); /* * Check all its siblings and call update_cpumasks_hier() @@ -2387,27 +2210,20 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, * It is possible a change in parent's effective_cpus * due to a change in a child partition's effective_xcpus will impact * its siblings even if they do not inherit parent's effective_cpus - * directly. + * directly. It should not impact valid partition. * * The update_cpumasks_hier() function may sleep. So we have to * release the RCU read lock before calling it. 
*/ rcu_read_lock(); cpuset_for_each_child(sibling, pos_css, parent) { - if (sibling == cs) + if (sibling == cs || is_partition_valid(sibling)) continue; - if (!is_partition_valid(sibling)) { - compute_effective_cpumask(tmp->new_cpus, sibling, - parent); - if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) - continue; - } else if (is_remote_partition(sibling)) { - /* - * Change in a sibling cpuset won't affect a remote - * partition root. - */ + + compute_effective_cpumask(tmp->new_cpus, sibling, + parent); + if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) continue; - } if (!css_tryget_online(&sibling->css)) continue; @@ -2463,43 +2279,6 @@ static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *tri return PERR_NONE; } -static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs, - struct tmpmasks *tmp) -{ - int retval; - struct cpuset *parent = parent_cs(cs); - - retval = validate_change(cs, trialcs); - - if ((retval == -EINVAL) && cpuset_v2()) { - struct cgroup_subsys_state *css; - struct cpuset *cp; - - /* - * The -EINVAL error code indicates that partition sibling - * CPU exclusivity rule has been violated. We still allow - * the cpumask change to proceed while invalidating the - * partition. However, any conflicting sibling partitions - * have to be marked as invalid too. - */ - trialcs->prs_err = PERR_NOTEXCL; - rcu_read_lock(); - cpuset_for_each_child(cp, css, parent) { - struct cpumask *xcpus = user_xcpus(trialcs); - - if (is_partition_valid(cp) && - cpumask_intersects(xcpus, cp->effective_xcpus)) { - rcu_read_unlock(); - update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp); - rcu_read_lock(); - } - } - rcu_read_unlock(); - retval = 0; - } - return retval; -} - /** * partition_cpus_change - Handle partition state changes due to CPU mask updates * @cs: The target cpuset being modified @@ -2559,15 +2338,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; - if (alloc_tmpmasks(&tmp)) - return -ENOMEM; - compute_trialcs_excpus(trialcs, cs); trialcs->prs_err = PERR_NONE; - retval = cpus_allowed_validate_change(cs, trialcs, &tmp); + retval = validate_change(cs, trialcs); if (retval < 0) - goto out_free; + return retval; + + if (alloc_tmpmasks(&tmp)) + return -ENOMEM; /* * Check all the descendants in update_cpumasks_hier() if @@ -2590,7 +2369,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); -out_free: + free_tmpmasks(&tmp); return retval; } @@ -3249,7 +3028,7 @@ static nodemask_t cpuset_attach_nodemask_to; static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) { - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); if (cs != &top_cpuset) guarantee_active_cpus(task, cpus_attach); @@ -3605,8 +3384,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) return ERR_PTR(-ENOMEM); __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - fmeter_init(&cs->fmeter); - cs->relax_domain_level = -1; + cpuset1_init(cs); /* Set CS_MEMORY_MIGRATE for default hierarchy */ if (cpuset_v2()) @@ -3619,17 +3397,11 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); struct cpuset *parent = parent_cs(cs); - struct cpuset *tmp_cs; - struct cgroup_subsys_state *pos_css; if (!parent) return 0; 
cpuset_full_lock(); - if (is_spread_page(parent)) - set_bit(CS_SPREAD_PAGE, &cs->flags); - if (is_spread_slab(parent)) - set_bit(CS_SPREAD_SLAB, &cs->flags); /* * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated */ @@ -3644,39 +3416,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->effective_mems = parent->effective_mems; } spin_unlock_irq(&callback_lock); + cpuset1_online_css(css); - if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) - goto out_unlock; - - /* - * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is - * set. This flag handling is implemented in cgroup core for - * historical reasons - the flag may be specified during mount. - * - * Currently, if any sibling cpusets have exclusive cpus or mem, we - * refuse to clone the configuration - thereby refusing the task to - * be entered, and as a result refusing the sys_unshare() or - * clone() which initiated it. If this becomes a problem for some - * users who wish to allow that scenario, then this could be - * changed to grant parent->cpus_allowed-sibling_cpus_exclusive - * (and likewise for mems) to the new cgroup. - */ - rcu_read_lock(); - cpuset_for_each_child(tmp_cs, pos_css, parent) { - if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { - rcu_read_unlock(); - goto out_unlock; - } - } - rcu_read_unlock(); - - spin_lock_irq(&callback_lock); - cs->mems_allowed = parent->mems_allowed; - cs->effective_mems = parent->mems_allowed; - cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); - cpumask_copy(cs->effective_cpus, parent->cpus_allowed); - spin_unlock_irq(&callback_lock); -out_unlock: cpuset_full_unlock(); return 0; } @@ -3876,7 +3617,7 @@ int __init cpuset_init(void) cpumask_setall(top_cpuset.exclusive_cpus); nodes_setall(top_cpuset.effective_mems); - fmeter_init(&top_cpuset.fmeter); + cpuset1_init(&top_cpuset); BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); @@ -4210,7 +3951,7 @@ static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask */ void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) { - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); __cpuset_cpus_allowed_locked(tsk, pmask); } diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 81ea38dd6f9d..a5490097fe52 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -230,7 +230,7 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v) } static void cgroup_masks_read_one(struct seq_file *seq, const char *name, - u16 mask) + u32 mask) { struct cgroup_subsys *ss; int ssid; |
