author     Linus Torvalds <torvalds@linux-foundation.org>  2026-02-11 13:20:50 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>  2026-02-11 13:20:50 -0800
commit     ff661eeee26038f15ed9dd33c91809632e11d9eb
tree       6ad1a8e8a47b929ac3e0c563a8b85e7a53363403 /kernel
parent     9bdc64892dcce732d55b2c07d80b36a6c3e1b5f4
parent     8b1f3c54f930c3aeda0b5bad97bc317fc80267fd
Merge tag 'cgroup-for-6.20' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:

 - cpuset changes:

     - Continue separating v1 and v2 implementations by moving more
       v1-specific logic into cpuset-v1.c

     - Improve partition handling. Sibling partitions are no longer
       invalidated on cpuset.cpus conflict, cpuset.cpus changes no
       longer fail in v2, and effective_xcpus computation is made
       consistent

     - Fix partition effective CPUs overlap that caused a warning on
       cpuset removal when sibling partitions shared CPUs

 - Increase the maximum cgroup subsystem count from 16 to 32 to
   accommodate future subsystem additions

 - Misc cleanups and selftest improvements including switching to
   css_is_online() helper, removing dead code and stale documentation
   references, using lockdep_assert_cpuset_lock_held() consistently,
   and adding polling helpers for asynchronously updated cgroup
   statistics

* tag 'cgroup-for-6.20' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (21 commits)
  cpuset: fix overlap of partition effective CPUs
  cgroup: increase maximum subsystem count from 16 to 32
  cgroup: Remove stale cpu.rt.max reference from documentation
  cpuset: replace direct lockdep_assert_held() with lockdep_assert_cpuset_lock_held()
  cgroup/cpuset: Move the v1 empty cpus/mems check to cpuset1_validate_change()
  cgroup/cpuset: Don't invalidate sibling partitions on cpuset.cpus conflict
  cgroup/cpuset: Don't fail cpuset.cpus change in v2
  cgroup/cpuset: Consistently compute effective_xcpus in update_cpumasks_hier()
  cgroup/cpuset: Streamline rm_siblings_excl_cpus()
  cpuset: remove dead code in cpuset-v1.c
  cpuset: remove v1-specific code from generate_sched_domains
  cpuset: separate generate_sched_domains for v1 and v2
  cpuset: move update_domain_attr_tree to cpuset_v1.c
  cpuset: add cpuset1_init helper for v1 initialization
  cpuset: add cpuset1_online_css helper for v1-specific operations
  cpuset: add lockdep_assert_cpuset_lock_held helper
  cpuset: Remove unnecessary checks in rebuild_sched_domains_locked
  cgroup: switch to css_is_online() helper
  selftests: cgroup: Replace sleep with cg_read_key_long_poll() for waiting on nr_dying_descendants
  selftests: cgroup: make test_memcg_sock robust against delayed sock stats
  ...
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup/cgroup-internal.h |   8
-rw-r--r--  kernel/cgroup/cgroup-v1.c       |  12
-rw-r--r--  kernel/cgroup/cgroup.c          |  50
-rw-r--r--  kernel/cgroup/cpuset-internal.h |  54
-rw-r--r--  kernel/cgroup/cpuset-v1.c       | 271
-rw-r--r--  kernel/cgroup/cpuset.c          | 499
-rw-r--r--  kernel/cgroup/debug.c           |   2
7 files changed, 473 insertions, 423 deletions
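
Before the per-file diffs, a minimal standalone sketch (not part of the patch; plain user-space C with an assumed controller count) of why the subsystem-selection masks below are widened from u16 to u32: a 16-bit mask variable simply cannot hold a bit for a controller id of 16 or above, which is what previously capped CGROUP_SUBSYS_COUNT at 16.

/*
 * Illustrative only -- not kernel code. HYPOTHETICAL_SUBSYS_COUNT is an
 * assumed value chosen to be larger than 16.
 */
#include <stdint.h>
#include <stdio.h>

#define HYPOTHETICAL_SUBSYS_COUNT 20

int main(void)
{
	uint32_t subsys_mask = 0;	/* was effectively a u16 before this merge */
	int ssid;

	/* select every controller by id, as a mount request might */
	for (ssid = 0; ssid < HYPOTHETICAL_SUBSYS_COUNT; ssid++)
		subsys_mask |= (uint32_t)1 << ssid;	/* bits >= 16 would not fit in a u16 mask */

	/* walk the selected controllers back out of the mask */
	for (ssid = 0; ssid < HYPOTHETICAL_SUBSYS_COUNT; ssid++)
		if (subsys_mask & ((uint32_t)1 << ssid))
			printf("subsystem %d selected\n", ssid);

	return 0;
}

The BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 32) change in cgroup.c below enforces the new ceiling at compile time.
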
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 22051b4f1ccb..3bfe37693d68 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -52,7 +52,7 @@ struct cgroup_fs_context {
bool cpuset_clone_children;
bool none; /* User explicitly requested empty subsystem */
bool all_ss; /* Seen 'all' option */
- u16 subsys_mask; /* Selected subsystems */
+ u32 subsys_mask; /* Selected subsystems */
char *name; /* Hierarchy name */
char *release_agent; /* Path for release notifications */
};
@@ -146,7 +146,7 @@ struct cgroup_mgctx {
struct cgroup_taskset tset;
/* subsystems affected by migration */
- u16 ss_mask;
+ u32 ss_mask;
};
#define CGROUP_TASKSET_INIT(tset) \
@@ -235,8 +235,8 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
void cgroup_favor_dynmods(struct cgroup_root *root, bool favor);
void cgroup_free_root(struct cgroup_root *root);
void init_cgroup_root(struct cgroup_fs_context *ctx);
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
-int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
+int cgroup_setup_root(struct cgroup_root *root, u32 ss_mask);
+int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask);
int cgroup_do_get_tree(struct fs_context *fc);
int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index a9e029b570c8..724950c4b690 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -28,7 +28,7 @@
#define CGROUP_PIDLIST_DESTROY_DELAY HZ
/* Controllers blocked by the commandline in v1 */
-static u16 cgroup_no_v1_mask;
+static u32 cgroup_no_v1_mask;
/* disable named v1 mounts */
static bool cgroup_no_v1_named;
@@ -1037,13 +1037,13 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
static int check_cgroupfs_options(struct fs_context *fc)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
- u16 mask = U16_MAX;
- u16 enabled = 0;
+ u32 mask = U32_MAX;
+ u32 enabled = 0;
struct cgroup_subsys *ss;
int i;
#ifdef CONFIG_CPUSETS
- mask = ~((u16)1 << cpuset_cgrp_id);
+ mask = ~((u32)1 << cpuset_cgrp_id);
#endif
for_each_subsys(ss, i)
if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i) &&
@@ -1095,7 +1095,7 @@ int cgroup1_reconfigure(struct fs_context *fc)
struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb);
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
int ret = 0;
- u16 added_mask, removed_mask;
+ u32 added_mask, removed_mask;
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
@@ -1343,7 +1343,7 @@ static int __init cgroup_no_v1(char *str)
continue;
if (!strcmp(token, "all")) {
- cgroup_no_v1_mask = U16_MAX;
+ cgroup_no_v1_mask = U32_MAX;
continue;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 5f0d33b04910..8af4351536cf 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -203,13 +203,13 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
bool cgrp_dfl_visible;
/* some controllers are not supported in the default hierarchy */
-static u16 cgrp_dfl_inhibit_ss_mask;
+static u32 cgrp_dfl_inhibit_ss_mask;
/* some controllers are implicitly enabled on the default hierarchy */
-static u16 cgrp_dfl_implicit_ss_mask;
+static u32 cgrp_dfl_implicit_ss_mask;
/* some controllers can be threaded on the default hierarchy */
-static u16 cgrp_dfl_threaded_ss_mask;
+static u32 cgrp_dfl_threaded_ss_mask;
/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
@@ -231,10 +231,10 @@ static u64 css_serial_nr_next = 1;
* These bitmasks identify subsystems with specific features to avoid
* having to do iterative checks repeatedly.
*/
-static u16 have_fork_callback __read_mostly;
-static u16 have_exit_callback __read_mostly;
-static u16 have_release_callback __read_mostly;
-static u16 have_canfork_callback __read_mostly;
+static u32 have_fork_callback __read_mostly;
+static u32 have_exit_callback __read_mostly;
+static u32 have_release_callback __read_mostly;
+static u32 have_canfork_callback __read_mostly;
static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);
@@ -472,13 +472,13 @@ static bool cgroup_is_valid_domain(struct cgroup *cgrp)
}
/* subsystems visibly enabled on a cgroup */
-static u16 cgroup_control(struct cgroup *cgrp)
+static u32 cgroup_control(struct cgroup *cgrp)
{
struct cgroup *parent = cgroup_parent(cgrp);
- u16 root_ss_mask = cgrp->root->subsys_mask;
+ u32 root_ss_mask = cgrp->root->subsys_mask;
if (parent) {
- u16 ss_mask = parent->subtree_control;
+ u32 ss_mask = parent->subtree_control;
/* threaded cgroups can only have threaded controllers */
if (cgroup_is_threaded(cgrp))
@@ -493,12 +493,12 @@ static u16 cgroup_control(struct cgroup *cgrp)
}
/* subsystems enabled on a cgroup */
-static u16 cgroup_ss_mask(struct cgroup *cgrp)
+static u32 cgroup_ss_mask(struct cgroup *cgrp)
{
struct cgroup *parent = cgroup_parent(cgrp);
if (parent) {
- u16 ss_mask = parent->subtree_ss_mask;
+ u32 ss_mask = parent->subtree_ss_mask;
/* threaded cgroups can only have threaded controllers */
if (cgroup_is_threaded(cgrp))
@@ -1633,9 +1633,9 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
* This function calculates which subsystems need to be enabled if
* @subtree_control is to be applied while restricted to @this_ss_mask.
*/
-static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
+static u32 cgroup_calc_subtree_ss_mask(u32 subtree_control, u32 this_ss_mask)
{
- u16 cur_ss_mask = subtree_control;
+ u32 cur_ss_mask = subtree_control;
struct cgroup_subsys *ss;
int ssid;
@@ -1644,7 +1644,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
cur_ss_mask |= cgrp_dfl_implicit_ss_mask;
while (true) {
- u16 new_ss_mask = cur_ss_mask;
+ u32 new_ss_mask = cur_ss_mask;
do_each_subsys_mask(ss, ssid, cur_ss_mask) {
new_ss_mask |= ss->depends_on;
@@ -1848,12 +1848,12 @@ err:
return ret;
}
-int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
+int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask)
{
struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
int ssid, ret;
- u16 dfl_disable_ss_mask = 0;
+ u32 dfl_disable_ss_mask = 0;
lockdep_assert_held(&cgroup_mutex);
@@ -2149,7 +2149,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
+int cgroup_setup_root(struct cgroup_root *root, u32 ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
@@ -3131,7 +3131,7 @@ void cgroup_procs_write_finish(struct task_struct *task,
put_task_struct(task);
}
-static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
+static void cgroup_print_ss_mask(struct seq_file *seq, u32 ss_mask)
{
struct cgroup_subsys *ss;
bool printed = false;
@@ -3496,9 +3496,9 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
cgroup_apply_control_disable(cgrp);
}
-static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
+static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u32 enable)
{
- u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
+ u32 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
/* if nothing is getting enabled, nothing to worry about */
if (!enable)
@@ -3541,7 +3541,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- u16 enable = 0, disable = 0;
+ u32 enable = 0, disable = 0;
struct cgroup *cgrp, *child;
struct cgroup_subsys *ss;
char *tok;
@@ -4945,7 +4945,7 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
rcu_read_lock();
css_for_each_child(child, css) {
- if (child->flags & CSS_ONLINE) {
+ if (css_is_online(child)) {
ret = true;
break;
}
@@ -5750,7 +5750,7 @@ static void offline_css(struct cgroup_subsys_state *css)
lockdep_assert_held(&cgroup_mutex);
- if (!(css->flags & CSS_ONLINE))
+ if (!css_is_online(css))
return;
if (ss->css_offline)
@@ -6347,7 +6347,7 @@ int __init cgroup_init(void)
struct cgroup_subsys *ss;
int ssid;
- BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
+ BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 32);
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index 01976c8e7d49..fd7d19842ded 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -9,6 +9,7 @@
#include <linux/cpuset.h>
#include <linux/spinlock.h>
#include <linux/union_find.h>
+#include <linux/sched/isolation.h>
/* See "Frequency meter" comments, below. */
@@ -144,17 +145,12 @@ struct cpuset {
*/
nodemask_t old_mems_allowed;
- struct fmeter fmeter; /* memory_pressure filter */
-
/*
* Tasks are being attached to this cpuset. Used to prevent
* zeroing cpus/mems_allowed between ->can_attach() and ->attach().
*/
int attach_in_progress;
- /* for custom sched domain */
- int relax_domain_level;
-
/* partition root state */
int partition_root_state;
@@ -179,10 +175,19 @@ struct cpuset {
/* Handle for cpuset.cpus.partition */
struct cgroup_file partition_file;
+#ifdef CONFIG_CPUSETS_V1
+ struct fmeter fmeter; /* memory_pressure filter */
+
+ /* for custom sched domain */
+ int relax_domain_level;
+
/* Used to merge intersecting subsets for generate_sched_domains */
struct uf_node node;
+#endif
};
+extern struct cpuset top_cpuset;
+
static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
return css ? container_of(css, struct cpuset, css) : NULL;
@@ -240,6 +245,30 @@ static inline int is_spread_slab(const struct cpuset *cs)
return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
+/*
+ * Helper routine for generate_sched_domains().
+ * Do cpusets a, b have overlapping effective cpus_allowed masks?
+ */
+static inline int cpusets_overlap(struct cpuset *a, struct cpuset *b)
+{
+ return cpumask_intersects(a->effective_cpus, b->effective_cpus);
+}
+
+static inline int nr_cpusets(void)
+{
+ /* jump label reference count + the top-level cpuset */
+ return static_key_count(&cpusets_enabled_key.key) + 1;
+}
+
+static inline bool cpuset_is_populated(struct cpuset *cs)
+{
+ lockdep_assert_cpuset_lock_held();
+
+ /* Cpusets in the process of attaching should be considered as populated */
+ return cgroup_is_populated(cs->css.cgroup) ||
+ cs->attach_in_progress;
+}
+
/**
* cpuset_for_each_child - traverse online children of a cpuset
* @child_cs: loop cursor pointing to the current child
@@ -285,7 +314,6 @@ void cpuset_full_unlock(void);
*/
#ifdef CONFIG_CPUSETS_V1
extern struct cftype cpuset1_files[];
-void fmeter_init(struct fmeter *fmp);
void cpuset1_update_task_spread_flags(struct cpuset *cs,
struct task_struct *tsk);
void cpuset1_update_tasks_flags(struct cpuset *cs);
@@ -293,8 +321,13 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs,
struct cpumask *new_cpus, nodemask_t *new_mems,
bool cpus_updated, bool mems_updated);
int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial);
+bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2);
+void cpuset1_init(struct cpuset *cs);
+void cpuset1_online_css(struct cgroup_subsys_state *css);
+int cpuset1_generate_sched_domains(cpumask_var_t **domains,
+ struct sched_domain_attr **attributes);
+
#else
-static inline void fmeter_init(struct fmeter *fmp) {}
static inline void cpuset1_update_task_spread_flags(struct cpuset *cs,
struct task_struct *tsk) {}
static inline void cpuset1_update_tasks_flags(struct cpuset *cs) {}
@@ -303,6 +336,13 @@ static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs,
bool cpus_updated, bool mems_updated) {}
static inline int cpuset1_validate_change(struct cpuset *cur,
struct cpuset *trial) { return 0; }
+static inline bool cpuset1_cpus_excl_conflict(struct cpuset *cs1,
+ struct cpuset *cs2) { return false; }
+static inline void cpuset1_init(struct cpuset *cs) {}
+static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {}
+static inline int cpuset1_generate_sched_domains(cpumask_var_t **domains,
+ struct sched_domain_attr **attributes) { return 0; };
+
#endif /* CONFIG_CPUSETS_V1 */
#endif /* __CPUSET_INTERNAL_H */
diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
index 12e76774c75b..7a23b9e8778f 100644
--- a/kernel/cgroup/cpuset-v1.c
+++ b/kernel/cgroup/cpuset-v1.c
@@ -62,7 +62,7 @@ struct cpuset_remove_tasks_struct {
#define FM_SCALE 1000 /* faux fixed point scale */
/* Initialize a frequency meter */
-void fmeter_init(struct fmeter *fmp)
+static void fmeter_init(struct fmeter *fmp)
{
fmp->cnt = 0;
fmp->val = 0;
@@ -368,11 +368,44 @@ int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial)
if (par && !is_cpuset_subset(trial, par))
goto out;
+ /*
+ * Cpusets with tasks - existing or newly being attached - can't
+ * be changed to have empty cpus_allowed or mems_allowed.
+ */
+ ret = -ENOSPC;
+ if (cpuset_is_populated(cur)) {
+ if (!cpumask_empty(cur->cpus_allowed) &&
+ cpumask_empty(trial->cpus_allowed))
+ goto out;
+ if (!nodes_empty(cur->mems_allowed) &&
+ nodes_empty(trial->mems_allowed))
+ goto out;
+ }
+
ret = 0;
out:
return ret;
}
+/*
+ * cpuset1_cpus_excl_conflict() - Check if two cpusets have exclusive CPU conflicts
+ * to legacy (v1)
+ * @cs1: first cpuset to check
+ * @cs2: second cpuset to check
+ *
+ * Returns: true if CPU exclusivity conflict exists, false otherwise
+ *
+ * If either cpuset is CPU exclusive, their allowed CPUs cannot intersect.
+ */
+bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
+{
+ if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))
+ return cpumask_intersects(cs1->cpus_allowed,
+ cs2->cpus_allowed);
+
+ return false;
+}
+
#ifdef CONFIG_PROC_PID_CPUSET
/*
* proc_cpuset_show()
@@ -499,6 +532,242 @@ out_unlock:
return retval;
}
+void cpuset1_init(struct cpuset *cs)
+{
+ fmeter_init(&cs->fmeter);
+ cs->relax_domain_level = -1;
+}
+
+void cpuset1_online_css(struct cgroup_subsys_state *css)
+{
+ struct cpuset *tmp_cs;
+ struct cgroup_subsys_state *pos_css;
+ struct cpuset *cs = css_cs(css);
+ struct cpuset *parent = parent_cs(cs);
+
+ lockdep_assert_cpus_held();
+ lockdep_assert_cpuset_lock_held();
+
+ if (is_spread_page(parent))
+ set_bit(CS_SPREAD_PAGE, &cs->flags);
+ if (is_spread_slab(parent))
+ set_bit(CS_SPREAD_SLAB, &cs->flags);
+
+ if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
+ return;
+
+ /*
+ * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
+ * set. This flag handling is implemented in cgroup core for
+ * historical reasons - the flag may be specified during mount.
+ *
+ * Currently, if any sibling cpusets have exclusive cpus or mem, we
+ * refuse to clone the configuration - thereby refusing the task to
+ * be entered, and as a result refusing the sys_unshare() or
+ * clone() which initiated it. If this becomes a problem for some
+ * users who wish to allow that scenario, then this could be
+ * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
+ * (and likewise for mems) to the new cgroup.
+ */
+ rcu_read_lock();
+ cpuset_for_each_child(tmp_cs, pos_css, parent) {
+ if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ rcu_read_unlock();
+
+ cpuset_callback_lock_irq();
+ cs->mems_allowed = parent->mems_allowed;
+ cs->effective_mems = parent->mems_allowed;
+ cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
+ cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
+ cpuset_callback_unlock_irq();
+}
+
+static void
+update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
+{
+ if (dattr->relax_domain_level < c->relax_domain_level)
+ dattr->relax_domain_level = c->relax_domain_level;
+}
+
+static void update_domain_attr_tree(struct sched_domain_attr *dattr,
+ struct cpuset *root_cs)
+{
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
+ /* skip the whole subtree if @cp doesn't have any CPU */
+ if (cpumask_empty(cp->cpus_allowed)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+
+ if (is_sched_load_balance(cp))
+ update_domain_attr(dattr, cp);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * cpuset1_generate_sched_domains()
+ *
+ * Finding the best partition (set of domains):
+ * The double nested loops below over i, j scan over the load
+ * balanced cpusets (using the array of cpuset pointers in csa[])
+ * looking for pairs of cpusets that have overlapping cpus_allowed
+ * and merging them using a union-find algorithm.
+ *
+ * The union of the cpus_allowed masks from the set of all cpusets
+ * having the same root then form the one element of the partition
+ * (one sched domain) to be passed to partition_sched_domains().
+ */
+int cpuset1_generate_sched_domains(cpumask_var_t **domains,
+ struct sched_domain_attr **attributes)
+{
+ struct cpuset *cp; /* top-down scan of cpusets */
+ struct cpuset **csa; /* array of all cpuset ptrs */
+ int csn; /* how many cpuset ptrs in csa so far */
+ int i, j; /* indices for partition finding loops */
+ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
+ struct sched_domain_attr *dattr; /* attributes for custom domains */
+ int ndoms = 0; /* number of sched domains in result */
+ int nslot; /* next empty doms[] struct cpumask slot */
+ struct cgroup_subsys_state *pos_css;
+ int nslot_update;
+
+ lockdep_assert_cpuset_lock_held();
+
+ doms = NULL;
+ dattr = NULL;
+ csa = NULL;
+
+ /* Special case for the 99% of systems with one, full, sched domain */
+ if (is_sched_load_balance(&top_cpuset)) {
+ ndoms = 1;
+ doms = alloc_sched_domains(ndoms);
+ if (!doms)
+ goto done;
+
+ dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
+ if (dattr) {
+ *dattr = SD_ATTR_INIT;
+ update_domain_attr_tree(dattr, &top_cpuset);
+ }
+ cpumask_and(doms[0], top_cpuset.effective_cpus,
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
+
+ goto done;
+ }
+
+ csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
+ if (!csa)
+ goto done;
+ csn = 0;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
+ if (cp == &top_cpuset)
+ continue;
+
+ /*
+ * Continue traversing beyond @cp iff @cp has some CPUs and
+ * isn't load balancing. The former is obvious. The
+ * latter: All child cpusets contain a subset of the
+ * parent's cpus, so just skip them, and then we call
+ * update_domain_attr_tree() to calc relax_domain_level of
+ * the corresponding sched domain.
+ */
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ !(is_sched_load_balance(cp) &&
+ cpumask_intersects(cp->cpus_allowed,
+ housekeeping_cpumask(HK_TYPE_DOMAIN))))
+ continue;
+
+ if (is_sched_load_balance(cp) &&
+ !cpumask_empty(cp->effective_cpus))
+ csa[csn++] = cp;
+
+ /* skip @cp's subtree */
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+ rcu_read_unlock();
+
+ for (i = 0; i < csn; i++)
+ uf_node_init(&csa[i]->node);
+
+ /* Merge overlapping cpusets */
+ for (i = 0; i < csn; i++) {
+ for (j = i + 1; j < csn; j++) {
+ if (cpusets_overlap(csa[i], csa[j]))
+ uf_union(&csa[i]->node, &csa[j]->node);
+ }
+ }
+
+ /* Count the total number of domains */
+ for (i = 0; i < csn; i++) {
+ if (uf_find(&csa[i]->node) == &csa[i]->node)
+ ndoms++;
+ }
+
+ /*
+ * Now we know how many domains to create.
+ * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
+ */
+ doms = alloc_sched_domains(ndoms);
+ if (!doms)
+ goto done;
+
+ /*
+ * The rest of the code, including the scheduler, can deal with
+ * dattr==NULL case. No need to abort if alloc fails.
+ */
+ dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
+ GFP_KERNEL);
+
+ for (nslot = 0, i = 0; i < csn; i++) {
+ nslot_update = 0;
+ for (j = i; j < csn; j++) {
+ if (uf_find(&csa[j]->node) == &csa[i]->node) {
+ struct cpumask *dp = doms[nslot];
+
+ if (i == j) {
+ nslot_update = 1;
+ cpumask_clear(dp);
+ if (dattr)
+ *(dattr + nslot) = SD_ATTR_INIT;
+ }
+ cpumask_or(dp, dp, csa[j]->effective_cpus);
+ cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
+ if (dattr)
+ update_domain_attr_tree(dattr + nslot, csa[j]);
+ }
+ }
+ if (nslot_update)
+ nslot++;
+ }
+ BUG_ON(nslot != ndoms);
+
+done:
+ kfree(csa);
+
+ /*
+ * Fallback to the default domain if kmalloc() failed.
+ * See comments in partition_sched_domains().
+ */
+ if (doms == NULL)
+ ndoms = 1;
+
+ *domains = doms;
+ *attributes = dattr;
+ return ndoms;
+}
+
/*
* for the common functions, 'private' gives the type of file
*/
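
As an aside, here is a minimal user-space sketch (not kernel code; toy uint64_t bitmaps stand in for struct cpumask, and the set contents are invented) of the union-find merge that cpuset1_generate_sched_domains() above keeps for v1: cpusets with overlapping effective CPUs are merged, and each resulting union-find root becomes one sched domain.

/* Illustrative only -- mirrors the v1 algorithm's shape, nothing more. */
#include <stdint.h>
#include <stdio.h>

#define NSETS 4		/* number of load-balanced cpusets in this example */

static int parent[NSETS];

static int uf_find(int x)
{
	while (parent[x] != x)
		x = parent[x];
	return x;
}

static void uf_union(int a, int b)
{
	parent[uf_find(a)] = uf_find(b);
}

int main(void)
{
	/* invented effective CPU masks of four load-balanced cpusets */
	uint64_t cpus[NSETS] = { 0x0f, 0x18, 0xf0, 0x300 };
	uint64_t doms[NSETS] = { 0 };
	int i, j, ndoms = 0;

	for (i = 0; i < NSETS; i++)
		parent[i] = i;

	/* merge cpusets whose effective CPUs intersect */
	for (i = 0; i < NSETS; i++)
		for (j = i + 1; j < NSETS; j++)
			if (cpus[i] & cpus[j])
				uf_union(i, j);

	/* each union-find root is one sched domain: OR in the member masks */
	for (i = 0; i < NSETS; i++)
		doms[uf_find(i)] |= cpus[i];

	for (i = 0; i < NSETS; i++)
		if (uf_find(i) == i)
			printf("domain %d: cpus 0x%llx\n", ndoms++,
			       (unsigned long long)doms[i]);

	return 0;
}

The real code additionally restricts every domain to the HK_TYPE_DOMAIN housekeeping CPUs and fills in per-domain sched_domain_attr values, which the sketch omits.
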
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 01a553caee56..c43efef7df71 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -119,6 +119,17 @@ static bool force_sd_rebuild;
* For simplicity, a local partition can be created under a local or remote
* partition but a remote partition cannot have any partition root in its
* ancestor chain except the cgroup root.
+ *
+ * A valid partition can be formed by setting exclusive_cpus or cpus_allowed
+ * if exclusive_cpus is not set. In the case of partition with empty
+ * exclusive_cpus, all the conflicting exclusive CPUs specified in the
+ * following cpumasks of sibling cpusets will be removed from its
+ * cpus_allowed in determining its effective_xcpus.
+ * - effective_xcpus
+ * - exclusive_cpus
+ *
+ * The "cpuset.cpus.exclusive" control file should be used for setting up
+ * partition if the users want to get as many CPUs as possible.
*/
#define PRS_MEMBER 0
#define PRS_ROOT 1
@@ -201,12 +212,10 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs)
* If cpu_online_mask is used while a hotunplug operation is happening in
* parallel, we may leave an offline CPU in cpu_allowed or some other masks.
*/
-static struct cpuset top_cpuset = {
+struct cpuset top_cpuset = {
.flags = BIT(CS_CPU_EXCLUSIVE) |
BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
.partition_root_state = PRS_ROOT,
- .relax_domain_level = -1,
- .remote_partition = false,
};
/*
@@ -261,6 +270,11 @@ void cpuset_unlock(void)
mutex_unlock(&cpuset_mutex);
}
+void lockdep_assert_cpuset_lock_held(void)
+{
+ lockdep_assert_held(&cpuset_mutex);
+}
+
/**
* cpuset_full_lock - Acquire full protection for cpuset modification
*
@@ -319,7 +333,7 @@ static inline void check_insane_mems_config(nodemask_t *nodes)
*/
static inline void dec_attach_in_progress_locked(struct cpuset *cs)
{
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
cs->attach_in_progress--;
if (!cs->attach_in_progress)
@@ -353,15 +367,6 @@ static inline bool is_in_v2_mode(void)
(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}
-static inline bool cpuset_is_populated(struct cpuset *cs)
-{
- lockdep_assert_held(&cpuset_mutex);
-
- /* Cpusets in the process of attaching should be considered as populated */
- return cgroup_is_populated(cs->css.cgroup) ||
- cs->attach_in_progress;
-}
-
/**
* partition_is_populated - check if partition has tasks
* @cs: partition root to be checked
@@ -603,36 +608,32 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
/**
* cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts
- * @cs1: first cpuset to check
- * @cs2: second cpuset to check
+ * @trial: the trial cpuset to be checked
+ * @sibling: a sibling cpuset to be checked against
+ * @xcpus_changed: set if exclusive_cpus has been set
*
* Returns: true if CPU exclusivity conflict exists, false otherwise
*
* Conflict detection rules:
- * 1. If either cpuset is CPU exclusive, they must be mutually exclusive
- * 2. exclusive_cpus masks cannot intersect between cpusets
- * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs
+ * o cgroup v1
+ * See cpuset1_cpus_excl_conflict()
+ * o cgroup v2
+ * - The exclusive_cpus values cannot overlap.
+ * - New exclusive_cpus cannot be a superset of a sibling's cpus_allowed.
*/
-static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
+static inline bool cpus_excl_conflict(struct cpuset *trial, struct cpuset *sibling,
+ bool xcpus_changed)
{
- /* If either cpuset is exclusive, check if they are mutually exclusive */
- if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))
- return !cpusets_are_exclusive(cs1, cs2);
-
- /* Exclusive_cpus cannot intersect */
- if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus))
- return true;
-
- /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */
- if (!cpumask_empty(cs1->cpus_allowed) &&
- cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus))
- return true;
+ if (!cpuset_v2())
+ return cpuset1_cpus_excl_conflict(trial, sibling);
- if (!cpumask_empty(cs2->cpus_allowed) &&
- cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus))
+ /* The cpus_allowed of a sibling cpuset cannot be a subset of the new exclusive_cpus */
+ if (xcpus_changed && !cpumask_empty(sibling->cpus_allowed) &&
+ cpumask_subset(sibling->cpus_allowed, trial->exclusive_cpus))
return true;
- return false;
+ /* Exclusive_cpus cannot intersect */
+ return cpumask_intersects(trial->exclusive_cpus, sibling->exclusive_cpus);
}
static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
@@ -666,6 +667,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
struct cgroup_subsys_state *css;
struct cpuset *c, *par;
+ bool xcpus_changed;
int ret = 0;
rcu_read_lock();
@@ -682,20 +684,6 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
par = parent_cs(cur);
/*
- * Cpusets with tasks - existing or newly being attached - can't
- * be changed to have empty cpus_allowed or mems_allowed.
- */
- ret = -ENOSPC;
- if (cpuset_is_populated(cur)) {
- if (!cpumask_empty(cur->cpus_allowed) &&
- cpumask_empty(trial->cpus_allowed))
- goto out;
- if (!nodes_empty(cur->mems_allowed) &&
- nodes_empty(trial->mems_allowed))
- goto out;
- }
-
- /*
* We can't shrink if we won't have enough room for SCHED_DEADLINE
* tasks. This check is not done when scheduling is disabled as the
* users should know what they are doing.
@@ -722,10 +710,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
* overlap. exclusive_cpus cannot overlap with each other if set.
*/
ret = -EINVAL;
+ xcpus_changed = !cpumask_equal(cur->exclusive_cpus, trial->exclusive_cpus);
cpuset_for_each_child(c, css, par) {
if (c == cur)
continue;
- if (cpus_excl_conflict(trial, c))
+ if (cpus_excl_conflict(trial, c, xcpus_changed))
goto out;
if (mems_excl_conflict(trial, c))
goto out;
@@ -738,49 +727,6 @@ out:
}
#ifdef CONFIG_SMP
-/*
- * Helper routine for generate_sched_domains().
- * Do cpusets a, b have overlapping effective cpus_allowed masks?
- */
-static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
-{
- return cpumask_intersects(a->effective_cpus, b->effective_cpus);
-}
-
-static void
-update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
-{
- if (dattr->relax_domain_level < c->relax_domain_level)
- dattr->relax_domain_level = c->relax_domain_level;
- return;
-}
-
-static void update_domain_attr_tree(struct sched_domain_attr *dattr,
- struct cpuset *root_cs)
-{
- struct cpuset *cp;
- struct cgroup_subsys_state *pos_css;
-
- rcu_read_lock();
- cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
- /* skip the whole subtree if @cp doesn't have any CPU */
- if (cpumask_empty(cp->cpus_allowed)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
-
- if (is_sched_load_balance(cp))
- update_domain_attr(dattr, cp);
- }
- rcu_read_unlock();
-}
-
-/* Must be called with cpuset_mutex held. */
-static inline int nr_cpusets(void)
-{
- /* jump label reference count + the top-level cpuset */
- return static_key_count(&cpusets_enabled_key.key) + 1;
-}
/*
* generate_sched_domains()
@@ -820,103 +766,46 @@ static inline int nr_cpusets(void)
* convenient format, that can be easily compared to the prior
* value to determine what partition elements (sched domains)
* were changed (added or removed.)
- *
- * Finding the best partition (set of domains):
- * The double nested loops below over i, j scan over the load
- * balanced cpusets (using the array of cpuset pointers in csa[])
- * looking for pairs of cpusets that have overlapping cpus_allowed
- * and merging them using a union-find algorithm.
- *
- * The union of the cpus_allowed masks from the set of all cpusets
- * having the same root then form the one element of the partition
- * (one sched domain) to be passed to partition_sched_domains().
- *
*/
static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes)
{
struct cpuset *cp; /* top-down scan of cpusets */
struct cpuset **csa; /* array of all cpuset ptrs */
- int csn; /* how many cpuset ptrs in csa so far */
int i, j; /* indices for partition finding loops */
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
- int nslot; /* next empty doms[] struct cpumask slot */
struct cgroup_subsys_state *pos_css;
- bool root_load_balance = is_sched_load_balance(&top_cpuset);
- bool cgrpv2 = cpuset_v2();
- int nslot_update;
+
+ if (!cpuset_v2())
+ return cpuset1_generate_sched_domains(domains, attributes);
doms = NULL;
dattr = NULL;
csa = NULL;
/* Special case for the 99% of systems with one, full, sched domain */
- if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
-single_root_domain:
+ if (cpumask_empty(subpartitions_cpus)) {
ndoms = 1;
- doms = alloc_sched_domains(ndoms);
- if (!doms)
- goto done;
-
- dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
- if (dattr) {
- *dattr = SD_ATTR_INIT;
- update_domain_attr_tree(dattr, &top_cpuset);
- }
- cpumask_and(doms[0], top_cpuset.effective_cpus,
- housekeeping_cpumask(HK_TYPE_DOMAIN));
-
- goto done;
+ /* !csa will be checked and can be correctly handled */
+ goto generate_doms;
}
csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
if (!csa)
goto done;
- csn = 0;
+ /* Find how many partitions and cache them to csa[] */
rcu_read_lock();
- if (root_load_balance)
- csa[csn++] = &top_cpuset;
cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
- if (cp == &top_cpuset)
- continue;
-
- if (cgrpv2)
- goto v2;
-
- /*
- * v1:
- * Continue traversing beyond @cp iff @cp has some CPUs and
- * isn't load balancing. The former is obvious. The
- * latter: All child cpusets contain a subset of the
- * parent's cpus, so just skip them, and then we call
- * update_domain_attr_tree() to calc relax_domain_level of
- * the corresponding sched domain.
- */
- if (!cpumask_empty(cp->cpus_allowed) &&
- !(is_sched_load_balance(cp) &&
- cpumask_intersects(cp->cpus_allowed,
- housekeeping_cpumask(HK_TYPE_DOMAIN))))
- continue;
-
- if (is_sched_load_balance(cp) &&
- !cpumask_empty(cp->effective_cpus))
- csa[csn++] = cp;
-
- /* skip @cp's subtree */
- pos_css = css_rightmost_descendant(pos_css);
- continue;
-
-v2:
/*
* Only valid partition roots that are not isolated and with
- * non-empty effective_cpus will be saved into csn[].
+ * non-empty effective_cpus will be saved into csa[].
*/
if ((cp->partition_root_state == PRS_ROOT) &&
!cpumask_empty(cp->effective_cpus))
- csa[csn++] = cp;
+ csa[ndoms++] = cp;
/*
* Skip @cp's subtree if not a partition root and has no
@@ -927,40 +816,18 @@ v2:
}
rcu_read_unlock();
- /*
- * If there are only isolated partitions underneath the cgroup root,
- * we can optimize out unneeded sched domains scanning.
- */
- if (root_load_balance && (csn == 1))
- goto single_root_domain;
-
- for (i = 0; i < csn; i++)
- uf_node_init(&csa[i]->node);
-
- /* Merge overlapping cpusets */
- for (i = 0; i < csn; i++) {
- for (j = i + 1; j < csn; j++) {
- if (cpusets_overlap(csa[i], csa[j])) {
+ for (i = 0; i < ndoms; i++) {
+ for (j = i + 1; j < ndoms; j++) {
+ if (cpusets_overlap(csa[i], csa[j]))
/*
* Cgroup v2 shouldn't pass down overlapping
* partition root cpusets.
*/
- WARN_ON_ONCE(cgrpv2);
- uf_union(&csa[i]->node, &csa[j]->node);
- }
+ WARN_ON_ONCE(1);
}
}
- /* Count the total number of domains */
- for (i = 0; i < csn; i++) {
- if (uf_find(&csa[i]->node) == &csa[i]->node)
- ndoms++;
- }
-
- /*
- * Now we know how many domains to create.
- * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
- */
+generate_doms:
doms = alloc_sched_domains(ndoms);
if (!doms)
goto done;
@@ -977,45 +844,19 @@ v2:
* to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
* subset of HK_TYPE_DOMAIN housekeeping CPUs.
*/
- if (cgrpv2) {
- for (i = 0; i < ndoms; i++) {
- /*
- * The top cpuset may contain some boot time isolated
- * CPUs that need to be excluded from the sched domain.
- */
- if (csa[i] == &top_cpuset)
- cpumask_and(doms[i], csa[i]->effective_cpus,
- housekeeping_cpumask(HK_TYPE_DOMAIN));
- else
- cpumask_copy(doms[i], csa[i]->effective_cpus);
- if (dattr)
- dattr[i] = SD_ATTR_INIT;
- }
- goto done;
- }
-
- for (nslot = 0, i = 0; i < csn; i++) {
- nslot_update = 0;
- for (j = i; j < csn; j++) {
- if (uf_find(&csa[j]->node) == &csa[i]->node) {
- struct cpumask *dp = doms[nslot];
-
- if (i == j) {
- nslot_update = 1;
- cpumask_clear(dp);
- if (dattr)
- *(dattr + nslot) = SD_ATTR_INIT;
- }
- cpumask_or(dp, dp, csa[j]->effective_cpus);
- cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
- if (dattr)
- update_domain_attr_tree(dattr + nslot, csa[j]);
- }
- }
- if (nslot_update)
- nslot++;
+ for (i = 0; i < ndoms; i++) {
+ /*
+ * The top cpuset may contain some boot time isolated
+ * CPUs that need to be excluded from the sched domain.
+ */
+ if (!csa || csa[i] == &top_cpuset)
+ cpumask_and(doms[i], top_cpuset.effective_cpus,
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
+ else
+ cpumask_copy(doms[i], csa[i]->effective_cpus);
+ if (dattr)
+ dattr[i] = SD_ATTR_INIT;
}
- BUG_ON(nslot != ndoms);
done:
kfree(csa);
@@ -1055,7 +896,7 @@ void dl_rebuild_rd_accounting(void)
int cpu;
u64 cookie = ++dl_cookie;
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
lockdep_assert_cpus_held();
lockdep_assert_held(&sched_domains_mutex);
@@ -1100,53 +941,33 @@ void dl_rebuild_rd_accounting(void)
*/
void rebuild_sched_domains_locked(void)
{
- struct cgroup_subsys_state *pos_css;
struct sched_domain_attr *attr;
cpumask_var_t *doms;
- struct cpuset *cs;
int ndoms;
+ int i;
lockdep_assert_cpus_held();
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
force_sd_rebuild = false;
- /*
- * If we have raced with CPU hotplug, return early to avoid
- * passing doms with offlined cpu to partition_sched_domains().
- * Anyways, cpuset_handle_hotplug() will rebuild sched domains.
- *
- * With no CPUs in any subpartitions, top_cpuset's effective CPUs
- * should be the same as the active CPUs, so checking only top_cpuset
- * is enough to detect racing CPU offlines.
- */
- if (cpumask_empty(subpartitions_cpus) &&
- !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
- return;
+ /* Generate domain masks and attrs */
+ ndoms = generate_sched_domains(&doms, &attr);
/*
- * With subpartition CPUs, however, the effective CPUs of a partition
- * root should be only a subset of the active CPUs. Since a CPU in any
- * partition root could be offlined, all must be checked.
- */
- if (!cpumask_empty(subpartitions_cpus)) {
- rcu_read_lock();
- cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
- if (!is_partition_valid(cs)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
- if (!cpumask_subset(cs->effective_cpus,
- cpu_active_mask)) {
- rcu_read_unlock();
- return;
- }
- }
- rcu_read_unlock();
+ * cpuset_hotplug_workfn is invoked synchronously now, thus this
+ * function should not race with CPU hotplug. And the effective CPUs
+ * must not include any offline CPUs. Passing an offline CPU in the
+ * doms to partition_sched_domains() will trigger a kernel panic.
+ *
+ * We perform a final check here: if the doms contains any
+ * offline CPUs, a warning is emitted and we return directly to
+ * prevent the panic.
+ */
+ for (i = 0; i < ndoms; ++i) {
+ if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask)))
+ return;
}
- /* Generate domain masks and attrs */
- ndoms = generate_sched_domains(&doms, &attr);
-
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
}
@@ -1501,23 +1322,29 @@ static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs,
int retval = 0;
if (cpumask_empty(excpus))
- return retval;
+ return 0;
/*
- * Exclude exclusive CPUs from siblings
+ * Remove exclusive CPUs from siblings
*/
rcu_read_lock();
cpuset_for_each_child(sibling, css, parent) {
+ struct cpumask *sibling_xcpus;
+
if (sibling == cs)
continue;
- if (cpumask_intersects(excpus, sibling->exclusive_cpus)) {
- cpumask_andnot(excpus, excpus, sibling->exclusive_cpus);
- retval++;
- continue;
- }
- if (cpumask_intersects(excpus, sibling->effective_xcpus)) {
- cpumask_andnot(excpus, excpus, sibling->effective_xcpus);
+ /*
+ * If exclusive_cpus is defined, effective_xcpus will always
+ * be a subset. Otherwise, effective_xcpus will only be set
+ * in a valid partition root.
+ */
+ sibling_xcpus = cpumask_empty(sibling->exclusive_cpus)
+ ? sibling->effective_xcpus
+ : sibling->exclusive_cpus;
+
+ if (cpumask_intersects(excpus, sibling_xcpus)) {
+ cpumask_andnot(excpus, excpus, sibling_xcpus);
retval++;
}
}
@@ -1806,7 +1633,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
int parent_prs = parent->partition_root_state;
bool nocpu;
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */
/*
@@ -2315,17 +2142,13 @@ get_css:
spin_lock_irq(&callback_lock);
cpumask_copy(cp->effective_cpus, tmp->new_cpus);
cp->partition_root_state = new_prs;
- if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs))
- compute_excpus(cp, cp->effective_xcpus);
-
/*
- * Make sure effective_xcpus is properly set for a valid
- * partition root.
+ * Need to compute effective_xcpus if either exclusive_cpus
+ * is non-empty or it is a valid partition root.
*/
- if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus))
- cpumask_and(cp->effective_xcpus,
- cp->cpus_allowed, parent->effective_xcpus);
- else if (new_prs < 0)
+ if ((new_prs > 0) || !cpumask_empty(cp->exclusive_cpus))
+ compute_excpus(cp, cp->effective_xcpus);
+ if (new_prs <= 0)
reset_partition_data(cp);
spin_unlock_irq(&callback_lock);
@@ -2378,7 +2201,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
struct cpuset *sibling;
struct cgroup_subsys_state *pos_css;
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
/*
* Check all its siblings and call update_cpumasks_hier()
@@ -2387,27 +2210,20 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
* It is possible a change in parent's effective_cpus
* due to a change in a child partition's effective_xcpus will impact
* its siblings even if they do not inherit parent's effective_cpus
- * directly.
+ * directly. It should not impact valid partition.
*
* The update_cpumasks_hier() function may sleep. So we have to
* release the RCU read lock before calling it.
*/
rcu_read_lock();
cpuset_for_each_child(sibling, pos_css, parent) {
- if (sibling == cs)
+ if (sibling == cs || is_partition_valid(sibling))
continue;
- if (!is_partition_valid(sibling)) {
- compute_effective_cpumask(tmp->new_cpus, sibling,
- parent);
- if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
- continue;
- } else if (is_remote_partition(sibling)) {
- /*
- * Change in a sibling cpuset won't affect a remote
- * partition root.
- */
+
+ compute_effective_cpumask(tmp->new_cpus, sibling,
+ parent);
+ if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
continue;
- }
if (!css_tryget_online(&sibling->css))
continue;
@@ -2463,43 +2279,6 @@ static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *tri
return PERR_NONE;
}
-static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs,
- struct tmpmasks *tmp)
-{
- int retval;
- struct cpuset *parent = parent_cs(cs);
-
- retval = validate_change(cs, trialcs);
-
- if ((retval == -EINVAL) && cpuset_v2()) {
- struct cgroup_subsys_state *css;
- struct cpuset *cp;
-
- /*
- * The -EINVAL error code indicates that partition sibling
- * CPU exclusivity rule has been violated. We still allow
- * the cpumask change to proceed while invalidating the
- * partition. However, any conflicting sibling partitions
- * have to be marked as invalid too.
- */
- trialcs->prs_err = PERR_NOTEXCL;
- rcu_read_lock();
- cpuset_for_each_child(cp, css, parent) {
- struct cpumask *xcpus = user_xcpus(trialcs);
-
- if (is_partition_valid(cp) &&
- cpumask_intersects(xcpus, cp->effective_xcpus)) {
- rcu_read_unlock();
- update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp);
- rcu_read_lock();
- }
- }
- rcu_read_unlock();
- retval = 0;
- }
- return retval;
-}
-
/**
* partition_cpus_change - Handle partition state changes due to CPU mask updates
* @cs: The target cpuset being modified
@@ -2559,15 +2338,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
return 0;
- if (alloc_tmpmasks(&tmp))
- return -ENOMEM;
-
compute_trialcs_excpus(trialcs, cs);
trialcs->prs_err = PERR_NONE;
- retval = cpus_allowed_validate_change(cs, trialcs, &tmp);
+ retval = validate_change(cs, trialcs);
if (retval < 0)
- goto out_free;
+ return retval;
+
+ if (alloc_tmpmasks(&tmp))
+ return -ENOMEM;
/*
* Check all the descendants in update_cpumasks_hier() if
@@ -2590,7 +2369,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
if (cs->partition_root_state)
update_partition_sd_lb(cs, old_prs);
-out_free:
+
free_tmpmasks(&tmp);
return retval;
}
@@ -3249,7 +3028,7 @@ static nodemask_t cpuset_attach_nodemask_to;
static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
{
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
if (cs != &top_cpuset)
guarantee_active_cpus(task, cpus_attach);
@@ -3605,8 +3384,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
return ERR_PTR(-ENOMEM);
__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
- fmeter_init(&cs->fmeter);
- cs->relax_domain_level = -1;
+ cpuset1_init(cs);
/* Set CS_MEMORY_MIGRATE for default hierarchy */
if (cpuset_v2())
@@ -3619,17 +3397,11 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
struct cpuset *parent = parent_cs(cs);
- struct cpuset *tmp_cs;
- struct cgroup_subsys_state *pos_css;
if (!parent)
return 0;
cpuset_full_lock();
- if (is_spread_page(parent))
- set_bit(CS_SPREAD_PAGE, &cs->flags);
- if (is_spread_slab(parent))
- set_bit(CS_SPREAD_SLAB, &cs->flags);
/*
* For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
*/
@@ -3644,39 +3416,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cs->effective_mems = parent->effective_mems;
}
spin_unlock_irq(&callback_lock);
+ cpuset1_online_css(css);
- if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
- goto out_unlock;
-
- /*
- * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
- * set. This flag handling is implemented in cgroup core for
- * historical reasons - the flag may be specified during mount.
- *
- * Currently, if any sibling cpusets have exclusive cpus or mem, we
- * refuse to clone the configuration - thereby refusing the task to
- * be entered, and as a result refusing the sys_unshare() or
- * clone() which initiated it. If this becomes a problem for some
- * users who wish to allow that scenario, then this could be
- * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
- * (and likewise for mems) to the new cgroup.
- */
- rcu_read_lock();
- cpuset_for_each_child(tmp_cs, pos_css, parent) {
- if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
- rcu_read_unlock();
- goto out_unlock;
- }
- }
- rcu_read_unlock();
-
- spin_lock_irq(&callback_lock);
- cs->mems_allowed = parent->mems_allowed;
- cs->effective_mems = parent->mems_allowed;
- cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
- cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
- spin_unlock_irq(&callback_lock);
-out_unlock:
cpuset_full_unlock();
return 0;
}
@@ -3876,7 +3617,7 @@ int __init cpuset_init(void)
cpumask_setall(top_cpuset.exclusive_cpus);
nodes_setall(top_cpuset.effective_mems);
- fmeter_init(&top_cpuset.fmeter);
+ cpuset1_init(&top_cpuset);
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
@@ -4210,7 +3951,7 @@ static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask
*/
void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
{
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
__cpuset_cpus_allowed_locked(tsk, pmask);
}
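
A minimal user-space sketch (not kernel code; uint64_t bitmaps instead of struct cpumask, all values invented) of the two v2 conflict rules that the reworked cpus_excl_conflict() above documents: sibling exclusive_cpus masks must not intersect, and a newly written exclusive_cpus must not be a superset of a sibling's cpus_allowed.

/* Illustrative only -- restates the documented v2 rules on toy bitmaps. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_cpuset {
	uint64_t cpus_allowed;
	uint64_t exclusive_cpus;
};

static bool toy_cpus_excl_conflict(const struct toy_cpuset *trial,
				   const struct toy_cpuset *sibling,
				   bool xcpus_changed)
{
	/* rule 2: sibling's cpus_allowed swallowed by the new exclusive_cpus */
	if (xcpus_changed && sibling->cpus_allowed &&
	    (sibling->cpus_allowed & ~trial->exclusive_cpus) == 0)
		return true;

	/* rule 1: exclusive_cpus masks must not intersect */
	return (trial->exclusive_cpus & sibling->exclusive_cpus) != 0;
}

int main(void)
{
	struct toy_cpuset sibling = { .cpus_allowed = 0x0c, .exclusive_cpus = 0x08 };
	struct toy_cpuset ok      = { .cpus_allowed = 0x03, .exclusive_cpus = 0x03 };
	struct toy_cpuset clash   = { .cpus_allowed = 0x0f, .exclusive_cpus = 0x0c };

	printf("ok vs sibling:    %d\n", toy_cpus_excl_conflict(&ok, &sibling, true));
	printf("clash vs sibling: %d\n", toy_cpus_excl_conflict(&clash, &sibling, true));
	return 0;
}

In v1, the same check falls back to cpuset1_cpus_excl_conflict(), which only compares cpus_allowed when either cpuset is marked CPU-exclusive.
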
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 81ea38dd6f9d..a5490097fe52 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -230,7 +230,7 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
}
static void cgroup_masks_read_one(struct seq_file *seq, const char *name,
- u16 mask)
+ u32 mask)
{
struct cgroup_subsys *ss;
int ssid;