From 857a2beb09ab83e9a8185821ae16db7dfbe8b837 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 14 Apr 2013 20:50:08 -0700 Subject: cgroup: implement task_cgroup_path_from_hierarchy() kdbus folks want a sane way to determine the cgroup path that a given task belongs to on a given hierarchy, which is a reasonable thing to expect from cgroup core. Implement task_cgroup_path_from_hierarchy(). v2: Dropped unnecessary NULL check on the return value of task_cgroup_from_root() as suggested by Li Zefan. Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman Acked-by: Li Zefan Cc: Kay Sievers Cc: Lennart Poettering Cc: Daniel Mack --- include/linux/cgroup.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5047355b9a0f..383c630f36f9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -542,6 +542,8 @@ int cgroup_is_removed(const struct cgroup *cgrp); bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); +int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id, + char *buf, size_t buflen); int cgroup_task_count(const struct cgroup *cgrp); -- cgit v1.2.3 From 23958e729e7029678e746bf8f4094c8863a79c3d Mon Sep 17 00:00:00 2001 From: Greg KH Date: Fri, 3 May 2013 16:26:59 -0700 Subject: cgroup.h: remove some functions that are now gone cgroup_lock() and cgroup_unlock() are no longer exported, so fix cgroup.h to not declare them if CONFIG_CGROUPS is not enabled. Signed-off-by: Greg Kroah-Hartman Acked-by: Li Zefan Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 383c630f36f9..4f6f5138c340 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -840,8 +840,6 @@ static inline void cgroup_fork(struct task_struct *p) {} static inline void cgroup_post_fork(struct task_struct *p) {} static inline void cgroup_exit(struct task_struct *p, int callbacks) {} -static inline void cgroup_lock(void) {} -static inline void cgroup_unlock(void) {} static inline int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { -- cgit v1.2.3 From bdc7119f1bdd0632d42f435941dc290216a436e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 May 2013 10:55:38 +0900 Subject: cgroup: make cgroup_is_removed() static cgroup_is_removed() no longer has external users and it shouldn't grow any - controllers should deal with cgroup_subsys_state on/offline state instead of cgroup removal state. Make it static. While at it, make it return bool.
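A minimal userspace sketch of the resulting shape - every name below is an illustrative stand-in, not the kernel's code; the real helper is in the diff that follows:

#include <stdbool.h>
#include <stdio.h>

/* stand-ins for the kernel structures; bit 0 plays the role of CGRP_REMOVED */
enum { MODEL_REMOVED_BIT = 0 };

struct model_cgroup {
	unsigned long flags;
};

/* file-local predicate: callers get a clean true/false answer and the
 * symbol can no longer be referenced from other files */
static inline bool model_is_removed(const struct model_cgroup *cgrp)
{
	return cgrp->flags & (1UL << MODEL_REMOVED_BIT);
}

int main(void)
{
	struct model_cgroup cgrp = { .flags = 1UL << MODEL_REMOVED_BIT };

	printf("removed: %d\n", model_is_removed(&cgrp));
	return 0;
}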
Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 1 - kernel/cgroup.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 1df5f699be61..8d9f3c911fca 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -538,7 +538,6 @@ static inline const char *cgroup_name(const struct cgroup *cgrp) int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); -int cgroup_is_removed(const struct cgroup *cgrp); bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a19419f4af1a..501974823b33 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -226,7 +226,7 @@ static int css_refcnt(struct cgroup_subsys_state *css) } /* convenient tests for these bits */ -inline int cgroup_is_removed(const struct cgroup *cgrp) +static inline bool cgroup_is_removed(const struct cgroup *cgrp) { return test_bit(CGRP_REMOVED, &cgrp->flags); } -- cgit v1.2.3 From 53fa5261747a90746531e8a1c81eeb78fedc2f71 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 May 2013 10:55:38 +0900 Subject: cgroup: add cgroup->serial_nr and implement cgroup_next_sibling() Currently, there's no easy way to find out the next sibling cgroup unless it's known that the current cgroup is accessed from the parent's children list in a single RCU critical section. This in turn forces all iterators to require the whole iteration to be enclosed in a single RCU critical section, which sometimes is too restrictive. This patch implements cgroup_next_sibling() which can reliably determine the next sibling regardless of the state of the current cgroup as long as it's accessible. It currently is impossible to determine the next sibling after dropping RCU read lock because the cgroup being iterated could be removed anytime and if RCU read lock is dropped, nothing guarantees its ->sibling.next pointer is accessible. A removed cgroup would continue to point to its next sibling for RCU accesses but stop receiving updates from the sibling. IOW, the next sibling could be removed and then complete its grace period while RCU read lock is dropped, making it unsafe to dereference ->sibling.next after dropping and re-acquiring RCU read lock. This can be solved by adding a way to traverse to the next sibling without dereferencing ->sibling.next. This patch adds a monotonically increasing cgroup serial number, cgroup->serial_nr, which guarantees that all cgroup->children lists are kept in increasing serial_nr order. A new function, cgroup_next_sibling(), is implemented, which, if CGRP_REMOVED is not set on the current cgroup, follows ->sibling.next; otherwise, traverses the parent's ->children list until it sees a sibling with higher ->serial_nr. This allows the function to always return the next sibling regardless of the state of the current cgroup without adding overhead in the fast path. Further patches will update the iterators to use cgroup_next_sibling() so that they allow dropping RCU read lock and blocking while iteration is in progress which in turn will be used to simplify controllers. v2: Typo fix as per Serge. Signed-off-by: Tejun Heo Acked-by: Serge E.
Hallyn --- include/linux/cgroup.h | 10 ++++++++ kernel/cgroup.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 8d9f3c911fca..ee041a01a67e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -188,6 +188,14 @@ struct cgroup { struct cgroup *parent; /* my parent */ struct dentry *dentry; /* cgroup fs entry, RCU protected */ + /* + * Monotonically increasing unique serial number which defines a + * uniform order among all cgroups. It's guaranteed that all + * ->children lists are in the ascending order of ->serial_nr. + * It's used to allow interrupting and resuming iterations. + */ + u64 serial_nr; + /* * This is a copy of dentry->d_name, and it's needed because * we can't use dentry->d_name in cgroup_path(). @@ -675,6 +683,8 @@ static inline struct cgroup* task_cgroup(struct task_struct *task, return task_subsys_state(task, subsys_id)->cgroup; } +struct cgroup *cgroup_next_sibling(struct cgroup *pos); + /** * cgroup_for_each_child - iterate through children of a cgroup * @pos: the cgroup * to use as the loop cursor diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 501974823b33..b87c7a5a5497 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2975,6 +2975,55 @@ static void cgroup_enable_task_cg_lists(void) write_unlock(&css_set_lock); } +/** + * cgroup_next_sibling - find the next sibling of a given cgroup + * @pos: the current cgroup + * + * This function returns the next sibling of @pos and should be called + * under RCU read lock. The only requirement is that @pos is accessible. + * The next sibling is guaranteed to be returned regardless of @pos's + * state. + */ +struct cgroup *cgroup_next_sibling(struct cgroup *pos) +{ + struct cgroup *next; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* + * @pos could already have been removed. Once a cgroup is removed, + * its ->sibling.next is no longer updated when its next sibling + * changes. As CGRP_REMOVED is set on removal which is fully + * serialized, if we see it unasserted, it's guaranteed that the + * next sibling hasn't finished its grace period even if it's + * already removed, and thus safe to dereference from this RCU + * critical section. If ->sibling.next is inaccessible, + * cgroup_is_removed() is guaranteed to be visible as %true here. + */ + if (likely(!cgroup_is_removed(pos))) { + next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); + if (&next->sibling != &pos->parent->children) + return next; + return NULL; + } + + /* + * Can't dereference the next pointer. Each cgroup is given a + * monotonically increasing unique serial number and always + * appended to the sibling list, so the next one can be found by + * walking the parent's children until we see a cgroup with higher + * serial number than @pos's. + * + * While this path can be slow, it's taken only when either the + * current cgroup is removed or iteration and removal race. 
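+ * For example, if @pos was removed after the caller dropped the RCU
+ * read lock, the walk below starts over from the head of the parent's
+ * ->children and returns the first remaining sibling whose serial
+ * number is higher than @pos's - the exact point where the
+ * interrupted iteration should resume.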
+ */ + list_for_each_entry_rcu(next, &pos->parent->children, sibling) + if (next->serial_nr > pos->serial_nr) + return next; + return NULL; +} +EXPORT_SYMBOL_GPL(cgroup_next_sibling); + /** * cgroup_next_descendant_pre - find the next descendant for pre-order walk * @pos: the current position (%NULL to initiate traversal) @@ -4137,6 +4186,7 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) static long cgroup_create(struct cgroup *parent, struct dentry *dentry, umode_t mode) { + static atomic64_t serial_nr_cursor = ATOMIC64_INIT(0); struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; @@ -4217,6 +4267,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_free_all; lockdep_assert_held(&dentry->d_inode->i_mutex); + /* + * Assign a monotonically increasing serial number. With the list + * appending below, it guarantees that sibling cgroups are always + * sorted in the ascending serial number order on the parent's + * ->children. + */ + cgrp->serial_nr = atomic64_inc_return(&serial_nr_cursor); + /* allocation complete, commit to creation */ list_add_tail(&cgrp->allcg_node, &root->allcg_list); list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); @@ -4304,6 +4362,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * removed. This makes future css_tryget() and child creation * attempts fail thus maintaining the removal conditions verified * above. + * + * Note that CGRP_REMOVED setting is depended upon by + * cgroup_next_sibling() to resume iteration after dropping RCU + * read lock. See cgroup_next_sibling() for details. */ for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; -- cgit v1.2.3 From 75501a6d59e989e5c286716e5b3b66ace4660e83 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 May 2013 10:55:38 +0900 Subject: cgroup: update iterators to use cgroup_next_sibling() This patch converts cgroup_for_each_child(), cgroup_next_descendant_pre/post() and thus cgroup_for_each_descendant_pre/post() to use cgroup_next_sibling() instead of manually dereferencing ->sibling.next. The only reason the iterators couldn't allow dropping RCU read lock while iteration is in progress was because they couldn't determine the next sibling safely once RCU read lock is dropped. Using cgroup_next_sibling() removes that problem and enables all iterators to allow dropping RCU read lock in the middle. Comments are updated accordingly. This makes the iterators easier to use and will simplify controllers. Note that @cgroup argument is renamed to @cgrp in cgroup_for_each_child() because it conflicts with "struct cgroup" used in the new macro body. Signed-off-by: Tejun Heo Acked-by: Serge E. Hallyn Reviewed-by: Michal Hocko --- include/linux/cgroup.h | 18 ++++++++++++++---- kernel/cgroup.c | 25 +++++++++++++++++++------ 2 files changed, 33 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ee041a01a67e..d0ad3794b947 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -688,9 +688,9 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos); /** * cgroup_for_each_child - iterate through children of a cgroup * @pos: the cgroup * to use as the loop cursor - * @cgroup: cgroup whose children to walk + * @cgrp: cgroup whose children to walk * - * Walk @cgroup's children. Must be called under rcu_read_lock().
A child * cgroup which hasn't finished ->css_online() or already has finished * ->css_offline() may show up during traversal and it's each subsystem's * responsibility to verify that each @pos is alive. @@ -698,9 +698,15 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos); * If a subsystem synchronizes against the parent in its ->css_online() and * before starting iterating, a cgroup which finished ->css_online() is * guaranteed to be visible in the future iterations. + * + * It is allowed to temporarily drop RCU read lock during iteration. The + * caller is responsible for ensuring that @pos remains accessible until + * the start of the next iteration by, for example, bumping the css refcnt. */ -#define cgroup_for_each_child(pos, cgroup) \ - list_for_each_entry_rcu(pos, &(cgroup)->children, sibling) +#define cgroup_for_each_child(pos, cgrp) \ + for ((pos) = list_first_or_null_rcu(&(cgrp)->children, \ + struct cgroup, sibling); \ + (pos); (pos) = cgroup_next_sibling((pos))) struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, struct cgroup *cgroup); @@ -759,6 +765,10 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos); * Alternatively, a subsystem may choose to use a single global lock to * synchronize ->css_online() and ->css_offline() against tree-walking * operations. + * + * It is allowed to temporarily drop RCU read lock during iteration. The + * caller is responsible for ensuring that @pos remains accessible until + * the start of the next iteration by, for example, bumping the css refcnt. */ #define cgroup_for_each_descendant_pre(pos, cgroup) \ for (pos = cgroup_next_descendant_pre(NULL, (cgroup)); (pos); \ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b87c7a5a5497..fefc41c1a147 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3031,6 +3031,11 @@ EXPORT_SYMBOL_GPL(cgroup_next_sibling); * * To be used by cgroup_for_each_descendant_pre(). Find the next * descendant to visit for pre-order traversal of @cgroup's descendants. + * + * While this function requires RCU read locking, it doesn't require the + * whole traversal to be contained in a single RCU critical section. This + * function will return the correct next descendant as long as both @pos + * and @cgroup are accessible and @pos is a descendant of @cgroup. */ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, struct cgroup *cgroup) @@ -3050,11 +3055,9 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, /* no child, visit my or the closest ancestor's next sibling */ while (pos != cgroup) { - next = list_entry_rcu(pos->sibling.next, struct cgroup, - sibling); - if (&next->sibling != &pos->parent->children) + next = cgroup_next_sibling(pos); + if (next) return next; - pos = pos->parent; } @@ -3069,6 +3072,11 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); * Return the rightmost descendant of @pos. If there's no descendant, * @pos is returned. This can be used during pre-order traversal to skip * subtree of @pos. + * + * While this function requires RCU read locking, it doesn't require the + * whole traversal to be contained in a single RCU critical section. This + * function will return the correct rightmost descendant as long as @pos is + * accessible. */ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) { @@ -3108,6 +3116,11 @@ static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) * * To be used by cgroup_for_each_descendant_post(). Find the next * descendant to visit for post-order traversal of @cgroup's descendants. 
+ * + * While this function requires RCU read locking, it doesn't require the + * whole traversal to be contained in a single RCU critical section. This + * function will return the correct next descendant as long as both @pos + * and @cgroup are accessible and @pos is a descendant of @cgroup. */ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, struct cgroup *cgroup) @@ -3123,8 +3136,8 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, } /* if there's an unvisited sibling, visit its leftmost descendant */ - next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); - if (&next->sibling != &pos->parent->children) + next = cgroup_next_sibling(pos); + if (next) return cgroup_leftmost_descendant(next); /* no sibling left, visit parent */ -- cgit v1.2.3 From 3fc3db9a3ae0ce108badf31a4a00e41b4236f5fc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Jun 2013 21:04:48 -0700 Subject: cgroup: remove now unused css_depth() Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 1 - kernel/cgroup.c | 12 ------------ 2 files changed, 13 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index d0ad3794b947..5830592258dc 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -848,7 +848,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg, /* Get id and depth of css */ unsigned short css_id(struct cgroup_subsys_state *css); -unsigned short css_depth(struct cgroup_subsys_state *css); struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id); #else /* !CONFIG_CGROUPS */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bc53d5014b28..d4a329f5874c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5227,18 +5227,6 @@ unsigned short css_id(struct cgroup_subsys_state *css) } EXPORT_SYMBOL_GPL(css_id); -unsigned short css_depth(struct cgroup_subsys_state *css) -{ - struct css_id *cssid; - - cssid = rcu_dereference_check(css->id, css_refcnt(css)); - - if (cssid) - return cssid->depth; - return 0; -} -EXPORT_SYMBOL_GPL(css_depth); - /** * css_is_ancestor - test "root" css is an ancestor of "child" * @child: the css to be tested. -- cgit v1.2.3 From 69d0206c793a17431eacee2694ee7a4b25df76b7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Jun 2013 21:04:50 -0700 Subject: cgroup: bring some sanity to naming around cg_cgroup_link cgroups and css_sets are mapped M:N and this M:N mapping is represented by struct cg_cgroup_link which forms linked lists on both sides. The naming around this mapping is already confusing and struct cg_cgroup_link exacerbates the situation quite a bit. From cgroup side, it starts off ->css_sets and runs through ->cgrp_link_list. From css_set side, it starts off ->cg_links and runs through ->cg_link_list. This is rather reversed as cgrp_link_list is used to iterate css_sets and cg_link_list cgroups. Also, this is the only place which is still using the confusing "cg" for css_sets. This patch cleans it up a bit.
* s/cgroup->css_sets/cgroup->cset_links/ s/css_set->cg_links/css_set->cgrp_links/ s/cgroup_iter->cg_link/cgroup_iter->cset_link/ * s/cg_cgroup_link/cgrp_cset_link/ * s/cgrp_cset_link->cg/cgrp_cset_link->cset/ s/cgrp_cset_link->cgrp_link_list/cgrp_cset_link->cset_link/ s/cgrp_cset_link->cg_link_list/cgrp_cset_link->cgrp_link/ * s/init_css_set_link/init_cgrp_cset_link/ s/free_cg_links/free_cgrp_cset_links/ s/allocate_cg_links/allocate_cgrp_cset_links/ * s/cgl[12]/link[12]/ in compare_css_sets() * s/saved_link/tmp_link/ s/tmp/tmp_links/ and a couple similar adjustments. * Comment and whitespace adjustments. After the changes, we have list_for_each_entry(link, &cont->cset_links, cset_link) { struct css_set *cset = link->cset; instead of list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { struct css_set *cset = link->cg; This patch is purely cosmetic. v2: Fix broken sentences in the patch description. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 15 ++-- kernel/cgroup.c | 226 ++++++++++++++++++++++++------------------------- 2 files changed, 120 insertions(+), 121 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5830592258dc..0e32855edc92 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -215,10 +215,10 @@ struct cgroup { struct cgroupfs_root *root; /* - * List of cg_cgroup_links pointing at css_sets with - * tasks in this cgroup. Protected by css_set_lock + * List of cgrp_cset_links pointing at css_sets with tasks in this + * cgroup. Protected by css_set_lock. */ - struct list_head css_sets; + struct list_head cset_links; struct list_head allcg_node; /* cgroupfs_root->allcg_list */ struct list_head cft_q_node; /* used during cftype add/rm */ @@ -365,11 +365,10 @@ struct css_set { struct list_head tasks; /* - * List of cg_cgroup_link objects on link chains from - * cgroups referenced from this css_set. Protected by - * css_set_lock + * List of cgrp_cset_links pointing at cgroups referenced from this + * css_set. Protected by css_set_lock. */ - struct list_head cg_links; + struct list_head cgrp_links; /* * Set of subsystem states, one for each subsystem. This array @@ -792,7 +791,7 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, /* A cgroup_iter should be treated as an opaque object */ struct cgroup_iter { - struct list_head *cg_link; + struct list_head *cset_link; struct list_head *task; }; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1f5a4e101ed1..ef97bd0cd546 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -315,20 +315,24 @@ static void cgroup_release_agent(struct work_struct *work); static DECLARE_WORK(release_agent_work, cgroup_release_agent); static void check_for_release(struct cgroup *cgrp); -/* Link structure for associating css_set objects with cgroups */ -struct cg_cgroup_link { - /* - * List running through cg_cgroup_links associated with a - * cgroup, anchored on cgroup->css_sets - */ - struct list_head cgrp_link_list; - struct cgroup *cgrp; - /* - * List running through cg_cgroup_links pointing at a - * single css_set object, anchored on css_set->cg_links - */ - struct list_head cg_link_list; - struct css_set *cg; +/* + * A cgroup can be associated with multiple css_sets as different tasks may + * belong to different cgroups on different hierarchies. In the other + * direction, a css_set is naturally associated with multiple cgroups.
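+ * For instance, a task sitting in cgroup A on one hierarchy and in
+ * cgroup B on another carries a single css_set which must be linked
+ * to both A and B.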
+ * This M:N relationship is represented by the following link structure + * which exists for each association and allows traversing the associations + * from both sides. + */ +struct cgrp_cset_link { + /* the cgroup and css_set this link associates */ + struct cgroup *cgrp; + struct css_set *cset; + + /* list of cgrp_cset_links anchored at cgrp->cset_links */ + struct list_head cset_link; + + /* list of cgrp_cset_links anchored at css_set->cgrp_links */ + struct list_head cgrp_link; }; /* The default css_set - used by init and its children prior to any @@ -339,7 +343,7 @@ struct cg_cgroup_link { */ static struct css_set init_css_set; -static struct cg_cgroup_link init_css_set_link; +static struct cgrp_cset_link init_cgrp_cset_link; static int cgroup_init_idr(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); @@ -378,8 +382,7 @@ static int use_task_css_set_links __read_mostly; static void __put_css_set(struct css_set *cset, int taskexit) { - struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; + struct cgrp_cset_link *link, *tmp_link; /* * Ensure that the refcount doesn't hit zero while any readers @@ -398,12 +401,11 @@ static void __put_css_set(struct css_set *cset, int taskexit) hash_del(&cset->hlist); css_set_count--; - list_for_each_entry_safe(link, saved_link, &cset->cg_links, - cg_link_list) { + list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { struct cgroup *cgrp = link->cgrp; - list_del(&link->cg_link_list); - list_del(&link->cgrp_link_list); + list_del(&link->cset_link); + list_del(&link->cgrp_link); /* * We may not be holding cgroup_mutex, and if cgrp->count is @@ -475,26 +477,26 @@ static bool compare_css_sets(struct css_set *cset, * candidates. */ - l1 = &cset->cg_links; - l2 = &old_cset->cg_links; + l1 = &cset->cgrp_links; + l2 = &old_cset->cgrp_links; while (1) { - struct cg_cgroup_link *cgl1, *cgl2; + struct cgrp_cset_link *link1, *link2; struct cgroup *cgrp1, *cgrp2; l1 = l1->next; l2 = l2->next; /* See if we reached the end - both lists are equal length. */ - if (l1 == &cset->cg_links) { - BUG_ON(l2 != &old_cset->cg_links); + if (l1 == &cset->cgrp_links) { + BUG_ON(l2 != &old_cset->cgrp_links); break; } else { - BUG_ON(l2 == &old_cset->cg_links); + BUG_ON(l2 == &old_cset->cgrp_links); } /* Locate the cgroups associated with these links. */ - cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list); - cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list); - cgrp1 = cgl1->cgrp; - cgrp2 = cgl2->cgrp; + link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link); + link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link); + cgrp1 = link1->cgrp; + cgrp2 = link2->cgrp; /* Hierarchies should be linked in the same order. */ BUG_ON(cgrp1->root != cgrp2->root); @@ -569,61 +571,64 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, return NULL; } -static void free_cg_links(struct list_head *tmp) +static void free_cgrp_cset_links(struct list_head *links_to_free) { - struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; + struct cgrp_cset_link *link, *tmp_link; - list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { - list_del(&link->cgrp_link_list); + list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) { + list_del(&link->cset_link); kfree(link); } } -/* - * allocate_cg_links() allocates "count" cg_cgroup_link structures - * and chains them on tmp through their cgrp_link_list fields. 
Returns 0 on - * success or a negative error +/** + * allocate_cgrp_cset_links - allocate cgrp_cset_links + * @count: the number of links to allocate + * @tmp_links: list_head the allocated links are put on + * + * Allocate @count cgrp_cset_link structures and chain them on @tmp_links + * through ->cset_link. Returns 0 on success or -errno. */ -static int allocate_cg_links(int count, struct list_head *tmp) +static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links) { - struct cg_cgroup_link *link; + struct cgrp_cset_link *link; int i; - INIT_LIST_HEAD(tmp); + + INIT_LIST_HEAD(tmp_links); + for (i = 0; i < count; i++) { link = kmalloc(sizeof(*link), GFP_KERNEL); if (!link) { - free_cg_links(tmp); + free_cgrp_cset_links(tmp_links); return -ENOMEM; } - list_add(&link->cgrp_link_list, tmp); + list_add(&link->cset_link, tmp_links); } return 0; } /** * link_css_set - a helper function to link a css_set to a cgroup - * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links() + * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links() * @cset: the css_set to be linked * @cgrp: the destination cgroup */ -static void link_css_set(struct list_head *tmp_cg_links, - struct css_set *cset, struct cgroup *cgrp) +static void link_css_set(struct list_head *tmp_links, struct css_set *cset, + struct cgroup *cgrp) { - struct cg_cgroup_link *link; + struct cgrp_cset_link *link; - BUG_ON(list_empty(tmp_cg_links)); - link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, - cgrp_link_list); - link->cg = cset; + BUG_ON(list_empty(tmp_links)); + link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); + link->cset = cset; link->cgrp = cgrp; atomic_inc(&cgrp->count); - list_move(&link->cgrp_link_list, &cgrp->css_sets); + list_move(&link->cset_link, &cgrp->cset_links); /* * Always add links to the tail of the list so that the list * is sorted by order of hierarchy creation */ - list_add_tail(&link->cg_link_list, &cset->cg_links); + list_add_tail(&link->cgrp_link, &cset->cgrp_links); } /* @@ -638,10 +643,8 @@ static struct css_set *find_css_set(struct css_set *old_cset, { struct css_set *cset; struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - - struct list_head tmp_cg_links; - - struct cg_cgroup_link *link; + struct list_head tmp_links; + struct cgrp_cset_link *link; unsigned long key; /* First see if we already have a cgroup group that matches @@ -659,14 +662,14 @@ static struct css_set *find_css_set(struct css_set *old_cset, if (!cset) return NULL; - /* Allocate all the cg_cgroup_link objects that we'll need */ - if (allocate_cg_links(root_count, &tmp_cg_links) < 0) { + /* Allocate all the cgrp_cset_link objects that we'll need */ + if (allocate_cgrp_cset_links(root_count, &tmp_links) < 0) { kfree(cset); return NULL; } atomic_set(&cset->refcount, 1); - INIT_LIST_HEAD(&cset->cg_links); + INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->tasks); INIT_HLIST_NODE(&cset->hlist); @@ -676,14 +679,15 @@ static struct css_set *find_css_set(struct css_set *old_cset, write_lock(&css_set_lock); /* Add reference counts and links from the new css_set. 
*/ - list_for_each_entry(link, &old_cset->cg_links, cg_link_list) { + list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; + if (c->root == cgrp->root) c = cgrp; - link_css_set(&tmp_cg_links, cset, c); + link_css_set(&tmp_links, cset, c); } - BUG_ON(!list_empty(&tmp_cg_links)); + BUG_ON(!list_empty(&tmp_links)); css_set_count++; @@ -717,9 +721,11 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, if (cset == &init_css_set) { res = &root->top_cgroup; } else { - struct cg_cgroup_link *link; - list_for_each_entry(link, &cset->cg_links, cg_link_list) { + struct cgrp_cset_link *link; + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; + if (c->root == root) { res = c; break; @@ -1405,7 +1411,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->sibling); INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->files); - INIT_LIST_HEAD(&cgrp->css_sets); + INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->allcg_node); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); @@ -1604,7 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, BUG_ON(!root); if (root == opts.new_root) { /* We used the new root structure, so this is a new hierarchy */ - struct list_head tmp_cg_links; + struct list_head tmp_links; struct cgroup *root_cgrp = &root->top_cgroup; struct cgroupfs_root *existing_root; const struct cred *cred; @@ -1636,7 +1642,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, * that's us. The worst that can happen is that we * have some link structures left over */ - ret = allocate_cg_links(css_set_count, &tmp_cg_links); + ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); if (ret) goto unlock_drop; @@ -1646,7 +1652,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, ret = rebind_subsystems(root, root->subsys_mask); if (ret == -EBUSY) { - free_cg_links(&tmp_cg_links); + free_cgrp_cset_links(&tmp_links); goto unlock_drop; } /* @@ -1668,10 +1674,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, * the css_set objects */ write_lock(&css_set_lock); hash_for_each(css_set_table, i, cset, hlist) - link_css_set(&tmp_cg_links, cset, root_cgrp); + link_css_set(&tmp_links, cset, root_cgrp); write_unlock(&css_set_lock); - free_cg_links(&tmp_cg_links); + free_cgrp_cset_links(&tmp_links); BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); @@ -1722,9 +1728,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, static void cgroup_kill_sb(struct super_block *sb) { struct cgroupfs_root *root = sb->s_fs_info; struct cgroup *cgrp = &root->top_cgroup; + struct cgrp_cset_link *link, *tmp_link; int ret; - struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; BUG_ON(!root); @@ -1740,15 +1745,14 @@ static void cgroup_kill_sb(struct super_block *sb) { BUG_ON(ret); /* - * Release all the links from css_sets to this hierarchy's + * Release all the links from cset_links to this hierarchy's * root cgroup */ write_lock(&css_set_lock); - list_for_each_entry_safe(link, saved_link, &cgrp->css_sets, - cgrp_link_list) { - list_del(&link->cg_link_list); - list_del(&link->cgrp_link_list); + list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { + list_del(&link->cset_link); + list_del(&link->cgrp_link); kfree(link); } write_unlock(&css_set_lock); @@ -2908,12 +2912,11 @@ int cgroup_rm_cftypes(struct 
cgroup_subsys *ss, struct cftype *cfts) int cgroup_task_count(const struct cgroup *cgrp) { int count = 0; - struct cg_cgroup_link *link; + struct cgrp_cset_link *link; read_lock(&css_set_lock); - list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { - count += atomic_read(&link->cg->refcount); - } + list_for_each_entry(link, &cgrp->cset_links, cset_link) + count += atomic_read(&link->cset->refcount); read_unlock(&css_set_lock); return count; } @@ -2922,24 +2925,23 @@ int cgroup_task_count(const struct cgroup *cgrp) * Advance a list_head iterator. The iterator should be positioned at * the start of a css_set */ -static void cgroup_advance_iter(struct cgroup *cgrp, - struct cgroup_iter *it) +static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) { - struct list_head *l = it->cg_link; - struct cg_cgroup_link *link; + struct list_head *l = it->cset_link; + struct cgrp_cset_link *link; struct css_set *cset; /* Advance to the next non-empty css_set */ do { l = l->next; - if (l == &cgrp->css_sets) { - it->cg_link = NULL; + if (l == &cgrp->cset_links) { + it->cset_link = NULL; return; } - link = list_entry(l, struct cg_cgroup_link, cgrp_link_list); - cset = link->cg; + link = list_entry(l, struct cgrp_cset_link, cset_link); + cset = link->cset; } while (list_empty(&cset->tasks)); - it->cg_link = l; + it->cset_link = l; it->task = cset->tasks.next; } @@ -3160,7 +3162,7 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) cgroup_enable_task_cg_lists(); read_lock(&css_set_lock); - it->cg_link = &cgrp->css_sets; + it->cset_link = &cgrp->cset_links; cgroup_advance_iter(cgrp, it); } @@ -3169,16 +3171,16 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, { struct task_struct *res; struct list_head *l = it->task; - struct cg_cgroup_link *link; + struct cgrp_cset_link *link; /* If the iterator cg is NULL, we have no tasks */ - if (!it->cg_link) + if (!it->cset_link) return NULL; res = list_entry(l, struct task_struct, cg_list); /* Advance iterator to find next entry */ l = l->next; - link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list); - if (l == &link->cg->tasks) { + link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); + if (l == &link->cset->tasks) { /* We reached the end of this task list - move on to * the next cg_cgroup_link */ cgroup_advance_iter(cgrp, it); @@ -4625,7 +4627,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys); */ void cgroup_unload_subsys(struct cgroup_subsys *ss) { - struct cg_cgroup_link *link; + struct cgrp_cset_link *link; BUG_ON(ss->module == NULL); @@ -4654,8 +4656,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * in loading, we need to pay our respects to the hashtable gods. 
*/ write_lock(&css_set_lock); - list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { - struct css_set *cset = link->cg; + list_for_each_entry(link, &dummytop->cset_links, cset_link) { + struct css_set *cset = link->cset; unsigned long key; hash_del(&cset->hlist); @@ -4688,7 +4690,7 @@ int __init cgroup_init_early(void) { int i; atomic_set(&init_css_set.refcount, 1); - INIT_LIST_HEAD(&init_css_set.cg_links); + INIT_LIST_HEAD(&init_css_set.cgrp_links); INIT_LIST_HEAD(&init_css_set.tasks); INIT_HLIST_NODE(&init_css_set.hlist); css_set_count = 1; @@ -4696,12 +4698,10 @@ int __init cgroup_init_early(void) root_count = 1; init_task.cgroups = &init_css_set; - init_css_set_link.cg = &init_css_set; - init_css_set_link.cgrp = dummytop; - list_add(&init_css_set_link.cgrp_link_list, - &rootnode.top_cgroup.css_sets); - list_add(&init_css_set_link.cg_link_list, - &init_css_set.cg_links); + init_cgrp_cset_link.cset = &init_css_set; + init_cgrp_cset_link.cgrp = dummytop; + list_add(&init_cgrp_cset_link.cset_link, &rootnode.top_cgroup.cset_links); + list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; @@ -5454,13 +5454,13 @@ static int current_css_set_cg_links_read(struct cgroup *cont, struct cftype *cft, struct seq_file *seq) { - struct cg_cgroup_link *link; + struct cgrp_cset_link *link; struct css_set *cset; read_lock(&css_set_lock); rcu_read_lock(); cset = rcu_dereference(current->cgroups); - list_for_each_entry(link, &cset->cg_links, cg_link_list) { + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; const char *name; @@ -5481,11 +5481,11 @@ static int cgroup_css_links_read(struct cgroup *cont, struct cftype *cft, struct seq_file *seq) { - struct cg_cgroup_link *link; + struct cgrp_cset_link *link; read_lock(&css_set_lock); - list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { - struct css_set *cset = link->cg; + list_for_each_entry(link, &cont->cset_links, cset_link) { + struct css_set *cset = link->cset; struct task_struct *task; int count = 0; seq_printf(seq, "css_set %p\n", cset); -- cgit v1.2.3 From 5de0107e634ce862f16360139709d9d3a656463e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Jun 2013 21:04:52 -0700 Subject: cgroup: clean up css_[try]get() and css_put() * __css_get() isn't used by anyone. Fold it into css_get(). * Add proper function comments to all css reference functions. This patch is purely cosmetic. v2: Typo fix as per Li. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 0e32855edc92..a494636a34da 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -94,33 +94,31 @@ enum { CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ }; -/* Caller must verify that the css is not for root cgroup */ -static inline void __css_get(struct cgroup_subsys_state *css, int count) -{ - atomic_add(count, &css->refcnt); -} - -/* - * Call css_get() to hold a reference on the css; it can be used - * for a reference obtained via: - * - an existing ref-counted reference to the css - * - task->cgroups for a locked task +/** + * css_get - obtain a reference on the specified css + * @css: target css + * + * The caller must already have a reference. 
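+ * (for example, an existing ref-counted reference to the css, or
+ * task->cgroups for a locked task)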
*/ - static inline void css_get(struct cgroup_subsys_state *css) { /* We don't need to reference count the root state */ if (!(css->flags & CSS_ROOT)) - __css_get(css, 1); + atomic_inc(&css->refcnt); } -/* - * Call css_tryget() to take a reference on a css if your existing - * (known-valid) reference isn't already ref-counted. Returns false if - * the css has been destroyed. - */ - extern bool __css_tryget(struct cgroup_subsys_state *css); + +/** + * css_tryget - try to obtain a reference on the specified css + * @css: target css + * + * Obtain a reference on @css if it's alive. The caller naturally needs to + * ensure that @css is accessible but doesn't have to be holding a + * reference on it - IOW, RCU protected access is good enough for this + * function. Returns %true if a reference count was successfully obtained; + * %false otherwise. + */ static inline bool css_tryget(struct cgroup_subsys_state *css) { if (css->flags & CSS_ROOT) @@ -128,12 +126,14 @@ static inline bool css_tryget(struct cgroup_subsys_state *css) return __css_tryget(css); } -/* - * css_put() should be called to release a reference taken by - * css_get() or css_tryget() - */ - extern void __css_put(struct cgroup_subsys_state *css); + +/** + * css_put - put a css reference + * @css: target css + * + * Put a reference obtained via css_get() and css_tryget(). + */ static inline void css_put(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_ROOT)) -- cgit v1.2.3 From 54766d4a1d3d6f84ff8fa475cd8f165c0a0000eb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Jun 2013 21:04:53 -0700 Subject: cgroup: rename CGRP_REMOVED to CGRP_DEAD We will add another flag indicating that the cgroup is in the process of being killed. REMOVING / REMOVED is more difficult to distinguish and cgroup_is_removing()/cgroup_is_removed() are a bit awkward. Also, later percpu_ref usage will involve "kill"ing the refcnt. s/CGRP_REMOVED/CGRP_DEAD/ s/cgroup_is_removed()/cgroup_is_dead() This patch is purely cosmetic. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 2 +- kernel/cgroup.c | 30 ++++++++++++++---------------- 2 files changed, 15 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index a494636a34da..c86a93abe83d 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -143,7 +143,7 @@ static inline void css_put(struct cgroup_subsys_state *css) /* bits in struct cgroup flags field */ enum { /* Control Group is dead */ - CGRP_REMOVED, + CGRP_DEAD, /* * Control Group has previously had a child cgroup or a task, * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d86a8477d56a..84efb344fdf6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -226,9 +226,9 @@ static int css_refcnt(struct cgroup_subsys_state *css) } /* convenient tests for these bits */ -static inline bool cgroup_is_removed(const struct cgroup *cgrp) +static inline bool cgroup_is_dead(const struct cgroup *cgrp) { - return test_bit(CGRP_REMOVED, &cgrp->flags); + return test_bit(CGRP_DEAD, &cgrp->flags); } /** @@ -300,7 +300,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry) static bool cgroup_lock_live_group(struct cgroup *cgrp) { mutex_lock(&cgroup_mutex); - if (cgroup_is_removed(cgrp)) { + if (cgroup_is_dead(cgrp)) { mutex_unlock(&cgroup_mutex); return false; } @@ -892,7 +892,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) if (S_ISDIR(inode->i_mode)) { struct cgroup *cgrp = dentry->d_fsdata; - BUG_ON(!(cgroup_is_removed(cgrp))); + BUG_ON(!(cgroup_is_dead(cgrp))); call_rcu(&cgrp->rcu_head, cgroup_free_rcu); } else { struct cfent *cfe = __d_cfe(dentry); @@ -2363,7 +2363,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (cgroup_is_removed(cgrp)) + if (cgroup_is_dead(cgrp)) return -ENODEV; if (cft->write) return cft->write(cgrp, cft, file, buf, nbytes, ppos); @@ -2408,7 +2408,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (cgroup_is_removed(cgrp)) + if (cgroup_is_dead(cgrp)) return -ENODEV; if (cft->read) @@ -2831,7 +2831,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); - if (!cgroup_is_removed(cgrp)) + if (!cgroup_is_dead(cgrp)) cgroup_addrm_files(cgrp, ss, cfts, is_add); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); @@ -2999,14 +2999,14 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos) /* * @pos could already have been removed. Once a cgroup is removed, * its ->sibling.next is no longer updated when its next sibling - * changes. As CGRP_REMOVED is set on removal which is fully + * changes. As CGRP_DEAD is set on removal which is fully * serialized, if we see it unasserted, it's guaranteed that the * next sibling hasn't finished its grace period even if it's * already removed, and thus safe to dereference from this RCU * critical section. If ->sibling.next is inaccessible, - * cgroup_is_removed() is guaranteed to be visible as %true here. + * cgroup_is_dead() is guaranteed to be visible as %true here. 
*/ - if (likely(!cgroup_is_removed(pos))) { + if (likely(!cgroup_is_dead(pos))) { next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); if (&next->sibling != &pos->parent->children) return next; @@ -4383,7 +4383,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * attempts fail thus maintaining the removal conditions verified * above. * - * Note that CGRP_REMOVED setting is depended upon by + * Note that CGRP_DEAD assertion is depended upon by * cgroup_next_sibling() to resume iteration after dropping RCU * read lock. See cgroup_next_sibling() for details. */ @@ -4393,7 +4393,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) WARN_ON(atomic_read(&css->refcnt) < 0); atomic_add(CSS_DEACT_BIAS, &css->refcnt); } - set_bit(CGRP_REMOVED, &cgrp->flags); + set_bit(CGRP_DEAD, &cgrp->flags); /* tell subsystems to initate destruction */ for_each_subsys(cgrp->root, ss) @@ -5063,7 +5063,7 @@ static void check_for_release(struct cgroup *cgrp) int need_schedule_work = 0; raw_spin_lock(&release_list_lock); - if (!cgroup_is_removed(cgrp) && + if (!cgroup_is_dead(cgrp) && list_empty(&cgrp->release_list)) { list_add(&cgrp->release_list, &release_list); need_schedule_work = 1; @@ -5209,9 +5209,7 @@ __setup("cgroup_disable=", cgroup_disable); * Functions for CSS ID. */ -/* - *To get ID other than 0, this should be called when !cgroup_is_removed(). - */ +/* to get ID other than 0, this should be called when !cgroup_is_dead() */ unsigned short css_id(struct cgroup_subsys_state *css) { struct css_id *cssid; -- cgit v1.2.3 From 6f3d828f0fb7fdaffc6f32cb8a1cb7fcf8824598 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Jun 2013 21:04:55 -0700 Subject: cgroup: remove cgroup->count and use cgroup->count tracks the number of css_sets associated with the cgroup and is used only to verify that no css_set is associated when the cgroup is being destroyed. It's superfluous as the destruction path can simply check whether cgroup->cset_links is empty instead. Drop cgroup->count and check ->cset_links directly from cgroup_destroy_locked(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 6 ------ kernel/cgroup.c | 21 +++++---------------- 2 files changed, 5 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c86a93abe83d..81bfd0268e93 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -169,12 +169,6 @@ struct cgroup_name { struct cgroup { unsigned long flags; /* "unsigned long" so bitops work */ - /* - * count users of this cgroup.
>0 means busy, but doesn't - * necessarily indicate the number of tasks in the cgroup - */ - atomic_t count; - int id; /* ida allocated in-hierarchy ID */ /* diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1a68241ca835..49bfd7b0bbda 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -408,8 +408,7 @@ static void __put_css_set(struct css_set *cset, int taskexit) list_del(&link->cgrp_link); /* @cgrp can't go away while we're holding css_set_lock */ - if (atomic_dec_and_test(&cgrp->count) && - notify_on_release(cgrp)) { + if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { if (taskexit) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); @@ -616,7 +615,6 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); link->cset = cset; link->cgrp = cgrp; - atomic_inc(&cgrp->count); list_move(&link->cset_link, &cgrp->cset_links); /* * Always add links to the tail of the list so that the list @@ -4370,11 +4368,11 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); /* - * css_set_lock prevents @cgrp from being removed while - * __put_css_set() is in progress. + * css_set_lock synchronizes access to ->cset_links and prevents + * @cgrp from being removed while __put_css_set() is in progress. */ read_lock(&css_set_lock); - empty = !atomic_read(&cgrp->count) && list_empty(&cgrp->children); + empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children); read_unlock(&css_set_lock); if (!empty) return -EBUSY; @@ -5054,7 +5052,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) static void check_for_release(struct cgroup *cgrp) { if (cgroup_is_releasable(cgrp) && - !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) { + list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) { /* * Control Group is currently removeable. If it's not * already queued for a userspace notification, queue @@ -5422,11 +5420,6 @@ static void debug_css_free(struct cgroup *cont) kfree(cont->subsys[debug_subsys_id]); } -static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) -{ - return atomic_read(&cont->count); -} - static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) { return cgroup_task_count(cont); @@ -5507,10 +5500,6 @@ static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) } static struct cftype debug_files[] = { - { - .name = "cgroup_refcount", - .read_u64 = cgroup_refcount_read, - }, { .name = "taskcount", .read_u64 = debug_taskcount_read, -- cgit v1.2.3 From ea15f8ccdb430af1e8bc9b4e19a230eb4c356777 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jun 2013 19:27:42 -0700 Subject: cgroup: split cgroup destruction into two steps Split cgroup_destroy_locked() into two steps and put the latter half into cgroup_offline_fn() which is executed from a work item. The latter half is responsible for offlining the css's, removing the cgroup from internal lists, and propagating release notification to the parent. The separation is to allow using percpu refcnt for css. Note that this allows for other cgroup operations to happen between the first and second halves of destruction, including creating a new cgroup with the same name. As the target cgroup is marked DEAD in the first half and cgroup internals don't care about the names of cgroups, this should be fine. A comment explaining this will be added by the next patch which implements the actual percpu refcnting. 
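The shape of the two steps can be sketched in userspace C - a thread stands in for the work item, and every name here is hypothetical, not the kernel API:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct model_cgroup {
	bool dead;			/* models CGRP_DEAD */
	pthread_t destroy_work;		/* models cgroup->destroy_work */
};

/* step two: runs asynchronously, models cgroup_offline_fn() */
static void *model_offline_fn(void *arg)
{
	struct model_cgroup *cgrp = arg;

	printf("offline %p: run offline callbacks, drop from lists\n",
	       (void *)cgrp);
	return NULL;
}

/* step one: models cgroup_destroy_locked() - mark the group dead
 * synchronously, so its name is immediately reusable, and punt the
 * rest of the teardown to asynchronous context */
static int model_destroy(struct model_cgroup *cgrp)
{
	if (cgrp->dead)
		return -1;		/* destruction already in flight */
	cgrp->dead = true;		/* userland-visible part is gone */
	return pthread_create(&cgrp->destroy_work, NULL,
			      model_offline_fn, cgrp);
}

int main(void)
{
	struct model_cgroup cgrp = { .dead = false };

	if (model_destroy(&cgrp) == 0)
		pthread_join(cgrp.destroy_work, NULL);
	return 0;
}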
As RCU freeing is guaranteed to happen after the second step of destruction, we can use the same work item for both. This patch renames cgroup->free_work to ->destroy_work and uses it for both purposes. INIT_WORK() is now performed right before queueing the work item. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 2 +- kernel/cgroup.c | 38 +++++++++++++++++++++++++++----------- 2 files changed, 28 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 81bfd0268e93..e345d8b90046 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -233,7 +233,7 @@ struct cgroup { /* For RCU-protected deletion */ struct rcu_head rcu_head; - struct work_struct free_work; + struct work_struct destroy_work; /* List of events which userspace want to receive */ struct list_head event_list; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5a1ddecc3cfa..df6814706cca 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -208,6 +208,7 @@ static struct cgroup_name root_cgroup_name = { .name = "/" }; */ static int need_forkexit_callback __read_mostly; +static void cgroup_offline_fn(struct work_struct *work); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, struct cftype cfts[], bool is_add); @@ -830,7 +831,7 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) static void cgroup_free_fn(struct work_struct *work) { - struct cgroup *cgrp = container_of(work, struct cgroup, free_work); + struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); struct cgroup_subsys *ss; mutex_lock(&cgroup_mutex); @@ -875,7 +876,8 @@ static void cgroup_free_rcu(struct rcu_head *head) { struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); - schedule_work(&cgrp->free_work); + INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); + schedule_work(&cgrp->destroy_work); } static void cgroup_diput(struct dentry *dentry, struct inode *inode) @@ -1407,7 +1409,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->allcg_node); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); - INIT_WORK(&cgrp->free_work, cgroup_free_fn); mutex_init(&cgrp->pidlist_mutex); INIT_LIST_HEAD(&cgrp->event_list); spin_lock_init(&cgrp->event_list_lock); @@ -2991,12 +2992,13 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos) /* * @pos could already have been removed. Once a cgroup is removed, * its ->sibling.next is no longer updated when its next sibling - * changes. As CGRP_DEAD is set on removal which is fully - * serialized, if we see it unasserted, it's guaranteed that the - * next sibling hasn't finished its grace period even if it's - * already removed, and thus safe to dereference from this RCU - * critical section. If ->sibling.next is inaccessible, - * cgroup_is_dead() is guaranteed to be visible as %true here. + * changes. As CGRP_DEAD assertion is serialized and happens + * before the cgroup is taken off the ->sibling list, if we see it + * unasserted, it's guaranteed that the next sibling hasn't + * finished its grace period even if it's already removed, and thus + * safe to dereference from this RCU critical section. If + * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed + * to be visible as %true here. 
*/ if (likely(!cgroup_is_dead(pos))) { next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); @@ -4359,7 +4361,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct dentry *d = cgrp->dentry; - struct cgroup *parent = cgrp->parent; struct cgroup_event *event, *tmp; struct cgroup_subsys *ss; bool empty; @@ -4423,6 +4424,21 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) } spin_unlock(&cgrp->event_list_lock); + INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); + schedule_work(&cgrp->destroy_work); + + return 0; +}; + +static void cgroup_offline_fn(struct work_struct *work) +{ + struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); + struct cgroup *parent = cgrp->parent; + struct dentry *d = cgrp->dentry; + struct cgroup_subsys *ss; + + mutex_lock(&cgroup_mutex); + /* tell subsystems to initate destruction */ for_each_subsys(cgrp->root, ss) offline_css(ss, cgrp); @@ -4446,7 +4462,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) set_bit(CGRP_RELEASABLE, &parent->flags); check_for_release(parent); - return 0; + mutex_unlock(&cgroup_mutex); } static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) -- cgit v1.2.3 From d3daf28da16a30af95bfb303189a634a87606725 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jun 2013 19:39:16 -0700 Subject: cgroup: use percpu refcnt for cgroup_subsys_states A css (cgroup_subsys_state) is how each cgroup is represented to a controller. As such, it can be used in hot paths across the various subsystems different controllers are associated with. One of the common operations is reference counting, which up until now has been implemented using a global atomic counter and can have significant adverse impact on scalability. For example, css refcnt can be gotten and put multiple times by blkcg for each IO request. For high-ops configurations which try to do as much per-cpu as possible, the global frequent refcnting can be very expensive. In general, given the various and hugely diverse paths css's end up being used from, we need to make it cheap and highly scalable. In its usage, css refcnting isn't very different from module refcnting. This patch converts css refcnting to use the recently added percpu_ref. css_get/tryget/put() directly map to the matching percpu_ref operations and the deactivation logic is no longer necessary as percpu_ref already has refcnt killing. The only complication is that as the refcnt is per-cpu, percpu_ref_kill() in itself doesn't ensure that further tryget operations will fail, which we need to guarantee before invoking ->css_offline()'s. This is resolved by collecting kill confirmation using percpu_ref_kill_and_confirm() and initiating the offline phase of destruction after all css refcnt's are confirmed to be seen as killed on all CPUs. The previous patches already split destruction into two phases, so percpu_ref_kill_and_confirm() can be hooked up easily. This patch removes css_refcnt() which is used for rcu dereference sanity check in css_id(). While we can add a percpu refcnt API to ask the same question, css_id() itself is scheduled to be removed fairly soon, so let's not bother with it. Just drop the sanity check and use rcu_dereference_raw() instead. v2: - init_cgroup_css() was calling percpu_ref_init() without checking the return value.
Signed-off-by: Tejun Heo Reviewed-by: Kent Overstreet Acked-by: Li Zefan Cc: Michal Hocko Cc: Mike Snitzer Cc: Vivek Goyal Cc: "Alasdair G. Kergon" Cc: Jens Axboe Cc: Mikulas Patocka Cc: Glauber Costa --- include/linux/cgroup.h | 23 +++---- kernel/cgroup.c | 165 +++++++++++++++++++++++++++++++------------------ 2 files changed, 112 insertions(+), 76 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e345d8b90046..b7bd4beae294 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -20,6 +20,7 @@ #include <linux/workqueue.h> #include <linux/xattr.h> #include <linux/fs.h> +#include <linux/percpu-refcount.h> #ifdef CONFIG_CGROUPS @@ -72,13 +73,8 @@ struct cgroup_subsys_state { */ struct cgroup *cgroup; - /* - * State maintained by the cgroup system to allow subsystems - * to be "busy". Should be accessed via css_get(), - * css_tryget() and css_put(). - */ - - atomic_t refcnt; + /* reference count - access via css_[try]get() and css_put() */ + struct percpu_ref refcnt; unsigned long flags; /* ID for this css, if possible */ @@ -104,11 +100,9 @@ static inline void css_get(struct cgroup_subsys_state *css) { /* We don't need to reference count the root state */ if (!(css->flags & CSS_ROOT)) - atomic_inc(&css->refcnt); + percpu_ref_get(&css->refcnt); } -extern bool __css_tryget(struct cgroup_subsys_state *css); - /** * css_tryget - try to obtain a reference on the specified css * @css: target css * @@ -123,11 +117,9 @@ static inline bool css_tryget(struct cgroup_subsys_state *css) { if (css->flags & CSS_ROOT) return true; - return __css_tryget(css); + return percpu_ref_tryget(&css->refcnt); } -extern void __css_put(struct cgroup_subsys_state *css); - /** * css_put - put a css reference * @css: target css * @@ -137,7 +129,7 @@ extern void __css_put(struct cgroup_subsys_state *css); static inline void css_put(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_ROOT)) - __css_put(css); + percpu_ref_put(&css->refcnt); } /* bits in struct cgroup flags field */ @@ -231,9 +223,10 @@ struct cgroup { struct list_head pidlists; struct mutex pidlist_mutex; - /* For RCU-protected deletion */ + /* For css percpu_ref killing and RCU-protected deletion */ struct rcu_head rcu_head; struct work_struct destroy_work; + atomic_t css_kill_cnt; /* List of events which userspace want to receive */ struct list_head event_list; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ebbfc043153f..2e9da7bf25cb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -63,9 +63,6 @@ #include <linux/atomic.h> -/* css deactivation bias, makes css->refcnt negative to deny new trygets */ -#define CSS_DEACT_BIAS INT_MIN - /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it.
@@ -213,19 +210,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, struct cftype cfts[], bool is_add); -static int css_unbias_refcnt(int refcnt) -{ - return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; -} - -/* the current nr of refs, always >= 0 whether @css is deactivated or not */ -static int css_refcnt(struct cgroup_subsys_state *css) -{ - int v = atomic_read(&css->refcnt); - - return css_unbias_refcnt(v); -} - /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) { @@ -4139,12 +4123,19 @@ static void css_dput_fn(struct work_struct *work) deactivate_super(sb); } +static void css_release(struct percpu_ref *ref) +{ + struct cgroup_subsys_state *css = + container_of(ref, struct cgroup_subsys_state, refcnt); + + schedule_work(&css->dput_work); +} + static void init_cgroup_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) { css->cgroup = cgrp; - atomic_set(&css->refcnt, 1); css->flags = 0; css->id = NULL; if (cgrp == dummytop) @@ -4266,7 +4257,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err = PTR_ERR(css); goto err_free_all; } + + err = percpu_ref_init(&css->refcnt, css_release); + if (err) + goto err_free_all; + init_cgroup_css(css, ss, cgrp); + if (ss->use_id) { err = alloc_css_id(ss, parent, cgrp); if (err) @@ -4331,8 +4328,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_free_all: for_each_subsys(root, ss) { - if (cgrp->subsys[ss->subsys_id]) + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + + if (css) { + percpu_ref_cancel_init(&css->refcnt); ss->css_free(cgrp); + } } mutex_unlock(&cgroup_mutex); /* Release the reference count that we took on the superblock */ @@ -4360,6 +4361,48 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return cgroup_create(c_parent, dentry, mode | S_IFDIR); } +static void cgroup_css_killed(struct cgroup *cgrp) +{ + if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) + return; + + /* percpu ref's of all css's are killed, kick off the next step */ + INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); + schedule_work(&cgrp->destroy_work); +} + +static void css_ref_killed_fn(struct percpu_ref *ref) +{ + struct cgroup_subsys_state *css = + container_of(ref, struct cgroup_subsys_state, refcnt); + + cgroup_css_killed(css->cgroup); +} + +/** + * cgroup_destroy_locked - the first stage of cgroup destruction + * @cgrp: cgroup to be destroyed + * + * css's make use of percpu refcnts whose killing latency shouldn't be + * exposed to userland and are RCU protected. Also, cgroup core needs to + * guarantee that css_tryget() won't succeed by the time ->css_offline() is + * invoked. To satisfy all the requirements, destruction is implemented in + * the following two steps. + * + * s1. Verify @cgrp can be destroyed and mark it dying. Remove all + * userland visible parts and start killing the percpu refcnts of + * css's. Set up so that the next stage will be kicked off once all + * the percpu refcnts are confirmed to be killed. + * + * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the + * rest of destruction. Once all cgroup references are gone, the + * cgroup is RCU-freed. + * + * This function implements s1. After this step, @cgrp is gone as far as + * the userland is concerned and a new cgroup with the same name may be + * created. 
As cgroup doesn't care about the names internally, this + * doesn't cause any problem. + */ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { @@ -4382,16 +4425,34 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return -EBUSY; /* - * Block new css_tryget() by deactivating refcnt and mark @cgrp - * removed. This makes future css_tryget() attempts fail which we - * guarantee to ->css_offline() callbacks. + * Block new css_tryget() by killing css refcnts. cgroup core + * guarantees that, by the time ->css_offline() is invoked, no new + * css reference will be given out via css_tryget(). We can't + * simply call percpu_ref_kill() and proceed to offlining css's + * because percpu_ref_kill() doesn't guarantee that the ref is seen + * as killed on all CPUs on return. + * + * Use percpu_ref_kill_and_confirm() to get notifications as each + * css is confirmed to be seen as killed on all CPUs. The + * notification callback keeps track of the number of css's to be + * killed and schedules cgroup_offline_fn() to perform the rest of + * destruction once the percpu refs of all css's are confirmed to + * be killed. */ + atomic_set(&cgrp->css_kill_cnt, 1); for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - WARN_ON(atomic_read(&css->refcnt) < 0); - atomic_add(CSS_DEACT_BIAS, &css->refcnt); + /* + * Killing would put the base ref, but we need to keep it + * alive until after ->css_offline. + */ + percpu_ref_get(&css->refcnt); + + atomic_inc(&cgrp->css_kill_cnt); + percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn); } + cgroup_css_killed(cgrp); /* * Mark @cgrp dead. This prevents further task migration and child @@ -4427,12 +4488,19 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) } spin_unlock(&cgrp->event_list_lock); - INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); - schedule_work(&cgrp->destroy_work); - return 0; }; +/** + * cgroup_offline_fn - the second step of cgroup destruction + * @work: cgroup->destroy_free_work + * + * This function is invoked from a work item for a cgroup which is being + * destroyed after the percpu refcnts of all css's are guaranteed to be + * seen as killed on all CPUs, and performs the rest of destruction. This + * is the second step of destruction described in the comment above + * cgroup_destroy_locked(). + */ static void cgroup_offline_fn(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); @@ -4442,16 +4510,19 @@ static void cgroup_offline_fn(struct work_struct *work) mutex_lock(&cgroup_mutex); - /* tell subsystems to initate destruction */ + /* + * css_tryget() is guaranteed to fail now. Tell subsystems to + * initate destruction. + */ for_each_subsys(cgrp->root, ss) offline_css(ss, cgrp); /* - * Put all the base refs. Each css holds an extra reference to the - * cgroup's dentry and cgroup removal proceeds regardless of css - * refs. On the last put of each css, whenever that may be, the - * extra dentry ref is put so that dentry destruction happens only - * after all css's are released. + * Put the css refs from cgroup_destroy_locked(). Each css holds + * an extra reference to the cgroup's dentry and cgroup removal + * proceeds regardless of css refs. On the last put of each css, + * whenever that may be, the extra dentry ref is put so that dentry + * destruction happens only after all css's are released. 
*/ for_each_subsys(cgrp->root, ss) css_put(cgrp->subsys[ss->subsys_id]); @@ -5100,34 +5171,6 @@ static void check_for_release(struct cgroup *cgrp) } } -/* Caller must verify that the css is not for root cgroup */ -bool __css_tryget(struct cgroup_subsys_state *css) -{ - while (true) { - int t, v; - - v = css_refcnt(css); - t = atomic_cmpxchg(&css->refcnt, v, v + 1); - if (likely(t == v)) - return true; - else if (t < 0) - return false; - cpu_relax(); - } -} -EXPORT_SYMBOL_GPL(__css_tryget); - -/* Caller must verify that the css is not for root cgroup */ -void __css_put(struct cgroup_subsys_state *css) -{ - int v; - - v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); - if (v == 0) - schedule_work(&css->dput_work); -} -EXPORT_SYMBOL_GPL(__css_put); - /* * Notify userspace when a cgroup is released, by running the * configured release agent with the name of the cgroup (path @@ -5245,7 +5288,7 @@ unsigned short css_id(struct cgroup_subsys_state *css) * on this or this is under rcu_read_lock(). Once css->id is allocated, * it's unchanged until freed. */ - cssid = rcu_dereference_check(css->id, css_refcnt(css)); + cssid = rcu_dereference_raw(css->id); if (cssid) return cssid->id; -- cgit v1.2.3
From f63674fd0d6afa1ba24309aee1f8c60195d39041 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jun 2013 19:58:38 -0700 Subject: cgroup: update sane_behavior documentation f12dc02014 ("cgroup: mark "tasks" cgroup file as insane") and cc5943a781 ("cgroup: mark "notify_on_release" and "release_agent" cgroup files insane") forgot to update the changed-behavior documentation in cgroup.h. Update it. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b7bd4beae294..17604767adfd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -264,13 +264,14 @@ enum { * * - Remount is disallowed. * - * - memcg: use_hierarchy is on by default and the cgroup file for - * the flag is not created. + * - "tasks" is removed. Everything should be at process + * granularity. Use "cgroup.procs" instead. * - * The followings are planned changes. + * - "release_agent" and "notify_on_release" are removed. + * Replacement notification mechanism will be implemented. * - * - release_agent will be disallowed once replacement notification - * mechanism is implemented. + * - memcg: use_hierarchy is on by default and the cgroup file for + * the flag is not created. */ CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), -- cgit v1.2.3
From 6db8e85c5c1f89cd0183b76dab027c81009f129f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Jun 2013 11:18:22 -0700 Subject: cgroup: disallow rename(2) if sane_behavior cgroup's rename(2) isn't a proper migration implementation - it can't move the cgroup to a different parent in the hierarchy. All it can do is swap the name string for that cgroup. This isn't useful and can mislead users into thinking that cgroup supports proper cgroup-level migration. Disallow rename(2) if sane_behavior. v2: Fail with -EPERM instead of -EINVAL so that it matches the vfs return value when ->rename is not implemented, as suggested by Li.
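The userland-visible effect is straightforward to demonstrate. A minimal, hypothetical reproducer is sketched below; the mount point and cgroup paths are invented, and the hierarchy is assumed to be mounted with the sane_behavior option:

        #include <stdio.h>
        #include <errno.h>
        #include <string.h>

        int main(void)
        {
                /* assumes a cgroup "a" exists on a sane_behavior mount */
                if (rename("/sys/fs/cgroup/test/a", "/sys/fs/cgroup/test/b") < 0)
                        printf("rename: %s\n", strerror(errno)); /* "Operation not permitted" */
                return 0;
        }

After this patch the rename fails with EPERM instead of succeeding and merely swapping the directory name.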
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 2 ++ kernel/cgroup.c | 7 +++++++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 17604767adfd..f97522790682 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -270,6 +270,8 @@ enum { * - "release_agent" and "notify_on_release" are removed. * Replacement notification mechanism will be implemented. * + * - rename(2) is disallowed. + * * - memcg: use_hierarchy is on by default and the cgroup file for * the flag is not created. */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2e9da7bf25cb..c2c64005bbc2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2508,6 +2508,13 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, cgrp = __d_cgrp(old_dentry); + /* + * This isn't a proper migration and its usefulness is very + * limited. Disallow if sane_behavior. + */ + if (cgroup_sane_behavior(cgrp)) + return -EPERM; + name = cgroup_alloc_name(new_dentry); if (!name) return -ENOMEM; -- cgit v1.2.3
From e8c82d20a9f729cf4b9f73043f7fd4e0872bebfd Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 18 Jun 2013 18:48:37 +0800 Subject: cgroup: convert cgroup_cft_commit() to use cgroup_for_each_descendant_pre() We used root->allcg_list to iterate the cgroup hierarchy because, at the time, cgroup_for_each_descendant_pre() hadn't been invented. tj: In cgroup_cfts_commit(), s/@serial_nr/@update_upto/, move the assignment right above releasing cgroup_mutex and explain what's going on there.
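Reduced to its core, the iteration pattern the patch switches to looks roughly like the sketch below. This is a paraphrase, not the patch itself; update_upto is the serial-number snapshot mentioned in the note above, and the dentry pinning and locking visible in the diff that follows are elided:

        /* hypothetical helper mirroring cgroup_cfts_commit()'s loop */
        static void update_existing_cgroups(struct cgroup *root, u64 update_upto)
        {
                struct cgroup *pos;

                rcu_read_lock();
                cgroup_for_each_descendant_pre(pos, root) {
                        /* cgroups created after the snapshot already picked up
                         * the new cftypes at creation; dead ones have no files */
                        if (pos->serial_nr > update_upto || cgroup_is_dead(pos))
                                continue;
                        /* pin pos->dentry, drop RCU, add/rm files, re-lock */
                }
                rcu_read_unlock();
        }

Being able to drop the RCU read lock inside the loop is exactly what the earlier cgroup_next_sibling()/serial_nr work made safe.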
Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 6 ---- kernel/cgroup.c | 80 +++++++++++++++++++++++++++----------------------- 2 files changed, 44 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f97522790682..b28365890646 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -206,9 +206,6 @@ struct cgroup { */ struct list_head cset_links; - struct list_head allcg_node; /* cgroupfs_root->allcg_list */ - struct list_head cft_q_node; /* used during cftype add/rm */ - /* * Linked list running through all cgroups that can * potentially be reaped by the release agent. Protected by @@ -313,9 +310,6 @@ struct cgroupfs_root { /* A list running through the active hierarchies */ struct list_head root_list; - /* All cgroups on this root, cgroup_mutex protected */ - struct list_head allcg_list; - /* Hierarchy-specific flags */ unsigned long flags; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e6571ca822a0..0ed7d8db6508 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1399,7 +1399,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->files); INIT_LIST_HEAD(&cgrp->cset_links); - INIT_LIST_HEAD(&cgrp->allcg_node); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); @@ -1414,12 +1413,10 @@ static void init_cgroup_root(struct cgroupfs_root *root) INIT_LIST_HEAD(&root->subsys_list); INIT_LIST_HEAD(&root->root_list); - INIT_LIST_HEAD(&root->allcg_list); root->number_of_cgroups = 1; cgrp->root = root; cgrp->name = &root_cgroup_name; init_cgroup_housekeeping(cgrp); - list_add_tail(&cgrp->allcg_node, &root->allcg_list); } static int cgroup_init_root_id(struct cgroupfs_root *root) @@ -2785,65 +2782,78 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, return ret; } -static DEFINE_MUTEX(cgroup_cft_mutex); - static void cgroup_cfts_prepare(void) - __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex) + __acquires(&cgroup_mutex) { /* * Thanks to the entanglement with vfs inode locking, we can't walk * the existing cgroups under cgroup_mutex and create files. - * Instead, we increment reference on all cgroups and build list of - * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure - * exclusive access to the field. + * Instead, we use cgroup_for_each_descendant_pre() and drop RCU + * read lock before calling cgroup_addrm_files(). */ - mutex_lock(&cgroup_cft_mutex); mutex_lock(&cgroup_mutex); } static void cgroup_cfts_commit(struct cgroup_subsys *ss, struct cftype *cfts, bool is_add) - __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) + __releases(&cgroup_mutex) { LIST_HEAD(pending); - struct cgroup *cgrp, *n; + struct cgroup *cgrp, *root = &ss->root->top_cgroup; struct super_block *sb = ss->root->sb; + struct dentry *prev = NULL; + struct inode *inode; + u64 update_upto; /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ - if (cfts && ss->root != &rootnode && - atomic_inc_not_zero(&sb->s_active)) { - list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) { - dget(cgrp->dentry); - list_add_tail(&cgrp->cft_q_node, &pending); - } - } else { - sb = NULL; + if (!cfts || ss->root == &rootnode || + !atomic_inc_not_zero(&sb->s_active)) { + mutex_unlock(&cgroup_mutex); + return; } - mutex_unlock(&cgroup_mutex); /* - * All new cgroups will see @cfts update on @ss->cftsets. Add/rm - * files for all cgroups which were created before. + * All cgroups which are created after we drop cgroup_mutex will + * have the updated set of files, so we only need to update the + * cgroups created before the current @cgroup_serial_nr_cursor.
*/ - list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) { - struct inode *inode = cgrp->dentry->d_inode; + update_upto = atomic64_read(&cgroup_serial_nr_cursor); + + mutex_unlock(&cgroup_mutex); + + /* @root always needs to be updated */ + inode = root->dentry->d_inode; + mutex_lock(&inode->i_mutex); + mutex_lock(&cgroup_mutex); + cgroup_addrm_files(root, ss, cfts, is_add); + mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); + + /* add/rm files for all cgroups created before */ + rcu_read_lock(); + cgroup_for_each_descendant_pre(cgrp, root) { + if (cgroup_is_dead(cgrp)) + continue; + + inode = cgrp->dentry->d_inode; + dget(cgrp->dentry); + rcu_read_unlock(); + + dput(prev); + prev = cgrp->dentry; mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); - if (!cgroup_is_dead(cgrp)) + if (cgrp->serial_nr <= update_upto && !cgroup_is_dead(cgrp)) cgroup_addrm_files(cgrp, ss, cfts, is_add); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); - list_del_init(&cgrp->cft_q_node); - dput(cgrp->dentry); + rcu_read_lock(); } - - if (sb) - deactivate_super(sb); - - mutex_unlock(&cgroup_cft_mutex); + rcu_read_unlock(); + dput(prev); + deactivate_super(sb); } /** @@ -4320,7 +4330,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, cgrp->serial_nr = atomic64_inc_return(&cgroup_serial_nr_cursor); /* allocation complete, commit to creation */ - list_add_tail(&cgrp->allcg_node, &root->allcg_list); list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; @@ -4559,7 +4568,6 @@ static void cgroup_offline_fn(struct work_struct *work) /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); - list_del_init(&cgrp->allcg_node); dput(d); -- cgit v1.2.3
From 03c78cbebb323fc97295ff97dc5e009d56371d57 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 14 Jun 2013 11:17:19 +0800 Subject: cgroup: rename cont to cgrp Cont is short for container. The control group facility was named "process container" at first, but then people found that "container" already had a meaning in the Linux kernel. Clean up the leftover variable name @cont. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 4 ++-- kernel/cgroup.c | 22 +++++++++++----------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b28365890646..6c2ba52fc5d4 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -433,13 +433,13 @@ struct cftype { * entry. The key/value pairs (and their ordering) should not * change between reboots. */ - int (*read_map)(struct cgroup *cont, struct cftype *cft, + int (*read_map)(struct cgroup *cgrp, struct cftype *cft, struct cgroup_map_cb *cb); /* * read_seq_string() is used for outputting a simple sequence * using seqfile.
*/ - int (*read_seq_string)(struct cgroup *cont, struct cftype *cft, + int (*read_seq_string)(struct cgroup *cgrp, struct cftype *cft, struct seq_file *m); ssize_t (*write)(struct cgroup *cgrp, struct cftype *cft, diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 65f333ebb572..1051c1f69674 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5515,7 +5515,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) } #ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) +static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) { struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); @@ -5525,23 +5525,23 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) return css; } -static void debug_css_free(struct cgroup *cont) +static void debug_css_free(struct cgroup *cgrp) { - kfree(cont->subsys[debug_subsys_id]); + kfree(cgrp->subsys[debug_subsys_id]); } -static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) +static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) { - return cgroup_task_count(cont); + return cgroup_task_count(cgrp); } -static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) +static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft) { return (u64)(unsigned long)current->cgroups; } -static u64 current_css_set_refcount_read(struct cgroup *cont, - struct cftype *cft) +static u64 current_css_set_refcount_read(struct cgroup *cgrp, + struct cftype *cft) { u64 count; @@ -5551,7 +5551,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cont, return count; } -static int current_css_set_cg_links_read(struct cgroup *cont, +static int current_css_set_cg_links_read(struct cgroup *cgrp, struct cftype *cft, struct seq_file *seq) { @@ -5578,14 +5578,14 @@ static int current_css_set_cg_links_read(struct cgroup *cont, } #define MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct cgroup *cont, +static int cgroup_css_links_read(struct cgroup *cgrp, struct cftype *cft, struct seq_file *seq) { struct cgrp_cset_link *link; read_lock(&css_set_lock); - list_for_each_entry(link, &cont->cset_links, cset_link) { + list_for_each_entry(link, &cgrp->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; int count = 0; -- cgit v1.2.3 From 02c402d98588bdfd3bebd267db574e13afdef722 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 24 Jun 2013 15:21:47 -0700 Subject: cgroup: convert CFTYPE_* flags to enums Purely cosmetic. 
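Either spelling is consumed the same way by controllers. A hypothetical cftype table (not from this series) using one of these flags might look as follows; my_stat_show() is an invented handler matching the read_seq_string() signature shown earlier:

        #include <linux/cgroup.h>
        #include <linux/seq_file.h>

        static int my_stat_show(struct cgroup *cgrp, struct cftype *cft,
                                struct seq_file *m)
        {
                seq_puts(m, "ok\n");    /* placeholder output */
                return 0;
        }

        static struct cftype my_files[] = {
                {
                        .name = "stat",
                        .flags = CFTYPE_NOT_ON_ROOT,    /* skip the root cgroup */
                        .read_seq_string = my_stat_show,
                },
                { }     /* terminator */
        };

Enums keep the constants visible to debuggers and avoid the preprocessor, which is the usual motivation for this kind of conversion.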
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 6c2ba52fc5d4..ab27001a2c4a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -385,9 +385,11 @@ struct cgroup_map_cb { */ /* cftype->flags */ -#define CFTYPE_ONLY_ON_ROOT (1U << 0) /* only create on root cg */ -#define CFTYPE_NOT_ON_ROOT (1U << 1) /* don't create on root cg */ -#define CFTYPE_INSANE (1U << 2) /* don't create if sane_behavior */ +enum { + CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cg */ + CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cg */ + CFTYPE_INSANE = (1 << 2), /* don't create if sane_behavior */ +}; #define MAX_CFTYPE_NAME 64 -- cgit v1.2.3
From a8a648c4acee2095262f7fa65b0d8a68a03c32e4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 24 Jun 2013 15:21:47 -0700 Subject: cgroup: remove cgroup->actual_subsys_mask cgroup curiously has two subsystem masks, ->subsys_mask and ->actual_subsys_mask. The latter only exists because the new target subsys_mask is passed into rebind_subsystems() via @root->subsys_mask. rebind_subsystems() needs to know what the current mask is to decide how to reach the target mask, so ->actual_subsys_mask is used as the temp location to remember the current state. Adding a temporary field to a permanent data structure is rather silly and can be misleading. Update rebind_subsystems() to take @added_mask and @removed_mask instead and remove @root->actual_subsys_mask. This patch shouldn't introduce any behavior changes. v2: Comment and description updated as suggested by Li. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 8 +------- kernel/cgroup.c | 22 ++++++++++++---------- 2 files changed, 13 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ab27001a2c4a..4c1eceb8c439 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -286,18 +286,12 @@ struct cgroupfs_root { struct super_block *sb; - /* - * The bitmask of subsystems intended to be attached to this - * hierarchy - */ + /* The bitmask of subsystems attached to this hierarchy */ unsigned long subsys_mask; /* Unique id for this hierarchy. */ int hierarchy_id; - /* The bitmask of subsystems currently attached to this hierarchy */ - unsigned long actual_subsys_mask; - /* A list running through the attached subsystems */ struct list_head subsys_list; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8f296b83b6a3..67fc953c816a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -986,17 +986,14 @@ static void cgroup_d_remove_dir(struct dentry *dentry) * returns an error, no reference counts are touched.
*/ static int rebind_subsystems(struct cgroupfs_root *root, - unsigned long final_subsys_mask) + unsigned long added_mask, unsigned removed_mask) { - unsigned long added_mask, removed_mask; struct cgroup *cgrp = &root->top_cgroup; int i; BUG_ON(!mutex_is_locked(&cgroup_mutex)); BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); - removed_mask = root->actual_subsys_mask & ~final_subsys_mask; - added_mask = final_subsys_mask & ~root->actual_subsys_mask; /* Check that any added subsystems are currently free */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { unsigned long bit = 1UL << i; @@ -1032,27 +1029,33 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]); BUG_ON(!cgroup_dummy_top->subsys[i]); BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top); + cgrp->subsys[i] = cgroup_dummy_top->subsys[i]; cgrp->subsys[i]->cgroup = cgrp; list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) ss->bind(cgrp); + /* refcount was already taken, and we're keeping it */ + root->subsys_mask |= bit; } else if (bit & removed_mask) { /* We're removing this subsystem */ BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]); BUG_ON(cgrp->subsys[i]->cgroup != cgrp); + if (ss->bind) ss->bind(cgroup_dummy_top); cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top; cgrp->subsys[i] = NULL; cgroup_subsys[i]->root = &cgroup_dummy_root; list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); + /* subsystem is now free - drop reference on module */ module_put(ss->module); - } else if (bit & final_subsys_mask) { + root->subsys_mask &= ~bit; + } else if (bit & root->subsys_mask) { /* Subsystem state should already exist */ BUG_ON(ss == NULL); BUG_ON(!cgrp->subsys[i]); @@ -1069,7 +1072,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]); } } - root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; return 0; } @@ -1343,7 +1345,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) if (ret) goto out_unlock; - if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) + if (opts.subsys_mask != root->subsys_mask || opts.release_agent) pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); @@ -1365,7 +1367,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) */ cgroup_clear_directory(cgrp->dentry, false, removed_mask); - ret = rebind_subsystems(root, opts.subsys_mask); + ret = rebind_subsystems(root, added_mask, removed_mask); if (ret) { /* rebind_subsystems failed, re-populate the removed files */ cgroup_populate_dir(cgrp, false, removed_mask); @@ -1634,7 +1636,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (ret) goto unlock_drop; - ret = rebind_subsystems(root, root->subsys_mask); + ret = rebind_subsystems(root, root->subsys_mask, 0); if (ret == -EBUSY) { free_cgrp_cset_links(&tmp_links); goto unlock_drop; @@ -1727,7 +1729,7 @@ static void cgroup_kill_sb(struct super_block *sb) { mutex_lock(&cgroup_root_mutex); /* Rebind all subsystems back to the default hierarchy */ - ret = rebind_subsystems(root, 0); + ret = rebind_subsystems(root, 0, root->subsys_mask); /* Shouldn't be able to fail ... 
*/ BUG_ON(ret); -- cgit v1.2.3
From 1672d040709b789671c0502e7aac9d632c2f9175 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Jun 2013 18:04:54 -0700 Subject: cgroup: fix cgroupfs_root early destruction path cgroupfs_root used to have ->actual_subsys_mask in addition to ->subsys_mask. a8a648c4ac ("cgroup: remove cgroup->actual_subsys_mask") removed it noting that the subsys_mask is essentially temporary and doesn't belong in cgroupfs_root; however, the patch made it impossible to tell whether a cgroupfs_root actually has the subsystems bound or just has the bits set, leading to the following BUG when trying to mount with subsystems which are already mounted elsewhere. kernel BUG at kernel/cgroup.c:1038! invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC ... CPU: 1 PID: 7973 Comm: mount Tainted: G W 3.10.0-rc7-next-20130625-sasha-00011-g1c1dc0e #1105 task: ffff880fc0ae8000 ti: ffff880fc0b9a000 task.ti: ffff880fc0b9a000 RIP: 0010:[] [] rebind_subsystems+0x409/0x5f0 ... Call Trace: [] cgroup_kill_sb+0xff/0x210 [] deactivate_locked_super+0x4f/0x90 [] cgroup_mount+0x673/0x6e0 [] cpuset_mount+0xd9/0x110 [] mount_fs+0xb0/0x2d0 [] vfs_kern_mount+0xbd/0x180 [] do_new_mount+0x145/0x2c0 [] do_mount+0x356/0x3c0 [] SyS_mount+0xfd/0x140 [] tracesys+0xdd/0xe2 We still want rebind_subsystems() to take added/removed masks, so let's fix it by marking whether a cgroupfs_root has finished binding or not. Also, document what's going on around ->subsys_mask initialization so that similar mistakes aren't repeated. Signed-off-by: Tejun Heo Reported-by: Sasha Levin Acked-by: Li Zefan --- include/linux/cgroup.h | 1 + kernel/cgroup.c | 22 +++++++++++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4c1eceb8c439..8e4fd5e67384 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -276,6 +276,7 @@ enum { CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ + CGRP_ROOT_SUBSYS_BOUND = (1 << 3), /* subsystems finished binding */ }; /* diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f9c99abc38ab..e801ecfa36ef 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1086,6 +1086,12 @@ static int rebind_subsystems(struct cgroupfs_root *root, } } + /* + * Mark @root has finished binding subsystems. @root->subsys_mask + * now matches the bound subsystems. + */ + root->flags |= CGRP_ROOT_SUBSYS_BOUND; + return 0; } @@ -1485,6 +1491,14 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) init_cgroup_root(root); + /* + * We need to set @root->subsys_mask now so that @root can be + * matched by cgroup_test_super() before it finishes + * initialization; otherwise, competing mounts with the same + * options may try to bind the same subsystems instead of waiting + * for the first one leading to unexpected mount errors. + * SUBSYS_BOUND will be set once actual binding is complete. + */ root->subsys_mask = opts->subsys_mask; root->flags = opts->flags; ida_init(&root->cgroup_ida); @@ -1734,9 +1748,11 @@ static void cgroup_kill_sb(struct super_block *sb) { mutex_lock(&cgroup_root_mutex); /* Rebind all subsystems back to the default hierarchy */ - ret = rebind_subsystems(root, 0, root->subsys_mask); - /* Shouldn't be able to fail ... */ - BUG_ON(ret); + if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { + ret = rebind_subsystems(root, 0, root->subsys_mask); + /* Shouldn't be able to fail ...
*/ + BUG_ON(ret); + } /* * Release all the links from cset_links to this hierarchy's -- cgit v1.2.3
From 14611e51a57df10240817d8ada510842faf0ec51 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Jun 2013 11:48:32 -0700 Subject: cgroup: fix RCU accesses to task->cgroups task->cgroups is an RCU pointer pointing to struct css_set. A task switches to a different css_set on cgroup migration but a css_set doesn't change once created and its pointers to cgroup_subsys_states aren't RCU protected. task_subsys_state[_check]() is the macro to acquire a css given a task and subsys_id pair. It RCU-dereferences task->cgroups->subsys[], not task->cgroups, so the RCU pointer task->cgroups ends up being dereferenced without read_barrier_depends() after it. It's broken. Fix it by introducing task_css_set[_check]() which does the RCU dereference on task->cgroups. task_subsys_state[_check]() is reimplemented to directly dereference ->subsys[] of the css_set returned from task_css_set[_check](). This removes some of the sparse RCU warnings in cgroup. v2: Fixed an unbalanced parenthesis and there's no need to use rcu_dereference_raw() when !CONFIG_PROVE_RCU. Both spotted by Li.
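Reduced to a sketch, the broken and fixed dereference chains differ as follows (a paraphrase of the change, not patch text; subsys_id stands for any controller ID):

        static struct cgroup_subsys_state *
        get_css(struct task_struct *task, int subsys_id)
        {
                /* broken: only ->subsys[] was RCU-dereferenced, while the RCU
                 * pointer task->cgroups itself was read with a plain load,
                 * putting the dependency barrier on the wrong dereference:
                 *
                 *      return rcu_dereference(task->cgroups->subsys[subsys_id]);
                 *
                 * fixed: RCU-dereference the css_set pointer itself; a
                 * css_set's ->subsys[] never changes after creation, so once
                 * the css_set is safely loaded a plain read of the array is
                 * sufficient.
                 */
                return task_css_set(task)->subsys[subsys_id];
        }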
Signed-off-by: Tejun Heo Reported-by: Fengguang Wu Acked-by: Li Zefan Cc: stable@vger.kernel.org --- include/linux/cgroup.h | 58 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 8e4fd5e67384..ad3555bc21f4 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -635,22 +635,60 @@ static inline struct cgroup_subsys_state *cgroup_subsys_state( return cgrp->subsys[subsys_id]; } -/* - * function to get the cgroup_subsys_state which allows for extra - * rcu_dereference_check() conditions, such as locks used during the - * cgroup_subsys::attach() methods. +/** + * task_css_set_check - obtain a task's css_set with extra access conditions + * @task: the task to obtain css_set for + * @__c: extra condition expression to be passed to rcu_dereference_check() + * + * A task's css_set is RCU protected, initialized and exited while holding + * task_lock(), and can only be modified while holding both cgroup_mutex + * and task_lock() while the task is alive. This macro verifies that the + * caller is inside proper critical section and returns @task's css_set. + * + * The caller can also specify additional allowed conditions via @__c, such + * as locks used during the cgroup_subsys::attach() methods. */ #ifdef CONFIG_PROVE_RCU extern struct mutex cgroup_mutex; -#define task_subsys_state_check(task, subsys_id, __c) \ - rcu_dereference_check((task)->cgroups->subsys[(subsys_id)], \ - lockdep_is_held(&(task)->alloc_lock) || \ - lockdep_is_held(&cgroup_mutex) || (__c)) +#define task_css_set_check(task, __c) \ + rcu_dereference_check((task)->cgroups, \ + lockdep_is_held(&(task)->alloc_lock) || \ + lockdep_is_held(&cgroup_mutex) || (__c)) #else -#define task_subsys_state_check(task, subsys_id, __c) \ - rcu_dereference((task)->cgroups->subsys[(subsys_id)]) +#define task_css_set_check(task, __c) \ + rcu_dereference((task)->cgroups) #endif +/** + * task_subsys_state_check - obtain css for (task, subsys) w/ extra access conds + * @task: the target task + * @subsys_id: the target subsystem ID + * @__c: extra condition expression to be passed to rcu_dereference_check() + * + * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The + * synchronization rules are the same as task_css_set_check(). + */ +#define task_subsys_state_check(task, subsys_id, __c) \ + task_css_set_check((task), (__c))->subsys[(subsys_id)] + +/** + * task_css_set - obtain a task's css_set + * @task: the task to obtain css_set for + * + * See task_css_set_check(). + */ +static inline struct css_set *task_css_set(struct task_struct *task) +{ + return task_css_set_check(task, false); +} + +/** + * task_subsys_state - obtain css for (task, subsys) + * @task: the target task + * @subsys_id: the target subsystem ID + * + * See task_subsys_state_check(). + */ static inline struct cgroup_subsys_state * task_subsys_state(struct task_struct *task, int subsys_id) { -- cgit v1.2.3
From 0ce6cba35777cf96a54ce0d5856dc962566b8717 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 27 Jun 2013 19:37:26 -0700 Subject: cgroup: CGRP_ROOT_SUBSYS_BOUND should be ignored when comparing mount options 1672d04070 ("cgroup: fix cgroupfs_root early destruction path") introduced CGRP_ROOT_SUBSYS_BOUND which is used to mark completion of subsys binding on a new root; however, this broke remounts. cgroup_remount() doesn't allow changing root options via remount and CGRP_ROOT_SUBSYS_BOUND, which is set on all fully initialized roots, makes the function reject all remounts. Fix it by putting the options part in the lower 16 bits of root->flags and masking the comparison. While at it, make cgroup_remount() emit an error message explaining why it's rejecting a remount request, so that it's less of a mystery. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 6 +++++- kernel/cgroup.c | 5 ++++- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ad3555bc21f4..8db53974f7b5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -276,7 +276,11 @@ enum { CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ - CGRP_ROOT_SUBSYS_BOUND = (1 << 3), /* subsystems finished binding */ + + /* mount options live below bit 16 */ + CGRP_ROOT_OPTION_MASK = (1 << 16) - 1, + + CGRP_ROOT_SUBSYS_BOUND = (1 << 16), /* subsystems finished binding */ }; /* diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1b7b567208cd..5a2fcf5bcc4a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1362,8 +1362,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) removed_mask = root->subsys_mask & ~opts.subsys_mask; /* Don't allow flags or name to change at remount */ - if (opts.flags != root->flags || + if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || (opts.name && strcmp(opts.name, root->name))) { + pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", + opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", + root->flags & CGRP_ROOT_OPTION_MASK, root->name); ret = -EINVAL; goto out_unlock; } -- cgit v1.2.3
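A closing note on the comparison idiom in that last hunk: XOR yields the bits on which two flag words differ, and ANDing with the option mask restricts the check to option bits, so internal state such as CGRP_ROOT_SUBSYS_BOUND can never cause a spurious mismatch. A standalone illustration with invented values:

        #include <stdio.h>

        #define OPTION_MASK     ((1u << 16) - 1)        /* options live below bit 16 */
        #define SUBSYS_BOUND    (1u << 16)              /* internal state, not an option */

        int main(void)
        {
                unsigned int root_flags = 0x5 | SUBSYS_BOUND;   /* bound root, options 0x5 */
                unsigned int remount_opts = 0x5;                /* same options requested */

                /* differing bits, restricted to the option range */
                if ((root_flags ^ remount_opts) & OPTION_MASK)
                        printf("reject remount\n");
                else
                        printf("allow remount\n");      /* printed: only SUBSYS_BOUND differs */
                return 0;
        }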