From 788b950c62e06b02278a0fd380e1a0667996ce3c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 16 Jul 2017 21:43:33 -0400 Subject: cgroup: distinguish local and children populated states cgrp->populated_cnt counts both local (the cgroup's populated css_sets) and subtree proper (populated children) so that it's only zero when the whole subtree, including self, is empty. This patch splits the counter into two so that local and children populated states are tracked separately. It allows finer-grained tests on the state of the hierarchy which will be used to replace css_set walking local populated test. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 710a005c6b7a..308b10797a54 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -537,7 +537,7 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { - return cgrp->populated_cnt; + return cgrp->nr_populated_csets + cgrp->nr_populated_children; } /* returns ino associated with a cgroup */ -- cgit v1.2.3 From bc2fb7ed089ffd16d26e1d95b898a37d2b37d201 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 15 May 2017 09:34:01 -0400 Subject: cgroup: add @flags to css_task_iter_start() and implement CSS_TASK_ITER_PROCS css_task_iter currently always walks all tasks. With the scheduled cgroup v2 thread support, the iterator would need to handle multiple types of iteration. As a preparation, add @flags to css_task_iter_start() and implement CSS_TASK_ITER_PROCS. If the flag is not specified, it walks all tasks as before. When asserted, the iterator only walks the group leaders. For now, the only user of the flag is cgroup v2 "cgroup.procs" file which no longer needs to skip non-leader tasks in cgroup_procs_next(). Note that cgroup v1 "cgroup.procs" can't use the group leader walk as v1 "cgroup.procs" doesn't mean "list all thread group leaders in the cgroup" but "list all thread group id's with any threads in the cgroup". While at it, update cgroup_procs_show() to use task_pid_vnr() instead of task_tgid_vnr(). As the iteration guarantees that the function only sees group leaders, this doesn't change the output and will allow sharing the function for thread iteration. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 6 +++++- kernel/cgroup/cgroup-v1.c | 6 +++--- kernel/cgroup/cgroup.c | 24 ++++++++++++++---------- kernel/cgroup/cpuset.c | 6 +++--- kernel/cgroup/freezer.c | 6 +++--- mm/memcontrol.c | 2 +- net/core/netclassid_cgroup.c | 2 +- 7 files changed, 30 insertions(+), 22 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 308b10797a54..cae5831ae650 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -36,9 +36,13 @@ #define CGROUP_WEIGHT_DFL 100 #define CGROUP_WEIGHT_MAX 10000 +/* walk only threadgroup leaders */ +#define CSS_TASK_ITER_PROCS (1U << 0) + /* a css_task_iter should be treated as an opaque object */ struct css_task_iter { struct cgroup_subsys *ss; + unsigned int flags; struct list_head *cset_pos; struct list_head *cset_head; @@ -129,7 +133,7 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); -void css_task_iter_start(struct cgroup_subsys_state *css, +void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it); struct task_struct *css_task_iter_next(struct css_task_iter *it); void css_task_iter_end(struct css_task_iter *it); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 60f72475863e..167aaab04bf9 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -121,7 +121,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) * ->can_attach() fails. */ do { - css_task_iter_start(&from->self, &it); + css_task_iter_start(&from->self, 0, &it); task = css_task_iter_next(&it); if (task) get_task_struct(task); @@ -373,7 +373,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (!array) return -ENOMEM; /* now, populate the array */ - css_task_iter_start(&cgrp->self, &it); + css_task_iter_start(&cgrp->self, 0, &it); while ((tsk = css_task_iter_next(&it))) { if (unlikely(n == length)) break; @@ -749,7 +749,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } rcu_read_unlock(); - css_task_iter_start(&cgrp->self, &it); + css_task_iter_start(&cgrp->self, 0, &it); while ((tsk = css_task_iter_next(&it))) { switch (tsk->state) { case TASK_RUNNING: diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e3bda0752501..3c5a37a9a892 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3643,6 +3643,7 @@ static void css_task_iter_advance(struct css_task_iter *it) lockdep_assert_held(&css_set_lock); WARN_ON_ONCE(!l); +repeat: /* * Advance iterator to find next entry. cset->tasks is consumed * first and then ->mg_tasks. After ->mg_tasks, we move onto the @@ -3657,11 +3658,18 @@ static void css_task_iter_advance(struct css_task_iter *it) css_task_iter_advance_css_set(it); else it->task_pos = l; + + /* if PROCS, skip over tasks which aren't group leaders */ + if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && + !thread_group_leader(list_entry(it->task_pos, struct task_struct, + cg_list))) + goto repeat; } /** * css_task_iter_start - initiate task iteration * @css: the css to walk tasks of + * @flags: CSS_TASK_ITER_* flags * @it: the task iterator to use * * Initiate iteration through the tasks of @css. The caller can call @@ -3669,7 +3677,7 @@ static void css_task_iter_advance(struct css_task_iter *it) * returns NULL. On completion of iteration, css_task_iter_end() must be * called. */ -void css_task_iter_start(struct cgroup_subsys_state *css, +void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it) { /* no one should try to iterate before mounting cgroups */ @@ -3680,6 +3688,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, spin_lock_irq(&css_set_lock); it->ss = css->ss; + it->flags = flags; if (it->ss) it->cset_pos = &css->cgroup->e_csets[css->ss->id]; @@ -3753,13 +3762,8 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; struct css_task_iter *it = of->priv; - struct task_struct *task; - - do { - task = css_task_iter_next(it); - } while (task && !thread_group_leader(task)); - return task; + return css_task_iter_next(it); } static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) @@ -3780,10 +3784,10 @@ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) if (!it) return ERR_PTR(-ENOMEM); of->priv = it; - css_task_iter_start(&cgrp->self, it); + css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS, it); } else if (!(*pos)++) { css_task_iter_end(it); - css_task_iter_start(&cgrp->self, it); + css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS, it); } return cgroup_procs_next(s, NULL, NULL); @@ -3791,7 +3795,7 @@ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) static int cgroup_procs_show(struct seq_file *s, void *v) { - seq_printf(s, "%d\n", task_tgid_vnr(v)); + seq_printf(s, "%d\n", task_pid_vnr(v)); return 0; } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index ca8376e5008c..252d70c9a49b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -861,7 +861,7 @@ static void update_tasks_cpumask(struct cpuset *cs) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&cs->css, &it); + css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) set_cpus_allowed_ptr(task, cs->effective_cpus); css_task_iter_end(&it); @@ -1091,7 +1091,7 @@ static void update_tasks_nodemask(struct cpuset *cs) * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. */ - css_task_iter_start(&cs->css, &it); + css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) { struct mm_struct *mm; bool migrate; @@ -1284,7 +1284,7 @@ static void update_tasks_flags(struct cpuset *cs) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&cs->css, &it); + css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) cpuset_update_task_spread_flag(cs, task); css_task_iter_end(&it); diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 1b72d56edce5..08236798d173 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -268,7 +268,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) rcu_read_unlock(); /* are all tasks frozen? */ - css_task_iter_start(css, &it); + css_task_iter_start(css, 0, &it); while ((task = css_task_iter_next(&it))) { if (freezing(task)) { @@ -320,7 +320,7 @@ static void freeze_cgroup(struct freezer *freezer) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&freezer->css, &it); + css_task_iter_start(&freezer->css, 0, &it); while ((task = css_task_iter_next(&it))) freeze_task(task); css_task_iter_end(&it); @@ -331,7 +331,7 @@ static void unfreeze_cgroup(struct freezer *freezer) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&freezer->css, &it); + css_task_iter_start(&freezer->css, 0, &it); while ((task = css_task_iter_next(&it))) __thaw_task(task); css_task_iter_end(&it); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3df3c04d73ab..2b2f071f914b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -917,7 +917,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&iter->css, &it); + css_task_iter_start(&iter->css, 0, &it); while (!ret && (task = css_task_iter_next(&it))) ret = fn(task, arg); css_task_iter_end(&it); diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 029a61ac6cdd..5e4f04004a49 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -100,7 +100,7 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, cs->classid = (u32)value; - css_task_iter_start(css, &it); + css_task_iter_start(css, 0, &it); while ((p = css_task_iter_next(&it))) { task_lock(p); iterate_fd(p->files, 0, update_classid_sock, -- cgit v1.2.3 From 454000adaa2a7420df6e56a42f22726d05872a3f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 15 May 2017 09:34:02 -0400 Subject: cgroup: introduce cgroup->dom_cgrp and threaded css_set handling cgroup v2 is in the process of growing thread granularity support. A threaded subtree is composed of a thread root and threaded cgroups which are proper members of the subtree. The root cgroup of the subtree serves as the domain cgroup to which the processes (as opposed to threads / tasks) of the subtree conceptually belong and domain-level resource consumptions not tied to any specific task are charged. Inside the subtree, threads won't be subject to process granularity or no-internal-task constraint and can be distributed arbitrarily across the subtree. This patch introduces cgroup->dom_cgrp along with threaded css_set handling. * cgroup->dom_cgrp points to self for normal and thread roots. For proper thread subtree members, points to the dom_cgrp (the thread root). * css_set->dom_cset points to self if for normal and thread roots. If threaded, points to the css_set which belongs to the cgrp->dom_cgrp. The dom_cgrp serves as the resource domain and keeps the matching csses available. The dom_cset holds those csses and makes them easily accessible. * All threaded csets are linked on their dom_csets to enable iteration of all threaded tasks. * cgroup->nr_threaded_children keeps track of the number of threaded children. This patch adds the above but doesn't actually use them yet. The following patches will build on top. v4: ->nr_threaded_children added. v3: ->proc_cgrp/cset renamed to ->dom_cgrp/cset. Updated for the new enable-threaded-per-cgroup behavior. v2: Added cgroup_is_threaded() helper. Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 33 +++++++++++++++++++--- include/linux/cgroup.h | 3 +- kernel/cgroup/cgroup.c | 69 +++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 97 insertions(+), 8 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ae7bc1e70085..651c4363c85e 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -172,6 +172,14 @@ struct css_set { /* reference count */ refcount_t refcount; + /* + * For a domain cgroup, the following points to self. If threaded, + * to the matching cset of the nearest domain ancestor. The + * dom_cset provides access to the domain cgroup and its csses to + * which domain level resource consumptions should be charged. + */ + struct css_set *dom_cset; + /* the default cgroup associated with this css_set */ struct cgroup *dfl_cgrp; @@ -200,6 +208,10 @@ struct css_set { */ struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; + /* all threaded csets whose ->dom_cset points to this cset */ + struct list_head threaded_csets; + struct list_head threaded_csets_node; + /* * List running through all cgroup groups in the same hash * slot. Protected by css_set_lock @@ -267,12 +279,16 @@ struct cgroup { * doesn't have any tasks. * * All children which have non-zero nr_populated_csets and/or - * nr_populated_children of their own contribute one to - * nr_populated_children. The counter is zero iff this cgroup's - * subtree proper doesn't have any tasks. + * nr_populated_children of their own contribute one to either + * nr_populated_domain_children or nr_populated_threaded_children + * depending on their type. Each counter is zero iff all cgroups + * of the type in the subtree proper don't have any tasks. */ int nr_populated_csets; - int nr_populated_children; + int nr_populated_domain_children; + int nr_populated_threaded_children; + + int nr_threaded_children; /* # of live threaded child cgroups */ struct kernfs_node *kn; /* cgroup kernfs entry */ struct cgroup_file procs_file; /* handle for "cgroup.procs" */ @@ -310,6 +326,15 @@ struct cgroup { */ struct list_head e_csets[CGROUP_SUBSYS_COUNT]; + /* + * If !threaded, self. If threaded, it points to the nearest + * domain ancestor. Inside a threaded subtree, cgroups are exempt + * from process granularity and no-internal-task constraint. + * Domain level resource consumptions which aren't tied to a + * specific task are charged to the dom_cgrp. + */ + struct cgroup *dom_cgrp; + /* * list of pidlists, up to two for each namespace (one for procs, one * for tasks); created on demand. diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index cae5831ae650..b7dd23040cd5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -541,7 +541,8 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { - return cgrp->nr_populated_csets + cgrp->nr_populated_children; + return cgrp->nr_populated_csets + cgrp->nr_populated_domain_children + + cgrp->nr_populated_threaded_children; } /* returns ino associated with a cgroup */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3c5a37a9a892..c7e1c243b77d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -330,6 +330,11 @@ static bool cgroup_has_tasks(struct cgroup *cgrp) return cgrp->nr_populated_csets; } +static bool cgroup_is_threaded(struct cgroup *cgrp) +{ + return cgrp->dom_cgrp != cgrp; +} + /* subsystems visibly enabled on a cgroup */ static u16 cgroup_control(struct cgroup *cgrp) { @@ -565,9 +570,11 @@ EXPORT_SYMBOL_GPL(of_css); */ struct css_set init_css_set = { .refcount = REFCOUNT_INIT(1), + .dom_cset = &init_css_set, .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), + .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), @@ -575,6 +582,11 @@ struct css_set init_css_set = { static int css_set_count = 1; /* 1 for init_css_set */ +static bool css_set_threaded(struct css_set *cset) +{ + return cset->dom_cset != cset; +} + /** * css_set_populated - does a css_set contain any tasks? * @cset: target css_set @@ -618,10 +630,14 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) do { bool was_populated = cgroup_is_populated(cgrp); - if (!child) + if (!child) { cgrp->nr_populated_csets += adj; - else - cgrp->nr_populated_children += adj; + } else { + if (cgroup_is_threaded(child)) + cgrp->nr_populated_threaded_children += adj; + else + cgrp->nr_populated_domain_children += adj; + } if (was_populated == cgroup_is_populated(cgrp)) break; @@ -747,6 +763,8 @@ void put_css_set_locked(struct css_set *cset) if (!refcount_dec_and_test(&cset->refcount)) return; + WARN_ON_ONCE(!list_empty(&cset->threaded_csets)); + /* This css_set is dead. unlink it and release cgroup and css refs */ for_each_subsys(ss, ssid) { list_del(&cset->e_cset_node[ssid]); @@ -763,6 +781,11 @@ void put_css_set_locked(struct css_set *cset) kfree(link); } + if (css_set_threaded(cset)) { + list_del(&cset->threaded_csets_node); + put_css_set_locked(cset->dom_cset); + } + kfree_rcu(cset, rcu_head); } @@ -781,6 +804,7 @@ static bool compare_css_sets(struct css_set *cset, struct cgroup *new_cgrp, struct cgroup_subsys_state *template[]) { + struct cgroup *new_dfl_cgrp; struct list_head *l1, *l2; /* @@ -791,6 +815,16 @@ static bool compare_css_sets(struct css_set *cset, if (memcmp(template, cset->subsys, sizeof(cset->subsys))) return false; + + /* @cset's domain should match the default cgroup's */ + if (cgroup_on_dfl(new_cgrp)) + new_dfl_cgrp = new_cgrp; + else + new_dfl_cgrp = old_cset->dfl_cgrp; + + if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp) + return false; + /* * Compare cgroup pointers in order to distinguish between * different cgroups in hierarchies. As different cgroups may @@ -998,9 +1032,11 @@ static struct css_set *find_css_set(struct css_set *old_cset, } refcount_set(&cset->refcount, 1); + cset->dom_cset = cset; INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); INIT_LIST_HEAD(&cset->task_iters); + INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->mg_preload_node); @@ -1038,6 +1074,28 @@ static struct css_set *find_css_set(struct css_set *old_cset, spin_unlock_irq(&css_set_lock); + /* + * If @cset should be threaded, look up the matching dom_cset and + * link them up. We first fully initialize @cset then look for the + * dom_cset. It's simpler this way and safe as @cset is guaranteed + * to stay empty until we return. + */ + if (cgroup_is_threaded(cset->dfl_cgrp)) { + struct css_set *dcset; + + dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp); + if (!dcset) { + put_css_set(cset); + return NULL; + } + + spin_lock_irq(&css_set_lock); + cset->dom_cset = dcset; + list_add_tail(&cset->threaded_csets_node, + &dcset->threaded_csets); + spin_unlock_irq(&css_set_lock); + } + return cset; } @@ -1680,6 +1738,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) mutex_init(&cgrp->pidlist_mutex); cgrp->self.cgroup = cgrp; cgrp->self.flags |= CSS_ONLINE; + cgrp->dom_cgrp = cgrp; for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); @@ -4408,6 +4467,7 @@ static void kill_css(struct cgroup_subsys_state *css) static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { + struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; int ssid; @@ -4452,6 +4512,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ kernfs_remove(cgrp->kn); + if (parent && cgroup_is_threaded(cgrp)) + parent->nr_threaded_children--; + cgroup1_check_for_release(cgroup_parent(cgrp)); /* put the base reference */ -- cgit v1.2.3 From 450ee0c1feed657894e0b4bdd48f3974af9d394c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 15 May 2017 09:34:03 -0400 Subject: cgroup: implement CSS_TASK_ITER_THREADED cgroup v2 is in the process of growing thread granularity support. Once thread mode is enabled, the root cgroup of the subtree serves as the dom_cgrp to which the processes of the subtree conceptually belong and domain-level resource consumptions not tied to any specific task are charged. In the subtree, threads won't be subject to process granularity or no-internal-task constraint and can be distributed arbitrarily across the subtree. This patch implements a new task iterator flag CSS_TASK_ITER_THREADED, which, when used on a dom_cgrp, makes the iteration include the tasks on all the associated threaded css_sets. "cgroup.procs" read path is updated to use it so that reading the file on a proc_cgrp lists all processes. This will also be used by controller implementations which need to walk processes or tasks at the resource domain level. Task iteration is implemented nested in css_set iteration. If CSS_TASK_ITER_THREADED is specified, after walking tasks of each !threaded css_set, all the associated threaded css_sets are visited before moving onto the next !threaded css_set. v2: ->cur_pcset renamed to ->cur_dcset. Updated for the new enable-threaded-per-cgroup behavior. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 6 ++++ kernel/cgroup/cgroup.c | 77 +++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 66 insertions(+), 17 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b7dd23040cd5..79faa6467f76 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -38,6 +38,8 @@ /* walk only threadgroup leaders */ #define CSS_TASK_ITER_PROCS (1U << 0) +/* walk all threaded css_sets in the domain */ +#define CSS_TASK_ITER_THREADED (1U << 1) /* a css_task_iter should be treated as an opaque object */ struct css_task_iter { @@ -47,11 +49,15 @@ struct css_task_iter { struct list_head *cset_pos; struct list_head *cset_head; + struct list_head *tcset_pos; + struct list_head *tcset_head; + struct list_head *task_pos; struct list_head *tasks_head; struct list_head *mg_tasks_head; struct css_set *cur_cset; + struct css_set *cur_dcset; struct task_struct *cur_task; struct list_head iters_node; /* css_set->task_iters */ }; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c7e1c243b77d..a1d59af274a9 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3629,6 +3629,58 @@ bool css_has_online_children(struct cgroup_subsys_state *css) return ret; } +static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it) +{ + struct list_head *l; + struct cgrp_cset_link *link; + struct css_set *cset; + + lockdep_assert_held(&css_set_lock); + + /* find the next threaded cset */ + if (it->tcset_pos) { + l = it->tcset_pos->next; + + if (l != it->tcset_head) { + it->tcset_pos = l; + return container_of(l, struct css_set, + threaded_csets_node); + } + + it->tcset_pos = NULL; + } + + /* find the next cset */ + l = it->cset_pos; + l = l->next; + if (l == it->cset_head) { + it->cset_pos = NULL; + return NULL; + } + + if (it->ss) { + cset = container_of(l, struct css_set, e_cset_node[it->ss->id]); + } else { + link = list_entry(l, struct cgrp_cset_link, cset_link); + cset = link->cset; + } + + it->cset_pos = l; + + /* initialize threaded css_set walking */ + if (it->flags & CSS_TASK_ITER_THREADED) { + if (it->cur_dcset) + put_css_set_locked(it->cur_dcset); + it->cur_dcset = cset; + get_css_set(cset); + + it->tcset_head = &cset->threaded_csets; + it->tcset_pos = &cset->threaded_csets; + } + + return cset; +} + /** * css_task_iter_advance_css_set - advance a task itererator to the next css_set * @it: the iterator to advance @@ -3637,32 +3689,19 @@ bool css_has_online_children(struct cgroup_subsys_state *css) */ static void css_task_iter_advance_css_set(struct css_task_iter *it) { - struct list_head *l = it->cset_pos; - struct cgrp_cset_link *link; struct css_set *cset; lockdep_assert_held(&css_set_lock); /* Advance to the next non-empty css_set */ do { - l = l->next; - if (l == it->cset_head) { - it->cset_pos = NULL; + cset = css_task_iter_next_css_set(it); + if (!cset) { it->task_pos = NULL; return; } - - if (it->ss) { - cset = container_of(l, struct css_set, - e_cset_node[it->ss->id]); - } else { - link = list_entry(l, struct cgrp_cset_link, cset_link); - cset = link->cset; - } } while (!css_set_populated(cset)); - it->cset_pos = l; - if (!list_empty(&cset->tasks)) it->task_pos = cset->tasks.next; else @@ -3805,6 +3844,9 @@ void css_task_iter_end(struct css_task_iter *it) spin_unlock_irq(&css_set_lock); } + if (it->cur_dcset) + put_css_set(it->cur_dcset); + if (it->cur_task) put_task_struct(it->cur_task); } @@ -3830,6 +3872,7 @@ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; struct css_task_iter *it = of->priv; + unsigned iter_flags = CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED; /* * When a seq_file is seeked, it's always traversed sequentially @@ -3843,10 +3886,10 @@ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) if (!it) return ERR_PTR(-ENOMEM); of->priv = it; - css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS, it); + css_task_iter_start(&cgrp->self, iter_flags, it); } else if (!(*pos)++) { css_task_iter_end(it); - css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS, it); + css_task_iter_start(&cgrp->self, iter_flags, it); } return cgroup_procs_next(s, NULL, NULL); -- cgit v1.2.3 From c53cd490b1a491ebf1d8e30da97e7231459a4208 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:50 -0700 Subject: kernfs: introduce kernfs_node_id inode number and generation can identify a kernfs node. We are going to export the identification by exportfs operations, so put ino and generation into a separate structure. It's convenient when later patches use the identification. Acked-by: Greg Kroah-Hartman Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- fs/kernfs/dir.c | 10 +++++----- fs/kernfs/file.c | 4 ++-- fs/kernfs/inode.c | 4 ++-- include/linux/cgroup.h | 2 +- include/linux/kernfs.h | 12 ++++++++++-- include/trace/events/writeback.h | 2 +- 6 files changed, 21 insertions(+), 13 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index b61a7efceb7a..89d1dc19340b 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -539,7 +539,7 @@ void kernfs_put(struct kernfs_node *kn) } kfree(kn->iattr); spin_lock(&kernfs_idr_lock); - idr_remove(&root->ino_idr, kn->ino); + idr_remove(&root->ino_idr, kn->id.ino); spin_unlock(&kernfs_idr_lock); kmem_cache_free(kernfs_node_cache, kn); @@ -645,8 +645,8 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, idr_preload_end(); if (ret < 0) goto err_out2; - kn->ino = ret; - kn->generation = gen; + kn->id.ino = ret; + kn->id.generation = gen; /* * set ino first. This barrier is paired with atomic_inc_not_zero in @@ -721,7 +721,7 @@ struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root, * before 'count'. So if 'count' is uptodate, 'ino' should be uptodate, * hence we can use 'ino' to filter stale node. */ - if (kn->ino != ino) + if (kn->id.ino != ino) goto out; rcu_read_unlock(); @@ -1654,7 +1654,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) const char *name = pos->name; unsigned int type = dt_type(pos); int len = strlen(name); - ino_t ino = pos->ino; + ino_t ino = pos->id.ino; ctx->pos = pos->hash; file->private_data = pos; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 7f90d4de86b6..744192539010 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -895,7 +895,7 @@ repeat: * have the matching @file available. Look up the inodes * and generate the events manually. */ - inode = ilookup(info->sb, kn->ino); + inode = ilookup(info->sb, kn->id.ino); if (!inode) continue; @@ -903,7 +903,7 @@ repeat: if (parent) { struct inode *p_inode; - p_inode = ilookup(info->sb, parent->ino); + p_inode = ilookup(info->sb, parent->id.ino); if (p_inode) { fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD, inode, FSNOTIFY_EVENT_INODE, kn->name, 0); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 4c8b51085a86..a34303981deb 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -220,7 +220,7 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) inode->i_private = kn; inode->i_mapping->a_ops = &kernfs_aops; inode->i_op = &kernfs_iops; - inode->i_generation = kn->generation; + inode->i_generation = kn->id.generation; set_default_inode_attr(inode, kn->mode); kernfs_refresh_inode(kn, inode); @@ -266,7 +266,7 @@ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { struct inode *inode; - inode = iget_locked(sb, kn->ino); + inode = iget_locked(sb, kn->id.ino); if (inode && (inode->i_state & I_NEW)) kernfs_init_inode(kn, inode); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 710a005c6b7a..30c68773fd1e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -543,7 +543,7 @@ static inline bool cgroup_is_populated(struct cgroup *cgrp) /* returns ino associated with a cgroup */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { - return cgrp->kn->ino; + return cgrp->kn->id.ino; } /* cft/css accessors for cftype->write() operation */ diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 8c00d28f468a..06a0c5913e1d 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -95,6 +95,15 @@ struct kernfs_elem_attr { struct kernfs_node *notify_next; /* for kernfs_notify() */ }; +/* represent a kernfs node */ +union kernfs_node_id { + struct { + u32 ino; + u32 generation; + }; + u64 id; +}; + /* * kernfs_node - the building block of kernfs hierarchy. Each and every * kernfs node is represented by single kernfs_node. Most fields are @@ -131,11 +140,10 @@ struct kernfs_node { void *priv; + union kernfs_node_id id; unsigned short flags; umode_t mode; - unsigned int ino; struct kernfs_iattrs *iattr; - u32 generation; }; /* diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 7bd8783a590f..9b57f014d79d 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -136,7 +136,7 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, static inline unsigned int __trace_wb_assign_cgroup(struct bdi_writeback *wb) { - return wb->memcg_css->cgroup->kn->ino; + return wb->memcg_css->cgroup->kn->id.ino; } static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc) -- cgit v1.2.3 From 121508df44d074245a72eda6b067478218480a40 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:52 -0700 Subject: cgroup: export fhandle info for a cgroup Add an API to export cgroup fhandle info. We don't export a full 'struct file_handle', there are unrequired info. Sepcifically, cgroup is always a directory, so we don't need a 'FILEID_INO32_GEN_PARENT' type fhandle, we only need export the inode number and generation number just like what generic_fh_to_dentry does. And we can avoid the overhead of getting an inode too, since kernfs_node_id (ino and generation) has all the info required. Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/cgroup.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 30c68773fd1e..52ef9a68ff14 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -609,6 +609,10 @@ static inline void cgroup_kthread_ready(void) current->no_cgroup_migration = 0; } +static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) +{ + return &cgrp->kn->id; +} #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; @@ -631,6 +635,10 @@ static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_init_kthreadd(void) {} static inline void cgroup_kthread_ready(void) {} +static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) +{ + return NULL; +} static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) -- cgit v1.2.3 From 69fd5c391763bd94a40dd152bc72a7f230137150 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:55 -0700 Subject: blktrace: add an option to allow displaying cgroup path By default we output cgroup id in blktrace. This adds an option to display cgroup path. Since get cgroup path is a relativly heavy operation, we don't enable it by default. with the option enabled, blktrace will output something like this: dd-1353 [007] d..2 293.015252: 8,0 /test/level D R 24 + 8 [dd] Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- fs/kernfs/mount.c | 19 +++++++++++++++++++ include/linux/cgroup.h | 6 ++++++ include/linux/kernfs.h | 2 ++ kernel/cgroup/cgroup.c | 12 ++++++++++++ kernel/trace/blktrace.c | 14 +++++++++++++- 5 files changed, 52 insertions(+), 1 deletion(-) (limited to 'include/linux/cgroup.h') diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index fa323589704f..7c452f4d83e9 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -65,6 +65,25 @@ const struct super_operations kernfs_sops = { .show_path = kernfs_sop_show_path, }; +/* + * Similar to kernfs_fh_get_inode, this one gets kernfs node from inode + * number and generation + */ +struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, + const union kernfs_node_id *id) +{ + struct kernfs_node *kn; + + kn = kernfs_find_and_get_node_by_ino(root, id->ino); + if (!kn) + return NULL; + if (kn->id.generation != id->generation) { + kernfs_put(kn); + return NULL; + } + return kn; +} + static struct inode *kernfs_fh_get_inode(struct super_block *sb, u64 ino, u32 generation) { diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 52ef9a68ff14..6144fe923b73 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -613,6 +613,9 @@ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) { return &cgrp->kn->id; } + +void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, + char *buf, size_t buflen); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; @@ -645,6 +648,9 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, { return true; } + +static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, + char *buf, size_t buflen) {} #endif /* !CONFIG_CGROUPS */ /* diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index d149361e5875..ab25c8b6d9e3 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -358,6 +358,8 @@ struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns); void kernfs_init(void); +struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, + const union kernfs_node_id *id); #else /* CONFIG_KERNFS */ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6cefa277f39c..2aba1c519138 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4701,6 +4701,18 @@ static int __init cgroup_wq_init(void) } core_initcall(cgroup_wq_init); +void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, + char *buf, size_t buflen) +{ + struct kernfs_node *kn; + + kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id); + if (!kn) + return; + kernfs_path(kn, buf, buflen); + kernfs_put(kn); +} + /* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f393d7a43695..e90974ed4532 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -48,12 +48,14 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 #define TRACE_BLK_OPT_CGROUP 0x2 +#define TRACE_BLK_OPT_CGNAME 0x4 static struct tracer_opt blk_tracer_opts[] = { /* Default disable the minimalistic output */ { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, #ifdef CONFIG_BLK_CGROUP { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) }, + { TRACER_OPT(blk_cgname, TRACE_BLK_OPT_CGNAME) }, #endif { } }; @@ -1213,7 +1215,17 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, if (has_cg) { const union kernfs_node_id *id = cgid_start(iter->ent); - trace_seq_printf(&iter->seq, "%3d,%-3d %x,%-x %2s %3s ", + if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) { + char blkcg_name_buf[NAME_MAX + 1] = "<...>"; + + cgroup_path_from_kernfs_id(id, blkcg_name_buf, + sizeof(blkcg_name_buf)); + trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ", + MAJOR(t->device), MINOR(t->device), + blkcg_name_buf, act, rwbs); + } else + trace_seq_printf(&iter->seq, + "%3d,%-3d %x,%-x %2s %3s ", MAJOR(t->device), MINOR(t->device), id->ino, id->generation, act, rwbs); } else -- cgit v1.2.3 From 3e48930cc74f0c212ee1838f89ad0ca7fcf2fea1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 11 Aug 2017 05:49:01 -0700 Subject: cgroup: misc changes Misc trivial changes to prepare for future changes. No functional difference. * Expose cgroup_get(), cgroup_tryget() and cgroup_parent(). * Implement task_dfl_cgroup() which dereferences css_set->dfl_cgrp. * Rename cgroup_stats_show() to cgroup_stat_show() for consistency with the file name. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 24 ++++++++++++++++++++++++ kernel/cgroup/cgroup.c | 23 ++--------------------- 2 files changed, 26 insertions(+), 21 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 79faa6467f76..085056e562b1 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -398,6 +398,16 @@ static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n) percpu_ref_put_many(&css->refcnt, n); } +static inline void cgroup_get(struct cgroup *cgrp) +{ + css_get(&cgrp->self); +} + +static inline bool cgroup_tryget(struct cgroup *cgrp) +{ + return css_tryget(&cgrp->self); +} + static inline void cgroup_put(struct cgroup *cgrp) { css_put(&cgrp->self); @@ -510,6 +520,20 @@ static inline struct cgroup *task_cgroup(struct task_struct *task, return task_css(task, subsys_id)->cgroup; } +static inline struct cgroup *task_dfl_cgroup(struct task_struct *task) +{ + return task_css_set(task)->dfl_cgrp; +} + +static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) +{ + struct cgroup_subsys_state *parent_css = cgrp->self.parent; + + if (parent_css) + return container_of(parent_css, struct cgroup, self); + return NULL; +} + /** * cgroup_is_descendant - test ancestry * @cgrp: the cgroup to be tested diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f5ca55db1fe1..c038ccf95b5d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -319,15 +319,6 @@ static void cgroup_idr_remove(struct idr *idr, int id) spin_unlock_bh(&cgroup_idr_lock); } -static struct cgroup *cgroup_parent(struct cgroup *cgrp) -{ - struct cgroup_subsys_state *parent_css = cgrp->self.parent; - - if (parent_css) - return container_of(parent_css, struct cgroup, self); - return NULL; -} - static bool cgroup_has_tasks(struct cgroup *cgrp) { return cgrp->nr_populated_csets; @@ -534,22 +525,12 @@ out_unlock: return css; } -static void __maybe_unused cgroup_get(struct cgroup *cgrp) -{ - css_get(&cgrp->self); -} - static void cgroup_get_live(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); css_get(&cgrp->self); } -static bool cgroup_tryget(struct cgroup *cgrp) -{ - return css_tryget(&cgrp->self); -} - struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; @@ -3306,7 +3287,7 @@ static int cgroup_events_show(struct seq_file *seq, void *v) return 0; } -static int cgroup_stats_show(struct seq_file *seq, void *v) +static int cgroup_stat_show(struct seq_file *seq, void *v) { struct cgroup *cgroup = seq_css(seq)->cgroup; @@ -4423,7 +4404,7 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cgroup.stat", - .seq_show = cgroup_stats_show, + .seq_show = cgroup_stat_show, }, { } /* terminate */ }; -- cgit v1.2.3