From edfc4c8a1edffa6849e19ffade1be8dd824989d0 Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Tue, 3 Jun 2025 17:45:27 +0200 Subject: cgroup: Drop sock_cgroup_classid() dummy implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The semantics of returning 0 when !CONFIG_CGROUP_NET_CLASSID are unclear. Since there are no callers of sock_cgroup_classid() with that config anymore, we can drop the helper entirely and force all (future) callers to handle the !CONFIG_CGROUP_NET_CLASSID case themselves. Signed-off-by: Michal Koutný Link: https://lore.kernel.org/r/Z_52r_v9-3JUzDT7@calendula/ Acked-by: Tejun Heo Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index e61687d5e496..cd7f093e34cd 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -898,14 +898,12 @@ static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd) #endif } +#ifdef CONFIG_CGROUP_NET_CLASSID static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd) { -#ifdef CONFIG_CGROUP_NET_CLASSID return READ_ONCE(skcd->classid); -#else - return 0; -#endif } +#endif static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, u16 prioidx) @@ -915,13 +913,13 @@ static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, #endif } +#ifdef CONFIG_CGROUP_NET_CLASSID static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, u32 classid) { -#ifdef CONFIG_CGROUP_NET_CLASSID WRITE_ONCE(skcd->classid, classid); -#endif } +#endif #else /* CONFIG_SOCK_CGROUP_DATA */ -- cgit v1.2.3 From 38b9342ee62e9a1a868654ba6619de9da251bd57 Mon Sep 17 00:00:00 2001 From: Vishal Chourasia Date: Fri, 6 Jun 2025 09:40:05 +0530 Subject: Documentation: cgroup: add section explaining controller availability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an "Availability" section to the Control Group v2 docs. It describes the meaning of a controller being available in a cgroup, complementing the existing "Enabling and Disabling" section. This update improves the clarity of cgroup controller management by explicitly distinguishing between: 1. Availability – when a controller is supported by the kernel and listed in "cgroup.controllers", making its interface files accessible in the cgroup's directory. 2. Enabling – when a controller is enabled by explicitly writing its name to "cgroup.subtree_control", which controls the distribution of the resource across the cgroup's immediate children. 
As an example, consider /sys/fs/cgroup # cat cgroup.controllers cpuset cpu io memory hugetlb pids misc /sys/fs/cgroup # cat cgroup.subtree_control # No controllers enabled by default /sys/fs/cgroup # echo +cpu +memory > cgroup.subtree_control # enabling "cpu" and "memory" /sys/fs/cgroup # cat cgroup.subtree_control cpu memory # cpu and memory enabled in /sys/fs/cgroup /sys/fs/cgroup # mkdir foo_cgrp /sys/fs/cgroup # cd foo_cgrp/ /sys/fs/cgroup/foo_cgrp # cat cgroup.controllers cpu memory # cpu and memory available in 'foo_cgrp' /sys/fs/cgroup/foo_cgrp # cat cgroup.subtree_control # empty by default /sys/fs/cgroup/foo_cgrp # ls cgroup.controllers cpu.max.burst memory.numa_stat cgroup.events cpu.pressure memory.oom.group cgroup.freeze cpu.stat memory.peak cgroup.kill cpu.stat.local memory.pressure cgroup.max.depth cpu.weight memory.reclaim cgroup.max.descendants cpu.weight.nice memory.stat cgroup.pressure io.pressure memory.swap.current cgroup.procs memory.current memory.swap.events cgroup.stat memory.events memory.swap.high cgroup.subtree_control memory.events.local memory.swap.max cgroup.threads memory.high memory.swap.peak cgroup.type memory.low memory.zswap.current cpu.idle memory.max memory.zswap.max cpu.max memory.min memory.zswap.writeback In this example, "cpu" and "memory" are enabled in the root cgroup, making them available in "foo_cgrp". This exposes the corresponding interface files in "foo_cgrp/", allowing resource control of processes in that cgroup. However, these controllers are not yet enabled in "foo_cgrp" itself. Once a controller is available in a cgroup, it can be used for resource control of the cgroup's processes. Acked-by: Michal Koutný Reviewed-by: Bagas Sanjaya Signed-off-by: Vishal Chourasia Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v2.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 0cc35a14afbe..31acc64e656f 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -435,6 +435,15 @@ both cgroups. Controlling Controllers ----------------------- +Availability +~~~~~~~~~~~~ + +A controller is available in a cgroup when it is supported by the kernel (i.e., +compiled in, not disabled and not attached to a v1 hierarchy) and listed in the +"cgroup.controllers" file. Availability means the controller's interface files +are exposed in the cgroup's directory, allowing the distribution of the target +resource to be observed or controlled within that cgroup. + Enabling and Disabling ~~~~~~~~~~~~~~~~~~~~~~ -- cgit v1.2.3 From 0925275a173d07786bfddf453f629f78d7fc4278 Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Tue, 17 Jun 2025 15:36:53 +0200 Subject: selftests: cgroup_util: Add helpers for testing named v1 hierarchies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Non-functional change; the control variable will be wired up in a separate commit. 
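For context, the helpers below abstract over the two path formats reported in /proc/PID/cgroup: the v2 unified hierarchy uses "0::<path>", while a named v1 hierarchy appears as "<id>:name=<name>:<path>" (the CG_PATH_FORMAT macro matches only the ":name=...:" substring, so the hierarchy ID does not matter). A minimal sketch of the distinction, assuming the "selftest" hierarchy name used by this series; the helper function itself is illustrative and not part of the selftest library:

	#include <stdbool.h>
	#include <stdio.h>

	static bool cg_test_v1_named;	/* mirrors the control variable added below */

	/* Print the /proc/PID/cgroup line the tests will look for. */
	static void print_expected_line(const char *path)
	{
		if (!cg_test_v1_named)
			printf("0::%s\n", path);		/* v2 unified hierarchy */
		else
			printf(":name=selftest:%s\n", path);	/* named v1 hierarchy, sans ID */
	}

	int main(void)
	{
		print_expected_line("/cg_test_a");	/* -> 0::/cg_test_a */
		cg_test_v1_named = true;
		print_expected_line("/cg_test_a");	/* -> :name=selftest:/cg_test_a */
		return 0;
	}

Because the tests only search for these strings as substrings, a single format macro keyed off one boolean is enough to cover both hierarchies.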
Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/lib/cgroup_util.c | 4 +++- tools/testing/selftests/cgroup/lib/include/cgroup_util.h | 5 +++++ tools/testing/selftests/cgroup/test_core.c | 6 +++--- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/cgroup/lib/cgroup_util.c b/tools/testing/selftests/cgroup/lib/cgroup_util.c index 8832f3d1cb61..0e89fcff4d05 100644 --- a/tools/testing/selftests/cgroup/lib/cgroup_util.c +++ b/tools/testing/selftests/cgroup/lib/cgroup_util.c @@ -19,6 +19,8 @@ #include "cgroup_util.h" #include "../../clone3/clone3_selftests.h" +bool cg_test_v1_named; + /* Returns read len on success, or -errno on failure. */ ssize_t read_text(const char *path, char *buf, size_t max_len) { @@ -361,7 +363,7 @@ int cg_enter_current(const char *cgroup) int cg_enter_current_thread(const char *cgroup) { - return cg_write(cgroup, "cgroup.threads", "0"); + return cg_write(cgroup, CG_THREADS_FILE, "0"); } int cg_run(const char *cgroup, diff --git a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h index adb2bc193183..c69cab66254b 100644 --- a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h +++ b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h @@ -13,6 +13,10 @@ #define TEST_UID 65534 /* usually nobody, any !root is fine */ +#define CG_THREADS_FILE (!cg_test_v1_named ? "cgroup.threads" : "tasks") +#define CG_NAMED_NAME "selftest" +#define CG_PATH_FORMAT (!cg_test_v1_named ? "0::%s" : (":name=" CG_NAMED_NAME ":%s")) + /* * Checks if two given values differ by less than err% of their sum. */ @@ -65,3 +69,4 @@ extern int dirfd_open_opath(const char *dir); extern int cg_prepare_for_wait(const char *cgroup); extern int memcg_prepare_for_wait(const char *cgroup); extern int cg_wait_for(int fd); +extern bool cg_test_v1_named; diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index a5672a91d273..0c4cc4e5fc8c 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -573,7 +573,7 @@ static int test_cgcore_proc_migration(const char *root) } cg_enter_current(dst); - if (cg_read_lc(dst, "cgroup.threads") != n_threads + 1) + if (cg_read_lc(dst, CG_THREADS_FILE) != n_threads + 1) goto cleanup; ret = KSFT_PASS; @@ -605,7 +605,7 @@ static void *migrating_thread_fn(void *arg) char lines[3][PATH_MAX]; for (g = 1; g < 3; ++g) - snprintf(lines[g], sizeof(lines[g]), "0::%s", grps[g] + strlen(grps[0])); + snprintf(lines[g], sizeof(lines[g]), CG_PATH_FORMAT, grps[g] + strlen(grps[0])); for (i = 0; i < n_iterations; ++i) { cg_enter_current_thread(grps[(i % 2) + 1]); @@ -659,7 +659,7 @@ static int test_cgcore_thread_migration(const char *root) if (retval) goto cleanup; - snprintf(line, sizeof(line), "0::%s", grps[1] + strlen(grps[0])); + snprintf(line, sizeof(line), CG_PATH_FORMAT, grps[1] + strlen(grps[0])); if (proc_read_strstr(0, 1, "cgroup", line)) goto cleanup; -- cgit v1.2.3 From dd7588e455f847d3b0108d9981b1fcff4441f77b Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Tue, 17 Jun 2025 15:36:54 +0200 Subject: selftests: cgroup: Add support for named v1 hierarchies in test_core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This comes useful when using selftests from mainline on older kernels/setups that still rely on cgroup v1. 
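For readers unfamiliar with named v1 hierarchies: they are cgroup v1 mounts with no controllers attached, identified purely by a name= mount option. A minimal standalone sketch of creating one by hand (the mount point and the "selftest" name are arbitrary; a later patch in this series automates the same steps):

	#include <errno.h>
	#include <stdio.h>
	#include <sys/mount.h>
	#include <sys/stat.h>

	int main(void)
	{
		/* "none" disables all v1 controllers; "name=" labels the hierarchy. */
		if (mkdir("/mnt/cg_selftest", 0755) && errno != EEXIST) {
			perror("mkdir");
			return 1;
		}
		if (mount("none", "/mnt/cg_selftest", "cgroup", 0, "none,name=selftest")) {
			perror("mount");
			return 1;
		}
		/* The tests can now treat /mnt/cg_selftest as a v1 "root". */
		return 0;
	}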
The core tests that rely on v2 specific features are skipped, the remaining ones are adjusted to work with a v1 hierarchy. The expected output on v1 system: ok 1 # SKIP test_cgcore_internal_process_constraint ok 2 # SKIP test_cgcore_top_down_constraint_enable ok 3 # SKIP test_cgcore_top_down_constraint_disable ok 4 # SKIP test_cgcore_no_internal_process_constraint_on_threads ok 5 # SKIP test_cgcore_parent_becomes_threaded ok 6 # SKIP test_cgcore_invalid_domain ok 7 # SKIP test_cgcore_populated ok 8 test_cgcore_proc_migration ok 9 test_cgcore_thread_migration ok 10 test_cgcore_destroy ok 11 test_cgcore_lesser_euid_open ok 12 # SKIP test_cgcore_lesser_ns_open Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_core.c | 31 +++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index 0c4cc4e5fc8c..338e276aae5d 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -148,6 +148,9 @@ static int test_cgcore_populated(const char *root) int cgroup_fd = -EBADF; pid_t pid; + if (cg_test_v1_named) + return KSFT_SKIP; + cg_test_a = cg_name(root, "cg_test_a"); cg_test_b = cg_name(root, "cg_test_a/cg_test_b"); cg_test_c = cg_name(root, "cg_test_a/cg_test_b/cg_test_c"); @@ -277,6 +280,9 @@ static int test_cgcore_invalid_domain(const char *root) int ret = KSFT_FAIL; char *grandparent = NULL, *parent = NULL, *child = NULL; + if (cg_test_v1_named) + return KSFT_SKIP; + grandparent = cg_name(root, "cg_test_grandparent"); parent = cg_name(root, "cg_test_grandparent/cg_test_parent"); child = cg_name(root, "cg_test_grandparent/cg_test_parent/cg_test_child"); @@ -339,6 +345,9 @@ static int test_cgcore_parent_becomes_threaded(const char *root) int ret = KSFT_FAIL; char *parent = NULL, *child = NULL; + if (cg_test_v1_named) + return KSFT_SKIP; + parent = cg_name(root, "cg_test_parent"); child = cg_name(root, "cg_test_parent/cg_test_child"); if (!parent || !child) @@ -378,7 +387,8 @@ static int test_cgcore_no_internal_process_constraint_on_threads(const char *roo int ret = KSFT_FAIL; char *parent = NULL, *child = NULL; - if (cg_read_strstr(root, "cgroup.controllers", "cpu") || + if (cg_test_v1_named || + cg_read_strstr(root, "cgroup.controllers", "cpu") || cg_write(root, "cgroup.subtree_control", "+cpu")) { ret = KSFT_SKIP; goto cleanup; @@ -430,6 +440,9 @@ static int test_cgcore_top_down_constraint_enable(const char *root) int ret = KSFT_FAIL; char *parent = NULL, *child = NULL; + if (cg_test_v1_named) + return KSFT_SKIP; + parent = cg_name(root, "cg_test_parent"); child = cg_name(root, "cg_test_parent/cg_test_child"); if (!parent || !child) @@ -465,6 +478,9 @@ static int test_cgcore_top_down_constraint_disable(const char *root) int ret = KSFT_FAIL; char *parent = NULL, *child = NULL; + if (cg_test_v1_named) + return KSFT_SKIP; + parent = cg_name(root, "cg_test_parent"); child = cg_name(root, "cg_test_parent/cg_test_child"); if (!parent || !child) @@ -506,6 +522,9 @@ static int test_cgcore_internal_process_constraint(const char *root) int ret = KSFT_FAIL; char *parent = NULL, *child = NULL; + if (cg_test_v1_named) + return KSFT_SKIP; + parent = cg_name(root, "cg_test_parent"); child = cg_name(root, "cg_test_parent/cg_test_child"); if (!parent || !child) @@ -642,10 +661,12 @@ static int test_cgcore_thread_migration(const char *root) if (cg_create(grps[2])) goto cleanup; - if (cg_write(grps[1], 
"cgroup.type", "threaded")) - goto cleanup; - if (cg_write(grps[2], "cgroup.type", "threaded")) - goto cleanup; + if (!cg_test_v1_named) { + if (cg_write(grps[1], "cgroup.type", "threaded")) + goto cleanup; + if (cg_write(grps[2], "cgroup.type", "threaded")) + goto cleanup; + } if (cg_enter_current(grps[1])) goto cleanup; -- cgit v1.2.3 From d74cd7864ffa6913f1d70f80858bd3fd19101cdf Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Tue, 17 Jun 2025 15:36:55 +0200 Subject: selftests: cgroup: Optionally set up v1 environment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the missing mount of the unifier hierarchy as a hint of legacy system and prepare our own named v1 hierarchy for tests. The code is only in test_core.c and not cgroup_util.c because other selftests are related to controllers which will be exposed on v2 hierarchy but named hierarchies are only v1 thing. Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_core.c | 44 ++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index 338e276aae5d..452c2abf9794 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include #include #include @@ -863,6 +865,38 @@ cleanup: return ret; } +static int setup_named_v1_root(char *root, size_t len, const char *name) +{ + char options[PATH_MAX]; + int r; + + r = snprintf(root, len, "/mnt/cg_selftest"); + if (r < 0) + return r; + + r = snprintf(options, sizeof(options), "none,name=%s", name); + if (r < 0) + return r; + + r = mkdir(root, 0755); + if (r < 0 && errno != EEXIST) + return r; + + r = mount("none", root, "cgroup", 0, options); + if (r < 0) + return r; + + return 0; +} + +static void cleanup_named_v1_root(char *root) +{ + if (!cg_test_v1_named) + return; + umount(root); + rmdir(root); +} + #define T(x) { x, #x } struct corecg_test { int (*fn)(const char *root); @@ -888,13 +922,18 @@ int main(int argc, char *argv[]) char root[PATH_MAX]; int i, ret = EXIT_SUCCESS; - if (cg_find_unified_root(root, sizeof(root), &nsdelegate)) - ksft_exit_skip("cgroup v2 isn't mounted\n"); + if (cg_find_unified_root(root, sizeof(root), &nsdelegate)) { + if (setup_named_v1_root(root, sizeof(root), CG_NAMED_NAME)) + ksft_exit_skip("cgroup v2 isn't mounted and could not setup named v1 hierarchy\n"); + cg_test_v1_named = true; + goto post_v2_setup; + } if (cg_read_strstr(root, "cgroup.subtree_control", "memory")) if (cg_write(root, "cgroup.subtree_control", "+memory")) ksft_exit_skip("Failed to set memory controller\n"); +post_v2_setup: for (i = 0; i < ARRAY_SIZE(tests); i++) { switch (tests[i].fn(root)) { case KSFT_PASS: @@ -910,5 +949,6 @@ int main(int argc, char *argv[]) } } + cleanup_named_v1_root(root); return ret; } -- cgit v1.2.3 From 5da4f9db980cc475bb6f027153cce75eaa3026ec Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Tue, 17 Jun 2025 15:36:56 +0200 Subject: selftests: cgroup: Fix compilation on pre-cgroupns kernels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test would be skipped because of nsdelegate, so the defined value is not used (0 is always acceptable). 
Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index 452c2abf9794..a360e2eb2eef 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -21,6 +21,9 @@ #include "cgroup_util.h" static bool nsdelegate; +#ifndef CLONE_NEWCGROUP +#define CLONE_NEWCGROUP 0 +#endif static int touch_anon(char *buf, size_t size) { -- cgit v1.2.3 From 1257b8786ac689a2ce5fe3e1741c65038035adc6 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 17 Jun 2025 12:57:22 -0700 Subject: cgroup: support to enable nmi-safe css_rstat_updated Add necessary infrastructure to enable the nmi-safe execution of css_rstat_updated(). Currently css_rstat_updated() takes a per-cpu per-css raw spinlock to add the given css in the per-cpu per-css update tree. However the kernel can not spin in nmi context, so we need to remove the spinning on the raw spinlock in css_rstat_updated(). To support lockless css_rstat_updated(), let's add necessary data structures in the css and ss structures. Signed-off-by: Shakeel Butt Tested-by: JP Kobryn Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 4 ++++ kernel/cgroup/rstat.c | 23 +++++++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index cd7f093e34cd..04191d99228c 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -384,6 +384,9 @@ struct css_rstat_cpu { */ struct cgroup_subsys_state *updated_children; struct cgroup_subsys_state *updated_next; /* NULL if not on the list */ + + struct llist_node lnode; /* lockless list for update */ + struct cgroup_subsys_state *owner; /* back pointer */ }; /* @@ -822,6 +825,7 @@ struct cgroup_subsys { spinlock_t rstat_ss_lock; raw_spinlock_t __percpu *rstat_ss_cpu_lock; + struct llist_head __percpu *lhead; /* lockless update list head */ }; extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index ce4752ab9e09..bfa6366d2325 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -11,6 +11,7 @@ static DEFINE_SPINLOCK(rstat_base_lock); static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock); +static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list); static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); @@ -45,6 +46,13 @@ static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss) return &rstat_base_lock; } +static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu) +{ + if (ss) + return per_cpu_ptr(ss->lhead, cpu); + return per_cpu_ptr(&rstat_backlog_list, cpu); +} + static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu) { if (ss) @@ -456,7 +464,8 @@ int css_rstat_init(struct cgroup_subsys_state *css) for_each_possible_cpu(cpu) { struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); - rstatc->updated_children = css; + rstatc->owner = rstatc->updated_children = css; + init_llist_node(&rstatc->lnode); if (is_self) { struct cgroup_rstat_base_cpu *rstatbc; @@ -525,9 +534,19 @@ int __init ss_rstat_init(struct cgroup_subsys *ss) } #endif + if (ss) { + ss->lhead = alloc_percpu(struct llist_head); + if (!ss->lhead) { + free_percpu(ss->rstat_ss_cpu_lock); + return -ENOMEM; + } + } + spin_lock_init(ss_rstat_lock(ss)); - for_each_possible_cpu(cpu) + 
for_each_possible_cpu(cpu) { raw_spin_lock_init(ss_rstat_cpu_lock(ss, cpu)); + init_llist_head(ss_lhead_cpu(ss, cpu)); + } return 0; } -- cgit v1.2.3 From 36df6e3dbd7e7b074e55fec080012184e2fa3a46 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 17 Jun 2025 12:57:23 -0700 Subject: cgroup: make css_rstat_updated nmi safe To make css_rstat_updated() able to safely run in nmi context, let's move the rstat update tree creation to the flush side and use per-cpu lockless lists in struct cgroup_subsys to track the css whose stats are updated on that cpu. The struct cgroup_subsys_state now has a per-cpu lnode which needs to be inserted into the corresponding per-cpu lhead of struct cgroup_subsys. Since we want the insertion to be nmi safe, there can be multiple inserters on the same cpu for the same lnode. Here multiple inserters are from stacked contexts like softirq, hardirq and nmi. The current llist does not provide a function to protect against the scenario where multiple inserters can use the same lnode, so using llist_node out of the box is not safe for this scenario. However we can protect against multiple inserters using the same lnode by exploiting the fact that a llist node points to itself when not on the llist: atomically reset it and select the winner as the single inserter. Signed-off-by: Shakeel Butt Tested-by: JP Kobryn Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 65 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 53 insertions(+), 12 deletions(-) diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index bfa6366d2325..823a4c7c3fea 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -126,13 +126,16 @@ void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu, * @css: target cgroup subsystem state * @cpu: cpu on which rstat_cpu was updated * - * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching - * rstat_cpu->updated_children list. See the comment on top of - * css_rstat_cpu definition for details. + * Atomically inserts the css in the ss's llist for the given cpu. This is + * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist + * will be processed at the flush time to create the update tree. */ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) { - unsigned long flags; + struct llist_head *lhead; + struct css_rstat_cpu *rstatc; + struct css_rstat_cpu __percpu *rstatc_pcpu; + struct llist_node *self; /* * Since bpf programs can call this function, prevent access to @@ -141,19 +144,44 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) if (!css_uses_rstat(css)) return; + lockdep_assert_preemption_disabled(); + + /* + * For archs without nmi safe cmpxchg or percpu ops support, ignore + * the requests from nmi context. + */ + if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) || + !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi()) + return; + + rstatc = css_rstat_cpu(css, cpu); + /* If already on list return. */ + if (llist_on_list(&rstatc->lnode)) + return; + /* - * Speculative already-on-list test. This may race leading to - * temporary inaccuracies, which is fine. + * This function can be reentered by irqs and nmis for the same cgroup + * and may try to insert the same per-cpu lnode into the llist. Note + * that llist_add() does not protect against such scenarios. 
* - * Because @parent's updated_children is terminated with @parent - * instead of NULL, we can tell whether @css is on the list by - * testing the next pointer for NULL. + * To protect against such stacked contexts of irqs/nmis, we use the + * fact that lnode points to itself when not on a list and then use + * this_cpu_cmpxchg() to atomically set to NULL to select the winner + * which will call llist_add(). The losers can assume the insertion is + * successful and the winner will eventually add the per-cpu lnode to + * the llist. */ - if (data_race(css_rstat_cpu(css, cpu)->updated_next)) + self = &rstatc->lnode; + rstatc_pcpu = css->rstat_cpu; + if (this_cpu_cmpxchg(rstatc_pcpu->lnode.next, self, NULL) != self) return; - flags = _css_rstat_cpu_lock(css, cpu, true); + lhead = ss_lhead_cpu(css->ss, cpu); + llist_add(&rstatc->lnode, lhead); +} +static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu) +{ /* put @css and all ancestors on the corresponding updated lists */ while (true) { struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); @@ -179,8 +207,19 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) css = parent; } +} + +static void css_process_update_tree(struct cgroup_subsys *ss, int cpu) +{ + struct llist_head *lhead = ss_lhead_cpu(ss, cpu); + struct llist_node *lnode; + + while ((lnode = llist_del_first_init(lhead))) { + struct css_rstat_cpu *rstatc; - _css_rstat_cpu_unlock(css, cpu, flags, true); + rstatc = container_of(lnode, struct css_rstat_cpu, lnode); + __css_process_update_tree(rstatc->owner, cpu); + } } /** @@ -288,6 +327,8 @@ static struct cgroup_subsys_state *css_rstat_updated_list( flags = _css_rstat_cpu_lock(root, cpu, false); + css_process_update_tree(root->ss, cpu); + /* Return NULL if this subtree is not on-list */ if (!rstatc->updated_next) goto unlock_ret; -- cgit v1.2.3 From 6af89c6ca71742e9227e6f8172a86ce1ee16aa85 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 17 Jun 2025 12:57:24 -0700 Subject: cgroup: remove per-cpu per-subsystem locks The rstat update side used to insert the cgroup whose stats are updated in the update tree and the read side flush the update tree to get the latest uptodate stats. The per-cpu per-subsystem locks were used to synchronize the update and flush side. However now the update side does not access update tree but uses per-cpu lockless lists. So there is no need for locks to synchronize update and flush side. Let's remove them. Suggested-by: JP Kobryn Signed-off-by: Shakeel Butt Tested-by: JP Kobryn Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 7 --- include/trace/events/cgroup.h | 47 -------------------- kernel/cgroup/rstat.c | 100 ++---------------------------------------- 3 files changed, 4 insertions(+), 150 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 04191d99228c..6b93a64115fe 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -375,12 +375,6 @@ struct css_rstat_cpu { * Child cgroups with stat updates on this cpu since the last read * are linked on the parent's ->updated_children through * ->updated_next. updated_children is terminated by its container css. - * - * In addition to being more compact, singly-linked list pointing to - * the css makes it unnecessary for each per-cpu struct to point back - * to the associated css. - * - * Protected by per-cpu css->ss->rstat_ss_cpu_lock. 
*/ struct cgroup_subsys_state *updated_children; struct cgroup_subsys_state *updated_next; /* NULL if not on the list */ @@ -824,7 +818,6 @@ struct cgroup_subsys { unsigned int depends_on; spinlock_t rstat_ss_lock; - raw_spinlock_t __percpu *rstat_ss_cpu_lock; struct llist_head __percpu *lhead; /* lockless update list head */ }; diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h index 7d332387be6c..ba9229af9a34 100644 --- a/include/trace/events/cgroup.h +++ b/include/trace/events/cgroup.h @@ -257,53 +257,6 @@ DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock, TP_ARGS(cgrp, cpu, contended) ); -/* - * Related to per CPU locks: - * global rstat_base_cpu_lock for base stats - * cgroup_subsys::rstat_ss_cpu_lock for subsystem stats - */ -DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended, - - TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), - - TP_ARGS(cgrp, cpu, contended) -); - -DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath, - - TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), - - TP_ARGS(cgrp, cpu, contended) -); - -DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked, - - TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), - - TP_ARGS(cgrp, cpu, contended) -); - -DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath, - - TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), - - TP_ARGS(cgrp, cpu, contended) -); - -DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock, - - TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), - - TP_ARGS(cgrp, cpu, contended) -); - -DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath, - - TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), - - TP_ARGS(cgrp, cpu, contended) -); - #endif /* _TRACE_CGROUP_H */ /* This part must be outside protection */ diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 823a4c7c3fea..c8a48cf83878 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -10,7 +10,6 @@ #include static DEFINE_SPINLOCK(rstat_base_lock); -static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock); static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list); static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); @@ -53,74 +52,6 @@ static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu) return per_cpu_ptr(&rstat_backlog_list, cpu); } -static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu) -{ - if (ss) - return per_cpu_ptr(ss->rstat_ss_cpu_lock, cpu); - - return per_cpu_ptr(&rstat_base_cpu_lock, cpu); -} - -/* - * Helper functions for rstat per CPU locks. - * - * This makes it easier to diagnose locking issues and contention in - * production environments. The parameter @fast_path determine the - * tracepoints being added, allowing us to diagnose "flush" related - * operations without handling high-frequency fast-path "update" events. - */ -static __always_inline -unsigned long _css_rstat_cpu_lock(struct cgroup_subsys_state *css, int cpu, - const bool fast_path) -{ - struct cgroup *cgrp = css->cgroup; - raw_spinlock_t *cpu_lock; - unsigned long flags; - bool contended; - - /* - * The _irqsave() is needed because the locks used for flushing are - * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock - * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT - * kernel. The raw_spinlock_t below disables interrupts on both - * configurations. The _irqsave() ensures that interrupts are always - * disabled and later restored. 
- */ - cpu_lock = ss_rstat_cpu_lock(css->ss, cpu); - contended = !raw_spin_trylock_irqsave(cpu_lock, flags); - if (contended) { - if (fast_path) - trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended); - else - trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended); - - raw_spin_lock_irqsave(cpu_lock, flags); - } - - if (fast_path) - trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended); - else - trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended); - - return flags; -} - -static __always_inline -void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu, - unsigned long flags, const bool fast_path) -{ - struct cgroup *cgrp = css->cgroup; - raw_spinlock_t *cpu_lock; - - if (fast_path) - trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false); - else - trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false); - - cpu_lock = ss_rstat_cpu_lock(css->ss, cpu); - raw_spin_unlock_irqrestore(cpu_lock, flags); -} - /** * css_rstat_updated - keep track of updated rstat_cpu * @css: target cgroup subsystem state @@ -323,15 +254,12 @@ static struct cgroup_subsys_state *css_rstat_updated_list( { struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu); struct cgroup_subsys_state *head = NULL, *parent, *child; - unsigned long flags; - - flags = _css_rstat_cpu_lock(root, cpu, false); css_process_update_tree(root->ss, cpu); /* Return NULL if this subtree is not on-list */ if (!rstatc->updated_next) - goto unlock_ret; + return NULL; /* * Unlink @root from its parent. As the updated_children list is @@ -363,8 +291,7 @@ static struct cgroup_subsys_state *css_rstat_updated_list( rstatc->updated_children = root; if (child != root) head = css_rstat_push_children(head, child, cpu); -unlock_ret: - _css_rstat_cpu_unlock(root, cpu, flags, false); + return head; } @@ -560,34 +487,15 @@ int __init ss_rstat_init(struct cgroup_subsys *ss) { int cpu; -#ifdef CONFIG_SMP - /* - * On uniprocessor machines, arch_spinlock_t is defined as an empty - * struct. Avoid allocating a size of zero by having this block - * excluded in this case. It's acceptable to leave the subsystem locks - * unitialized since the associated lock functions are no-ops in the - * non-smp case. - */ - if (ss) { - ss->rstat_ss_cpu_lock = alloc_percpu(raw_spinlock_t); - if (!ss->rstat_ss_cpu_lock) - return -ENOMEM; - } -#endif - if (ss) { ss->lhead = alloc_percpu(struct llist_head); - if (!ss->lhead) { - free_percpu(ss->rstat_ss_cpu_lock); + if (!ss->lhead) return -ENOMEM; - } } spin_lock_init(ss_rstat_lock(ss)); - for_each_possible_cpu(cpu) { - raw_spin_lock_init(ss_rstat_cpu_lock(ss, cpu)); + for_each_possible_cpu(cpu) init_llist_head(ss_lhead_cpu(ss, cpu)); - } return 0; } -- cgit v1.2.3 From 8dcb0ed834a3ec037c153c7757240ede9a8c9808 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 17 Jun 2025 12:57:25 -0700 Subject: memcg: cgroup: call css_rstat_updated irrespective of in_nmi() css_rstat_updated() is nmi safe, so there is no need to avoid it in in_nmi(), so remove the check. Signed-off-by: Shakeel Butt Tested-by: JP Kobryn Signed-off-by: Tejun Heo --- mm/memcontrol.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 902da8a9c643..d122bfe33e98 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -573,9 +573,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val, if (!val) return; - /* TODO: add to cgroup update tree once it is nmi-safe. 
*/ - if (!in_nmi()) - css_rstat_updated(&memcg->css, cpu); + css_rstat_updated(&memcg->css, cpu); statc_pcpu = memcg->vmstats_percpu; for (; statc_pcpu; statc_pcpu = statc->parent_pcpu) { statc = this_cpu_ptr(statc_pcpu); @@ -2530,7 +2528,8 @@ static inline void account_slab_nmi_safe(struct mem_cgroup *memcg, } else { struct mem_cgroup_per_node *pn = memcg->nodeinfo[pgdat->node_id]; - /* TODO: add to cgroup update tree once it is nmi-safe. */ + /* preemption is disabled in_nmi(). */ + css_rstat_updated(&memcg->css, smp_processor_id()); if (idx == NR_SLAB_RECLAIMABLE_B) atomic_add(nr, &pn->slab_reclaimable); else @@ -2753,7 +2752,8 @@ static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val) if (likely(!in_nmi())) { mod_memcg_state(memcg, MEMCG_KMEM, val); } else { - /* TODO: add to cgroup update tree once it is nmi-safe. */ + /* preemption is disabled in_nmi(). */ + css_rstat_updated(&memcg->css, smp_processor_id()); atomic_add(val, &memcg->kmem_stat); } } -- cgit v1.2.3 From c7d7713e36a6ab4c42e40c952d5ba7a51b1091b0 Mon Sep 17 00:00:00 2001 From: Sebastian Chlad Date: Wed, 2 Jul 2025 15:23:36 +0200 Subject: selftests: cgroup: Allow longer timeout for kmem_dead_cgroups cleanup The test_kmem_dead_cgroups test currently assumes that RCU and memory reclaim will complete within 5 seconds. In some environments this timeout may be insufficient, leading to spurious test failures. This patch introduces a max_time of 20 seconds which is then used in the test. After the 5th second, a debug message is printed to indicate that the cleanup is still ongoing. On the system under test, with 16 CPUs, the original test failed most of the time, with the cleanup usually taking approx. 6 seconds. Further tests were conducted with and without do_rcu_barrier and the results (respectively) are as follows: quantiles 0 0.25 0.5 0.75 1 1 2 3 8 20 (mean = 4.7667) 3 5 8 8 20 (mean = 7.6667) Acked-by: Michal Koutny Signed-off-by: Sebastian Chlad Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_kmem.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index 96693d8772be..63b3c9aad399 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -308,6 +308,7 @@ static int test_kmem_dead_cgroups(const char *root) char *parent; long dead; int i; + int max_time = 20; parent = cg_name(root, "kmem_dead_cgroups_test"); if (!parent) @@ -322,7 +323,7 @@ static int test_kmem_dead_cgroups(const char *root) if (cg_run_in_subcgroups(parent, alloc_dcache, (void *)100, 30)) goto cleanup; - for (i = 0; i < 5; i++) { + for (i = 0; i < max_time; i++) { dead = cg_read_key_long(parent, "cgroup.stat", "nr_dying_descendants "); if (dead == 0) { @@ -334,6 +335,8 @@ static int test_kmem_dead_cgroups(const char *root) * let's wait a bit and repeat. 
*/ sleep(1); + if (i > 5) + printf("Waiting time longer than 5s; wait: %ds (dead: %ld)\n", i, dead); } cleanup: -- cgit v1.2.3 From e07caae73557d144a9237fb977dfee08befa015f Mon Sep 17 00:00:00 2001 From: Sebastian Chlad Date: Wed, 2 Jul 2025 18:40:10 +0200 Subject: selftests: cgroup: Fix missing newline in test_zswap_writeback_one Fixes malformed test output due to missing newline Signed-off-by: Sebastian Chlad Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_zswap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index 40de679248b8..e1f578ca2841 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -338,7 +338,7 @@ static int test_zswap_writeback_one(const char *cgroup, bool wb) return -1; if (wb != !!zswpwb_after) { - ksft_print_msg("zswpwb_after is %ld while wb is %s", + ksft_print_msg("zswpwb_after is %ld while wb is %s\n", zswpwb_after, wb ? "enabled" : "disabled"); return -1; } -- cgit v1.2.3 From dfe25fbaedfc2a07281ed5ff0442270217d25b31 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 4 Jul 2025 11:08:04 -0700 Subject: cgroup: llist: avoid memory tears for llist_node Before the commit 36df6e3dbd7e ("cgroup: make css_rstat_updated nmi safe"), the struct llist_node is expected to be private to the one inserting the node to the lockless list or the one removing the node from the lockless list. After the mentioned commit, the llist_node in the rstat code is per-cpu shared between the stacked contexts i.e. process, softirq, hardirq & nmi. It is possible the compiler may tear the loads or stores of llist_node. Let's avoid that. KCSAN reported the following race: Reported by Kernel Concurrency Sanitizer on: CPU: 60 UID: 0 PID: 5425 ... 6.16.0-rc3-next-20250626 #1 NONE Tainted: [E]=UNSIGNED_MODULE Hardware name: ... ================================================================== ================================================================== BUG: KCSAN: data-race in css_rstat_flush / css_rstat_updated write to 0xffffe8fffe1c85f0 of 8 bytes by task 1061 on cpu 1: css_rstat_flush+0x1b8/0xeb0 __mem_cgroup_flush_stats+0x184/0x190 flush_memcg_stats_dwork+0x22/0x50 process_one_work+0x335/0x630 worker_thread+0x5f1/0x8a0 kthread+0x197/0x340 ret_from_fork+0xd3/0x110 ret_from_fork_asm+0x11/0x20 read to 0xffffe8fffe1c85f0 of 8 bytes by task 3551 on cpu 15: css_rstat_updated+0x81/0x180 mod_memcg_lruvec_state+0x113/0x2d0 __mod_lruvec_state+0x3d/0x50 lru_add+0x21e/0x3f0 folio_batch_move_lru+0x80/0x1b0 __folio_batch_add_and_move+0xd7/0x160 folio_add_lru_vma+0x42/0x50 do_anonymous_page+0x892/0xe90 __handle_mm_fault+0xfaa/0x1520 handle_mm_fault+0xdc/0x350 do_user_addr_fault+0x1dc/0x650 exc_page_fault+0x5c/0x110 asm_exc_page_fault+0x22/0x30 value changed: 0xffffe8fffe18e0d0 -> 0xffffe8fffe1c85f0 $ ./scripts/faddr2line vmlinux css_rstat_flush+0x1b8/0xeb0 css_rstat_flush+0x1b8/0xeb0: init_llist_node at include/linux/llist.h:86 (inlined by) llist_del_first_init at include/linux/llist.h:308 (inlined by) css_process_update_tree at kernel/cgroup/rstat.c:148 (inlined by) css_rstat_updated_list at kernel/cgroup/rstat.c:258 (inlined by) css_rstat_flush at kernel/cgroup/rstat.c:389 $ ./scripts/faddr2line vmlinux css_rstat_updated+0x81/0x180 css_rstat_updated+0x81/0x180: css_rstat_updated at kernel/cgroup/rstat.c:90 (discriminator 1) These are expected race and a simple READ_ONCE/WRITE_ONCE resolves these reports. 
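For illustration, a generic sketch of the marked-access pattern being applied here (not the kernel code itself): plain loads and stores of a shared pointer may be split, refetched, or cached by the compiler, whereas READ_ONCE()/WRITE_ONCE() force single, untorn accesses.

	/* Built against kernel headers; sketch only. */
	#include <linux/compiler.h>

	struct node {
		struct node *next;
	};

	static struct node *shared;	/* touched from task, irq and nmi context */

	static void publish(struct node *n)
	{
		/* a plain "shared = n" could be torn or emitted twice */
		WRITE_ONCE(shared, n);
	}

	static struct node *peek(void)
	{
		/* a single, untorn load the compiler cannot refetch */
		return READ_ONCE(shared);
	}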
However, let's add comments to explain the race and the need for memory barriers if stronger guarantees are needed. More specifically, the rstat updater and the flusher can race and cause a scenario where the stats updater skips adding the css to the lockless list but the flusher might not see those updates done by the skipped updater. This is a benign race and the subsequent flusher will flush those stats; at the moment there aren't any rstat users which are not fine with this kind of race. However, some future user might want a stricter guarantee, so let's add appropriate comments to ease the job of future users. Signed-off-by: Shakeel Butt Reviewed-by: Paul E. McKenney Fixes: 36df6e3dbd7e ("cgroup: make css_rstat_updated nmi safe") Signed-off-by: Tejun Heo --- include/linux/llist.h | 6 +++--- kernel/cgroup/rstat.c | 28 +++++++++++++++++++++++++++- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/include/linux/llist.h b/include/linux/llist.h index 27b17f64bcee..607b2360c938 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h @@ -83,7 +83,7 @@ static inline void init_llist_head(struct llist_head *list) */ static inline void init_llist_node(struct llist_node *node) { - node->next = node; + WRITE_ONCE(node->next, node); } /** @@ -97,7 +97,7 @@ static inline void init_llist_node(struct llist_node *node) */ static inline bool llist_on_list(const struct llist_node *node) { - return node->next != node; + return READ_ONCE(node->next) != node; } /** @@ -220,7 +220,7 @@ static inline bool llist_empty(const struct llist_head *head) static inline struct llist_node *llist_next(struct llist_node *node) { - return node->next; + return READ_ONCE(node->next); } /** diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index c8a48cf83878..981e2f77ad4e 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -60,6 +60,12 @@ static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu) * Atomically inserts the css in the ss's llist for the given cpu. This is * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist * will be processed at the flush time to create the update tree. + * + * NOTE: if the user needs the guarantee that the updater either adds itself to + * the lockless list or the concurrent flusher flushes its updated stats, a + * memory barrier is needed before the call to css_rstat_updated() i.e. a + * barrier after updating the per-cpu stats and before calling + * css_rstat_updated(). */ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) { @@ -86,7 +92,12 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) return; rstatc = css_rstat_cpu(css, cpu); - /* If already on list return. */ + /* + * If already on list return. This check is racy and smp_mb() is needed + * to pair it with the smp_mb() in css_process_update_tree() if the + * guarantee that the updated stats are visible to a concurrent flusher + * is needed. + */ if (llist_on_list(&rstatc->lnode)) return; @@ -148,6 +159,21 @@ static void css_process_update_tree(struct cgroup_subsys *ss, int cpu) while ((lnode = llist_del_first_init(lhead))) { struct css_rstat_cpu *rstatc; + /* + * smp_mb() is needed here (more specifically in between + * init_llist_node() and per-cpu stats flushing) if the + * guarantee is required by a rstat user where either the + * updater should add itself to the lockless list or the + * flusher should flush the stats updated by an updater that + * has observed it is already on the list. 
The + * corresponding barrier pair for this one should be before + * css_rstat_updated() by the user. + * + * For now, there aren't any such users, so we are not adding the + * barrier here, but if such a use-case arises, please add + * smp_mb() here. + */ + rstatc = container_of(lnode, struct css_rstat_cpu, lnode); __css_process_update_tree(rstatc->owner, cpu); } -- cgit v1.2.3 From 954bacce36d976fe472090b55987df66da00c49b Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Fri, 4 Jul 2025 20:08:41 +0900 Subject: selftests/cgroup: fix cpu.max tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current cpu.max tests (both the normal one and the nested one) are broken. They set up cpu.max with a 1000 us quota and the default period (100,000 us). A cpu hog is run for a duration of 1s as per wall clock time. This corresponds to 10 periods, hence an expected usage of 10,000 us. We want the measured usage (as per cpu.stat) to be close to 10,000 us. Previously, this approximate equality test was done by `!values_close(usage_usec, expected_usage_usec, 95)`: if the absolute difference between usage_usec and expected_usage_usec is greater than 95% of their sum, then we pass. And expected_usage_usec was set to 1,000,000 us. Mathematically, this translates to the following being true for pass: |usage - expected_usage| > (usage + expected_usage)*0.95 If usage > expected_usage: usage - expected_usage > (usage + expected_usage)*0.95 0.05*usage > 1.95*expected_usage usage > 39*expected_usage = 39s If usage < expected_usage: expected_usage - usage > (usage + expected_usage)*0.95 0.05*expected_usage > 1.95*usage usage < 0.0256*expected_usage = 25,600 us Combined: pass if usage < 25,600 us or > 39 s, which makes no sense given that all we need is for usage_usec to be close to 10,000 us. Fix this by explicitly calculating the expected usage duration based on the configured quota, default period, and the duration, and compare usage_usec and expected_usage_usec using values_close() with a 10% error margin. Also, use snprintf to get the quota string to write to cpu.max instead of hardcoding the quota, ensuring a single source of truth. Remove the check comparing user_usec and expected_usage_usec, since on running this test modified with printfs, it's seen that user_usec and usage_usec can regularly exceed the theoretical expected_usage_usec: $ sudo ./test_cpu user: 10485, usage: 10485, expected: 10000 ok 1 test_cpucg_max user: 11127, usage: 11127, expected: 10000 ok 2 test_cpucg_max_nested $ sudo ./test_cpu user: 10286, usage: 10286, expected: 10000 ok 1 test_cpucg_max user: 10404, usage: 11271, expected: 10000 ok 2 test_cpucg_max_nested Hence, a values_close() check of usage_usec and expected_usage_usec is sufficient. 
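For concreteness, plugging the test's defaults into the new check (numbers derived from the description above; the sample usage_usec comes from the first run shown):

expected_usage_usec = n_periods*quota + MIN(remainder, quota)
                    = (1,000,000/100,000)*1,000 + 0 = 10,000 us

values_close(usage, expected, 10) passes when |usage - expected| <= 0.10*(usage + expected); e.g. for usage = 10,485: |10,485 - 10,000| = 485 <= 0.10*20,485 ~= 2,049, so that run passes.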
Fixes: a79906570f9646ae17 ("cgroup: Add test_cpucg_max_nested() testcase") Fixes: 889ab8113ef1386c57 ("cgroup: Add test_cpucg_max() testcase") Acked-by: Michal Koutný Signed-off-by: Shashank Balaji Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_cpu.c | 63 +++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_cpu.c b/tools/testing/selftests/cgroup/test_cpu.c index a2b50af8e9ee..2a60e6c41940 100644 --- a/tools/testing/selftests/cgroup/test_cpu.c +++ b/tools/testing/selftests/cgroup/test_cpu.c @@ -2,6 +2,7 @@ #define _GNU_SOURCE #include +#include #include #include #include @@ -645,10 +646,16 @@ test_cpucg_nested_weight_underprovisioned(const char *root) static int test_cpucg_max(const char *root) { int ret = KSFT_FAIL; - long usage_usec, user_usec; - long usage_seconds = 1; - long expected_usage_usec = usage_seconds * USEC_PER_SEC; + long quota_usec = 1000; + long default_period_usec = 100000; /* cpu.max's default period */ + long duration_seconds = 1; + + long duration_usec = duration_seconds * USEC_PER_SEC; + long usage_usec, n_periods, remainder_usec, expected_usage_usec; char *cpucg; + char quota_buf[32]; + + snprintf(quota_buf, sizeof(quota_buf), "%ld", quota_usec); cpucg = cg_name(root, "cpucg_test"); if (!cpucg) @@ -657,13 +664,13 @@ static int test_cpucg_max(const char *root) if (cg_create(cpucg)) goto cleanup; - if (cg_write(cpucg, "cpu.max", "1000")) + if (cg_write(cpucg, "cpu.max", quota_buf)) goto cleanup; struct cpu_hog_func_param param = { .nprocs = 1, .ts = { - .tv_sec = usage_seconds, + .tv_sec = duration_seconds, .tv_nsec = 0, }, .clock_type = CPU_HOG_CLOCK_WALL, @@ -672,14 +679,19 @@ static int test_cpucg_max(const char *root) goto cleanup; usage_usec = cg_read_key_long(cpucg, "cpu.stat", "usage_usec"); - user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec"); - if (user_usec <= 0) + if (usage_usec <= 0) goto cleanup; - if (user_usec >= expected_usage_usec) - goto cleanup; + /* + * The following calculation applies only since + * the cpu hog is set to run as per wall-clock time + */ + n_periods = duration_usec / default_period_usec; + remainder_usec = duration_usec - n_periods * default_period_usec; + expected_usage_usec + = n_periods * quota_usec + MIN(remainder_usec, quota_usec); - if (values_close(usage_usec, expected_usage_usec, 95)) + if (!values_close(usage_usec, expected_usage_usec, 10)) goto cleanup; ret = KSFT_PASS; @@ -698,10 +710,16 @@ cleanup: static int test_cpucg_max_nested(const char *root) { int ret = KSFT_FAIL; - long usage_usec, user_usec; - long usage_seconds = 1; - long expected_usage_usec = usage_seconds * USEC_PER_SEC; + long quota_usec = 1000; + long default_period_usec = 100000; /* cpu.max's default period */ + long duration_seconds = 1; + + long duration_usec = duration_seconds * USEC_PER_SEC; + long usage_usec, n_periods, remainder_usec, expected_usage_usec; char *parent, *child; + char quota_buf[32]; + + snprintf(quota_buf, sizeof(quota_buf), "%ld", quota_usec); parent = cg_name(root, "cpucg_parent"); child = cg_name(parent, "cpucg_child"); @@ -717,13 +735,13 @@ static int test_cpucg_max_nested(const char *root) if (cg_create(child)) goto cleanup; - if (cg_write(parent, "cpu.max", "1000")) + if (cg_write(parent, "cpu.max", quota_buf)) goto cleanup; struct cpu_hog_func_param param = { .nprocs = 1, .ts = { - .tv_sec = usage_seconds, + .tv_sec = duration_seconds, .tv_nsec = 0, }, .clock_type = CPU_HOG_CLOCK_WALL, @@ -732,14 +750,19 @@ static int 
test_cpucg_max_nested(const char *root) goto cleanup; usage_usec = cg_read_key_long(child, "cpu.stat", "usage_usec"); - user_usec = cg_read_key_long(child, "cpu.stat", "user_usec"); - if (user_usec <= 0) + if (usage_usec <= 0) goto cleanup; - if (user_usec >= expected_usage_usec) - goto cleanup; + /* + * The following calculation applies only since + * the cpu hog is set to run as per wall-clock time + */ + n_periods = duration_usec / default_period_usec; + remainder_usec = duration_usec - n_periods * default_period_usec; + expected_usage_usec + = n_periods * quota_usec + MIN(remainder_usec, quota_usec); - if (values_close(usage_usec, expected_usage_usec, 95)) + if (!values_close(usage_usec, expected_usage_usec, 10)) goto cleanup; ret = KSFT_PASS; -- cgit v1.2.3 From 646faf36d7271c597497ca547a59912fcab49be9 Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Fri, 18 Jul 2025 11:18:54 +0200 Subject: cgroup: Add compatibility option for content of /proc/cgroups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit /proc/cgroups lists only v1 controllers by default, however, this is only enforced since the commit af000ce85293b ("cgroup: Do not report unavailable v1 controllers in /proc/cgroups") and there is software in the wild that uses content of /proc/cgroups to decide on availability of v2 (sic) controllers. Add a boottime param that can bring back the previous behavior for setups where the check in the software cannot be changed and it causes e.g. unintended OOMs. Also, this patch takes out cgrp_v1_visible from cgroup1_subsys_absent() guard since it's only important to check which hierarchy (v1 vs v2) the subsys is attached to. This has no effect on the printed message but the code is cleaner since cgrp_v1_visible is really about mounted hierarchies, not the content of /proc/cgroups. Link: https://lore.kernel.org/r/b26b60b7d0d2a5ecfd2f3c45f95f32922ed24686.camel@decadent.org.uk Fixes: af000ce85293b ("cgroup: Do not report unavailable v1 controllers in /proc/cgroups") Fixes: a0ab1453226d8 ("cgroup: Print message when /proc/cgroups is read on v2-only system") Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- Documentation/admin-guide/kernel-parameters.txt | 8 ++++++++ kernel/cgroup/cgroup-v1.c | 14 ++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a3ea40b22fb9..60fd3f84272b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -630,6 +630,14 @@ named mounts. Specifying both "all" and "named" disables all v1 hierarchies. + cgroup_v1_proc= [KNL] Show also missing controllers in /proc/cgroups + Format: { "true" | "false" } + /proc/cgroups lists only v1 controllers by default. + This compatibility option enables listing also v2 + controllers (whose v1 code is not compiled!), so that + semi-legacy software can check this file to decide + about usage of v2 (sic) controllers. + cgroup_favordynmods= [KNL] Enable or Disable favordynmods. Format: { "true" | "false" } Defaults to the value of CONFIG_CGROUP_FAVOR_DYNMODS. 
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index fa24c032ed6f..2a4a387f867a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -32,6 +32,9 @@ static u16 cgroup_no_v1_mask; /* disable named v1 mounts */ static bool cgroup_no_v1_named; +/* Show unavailable controllers in /proc/cgroups */ +static bool proc_show_all; + /* * pidlist destructions need to be flushed on cgroup destruction. Use a * separate workqueue as flush domain. @@ -683,10 +686,11 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) */ for_each_subsys(ss, i) { - if (cgroup1_subsys_absent(ss)) - continue; cgrp_v1_visible |= ss->root != &cgrp_dfl_root; + if (!proc_show_all && cgroup1_subsys_absent(ss)) + continue; + seq_printf(m, "%s\t%d\t%d\t%d\n", ss->legacy_name, ss->root->hierarchy_id, atomic_read(&ss->root->nr_cgrps), @@ -1359,3 +1363,9 @@ static int __init cgroup_no_v1(char *str) return 1; } __setup("cgroup_no_v1=", cgroup_no_v1); + +static int __init cgroup_v1_proc(char *str) +{ + return (kstrtobool(str, &proc_show_all) == 0); +} +__setup("cgroup_v1_proc=", cgroup_v1_proc); -- cgit v1.2.3
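As a usage sketch for the new parameter (the output below is illustrative; the exact controller set and counts depend on the kernel configuration): booting a v2-only system with cgroup_v1_proc=true on the kernel command line makes /proc/cgroups list the v2-attached controllers again, with hierarchy ID 0:

	# cat /proc/cgroups
	#subsys_name	hierarchy	num_cgroups	enabled
	cpuset	0	1	1
	cpu	0	1	1
	memory	0	1	1
	pids	0	1	1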