From edfc4c8a1edffa6849e19ffade1be8dd824989d0 Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Tue, 3 Jun 2025 17:45:27 +0200 Subject: cgroup: Drop sock_cgroup_classid() dummy implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The semantics of returning 0 when !CONFIG_CGROUP_NET_CLASSID are unclear. Since there are no callers of sock_cgroup_classid() with that config anymore, we can drop the helper entirely and force all (future) callers to handle the !CONFIG_CGROUP_NET_CLASSID case themselves. Signed-off-by: Michal Koutný Link: https://lore.kernel.org/r/Z_52r_v9-3JUzDT7@calendula/ Acked-by: Tejun Heo Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index e61687d5e496..cd7f093e34cd 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -898,14 +898,12 @@ static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd) #endif } +#ifdef CONFIG_CGROUP_NET_CLASSID static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd) { -#ifdef CONFIG_CGROUP_NET_CLASSID return READ_ONCE(skcd->classid); -#else - return 0; -#endif } +#endif static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, u16 prioidx) @@ -915,13 +913,13 @@ static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, #endif } +#ifdef CONFIG_CGROUP_NET_CLASSID static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, u32 classid) { -#ifdef CONFIG_CGROUP_NET_CLASSID WRITE_ONCE(skcd->classid, classid); -#endif } +#endif #else /* CONFIG_SOCK_CGROUP_DATA */ -- cgit v1.2.3 From 38b9342ee62e9a1a868654ba6619de9da251bd57 Mon Sep 17 00:00:00 2001 From: Vishal Chourasia Date: Fri, 6 Jun 2025 09:40:05 +0530 Subject: Documentation: cgroup: add section explaining controller availability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an "Availability" section to the Control Group v2 docs. It describes the meaning of a controller being available in a cgroup, complementing the existing "Enabling and Disabling" section. This update improves the clarity of cgroup controller management by explicitly distinguishing between: 1. Availability – when a controller is supported by the kernel and listed in "cgroup.controllers", making its interface files accessible in the cgroup's directory. 2. Enabling – when a controller is enabled by explicitly writing its name to "cgroup.subtree_control", which controls the distribution of the resource across the cgroup's immediate children. 
As an example, consider /sys/fs/cgroup # cat cgroup.controllers cpuset cpu io memory hugetlb pids misc /sys/fs/cgroup # cat cgroup.subtree_control # No controllers enabled by default /sys/fs/cgroup # echo +cpu +memory > cgroup.subtree_control # enabling "cpu" and "memory" /sys/fs/cgroup # cat cgroup.subtree_control cpu memory # cpu and memory enabled in /sys/fs/cgroup /sys/fs/cgroup # mkdir foo_cgrp /sys/fs/cgroup # cd foo_cgrp/ /sys/fs/cgroup/foo_cgrp # cat cgroup.controllers cpu memory # cpu and memory available in 'foo_cgrp' /sys/fs/cgroup/foo_cgrp # cat cgroup.subtree_control # empty by default /sys/fs/cgroup/foo_cgrp # ls cgroup.controllers cpu.max.burst memory.numa_stat cgroup.events cpu.pressure memory.oom.group cgroup.freeze cpu.stat memory.peak cgroup.kill cpu.stat.local memory.pressure cgroup.max.depth cpu.weight memory.reclaim cgroup.max.descendants cpu.weight.nice memory.stat cgroup.pressure io.pressure memory.swap.current cgroup.procs memory.current memory.swap.events cgroup.stat memory.events memory.swap.high cgroup.subtree_control memory.events.local memory.swap.max cgroup.threads memory.high memory.swap.peak cgroup.type memory.low memory.zswap.current cpu.idle memory.max memory.zswap.max cpu.max memory.min memory.zswap.writeback In this example, "cpu" and "memory" are enabled in the root cgroup, making them available in "foo_cgrp". This exposes the corresponding interface files in "foo_cgrp/", allowing resource control of processes in that cgroup. However, these controllers are not yet enabled in "foo_cgrp" itself. Once a controller is available in a cgroup, it can be used for resource control of the cgroup's processes. Acked-by: Michal Koutný Reviewed-by: Bagas Sanjaya Signed-off-by: Vishal Chourasia Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v2.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 0cc35a14afbe..31acc64e656f 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -435,6 +435,15 @@ both cgroups. Controlling Controllers ----------------------- +Availability +~~~~~~~~~~~~ + +A controller is available in a cgroup when it is supported by the kernel (i.e., +compiled in, not disabled and not attached to a v1 hierarchy) and listed in the +"cgroup.controllers" file. Availability means the controller's interface files +are exposed in the cgroup's directory, allowing the distribution of the target +resource to be observed or controlled within that cgroup. + Enabling and Disabling ~~~~~~~~~~~~~~~~~~~~~~ -- cgit v1.2.3 From 0925275a173d07786bfddf453f629f78d7fc4278 Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Tue, 17 Jun 2025 15:36:53 +0200 Subject: selftests: cgroup_util: Add helpers for testing named v1 hierarchies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Non-functional change; the control variable will be wired up in a separate commit. 
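For context, the helpers below abstract over the two path formats reported in /proc/PID/cgroup: the v2 unified hierarchy uses "0::<path>", while a named v1 hierarchy appears as "<id>:name=<name>:<path>" (the CG_PATH_FORMAT macro matches only the ":name=...:" substring, so the hierarchy ID does not matter). A minimal sketch of the distinction, assuming the "selftest" hierarchy name used by this series; the helper function itself is illustrative and not part of the selftest library:

	#include <stdbool.h>
	#include <stdio.h>

	static bool cg_test_v1_named;	/* mirrors the control variable added below */

	/* Print the /proc/PID/cgroup line the tests will look for. */
	static void print_expected_line(const char *path)
	{
		if (!cg_test_v1_named)
			printf("0::%s\n", path);		/* v2 unified hierarchy */
		else
			printf(":name=selftest:%s\n", path);	/* named v1 hierarchy, sans ID */
	}

	int main(void)
	{
		print_expected_line("/cg_test_a");	/* -> 0::/cg_test_a */
		cg_test_v1_named = true;
		print_expected_line("/cg_test_a");	/* -> :name=selftest:/cg_test_a */
		return 0;
	}

Because the tests only search for these strings as substrings, a single format macro keyed off one boolean is enough to cover both hierarchies.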
Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/lib/cgroup_util.c | 4 +++- tools/testing/selftests/cgroup/lib/include/cgroup_util.h | 5 +++++ tools/testing/selftests/cgroup/test_core.c | 6 +++--- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/cgroup/lib/cgroup_util.c b/tools/testing/selftests/cgroup/lib/cgroup_util.c index 8832f3d1cb61..0e89fcff4d05 100644 --- a/tools/testing/selftests/cgroup/lib/cgroup_util.c +++ b/tools/testing/selftests/cgroup/lib/cgroup_util.c @@ -19,6 +19,8 @@ #include "cgroup_util.h" #include "../../clone3/clone3_selftests.h" +bool cg_test_v1_named; + /* Returns read len on success, or -errno on failure. */ ssize_t read_text(const char *path, char *buf, size_t max_len) { @@ -361,7 +363,7 @@ int cg_enter_current(const char *cgroup) int cg_enter_current_thread(const char *cgroup) { - return cg_write(cgroup, "cgroup.threads", "0"); + return cg_write(cgroup, CG_THREADS_FILE, "0"); } int cg_run(const char *cgroup, diff --git a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h index adb2bc193183..c69cab66254b 100644 --- a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h +++ b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h @@ -13,6 +13,10 @@ #define TEST_UID 65534 /* usually nobody, any !root is fine */ +#define CG_THREADS_FILE (!cg_test_v1_named ? "cgroup.threads" : "tasks") +#define CG_NAMED_NAME "selftest" +#define CG_PATH_FORMAT (!cg_test_v1_named ? "0::%s" : (":name=" CG_NAMED_NAME ":%s")) + /* * Checks if two given values differ by less than err% of their sum. */ @@ -65,3 +69,4 @@ extern int dirfd_open_opath(const char *dir); extern int cg_prepare_for_wait(const char *cgroup); extern int memcg_prepare_for_wait(const char *cgroup); extern int cg_wait_for(int fd); +extern bool cg_test_v1_named; diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index a5672a91d273..0c4cc4e5fc8c 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -573,7 +573,7 @@ static int test_cgcore_proc_migration(const char *root) } cg_enter_current(dst); - if (cg_read_lc(dst, "cgroup.threads") != n_threads + 1) + if (cg_read_lc(dst, CG_THREADS_FILE) != n_threads + 1) goto cleanup; ret = KSFT_PASS; @@ -605,7 +605,7 @@ static void *migrating_thread_fn(void *arg) char lines[3][PATH_MAX]; for (g = 1; g < 3; ++g) - snprintf(lines[g], sizeof(lines[g]), "0::%s", grps[g] + strlen(grps[0])); + snprintf(lines[g], sizeof(lines[g]), CG_PATH_FORMAT, grps[g] + strlen(grps[0])); for (i = 0; i < n_iterations; ++i) { cg_enter_current_thread(grps[(i % 2) + 1]); @@ -659,7 +659,7 @@ static int test_cgcore_thread_migration(const char *root) if (retval) goto cleanup; - snprintf(line, sizeof(line), "0::%s", grps[1] + strlen(grps[0])); + snprintf(line, sizeof(line), CG_PATH_FORMAT, grps[1] + strlen(grps[0])); if (proc_read_strstr(0, 1, "cgroup", line)) goto cleanup; -- cgit v1.2.3 From dd7588e455f847d3b0108d9981b1fcff4441f77b Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Tue, 17 Jun 2025 15:36:54 +0200 Subject: selftests: cgroup: Add support for named v1 hierarchies in test_core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This comes useful when using selftests from mainline on older kernels/setups that still rely on cgroup v1. 
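For readers unfamiliar with named v1 hierarchies: they are cgroup v1 mounts with no controllers attached, identified purely by a name= mount option. A minimal standalone sketch of creating one by hand (the mount point and the "selftest" name are arbitrary; a later patch in this series automates the same steps):

	#include <errno.h>
	#include <stdio.h>
	#include <sys/mount.h>
	#include <sys/stat.h>

	int main(void)
	{
		/* "none" disables all v1 controllers; "name=" labels the hierarchy. */
		if (mkdir("/mnt/cg_selftest", 0755) && errno != EEXIST) {
			perror("mkdir");
			return 1;
		}
		if (mount("none", "/mnt/cg_selftest", "cgroup", 0, "none,name=selftest")) {
			perror("mount");
			return 1;
		}
		/* The tests can now treat /mnt/cg_selftest as a v1 "root". */
		return 0;
	}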
The core tests that rely on v2 specific features are skipped, the remaining ones are adjusted to work with a v1 hierarchy. The expected output on v1 system: ok 1 # SKIP test_cgcore_internal_process_constraint ok 2 # SKIP test_cgcore_top_down_constraint_enable ok 3 # SKIP test_cgcore_top_down_constraint_disable ok 4 # SKIP test_cgcore_no_internal_process_constraint_on_threads ok 5 # SKIP test_cgcore_parent_becomes_threaded ok 6 # SKIP test_cgcore_invalid_domain ok 7 # SKIP test_cgcore_populated ok 8 test_cgcore_proc_migration ok 9 test_cgcore_thread_migration ok 10 test_cgcore_destroy ok 11 test_cgcore_lesser_euid_open ok 12 # SKIP test_cgcore_lesser_ns_open Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_core.c | 31 +++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index 0c4cc4e5fc8c..338e276aae5d 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -148,6 +148,9 @@ static int test_cgcore_populated(const char *root) int cgroup_fd = -EBADF; pid_t pid; + if (cg_test_v1_named) + return KSFT_SKIP; + cg_test_a = cg_name(root, "cg_test_a"); cg_test_b = cg_name(root, "cg_test_a/cg_test_b"); cg_test_c = cg_name(root, "cg_test_a/cg_test_b/cg_test_c"); @@ -277,6 +280,9 @@ static int test_cgcore_invalid_domain(const char *root) int ret = KSFT_FAIL; char *grandparent = NULL, *parent = NULL, *child = NULL; + if (cg_test_v1_named) + return KSFT_SKIP; + grandparent = cg_name(root, "cg_test_grandparent"); parent = cg_name(root, "cg_test_grandparent/cg_test_parent"); child = cg_name(root, "cg_test_grandparent/cg_test_parent/cg_test_child"); @@ -339,6 +345,9 @@ static int test_cgcore_parent_becomes_threaded(const char *root) int ret = KSFT_FAIL; char *parent = NULL, *child = NULL; + if (cg_test_v1_named) + return KSFT_SKIP; + parent = cg_name(root, "cg_test_parent"); child = cg_name(root, "cg_test_parent/cg_test_child"); if (!parent || !child) @@ -378,7 +387,8 @@ static int test_cgcore_no_internal_process_constraint_on_threads(const char *roo int ret = KSFT_FAIL; char *parent = NULL, *child = NULL; - if (cg_read_strstr(root, "cgroup.controllers", "cpu") || + if (cg_test_v1_named || + cg_read_strstr(root, "cgroup.controllers", "cpu") || cg_write(root, "cgroup.subtree_control", "+cpu")) { ret = KSFT_SKIP; goto cleanup; @@ -430,6 +440,9 @@ static int test_cgcore_top_down_constraint_enable(const char *root) int ret = KSFT_FAIL; char *parent = NULL, *child = NULL; + if (cg_test_v1_named) + return KSFT_SKIP; + parent = cg_name(root, "cg_test_parent"); child = cg_name(root, "cg_test_parent/cg_test_child"); if (!parent || !child) @@ -465,6 +478,9 @@ static int test_cgcore_top_down_constraint_disable(const char *root) int ret = KSFT_FAIL; char *parent = NULL, *child = NULL; + if (cg_test_v1_named) + return KSFT_SKIP; + parent = cg_name(root, "cg_test_parent"); child = cg_name(root, "cg_test_parent/cg_test_child"); if (!parent || !child) @@ -506,6 +522,9 @@ static int test_cgcore_internal_process_constraint(const char *root) int ret = KSFT_FAIL; char *parent = NULL, *child = NULL; + if (cg_test_v1_named) + return KSFT_SKIP; + parent = cg_name(root, "cg_test_parent"); child = cg_name(root, "cg_test_parent/cg_test_child"); if (!parent || !child) @@ -642,10 +661,12 @@ static int test_cgcore_thread_migration(const char *root) if (cg_create(grps[2])) goto cleanup; - if (cg_write(grps[1], 
"cgroup.type", "threaded")) - goto cleanup; - if (cg_write(grps[2], "cgroup.type", "threaded")) - goto cleanup; + if (!cg_test_v1_named) { + if (cg_write(grps[1], "cgroup.type", "threaded")) + goto cleanup; + if (cg_write(grps[2], "cgroup.type", "threaded")) + goto cleanup; + } if (cg_enter_current(grps[1])) goto cleanup; -- cgit v1.2.3 From d74cd7864ffa6913f1d70f80858bd3fd19101cdf Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Tue, 17 Jun 2025 15:36:55 +0200 Subject: selftests: cgroup: Optionally set up v1 environment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the missing mount of the unifier hierarchy as a hint of legacy system and prepare our own named v1 hierarchy for tests. The code is only in test_core.c and not cgroup_util.c because other selftests are related to controllers which will be exposed on v2 hierarchy but named hierarchies are only v1 thing. Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_core.c | 44 ++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index 338e276aae5d..452c2abf9794 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include #include #include @@ -863,6 +865,38 @@ cleanup: return ret; } +static int setup_named_v1_root(char *root, size_t len, const char *name) +{ + char options[PATH_MAX]; + int r; + + r = snprintf(root, len, "/mnt/cg_selftest"); + if (r < 0) + return r; + + r = snprintf(options, sizeof(options), "none,name=%s", name); + if (r < 0) + return r; + + r = mkdir(root, 0755); + if (r < 0 && errno != EEXIST) + return r; + + r = mount("none", root, "cgroup", 0, options); + if (r < 0) + return r; + + return 0; +} + +static void cleanup_named_v1_root(char *root) +{ + if (!cg_test_v1_named) + return; + umount(root); + rmdir(root); +} + #define T(x) { x, #x } struct corecg_test { int (*fn)(const char *root); @@ -888,13 +922,18 @@ int main(int argc, char *argv[]) char root[PATH_MAX]; int i, ret = EXIT_SUCCESS; - if (cg_find_unified_root(root, sizeof(root), &nsdelegate)) - ksft_exit_skip("cgroup v2 isn't mounted\n"); + if (cg_find_unified_root(root, sizeof(root), &nsdelegate)) { + if (setup_named_v1_root(root, sizeof(root), CG_NAMED_NAME)) + ksft_exit_skip("cgroup v2 isn't mounted and could not setup named v1 hierarchy\n"); + cg_test_v1_named = true; + goto post_v2_setup; + } if (cg_read_strstr(root, "cgroup.subtree_control", "memory")) if (cg_write(root, "cgroup.subtree_control", "+memory")) ksft_exit_skip("Failed to set memory controller\n"); +post_v2_setup: for (i = 0; i < ARRAY_SIZE(tests); i++) { switch (tests[i].fn(root)) { case KSFT_PASS: @@ -910,5 +949,6 @@ int main(int argc, char *argv[]) } } + cleanup_named_v1_root(root); return ret; } -- cgit v1.2.3 From 5da4f9db980cc475bb6f027153cce75eaa3026ec Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Tue, 17 Jun 2025 15:36:56 +0200 Subject: selftests: cgroup: Fix compilation on pre-cgroupns kernels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test would be skipped because of nsdelegate, so the defined value is not used (0 is always acceptable). 
Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index 452c2abf9794..a360e2eb2eef 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -21,6 +21,9 @@ #include "cgroup_util.h" static bool nsdelegate; +#ifndef CLONE_NEWCGROUP +#define CLONE_NEWCGROUP 0 +#endif static int touch_anon(char *buf, size_t size) { -- cgit v1.2.3 From 1257b8786ac689a2ce5fe3e1741c65038035adc6 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 17 Jun 2025 12:57:22 -0700 Subject: cgroup: support to enable nmi-safe css_rstat_updated Add necessary infrastructure to enable the nmi-safe execution of css_rstat_updated(). Currently css_rstat_updated() takes a per-cpu per-css raw spinlock to add the given css in the per-cpu per-css update tree. However the kernel can not spin in nmi context, so we need to remove the spinning on the raw spinlock in css_rstat_updated(). To support lockless css_rstat_updated(), let's add necessary data structures in the css and ss structures. Signed-off-by: Shakeel Butt Tested-by: JP Kobryn Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 4 ++++ kernel/cgroup/rstat.c | 23 +++++++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index cd7f093e34cd..04191d99228c 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -384,6 +384,9 @@ struct css_rstat_cpu { */ struct cgroup_subsys_state *updated_children; struct cgroup_subsys_state *updated_next; /* NULL if not on the list */ + + struct llist_node lnode; /* lockless list for update */ + struct cgroup_subsys_state *owner; /* back pointer */ }; /* @@ -822,6 +825,7 @@ struct cgroup_subsys { spinlock_t rstat_ss_lock; raw_spinlock_t __percpu *rstat_ss_cpu_lock; + struct llist_head __percpu *lhead; /* lockless update list head */ }; extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index ce4752ab9e09..bfa6366d2325 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -11,6 +11,7 @@ static DEFINE_SPINLOCK(rstat_base_lock); static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock); +static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list); static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); @@ -45,6 +46,13 @@ static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss) return &rstat_base_lock; } +static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu) +{ + if (ss) + return per_cpu_ptr(ss->lhead, cpu); + return per_cpu_ptr(&rstat_backlog_list, cpu); +} + static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu) { if (ss) @@ -456,7 +464,8 @@ int css_rstat_init(struct cgroup_subsys_state *css) for_each_possible_cpu(cpu) { struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); - rstatc->updated_children = css; + rstatc->owner = rstatc->updated_children = css; + init_llist_node(&rstatc->lnode); if (is_self) { struct cgroup_rstat_base_cpu *rstatbc; @@ -525,9 +534,19 @@ int __init ss_rstat_init(struct cgroup_subsys *ss) } #endif + if (ss) { + ss->lhead = alloc_percpu(struct llist_head); + if (!ss->lhead) { + free_percpu(ss->rstat_ss_cpu_lock); + return -ENOMEM; + } + } + spin_lock_init(ss_rstat_lock(ss)); - for_each_possible_cpu(cpu) + 
for_each_possible_cpu(cpu) { raw_spin_lock_init(ss_rstat_cpu_lock(ss, cpu)); + init_llist_head(ss_lhead_cpu(ss, cpu)); + } return 0; } -- cgit v1.2.3 From 36df6e3dbd7e7b074e55fec080012184e2fa3a46 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 17 Jun 2025 12:57:23 -0700 Subject: cgroup: make css_rstat_updated nmi safe To make css_rstat_updated() able to safely run in nmi context, let's move the rstat update tree creation to the flush side and use per-cpu lockless lists in struct cgroup_subsys to track the css whose stats are updated on that cpu. The struct cgroup_subsys_state now has a per-cpu lnode which needs to be inserted into the corresponding per-cpu lhead of struct cgroup_subsys. Since we want the insertion to be nmi safe, there can be multiple inserters on the same cpu for the same lnode. Here multiple inserters are from stacked contexts like softirq, hardirq and nmi. The current llist does not provide a function to protect against the scenario where multiple inserters can use the same lnode, so using llist_node out of the box is not safe for this scenario. However we can protect against multiple inserters using the same lnode by exploiting the fact that a llist node points to itself when not on the llist: atomically reset it and select the winner as the single inserter. Signed-off-by: Shakeel Butt Tested-by: JP Kobryn Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 65 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 53 insertions(+), 12 deletions(-) diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index bfa6366d2325..823a4c7c3fea 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -126,13 +126,16 @@ void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu, * @css: target cgroup subsystem state * @cpu: cpu on which rstat_cpu was updated * - * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching - * rstat_cpu->updated_children list. See the comment on top of - * css_rstat_cpu definition for details. + * Atomically inserts the css in the ss's llist for the given cpu. This is + * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist + * will be processed at the flush time to create the update tree. */ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) { - unsigned long flags; + struct llist_head *lhead; + struct css_rstat_cpu *rstatc; + struct css_rstat_cpu __percpu *rstatc_pcpu; + struct llist_node *self; /* * Since bpf programs can call this function, prevent access to @@ -141,19 +144,44 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) if (!css_uses_rstat(css)) return; + lockdep_assert_preemption_disabled(); + + /* + * For archs without nmi safe cmpxchg or percpu ops support, ignore + * the requests from nmi context. + */ + if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) || + !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi()) + return; + + rstatc = css_rstat_cpu(css, cpu); + /* If already on list return. */ + if (llist_on_list(&rstatc->lnode)) + return; + /* - * Speculative already-on-list test. This may race leading to - * temporary inaccuracies, which is fine. + * This function can be reentered by irqs and nmis for the same cgroup + * and may try to insert the same per-cpu lnode into the llist. Note + * that llist_add() does not protect against such scenarios. 
* - * Because @parent's updated_children is terminated with @parent - * instead of NULL, we can tell whether @css is on the list by - * testing the next pointer for NULL. + * To protect against such stacked contexts of irqs/nmis, we use the + * fact that lnode points to itself when not on a list and then use + * this_cpu_cmpxchg() to atomically set to NULL to select the winner + * which will call llist_add(). The losers can assume the insertion is + * successful and the winner will eventually add the per-cpu lnode to + * the llist. */ - if (data_race(css_rstat_cpu(css, cpu)->updated_next)) + self = &rstatc->lnode; + rstatc_pcpu = css->rstat_cpu; + if (this_cpu_cmpxchg(rstatc_pcpu->lnode.next, self, NULL) != self) return; - flags = _css_rstat_cpu_lock(css, cpu, true); + lhead = ss_lhead_cpu(css->ss, cpu); + llist_add(&rstatc->lnode, lhead); +} +static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu) +{ /* put @css and all ancestors on the corresponding updated lists */ while (true) { struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); @@ -179,8 +207,19 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) css = parent; } +} + +static void css_process_update_tree(struct cgroup_subsys *ss, int cpu) +{ + struct llist_head *lhead = ss_lhead_cpu(ss, cpu); + struct llist_node *lnode; + + while ((lnode = llist_del_first_init(lhead))) { + struct css_rstat_cpu *rstatc; - _css_rstat_cpu_unlock(css, cpu, flags, true); + rstatc = container_of(lnode, struct css_rstat_cpu, lnode); + __css_process_update_tree(rstatc->owner, cpu); + } } /** @@ -288,6 +327,8 @@ static struct cgroup_subsys_state *css_rstat_updated_list( flags = _css_rstat_cpu_lock(root, cpu, false); + css_process_update_tree(root->ss, cpu); + /* Return NULL if this subtree is not on-list */ if (!rstatc->updated_next) goto unlock_ret; -- cgit v1.2.3 From 6af89c6ca71742e9227e6f8172a86ce1ee16aa85 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 17 Jun 2025 12:57:24 -0700 Subject: cgroup: remove per-cpu per-subsystem locks The rstat update side used to insert the cgroup whose stats are updated in the update tree and the read side flush the update tree to get the latest uptodate stats. The per-cpu per-subsystem locks were used to synchronize the update and flush side. However now the update side does not access update tree but uses per-cpu lockless lists. So there is no need for locks to synchronize update and flush side. Let's remove them. Suggested-by: JP Kobryn Signed-off-by: Shakeel Butt Tested-by: JP Kobryn Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 7 --- include/trace/events/cgroup.h | 47 -------------------- kernel/cgroup/rstat.c | 100 ++---------------------------------------- 3 files changed, 4 insertions(+), 150 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 04191d99228c..6b93a64115fe 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -375,12 +375,6 @@ struct css_rstat_cpu { * Child cgroups with stat updates on this cpu since the last read * are linked on the parent's ->updated_children through * ->updated_next. updated_children is terminated by its container css. - * - * In addition to being more compact, singly-linked list pointing to - * the css makes it unnecessary for each per-cpu struct to point back - * to the associated css. - * - * Protected by per-cpu css->ss->rstat_ss_cpu_lock. 
*/ struct cgroup_subsys_state *updated_children; struct cgroup_subsys_state *updated_next; /* NULL if not on the list */ @@ -824,7 +818,6 @@ struct cgroup_subsys { unsigned int depends_on; spinlock_t rstat_ss_lock; - raw_spinlock_t __percpu *rstat_ss_cpu_lock; struct llist_head __percpu *lhead; /* lockless update list head */ }; diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h index 7d332387be6c..ba9229af9a34 100644 --- a/include/trace/events/cgroup.h +++ b/include/trace/events/cgroup.h @@ -257,53 +257,6 @@ DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock, TP_ARGS(cgrp, cpu, contended) ); -/* - * Related to per CPU locks: - * global rstat_base_cpu_lock for base stats - * cgroup_subsys::rstat_ss_cpu_lock for subsystem stats - */ -DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended, - - TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), - - TP_ARGS(cgrp, cpu, contended) -); - -DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath, - - TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), - - TP_ARGS(cgrp, cpu, contended) -); - -DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked, - - TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), - - TP_ARGS(cgrp, cpu, contended) -); - -DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath, - - TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), - - TP_ARGS(cgrp, cpu, contended) -); - -DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock, - - TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), - - TP_ARGS(cgrp, cpu, contended) -); - -DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath, - - TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), - - TP_ARGS(cgrp, cpu, contended) -); - #endif /* _TRACE_CGROUP_H */ /* This part must be outside protection */ diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 823a4c7c3fea..c8a48cf83878 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -10,7 +10,6 @@ #include static DEFINE_SPINLOCK(rstat_base_lock); -static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock); static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list); static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); @@ -53,74 +52,6 @@ static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu) return per_cpu_ptr(&rstat_backlog_list, cpu); } -static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu) -{ - if (ss) - return per_cpu_ptr(ss->rstat_ss_cpu_lock, cpu); - - return per_cpu_ptr(&rstat_base_cpu_lock, cpu); -} - -/* - * Helper functions for rstat per CPU locks. - * - * This makes it easier to diagnose locking issues and contention in - * production environments. The parameter @fast_path determine the - * tracepoints being added, allowing us to diagnose "flush" related - * operations without handling high-frequency fast-path "update" events. - */ -static __always_inline -unsigned long _css_rstat_cpu_lock(struct cgroup_subsys_state *css, int cpu, - const bool fast_path) -{ - struct cgroup *cgrp = css->cgroup; - raw_spinlock_t *cpu_lock; - unsigned long flags; - bool contended; - - /* - * The _irqsave() is needed because the locks used for flushing are - * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock - * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT - * kernel. The raw_spinlock_t below disables interrupts on both - * configurations. The _irqsave() ensures that interrupts are always - * disabled and later restored. 
- */ - cpu_lock = ss_rstat_cpu_lock(css->ss, cpu); - contended = !raw_spin_trylock_irqsave(cpu_lock, flags); - if (contended) { - if (fast_path) - trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended); - else - trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended); - - raw_spin_lock_irqsave(cpu_lock, flags); - } - - if (fast_path) - trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended); - else - trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended); - - return flags; -} - -static __always_inline -void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu, - unsigned long flags, const bool fast_path) -{ - struct cgroup *cgrp = css->cgroup; - raw_spinlock_t *cpu_lock; - - if (fast_path) - trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false); - else - trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false); - - cpu_lock = ss_rstat_cpu_lock(css->ss, cpu); - raw_spin_unlock_irqrestore(cpu_lock, flags); -} - /** * css_rstat_updated - keep track of updated rstat_cpu * @css: target cgroup subsystem state @@ -323,15 +254,12 @@ static struct cgroup_subsys_state *css_rstat_updated_list( { struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu); struct cgroup_subsys_state *head = NULL, *parent, *child; - unsigned long flags; - - flags = _css_rstat_cpu_lock(root, cpu, false); css_process_update_tree(root->ss, cpu); /* Return NULL if this subtree is not on-list */ if (!rstatc->updated_next) - goto unlock_ret; + return NULL; /* * Unlink @root from its parent. As the updated_children list is @@ -363,8 +291,7 @@ static struct cgroup_subsys_state *css_rstat_updated_list( rstatc->updated_children = root; if (child != root) head = css_rstat_push_children(head, child, cpu); -unlock_ret: - _css_rstat_cpu_unlock(root, cpu, flags, false); + return head; } @@ -560,34 +487,15 @@ int __init ss_rstat_init(struct cgroup_subsys *ss) { int cpu; -#ifdef CONFIG_SMP - /* - * On uniprocessor machines, arch_spinlock_t is defined as an empty - * struct. Avoid allocating a size of zero by having this block - * excluded in this case. It's acceptable to leave the subsystem locks - * unitialized since the associated lock functions are no-ops in the - * non-smp case. - */ - if (ss) { - ss->rstat_ss_cpu_lock = alloc_percpu(raw_spinlock_t); - if (!ss->rstat_ss_cpu_lock) - return -ENOMEM; - } -#endif - if (ss) { ss->lhead = alloc_percpu(struct llist_head); - if (!ss->lhead) { - free_percpu(ss->rstat_ss_cpu_lock); + if (!ss->lhead) return -ENOMEM; - } } spin_lock_init(ss_rstat_lock(ss)); - for_each_possible_cpu(cpu) { - raw_spin_lock_init(ss_rstat_cpu_lock(ss, cpu)); + for_each_possible_cpu(cpu) init_llist_head(ss_lhead_cpu(ss, cpu)); - } return 0; } -- cgit v1.2.3 From 8dcb0ed834a3ec037c153c7757240ede9a8c9808 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 17 Jun 2025 12:57:25 -0700 Subject: memcg: cgroup: call css_rstat_updated irrespective of in_nmi() css_rstat_updated() is nmi safe, so there is no need to avoid it in in_nmi(), so remove the check. Signed-off-by: Shakeel Butt Tested-by: JP Kobryn Signed-off-by: Tejun Heo --- mm/memcontrol.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 902da8a9c643..d122bfe33e98 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -573,9 +573,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val, if (!val) return; - /* TODO: add to cgroup update tree once it is nmi-safe. 
*/ - if (!in_nmi()) - css_rstat_updated(&memcg->css, cpu); + css_rstat_updated(&memcg->css, cpu); statc_pcpu = memcg->vmstats_percpu; for (; statc_pcpu; statc_pcpu = statc->parent_pcpu) { statc = this_cpu_ptr(statc_pcpu); @@ -2530,7 +2528,8 @@ static inline void account_slab_nmi_safe(struct mem_cgroup *memcg, } else { struct mem_cgroup_per_node *pn = memcg->nodeinfo[pgdat->node_id]; - /* TODO: add to cgroup update tree once it is nmi-safe. */ + /* preemption is disabled in_nmi(). */ + css_rstat_updated(&memcg->css, smp_processor_id()); if (idx == NR_SLAB_RECLAIMABLE_B) atomic_add(nr, &pn->slab_reclaimable); else @@ -2753,7 +2752,8 @@ static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val) if (likely(!in_nmi())) { mod_memcg_state(memcg, MEMCG_KMEM, val); } else { - /* TODO: add to cgroup update tree once it is nmi-safe. */ + /* preemption is disabled in_nmi(). */ + css_rstat_updated(&memcg->css, smp_processor_id()); atomic_add(val, &memcg->kmem_stat); } } -- cgit v1.2.3 From c7d7713e36a6ab4c42e40c952d5ba7a51b1091b0 Mon Sep 17 00:00:00 2001 From: Sebastian Chlad Date: Wed, 2 Jul 2025 15:23:36 +0200 Subject: selftests: cgroup: Allow longer timeout for kmem_dead_cgroups cleanup The test_kmem_dead_cgroups test currently assumes that RCU and memory reclaim will complete within 5 seconds. In some environments this timeout may be insufficient, leading to spurious test failures. This patch introduces a max_time of 20 seconds which is then used in the test. After the 5th second, a debug message is printed to indicate that the cleanup is still ongoing. On the system under test, with 16 CPUs, the original test failed most of the time, with the cleanup usually taking approx. 6 seconds. Further tests were conducted with and without do_rcu_barrier and the results (respectively) are as follows: quantiles 0 0.25 0.5 0.75 1 1 2 3 8 20 (mean = 4.7667) 3 5 8 8 20 (mean = 7.6667) Acked-by: Michal Koutny Signed-off-by: Sebastian Chlad Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_kmem.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index 96693d8772be..63b3c9aad399 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -308,6 +308,7 @@ static int test_kmem_dead_cgroups(const char *root) char *parent; long dead; int i; + int max_time = 20; parent = cg_name(root, "kmem_dead_cgroups_test"); if (!parent) @@ -322,7 +323,7 @@ static int test_kmem_dead_cgroups(const char *root) if (cg_run_in_subcgroups(parent, alloc_dcache, (void *)100, 30)) goto cleanup; - for (i = 0; i < 5; i++) { + for (i = 0; i < max_time; i++) { dead = cg_read_key_long(parent, "cgroup.stat", "nr_dying_descendants "); if (dead == 0) { @@ -334,6 +335,8 @@ static int test_kmem_dead_cgroups(const char *root) * let's wait a bit and repeat. 
*/ sleep(1); + if (i > 5) + printf("Waiting time longer than 5s; wait: %ds (dead: %ld)\n", i, dead); } cleanup: -- cgit v1.2.3 From e07caae73557d144a9237fb977dfee08befa015f Mon Sep 17 00:00:00 2001 From: Sebastian Chlad Date: Wed, 2 Jul 2025 18:40:10 +0200 Subject: selftests: cgroup: Fix missing newline in test_zswap_writeback_one Fixes malformed test output due to missing newline Signed-off-by: Sebastian Chlad Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_zswap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index 40de679248b8..e1f578ca2841 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -338,7 +338,7 @@ static int test_zswap_writeback_one(const char *cgroup, bool wb) return -1; if (wb != !!zswpwb_after) { - ksft_print_msg("zswpwb_after is %ld while wb is %s", + ksft_print_msg("zswpwb_after is %ld while wb is %s\n", zswpwb_after, wb ? "enabled" : "disabled"); return -1; } -- cgit v1.2.3 From dfe25fbaedfc2a07281ed5ff0442270217d25b31 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 4 Jul 2025 11:08:04 -0700 Subject: cgroup: llist: avoid memory tears for llist_node Before the commit 36df6e3dbd7e ("cgroup: make css_rstat_updated nmi safe"), the struct llist_node is expected to be private to the one inserting the node to the lockless list or the one removing the node from the lockless list. After the mentioned commit, the llist_node in the rstat code is per-cpu shared between the stacked contexts i.e. process, softirq, hardirq & nmi. It is possible the compiler may tear the loads or stores of llist_node. Let's avoid that. KCSAN reported the following race: Reported by Kernel Concurrency Sanitizer on: CPU: 60 UID: 0 PID: 5425 ... 6.16.0-rc3-next-20250626 #1 NONE Tainted: [E]=UNSIGNED_MODULE Hardware name: ... ================================================================== ================================================================== BUG: KCSAN: data-race in css_rstat_flush / css_rstat_updated write to 0xffffe8fffe1c85f0 of 8 bytes by task 1061 on cpu 1: css_rstat_flush+0x1b8/0xeb0 __mem_cgroup_flush_stats+0x184/0x190 flush_memcg_stats_dwork+0x22/0x50 process_one_work+0x335/0x630 worker_thread+0x5f1/0x8a0 kthread+0x197/0x340 ret_from_fork+0xd3/0x110 ret_from_fork_asm+0x11/0x20 read to 0xffffe8fffe1c85f0 of 8 bytes by task 3551 on cpu 15: css_rstat_updated+0x81/0x180 mod_memcg_lruvec_state+0x113/0x2d0 __mod_lruvec_state+0x3d/0x50 lru_add+0x21e/0x3f0 folio_batch_move_lru+0x80/0x1b0 __folio_batch_add_and_move+0xd7/0x160 folio_add_lru_vma+0x42/0x50 do_anonymous_page+0x892/0xe90 __handle_mm_fault+0xfaa/0x1520 handle_mm_fault+0xdc/0x350 do_user_addr_fault+0x1dc/0x650 exc_page_fault+0x5c/0x110 asm_exc_page_fault+0x22/0x30 value changed: 0xffffe8fffe18e0d0 -> 0xffffe8fffe1c85f0 $ ./scripts/faddr2line vmlinux css_rstat_flush+0x1b8/0xeb0 css_rstat_flush+0x1b8/0xeb0: init_llist_node at include/linux/llist.h:86 (inlined by) llist_del_first_init at include/linux/llist.h:308 (inlined by) css_process_update_tree at kernel/cgroup/rstat.c:148 (inlined by) css_rstat_updated_list at kernel/cgroup/rstat.c:258 (inlined by) css_rstat_flush at kernel/cgroup/rstat.c:389 $ ./scripts/faddr2line vmlinux css_rstat_updated+0x81/0x180 css_rstat_updated+0x81/0x180: css_rstat_updated at kernel/cgroup/rstat.c:90 (discriminator 1) These are expected race and a simple READ_ONCE/WRITE_ONCE resolves these reports. 
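For illustration, a generic sketch of the marked-access pattern being applied here (not the kernel code itself): plain loads and stores of a shared pointer may be split, refetched, or cached by the compiler, whereas READ_ONCE()/WRITE_ONCE() force single, untorn accesses.

	/* Built against kernel headers; sketch only. */
	#include <linux/compiler.h>

	struct node {
		struct node *next;
	};

	static struct node *shared;	/* touched from task, irq and nmi context */

	static void publish(struct node *n)
	{
		/* a plain "shared = n" could be torn or emitted twice */
		WRITE_ONCE(shared, n);
	}

	static struct node *peek(void)
	{
		/* a single, untorn load the compiler cannot refetch */
		return READ_ONCE(shared);
	}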
However, let's add comments to explain the race and the need for memory barriers if stronger guarantees are needed. More specifically, the rstat updater and the flusher can race and cause a scenario where the stats updater skips adding the css to the lockless list but the flusher might not see those updates done by the skipped updater. This is a benign race and the subsequent flusher will flush those stats; at the moment there aren't any rstat users which are not fine with this kind of race. However, some future user might want a stricter guarantee, so let's add appropriate comments to ease the job of future users. Signed-off-by: Shakeel Butt Reviewed-by: Paul E. McKenney Fixes: 36df6e3dbd7e ("cgroup: make css_rstat_updated nmi safe") Signed-off-by: Tejun Heo --- include/linux/llist.h | 6 +++--- kernel/cgroup/rstat.c | 28 +++++++++++++++++++++++++++- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/include/linux/llist.h b/include/linux/llist.h index 27b17f64bcee..607b2360c938 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h @@ -83,7 +83,7 @@ static inline void init_llist_head(struct llist_head *list) */ static inline void init_llist_node(struct llist_node *node) { - node->next = node; + WRITE_ONCE(node->next, node); } /** @@ -97,7 +97,7 @@ static inline void init_llist_node(struct llist_node *node) */ static inline bool llist_on_list(const struct llist_node *node) { - return node->next != node; + return READ_ONCE(node->next) != node; } /** @@ -220,7 +220,7 @@ static inline bool llist_empty(const struct llist_head *head) static inline struct llist_node *llist_next(struct llist_node *node) { - return node->next; + return READ_ONCE(node->next); } /** diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index c8a48cf83878..981e2f77ad4e 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -60,6 +60,12 @@ static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu) * Atomically inserts the css in the ss's llist for the given cpu. This is * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist * will be processed at the flush time to create the update tree. + * + * NOTE: if the user needs the guarantee that the updater either adds itself to + * the lockless list or the concurrent flusher flushes its updated stats, a + * memory barrier is needed before the call to css_rstat_updated() i.e. a + * barrier after updating the per-cpu stats and before calling + * css_rstat_updated(). */ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) { @@ -86,7 +92,12 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) return; rstatc = css_rstat_cpu(css, cpu); - /* If already on list return. */ + /* + * If already on list return. This check is racy and smp_mb() is needed + * to pair it with the smp_mb() in css_process_update_tree() if the + * guarantee that the updated stats are visible to a concurrent flusher + * is needed. + */ if (llist_on_list(&rstatc->lnode)) return; @@ -148,6 +159,21 @@ static void css_process_update_tree(struct cgroup_subsys *ss, int cpu) while ((lnode = llist_del_first_init(lhead))) { struct css_rstat_cpu *rstatc; + /* + * smp_mb() is needed here (more specifically in between + * init_llist_node() and per-cpu stats flushing) if the + * guarantee is required by a rstat user where either the + * updater should add itself to the lockless list or the + * flusher should flush the stats updated by an updater that + * has observed it is already on the list. 
The + * corresponding barrier pair for this one should be before + * css_rstat_updated() by the user. + * + * For now, there aren't any such users, so we are not adding the + * barrier here, but if such a use-case arises, please add + * smp_mb() here. + */ + rstatc = container_of(lnode, struct css_rstat_cpu, lnode); __css_process_update_tree(rstatc->owner, cpu); } -- cgit v1.2.3 From 954bacce36d976fe472090b55987df66da00c49b Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Fri, 4 Jul 2025 20:08:41 +0900 Subject: selftests/cgroup: fix cpu.max tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current cpu.max tests (both the normal one and the nested one) are broken. They set up cpu.max with a 1000 us quota and the default period (100,000 us). A cpu hog is run for a duration of 1s as per wall clock time. This corresponds to 10 periods, hence an expected usage of 10,000 us. We want the measured usage (as per cpu.stat) to be close to 10,000 us. Previously, this approximate equality test was done by `!values_close(usage_usec, expected_usage_usec, 95)`: if the absolute difference between usage_usec and expected_usage_usec is greater than 95% of their sum, then we pass. And expected_usage_usec was set to 1,000,000 us. Mathematically, this translates to the following being true for pass: |usage - expected_usage| > (usage + expected_usage)*0.95 If usage > expected_usage: usage - expected_usage > (usage + expected_usage)*0.95 0.05*usage > 1.95*expected_usage usage > 39*expected_usage = 39s If usage < expected_usage: expected_usage - usage > (usage + expected_usage)*0.95 0.05*expected_usage > 1.95*usage usage < 0.0256*expected_usage = 25,600 us Combined: pass if usage < 25,600 us or > 39 s, which makes no sense given that all we need is for usage_usec to be close to 10,000 us. Fix this by explicitly calculating the expected usage duration based on the configured quota, default period, and the duration, and compare usage_usec and expected_usage_usec using values_close() with a 10% error margin. Also, use snprintf to get the quota string to write to cpu.max instead of hardcoding the quota, ensuring a single source of truth. Remove the check comparing user_usec and expected_usage_usec, since on running this test modified with printfs, it's seen that user_usec and usage_usec can regularly exceed the theoretical expected_usage_usec: $ sudo ./test_cpu user: 10485, usage: 10485, expected: 10000 ok 1 test_cpucg_max user: 11127, usage: 11127, expected: 10000 ok 2 test_cpucg_max_nested $ sudo ./test_cpu user: 10286, usage: 10286, expected: 10000 ok 1 test_cpucg_max user: 10404, usage: 11271, expected: 10000 ok 2 test_cpucg_max_nested Hence, a values_close() check of usage_usec and expected_usage_usec is sufficient. 
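For concreteness, plugging the test's defaults into the new check (numbers derived from the description above; the sample usage_usec comes from the first run shown):

expected_usage_usec = n_periods*quota + MIN(remainder, quota)
                    = (1,000,000/100,000)*1,000 + 0 = 10,000 us

values_close(usage, expected, 10) passes when |usage - expected| <= 0.10*(usage + expected); e.g. for usage = 10,485: |10,485 - 10,000| = 485 <= 0.10*20,485 ~= 2,049, so that run passes.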
Fixes: a79906570f9646ae17 ("cgroup: Add test_cpucg_max_nested() testcase") Fixes: 889ab8113ef1386c57 ("cgroup: Add test_cpucg_max() testcase") Acked-by: Michal Koutný Signed-off-by: Shashank Balaji Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_cpu.c | 63 +++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_cpu.c b/tools/testing/selftests/cgroup/test_cpu.c index a2b50af8e9ee..2a60e6c41940 100644 --- a/tools/testing/selftests/cgroup/test_cpu.c +++ b/tools/testing/selftests/cgroup/test_cpu.c @@ -2,6 +2,7 @@ #define _GNU_SOURCE #include +#include #include #include #include @@ -645,10 +646,16 @@ test_cpucg_nested_weight_underprovisioned(const char *root) static int test_cpucg_max(const char *root) { int ret = KSFT_FAIL; - long usage_usec, user_usec; - long usage_seconds = 1; - long expected_usage_usec = usage_seconds * USEC_PER_SEC; + long quota_usec = 1000; + long default_period_usec = 100000; /* cpu.max's default period */ + long duration_seconds = 1; + + long duration_usec = duration_seconds * USEC_PER_SEC; + long usage_usec, n_periods, remainder_usec, expected_usage_usec; char *cpucg; + char quota_buf[32]; + + snprintf(quota_buf, sizeof(quota_buf), "%ld", quota_usec); cpucg = cg_name(root, "cpucg_test"); if (!cpucg) @@ -657,13 +664,13 @@ static int test_cpucg_max(const char *root) if (cg_create(cpucg)) goto cleanup; - if (cg_write(cpucg, "cpu.max", "1000")) + if (cg_write(cpucg, "cpu.max", quota_buf)) goto cleanup; struct cpu_hog_func_param param = { .nprocs = 1, .ts = { - .tv_sec = usage_seconds, + .tv_sec = duration_seconds, .tv_nsec = 0, }, .clock_type = CPU_HOG_CLOCK_WALL, @@ -672,14 +679,19 @@ static int test_cpucg_max(const char *root) goto cleanup; usage_usec = cg_read_key_long(cpucg, "cpu.stat", "usage_usec"); - user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec"); - if (user_usec <= 0) + if (usage_usec <= 0) goto cleanup; - if (user_usec >= expected_usage_usec) - goto cleanup; + /* + * The following calculation applies only since + * the cpu hog is set to run as per wall-clock time + */ + n_periods = duration_usec / default_period_usec; + remainder_usec = duration_usec - n_periods * default_period_usec; + expected_usage_usec + = n_periods * quota_usec + MIN(remainder_usec, quota_usec); - if (values_close(usage_usec, expected_usage_usec, 95)) + if (!values_close(usage_usec, expected_usage_usec, 10)) goto cleanup; ret = KSFT_PASS; @@ -698,10 +710,16 @@ cleanup: static int test_cpucg_max_nested(const char *root) { int ret = KSFT_FAIL; - long usage_usec, user_usec; - long usage_seconds = 1; - long expected_usage_usec = usage_seconds * USEC_PER_SEC; + long quota_usec = 1000; + long default_period_usec = 100000; /* cpu.max's default period */ + long duration_seconds = 1; + + long duration_usec = duration_seconds * USEC_PER_SEC; + long usage_usec, n_periods, remainder_usec, expected_usage_usec; char *parent, *child; + char quota_buf[32]; + + snprintf(quota_buf, sizeof(quota_buf), "%ld", quota_usec); parent = cg_name(root, "cpucg_parent"); child = cg_name(parent, "cpucg_child"); @@ -717,13 +735,13 @@ static int test_cpucg_max_nested(const char *root) if (cg_create(child)) goto cleanup; - if (cg_write(parent, "cpu.max", "1000")) + if (cg_write(parent, "cpu.max", quota_buf)) goto cleanup; struct cpu_hog_func_param param = { .nprocs = 1, .ts = { - .tv_sec = usage_seconds, + .tv_sec = duration_seconds, .tv_nsec = 0, }, .clock_type = CPU_HOG_CLOCK_WALL, @@ -732,14 +750,19 @@ static int 
test_cpucg_max_nested(const char *root) goto cleanup; usage_usec = cg_read_key_long(child, "cpu.stat", "usage_usec"); - user_usec = cg_read_key_long(child, "cpu.stat", "user_usec"); - if (user_usec <= 0) + if (usage_usec <= 0) goto cleanup; - if (user_usec >= expected_usage_usec) - goto cleanup; + /* + * The following calculation applies only since + * the cpu hog is set to run as per wall-clock time + */ + n_periods = duration_usec / default_period_usec; + remainder_usec = duration_usec - n_periods * default_period_usec; + expected_usage_usec + = n_periods * quota_usec + MIN(remainder_usec, quota_usec); - if (values_close(usage_usec, expected_usage_usec, 95)) + if (!values_close(usage_usec, expected_usage_usec, 10)) goto cleanup; ret = KSFT_PASS; -- cgit v1.2.3 From 646faf36d7271c597497ca547a59912fcab49be9 Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Fri, 18 Jul 2025 11:18:54 +0200 Subject: cgroup: Add compatibility option for content of /proc/cgroups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit /proc/cgroups lists only v1 controllers by default, however, this is only enforced since the commit af000ce85293b ("cgroup: Do not report unavailable v1 controllers in /proc/cgroups") and there is software in the wild that uses content of /proc/cgroups to decide on availability of v2 (sic) controllers. Add a boottime param that can bring back the previous behavior for setups where the check in the software cannot be changed and it causes e.g. unintended OOMs. Also, this patch takes out cgrp_v1_visible from cgroup1_subsys_absent() guard since it's only important to check which hierarchy (v1 vs v2) the subsys is attached to. This has no effect on the printed message but the code is cleaner since cgrp_v1_visible is really about mounted hierarchies, not the content of /proc/cgroups. Link: https://lore.kernel.org/r/b26b60b7d0d2a5ecfd2f3c45f95f32922ed24686.camel@decadent.org.uk Fixes: af000ce85293b ("cgroup: Do not report unavailable v1 controllers in /proc/cgroups") Fixes: a0ab1453226d8 ("cgroup: Print message when /proc/cgroups is read on v2-only system") Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- Documentation/admin-guide/kernel-parameters.txt | 8 ++++++++ kernel/cgroup/cgroup-v1.c | 14 ++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a3ea40b22fb9..60fd3f84272b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -630,6 +630,14 @@ named mounts. Specifying both "all" and "named" disables all v1 hierarchies. + cgroup_v1_proc= [KNL] Show also missing controllers in /proc/cgroups + Format: { "true" | "false" } + /proc/cgroups lists only v1 controllers by default. + This compatibility option enables listing also v2 + controllers (whose v1 code is not compiled!), so that + semi-legacy software can check this file to decide + about usage of v2 (sic) controllers. + cgroup_favordynmods= [KNL] Enable or Disable favordynmods. Format: { "true" | "false" } Defaults to the value of CONFIG_CGROUP_FAVOR_DYNMODS. 
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index fa24c032ed6f..2a4a387f867a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -32,6 +32,9 @@ static u16 cgroup_no_v1_mask; /* disable named v1 mounts */ static bool cgroup_no_v1_named; +/* Show unavailable controllers in /proc/cgroups */ +static bool proc_show_all; + /* * pidlist destructions need to be flushed on cgroup destruction. Use a * separate workqueue as flush domain. @@ -683,10 +686,11 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) */ for_each_subsys(ss, i) { - if (cgroup1_subsys_absent(ss)) - continue; cgrp_v1_visible |= ss->root != &cgrp_dfl_root; + if (!proc_show_all && cgroup1_subsys_absent(ss)) + continue; + seq_printf(m, "%s\t%d\t%d\t%d\n", ss->legacy_name, ss->root->hierarchy_id, atomic_read(&ss->root->nr_cgrps), @@ -1359,3 +1363,9 @@ static int __init cgroup_no_v1(char *str) return 1; } __setup("cgroup_no_v1=", cgroup_no_v1); + +static int __init cgroup_v1_proc(char *str) +{ + return (kstrtobool(str, &proc_show_all) == 0); +} +__setup("cgroup_v1_proc=", cgroup_v1_proc); -- cgit v1.2.3
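As a usage sketch for the new parameter (the output below is illustrative; the exact controller set and counts depend on the kernel configuration): booting a v2-only system with cgroup_v1_proc=true on the kernel command line makes /proc/cgroups list the v2-attached controllers again, with hierarchy ID 0:

	# cat /proc/cgroups
	#subsys_name	hierarchy	num_cgroups	enabled
	cpuset	0	1	1
	cpu	0	1	1
	memory	0	1	1
	pids	0	1	1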