From edfc4c8a1edffa6849e19ffade1be8dd824989d0 Mon Sep 17 00:00:00 2001
From: Michal Koutný
Date: Tue, 3 Jun 2025 17:45:27 +0200
Subject: cgroup: Drop sock_cgroup_classid() dummy implementation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The semantics of returning 0 are unclear when !CONFIG_CGROUP_NET_CLASSID.
Since there are no callers of sock_cgroup_classid() with that config
anymore, we can drop the helper entirely and force all (future) callers
to handle the !CONFIG_CGROUP_NET_CLASSID case themselves.

Signed-off-by: Michal Koutný
Link: https://lore.kernel.org/r/Z_52r_v9-3JUzDT7@calendula/
Acked-by: Tejun Heo
Reviewed-by: Waiman Long
Signed-off-by: Tejun Heo
---
 include/linux/cgroup-defs.h | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index e61687d5e496..cd7f093e34cd 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -898,14 +898,12 @@ static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd)
 #endif
 }
 
+#ifdef CONFIG_CGROUP_NET_CLASSID
 static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd)
 {
-#ifdef CONFIG_CGROUP_NET_CLASSID
 	return READ_ONCE(skcd->classid);
-#else
-	return 0;
-#endif
 }
+#endif
 
 static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
 					   u16 prioidx)
@@ -915,13 +913,13 @@ static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
 #endif
 }
 
+#ifdef CONFIG_CGROUP_NET_CLASSID
 static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd,
 					   u32 classid)
 {
-#ifdef CONFIG_CGROUP_NET_CLASSID
 	WRITE_ONCE(skcd->classid, classid);
-#endif
 }
+#endif
 
 #else	/* CONFIG_SOCK_CGROUP_DATA */
 
-- 
cgit v1.2.3


From 1257b8786ac689a2ce5fe3e1741c65038035adc6 Mon Sep 17 00:00:00 2001
From: Shakeel Butt
Date: Tue, 17 Jun 2025 12:57:22 -0700
Subject: cgroup: support to enable nmi-safe css_rstat_updated

Add the necessary infrastructure to enable nmi-safe execution of
css_rstat_updated().

Currently css_rstat_updated() takes a per-cpu per-css raw spinlock to add
the given css to the per-cpu per-css update tree. However, the kernel
cannot spin in nmi context, so we need to remove the spinning on that raw
spinlock in css_rstat_updated().

To support a lockless css_rstat_updated(), let's add the necessary data
structures to the css and ss structures.
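
For context on the direction this series takes, the lock-free recording of
an update can be sketched roughly as follows. This is an illustrative sketch
only, not code from the series: the demo_* names are hypothetical, and the
real css_rstat_updated() changes land in the hunks below. The key point is
that llist_add() is a try_cmpxchg() loop, so it never spins and can be called
from process, softirq, hardirq and nmi context alike.

#include <linux/llist.h>
#include <linux/percpu.h>

/* Hypothetical stand-in for the per-cpu rstat bookkeeping. */
struct demo_rstat_cpu {
	struct llist_node lnode;	/* node on the per-cpu update list */
};

static DEFINE_PER_CPU(struct llist_head, demo_update_list);

static void demo_record_update(struct demo_rstat_cpu *rstatc, int cpu)
{
	/* Already queued on this cpu's list; nothing more to record. */
	if (llist_on_list(&rstatc->lnode))
		return;

	/* One atomic push; no lock is taken and nothing ever spins. */
	llist_add(&rstatc->lnode, per_cpu_ptr(&demo_update_list, cpu));
}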
Signed-off-by: Shakeel Butt
Tested-by: JP Kobryn
Signed-off-by: Tejun Heo
---
 include/linux/cgroup-defs.h |  4 ++++
 kernel/cgroup/rstat.c       | 23 +++++++++++++++++++++--
 2 files changed, 25 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index cd7f093e34cd..04191d99228c 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -384,6 +384,9 @@ struct css_rstat_cpu {
 	 */
 	struct cgroup_subsys_state *updated_children;
 	struct cgroup_subsys_state *updated_next;	/* NULL if not on the list */
+
+	struct llist_node lnode;		/* lockless list for update */
+	struct cgroup_subsys_state *owner;	/* back pointer */
 };
 
 /*
@@ -822,6 +825,7 @@ struct cgroup_subsys {
 
 	spinlock_t rstat_ss_lock;
 	raw_spinlock_t __percpu *rstat_ss_cpu_lock;
+	struct llist_head __percpu *lhead;	/* lockless update list head */
 };
 
 extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index ce4752ab9e09..bfa6366d2325 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -11,6 +11,7 @@
 
 static DEFINE_SPINLOCK(rstat_base_lock);
 static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock);
+static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);
 
 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
 
@@ -45,6 +46,13 @@ static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
 	return &rstat_base_lock;
 }
 
+static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
+{
+	if (ss)
+		return per_cpu_ptr(ss->lhead, cpu);
+	return per_cpu_ptr(&rstat_backlog_list, cpu);
+}
+
 static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu)
 {
 	if (ss)
@@ -456,7 +464,8 @@ int css_rstat_init(struct cgroup_subsys_state *css)
 	for_each_possible_cpu(cpu) {
 		struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
 
-		rstatc->updated_children = css;
+		rstatc->owner = rstatc->updated_children = css;
+		init_llist_node(&rstatc->lnode);
 
 		if (is_self) {
 			struct cgroup_rstat_base_cpu *rstatbc;
@@ -525,9 +534,19 @@ int __init ss_rstat_init(struct cgroup_subsys *ss)
 	}
 #endif
 
+	if (ss) {
+		ss->lhead = alloc_percpu(struct llist_head);
+		if (!ss->lhead) {
+			free_percpu(ss->rstat_ss_cpu_lock);
+			return -ENOMEM;
+		}
+	}
+
 	spin_lock_init(ss_rstat_lock(ss));
-	for_each_possible_cpu(cpu)
+	for_each_possible_cpu(cpu) {
 		raw_spin_lock_init(ss_rstat_cpu_lock(ss, cpu));
+		init_llist_head(ss_lhead_cpu(ss, cpu));
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From 6af89c6ca71742e9227e6f8172a86ce1ee16aa85 Mon Sep 17 00:00:00 2001
From: Shakeel Butt
Date: Tue, 17 Jun 2025 12:57:24 -0700
Subject: cgroup: remove per-cpu per-subsystem locks

The rstat update side used to insert the cgroup whose stats were updated
into the update tree, and the read side flushed the update tree to get
the latest stats. The per-cpu per-subsystem locks were used to
synchronize the update and flush sides. However, the update side no
longer accesses the update tree directly; it uses per-cpu lockless lists
instead, so the locks are no longer needed to synchronize the update and
flush sides. Let's remove them.
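
The flush side is what makes these per-cpu locks removable: it simply drains
the per-cpu lockless list and rebuilds the update tree from the drained
nodes. The sketch below is illustrative only, with hypothetical demo_* names
and the tree insertion elided; the real counterpart is
css_process_update_tree() in kernel/cgroup/rstat.c.

#include <linux/container_of.h>
#include <linux/llist.h>
#include <linux/printk.h>

/* Hypothetical stand-in for the per-cpu rstat bookkeeping. */
struct demo_rstat_cpu {
	struct llist_node lnode;	/* node on the per-cpu update list */
	void *owner;			/* back pointer to the owning css */
};

static void demo_process_updates(struct llist_head *lhead)
{
	struct llist_node *lnode;

	/* Pop one node at a time, re-initializing it for future updates. */
	while ((lnode = llist_del_first_init(lhead))) {
		struct demo_rstat_cpu *rstatc;

		rstatc = container_of(lnode, struct demo_rstat_cpu, lnode);
		/* ... insert rstatc->owner into the per-cpu update tree ... */
		pr_debug("queueing owner %p for flush\n", rstatc->owner);
	}
}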
Suggested-by: JP Kobryn
Signed-off-by: Shakeel Butt
Tested-by: JP Kobryn
Signed-off-by: Tejun Heo
---
 include/linux/cgroup-defs.h   |   7 ---
 include/trace/events/cgroup.h |  47 --------------------
 kernel/cgroup/rstat.c         | 100 ++----------------------------------------
 3 files changed, 4 insertions(+), 150 deletions(-)

(limited to 'include')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 04191d99228c..6b93a64115fe 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -375,12 +375,6 @@ struct css_rstat_cpu {
 	 * Child cgroups with stat updates on this cpu since the last read
 	 * are linked on the parent's ->updated_children through
 	 * ->updated_next. updated_children is terminated by its container css.
-	 *
-	 * In addition to being more compact, singly-linked list pointing to
-	 * the css makes it unnecessary for each per-cpu struct to point back
-	 * to the associated css.
-	 *
-	 * Protected by per-cpu css->ss->rstat_ss_cpu_lock.
 	 */
 	struct cgroup_subsys_state *updated_children;
 	struct cgroup_subsys_state *updated_next;	/* NULL if not on the list */
@@ -824,7 +818,6 @@ struct cgroup_subsys {
 	unsigned int depends_on;
 
 	spinlock_t rstat_ss_lock;
-	raw_spinlock_t __percpu *rstat_ss_cpu_lock;
 	struct llist_head __percpu *lhead;	/* lockless update list head */
 };
 
diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
index 7d332387be6c..ba9229af9a34 100644
--- a/include/trace/events/cgroup.h
+++ b/include/trace/events/cgroup.h
@@ -257,53 +257,6 @@ DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
 	TP_ARGS(cgrp, cpu, contended)
 );
 
-/*
- * Related to per CPU locks:
- *   global rstat_base_cpu_lock for base stats
- *   cgroup_subsys::rstat_ss_cpu_lock for subsystem stats
- */
-DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended,
-
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
-
-	TP_ARGS(cgrp, cpu, contended)
-);
-
-DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath,
-
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
-
-	TP_ARGS(cgrp, cpu, contended)
-);
-
-DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked,
-
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
-
-	TP_ARGS(cgrp, cpu, contended)
-);
-
-DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath,
-
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
-
-	TP_ARGS(cgrp, cpu, contended)
-);
-
-DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock,
-
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
-
-	TP_ARGS(cgrp, cpu, contended)
-);
-
-DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
-
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
-
-	TP_ARGS(cgrp, cpu, contended)
-);
-
 #endif /* _TRACE_CGROUP_H */
 
 /* This part must be outside protection */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 823a4c7c3fea..c8a48cf83878 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -10,7 +10,6 @@
 #include <trace/events/cgroup.h>
 
 static DEFINE_SPINLOCK(rstat_base_lock);
-static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock);
 static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);
 
 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
@@ -53,74 +52,6 @@ static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
 	return per_cpu_ptr(&rstat_backlog_list, cpu);
 }
 
-static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu)
-{
-	if (ss)
-		return per_cpu_ptr(ss->rstat_ss_cpu_lock, cpu);
-
-	return per_cpu_ptr(&rstat_base_cpu_lock, cpu);
-}
-
-/*
- * Helper functions for rstat per CPU locks.
- *
- * This makes it easier to diagnose locking issues and contention in
- * production environments. The parameter @fast_path determine the
- * tracepoints being added, allowing us to diagnose "flush" related
- * operations without handling high-frequency fast-path "update" events.
- */
-static __always_inline
-unsigned long _css_rstat_cpu_lock(struct cgroup_subsys_state *css, int cpu,
-		const bool fast_path)
-{
-	struct cgroup *cgrp = css->cgroup;
-	raw_spinlock_t *cpu_lock;
-	unsigned long flags;
-	bool contended;
-
-	/*
-	 * The _irqsave() is needed because the locks used for flushing are
-	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock
-	 * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT
-	 * kernel. The raw_spinlock_t below disables interrupts on both
-	 * configurations. The _irqsave() ensures that interrupts are always
-	 * disabled and later restored.
-	 */
-	cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
-	contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
-	if (contended) {
-		if (fast_path)
-			trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
-		else
-			trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
-
-		raw_spin_lock_irqsave(cpu_lock, flags);
-	}
-
-	if (fast_path)
-		trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
-	else
-		trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
-
-	return flags;
-}
-
-static __always_inline
-void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu,
-		unsigned long flags, const bool fast_path)
-{
-	struct cgroup *cgrp = css->cgroup;
-	raw_spinlock_t *cpu_lock;
-
-	if (fast_path)
-		trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
-	else
-		trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
-
-	cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
-	raw_spin_unlock_irqrestore(cpu_lock, flags);
-}
-
 /**
  * css_rstat_updated - keep track of updated rstat_cpu
  * @css: target cgroup subsystem state
@@ -323,15 +254,12 @@ static struct cgroup_subsys_state *css_rstat_updated_list(
 {
 	struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu);
 	struct cgroup_subsys_state *head = NULL, *parent, *child;
-	unsigned long flags;
-
-	flags = _css_rstat_cpu_lock(root, cpu, false);
 
 	css_process_update_tree(root->ss, cpu);
 
 	/* Return NULL if this subtree is not on-list */
 	if (!rstatc->updated_next)
-		goto unlock_ret;
+		return NULL;
 
 	/*
 	 * Unlink @root from its parent. As the updated_children list is
@@ -363,8 +291,7 @@ static struct cgroup_subsys_state *css_rstat_updated_list(
 	rstatc->updated_children = root;
 	if (child != root)
 		head = css_rstat_push_children(head, child, cpu);
-unlock_ret:
-	_css_rstat_cpu_unlock(root, cpu, flags, false);
+
 	return head;
 }
 
@@ -560,34 +487,15 @@ int __init ss_rstat_init(struct cgroup_subsys *ss)
 {
 	int cpu;
 
-#ifdef CONFIG_SMP
-	/*
-	 * On uniprocessor machines, arch_spinlock_t is defined as an empty
-	 * struct. Avoid allocating a size of zero by having this block
-	 * excluded in this case. It's acceptable to leave the subsystem locks
-	 * unitialized since the associated lock functions are no-ops in the
-	 * non-smp case.
-	 */
-	if (ss) {
-		ss->rstat_ss_cpu_lock = alloc_percpu(raw_spinlock_t);
-		if (!ss->rstat_ss_cpu_lock)
-			return -ENOMEM;
-	}
-#endif
-
 	if (ss) {
 		ss->lhead = alloc_percpu(struct llist_head);
-		if (!ss->lhead) {
-			free_percpu(ss->rstat_ss_cpu_lock);
+		if (!ss->lhead)
 			return -ENOMEM;
-		}
 	}
 
 	spin_lock_init(ss_rstat_lock(ss));
-	for_each_possible_cpu(cpu) {
-		raw_spin_lock_init(ss_rstat_cpu_lock(ss, cpu));
+	for_each_possible_cpu(cpu)
 		init_llist_head(ss_lhead_cpu(ss, cpu));
-	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From dfe25fbaedfc2a07281ed5ff0442270217d25b31 Mon Sep 17 00:00:00 2001
From: Shakeel Butt
Date: Fri, 4 Jul 2025 11:08:04 -0700
Subject: cgroup: llist: avoid memory tears for llist_node

Before commit 36df6e3dbd7e ("cgroup: make css_rstat_updated nmi safe"),
the struct llist_node was expected to be private to the one inserting
the node into the lockless list or the one removing the node from the
list. After that commit, the llist_node in the rstat code is per-cpu
shared between the stacked contexts, i.e. process, softirq, hardirq and
nmi. It is possible that the compiler may tear the loads or stores of
llist_node. Let's avoid that.

KCSAN reported the following race:

Reported by Kernel Concurrency Sanitizer on:
CPU: 60 UID: 0 PID: 5425 ... 6.16.0-rc3-next-20250626 #1 NONE Tainted: [E]=UNSIGNED_MODULE
Hardware name: ...
==================================================================
==================================================================
BUG: KCSAN: data-race in css_rstat_flush / css_rstat_updated

write to 0xffffe8fffe1c85f0 of 8 bytes by task 1061 on cpu 1:
 css_rstat_flush+0x1b8/0xeb0
 __mem_cgroup_flush_stats+0x184/0x190
 flush_memcg_stats_dwork+0x22/0x50
 process_one_work+0x335/0x630
 worker_thread+0x5f1/0x8a0
 kthread+0x197/0x340
 ret_from_fork+0xd3/0x110
 ret_from_fork_asm+0x11/0x20

read to 0xffffe8fffe1c85f0 of 8 bytes by task 3551 on cpu 15:
 css_rstat_updated+0x81/0x180
 mod_memcg_lruvec_state+0x113/0x2d0
 __mod_lruvec_state+0x3d/0x50
 lru_add+0x21e/0x3f0
 folio_batch_move_lru+0x80/0x1b0
 __folio_batch_add_and_move+0xd7/0x160
 folio_add_lru_vma+0x42/0x50
 do_anonymous_page+0x892/0xe90
 __handle_mm_fault+0xfaa/0x1520
 handle_mm_fault+0xdc/0x350
 do_user_addr_fault+0x1dc/0x650
 exc_page_fault+0x5c/0x110
 asm_exc_page_fault+0x22/0x30

value changed: 0xffffe8fffe18e0d0 -> 0xffffe8fffe1c85f0

$ ./scripts/faddr2line vmlinux css_rstat_flush+0x1b8/0xeb0
css_rstat_flush+0x1b8/0xeb0:
init_llist_node at include/linux/llist.h:86
(inlined by) llist_del_first_init at include/linux/llist.h:308
(inlined by) css_process_update_tree at kernel/cgroup/rstat.c:148
(inlined by) css_rstat_updated_list at kernel/cgroup/rstat.c:258
(inlined by) css_rstat_flush at kernel/cgroup/rstat.c:389

$ ./scripts/faddr2line vmlinux css_rstat_updated+0x81/0x180
css_rstat_updated+0x81/0x180:
css_rstat_updated at kernel/cgroup/rstat.c:90 (discriminator 1)

These are expected races, and simple READ_ONCE()/WRITE_ONCE() accesses
resolve the reports. In addition, let's add comments to explain the race
and the need for memory barriers if stronger guarantees are required.
More specifically, the rstat updater and the flusher can race such that
the updater skips adding the css to the lockless list while the flusher
does not see the updates done by that skipped updater. This is a benign
race: a subsequent flush will pick up those stats, and at the moment
there are no rstat users that cannot tolerate this kind of race.
However, a future user might want a stricter guarantee, so let's add
appropriate comments to ease that job.

Signed-off-by: Shakeel Butt
Reviewed-by: Paul E. McKenney
Fixes: 36df6e3dbd7e ("cgroup: make css_rstat_updated nmi safe")
Signed-off-by: Tejun Heo
---
 include/linux/llist.h |  6 +++---
 kernel/cgroup/rstat.c | 28 +++++++++++++++++++++++++++-
 2 files changed, 30 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/llist.h b/include/linux/llist.h
index 27b17f64bcee..607b2360c938 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -83,7 +83,7 @@ static inline void init_llist_head(struct llist_head *list)
  */
 static inline void init_llist_node(struct llist_node *node)
 {
-	node->next = node;
+	WRITE_ONCE(node->next, node);
 }
 
 /**
@@ -97,7 +97,7 @@ static inline void init_llist_node(struct llist_node *node)
  */
 static inline bool llist_on_list(const struct llist_node *node)
 {
-	return node->next != node;
+	return READ_ONCE(node->next) != node;
 }
 
 /**
@@ -220,7 +220,7 @@ static inline bool llist_empty(const struct llist_head *head)
 
 static inline struct llist_node *llist_next(struct llist_node *node)
 {
-	return node->next;
+	return READ_ONCE(node->next);
 }
 
 /**
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index c8a48cf83878..981e2f77ad4e 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -60,6 +60,12 @@ static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
  * Atomically inserts the css in the ss's llist for the given cpu. This is
  * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist
  * will be processed at the flush time to create the update tree.
+ *
+ * NOTE: if the user needs the guarantee that the updater either add itself in
+ * the lockless list or the concurrent flusher flushes its updated stats, a
+ * memory barrier is needed before the call to css_rstat_updated() i.e. a
+ * barrier after updating the per-cpu stats and before calling
+ * css_rstat_updated().
  */
 __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
 {
@@ -86,7 +92,12 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
 		return;
 
 	rstatc = css_rstat_cpu(css, cpu);
-	/* If already on list return. */
+	/*
+	 * If already on list return. This check is racy and smp_mb() is needed
+	 * to pair it with the smp_mb() in css_process_update_tree() if the
+	 * guarantee that the updated stats are visible to concurrent flusher is
+	 * needed.
+	 */
 	if (llist_on_list(&rstatc->lnode))
 		return;
 
@@ -148,6 +159,21 @@ static void css_process_update_tree(struct cgroup_subsys *ss, int cpu)
 	while ((lnode = llist_del_first_init(lhead))) {
 		struct css_rstat_cpu *rstatc;
 
+		/*
+		 * smp_mb() is needed here (more specifically in between
+		 * init_llist_node() and per-cpu stats flushing) if the
+		 * guarantee is required by a rstat user where either the
+		 * updater should add itself on the lockless list or the
+		 * flusher flush the stats updated by the updater who have
+		 * observed that they are already on the list. The
+		 * corresponding barrier pair for this one should be before
+		 * css_rstat_updated() by the user.
+		 *
+		 * For now, there aren't any such users, so not adding the
+		 * barrier here but if such a use-case arises, please add
+		 * smp_mb() here.
+		 */
+
 		rstatc = container_of(lnode, struct css_rstat_cpu, lnode);
 		__css_process_update_tree(rstatc->owner, cpu);
 	}
-- 
cgit v1.2.3
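
To summarize the idiom this last patch relies on, the sketch below mirrors
the llist.h hunks above with hypothetical demo_* names (it is not the llist
API itself): a field shared between stacked contexts is only ever accessed
through READ_ONCE()/WRITE_ONCE(), which prevents compiler tearing but
deliberately leaves the check racy; a caller that additionally needs the
"update visible to the concurrent flusher" guarantee has to add the smp_mb()
pairing described in the comments added by this patch.

#include <linux/compiler.h>
#include <linux/types.h>

/* Hypothetical miniature of a lockless-list node. */
struct demo_node {
	struct demo_node *next;
};

/* A self-pointing ->next means "not on any list". */
static inline void demo_init_node(struct demo_node *node)
{
	WRITE_ONCE(node->next, node);
}

static inline bool demo_on_list(const struct demo_node *node)
{
	/*
	 * Racy by design: a concurrent flusher may rewrite ->next at any
	 * time. READ_ONCE() only guarantees a single, untorn load; callers
	 * that also need their stat updates to be seen by that concurrent
	 * flusher must issue smp_mb() before this check, paired with an
	 * smp_mb() on the flush side.
	 */
	return READ_ONCE(node->next) != node;
}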