From cac5cefbade90ff0bb0b393d301fa3b5234cf056 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:01 +0200 Subject: sched/smp: Make SMP unconditional Simplify the scheduler by making CONFIG_SMP=y primitives and data structures unconditional. Introduce transitory wrappers for functionality not yet converted to SMP. Note that this patch is pretty large, because there's no clear separation between various aspects of the SMP scheduler, it's basically a huge block of #ifdef CONFIG_SMP. A fair amount of it has to be switched on for it to boot and work on UP systems. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-21-mingo@kernel.org --- include/linux/sched/deadline.h | 4 ---- include/linux/sched/idle.h | 4 ---- include/linux/sched/nohz.h | 4 ++-- include/linux/sched/topology.h | 32 -------------------------------- 4 files changed, 2 insertions(+), 42 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index f9aabbc9d22e..c40115d4e34d 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -29,15 +29,11 @@ static inline bool dl_time_before(u64 a, u64 b) return (s64)(a - b) < 0; } -#ifdef CONFIG_SMP - struct root_domain; extern void dl_add_task_root_domain(struct task_struct *p); extern void dl_clear_root_domain(struct root_domain *rd); extern void dl_clear_root_domain_cpu(int cpu); -#endif /* CONFIG_SMP */ - extern u64 dl_cookie; extern bool dl_bw_visited(int cpu, u64 cookie); diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h index 439f6029d3b9..8465ff1f20d1 100644 --- a/include/linux/sched/idle.h +++ b/include/linux/sched/idle.h @@ -11,11 +11,7 @@ enum cpu_idle_type { CPU_MAX_IDLE_TYPES }; -#ifdef CONFIG_SMP extern void wake_up_if_idle(int cpu); -#else -static inline void wake_up_if_idle(int cpu) { } -#endif /* * Idle thread specific functions to determine the need_resched diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h index 6d67e9a5af6b..0db7f67935fe 100644 --- a/include/linux/sched/nohz.h +++ b/include/linux/sched/nohz.h @@ -6,7 +6,7 @@ * This is the interface between the scheduler and nohz/dynticks: */ -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#ifdef CONFIG_NO_HZ_COMMON extern void nohz_balance_enter_idle(int cpu); extern int get_nohz_timer_target(void); #else @@ -23,7 +23,7 @@ static inline void calc_load_nohz_remote(struct rq *rq) { } static inline void calc_load_nohz_stop(void) { } #endif /* CONFIG_NO_HZ_COMMON */ -#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) +#ifdef CONFIG_NO_HZ_COMMON extern void wake_up_nohz_cpu(int cpu); #else static inline void wake_up_nohz_cpu(int cpu) { } diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 198bb5cc1774..e54e7fa76ba6 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -9,7 +9,6 @@ /* * sched-domains (multiprocessor balancing) declarations: */ -#ifdef CONFIG_SMP /* Generate SD flag indexes */ #define SD_FLAG(name, mflags) __##name, @@ -200,37 +199,6 @@ extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio); # define SD_INIT_NAME(type) .name = #type -#else /* CONFIG_SMP */ - -struct sched_domain_attr; - -static inline void -partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) -{ -} - -static inline bool cpus_equal_capacity(int this_cpu, int that_cpu) -{ - return true; -} - -static inline bool cpus_share_cache(int this_cpu, int that_cpu) -{ - return true; -} - -static inline bool cpus_share_resources(int this_cpu, int that_cpu) -{ - return true; -} - -static inline void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) -{ -} - -#endif /* !CONFIG_SMP */ - #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) extern void rebuild_sched_domains_energy(void); #else -- cgit v1.2.3 From 1f25730e5a780b33f78e3ea23e64d3f75e0b2042 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:07 +0200 Subject: sched/smp: Use the SMP version of sched_exec() Simplify the scheduler making CONFIG_SMP=y sched_exec() code unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-27-mingo@kernel.org --- include/linux/sched/task.h | 4 ---- kernel/sched/core.c | 4 ---- 2 files changed, 8 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ca1db4b92c32..c517dbc242f7 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -109,11 +109,7 @@ int kernel_wait(pid_t pid, int *stat); extern void free_task(struct task_struct *tsk); /* sched_exec is called by processes performing an exec */ -#ifdef CONFIG_SMP extern void sched_exec(void); -#else -#define sched_exec() {} -#endif static inline struct task_struct *get_task_struct(struct task_struct *t) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c108b5c2e115..fa89006e05e9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5419,8 +5419,6 @@ unsigned int nr_iowait(void) return sum; } -#ifdef CONFIG_SMP - /* * sched_exec - execve() is a valuable balancing opportunity, because at * this point the task has the smallest effective memory and cache footprint. @@ -5444,8 +5442,6 @@ void sched_exec(void) stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); } -#endif /* CONFIG_SMP */ - DEFINE_PER_CPU(struct kernel_stat, kstat); DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); -- cgit v1.2.3 From 6e6558a6bc418f1478c5dc8609d03805364e0cb9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Jun 2025 15:33:10 -1000 Subject: sched_ext, sched/core: Factor out struct scx_task_group More sched_ext fields will be added to struct task_group. In preparation, factor out sched_ext fields into struct scx_task_group to reduce clutter in the common header. No functional changes. Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 8 ++++++++ kernel/sched/ext.c | 32 ++++++++++++++++---------------- kernel/sched/sched.h | 5 +---- 3 files changed, 25 insertions(+), 20 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index f7545430a548..eda89acdb7ab 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -214,4 +214,12 @@ static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} static inline void scx_softlockup(u32 dur_s) {} #endif /* CONFIG_SCHED_CLASS_EXT */ + +struct scx_task_group { +#ifdef CONFIG_EXT_GROUP_SCHED + u32 flags; /* SCX_TG_* */ + u32 weight; +#endif +}; + #endif /* _LINUX_SCHED_EXT_H */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4db51e708f86..6732e50e0679 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4058,7 +4058,7 @@ static bool scx_cgroup_enabled; void scx_tg_init(struct task_group *tg) { - tg->scx_weight = CGROUP_WEIGHT_DFL; + tg->scx.weight = CGROUP_WEIGHT_DFL; } int scx_tg_online(struct task_group *tg) @@ -4066,14 +4066,14 @@ int scx_tg_online(struct task_group *tg) struct scx_sched *sch = scx_root; int ret = 0; - WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); + WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED)); percpu_down_read(&scx_cgroup_rwsem); if (scx_cgroup_enabled) { if (SCX_HAS_OP(sch, cgroup_init)) { struct scx_cgroup_init_args args = - { .weight = tg->scx_weight }; + { .weight = tg->scx.weight }; ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL, tg->css.cgroup, &args); @@ -4081,9 +4081,9 @@ int scx_tg_online(struct task_group *tg) ret = ops_sanitize_err(sch, "cgroup_init", ret); } if (ret == 0) - tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; + tg->scx.flags |= SCX_TG_ONLINE | SCX_TG_INITED; } else { - tg->scx_flags |= SCX_TG_ONLINE; + tg->scx.flags |= SCX_TG_ONLINE; } percpu_up_read(&scx_cgroup_rwsem); @@ -4094,15 +4094,15 @@ void scx_tg_offline(struct task_group *tg) { struct scx_sched *sch = scx_root; - WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); + WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE)); percpu_down_read(&scx_cgroup_rwsem); if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) && - (tg->scx_flags & SCX_TG_INITED)) + (tg->scx.flags & SCX_TG_INITED)) SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, tg->css.cgroup); - tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); + tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); percpu_up_read(&scx_cgroup_rwsem); } @@ -4211,11 +4211,11 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) percpu_down_read(&scx_cgroup_rwsem); if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && - tg->scx_weight != weight) + tg->scx.weight != weight) SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL, tg_cgrp(tg), weight); - tg->scx_weight = weight; + tg->scx.weight = weight; percpu_up_read(&scx_cgroup_rwsem); } @@ -4366,9 +4366,9 @@ static void scx_cgroup_exit(struct scx_sched *sch) css_for_each_descendant_post(css, &root_task_group.css) { struct task_group *tg = css_tg(css); - if (!(tg->scx_flags & SCX_TG_INITED)) + if (!(tg->scx.flags & SCX_TG_INITED)) continue; - tg->scx_flags &= ~SCX_TG_INITED; + tg->scx.flags &= ~SCX_TG_INITED; if (!sch->ops.cgroup_exit) continue; @@ -4400,14 +4400,14 @@ static int scx_cgroup_init(struct scx_sched *sch) rcu_read_lock(); css_for_each_descendant_pre(css, &root_task_group.css) { struct task_group *tg = css_tg(css); - struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; + struct scx_cgroup_init_args args = { .weight = tg->scx.weight }; - if ((tg->scx_flags & + if ((tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) continue; if (!sch->ops.cgroup_init) { - tg->scx_flags |= SCX_TG_INITED; + tg->scx.flags |= SCX_TG_INITED; continue; } @@ -4422,7 +4422,7 @@ static int scx_cgroup_init(struct scx_sched *sch) scx_error(sch, "ops.cgroup_init() failed (%d)", ret); return ret; } - tg->scx_flags |= SCX_TG_INITED; + tg->scx.flags |= SCX_TG_INITED; rcu_read_lock(); css_put(css); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 105190b18020..fdf5f52b54a3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -471,10 +471,7 @@ struct task_group { struct rt_bandwidth rt_bandwidth; #endif -#ifdef CONFIG_EXT_GROUP_SCHED - u32 scx_flags; /* SCX_TG_* */ - u32 scx_weight; -#endif + struct scx_task_group scx; struct rcu_head rcu; struct list_head list; -- cgit v1.2.3 From ddceadce63d9cb752c2472e220ded05cabaf7971 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Jun 2025 15:34:22 -1000 Subject: sched_ext: Add support for cgroup bandwidth control interface From 077814f57f8acce13f91dc34bbd2b7e4911fbf25 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Jun 2025 15:06:47 -1000 - Add CONFIG_GROUP_SCHED_BANDWIDTH which is selected by both CONFIG_CFS_BANDWIDTH and EXT_GROUP_SCHED. - Put bandwidth control interface files for both cgroup v1 and v2 under CONFIG_GROUP_SCHED_BANDWIDTH. - Update tg_bandwidth() to fetch configuration parameters from fair if CONFIG_CFS_BANDWIDTH, SCX otherwise. - Update tg_set_bandwidth() to update the parameters for both fair and SCX. - Add bandwidth control parameters to struct scx_cgroup_init_args. - Add sched_ext_ops.cgroup_set_bandwidth() which is invoked on bandwidth control parameter updates. - Update scx_qmap and maximal selftest to test the new feature. Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 3 ++ init/Kconfig | 5 ++ kernel/sched/core.c | 29 +++++++++-- kernel/sched/ext.c | 66 +++++++++++++++++++++++-- kernel/sched/ext.h | 2 + kernel/sched/sched.h | 4 +- tools/sched_ext/scx_qmap.bpf.c | 23 +++++++++ tools/testing/selftests/sched_ext/maximal.bpf.c | 5 ++ 8 files changed, 127 insertions(+), 10 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index eda89acdb7ab..8b92842776cb 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -219,6 +219,9 @@ struct scx_task_group { #ifdef CONFIG_EXT_GROUP_SCHED u32 flags; /* SCX_TG_* */ u32 weight; + u64 bw_period_us; + u64 bw_quota_us; + u64 bw_burst_us; #endif }; diff --git a/init/Kconfig b/init/Kconfig index af4c2f085455..baf59d2a20a2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1065,6 +1065,9 @@ if CGROUP_SCHED config GROUP_SCHED_WEIGHT def_bool n +config GROUP_SCHED_BANDWIDTH + def_bool n + config FAIR_GROUP_SCHED bool "Group scheduling for SCHED_OTHER" depends on CGROUP_SCHED @@ -1074,6 +1077,7 @@ config FAIR_GROUP_SCHED config CFS_BANDWIDTH bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED" depends on FAIR_GROUP_SCHED + select GROUP_SCHED_BANDWIDTH default n help This option allows users to define CPU bandwidth rates (limits) for @@ -1108,6 +1112,7 @@ config EXT_GROUP_SCHED bool depends on SCHED_CLASS_EXT && CGROUP_SCHED select GROUP_SCHED_WEIGHT + select GROUP_SCHED_BANDWIDTH default y endif #CGROUP_SCHED diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0e3a00e2a2cc..91845d00a1cd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9545,7 +9545,9 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) return 0; } +#endif /* CONFIG_CFS_BANDWIDTH */ +#ifdef CONFIG_GROUP_SCHED_BANDWIDTH const u64 max_bw_quota_period_us = 1 * USEC_PER_SEC; /* 1s */ static const u64 min_bw_quota_period_us = 1 * USEC_PER_MSEC; /* 1ms */ /* More than 203 days if BW_SHIFT equals 20. */ @@ -9554,12 +9556,21 @@ static const u64 max_bw_runtime_us = MAX_BW; static void tg_bandwidth(struct task_group *tg, u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p) { +#ifdef CONFIG_CFS_BANDWIDTH if (period_us_p) *period_us_p = tg_get_cfs_period(tg); if (quota_us_p) *quota_us_p = tg_get_cfs_quota(tg); if (burst_us_p) *burst_us_p = tg_get_cfs_burst(tg); +#else /* !CONFIG_CFS_BANDWIDTH */ + if (period_us_p) + *period_us_p = tg->scx.bw_period_us; + if (quota_us_p) + *quota_us_p = tg->scx.bw_quota_us; + if (burst_us_p) + *burst_us_p = tg->scx.bw_burst_us; +#endif /* CONFIG_CFS_BANDWIDTH */ } static u64 cpu_period_read_u64(struct cgroup_subsys_state *css, @@ -9575,6 +9586,7 @@ static int tg_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us) { const u64 max_usec = U64_MAX / NSEC_PER_USEC; + int ret = 0; if (tg == &root_task_group) return -EINVAL; @@ -9612,7 +9624,12 @@ static int tg_set_bandwidth(struct task_group *tg, burst_us + quota_us > max_bw_runtime_us)) return -EINVAL; - return tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us); +#ifdef CONFIG_CFS_BANDWIDTH + ret = tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us); +#endif /* CONFIG_CFS_BANDWIDTH */ + if (!ret) + scx_group_set_bandwidth(tg, period_us, quota_us, burst_us); + return ret; } static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css, @@ -9665,7 +9682,7 @@ static int cpu_burst_write_u64(struct cgroup_subsys_state *css, tg_bandwidth(tg, &period_us, "a_us, NULL); return tg_set_bandwidth(tg, period_us, quota_us, burst_us); } -#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */ #ifdef CONFIG_RT_GROUP_SCHED static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, @@ -9725,7 +9742,7 @@ static struct cftype cpu_legacy_files[] = { .write_s64 = cpu_idle_write_s64, }, #endif -#ifdef CONFIG_CFS_BANDWIDTH +#ifdef CONFIG_GROUP_SCHED_BANDWIDTH { .name = "cfs_period_us", .read_u64 = cpu_period_read_u64, @@ -9741,6 +9758,8 @@ static struct cftype cpu_legacy_files[] = { .read_u64 = cpu_burst_read_u64, .write_u64 = cpu_burst_write_u64, }, +#endif +#ifdef CONFIG_CFS_BANDWIDTH { .name = "stat", .seq_show = cpu_cfs_stat_show, @@ -9954,7 +9973,7 @@ static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *period_us_p, return 0; } -#ifdef CONFIG_CFS_BANDWIDTH +#ifdef CONFIG_GROUP_SCHED_BANDWIDTH static int cpu_max_show(struct seq_file *sf, void *v) { struct task_group *tg = css_tg(seq_css(sf)); @@ -10001,7 +10020,7 @@ static struct cftype cpu_files[] = { .write_s64 = cpu_idle_write_s64, }, #endif -#ifdef CONFIG_CFS_BANDWIDTH +#ifdef CONFIG_GROUP_SCHED_BANDWIDTH { .name = "max", .flags = CFTYPE_NOT_ON_ROOT, diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 6732e50e0679..39cba11688a9 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -203,6 +203,11 @@ struct scx_exit_task_args { struct scx_cgroup_init_args { /* the weight of the cgroup [1..10000] */ u32 weight; + + /* bandwidth control parameters from cpu.max and cpu.max.burst */ + u64 bw_period_us; + u64 bw_quota_us; + u64 bw_burst_us; }; enum scx_cpu_preempt_reason { @@ -664,9 +669,31 @@ struct sched_ext_ops { * @cgrp: cgroup whose weight is being updated * @weight: new weight [1..10000] * - * Update @tg's weight to @weight. + * Update @cgrp's weight to @weight. */ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); + + /** + * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed + * @cgrp: cgroup whose bandwidth is being updated + * @period_us: bandwidth control period + * @quota_us: bandwidth control quota + * @burst_us: bandwidth control burst + * + * Update @cgrp's bandwidth control parameters. This is from the cpu.max + * cgroup interface. + * + * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled + * to. For example, if @period_us is 1_000_000 and @quota_us is + * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be + * interpreted in the same fashion and specifies how much @cgrp can + * burst temporarily. The specific control mechanism and thus the + * interpretation of @period_us and burstiness is upto to the BPF + * scheduler. + */ + void (*cgroup_set_bandwidth)(struct cgroup *cgrp, + u64 period_us, u64 quota_us, u64 burst_us); + #endif /* CONFIG_EXT_GROUP_SCHED */ /* @@ -4059,6 +4086,8 @@ static bool scx_cgroup_enabled; void scx_tg_init(struct task_group *tg) { tg->scx.weight = CGROUP_WEIGHT_DFL; + tg->scx.bw_period_us = default_bw_period_us(); + tg->scx.bw_quota_us = RUNTIME_INF; } int scx_tg_online(struct task_group *tg) @@ -4073,7 +4102,10 @@ int scx_tg_online(struct task_group *tg) if (scx_cgroup_enabled) { if (SCX_HAS_OP(sch, cgroup_init)) { struct scx_cgroup_init_args args = - { .weight = tg->scx.weight }; + { .weight = tg->scx.weight, + .bw_period_us = tg->scx.bw_period_us, + .bw_quota_us = tg->scx.bw_quota_us, + .bw_burst_us = tg->scx.bw_burst_us }; ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL, tg->css.cgroup, &args); @@ -4225,6 +4257,27 @@ void scx_group_set_idle(struct task_group *tg, bool idle) /* TODO: Implement ops->cgroup_set_idle() */ } +void scx_group_set_bandwidth(struct task_group *tg, + u64 period_us, u64 quota_us, u64 burst_us) +{ + struct scx_sched *sch = scx_root; + + percpu_down_read(&scx_cgroup_rwsem); + + if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) && + (tg->scx.bw_period_us != period_us || + tg->scx.bw_quota_us != quota_us || + tg->scx.bw_burst_us != burst_us)) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_bandwidth, NULL, + tg_cgrp(tg), period_us, quota_us, burst_us); + + tg->scx.bw_period_us = period_us; + tg->scx.bw_quota_us = quota_us; + tg->scx.bw_burst_us = burst_us; + + percpu_up_read(&scx_cgroup_rwsem); +} + static void scx_cgroup_lock(void) { percpu_down_write(&scx_cgroup_rwsem); @@ -4400,7 +4453,12 @@ static int scx_cgroup_init(struct scx_sched *sch) rcu_read_lock(); css_for_each_descendant_pre(css, &root_task_group.css) { struct task_group *tg = css_tg(css); - struct scx_cgroup_init_args args = { .weight = tg->scx.weight }; + struct scx_cgroup_init_args args = { + .weight = tg->scx.weight, + .bw_period_us = tg->scx.bw_period_us, + .bw_quota_us = tg->scx.bw_quota_us, + .bw_burst_us = tg->scx.bw_burst_us, + }; if ((tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) @@ -5902,6 +5960,7 @@ static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} +static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {} #endif static void sched_ext_ops__cpu_online(s32 cpu) {} static void sched_ext_ops__cpu_offline(s32 cpu) {} @@ -5939,6 +5998,7 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = { .cgroup_move = sched_ext_ops__cgroup_move, .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move, .cgroup_set_weight = sched_ext_ops__cgroup_set_weight, + .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth, #endif .cpu_online = sched_ext_ops__cpu_online, .cpu_offline = sched_ext_ops__cpu_offline, diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index e7bcaa02ea56..292bb41a242e 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -104,6 +104,7 @@ void scx_cgroup_finish_attach(void); void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); void scx_group_set_idle(struct task_group *tg, bool idle); +void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us); #else /* CONFIG_EXT_GROUP_SCHED */ static inline void scx_tg_init(struct task_group *tg) {} static inline int scx_tg_online(struct task_group *tg) { return 0; } @@ -114,5 +115,6 @@ static inline void scx_cgroup_finish_attach(void) {} static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} static inline void scx_group_set_idle(struct task_group *tg, bool idle) {} +static inline void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us) {} #endif /* CONFIG_EXT_GROUP_SCHED */ #endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fdf5f52b54a3..06767a210717 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -402,7 +402,7 @@ static inline bool dl_server_active(struct sched_dl_entity *dl_se) extern struct list_head task_groups; -#ifdef CONFIG_CFS_BANDWIDTH +#ifdef CONFIG_GROUP_SCHED_BANDWIDTH extern const u64 max_bw_quota_period_us; /* @@ -413,7 +413,7 @@ static inline u64 default_bw_period_us(void) { return 100000ULL; } -#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */ struct cfs_bandwidth { #ifdef CONFIG_CFS_BANDWIDTH diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index c3cd9a17d48e..69d877501cb7 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -615,6 +615,26 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc taskc->force_local, taskc->core_sched_seq); } +s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args) +{ + bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu", + cgrp->kn->id, args->weight, args->bw_period_us, + args->bw_quota_us, args->bw_burst_us); + return 0; +} + +void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight) +{ + bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight); +} + +void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp, + u64 period_us, u64 quota_us, u64 burst_us) +{ + bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", cgrp->kn->id, + period_us, quota_us, burst_us); +} + /* * Print out the online and possible CPU map using bpf_printk() as a * demonstration of using the cpumask kfuncs and ops.cpu_on/offline(). @@ -840,6 +860,9 @@ SCX_OPS_DEFINE(qmap_ops, .dump = (void *)qmap_dump, .dump_cpu = (void *)qmap_dump_cpu, .dump_task = (void *)qmap_dump_task, + .cgroup_init = (void *)qmap_cgroup_init, + .cgroup_set_weight = (void *)qmap_cgroup_set_weight, + .cgroup_set_bandwidth = (void *)qmap_cgroup_set_bandwidth, .cpu_online = (void *)qmap_cpu_online, .cpu_offline = (void *)qmap_cpu_offline, .init = (void *)qmap_init, diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c index 430f5e13bf55..01cf4f3da4e0 100644 --- a/tools/testing/selftests/sched_ext/maximal.bpf.c +++ b/tools/testing/selftests/sched_ext/maximal.bpf.c @@ -123,6 +123,10 @@ void BPF_STRUCT_OPS(maximal_cgroup_cancel_move, struct task_struct *p, void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight) {} +void BPF_STRUCT_OPS(maximal_cgroup_set_bandwidth, struct cgroup *cgrp, + u64 period_us, u64 quota_us, u64 burst_us) +{} + s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init) { return scx_bpf_create_dsq(DSQ_ID, -1); @@ -160,6 +164,7 @@ struct sched_ext_ops maximal_ops = { .cgroup_move = (void *) maximal_cgroup_move, .cgroup_cancel_move = (void *) maximal_cgroup_cancel_move, .cgroup_set_weight = (void *) maximal_cgroup_set_weight, + .cgroup_set_bandwidth = (void *) maximal_cgroup_set_bandwidth, .init = (void *) maximal_init, .exit = (void *) maximal_exit, .name = "maximal", -- cgit v1.2.3 From cb444006a625c60e6d4dd3753863c3c74f96aac3 Mon Sep 17 00:00:00 2001 From: David Dai Date: Tue, 24 Jun 2025 15:49:06 -0700 Subject: sched_ext, rcu: Eject BPF scheduler on RCU CPU stall panic For systems using a sched_ext scheduler and has panic_on_rcu_stall enabled, try kicking out the current scheduler before issuing a panic. While there are numerous reasons for RCU CPU stalls that are not directly attributed to the scheduler, deferring the panic gives sched_ext an opportunity to provide additional debug info when ejecting the current scheduler. Also, handling the event more gracefully allows us to potentially recover the system instead of incurring additional down time. Suggested-by: Tejun Heo Reviewed-by: Paul E. McKenney Signed-off-by: David Dai Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 2 ++ kernel/rcu/tree_stall.h | 7 +++++++ kernel/sched/ext.c | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+) (limited to 'include/linux/sched') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 8b92842776cb..0cf0915572c9 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -206,12 +206,14 @@ struct sched_ext_entity { void sched_ext_free(struct task_struct *p); void print_scx_info(const char *log_lvl, struct task_struct *p); void scx_softlockup(u32 dur_s); +bool scx_rcu_cpu_stall(void); #else /* !CONFIG_SCHED_CLASS_EXT */ static inline void sched_ext_free(struct task_struct *p) {} static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} static inline void scx_softlockup(u32 dur_s) {} +static inline bool scx_rcu_cpu_stall(void) { return false; } #endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 486c00536207..af61b2d0d311 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -134,6 +134,13 @@ static void panic_on_rcu_stall(void) { static int cpu_stall; + /* + * Attempt to kick out the BPF scheduler if it's installed and defer + * the panic to give the system a chance to recover. + */ + if (scx_rcu_cpu_stall()) + return; + if (++cpu_stall < sysctl_max_rcu_stall_to_panic) return; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index bee98fdcdd01..df5b2c952cf7 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4672,6 +4672,41 @@ bool scx_allow_ttwu_queue(const struct task_struct *p) p->sched_class != &ext_sched_class; } +/** + * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler + * + * While there are various reasons why RCU CPU stalls can occur on a system + * that may not be caused by the current BPF scheduler, try kicking out the + * current scheduler in an attempt to recover the system to a good state before + * issuing panics. + */ +bool scx_rcu_cpu_stall(void) +{ + struct scx_sched *sch; + + rcu_read_lock(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) { + rcu_read_unlock(); + return false; + } + + switch (scx_enable_state()) { + case SCX_ENABLING: + case SCX_ENABLED: + break; + default: + rcu_read_unlock(); + return false; + } + + scx_error(sch, "RCU CPU stall detected!"); + rcu_read_unlock(); + + return true; +} + /** * scx_softlockup - sched_ext softlockup handler * @dur_s: number of seconds of CPU stuck due to soft lockup -- cgit v1.2.3 From 4ecf83741401c70d4420588ee1f3b1ca04ef58d5 Mon Sep 17 00:00:00 2001 From: Jake Hillion Date: Wed, 25 Jun 2025 18:05:46 +0100 Subject: sched_ext: Drop kfuncs marked for removal in 6.15 sched_ext performed a kfunc renaming pass in 6.13 and kept the old names around for compatibility with old binaries. These were scheduled for cleanup in 6.15 but were missed. Submitting for cleanup in for-next. Removed the kfuncs, their flags, and any references I could find to them in doc comments. Left the entries in include/scx/compat.bpf.h as they're still useful to make new binaries compatible with old kernels. Tested by applying to my kernel. It builds and a modern version of scx_lavd loads fine. Signed-off-by: Jake Hillion Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 10 +++---- kernel/sched/ext.c | 71 ++--------------------------------------------- 2 files changed, 7 insertions(+), 74 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 0cf0915572c9..7047101dbf58 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -164,7 +164,7 @@ struct sched_ext_entity { /* * Runtime budget in nsecs. This is usually set through - * scx_bpf_dispatch() but can also be modified directly by the BPF + * scx_bpf_dsq_insert() but can also be modified directly by the BPF * scheduler. Automatically decreased by SCX as the task executes. On * depletion, a scheduling event is triggered. * @@ -176,10 +176,10 @@ struct sched_ext_entity { /* * Used to order tasks when dispatching to the vtime-ordered priority - * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime() - * but can also be modified directly by the BPF scheduler. Modifying it - * while a task is queued on a dsq may mangle the ordering and is not - * recommended. + * queue of a dsq. This is usually set through + * scx_bpf_dsq_insert_vtime() but can also be modified directly by the + * BPF scheduler. Modifying it while a task is queued on a dsq may + * mangle the ordering and is not recommended. */ u64 dsq_vtime; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index df5b2c952cf7..512474eabea6 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6391,7 +6391,8 @@ __bpf_kfunc_start_defs(); * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id * and this function can be called upto ops.dispatch_max_batch times to insert * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the - * remaining slots. scx_bpf_consume() flushes the batch and resets the counter. + * remaining slots. scx_bpf_dsq_move_to_local() flushes the batch and resets the + * counter. * * This function doesn't have any locking restrictions and may be called under * BPF locks (in the future when BPF introduces more flexible locking). @@ -6415,14 +6416,6 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice scx_dsq_insert_commit(p, dsq_id, enq_flags); } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, - u64 enq_flags) -{ - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()"); - scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags); -} - /** * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ * @p: task_struct to insert @@ -6460,21 +6453,11 @@ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, - u64 slice, u64 vtime, u64 enq_flags) -{ - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()"); - scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags); -} - __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { @@ -6647,13 +6630,6 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) } } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc bool scx_bpf_consume(u64 dsq_id) -{ - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()"); - return scx_bpf_dsq_move_to_local(dsq_id); -} - /** * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs * @it__iter: DSQ iterator in progress @@ -6672,14 +6648,6 @@ __bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE; } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice( - struct bpf_iter_scx_dsq *it__iter, u64 slice) -{ - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()"); - scx_bpf_dsq_move_set_slice(it__iter, slice); -} - /** * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs * @it__iter: DSQ iterator in progress @@ -6699,14 +6667,6 @@ __bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME; } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime( - struct bpf_iter_scx_dsq *it__iter, u64 vtime) -{ - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()"); - scx_bpf_dsq_move_set_vtime(it__iter, vtime); -} - /** * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ * @it__iter: DSQ iterator in progress @@ -6739,15 +6699,6 @@ __bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, p, dsq_id, enq_flags); } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, - struct task_struct *p, u64 dsq_id, - u64 enq_flags) -{ - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()"); - return scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags); -} - /** * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ * @it__iter: DSQ iterator in progress @@ -6773,30 +6724,16 @@ __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, - struct task_struct *p, u64 dsq_id, - u64 enq_flags) -{ - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_vtime() renamed to scx_bpf_dsq_move_vtime()"); - return scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags); -} - __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_dispatch) BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local) -BTF_ID_FLAGS(func, scx_bpf_consume) BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice) BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime) BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice) -BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime) -BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_dispatch) static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { @@ -6927,10 +6864,6 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice) BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime) BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice) -BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime) -BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_unlocked) static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { -- cgit v1.2.3 From 56180dd20c19e5b0fa34822997a9ac66b517e7b3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Jul 2025 13:00:07 +0200 Subject: futex: Use RCU-based per-CPU reference counting instead of rcuref_t The use of rcuref_t for reference counting introduces a performance bottleneck when accessed concurrently by multiple threads during futex operations. Replace rcuref_t with special crafted per-CPU reference counters. The lifetime logic remains the same. The newly allocate private hash starts in FR_PERCPU state. In this state, each futex operation that requires the private hash uses a per-CPU counter (an unsigned int) for incrementing or decrementing the reference count. When the private hash is about to be replaced, the per-CPU counters are migrated to a atomic_t counter mm_struct::futex_atomic. The migration process: - Waiting for one RCU grace period to ensure all users observe the current private hash. This can be skipped if a grace period elapsed since the private hash was assigned. - futex_private_hash::state is set to FR_ATOMIC, forcing all users to use mm_struct::futex_atomic for reference counting. - After a RCU grace period, all users are guaranteed to be using the atomic counter. The per-CPU counters can now be summed up and added to the atomic_t counter. If the resulting count is zero, the hash can be safely replaced. Otherwise, active users still hold a valid reference. - Once the atomic reference count drops to zero, the next futex operation will switch to the new private hash. call_rcu_hurry() is used to speed up transition which otherwise might be delay with RCU_LAZY. There is nothing wrong with using call_rcu(). The side effects would be that on auto scaling the new hash is used later and the SET_SLOTS prctl() will block longer. [bigeasy: commit description + mm get/ put_async] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250710110011.384614-3-bigeasy@linutronix.de --- include/linux/futex.h | 16 +--- include/linux/mm_types.h | 5 + include/linux/sched/mm.h | 2 +- init/Kconfig | 4 - kernel/fork.c | 8 +- kernel/futex/core.c | 243 +++++++++++++++++++++++++++++++++++++++++++---- 6 files changed, 243 insertions(+), 35 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/futex.h b/include/linux/futex.h index b37193653e6b..9e9750f04980 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -85,18 +85,12 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) #ifdef CONFIG_FUTEX_PRIVATE_HASH int futex_hash_allocate_default(void); void futex_hash_free(struct mm_struct *mm); - -static inline void futex_mm_init(struct mm_struct *mm) -{ - RCU_INIT_POINTER(mm->futex_phash, NULL); - mm->futex_phash_new = NULL; - mutex_init(&mm->futex_hash_lock); -} +int futex_mm_init(struct mm_struct *mm); #else /* !CONFIG_FUTEX_PRIVATE_HASH */ static inline int futex_hash_allocate_default(void) { return 0; } -static inline void futex_hash_free(struct mm_struct *mm) { } -static inline void futex_mm_init(struct mm_struct *mm) { } +static inline int futex_hash_free(struct mm_struct *mm) { return 0; } +static inline int futex_mm_init(struct mm_struct *mm) { return 0; } #endif /* CONFIG_FUTEX_PRIVATE_HASH */ #else /* !CONFIG_FUTEX */ @@ -118,8 +112,8 @@ static inline int futex_hash_allocate_default(void) { return 0; } -static inline void futex_hash_free(struct mm_struct *mm) { } -static inline void futex_mm_init(struct mm_struct *mm) { } +static inline int futex_hash_free(struct mm_struct *mm) { return 0; } +static inline int futex_mm_init(struct mm_struct *mm) { return 0; } #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d6b91e8a66d6..0f0662157066 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1070,6 +1070,11 @@ struct mm_struct { struct mutex futex_hash_lock; struct futex_private_hash __rcu *futex_phash; struct futex_private_hash *futex_phash_new; + /* futex-ref */ + unsigned long futex_batches; + struct rcu_head futex_rcu; + atomic_long_t futex_atomic; + unsigned int __percpu *futex_ref; #endif unsigned long hiwater_rss; /* High-watermark of RSS usage */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index b13474825130..2201da0afecc 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -140,7 +140,7 @@ static inline bool mmget_not_zero(struct mm_struct *mm) /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); -#ifdef CONFIG_MMU +#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH) /* same as above but performs the slow path from the async context. Can * be called from the atomic context as well */ diff --git a/init/Kconfig b/init/Kconfig index 666783eb50ab..af4c2f085455 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1716,13 +1716,9 @@ config FUTEX_PI depends on FUTEX && RT_MUTEXES default y -# -# marked broken for performance reasons; gives us one more cycle to sort things out. -# config FUTEX_PRIVATE_HASH bool depends on FUTEX && !BASE_SMALL && MMU - depends on BROKEN default y config FUTEX_MPOL diff --git a/kernel/fork.c b/kernel/fork.c index 1ee8eb11f38b..0b885dcbde9a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1046,7 +1046,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_subscriptions_init(mm); init_tlb_flush_pending(mm); - futex_mm_init(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) mm->pmd_huge_pte = NULL; #endif @@ -1061,6 +1060,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->def_flags = 0; } + if (futex_mm_init(mm)) + goto fail_mm_init; + if (mm_alloc_pgd(mm)) goto fail_nopgd; @@ -1090,6 +1092,8 @@ fail_nocontext: fail_noid: mm_free_pgd(mm); fail_nopgd: + futex_hash_free(mm); +fail_mm_init: free_mm(mm); return NULL; } @@ -1145,7 +1149,7 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); -#ifdef CONFIG_MMU +#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH) static void mmput_async_fn(struct work_struct *work) { struct mm_struct *mm = container_of(work, struct mm_struct, diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 90d53fb0ee9e..1dcb4c8a2585 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -42,7 +42,6 @@ #include #include #include -#include #include #include @@ -65,7 +64,7 @@ static struct { #define futex_queues (__futex_data.queues) struct futex_private_hash { - rcuref_t users; + int state; unsigned int hash_mask; struct rcu_head rcu; void *mm; @@ -129,6 +128,12 @@ static struct futex_hash_bucket * __futex_hash(union futex_key *key, struct futex_private_hash *fph); #ifdef CONFIG_FUTEX_PRIVATE_HASH +static bool futex_ref_get(struct futex_private_hash *fph); +static bool futex_ref_put(struct futex_private_hash *fph); +static bool futex_ref_is_dead(struct futex_private_hash *fph); + +enum { FR_PERCPU = 0, FR_ATOMIC }; + static inline bool futex_key_is_private(union futex_key *key) { /* @@ -142,15 +147,14 @@ bool futex_private_hash_get(struct futex_private_hash *fph) { if (fph->immutable) return true; - return rcuref_get(&fph->users); + return futex_ref_get(fph); } void futex_private_hash_put(struct futex_private_hash *fph) { - /* Ignore return value, last put is verified via rcuref_is_dead() */ if (fph->immutable) return; - if (rcuref_put(&fph->users)) + if (futex_ref_put(fph)) wake_up_var(fph->mm); } @@ -243,14 +247,18 @@ static bool __futex_pivot_hash(struct mm_struct *mm, fph = rcu_dereference_protected(mm->futex_phash, lockdep_is_held(&mm->futex_hash_lock)); if (fph) { - if (!rcuref_is_dead(&fph->users)) { + if (!futex_ref_is_dead(fph)) { mm->futex_phash_new = new; return false; } futex_rehash_private(fph, new); } - rcu_assign_pointer(mm->futex_phash, new); + new->state = FR_PERCPU; + scoped_guard(rcu) { + mm->futex_batches = get_state_synchronize_rcu(); + rcu_assign_pointer(mm->futex_phash, new); + } kvfree_rcu(fph, rcu); return true; } @@ -289,9 +297,7 @@ again: if (!fph) return NULL; - if (fph->immutable) - return fph; - if (rcuref_get(&fph->users)) + if (futex_private_hash_get(fph)) return fph; } futex_pivot_hash(mm); @@ -1527,16 +1533,219 @@ static void futex_hash_bucket_init(struct futex_hash_bucket *fhb, #define FH_IMMUTABLE 0x02 #ifdef CONFIG_FUTEX_PRIVATE_HASH + +/* + * futex-ref + * + * Heavily inspired by percpu-rwsem/percpu-refcount; not reusing any of that + * code because it just doesn't fit right. + * + * Dual counter, per-cpu / atomic approach like percpu-refcount, except it + * re-initializes the state automatically, such that the fph swizzle is also a + * transition back to per-cpu. + */ + +static void futex_ref_rcu(struct rcu_head *head); + +static void __futex_ref_atomic_begin(struct futex_private_hash *fph) +{ + struct mm_struct *mm = fph->mm; + + /* + * The counter we're about to switch to must have fully switched; + * otherwise it would be impossible for it to have reported success + * from futex_ref_is_dead(). + */ + WARN_ON_ONCE(atomic_long_read(&mm->futex_atomic) != 0); + + /* + * Set the atomic to the bias value such that futex_ref_{get,put}() + * will never observe 0. Will be fixed up in __futex_ref_atomic_end() + * when folding in the percpu count. + */ + atomic_long_set(&mm->futex_atomic, LONG_MAX); + smp_store_release(&fph->state, FR_ATOMIC); + + call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu); +} + +static void __futex_ref_atomic_end(struct futex_private_hash *fph) +{ + struct mm_struct *mm = fph->mm; + unsigned int count = 0; + long ret; + int cpu; + + /* + * Per __futex_ref_atomic_begin() the state of the fph must be ATOMIC + * and per this RCU callback, everybody must now observe this state and + * use the atomic variable. + */ + WARN_ON_ONCE(fph->state != FR_ATOMIC); + + /* + * Therefore the per-cpu counter is now stable, sum and reset. + */ + for_each_possible_cpu(cpu) { + unsigned int *ptr = per_cpu_ptr(mm->futex_ref, cpu); + count += *ptr; + *ptr = 0; + } + + /* + * Re-init for the next cycle. + */ + this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */ + + /* + * Add actual count, subtract bias and initial refcount. + * + * The moment this atomic operation happens, futex_ref_is_dead() can + * become true. + */ + ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex_atomic); + if (!ret) + wake_up_var(mm); + + WARN_ON_ONCE(ret < 0); + mmput_async(mm); +} + +static void futex_ref_rcu(struct rcu_head *head) +{ + struct mm_struct *mm = container_of(head, struct mm_struct, futex_rcu); + struct futex_private_hash *fph = rcu_dereference_raw(mm->futex_phash); + + if (fph->state == FR_PERCPU) { + /* + * Per this extra grace-period, everybody must now observe + * fph as the current fph and no previously observed fph's + * are in-flight. + * + * Notably, nobody will now rely on the atomic + * futex_ref_is_dead() state anymore so we can begin the + * migration of the per-cpu counter into the atomic. + */ + __futex_ref_atomic_begin(fph); + return; + } + + __futex_ref_atomic_end(fph); +} + +/* + * Drop the initial refcount and transition to atomics. + */ +static void futex_ref_drop(struct futex_private_hash *fph) +{ + struct mm_struct *mm = fph->mm; + + /* + * Can only transition the current fph; + */ + WARN_ON_ONCE(rcu_dereference_raw(mm->futex_phash) != fph); + /* + * We enqueue at least one RCU callback. Ensure mm stays if the task + * exits before the transition is completed. + */ + mmget(mm); + + /* + * In order to avoid the following scenario: + * + * futex_hash() __futex_pivot_hash() + * guard(rcu); guard(mm->futex_hash_lock); + * fph = mm->futex_phash; + * rcu_assign_pointer(&mm->futex_phash, new); + * futex_hash_allocate() + * futex_ref_drop() + * fph->state = FR_ATOMIC; + * atomic_set(, BIAS); + * + * futex_private_hash_get(fph); // OOPS + * + * Where an old fph (which is FR_ATOMIC) and should fail on + * inc_not_zero, will succeed because a new transition is started and + * the atomic is bias'ed away from 0. + * + * There must be at least one full grace-period between publishing a + * new fph and trying to replace it. + */ + if (poll_state_synchronize_rcu(mm->futex_batches)) { + /* + * There was a grace-period, we can begin now. + */ + __futex_ref_atomic_begin(fph); + return; + } + + call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu); +} + +static bool futex_ref_get(struct futex_private_hash *fph) +{ + struct mm_struct *mm = fph->mm; + + guard(rcu)(); + + if (smp_load_acquire(&fph->state) == FR_PERCPU) { + this_cpu_inc(*mm->futex_ref); + return true; + } + + return atomic_long_inc_not_zero(&mm->futex_atomic); +} + +static bool futex_ref_put(struct futex_private_hash *fph) +{ + struct mm_struct *mm = fph->mm; + + guard(rcu)(); + + if (smp_load_acquire(&fph->state) == FR_PERCPU) { + this_cpu_dec(*mm->futex_ref); + return false; + } + + return atomic_long_dec_and_test(&mm->futex_atomic); +} + +static bool futex_ref_is_dead(struct futex_private_hash *fph) +{ + struct mm_struct *mm = fph->mm; + + guard(rcu)(); + + if (smp_load_acquire(&fph->state) == FR_PERCPU) + return false; + + return atomic_long_read(&mm->futex_atomic) == 0; +} + +int futex_mm_init(struct mm_struct *mm) +{ + mutex_init(&mm->futex_hash_lock); + RCU_INIT_POINTER(mm->futex_phash, NULL); + mm->futex_phash_new = NULL; + /* futex-ref */ + atomic_long_set(&mm->futex_atomic, 0); + mm->futex_batches = get_state_synchronize_rcu(); + mm->futex_ref = alloc_percpu(unsigned int); + if (!mm->futex_ref) + return -ENOMEM; + this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */ + return 0; +} + void futex_hash_free(struct mm_struct *mm) { struct futex_private_hash *fph; + free_percpu(mm->futex_ref); kvfree(mm->futex_phash_new); fph = rcu_dereference_raw(mm->futex_phash); - if (fph) { - WARN_ON_ONCE(rcuref_read(&fph->users) > 1); + if (fph) kvfree(fph); - } } static bool futex_pivot_pending(struct mm_struct *mm) @@ -1549,7 +1758,7 @@ static bool futex_pivot_pending(struct mm_struct *mm) return true; fph = rcu_dereference(mm->futex_phash); - return rcuref_is_dead(&fph->users); + return futex_ref_is_dead(fph); } static bool futex_hash_less(struct futex_private_hash *a, @@ -1598,11 +1807,11 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) } } - fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); + fph = kvzalloc(struct_size(fph, queues, hash_slots), + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!fph) return -ENOMEM; - rcuref_init(&fph->users, 1); fph->hash_mask = hash_slots ? hash_slots - 1 : 0; fph->custom = custom; fph->immutable = !!(flags & FH_IMMUTABLE); @@ -1645,7 +1854,7 @@ again: * allocated a replacement hash, drop the initial * reference on the existing hash. */ - futex_private_hash_put(cur); + futex_ref_drop(cur); } if (new) { -- cgit v1.2.3 From e075f4360931263f5ec006ea5dadc065e5e98eb8 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Thu, 10 Jul 2025 18:57:07 +0800 Subject: smpboot: introduce SDTL_INIT() helper to tidy sched topology setup Define a small SDTL_INIT(maskfn, flagsfn, name) macro and use it to build the sched_domain_topology_level array. Purely a cleanup; behaviour is unchanged. Suggested-by: Thomas Gleixner Signed-off-by: Li Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250710105715.66594-2-me@linux.beauty --- arch/powerpc/kernel/smp.c | 25 ++++++++++--------------- arch/s390/kernel/topology.c | 10 +++++----- arch/x86/kernel/smpboot.c | 21 ++++++--------------- include/linux/sched/topology.h | 4 ++-- kernel/sched/topology.c | 24 ++++++++---------------- 5 files changed, 31 insertions(+), 53 deletions(-) (limited to 'include/linux/sched') diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 5ac7084eebc0..f59e4b9cc207 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1700,28 +1700,23 @@ static void __init build_sched_topology(void) #ifdef CONFIG_SCHED_SMT if (has_big_cores) { pr_info("Big cores detected but using small core scheduling\n"); - powerpc_topology[i++] = (struct sched_domain_topology_level){ - smallcore_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) - }; + powerpc_topology[i++] = + SDTL_INIT(smallcore_smt_mask, powerpc_smt_flags, SMT); } else { - powerpc_topology[i++] = (struct sched_domain_topology_level){ - cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) - }; + powerpc_topology[i++] = SDTL_INIT(cpu_smt_mask, powerpc_smt_flags, SMT); } #endif if (shared_caches) { - powerpc_topology[i++] = (struct sched_domain_topology_level){ - shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) - }; + powerpc_topology[i++] = + SDTL_INIT(shared_cache_mask, powerpc_shared_cache_flags, CACHE); } + if (has_coregroup_support()) { - powerpc_topology[i++] = (struct sched_domain_topology_level){ - cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC) - }; + powerpc_topology[i++] = + SDTL_INIT(cpu_mc_mask, powerpc_shared_proc_flags, MC); } - powerpc_topology[i++] = (struct sched_domain_topology_level){ - cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG) - }; + + powerpc_topology[i++] = SDTL_INIT(cpu_cpu_mask, powerpc_shared_proc_flags, PKG); /* There must be one trailing NULL entry left. */ BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 3df048e190b1..46569b8e47dd 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -531,11 +531,11 @@ static const struct cpumask *cpu_drawer_mask(int cpu) } static struct sched_domain_topology_level s390_topology[] = { - { cpu_thread_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, - { cpu_book_mask, SD_INIT_NAME(BOOK) }, - { cpu_drawer_mask, SD_INIT_NAME(DRAWER) }, - { cpu_cpu_mask, SD_INIT_NAME(PKG) }, + SDTL_INIT(cpu_thread_mask, cpu_smt_flags, SMT), + SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), + SDTL_INIT(cpu_book_mask, NULL, BOOK), + SDTL_INIT(cpu_drawer_mask, NULL, DRAWER), + SDTL_INIT(cpu_cpu_mask, NULL, PKG), { NULL, }, }; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fc78c2325fd2..e0adf75f617a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -485,35 +485,26 @@ static void __init build_sched_topology(void) int i = 0; #ifdef CONFIG_SCHED_SMT - x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) - }; + x86_topology[i++] = SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT); #endif #ifdef CONFIG_SCHED_CLUSTER - x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) - }; + x86_topology[i++] = SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS); #endif #ifdef CONFIG_SCHED_MC - x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) - }; + x86_topology[i++] = SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC); #endif /* * When there is NUMA topology inside the package skip the PKG domain * since the NUMA domains will auto-magically create the right spanning * domains based on the SLIT. */ - if (!x86_has_numa_in_package) { - x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(PKG) - }; - } + if (!x86_has_numa_in_package) + x86_topology[i++] = SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG); /* * There must be one trailing NULL entry left. */ - BUG_ON(i >= ARRAY_SIZE(x86_topology)-1); + BUG_ON(i >= ARRAY_SIZE(x86_topology) - 1); set_sched_topology(x86_topology); } diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index e54e7fa76ba6..0d5daaa277b7 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -196,8 +196,8 @@ struct sched_domain_topology_level { extern void __init set_sched_topology(struct sched_domain_topology_level *tl); extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio); - -# define SD_INIT_NAME(type) .name = #type +#define SDTL_INIT(maskfn, flagsfn, dname) ((struct sched_domain_topology_level) \ + { .mask = maskfn, .sd_flags = flagsfn, .name = #dname }) #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) extern void rebuild_sched_domains_energy(void); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8e06b1d22e91..d01f5a49f2e7 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1737,17 +1737,17 @@ sd_init(struct sched_domain_topology_level *tl, */ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, + SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), #endif #ifdef CONFIG_SCHED_CLUSTER - { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) }, + SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS), #endif #ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), #endif - { cpu_cpu_mask, SD_INIT_NAME(PKG) }, + SDTL_INIT(cpu_cpu_mask, NULL, PKG), { NULL, }, }; @@ -2008,23 +2008,15 @@ void sched_init_numa(int offline_node) /* * Add the NUMA identity distance, aka single NODE. */ - tl[i++] = (struct sched_domain_topology_level){ - .mask = sd_numa_mask, - .numa_level = 0, - SD_INIT_NAME(NODE) - }; + tl[i++] = SDTL_INIT(sd_numa_mask, NULL, NODE); /* * .. and append 'j' levels of NUMA goodness. */ for (j = 1; j < nr_levels; i++, j++) { - tl[i] = (struct sched_domain_topology_level){ - .mask = sd_numa_mask, - .sd_flags = cpu_numa_flags, - .flags = SDTL_OVERLAP, - .numa_level = j, - SD_INIT_NAME(NUMA) - }; + tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA); + tl[i].numa_level = j; + tl[i].flags = SDTL_OVERLAP; } sched_domain_topology_saved = sched_domain_topology; -- cgit v1.2.3 From 1eec89a671413ce38df9fe9e70f5130a9eb79a59 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Fri, 11 Jul 2025 11:20:30 +0530 Subject: sched/topology: Remove sched_domain_topology_level::flags Support for overlapping domains added in commit e3589f6c81e4 ("sched: Allow for overlapping sched_domain spans") also allowed forcefully setting SD_OVERLAP for !NUMA domains via FORCE_SD_OVERLAP sched_feat(). Since NUMA domains had to be presumed overlapping to ensure correct behavior, "sched_domain_topology_level::flags" was introduced. NUMA domains added the SDTL_OVERLAP flag would ensure SD_OVERLAP was always added during build_sched_domains() for these domains, even when FORCE_SD_OVERLAP was off. Condition for adding the SD_OVERLAP flag at the aforementioned commit was as follows: if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) sd->flags |= SD_OVERLAP; The FORCE_SD_OVERLAP debug feature was removed in commit af85596c74de ("sched/topology: Remove FORCE_SD_OVERLAP") which left the NUMA domains as the exclusive users of SDTL_OVERLAP, SD_OVERLAP, and SD_NUMA flags. Get rid of SDTL_OVERLAP and SD_OVERLAP as they have become redundant and instead rely on SD_NUMA to detect the only overlapping domain currently supported. Since SDTL_OVERLAP was the only user of "tl->flags", get rid of "sched_domain_topology_level::flags" too. Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/ba4dbdf8-bc37-493d-b2e0-2efb00ea3e19@amd.com --- include/linux/sched/sd_flags.h | 8 -------- include/linux/sched/topology.h | 3 --- kernel/sched/fair.c | 6 +++--- kernel/sched/topology.c | 19 ++++++++++--------- 4 files changed, 13 insertions(+), 23 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h index b04a5d04dee9..42839cfa2778 100644 --- a/include/linux/sched/sd_flags.h +++ b/include/linux/sched/sd_flags.h @@ -153,14 +153,6 @@ SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS) */ SD_FLAG(SD_PREFER_SIBLING, SDF_NEEDS_GROUPS) -/* - * sched_groups of this level overlap - * - * SHARED_PARENT: Set for all NUMA levels above NODE. - * NEEDS_GROUPS: Overlaps can only exist with more than one group. - */ -SD_FLAG(SD_OVERLAP, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) - /* * Cross-node balancing * diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 0d5daaa277b7..5263746b63e8 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -175,8 +175,6 @@ bool cpus_share_resources(int this_cpu, int that_cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); typedef int (*sched_domain_flags_f)(void); -#define SDTL_OVERLAP 0x01 - struct sd_data { struct sched_domain *__percpu *sd; struct sched_domain_shared *__percpu *sds; @@ -187,7 +185,6 @@ struct sd_data { struct sched_domain_topology_level { sched_domain_mask_f mask; sched_domain_flags_f sd_flags; - int flags; int numa_level; struct sd_data data; char *name; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 20a845697c1d..b9b4bbbf0af6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9926,9 +9926,9 @@ void update_group_capacity(struct sched_domain *sd, int cpu) min_capacity = ULONG_MAX; max_capacity = 0; - if (child->flags & SD_OVERLAP) { + if (child->flags & SD_NUMA) { /* - * SD_OVERLAP domains cannot assume that child groups + * SD_NUMA domains cannot assume that child groups * span the current group. */ @@ -9941,7 +9941,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } } else { /* - * !SD_OVERLAP domains can assume that child groups + * !SD_NUMA domains can assume that child groups * span the current group. */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index d01f5a49f2e7..977e133bb8a4 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -89,7 +89,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!(sd->flags & SD_OVERLAP) && + if (!(sd->flags & SD_NUMA) && cpumask_intersects(groupmask, sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); @@ -102,7 +102,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, group->sgc->id, cpumask_pr_args(sched_group_span(group))); - if ((sd->flags & SD_OVERLAP) && + if ((sd->flags & SD_NUMA) && !cpumask_equal(group_balance_mask(group), sched_group_span(group))) { printk(KERN_CONT " mask=%*pbl", cpumask_pr_args(group_balance_mask(group))); @@ -1344,7 +1344,7 @@ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) * "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu" * which is shared by all the overlapping groups. */ - WARN_ON_ONCE(sd->flags & SD_OVERLAP); + WARN_ON_ONCE(sd->flags & SD_NUMA); sg = sd->groups; if (cpu != sg->asym_prefer_cpu) { @@ -2016,7 +2016,6 @@ void sched_init_numa(int offline_node) for (j = 1; j < nr_levels; i++, j++) { tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA); tl[i].numa_level = j; - tl[i].flags = SDTL_OVERLAP; } sched_domain_topology_saved = sched_domain_topology; @@ -2327,7 +2326,7 @@ static void __sdt_free(const struct cpumask *cpu_map) if (sdd->sd) { sd = *per_cpu_ptr(sdd->sd, j); - if (sd && (sd->flags & SD_OVERLAP)) + if (sd && (sd->flags & SD_NUMA)) free_sched_groups(sd->groups, 0); kfree(*per_cpu_ptr(sdd->sd, j)); } @@ -2393,9 +2392,13 @@ static bool topology_span_sane(const struct cpumask *cpu_map) id_seen = sched_domains_tmpmask2; for_each_sd_topology(tl) { + int tl_common_flags = 0; + + if (tl->sd_flags) + tl_common_flags = (*tl->sd_flags)(); /* NUMA levels are allowed to overlap */ - if (tl->flags & SDTL_OVERLAP) + if (tl_common_flags & SD_NUMA) continue; cpumask_clear(covered); @@ -2466,8 +2469,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; - if (tl->flags & SDTL_OVERLAP) - sd->flags |= SD_OVERLAP; if (cpumask_equal(cpu_map, sched_domain_span(sd))) break; } @@ -2480,7 +2481,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att for_each_cpu(i, cpu_map) { for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { sd->span_weight = cpumask_weight(sched_domain_span(sd)); - if (sd->flags & SD_OVERLAP) { + if (sd->flags & SD_NUMA) { if (build_overlap_sched_groups(sd, i)) goto error; } else { -- cgit v1.2.3 From 8671bad873ebeb082afcf7b4501395c374da6023 Mon Sep 17 00:00:00 2001 From: "Luis Claudio R. Goncalves" Date: Mon, 7 Jul 2025 11:03:59 -0300 Subject: sched: Do not call __put_task_struct() on rt if pi_blocked_on is set With PREEMPT_RT enabled, some of the calls to put_task_struct() coming from rt_mutex_adjust_prio_chain() could happen in preemptible context and with a mutex enqueued. That could lead to this sequence: rt_mutex_adjust_prio_chain() put_task_struct() __put_task_struct() sched_ext_free() spin_lock_irqsave() rtlock_lock() ---> TRIGGERS lockdep_assert(!current->pi_blocked_on); This is not a SCHED_EXT bug. The first cleanup function called by __put_task_struct() is sched_ext_free() and it happens to take a (RT) spin_lock, which in the scenario described above, would trigger the lockdep assertion of "!current->pi_blocked_on". Crystal Wood was able to identify the problem as __put_task_struct() being called during rt_mutex_adjust_prio_chain(), in the context of a process with a mutex enqueued. Instead of adding more complex conditions to decide when to directly call __put_task_struct() and when to defer the call, unconditionally resort to the deferred call on PREEMPT_RT to simplify the code. Fixes: 893cdaaa3977 ("sched: avoid false lockdep splat in put_task_struct()") Suggested-by: Crystal Wood Signed-off-by: Luis Claudio R. Goncalves Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Wander Lairson Costa Reviewed-by: Valentin Schneider Reviewed-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/aGvTz5VaPFyj0pBV@uudg.org --- include/linux/sched/task.h | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index c517dbc242f7..ea41795a352b 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -131,24 +131,17 @@ static inline void put_task_struct(struct task_struct *t) return; /* - * In !RT, it is always safe to call __put_task_struct(). - * Under RT, we can only call it in preemptible context. - */ - if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) { - static DEFINE_WAIT_OVERRIDE_MAP(put_task_map, LD_WAIT_SLEEP); - - lock_map_acquire_try(&put_task_map); - __put_task_struct(t); - lock_map_release(&put_task_map); - return; - } - - /* - * under PREEMPT_RT, we can't call put_task_struct + * Under PREEMPT_RT, we can't call __put_task_struct * in atomic context because it will indirectly - * acquire sleeping locks. + * acquire sleeping locks. The same is true if the + * current process has a mutex enqueued (blocked on + * a PI chain). + * + * In !RT, it is always safe to call __put_task_struct(). + * Though, in order to simplify the code, resort to the + * deferred call too. * - * call_rcu() will schedule delayed_put_task_struct_rcu() + * call_rcu() will schedule __put_task_struct_rcu_cb() * to be called in process context. * * __put_task_struct() is called when @@ -161,7 +154,7 @@ static inline void put_task_struct(struct task_struct *t) * * delayed_free_task() also uses ->rcu, but it is only called * when it fails to fork a process. Therefore, there is no - * way it can conflict with put_task_struct(). + * way it can conflict with __put_task_struct(). */ call_rcu(&t->rcu, __put_task_struct_rcu_cb); } -- cgit v1.2.3 From 32e42ab9fc88a884435c27527a433f61c4d2b61b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 26 Jul 2025 00:29:54 -0700 Subject: sched/task_stack: Add missing const qualifier to end_of_stack() Add missing const qualifier to the non-CONFIG_THREAD_INFO_IN_TASK version of end_of_stack() to match the CONFIG_THREAD_INFO_IN_TASK version. Fixes a warning with CONFIG_KSTACK_ERASE=y on archs that don't select THREAD_INFO_IN_TASK (such as LoongArch): error: passing 'const struct task_struct *' to parameter of type 'struct task_struct *' discards qualifiers The stackleak_task_low_bound() function correctly uses a const task parameter, but the legacy end_of_stack() prototype didn't like that. Build tested on loongarch (with CONFIG_KSTACK_ERASE=y) and m68k (with CONFIG_DEBUG_STACK_USAGE=y). Fixes: a45728fd4120 ("LoongArch: Enable HAVE_ARCH_STACKLEAK") Reported-by: Nathan Chancellor Closes: https://lore.kernel.org/all/20250726004313.GA3650901@ax162 Cc: Youling Tang Cc: Huacai Chen Tested-by: Nathan Chancellor Signed-off-by: Kees Cook --- include/linux/sched/task_stack.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h index 85c5a6392e02..1fab7e9043a3 100644 --- a/include/linux/sched/task_stack.h +++ b/include/linux/sched/task_stack.h @@ -53,7 +53,7 @@ static inline void setup_thread_stack(struct task_struct *p, struct task_struct * When the stack grows up, this is the highest address. * Beyond that position, we corrupt data on the next page. */ -static inline unsigned long *end_of_stack(struct task_struct *p) +static inline unsigned long *end_of_stack(const struct task_struct *p) { #ifdef CONFIG_STACK_GROWSUP return (unsigned long *)((unsigned long)task_thread_info(p) + THREAD_SIZE) - 1; -- cgit v1.2.3