From 089d84203ad42bc8fd6dbf41683e162ac6e848cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 Nov 2025 16:39:44 +0100 Subject: sched/fair: Fold the sched_avg update Nine (and a half) instances of the same pattern is just silly, fold the lot. Notably, the half instance in enqueue_load_avg() is right after setting cfs_rq->avg.load_sum to cfs_rq->avg.load_avg * get_pelt_divider(&cfs_rq->avg). Since get_pelt_divisor() >= PELT_MIN_DIVIDER, this ends up being a no-op change. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Shrikanth Hegde Cc: Valentin Schneider Cc: Vincent Guittot Link: https://patch.msgid.link/20251127154725.413564507@infradead.org --- kernel/sched/fair.c | 108 ++++++++++++++++------------------------------------ 1 file changed, 32 insertions(+), 76 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index da46c3164537..aa033e45dce4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3693,7 +3693,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) */ #define add_positive(_ptr, _val) do { \ typeof(_ptr) ptr = (_ptr); \ - typeof(_val) val = (_val); \ + __signed_scalar_typeof(*ptr) val = (_val); \ typeof(*ptr) res, var = READ_ONCE(*ptr); \ \ res = var + val; \ @@ -3704,23 +3704,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) WRITE_ONCE(*ptr, res); \ } while (0) -/* - * Unsigned subtract and clamp on underflow. - * - * Explicitly do a load-store to ensure the intermediate value never hits - * memory. This allows lockless observations without ever seeing the negative - * values. - */ -#define sub_positive(_ptr, _val) do { \ - typeof(_ptr) ptr = (_ptr); \ - typeof(*ptr) val = (_val); \ - typeof(*ptr) res, var = READ_ONCE(*ptr); \ - res = var - val; \ - if (res > var) \ - res = 0; \ - WRITE_ONCE(*ptr, res); \ -} while (0) - /* * Remove and clamp on negative, from a local variable. * @@ -3732,21 +3715,37 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) *ptr -= min_t(typeof(*ptr), *ptr, _val); \ } while (0) + +/* + * Because of rounding, se->util_sum might ends up being +1 more than + * cfs->util_sum. Although this is not a problem by itself, detaching + * a lot of tasks with the rounding problem between 2 updates of + * util_avg (~1ms) can make cfs->util_sum becoming null whereas + * cfs_util_avg is not. + * + * Check that util_sum is still above its lower bound for the new + * util_avg. 
Given that period_contrib might have moved since the last + * sync, we are only sure that util_sum must be above or equal to + * util_avg * minimum possible divider + */ +#define __update_sa(sa, name, delta_avg, delta_sum) do { \ + add_positive(&(sa)->name##_avg, delta_avg); \ + add_positive(&(sa)->name##_sum, delta_sum); \ + (sa)->name##_sum = max_t(typeof((sa)->name##_sum), \ + (sa)->name##_sum, \ + (sa)->name##_avg * PELT_MIN_DIVIDER); \ +} while (0) + static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - cfs_rq->avg.load_avg += se->avg.load_avg; - cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; + __update_sa(&cfs_rq->avg, load, se->avg.load_avg, se->avg.load_sum); } static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); - sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, - cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, load, -se->avg.load_avg, -se->avg.load_sum); } static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); @@ -4242,7 +4241,6 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq */ divider = get_pelt_divider(&cfs_rq->avg); - /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; new_sum = se->avg.util_avg * divider; @@ -4250,12 +4248,7 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq se->avg.util_sum = new_sum; /* Update parent cfs_rq utilization */ - add_positive(&cfs_rq->avg.util_avg, delta_avg); - add_positive(&cfs_rq->avg.util_sum, delta_sum); - - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, - cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, util, delta_avg, delta_sum); } static inline void @@ -4281,11 +4274,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf se->avg.runnable_sum = new_sum; /* Update parent cfs_rq runnable */ - add_positive(&cfs_rq->avg.runnable_avg, delta_avg); - add_positive(&cfs_rq->avg.runnable_sum, delta_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, - cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, runnable, delta_avg, delta_sum); } static inline void @@ -4349,11 +4338,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq se->avg.load_sum = runnable_sum; se->avg.load_avg = load_avg; - add_positive(&cfs_rq->avg.load_avg, delta_avg); - add_positive(&cfs_rq->avg.load_sum, delta_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, - cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, load, delta_avg, delta_sum); } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -4552,33 +4537,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) raw_spin_unlock(&cfs_rq->removed.lock); r = removed_load; - sub_positive(&sa->load_avg, r); - sub_positive(&sa->load_sum, r * divider); - /* See sa->util_sum below */ - sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER); + __update_sa(sa, load, -r, -r*divider); r = removed_util; - sub_positive(&sa->util_avg, r); - sub_positive(&sa->util_sum, r * divider); - /* - * Because of rounding, 
se->util_sum might ends up being +1 more than - * cfs->util_sum. Although this is not a problem by itself, detaching - * a lot of tasks with the rounding problem between 2 updates of - * util_avg (~1ms) can make cfs->util_sum becoming null whereas - * cfs_util_avg is not. - * Check that util_sum is still above its lower bound for the new - * util_avg. Given that period_contrib might have moved since the last - * sync, we are only sure that util_sum must be above or equal to - * util_avg * minimum possible divider - */ - sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER); + __update_sa(sa, util, -r, -r*divider); r = removed_runnable; - sub_positive(&sa->runnable_avg, r); - sub_positive(&sa->runnable_sum, r * divider); - /* See sa->util_sum above */ - sa->runnable_sum = max_t(u32, sa->runnable_sum, - sa->runnable_avg * PELT_MIN_DIVIDER); + __update_sa(sa, runnable, -r, -r*divider); /* * removed_runnable is the unweighted version of removed_load so we @@ -4663,17 +4628,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { dequeue_load_avg(cfs_rq, se); - sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); - sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, - cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); - - sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); - sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, - cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, util, -se->avg.util_avg, -se->avg.util_sum); + __update_sa(&cfs_rq->avg, runnable, -se->avg.runnable_avg, -se->avg.runnable_sum); add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); -- cgit v1.2.3 From 45e09225085f70b856b7b9f26a18ea767a7e1563 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 12 Nov 2025 16:08:23 +0100 Subject: sched/fair: Avoid rq->lock bouncing in sched_balance_newidle() While poking at this code recently I noted we do a pointless unlock+lock cycle in sched_balance_newidle(). We drop the rq->lock (so we can balance) but then instantly grab the same rq->lock again in sched_balance_update_blocked_averages(). 
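As a rough before/after sketch (simplified pseudo-C; the real change is in the diff below):

    /* before: drop the lock, only for the helper to immediately re-take it */
    raw_spin_rq_unlock(this_rq);
    sched_balance_update_blocked_averages(this_cpu);   /* rq_lock_irqsave() inside */

    /* after: update the blocked averages through a lockless inner helper
     * while rq->lock is still held, then drop the lock once */
    __sched_balance_update_blocked_averages(this_rq);
    raw_spin_rq_unlock(this_rq);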
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://patch.msgid.link/20251127154725.532469061@infradead.org --- kernel/sched/fair.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aa033e45dce4..708ad01ac231 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9905,15 +9905,11 @@ static unsigned long task_h_load(struct task_struct *p) } #endif /* !CONFIG_FAIR_GROUP_SCHED */ -static void sched_balance_update_blocked_averages(int cpu) +static void __sched_balance_update_blocked_averages(struct rq *rq) { bool decayed = false, done = true; - struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; - rq_lock_irqsave(rq, &rf); update_blocked_load_tick(rq); - update_rq_clock(rq); decayed |= __update_blocked_others(rq, &done); decayed |= __update_blocked_fair(rq, &done); @@ -9921,7 +9917,15 @@ static void sched_balance_update_blocked_averages(int cpu) update_blocked_load_status(rq, !done); if (decayed) cpufreq_update_util(rq, 0); - rq_unlock_irqrestore(rq, &rf); +} + +static void sched_balance_update_blocked_averages(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + guard(rq_lock_irqsave)(rq); + update_rq_clock(rq); + __sched_balance_update_blocked_averages(rq); } /********** Helpers for sched_balance_find_src_group ************************/ @@ -12868,12 +12872,17 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) } rcu_read_unlock(); + /* + * Include sched_balance_update_blocked_averages() in the cost + * calculation because it can be quite costly -- this ensures we skip + * it when avg_idle gets to be very low. + */ + t0 = sched_clock_cpu(this_cpu); + __sched_balance_update_blocked_averages(this_rq); + rq_modified_clear(this_rq); raw_spin_rq_unlock(this_rq); - t0 = sched_clock_cpu(this_cpu); - sched_balance_update_blocked_averages(this_cpu); - rcu_read_lock(); for_each_domain(this_cpu, sd) { u64 domain_cost; -- cgit v1.2.3 From f24165bfa7ef6b37856c8f51e2001b9ad10ba688 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 Nov 2025 13:31:36 +0100 Subject: sched/headers: Rename rcu_dereference_check_sched_domain() => rcu_dereference_sched_domain() Remove check from the name for being surplus to requirements. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 708ad01ac231..74a0550b4552 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12857,7 +12857,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) rq_unpin_lock(this_rq, rf); rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(this_rq->sd); + sd = rcu_dereference_sched_domain(this_rq->sd); if (!sd) { rcu_read_unlock(); goto out; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d30cca6870f5..2c0a4eaf02de 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2010,7 +2010,7 @@ queue_balance_callback(struct rq *rq, rq->balance_callback = head; } -#define rcu_dereference_check_sched_domain(p) \ +#define rcu_dereference_sched_domain(p) \ rcu_dereference_check((p), lockdep_is_held(&sched_domains_mutex)) /* @@ -2021,7 +2021,7 @@ queue_balance_callback(struct rq *rq, * preempt-disabled sections. 
*/ #define for_each_domain(cpu, __sd) \ - for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ + for (__sd = rcu_dereference_sched_domain(cpu_rq(cpu)->sd); \ __sd; __sd = __sd->parent) /* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */ -- cgit v1.2.3 From 71fedc41c23b0010c578e6e224694ca15c19cf7d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 Nov 2025 13:32:21 +0100 Subject: sched/fair: Switch to rcu_dereference_all() With the {rcu,sched,bh} RCU flavours being unified, it doesn't really make sense to check for just the rcu one. Switch to the _all family of verification which includes all 3 of the listed flavours. Notably, this will enable us to remove some superfluous rcu_read_lock() regions when we know they are inside preempt/IRQ disabled regions. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 48 ++++++++++++++++++++++++------------------------ kernel/sched/sched.h | 2 +- 2 files changed, 25 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 74a0550b4552..44a359d6a299 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1513,7 +1513,7 @@ static unsigned int task_scan_start(struct task_struct *p) /* Scale the maximum scan period with the amount of shared memory. */ rcu_read_lock(); - ng = rcu_dereference(p->numa_group); + ng = rcu_dereference_all(p->numa_group); if (ng) { unsigned long shared = group_faults_shared(ng); unsigned long private = group_faults_priv(ng); @@ -1580,7 +1580,7 @@ pid_t task_numa_group_id(struct task_struct *p) pid_t gid = 0; rcu_read_lock(); - ng = rcu_dereference(p->numa_group); + ng = rcu_dereference_all(p->numa_group); if (ng) gid = ng->gid; rcu_read_unlock(); @@ -2239,7 +2239,7 @@ static bool task_numa_compare(struct task_numa_env *env, return false; rcu_read_lock(); - cur = rcu_dereference(dst_rq->curr); + cur = rcu_dereference_all(dst_rq->curr); if (cur && ((cur->flags & (PF_EXITING | PF_KTHREAD)) || !cur->mm)) cur = NULL; @@ -2284,7 +2284,7 @@ static bool task_numa_compare(struct task_numa_env *env, * If dst and source tasks are in the same NUMA group, or not * in any group then look only at task weights. */ - cur_ng = rcu_dereference(cur->numa_group); + cur_ng = rcu_dereference_all(cur->numa_group); if (cur_ng == p_ng) { /* * Do not swap within a group or between tasks that have @@ -2499,7 +2499,7 @@ static int task_numa_migrate(struct task_struct *p) * to satisfy here. 
*/ rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); + sd = rcu_dereference_all(per_cpu(sd_numa, env.src_cpu)); if (sd) { env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; env.imb_numa_nr = sd->imb_numa_nr; @@ -3022,7 +3022,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (!cpupid_match_pid(tsk, cpupid)) goto no_join; - grp = rcu_dereference(tsk->numa_group); + grp = rcu_dereference_all(tsk->numa_group); if (!grp) goto no_join; @@ -4435,7 +4435,7 @@ static inline void migrate_se_pelt_lag(struct sched_entity *se) rq = rq_of(cfs_rq); rcu_read_lock(); - is_idle = is_idle_task(rcu_dereference(rq->curr)); + is_idle = is_idle_task(rcu_dereference_all(rq->curr)); rcu_read_unlock(); /* @@ -7462,7 +7462,7 @@ static inline void set_idle_cores(int cpu, int val) { struct sched_domain_shared *sds; - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); if (sds) WRITE_ONCE(sds->has_idle_cores, val); } @@ -7471,7 +7471,7 @@ static inline bool test_idle_cores(int cpu) { struct sched_domain_shared *sds; - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); if (sds) return READ_ONCE(sds->has_idle_cores); @@ -7600,7 +7600,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); if (sched_feat(SIS_UTIL)) { - sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); + sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, target)); if (sd_share) { /* because !--nr is the condition to stop scan */ nr = READ_ONCE(sd_share->nr_idle_scan) + 1; @@ -7806,7 +7806,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * sd_asym_cpucapacity rather than sd_llc. */ if (sched_asym_cpucap_active()) { - sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); + sd = rcu_dereference_all(per_cpu(sd_asym_cpucapacity, target)); /* * On an asymmetric CPU capacity system where an exclusive * cpuset defines a symmetric island (i.e. one unique @@ -7821,7 +7821,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) } } - sd = rcu_dereference(per_cpu(sd_llc, target)); + sd = rcu_dereference_all(per_cpu(sd_llc, target)); if (!sd) return target; @@ -8290,7 +8290,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) struct energy_env eenv; rcu_read_lock(); - pd = rcu_dereference(rd->pd); + pd = rcu_dereference_all(rd->pd); if (!pd) goto unlock; @@ -8298,7 +8298,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) * Energy-aware wake-up happens on the lowest sched_domain starting * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu. 
*/ - sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity)); + sd = rcu_dereference_all(*this_cpu_ptr(&sd_asym_cpucapacity)); while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) sd = sd->parent; if (!sd) @@ -9305,7 +9305,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) */ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { - struct numa_group *numa_group = rcu_dereference(p->numa_group); + struct numa_group *numa_group = rcu_dereference_all(p->numa_group); unsigned long src_weight, dst_weight; int src_nid, dst_nid, dist; @@ -10985,7 +10985,7 @@ static void update_idle_cpu_scan(struct lb_env *env, if (env->sd->span_weight != llc_weight) return; - sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu)); + sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, env->dst_cpu)); if (!sd_share) return; @@ -11335,7 +11335,7 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env) goto force_balance; if (!is_rd_overutilized(env->dst_rq->rd) && - rcu_dereference(env->dst_rq->rd->pd)) + rcu_dereference_all(env->dst_rq->rd->pd)) goto out_balanced; /* ASYM feature bypasses nice load balance check */ @@ -12424,7 +12424,7 @@ static void nohz_balancer_kick(struct rq *rq) rcu_read_lock(); - sd = rcu_dereference(rq->sd); + sd = rcu_dereference_all(rq->sd); if (sd) { /* * If there's a runnable CFS task and the current CPU has reduced @@ -12436,7 +12436,7 @@ static void nohz_balancer_kick(struct rq *rq) } } - sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); + sd = rcu_dereference_all(per_cpu(sd_asym_packing, cpu)); if (sd) { /* * When ASYM_PACKING; see if there's a more preferred CPU @@ -12454,7 +12454,7 @@ static void nohz_balancer_kick(struct rq *rq) } } - sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu)); + sd = rcu_dereference_all(per_cpu(sd_asym_cpucapacity, cpu)); if (sd) { /* * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU @@ -12475,7 +12475,7 @@ static void nohz_balancer_kick(struct rq *rq) goto unlock; } - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); if (sds) { /* * If there is an imbalance between LLC domains (IOW we could @@ -12507,7 +12507,7 @@ static void set_cpu_sd_state_busy(int cpu) struct sched_domain *sd; rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, cpu)); + sd = rcu_dereference_all(per_cpu(sd_llc, cpu)); if (!sd || !sd->nohz_idle) goto unlock; @@ -12537,7 +12537,7 @@ static void set_cpu_sd_state_idle(int cpu) struct sched_domain *sd; rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, cpu)); + sd = rcu_dereference_all(per_cpu(sd_llc, cpu)); if (!sd || sd->nohz_idle) goto unlock; @@ -13915,7 +13915,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) struct numa_group *ng; rcu_read_lock(); - ng = rcu_dereference(p->numa_group); + ng = rcu_dereference_all(p->numa_group); for_each_online_node(node) { if (p->numa_faults) { tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)]; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2c0a4eaf02de..67cff7d6cc2f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2011,7 +2011,7 @@ queue_balance_callback(struct rq *rq, } #define rcu_dereference_sched_domain(p) \ - rcu_dereference_check((p), lockdep_is_held(&sched_domains_mutex)) + rcu_dereference_all_check((p), lockdep_is_held(&sched_domains_mutex)) /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 
-- cgit v1.2.3 From a03fee333a2f1e065a739bdbe5edbc5512fab9a4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Nov 2025 11:00:55 +0100 Subject: sched/fair: Remove superfluous rcu_read_lock() With fair switched to rcu_dereference_all() validation, having IRQ or preemption disabled is sufficient, remove the rcu_read_lock() clutter. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://patch.msgid.link/20251127154725.647502625@infradead.org --- kernel/sched/fair.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 44a359d6a299..496a30a41854 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12856,21 +12856,16 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) */ rq_unpin_lock(this_rq, rf); - rcu_read_lock(); sd = rcu_dereference_sched_domain(this_rq->sd); - if (!sd) { - rcu_read_unlock(); + if (!sd) goto out; - } if (!get_rd_overloaded(this_rq->rd) || this_rq->avg_idle < sd->max_newidle_lb_cost) { update_next_balance(sd, &next_balance); - rcu_read_unlock(); goto out; } - rcu_read_unlock(); /* * Include sched_balance_update_blocked_averages() in the cost @@ -12883,7 +12878,6 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) rq_modified_clear(this_rq); raw_spin_rq_unlock(this_rq); - rcu_read_lock(); for_each_domain(this_cpu, sd) { u64 domain_cost; @@ -12933,7 +12927,6 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) if (pulled_task || !continue_balancing) break; } - rcu_read_unlock(); raw_spin_rq_lock(this_rq); -- cgit v1.2.3 From 95a0155224a658965f34ed4b1943b238d9be1fea Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 1 Sep 2025 22:50:56 +0200 Subject: sched/fair: Limit hrtick work The task_tick_fair() function does: - update the hierarchical runtimes - drive NUMA-balancing - update load-balance statistics - drive force-idle preemption All but the very first can be limited to the periodic tick. Let hrtick only update accounting and drive preemption, not load-balancing and other bits. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://patch.msgid.link/20250918080205.563385766@infradead.org --- kernel/sched/fair.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 496a30a41854..f79951facff4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -13332,6 +13332,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) entity_tick(cfs_rq, se, queued); } + if (queued) { + if (!need_resched()) + hrtick_start_fair(rq, curr); + return; + } + if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); -- cgit v1.2.3 From 47efe2ddccb1f285a02bfcf1e079f49bf7a9ccb3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 30 Oct 2025 12:56:34 +0100 Subject: sched/core: Add assertions to QUEUE_CLASS Add some checks to the sched_change pattern to validate assumptions around changing classes. 
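For context, the sched_change pattern these assertions guard is used roughly like this (hypothetical caller, condensed; the helpers themselves are in the diff below):

    struct sched_change_ctx *ctx;

    ctx = sched_change_begin(p, flags);    /* dequeues / stops p as needed */
    /* ... change p->sched_class and/or p->prio ... */
    sched_change_end(ctx);                 /* re-enqueues p; now also warns if the
                                            * class changed without *QUEUE_CLASS, or
                                            * if a class demotion left need_resched
                                            * unset */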
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://patch.msgid.link/20251127154725.771691954@infradead.org --- kernel/sched/core.c | 13 +++++++++++++ kernel/sched/sched.h | 1 + 2 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 41ba0be16911..4479f7d1fdfb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10782,6 +10782,7 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int *ctx = (struct sched_change_ctx){ .p = p, + .class = p->sched_class, .flags = flags, .queued = task_on_rq_queued(p), .running = task_current_donor(rq, p), @@ -10812,6 +10813,11 @@ void sched_change_end(struct sched_change_ctx *ctx) lockdep_assert_rq_held(rq); + /* + * Changing class without *QUEUE_CLASS is bad. + */ + WARN_ON_ONCE(p->sched_class != ctx->class && !(ctx->flags & ENQUEUE_CLASS)); + if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to) p->sched_class->switching_to(rq, p); @@ -10823,6 +10829,13 @@ void sched_change_end(struct sched_change_ctx *ctx) if (ctx->flags & ENQUEUE_CLASS) { if (p->sched_class->switched_to) p->sched_class->switched_to(rq, p); + + /* + * If this was a degradation in class someone should have set + * need_resched by now. + */ + WARN_ON_ONCE(sched_class_above(ctx->class, p->sched_class) && + !test_tsk_need_resched(p)); } else { p->sched_class->prio_changed(rq, p, ctx->prio); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 67cff7d6cc2f..a40582d5b288 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3968,6 +3968,7 @@ extern void balance_callbacks(struct rq *rq, struct balance_callback *head); struct sched_change_ctx { u64 prio; struct task_struct *p; + const struct sched_class *class; int flags; bool queued; bool running; -- cgit v1.2.3 From 2b8c3d3dc9b1ee323e2982945088e3f5eebdf3dd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 26 Nov 2025 11:31:09 +0100 Subject: sched/fair: Join two #ifdef CONFIG_FAIR_GROUP_SCHED blocks Join two identical #ifdef blocks: #ifdef CONFIG_FAIR_GROUP_SCHED ... #endif #ifdef CONFIG_FAIR_GROUP_SCHED ... #endif Also mark nested #ifdef blocks in the usual fashion, to make it more apparent where in a nested hierarchy of #ifdefs we are at a glance. 
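The "usual fashion" referred to is indenting the directive keyword after the '#' of nested conditionals, for example:

    #ifdef CONFIG_FAIR_GROUP_SCHED
            ...
    # ifdef CONFIG_CFS_BANDWIDTH
            ...
    # endif /* CONFIG_CFS_BANDWIDTH */
    #endif /* CONFIG_FAIR_GROUP_SCHED */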
Signed-off-by: Ingo Molnar Reviewed-by: Shrikanth Hegde Link: https://patch.msgid.link/20251201064647.1851919-2-mingo@kernel.org --- kernel/sched/sched.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a40582d5b288..2173e3d4fa0d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -726,9 +726,7 @@ struct cfs_rq { unsigned long h_load; u64 last_h_load_update; struct sched_entity *h_load_next; -#endif /* CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ /* @@ -746,14 +744,14 @@ struct cfs_rq { /* Locally cached copy of our task_group's idle value */ int idle; -#ifdef CONFIG_CFS_BANDWIDTH +# ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; s64 runtime_remaining; u64 throttled_pelt_idle; -#ifndef CONFIG_64BIT +# ifndef CONFIG_64BIT u64 throttled_pelt_idle_copy; -#endif +# endif u64 throttled_clock; u64 throttled_clock_pelt; u64 throttled_clock_pelt_time; @@ -765,7 +763,7 @@ struct cfs_rq { struct list_head throttled_list; struct list_head throttled_csd_list; struct list_head throttled_limbo_list; -#endif /* CONFIG_CFS_BANDWIDTH */ +# endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; -- cgit v1.2.3 From fb9a7458e508ef1beae8d80ee40c2cd1b5b45f3a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 26 Nov 2025 11:29:18 +0100 Subject: sched/fair: Clean up comments in 'struct cfs_rq' - Fix vertical alignment - Fix typos - Fix capitalization Signed-off-by: Ingo Molnar Link: https://patch.msgid.link/20251201064647.1851919-3-mingo@kernel.org --- kernel/sched/sched.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2173e3d4fa0d..82522c98c903 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -670,13 +670,13 @@ struct balance_callback { void (*func)(struct rq *rq); }; -/* CFS-related fields in a runqueue */ +/* Fair scheduling SCHED_{NORMAL,BATCH,IDLE} related fields in a runqueue: */ struct cfs_rq { struct load_weight load; unsigned int nr_queued; - unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */ - unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */ - unsigned int h_nr_idle; /* SCHED_IDLE */ + unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_idle; /* SCHED_IDLE */ s64 avg_vruntime; u64 avg_load; @@ -690,7 +690,7 @@ struct cfs_rq { struct rb_root_cached tasks_timeline; /* - * 'curr' points to currently running entity on this cfs_rq. + * 'curr' points to the currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ struct sched_entity *curr; @@ -739,7 +739,7 @@ struct cfs_rq { */ int on_list; struct list_head leaf_cfs_rq_list; - struct task_group *tg; /* group that "owns" this runqueue */ + struct task_group *tg; /* Group that "owns" this runqueue */ /* Locally cached copy of our task_group's idle value */ int idle; -- cgit v1.2.3 From 4ff674fa986c27ec8a0542479258c92d361a2566 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 26 Nov 2025 12:09:16 +0100 Subject: sched/fair: Rename cfs_rq::avg_load to cfs_rq::sum_weight The ::avg_load field is a long-standing misnomer: it says it's an 'average load', but in reality it's the momentary sum of the load of all currently runnable tasks. 
We'd have to also perform a division by nr_running (or use time-decay) to arrive at any sort of average value. This is clear from comments about the math of fair scheduling: * \Sum w_i := cfs_rq->avg_load The sum of all weights is ... the sum of all weights, not the average of all weights. To make it doubly confusing, there's also an ::avg_load in the load-balancing struct sg_lb_stats, which *is* a true average. The second part of the field's name is a minor misnomer as well: it says 'load', and it is indeed a load_weight structure as it shares code with the load-balancer - but it's only in an SMP load-balancing context where load = weight, in the fair scheduling context the primary purpose is the weighting of different nice levels. So rename the field to ::sum_weight instead, which makes the terminology of the EEVDF math match up with our implementation of it: * \Sum w_i := cfs_rq->sum_weight Signed-off-by: Ingo Molnar Link: https://patch.msgid.link/20251201064647.1851919-6-mingo@kernel.org --- kernel/sched/fair.c | 16 ++++++++-------- kernel/sched/sched.h | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f79951facff4..65b1065f9b21 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -608,7 +608,7 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * * v0 := cfs_rq->zero_vruntime * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime - * \Sum w_i := cfs_rq->avg_load + * \Sum w_i := cfs_rq->sum_weight * * Since zero_vruntime closely tracks the per-task service, these * deltas: (v_i - v), will be in the order of the maximal (virtual) lag @@ -625,7 +625,7 @@ avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) s64 key = entity_key(cfs_rq, se); cfs_rq->avg_vruntime += key * weight; - cfs_rq->avg_load += weight; + cfs_rq->sum_weight += weight; } static void @@ -635,16 +635,16 @@ avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) s64 key = entity_key(cfs_rq, se); cfs_rq->avg_vruntime -= key * weight; - cfs_rq->avg_load -= weight; + cfs_rq->sum_weight -= weight; } static inline void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) { /* - * v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load + * v' = v + d ==> avg_vruntime' = avg_runtime - d*sum_weight */ - cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta; + cfs_rq->avg_vruntime -= cfs_rq->sum_weight * delta; } /* @@ -655,7 +655,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; s64 avg = cfs_rq->avg_vruntime; - long load = cfs_rq->avg_load; + long load = cfs_rq->sum_weight; if (curr && curr->on_rq) { unsigned long weight = scale_load_down(curr->load.weight); @@ -723,7 +723,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) { struct sched_entity *curr = cfs_rq->curr; s64 avg = cfs_rq->avg_vruntime; - long load = cfs_rq->avg_load; + long load = cfs_rq->sum_weight; if (curr && curr->on_rq) { unsigned long weight = scale_load_down(curr->load.weight); @@ -5131,7 +5131,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * * vl_i = (W + w_i)*vl'_i / W */ - load = cfs_rq->avg_load; + load = cfs_rq->sum_weight; if (curr && curr->on_rq) load += scale_load_down(curr->load.weight); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 82522c98c903..3334aa535423 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -679,7 +679,7 @@ struct cfs_rq { unsigned int h_nr_idle; /* SCHED_IDLE */ s64 avg_vruntime; - 
u64 avg_load; + u64 sum_weight; u64 zero_vruntime; #ifdef CONFIG_SCHED_CORE -- cgit v1.2.3 From dcbc9d3f0e594223275a18f7016001889ad35eff Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 2 Dec 2025 16:09:23 +0100 Subject: sched/fair: Rename cfs_rq::avg_vruntime to ::sum_w_vruntime, and helper functions The ::avg_vruntime field is a misnomer: it says it's an 'average vruntime', but in reality it's the momentary sum of the weighted vruntimes of all queued tasks, which is at least a division away from being an average. This is clear from comments about the math of fair scheduling: * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime This confusion is increased by the cfs_avg_vruntime() function, which does perform the division and returns a true average. The sum of all weighted vruntimes should be named thusly, so rename the field to ::sum_w_vruntime. (As arguably ::sum_weighted_vruntime would be a bit of a mouthful.) Understanding the scheduler is hard enough already, without extra layers of obfuscated naming. ;-) Also rename related helper functions: sum_vruntime_add() => sum_w_vruntime_add() sum_vruntime_sub() => sum_w_vruntime_sub() sum_vruntime_update() => sum_w_vruntime_update() With the notable exception of cfs_avg_vruntime(), which was named accurately. Signed-off-by: Ingo Molnar Link: https://patch.msgid.link/20251201064647.1851919-7-mingo@kernel.org --- kernel/sched/fair.c | 26 +++++++++++++------------- kernel/sched/sched.h | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 65b1065f9b21..dcbd995de46d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -607,7 +607,7 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * Which we track using: * * v0 := cfs_rq->zero_vruntime - * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime + * \Sum (v_i - v0) * w_i := cfs_rq->sum_w_vruntime * \Sum w_i := cfs_rq->sum_weight * * Since zero_vruntime closely tracks the per-task service, these @@ -619,32 +619,32 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * As measured, the max (key * weight) value was ~44 bits for a kernel build. 
*/ static void -avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned long weight = scale_load_down(se->load.weight); s64 key = entity_key(cfs_rq, se); - cfs_rq->avg_vruntime += key * weight; + cfs_rq->sum_w_vruntime += key * weight; cfs_rq->sum_weight += weight; } static void -avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) +sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned long weight = scale_load_down(se->load.weight); s64 key = entity_key(cfs_rq, se); - cfs_rq->avg_vruntime -= key * weight; + cfs_rq->sum_w_vruntime -= key * weight; cfs_rq->sum_weight -= weight; } static inline -void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) +void sum_w_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) { /* - * v' = v + d ==> avg_vruntime' = avg_runtime - d*sum_weight + * v' = v + d ==> sum_w_vruntime' = sum_runtime - d*sum_weight */ - cfs_rq->avg_vruntime -= cfs_rq->sum_weight * delta; + cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta; } /* @@ -654,7 +654,7 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) u64 avg_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - s64 avg = cfs_rq->avg_vruntime; + s64 avg = cfs_rq->sum_w_vruntime; long load = cfs_rq->sum_weight; if (curr && curr->on_rq) { @@ -722,7 +722,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) { struct sched_entity *curr = cfs_rq->curr; - s64 avg = cfs_rq->avg_vruntime; + s64 avg = cfs_rq->sum_w_vruntime; long load = cfs_rq->sum_weight; if (curr && curr->on_rq) { @@ -745,7 +745,7 @@ static void update_zero_vruntime(struct cfs_rq *cfs_rq) u64 vruntime = avg_vruntime(cfs_rq); s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime); - avg_vruntime_update(cfs_rq, delta); + sum_w_vruntime_update(cfs_rq, delta); cfs_rq->zero_vruntime = vruntime; } @@ -819,7 +819,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - avg_vruntime_add(cfs_rq, se); + sum_w_vruntime_add(cfs_rq, se); update_zero_vruntime(cfs_rq); se->min_vruntime = se->vruntime; se->min_slice = se->slice; @@ -831,7 +831,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, &min_vruntime_cb); - avg_vruntime_sub(cfs_rq, se); + sum_w_vruntime_sub(cfs_rq, se); update_zero_vruntime(cfs_rq); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3334aa535423..ab1bfa05e894 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -678,7 +678,7 @@ struct cfs_rq { unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */ unsigned int h_nr_idle; /* SCHED_IDLE */ - s64 avg_vruntime; + s64 sum_w_vruntime; u64 sum_weight; u64 zero_vruntime; -- cgit v1.2.3 From 5758e48eefaf111d7764d8f1c8b666140fe5fa27 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 2 Dec 2025 16:10:32 +0100 Subject: sched/fair: Introduce and use the vruntime_cmp() and vruntime_op() wrappers for wrapped-signed aritmetics We have to be careful with vruntime comparisons and subtraction, due to the possibility of wrapping, so we have macros like: #define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; }) Which is used like this: if (vruntime_gt(min_vruntime, se, rse)) se->min_vruntime = rse->min_vruntime; Replace this 
with an easier to read pattern that uses the regular arithmetics operators: if (vruntime_cmp(se->min_vruntime, ">", rse->min_vruntime)) se->min_vruntime = rse->min_vruntime; Also replace vruntime subtractions with vruntime_op(): - delta = (s64)(sea->vruntime - seb->vruntime) + - (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi); + delta = vruntime_op(sea->vruntime, "-", seb->vruntime) + + vruntime_op(cfs_rqb->zero_vruntime_fi, "-", cfs_rqa->zero_vruntime_fi); In the vruntime_cmp() and vruntime_op() macros use Use __builtin_strcmp(), because of __HAVE_ARCH_STRCMP might turn off the compiler optimizations we rely on here to catch usage bugs. No change in functionality. Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 66 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 51 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dcbd995de46d..a8ac68d17c6d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -524,10 +524,48 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); * Scheduling class tree data structure manipulation methods: */ +extern void __BUILD_BUG_vruntime_cmp(void); + +/* Use __builtin_strcmp() because of __HAVE_ARCH_STRCMP: */ + +#define vruntime_cmp(A, CMP_STR, B) ({ \ + int __res = 0; \ + \ + if (!__builtin_strcmp(CMP_STR, "<")) { \ + __res = ((s64)((A)-(B)) < 0); \ + } else if (!__builtin_strcmp(CMP_STR, "<=")) { \ + __res = ((s64)((A)-(B)) <= 0); \ + } else if (!__builtin_strcmp(CMP_STR, ">")) { \ + __res = ((s64)((A)-(B)) > 0); \ + } else if (!__builtin_strcmp(CMP_STR, ">=")) { \ + __res = ((s64)((A)-(B)) >= 0); \ + } else { \ + /* Unknown operator throws linker error: */ \ + __BUILD_BUG_vruntime_cmp(); \ + } \ + \ + __res; \ +}) + +extern void __BUILD_BUG_vruntime_op(void); + +#define vruntime_op(A, OP_STR, B) ({ \ + s64 __res = 0; \ + \ + if (!__builtin_strcmp(OP_STR, "-")) { \ + __res = (s64)((A)-(B)); \ + } else { \ + /* Unknown operator throws linker error: */ \ + __BUILD_BUG_vruntime_op(); \ + } \ + \ + __res; \ +}) + + static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime) { - s64 delta = (s64)(vruntime - max_vruntime); - if (delta > 0) + if (vruntime_cmp(vruntime, ">", max_vruntime)) max_vruntime = vruntime; return max_vruntime; @@ -535,8 +573,7 @@ static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime) static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime) { - s64 delta = (s64)(vruntime - min_vruntime); - if (delta < 0) + if (vruntime_cmp(vruntime, "<", min_vruntime)) min_vruntime = vruntime; return min_vruntime; @@ -549,12 +586,12 @@ static inline bool entity_before(const struct sched_entity *a, * Tiebreak on vruntime seems unnecessary since it can * hardly happen. 
*/ - return (s64)(a->deadline - b->deadline) < 0; + return vruntime_cmp(a->deadline, "<", b->deadline); } static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { - return (s64)(se->vruntime - cfs_rq->zero_vruntime); + return vruntime_op(se->vruntime, "-", cfs_rq->zero_vruntime); } #define __node_2_se(node) \ @@ -732,7 +769,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) load += weight; } - return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load; + return avg >= vruntime_op(vruntime, "-", cfs_rq->zero_vruntime) * load; } int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -743,7 +780,7 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) static void update_zero_vruntime(struct cfs_rq *cfs_rq) { u64 vruntime = avg_vruntime(cfs_rq); - s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime); + s64 delta = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime); sum_w_vruntime_update(cfs_rq, delta); @@ -770,13 +807,12 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) return entity_before(__node_2_se(a), __node_2_se(b)); } -#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; }) - static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node) { if (node) { struct sched_entity *rse = __node_2_se(node); - if (vruntime_gt(min_vruntime, se, rse)) + + if (vruntime_cmp(se->min_vruntime, ">", rse->min_vruntime)) se->min_vruntime = rse->min_vruntime; } } @@ -887,7 +923,7 @@ static inline void update_protect_slice(struct cfs_rq *cfs_rq, struct sched_enti static inline bool protect_slice(struct sched_entity *se) { - return ((s64)(se->vprot - se->vruntime) > 0); + return vruntime_cmp(se->vruntime, "<", se->vprot); } static inline void cancel_protect_slice(struct sched_entity *se) @@ -1024,7 +1060,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); */ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if ((s64)(se->vruntime - se->deadline) < 0) + if (vruntime_cmp(se->vruntime, "<", se->deadline)) return false; /* @@ -13293,8 +13329,8 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, * zero_vruntime_fi, which would have been updated in prior calls * to se_fi_update(). */ - delta = (s64)(sea->vruntime - seb->vruntime) + - (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi); + delta = vruntime_op(sea->vruntime, "-", seb->vruntime) + + vruntime_op(cfs_rqb->zero_vruntime_fi, "-", cfs_rqa->zero_vruntime_fi); return delta > 0; } -- cgit v1.2.3 From 527a521029c3edd38fb9fc96cd58e3fd7393d28e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 2 Dec 2025 10:35:06 +0100 Subject: sched/fair: Sort out 'blocked_load*' namespace noise There's three layers of logic in the scheduler that deal with 'has_blocked' (load) handling of the NOHZ code: (1) nohz.has_blocked, (2) rq->has_blocked_load, deal with NOHZ idle balancing, (3) and cfs_rq_has_blocked(), which is part of the layer that is passing the SMP load-balancing signal to the NOHZ layers. The 'has_blocked' and 'has_blocked_load' names are used in a mixed fashion, sometimes within the same function. Standardize on 'has_blocked_load' to make it all easy to read and easy to grep. No change in functionality. 
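In concrete terms, the renames in the diff below are:

    nohz.has_blocked                ->  nohz.has_blocked_load
    cfs_rq_has_blocked()            ->  cfs_rq_has_blocked_load()
    update_blocked_load_status()    ->  update_has_blocked_load_status()

    rq->has_blocked_load                (already named consistently, unchanged)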
Suggested-by: Vincent Guittot Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Shrikanth Hegde Link: https://patch.msgid.link/aS6yvxyc3JfMxxQW@gmail.com --- kernel/sched/fair.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a8ac68d17c6d..d588eb871657 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7140,7 +7140,7 @@ static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask); static struct { cpumask_var_t idle_cpus_mask; atomic_t nr_cpus; - int has_blocked; /* Idle CPUS has blocked load */ + int has_blocked_load; /* Idle CPUS has blocked load */ int needs_update; /* Newly idle CPUs need their next_balance collated */ unsigned long next_balance; /* in jiffy units */ unsigned long next_blocked; /* Next update of blocked load in jiffies */ @@ -9770,7 +9770,7 @@ static void attach_tasks(struct lb_env *env) } #ifdef CONFIG_NO_HZ_COMMON -static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) +static inline bool cfs_rq_has_blocked_load(struct cfs_rq *cfs_rq) { if (cfs_rq->avg.load_avg) return true; @@ -9803,16 +9803,16 @@ static inline void update_blocked_load_tick(struct rq *rq) WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies); } -static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) +static inline void update_has_blocked_load_status(struct rq *rq, bool has_blocked_load) { - if (!has_blocked) + if (!has_blocked_load) rq->has_blocked_load = 0; } #else /* !CONFIG_NO_HZ_COMMON: */ -static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } +static inline bool cfs_rq_has_blocked_load(struct cfs_rq *cfs_rq) { return false; } static inline bool others_have_blocked(struct rq *rq) { return false; } static inline void update_blocked_load_tick(struct rq *rq) {} -static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} +static inline void update_has_blocked_load_status(struct rq *rq, bool has_blocked_load) {} #endif /* !CONFIG_NO_HZ_COMMON */ static bool __update_blocked_others(struct rq *rq, bool *done) @@ -9869,7 +9869,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) list_del_leaf_cfs_rq(cfs_rq); /* Don't need periodic decay once load/util_avg are null */ - if (cfs_rq_has_blocked(cfs_rq)) + if (cfs_rq_has_blocked_load(cfs_rq)) *done = false; } @@ -9929,7 +9929,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) bool decayed; decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); - if (cfs_rq_has_blocked(cfs_rq)) + if (cfs_rq_has_blocked_load(cfs_rq)) *done = false; return decayed; @@ -9950,7 +9950,7 @@ static void __sched_balance_update_blocked_averages(struct rq *rq) decayed |= __update_blocked_others(rq, &done); decayed |= __update_blocked_fair(rq, &done); - update_blocked_load_status(rq, !done); + update_has_blocked_load_status(rq, !done); if (decayed) cpufreq_update_util(rq, 0); } @@ -12446,7 +12446,7 @@ static void nohz_balancer_kick(struct rq *rq) if (likely(!atomic_read(&nohz.nr_cpus))) return; - if (READ_ONCE(nohz.has_blocked) && + if (READ_ONCE(nohz.has_blocked_load) && time_after(now, READ_ONCE(nohz.next_blocked))) flags = NOHZ_STATS_KICK; @@ -12607,9 +12607,9 @@ void nohz_balance_enter_idle(int cpu) /* * The tick is still stopped but load could have been added in the - * meantime. We set the nohz.has_blocked flag to trig a check of the + * meantime. 
We set the nohz.has_blocked_load flag to trig a check of the * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear - * of nohz.has_blocked can only happen after checking the new load + * of nohz.has_blocked_load can only happen after checking the new load */ if (rq->nohz_tick_stopped) goto out; @@ -12625,7 +12625,7 @@ void nohz_balance_enter_idle(int cpu) /* * Ensures that if nohz_idle_balance() fails to observe our - * @idle_cpus_mask store, it must observe the @has_blocked + * @idle_cpus_mask store, it must observe the @has_blocked_load * and @needs_update stores. */ smp_mb__after_atomic(); @@ -12638,7 +12638,7 @@ out: * Each time a cpu enter idle, we assume that it has blocked load and * enable the periodic update of the load of idle CPUs */ - WRITE_ONCE(nohz.has_blocked, 1); + WRITE_ONCE(nohz.has_blocked_load, 1); } static bool update_nohz_stats(struct rq *rq) @@ -12679,8 +12679,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) /* * We assume there will be no idle load after this update and clear - * the has_blocked flag. If a cpu enters idle in the mean time, it will - * set the has_blocked flag and trigger another update of idle load. + * the has_blocked_load flag. If a cpu enters idle in the mean time, it will + * set the has_blocked_load flag and trigger another update of idle load. * Because a cpu that becomes idle, is added to idle_cpus_mask before * setting the flag, we are sure to not clear the state and not * check the load of an idle cpu. @@ -12688,12 +12688,12 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) * Same applies to idle_cpus_mask vs needs_update. */ if (flags & NOHZ_STATS_KICK) - WRITE_ONCE(nohz.has_blocked, 0); + WRITE_ONCE(nohz.has_blocked_load, 0); if (flags & NOHZ_NEXT_KICK) WRITE_ONCE(nohz.needs_update, 0); /* - * Ensures that if we miss the CPU, we must see the has_blocked + * Ensures that if we miss the CPU, we must see the has_blocked_load * store from nohz_balance_enter_idle(). */ smp_mb(); @@ -12760,7 +12760,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) abort: /* There is still blocked load, enable periodic update */ if (has_blocked_load) - WRITE_ONCE(nohz.has_blocked, 1); + WRITE_ONCE(nohz.has_blocked_load, 1); } /* @@ -12822,7 +12822,7 @@ static void nohz_newidle_balance(struct rq *this_rq) return; /* Don't need to update blocked load of idle CPUs*/ - if (!READ_ONCE(nohz.has_blocked) || + if (!READ_ONCE(nohz.has_blocked_load) || time_before(jiffies, READ_ONCE(nohz.next_blocked))) return; -- cgit v1.2.3 From 704069649b5bfb7bf1fe32c0281fe9036806a59a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Dec 2025 09:06:50 +0100 Subject: sched/core: Rework sched_class::wakeup_preempt() and rq_modified_*() Change sched_class::wakeup_preempt() to also get called for cross-class wakeups, specifically those where the woken task is of a higher class than the previous highest class. In order to do this, track the current highest class of the runqueue in rq::next_class and have wakeup_preempt() track this upwards for each new wakeup. Additionally have schedule() re-set the value on pick. 
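The resulting core of wakeup_preempt() is roughly (condensed from the kernel/sched/core.c hunk below):

    if (p->sched_class == rq->next_class) {
            rq->next_class->wakeup_preempt(rq, p, flags);
    } else if (sched_class_above(p->sched_class, rq->next_class)) {
            /* woken task is of a higher class: let the old top class know,
             * force a reschedule and track the new highest class */
            rq->next_class->wakeup_preempt(rq, p, flags);
            resched_curr(rq);
            rq->next_class = p->sched_class;
    }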
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://patch.msgid.link/20251127154725.901391274@infradead.org --- kernel/sched/core.c | 32 +++++++++++++++++++++++--------- kernel/sched/deadline.c | 14 +++++++++----- kernel/sched/ext.c | 9 ++++----- kernel/sched/fair.c | 17 ++++++++++------- kernel/sched/idle.c | 3 --- kernel/sched/rt.c | 9 ++++++--- kernel/sched/sched.h | 27 ++------------------------- kernel/sched/stop_task.c | 3 --- 8 files changed, 54 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4479f7d1fdfb..7d0a862a8c75 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2090,7 +2090,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) */ uclamp_rq_inc(rq, p, flags); - rq->queue_mask |= p->sched_class->queue_mask; p->sched_class->enqueue_task(rq, p, flags); psi_enqueue(p, flags); @@ -2123,7 +2122,6 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) * and mark the task ->sched_delayed. */ uclamp_rq_dec(rq, p); - rq->queue_mask |= p->sched_class->queue_mask; return p->sched_class->dequeue_task(rq, p, flags); } @@ -2174,10 +2172,14 @@ void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *donor = rq->donor; - if (p->sched_class == donor->sched_class) - donor->sched_class->wakeup_preempt(rq, p, flags); - else if (sched_class_above(p->sched_class, donor->sched_class)) + if (p->sched_class == rq->next_class) { + rq->next_class->wakeup_preempt(rq, p, flags); + + } else if (sched_class_above(p->sched_class, rq->next_class)) { + rq->next_class->wakeup_preempt(rq, p, flags); resched_curr(rq); + rq->next_class = p->sched_class; + } /* * A queue event has occurred, and we're going to schedule. In @@ -6804,6 +6806,7 @@ static void __sched notrace __schedule(int sched_mode) pick_again: next = pick_next_task(rq, rq->donor, &rf); rq_set_donor(rq, next); + rq->next_class = next->sched_class; if (unlikely(task_is_blocked(next))) { next = find_proxy_task(rq, next, &rf); if (!next) @@ -8650,6 +8653,8 @@ void __init sched_init(void) rq->rt.rt_runtime = global_rt_runtime(); init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif + rq->next_class = &idle_sched_class; + rq->sd = NULL; rq->rd = NULL; rq->cpu_capacity = SCHED_CAPACITY_SCALE; @@ -10775,10 +10780,8 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags |= DEQUEUE_NOCLOCK; } - if (flags & DEQUEUE_CLASS) { - if (p->sched_class->switching_from) - p->sched_class->switching_from(rq, p); - } + if ((flags & DEQUEUE_CLASS) && p->sched_class->switching_from) + p->sched_class->switching_from(rq, p); *ctx = (struct sched_change_ctx){ .p = p, @@ -10830,6 +10833,17 @@ void sched_change_end(struct sched_change_ctx *ctx) if (p->sched_class->switched_to) p->sched_class->switched_to(rq, p); + /* + * If this was a class promotion; let the old class know it + * got preempted. Note that none of the switch*_from() methods + * know the new class and none of the switch*_to() methods + * know the old class. + */ + if (ctx->running && sched_class_above(p->sched_class, ctx->class)) { + rq->next_class->wakeup_preempt(rq, p, 0); + rq->next_class = p->sched_class; + } + /* * If this was a degradation in class someone should have set * need_resched by now. 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 319439fe1870..80c9559a3e30 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2499,9 +2499,16 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) * Only called when both the current and waking task are -deadline * tasks. */ -static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, - int flags) +static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags) { + /* + * Can only get preempted by stop-class, and those should be + * few and short lived, doesn't really make sense to push + * anything away for that. + */ + if (p->sched_class != &dl_sched_class) + return; + if (dl_entity_preempt(&p->dl, &rq->donor->dl)) { resched_curr(rq); return; @@ -3346,9 +3353,6 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu) #endif DEFINE_SCHED_CLASS(dl) = { - - .queue_mask = 8, - .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 05f5a49e9649..3b32e641b7ee 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2431,7 +2431,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) /* see kick_cpus_irq_workfn() */ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); - rq_modified_clear(rq); + rq->next_class = &ext_sched_class; rq_unpin_lock(rq, rf); balance_one(rq, prev); @@ -2446,7 +2446,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) * If @force_scx is true, always try to pick a SCHED_EXT task, * regardless of any higher-priority sched classes activity. */ - if (!force_scx && rq_modified_above(rq, &ext_sched_class)) + if (!force_scx && sched_class_above(rq->next_class, &ext_sched_class)) return RETRY_TASK; keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; @@ -3075,7 +3075,8 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p) scx_disable_task(p); } -static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} +static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {} + static void switched_to_scx(struct rq *rq, struct task_struct *p) {} int scx_check_setscheduler(struct task_struct *p, int policy) @@ -3336,8 +3337,6 @@ static void scx_cgroup_unlock(void) {} * their current sched_class. Call them directly from sched core instead. */ DEFINE_SCHED_CLASS(ext) = { - .queue_mask = 1, - .enqueue_task = enqueue_task_scx, .dequeue_task = dequeue_task_scx, .yield_task = yield_task_scx, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d588eb871657..76f5e4b78b30 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8736,7 +8736,7 @@ preempt_sync(struct rq *rq, int wake_flags, /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) +static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_flags) { enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK; struct task_struct *donor = rq->donor; @@ -8744,6 +8744,12 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int struct cfs_rq *cfs_rq = task_cfs_rq(donor); int cse_is_idle, pse_is_idle; + /* + * XXX Getting preempted by higher class, try and find idle CPU? 
+ */ + if (p->sched_class != &fair_sched_class) + return; + if (unlikely(se == pse)) return; @@ -12911,7 +12917,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) t0 = sched_clock_cpu(this_cpu); __sched_balance_update_blocked_averages(this_rq); - rq_modified_clear(this_rq); + this_rq->next_class = &fair_sched_class; raw_spin_rq_unlock(this_rq); for_each_domain(this_cpu, sd) { @@ -12978,7 +12984,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) pulled_task = 1; /* If a higher prio class was modified, restart the pick */ - if (rq_modified_above(this_rq, &fair_sched_class)) + if (sched_class_above(this_rq->next_class, &fair_sched_class)) pulled_task = -1; out: @@ -13882,15 +13888,12 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task * All the scheduling class methods: */ DEFINE_SCHED_CLASS(fair) = { - - .queue_mask = 2, - .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, .yield_to_task = yield_to_task_fair, - .wakeup_preempt = check_preempt_wakeup_fair, + .wakeup_preempt = wakeup_preempt_fair, .pick_task = pick_task_fair, .pick_next_task = pick_next_task_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c174afe1dd17..65eb8f8c1a5d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -536,9 +536,6 @@ static void update_curr_idle(struct rq *rq) * Simple, special scheduling class for the per-CPU idle tasks: */ DEFINE_SCHED_CLASS(idle) = { - - .queue_mask = 0, - /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f1867fe8e5c5..0a9b2cd6da72 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1615,6 +1615,12 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *donor = rq->donor; + /* + * XXX If we're preempted by DL, queue a push? + */ + if (p->sched_class != &rt_sched_class) + return; + if (p->prio < donor->prio) { resched_curr(rq); return; @@ -2568,9 +2574,6 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu) #endif /* CONFIG_SCHED_CORE */ DEFINE_SCHED_CLASS(rt) = { - - .queue_mask = 4, - .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ab1bfa05e894..3ceaa9dc9a9e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1118,8 +1118,6 @@ struct rq { /* runqueue lock: */ raw_spinlock_t __lock; - /* Per class runqueue modification mask; bits in class order. */ - unsigned int queue_mask; unsigned int nr_running; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -1179,6 +1177,7 @@ struct rq { struct sched_dl_entity *dl_server; struct task_struct *idle; struct task_struct *stop; + const struct sched_class *next_class; unsigned long next_balance; struct mm_struct *prev_mm; @@ -2426,15 +2425,6 @@ struct sched_class { #ifdef CONFIG_UCLAMP_TASK int uclamp_enabled; #endif - /* - * idle: 0 - * ext: 1 - * fair: 2 - * rt: 4 - * dl: 8 - * stop: 16 - */ - unsigned int queue_mask; /* * move_queued_task/activate_task/enqueue_task: rq->lock @@ -2593,20 +2583,6 @@ struct sched_class { #endif }; -/* - * Does not nest; only used around sched_class::pick_task() rq-lock-breaks. 
- */ -static inline void rq_modified_clear(struct rq *rq) -{ - rq->queue_mask = 0; -} - -static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class) -{ - unsigned int mask = class->queue_mask; - return rq->queue_mask & ~((mask << 1) - 1); -} - static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { WARN_ON_ONCE(rq->donor != prev); @@ -3899,6 +3875,7 @@ void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_s deactivate_task(src_rq, task, 0); set_task_cpu(task, dst_rq->cpu); activate_task(dst_rq, task, 0); + wakeup_preempt(dst_rq, task, 0); } static inline diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 4f9192be4b5b..f95798baddeb 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -97,9 +97,6 @@ static void update_curr_stop(struct rq *rq) * Simple, special scheduling class for the per-CPU stop tasks: */ DEFINE_SCHED_CLASS(stop) = { - - .queue_mask = 16, - .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, .yield_task = yield_task_stop, -- cgit v1.2.3 From 1862d8e264def8425d682f1177e22f9fe7d947ea Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 17 Dec 2025 11:24:11 +0100 Subject: sched: Fix faulty assertion in sched_change_end() Commit 47efe2ddccb1f ("sched/core: Add assertions to QUEUE_CLASS") added an assert to sched_change_end() verifying that a class demotion would result in a reschedule. As it turns out; rt_mutex_setprio() does not force a resched on class demontion. Furthermore, this is only relevant to running tasks. Change the warning into a reschedule and make sure to only do so for running tasks. Fixes: 47efe2ddccb1f ("sched/core: Add assertions to QUEUE_CLASS") Reported-by: Naresh Kamboju Tested-by: Linux Kernel Functional Testing Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251216141725.GW3707837@noisy.programming.kicks-ass.net --- kernel/sched/core.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7d0a862a8c75..5b17d8e3cb55 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10833,23 +10833,24 @@ void sched_change_end(struct sched_change_ctx *ctx) if (p->sched_class->switched_to) p->sched_class->switched_to(rq, p); - /* - * If this was a class promotion; let the old class know it - * got preempted. Note that none of the switch*_from() methods - * know the new class and none of the switch*_to() methods - * know the old class. - */ - if (ctx->running && sched_class_above(p->sched_class, ctx->class)) { - rq->next_class->wakeup_preempt(rq, p, 0); - rq->next_class = p->sched_class; + if (ctx->running) { + /* + * If this was a class promotion; let the old class + * know it got preempted. Note that none of the + * switch*_from() methods know the new class and none + * of the switch*_to() methods know the old class. + */ + if (sched_class_above(p->sched_class, ctx->class)) { + rq->next_class->wakeup_preempt(rq, p, 0); + rq->next_class = p->sched_class; + } + /* + * If this was a degradation in class; make sure to + * reschedule. + */ + if (sched_class_above(ctx->class, p->sched_class)) + resched_curr(rq); } - - /* - * If this was a degradation in class someone should have set - * need_resched by now. 
- */ - WARN_ON_ONCE(sched_class_above(ctx->class, p->sched_class) && - !test_tsk_need_resched(p)); } else { p->sched_class->prio_changed(rq, p, ctx->prio); } -- cgit v1.2.3 From 6ab7973f254071faf20fe5fcc502a3fe9ca14a47 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 19 Dec 2025 09:04:45 +0100 Subject: sched/fair: Fix sched_avg fold After the robot reported a regression wrt commit: 089d84203ad4 ("sched/fair: Fold the sched_avg update"), Shrikanth noted that two spots missed a factor se_weight(). Fixes: 089d84203ad4 ("sched/fair: Fold the sched_avg update") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202512181208.753b9f6e-lkp@intel.com Debugged-by: Shrikanth Hegde Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251218102020.GO3707891@noisy.programming.kicks-ass.net --- kernel/sched/fair.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 76f5e4b78b30..7377f9117501 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3775,13 +3775,15 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - __update_sa(&cfs_rq->avg, load, se->avg.load_avg, se->avg.load_sum); + __update_sa(&cfs_rq->avg, load, se->avg.load_avg, + se_weight(se) * se->avg.load_sum); } static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - __update_sa(&cfs_rq->avg, load, -se->avg.load_avg, -se->avg.load_sum); + __update_sa(&cfs_rq->avg, load, -se->avg.load_avg, + se_weight(se) * -se->avg.load_sum); } static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); -- cgit v1.2.3 From ff1de90dd7a69ef43586683535ad87ab899a1214 Mon Sep 17 00:00:00 2001 From: "Yury Norov (NVIDIA)" Date: Sat, 6 Dec 2025 23:05:42 -0500 Subject: sched/fair: Drop useless cpumask_empty() in find_energy_efficient_cpu() cpumask_empty() call is O(N) and useless because the previous cpumask_and() returns false for empty 'cpus'. Drop it. Signed-off-by: Yury Norov (NVIDIA) Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Madadi Vineeth Reddy Reviewed-by: K Prateek Nayak Reviewed-by: Vincent Guittot Link: https://patch.msgid.link/20251207040543.407695-1-yury.norov@gmail.com --- kernel/sched/fair.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7377f9117501..64275d75a964 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8359,9 +8359,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) int max_spare_cap_cpu = -1; int fits, max_fits = -1; - cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); - - if (cpumask_empty(cpus)) + if (!cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask)) continue; /* Account external pressure for the energy estimation */ -- cgit v1.2.3 From 0ab25ea2a3b3a973fb914d0e47dc9c3c26049e8b Mon Sep 17 00:00:00 2001 From: "Yury Norov (NVIDIA)" Date: Sat, 6 Dec 2025 22:30:36 -0500 Subject: sched/fair: Simplify task_numa_find_cpu() Use for_each_cpu_and() and drop some housekeeping code. 
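As an illustrative aside (not part of the patch), the iterator pattern being adopted looks roughly like this; the function and mask names are invented for the example:

  #include <linux/cpumask.h>
  #include <linux/printk.h>

  /* Walk only the CPUs present in both masks, instead of iterating one
   * mask and testing membership in the other by hand. */
  static void show_candidate_cpus(const struct cpumask *node_mask,
                                  const struct cpumask *allowed_mask)
  {
      int cpu;

      for_each_cpu_and(cpu, node_mask, allowed_mask)
          pr_info("candidate CPU %d\n", cpu);
  }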
Signed-off-by: Yury Norov (NVIDIA) Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Reviewed-by: Phil Auld Link: https://patch.msgid.link/20251207033037.399608-1-yury.norov@gmail.com --- kernel/sched/fair.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 64275d75a964..842a0f20414f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2494,11 +2494,8 @@ static void task_numa_find_cpu(struct task_numa_env *env, maymove = !load_too_imbalanced(src_load, dst_load, env); } - for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { - /* Skip this CPU if the source task cannot migrate */ - if (!cpumask_test_cpu(cpu, env->p->cpus_ptr)) - continue; - + /* Skip CPUs if the source task cannot migrate */ + for_each_cpu_and(cpu, cpumask_of_node(env->dst_nid), env->p->cpus_ptr) { env->dst_cpu = cpu; if (task_numa_compare(env, taskimp, groupimp, maymove)) break; -- cgit v1.2.3 From 55b39b0cf183b9c682717a55a2fba06da69bba6b Mon Sep 17 00:00:00 2001 From: "Yury Norov (NVIDIA)" Date: Sat, 6 Dec 2025 22:42:47 -0500 Subject: sched/fair: Use cpumask_weight_and() in sched_balance_find_dst_group() In the group_has_spare case, the function creates a temporary cpumask to just calculate weight of (p->cpus_ptr & sched_group_span(local)). We've got a dedicated helper for it. Signed-off-by: Yury Norov (NVIDIA) Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: K Prateek Nayak Reviewed-by: Fernand Sieber Link: https://patch.msgid.link/20251207034247.402926-1-yury.norov@gmail.com --- kernel/sched/fair.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 842a0f20414f..ebee20f75fa0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10974,10 +10974,9 @@ sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int * take care of it. */ if (p->nr_cpus_allowed != NR_CPUS) { - struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); - - cpumask_and(cpus, sched_group_span(local), p->cpus_ptr); - imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr); + unsigned int w = cpumask_weight_and(p->cpus_ptr, + sched_group_span(local)); + imb_numa_nr = min(w, sd->imb_numa_nr); } imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus); -- cgit v1.2.3 From 89951fc1f8201df27366ac1eed1ddc9ee0f47729 Mon Sep 17 00:00:00 2001 From: Blake Jones Date: Mon, 1 Dec 2025 18:37:43 -0800 Subject: sched: Reorder some fields in struct rq This colocates some hot fields in "struct rq" to be on the same cache line as others that are often accessed at the same time or in similar ways. Using data from a Google-internal fleet-scale profiler, I found three distinct groups of hot fields in struct rq: - (1) The runqueue lock: __lock. - (2) Those accessed from hot code in pick_next_task_fair(): nr_running, nr_numa_running, nr_preferred_running, ttwu_pending, cpu_capacity, curr, idle. - (3) Those accessed from some other hot codepaths, e.g. update_curr(), update_rq_clock(), and scheduler_tick(): clock_task, clock_pelt, clock, lost_idle_time, clock_update_flags, clock_pelt_idle, clock_idle. 
The cycles spent on accessing these different groups of fields broke down roughly as follows: - 50% on group (1) (the runqueue lock, always read-write) - 39% on group (2) (load:store ratio around 38:1) - 8% on group (3) (load:store ratio around 5:1) - 3% on all the other fields Most of the fields in group (3) are already in a cache line grouping; this patch just adds "clock" and "clock_update_flags" to that group. The fields in group (2) are scattered across several cache lines; the main effect of this patch is to group them together, on a single line at the beginning of the structure. A few other less performance-critical fields (nr_switches, numa_migrate_on, has_blocked_load, nohz_csd, last_blocked_load_update_tick) were also reordered to reduce holes in the data structure. Since the runqueue lock is acquired from so many different contexts, and is basically always accessed using an atomic operation, putting it on either of the cache lines for groups (2) or (3) would slow down accesses to those fields dramatically, since those groups are read-mostly accesses. To test this, I wrote a focused load test that would put load on the pick_next_task_fair() path. A parent process would fork many child processes, and each child would nanosleep() for 1 msec many times in a loop. The load test was monitored with "perf", and I looked at the amount of cycles that were spent with sched_balance_rq() on the stack. The test was reliably spending ~5% of all of its cycles there. I ran it 100 times on a pair of 2-socket Intel Haswell machines (72 vCPUs per machine) - one running the tip of sched/core, the other running this change - using 360 child processes and 8192 1-msec sleeps per child. The mean cycle count dropped from 5.14B to 4.91B, or a *4.6% decrease* in relevant scheduler cycles. Given that this change reduces cache misses in a very hot kernel codepath, there's likely to be additional application performance improvement due to reduced cache conflicts from kernel data structures. On a Power11 system with 128-byte cache lines, my test showed a ~5% decrease in relevant scheduler cycles, along with a slight increase in user time - both positive indicators. This data comes from https://lore.kernel.org/lkml/affdc6b1-9980-44d1-89db-d90730c1e384@linux.ibm.com/ This is the case even though the additional "____cacheline_aligned" that puts the runqueue lock on the next cache line adds an additional 64 bytes of padding on those machines. This patch does not change the size of "struct rq" on machines with 64-byte cache lines. I also ran "hackbench" to try to test this change, but it didn't show conclusive results. Looking at a CPU cycle profile of the hackbench run, it was spending 95% of its cycles inside __alloc_skb(), __kfree_skb(), or kmem_cache_free() - almost all of which was spent updating memcg counters or contending on the list_lock in kmem_cache_node. And it spent less than 0.5% of its cycles inside either schedule() or try_to_wake_up(). So it's not surprising that it didn't show useful results here. The "__no_randomize_layout" was added to reflect the fact that performance of code that references this data structure is unusually sensitive to placement of its members. 
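As an illustrative aside (not part of the patch), the layout technique being applied is roughly the following; the structure and field names are invented for the example, only the annotations (____cacheline_aligned, __no_randomize_layout) are the real kernel ones:

  #include <linux/cache.h>
  #include <linux/spinlock.h>
  #include <linux/types.h>

  struct example_rq_like {
      /* Read-mostly fields that the hot path loads together share the
       * first cache line. */
      unsigned int   nr_running;
      unsigned int   ttwu_pending;
      unsigned long  cpu_capacity;

      /* The heavily written lock starts a new cache line, so its
       * ping-ponging does not invalidate the read-mostly line above. */
      raw_spinlock_t lock ____cacheline_aligned;

      /* Fields that are read and written together (e.g. the clocks) get
       * another dedicated line. */
      u64            clock_task ____cacheline_aligned;
      u64            clock;
  } __no_randomize_layout;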
Signed-off-by: Blake Jones Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Madadi Vineeth Reddy Reviewed-by: Josh Don Tested-by: Madadi Vineeth Reddy Link: https://patch.msgid.link/20251202023743.1524247-1-blakejones@google.com --- kernel/sched/sched.h | 75 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3ceaa9dc9a9e..58c9d244f12b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1115,26 +1115,50 @@ DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); * acquire operations must be ordered by ascending &runqueue. */ struct rq { - /* runqueue lock: */ - raw_spinlock_t __lock; - + /* + * The following members are loaded together, without holding the + * rq->lock, in an extremely hot loop in update_sg_lb_stats() + * (called from pick_next_task()). To reduce cache pollution from + * this operation, they are placed together on this dedicated cache + * line. Even though some of them are frequently modified, they are + * loaded much more frequently than they are stored. + */ unsigned int nr_running; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; - unsigned int numa_migrate_on; #endif + unsigned int ttwu_pending; + unsigned long cpu_capacity; +#ifdef CONFIG_SCHED_PROXY_EXEC + struct task_struct __rcu *donor; /* Scheduling context */ + struct task_struct __rcu *curr; /* Execution context */ +#else + union { + struct task_struct __rcu *donor; /* Scheduler context */ + struct task_struct __rcu *curr; /* Execution context */ + }; +#endif + struct task_struct *idle; + /* padding left here deliberately */ + + /* + * The next cacheline holds the (hot) runqueue lock, as well as + * some other less performance-critical fields. + */ + u64 nr_switches ____cacheline_aligned; + + /* runqueue lock: */ + raw_spinlock_t __lock; + #ifdef CONFIG_NO_HZ_COMMON - unsigned long last_blocked_load_update_tick; - unsigned int has_blocked_load; - call_single_data_t nohz_csd; unsigned int nohz_tick_stopped; atomic_t nohz_flags; + unsigned int has_blocked_load; + unsigned long last_blocked_load_update_tick; + call_single_data_t nohz_csd; #endif /* CONFIG_NO_HZ_COMMON */ - unsigned int ttwu_pending; - u64 nr_switches; - #ifdef CONFIG_UCLAMP_TASK /* Utilization clamp values based on CPU's RUNNABLE tasks */ struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned; @@ -1157,6 +1181,9 @@ struct rq { struct list_head *tmp_alone_branch; #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_NUMA_BALANCING + unsigned int numa_migrate_on; +#endif /* * This is part of a global counter where only the total sum * over all CPUs matters. 
A task can increase this counter on @@ -1165,37 +1192,29 @@ struct rq { */ unsigned long nr_uninterruptible; -#ifdef CONFIG_SCHED_PROXY_EXEC - struct task_struct __rcu *donor; /* Scheduling context */ - struct task_struct __rcu *curr; /* Execution context */ -#else - union { - struct task_struct __rcu *donor; /* Scheduler context */ - struct task_struct __rcu *curr; /* Execution context */ - }; -#endif struct sched_dl_entity *dl_server; - struct task_struct *idle; struct task_struct *stop; const struct sched_class *next_class; unsigned long next_balance; struct mm_struct *prev_mm; - unsigned int clock_update_flags; - u64 clock; - /* Ensure that all clocks are in the same cache line */ + /* + * The following fields of clock data are frequently referenced + * and updated together, and should go on their own cache line. + */ u64 clock_task ____cacheline_aligned; u64 clock_pelt; + u64 clock; unsigned long lost_idle_time; + unsigned int clock_update_flags; u64 clock_pelt_idle; u64 clock_idle; + #ifndef CONFIG_64BIT u64 clock_pelt_idle_copy; u64 clock_idle_copy; #endif - atomic_t nr_iowait; - u64 last_seen_need_resched_ns; int ticks_without_resched; @@ -1206,8 +1225,6 @@ struct rq { struct root_domain *rd; struct sched_domain __rcu *sd; - unsigned long cpu_capacity; - struct balance_callback *balance_callback; unsigned char nohz_idle_balance; @@ -1317,7 +1334,9 @@ struct rq { call_single_data_t cfsb_csd; struct list_head cfsb_csd_list; #endif -}; + + atomic_t nr_iowait; +} __no_randomize_layout; #ifdef CONFIG_FAIR_GROUP_SCHED -- cgit v1.2.3 From 7dadeaa6e851e7d67733f3e24fc53ee107781d0f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 18 Dec 2025 15:25:10 +0100 Subject: sched: Further restrict the preemption modes The introduction of PREEMPT_LAZY was for multiple reasons: - PREEMPT_RT suffered from over-scheduling, hurting performance compared to !PREEMPT_RT. - the introduction of (more) features that rely on preemption; like folio_zero_user() which can do large memset() without preemption checks. (Xen already had a horrible hack to deal with long running hypercalls) - the endless and uncontrolled sprinkling of cond_resched() -- mostly cargo cult or in response to poor to replicate workloads. By moving to a model that is fundamentally preemptable these things become managable and avoid needing to introduce more horrible hacks. Since this is a requirement; limit PREEMPT_NONE to architectures that do not support preemption at all. Further limit PREEMPT_VOLUNTARY to those architectures that do not yet have PREEMPT_LAZY support (with the eventual goal to make this the empty set and completely remove voluntary preemption and cond_resched() -- notably VOLUNTARY is already limited to !ARCH_NO_PREEMPT.) This leaves up-to-date architectures (arm64, loongarch, powerpc, riscv, s390, x86) with only two preemption models: full and lazy. While Lazy has been the recommended setting for a while, not all distributions have managed to make the switch yet. Force things along. Keep the patch minimal in case of hard to address regressions that might pop up. 
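As an illustrative aside (not from the patch), the effect is visible from userspace through the PREEMPT_DYNAMIC debugfs knob; the path below is the usual location and is an assumption of this sketch:

  #include <stdio.h>

  int main(void)
  {
      char buf[128];
      FILE *f = fopen("/sys/kernel/debug/sched/preempt", "r");

      if (!f)
          return 1;
      /* Lists the selectable models with the current one in parentheses,
       * e.g. "full (lazy)" on a lazy-capable architecture after this
       * change. */
      if (fgets(buf, sizeof(buf), f))
          fputs(buf, stdout);
      fclose(f);
      return 0;
  }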
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://patch.msgid.link/20251219101502.GB1132199@noisy.programming.kicks-ass.net --- kernel/Kconfig.preempt | 3 +++ kernel/sched/core.c | 2 +- kernel/sched/debug.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index da326800c1c9..88c594c6d7fc 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -16,11 +16,13 @@ config ARCH_HAS_PREEMPT_LAZY choice prompt "Preemption Model" + default PREEMPT_LAZY if ARCH_HAS_PREEMPT_LAZY default PREEMPT_NONE config PREEMPT_NONE bool "No Forced Preemption (Server)" depends on !PREEMPT_RT + depends on ARCH_NO_PREEMPT select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC help This is the traditional Linux preemption model, geared towards @@ -35,6 +37,7 @@ config PREEMPT_NONE config PREEMPT_VOLUNTARY bool "Voluntary Kernel Preemption (Desktop)" + depends on !ARCH_HAS_PREEMPT_LAZY depends on !ARCH_NO_PREEMPT depends on !PREEMPT_RT select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5b17d8e3cb55..fa720753e7d5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7553,7 +7553,7 @@ int preempt_dynamic_mode = preempt_dynamic_undefined; int sched_dynamic_mode(const char *str) { -# ifndef CONFIG_PREEMPT_RT +# if !(defined(CONFIG_PREEMPT_RT) || defined(CONFIG_ARCH_HAS_PREEMPT_LAZY)) if (!strcmp(str, "none")) return preempt_dynamic_none; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 41caa22e0680..5f9b77195159 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -243,7 +243,7 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, static int sched_dynamic_show(struct seq_file *m, void *v) { - int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2; + int i = (IS_ENABLED(CONFIG_PREEMPT_RT) || IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY)) * 2; int j; /* Count entries in NULL terminated preempt_modes */ -- cgit v1.2.3 From 6c125b85f3c87b4bf7dba91af6f27d9600b9dba0 Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Fri, 5 Dec 2025 14:16:16 +0100 Subject: sched: Export hidden tracepoints to modules The tracepoints sched_entry, sched_exit and sched_set_need_resched are not exported to tracefs as trace events, this allows only kernel code to access them. Helper modules like [1] can be used to still have the tracepoints available to ftrace for debugging purposes, but they do rely on the tracepoints being exported. Export the 3 not exported tracepoints. Note that sched_set_state is already exported as the macro is called from modules. 
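As an illustrative aside (not part of the patch), a helper module along the lines of the sched_tp project referenced below attaches to such an exported tracepoint roughly as sketched here; the probe arguments are an assumption for illustration only and must match the prototype declared in <trace/events/sched.h>:

  #include <linux/module.h>
  #include <trace/events/sched.h>

  /* register_trace_sched_entry_tp()/unregister_trace_sched_entry_tp()
   * are generated from the tracepoint declaration; the (bool, unsigned
   * long) probe signature is assumed here. */
  static void probe_sched_entry(void *data, bool preempt, unsigned long ip)
  {
      /* forward to ftrace, collect statistics, ... */
  }

  static int __init sched_tp_example_init(void)
  {
      return register_trace_sched_entry_tp(probe_sched_entry, NULL);
  }

  static void __exit sched_tp_example_exit(void)
  {
      unregister_trace_sched_entry_tp(probe_sched_entry, NULL);
  }

  module_init(sched_tp_example_init);
  module_exit(sched_tp_example_exit);
  MODULE_LICENSE("GPL");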
[1] - https://github.com/qais-yousef/sched_tp.git Fixes: adcc3bfa8806 ("sched: Adapt sched tracepoints for RV task model") Signed-off-by: Gabriele Monaco Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Link: https://patch.msgid.link/20251205131621.135513-9-gmonaco@redhat.com --- kernel/sched/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fa720753e7d5..b033f9751dfd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -119,6 +119,9 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_entry_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_exit_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_set_need_resched_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); DEFINE_PER_CPU(struct rnd_state, sched_rnd_state); -- cgit v1.2.3 From 8d737320166bd145af70a3133a9964b00ca81cba Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Mon, 12 Jan 2026 15:04:13 +0100 Subject: sched: Fix build for modules using set_tsk_need_resched() Commit adcc3bfa8806 ("sched: Adapt sched tracepoints for RV task model") added a tracepoint to the need_resched action that can be triggered also by set_tsk_need_resched. This function was previously accessible from out-of-tree modules but it's no longer available because the __trace_set_need_resched() symbol is not exported (together with the tracepoint itself, which was exported in a separate patch) and building such modules fails. Export __trace_set_need_resched to modules to fix those build issues. Fixes: adcc3bfa8806 ("sched: Adapt sched tracepoints for RV task model") Signed-off-by: Gabriele Monaco Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Link: https://patch.msgid.link/20260112140413.362202-1-gmonaco@redhat.com --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b033f9751dfd..3cca012d1259 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1139,6 +1139,7 @@ void __trace_set_need_resched(struct task_struct *curr, int tif) { trace_sched_set_need_resched_tp(curr, smp_processor_id(), tif); } +EXPORT_SYMBOL_GPL(__trace_set_need_resched); void resched_curr(struct rq *rq) { -- cgit v1.2.3 From 553255cc857c08d72658b57d01c04f76cde9a83a Mon Sep 17 00:00:00 2001 From: Zhan Xusheng Date: Wed, 14 Jan 2026 17:00:35 +0800 Subject: sched/fair: Fix math notation errors in avg_vruntime comment The avg_vruntime comment contains a couple of mathematical notation issues: - The summation over w_i * (V - v_i) is written in an ambiguous form - The delta term refers to v instead of v0, which is inconsistent with the code and preceding explanation Fix these to make the comment mathematically correct and consistent with the implementation. 
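As an illustrative aside (not part of the patch), the corrected identity written out in LaTeX, with v_0 the zero_vruntime reference the comment refers to:

  \sum_i w_i (V - v_i) = 0
  \;\Rightarrow\; \sum_i (w_i V - w_i v_i) = 0
  \;\Rightarrow\; V = \frac{\sum_i w_i v_i}{\sum_i w_i}
                    = v_0 + \frac{\sum_i w_i (v_i - v_0)}{\sum_i w_i}

The last form is the one the code computes, which is why the deltas in the comment must be (v_i - v0) rather than (v_i - v).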
Signed-off-by: Zhan Xusheng Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260114090035.19033-1-zhanxusheng@xiaomi.com --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ebee20f75fa0..af120e88720f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -613,7 +613,7 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * * \Sum lag_i = 0 * \Sum w_i * (V - v_i) = 0 - * \Sum w_i * V - w_i * v_i = 0 + * \Sum (w_i * V - w_i * v_i) = 0 * * From which we can solve an expression for V in v_i (which we have in * se->vruntime): @@ -648,7 +648,7 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * \Sum w_i := cfs_rq->sum_weight * * Since zero_vruntime closely tracks the per-task service, these - * deltas: (v_i - v), will be in the order of the maximal (virtual) lag + * deltas: (v_i - v0), will be in the order of the maximal (virtual) lag * induced in the system due to quantisation. * * Also, we use scale_load_down() to reduce the size. -- cgit v1.2.3 From 6b67c8a72e56041f91f70ae5995bdb769761869a Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Thu, 15 Jan 2026 13:05:22 +0530 Subject: sched/fair: Move checking for nohz cpus after time check Current code does. - Read nohz.nr_cpus - Check if the time has passed to do NOHZ idle balance Instead do this. - Check if the time has passed to do NOHZ idle balance - Read nohz.nr_cpus This will skip the read most of the time in normal system usage. i.e when there are nohz.nr_cpus (system is not 100% busy). Note that when there are no idle CPUs(100% busy), even if the flag gets set to NOHZ_STATS_KICK | NOHZ_NEXT_KICK, find_new_ilb will fail and there will be no NOHZ idle balance. In such cases there will be a very narrow window where, kick_ilb will be called un-necessarily. However current functionality is still retained. Note: This patch doesn't solve any cacheline overheads. No improvement in performance apart from saving a few cycles of reading nohz.nr_cpus Reviewed-and-tested-by: K Prateek Nayak Signed-off-by: Shrikanth Hegde Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://patch.msgid.link/20260115073524.376643-2-sshegde@linux.ibm.com --- kernel/sched/fair.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index af120e88720f..9afe0c69a3c1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12441,20 +12441,29 @@ static void nohz_balancer_kick(struct rq *rq) */ nohz_balance_exit_idle(rq); - /* - * None are in tickless mode and hence no need for NOHZ idle load - * balancing: - */ - if (likely(!atomic_read(&nohz.nr_cpus))) - return; - if (READ_ONCE(nohz.has_blocked_load) && time_after(now, READ_ONCE(nohz.next_blocked))) flags = NOHZ_STATS_KICK; + /* + * Most of the time system is not 100% busy. i.e nohz.nr_cpus > 0 + * Skip the read if time is not due. + * + * If none are in tickless mode, there maybe a narrow window + * (28 jiffies, HZ=1000) where flags maybe set and kick_ilb called. + * But idle load balancing is not done as find_new_ilb fails. + * That's very rare. So read nohz.nr_cpus only if time is due. 
+ */ if (time_before(now, nohz.next_balance)) goto out; + /* + * None are in tickless mode and hence no need for NOHZ idle load + * balancing: + */ + if (likely(!atomic_read(&nohz.nr_cpus))) + return; + if (rq->nr_running >= 2) { flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto out; -- cgit v1.2.3 From 94e70734b4d034b9df795bd1ad3452ea96e742ca Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Thu, 15 Jan 2026 13:05:23 +0530 Subject: sched/fair: Change likelyhood of nohz.nr_cpus These days most of the system have multi cores. The likelyhood of at least one or more CPUs in nohz (idle state) is higher. Give accurate hint to the branch predictor. Reviewed-and-tested-by: K Prateek Nayak Signed-off-by: Shrikanth Hegde Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://patch.msgid.link/20260115073524.376643-3-sshegde@linux.ibm.com --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9afe0c69a3c1..4ae06ce4a916 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12459,9 +12459,9 @@ static void nohz_balancer_kick(struct rq *rq) /* * None are in tickless mode and hence no need for NOHZ idle load - * balancing: + * balancing */ - if (likely(!atomic_read(&nohz.nr_cpus))) + if (unlikely(!atomic_read(&nohz.nr_cpus))) return; if (rq->nr_running >= 2) { -- cgit v1.2.3 From 5d86d542f68fda7ef6d543ac631b741db734101a Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Thu, 15 Jan 2026 13:05:24 +0530 Subject: sched/fair: Remove nohz.nr_cpus and use weight of cpumask instead nohz.nr_cpus was observed as contended cacheline when running enterprise workload on large systems. Fundamental scalability challenge with nohz.idle_cpus_mask and nohz.nr_cpus is the following: (1) nohz_balancer_kick() observes (reads) nohz.nr_cpus (or nohz.idle_cpu_mask) and nohz.has_blocked to see whether there's any nohz balancing work to do, in every scheduler tick. (2) nohz_balance_enter_idle() and nohz_balance_exit_idle() (through nohz_balancer_kick() via sched_tick()) modify (write) nohz.nr_cpus (and/or nohz.idle_cpu_mask) and nohz.has_blocked. The characteristic frequencies are the following: (1) nohz_balancer_kick() happens at scheduler (busy)tick frequency on CPU(which has not gone idle). This is a relatively constant frequency in the ~1 kHz range or lower. (2) happens at idle enter/exit frequency on every CPU that goes to idle. This is workload dependent, but can easily be hundreds of kHz for IO-bound loads and high CPU counts. Ie. can be orders of magnitude higher than (1), in which case a cachemiss at every invocation of (1) is almost inevitable. idle exit will trigger (1) on the CPU which is coming out of idle. There's two types of costs from these functions: (A) scheduler tick cost via (1): this happens on busy CPUs too, and is thus a primary scalability cost. But the rate here is constant and typically much lower than (B), hence the absolute benefit to workload scalability will be lower as well. (B) idle cost via (2): going-to-idle and coming-from-idle costs are secondary concerns, because they impact power efficiency more than they impact scalability. But in terms of absolute cost this scales up with nr_cpus as well, and a much faster rate, and thus may also approach and negatively impact system limits like memory bus/fabric bandwidth. 
Note that nohz.idle_cpus_mask and nohz.nr_cpus may appear to reside in the same cacheline, however under CONFIG_CPUMASK_OFFSTACK=y the backing storage for nohz.idle_cpus_mask will be elsewhere. With CPUMASK_OFFSTACK=n, the nohz.idle_cpus_mask and rest of nohz fields are in different cachelines under typical NR_CPUS=512/2048. This implies two separate cachelines being dirtied upon idle entry / exit. nohz.nr_cpus can be derived from the mask itself. Its usage doesn't warrant a functionally correct value. This means one less cacheline being dirtied in idle entry/exit path which helps to save some bus bandwidth w.r.t to those nohz functions(approx 50%). This in turn helps to improve enterprise workload throughput. On system with 480 CPUs, running "hackbench 40 process 10000 loops" (Avg of 3 runs) baseline: 0.81% hackbench [k] nohz_balance_exit_idle 0.21% hackbench [k] nohz_balancer_kick 0.09% swapper [k] nohz_run_idle_balance With patch: 0.35% hackbench [k] nohz_balance_exit_idle 0.09% hackbench [k] nohz_balancer_kick 0.07% swapper [k] nohz_run_idle_balance [Ingo Molnar: scalability analysis changlog] Reviewed-and-tested-by: K Prateek Nayak Signed-off-by: Shrikanth Hegde Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Vincent Guittot Link: https://patch.msgid.link/20260115073524.376643-4-sshegde@linux.ibm.com --- kernel/sched/fair.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4ae06ce4a916..04993c763a06 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7138,7 +7138,6 @@ static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask); static struct { cpumask_var_t idle_cpus_mask; - atomic_t nr_cpus; int has_blocked_load; /* Idle CPUS has blocked load */ int needs_update; /* Newly idle CPUs need their next_balance collated */ unsigned long next_balance; /* in jiffy units */ @@ -12461,7 +12460,7 @@ static void nohz_balancer_kick(struct rq *rq) * None are in tickless mode and hence no need for NOHZ idle load * balancing */ - if (unlikely(!atomic_read(&nohz.nr_cpus))) + if (unlikely(cpumask_empty(nohz.idle_cpus_mask))) return; if (rq->nr_running >= 2) { @@ -12574,7 +12573,6 @@ void nohz_balance_exit_idle(struct rq *rq) rq->nohz_tick_stopped = 0; cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); set_cpu_sd_state_busy(rq->cpu); } @@ -12632,7 +12630,6 @@ void nohz_balance_enter_idle(int cpu) rq->nohz_tick_stopped = 1; cpumask_set_cpu(cpu, nohz.idle_cpus_mask); - atomic_inc(&nohz.nr_cpus); /* * Ensures that if nohz_idle_balance() fails to observe our -- cgit v1.2.3 From 4fe82cf3024a4bdd2571d584efc25598533d5c96 Mon Sep 17 00:00:00 2001 From: Fushuai Wang Date: Sat, 17 Jan 2026 22:56:14 +0800 Subject: sched/debug: Convert copy_from_user() + kstrtouint() to kstrtouint_from_user() Using kstrtouint_from_user() instead of copy_from_user() + kstrtouint() makes the code simpler and less error-prone. 
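As an illustrative aside (not part of the patch), the general shape of a write handler using this helper; the handler name and the bound check are invented for the example:

  #include <linux/fs.h>
  #include <linux/kstrtox.h>

  static ssize_t example_write(struct file *filp, const char __user *ubuf,
                               size_t cnt, loff_t *ppos)
  {
      unsigned int val;
      int ret;

      /* Copies from user space and parses in one step; no local buffer
       * or manual NUL termination needed. */
      ret = kstrtouint_from_user(ubuf, cnt, 10, &val);
      if (ret)
          return ret;

      if (val > 100)      /* illustrative upper bound */
          return -EINVAL;

      *ppos += cnt;
      return cnt;
  }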
Suggested-by: Yury Norov Signed-off-by: Fushuai Wang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Yury Norov Link: https://patch.msgid.link/20260117145615.53455-2-fushuai.wang@linux.dev --- kernel/sched/debug.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 5f9b77195159..929fdf09e8e9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -172,18 +172,12 @@ static const struct file_operations sched_feat_fops = { static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - char buf[16]; unsigned int scaling; + int ret; - if (cnt > 15) - cnt = 15; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - buf[cnt] = '\0'; - - if (kstrtouint(buf, 10, &scaling)) - return -EINVAL; + ret = kstrtouint_from_user(ubuf, cnt, 10, &scaling); + if (ret) + return ret; if (scaling >= SCHED_TUNABLESCALING_END) return -EINVAL; -- cgit v1.2.3 From d7a5da7a0f7fa7ff081140c4f6f971db98882703 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 15 Dec 2025 17:52:04 +0100 Subject: rseq: Add fields and constants for time slice extension Aside of a Kconfig knob add the following items: - Two flag bits for the rseq user space ABI, which allow user space to query the availability and enablement without a syscall. - A new member to the user space ABI struct rseq, which is going to be used to communicate request and grant between kernel and user space. - A rseq state struct to hold the kernel state of this - Documentation of the new mechanism Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251215155708.669472597@linutronix.de --- Documentation/userspace-api/index.rst | 1 + Documentation/userspace-api/rseq.rst | 135 ++++++++++++++++++++++++++++++++++ include/linux/rseq_types.h | 28 ++++++- include/uapi/linux/rseq.h | 38 ++++++++++ init/Kconfig | 12 +++ kernel/rseq.c | 7 ++ 6 files changed, 220 insertions(+), 1 deletion(-) create mode 100644 Documentation/userspace-api/rseq.rst (limited to 'kernel') diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst index 8a61ac4c1bf1..fa0fe8ada68e 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst @@ -21,6 +21,7 @@ System calls ebpf/index ioctl/index mseal + rseq Security-related interfaces =========================== diff --git a/Documentation/userspace-api/rseq.rst b/Documentation/userspace-api/rseq.rst new file mode 100644 index 000000000000..e1fdb0d5ce69 --- /dev/null +++ b/Documentation/userspace-api/rseq.rst @@ -0,0 +1,135 @@ +===================== +Restartable Sequences +===================== + +Restartable Sequences allow to register a per thread userspace memory area +to be used as an ABI between kernel and userspace for three purposes: + + * userspace restartable sequences + + * quick access to read the current CPU number, node ID from userspace + + * scheduler time slice extensions + +Restartable sequences (per-cpu atomics) +--------------------------------------- + +Restartable sequences allow userspace to perform update operations on +per-cpu data without requiring heavyweight atomic operations. The actual +ABI is unfortunately only available in the code and selftests. + +Quick access to CPU number, node ID +----------------------------------- + +Allows to implement per CPU data efficiently. Documentation is in code and +selftests. 
:( + +Scheduler time slice extensions +------------------------------- + +This allows a thread to request a time slice extension when it enters a +critical section to avoid contention on a resource when the thread is +scheduled out inside of the critical section. + +The prerequisites for this functionality are: + + * Enabled in Kconfig + + * Enabled at boot time (default is enabled) + + * A rseq userspace pointer has been registered for the thread + +The thread has to enable the functionality via prctl(2):: + + prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET, + PR_RSEQ_SLICE_EXT_ENABLE, 0, 0); + +prctl() returns 0 on success or otherwise with the following error codes: + +========= ============================================================== +Errorcode Meaning +========= ============================================================== +EINVAL Functionality not available or invalid function arguments. + Note: arg4 and arg5 must be zero +ENOTSUPP Functionality was disabled on the kernel command line +ENXIO Available, but no rseq user struct registered +========= ============================================================== + +The state can be also queried via prctl(2):: + + prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_GET, 0, 0, 0); + +prctl() returns ``PR_RSEQ_SLICE_EXT_ENABLE`` when it is enabled or 0 if +disabled. Otherwise it returns with the following error codes: + +========= ============================================================== +Errorcode Meaning +========= ============================================================== +EINVAL Functionality not available or invalid function arguments. + Note: arg3 and arg4 and arg5 must be zero +========= ============================================================== + +The availability and status is also exposed via the rseq ABI struct flags +field via the ``RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE_BIT`` and the +``RSEQ_CS_FLAG_SLICE_EXT_ENABLED_BIT``. These bits are read-only for user +space and only for informational purposes. + +If the mechanism was enabled via prctl(), the thread can request a time +slice extension by setting rseq::slice_ctrl::request to 1. If the thread is +interrupted and the interrupt results in a reschedule request in the +kernel, then the kernel can grant a time slice extension and return to +userspace instead of scheduling out. The length of the extension is +determined by the ``rseq_slice_extension_nsec`` sysctl. + +The kernel indicates the grant by clearing rseq::slice_ctrl::request and +setting rseq::slice_ctrl::granted to 1. If there is a reschedule of the +thread after granting the extension, the kernel clears the granted bit to +indicate that to userspace. + +If the request bit is still set when the leaving the critical section, +userspace can clear it and continue. + +If the granted bit is set, then userspace invokes rseq_slice_yield(2) when +leaving the critical section to relinquish the CPU. The kernel enforces +this by arming a timer to prevent misbehaving userspace from abusing this +mechanism. + +If both the request bit and the granted bit are false when leaving the +critical section, then this indicates that a grant was revoked and no +further action is required by userspace. 
+ +The required code flow is as follows:: + + rseq->slice_ctrl.request = 1; + barrier(); // Prevent compiler reordering + critical_section(); + barrier(); // Prevent compiler reordering + rseq->slice_ctrl.request = 0; + if (rseq->slice_ctrl.granted) + rseq_slice_yield(); + +As all of this is strictly CPU local, there are no atomicity requirements. +Checking the granted state is racy, but that cannot be avoided at all:: + + if (rseq->slice_ctrl.granted) + -> Interrupt results in schedule and grant revocation + rseq_slice_yield(); + +So there is no point in pretending that this might be solved by an atomic +operation. + +If the thread issues a syscall other than rseq_slice_yield(2) within the +granted timeslice extension, the grant is also revoked and the CPU is +relinquished immediately when entering the kernel. This is required as +syscalls might consume arbitrary CPU time until they reach a scheduling +point when the preemption model is either NONE or VOLUNTARY and therefore +might exceed the grant by far. + +The preferred solution for user space is to use rseq_slice_yield(2) which +is side effect free. The support for arbitrary syscalls is required to +support onion layer architectured applications, where the code handling the +critical section and requesting the time slice extension has no control +over the code within the critical section. + +The kernel enforces flag consistency and terminates the thread with SIGSEGV +if it detects a violation. diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 332dc14b81c9..67e40c059b1b 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -72,13 +72,36 @@ struct rseq_ids { }; }; +/** + * union rseq_slice_state - Status information for rseq time slice extension + * @state: Compound to access the overall state + * @enabled: Time slice extension is enabled for the task + * @granted: Time slice extension was granted to the task + */ +union rseq_slice_state { + u16 state; + struct { + u8 enabled; + u8 granted; + }; +}; + +/** + * struct rseq_slice - Status information for rseq time slice extension + * @state: Time slice extension state + */ +struct rseq_slice { + union rseq_slice_state state; +}; + /** * struct rseq_data - Storage for all rseq related data * @usrptr: Pointer to the registered user space RSEQ memory * @len: Length of the RSEQ region - * @sig: Signature of critial section abort IPs + * @sig: Signature of critical section abort IPs * @event: Storage for event management * @ids: Storage for cached CPU ID and MM CID + * @slice: Storage for time slice extension data */ struct rseq_data { struct rseq __user *usrptr; @@ -86,6 +109,9 @@ struct rseq_data { u32 sig; struct rseq_event event; struct rseq_ids ids; +#ifdef CONFIG_RSEQ_SLICE_EXTENSION + struct rseq_slice slice; +#endif }; #else /* CONFIG_RSEQ */ diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index 1b76d508400c..6afc219d1545 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -23,9 +23,15 @@ enum rseq_flags { }; enum rseq_cs_flags_bit { + /* Historical and unsupported bits */ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0, RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1, RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2, + /* (3) Intentional gap to put new bits into a separate byte */ + + /* User read only feature flags */ + RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE_BIT = 4, + RSEQ_CS_FLAG_SLICE_EXT_ENABLED_BIT = 5, }; enum rseq_cs_flags { @@ -35,6 +41,11 @@ enum rseq_cs_flags { (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT), 
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT), + + RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE = + (1U << RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE_BIT), + RSEQ_CS_FLAG_SLICE_EXT_ENABLED = + (1U << RSEQ_CS_FLAG_SLICE_EXT_ENABLED_BIT), }; /* @@ -53,6 +64,27 @@ struct rseq_cs { __u64 abort_ip; } __attribute__((aligned(4 * sizeof(__u64)))); +/** + * rseq_slice_ctrl - Time slice extension control structure + * @all: Compound value + * @request: Request for a time slice extension + * @granted: Granted time slice extension + * + * @request is set by user space and can be cleared by user space or kernel + * space. @granted is set and cleared by the kernel and must only be read + * by user space. + */ +struct rseq_slice_ctrl { + union { + __u32 all; + struct { + __u8 request; + __u8 granted; + __u16 __reserved; + }; + }; +}; + /* * struct rseq is aligned on 4 * 8 bytes to ensure it is always * contained within a single cache-line. @@ -141,6 +173,12 @@ struct rseq { */ __u32 mm_cid; + /* + * Time slice extension control structure. CPU local updates from + * kernel and user space. + */ + struct rseq_slice_ctrl slice_ctrl; + /* * Flexible array member at end of structure, after last feature field. */ diff --git a/init/Kconfig b/init/Kconfig index fa79feb8fe57..00c6fbb66a5a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1938,6 +1938,18 @@ config RSEQ If unsure, say Y. +config RSEQ_SLICE_EXTENSION + bool "Enable rseq-based time slice extension mechanism" + depends on RSEQ && HIGH_RES_TIMERS && GENERIC_ENTRY && HAVE_GENERIC_TIF_BITS + help + Allows userspace to request a limited time slice extension when + returning from an interrupt to user space via the RSEQ shared + data ABI. If granted, that allows to complete a critical section, + so that other threads are not stuck on a conflicted resource, + while the task is scheduled out. + + If unsure, say N. + config RSEQ_STATS default n bool "Enable lightweight statistics of restartable sequences" if EXPERT diff --git a/kernel/rseq.c b/kernel/rseq.c index 395d8b002350..07c324d5a201 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -389,6 +389,8 @@ static bool rseq_reset_ids(void) */ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { + u32 rseqfl = 0; + if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) return -EINVAL; @@ -440,6 +442,9 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 if (!access_ok(rseq, rseq_len)) return -EFAULT; + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) + rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + scoped_user_write_access(rseq, efault) { /* * If the rseq_cs pointer is non-NULL on registration, clear it to @@ -449,11 +454,13 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 * clearing the fields. Don't bother reading it, just reset it. 
*/ unsafe_put_user(0UL, &rseq->rseq_cs, efault); + unsafe_put_user(rseqfl, &rseq->flags, efault); /* Initialize IDs in user space */ unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault); unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); unsafe_put_user(0U, &rseq->node_id, efault); unsafe_put_user(0U, &rseq->mm_cid, efault); + unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); } /* -- cgit v1.2.3 From f8380f976804533df4c6c3d3a0b2cd03c2d262bc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 15 Dec 2025 17:52:06 +0100 Subject: rseq: Provide static branch for time slice extensions Guard the time slice extension functionality with a static key, which can be disabled on the kernel command line. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251215155708.733429292@linutronix.de --- Documentation/admin-guide/kernel-parameters.txt | 5 +++++ include/linux/rseq_entry.h | 11 +++++++++++ kernel/rseq.c | 17 +++++++++++++++++ 3 files changed, 33 insertions(+) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a8d0afde7f85..f2348bca36a1 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6600,6 +6600,11 @@ Kernel parameters rootflags= [KNL] Set root filesystem mount option string + rseq_slice_ext= [KNL] RSEQ based time slice extension + Format: boolean + Control enablement of RSEQ based time slice extension. + Default is 'on'. + initramfs_options= [KNL] Specify mount options for for the initramfs mount. diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index a36b472627de..d0ec4714ffd6 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -75,6 +75,17 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); #define rseq_inline __always_inline #endif +#ifdef CONFIG_RSEQ_SLICE_EXTENSION +DECLARE_STATIC_KEY_TRUE(rseq_slice_extension_key); + +static __always_inline bool rseq_slice_extension_enabled(void) +{ + return static_branch_likely(&rseq_slice_extension_key); +} +#else /* CONFIG_RSEQ_SLICE_EXTENSION */ +static inline bool rseq_slice_extension_enabled(void) { return false; } +#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ + bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); bool rseq_debug_validate_ids(struct task_struct *t); diff --git a/kernel/rseq.c b/kernel/rseq.c index 07c324d5a201..bf75268580ef 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -483,3 +483,20 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 efault: return -EFAULT; } + +#ifdef CONFIG_RSEQ_SLICE_EXTENSION +DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key); + +static int __init rseq_slice_cmdline(char *str) +{ + bool on; + + if (kstrtobool(str, &on)) + return 0; + + if (!on) + static_branch_disable(&rseq_slice_extension_key); + return 1; +} +__setup("rseq_slice_ext=", rseq_slice_cmdline); +#endif /* CONFIG_RSEQ_SLICE_EXTENSION */ -- cgit v1.2.3 From b5b8282441bc4f8f1ff505e19d566dbd7b805761 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 15 Dec 2025 17:52:09 +0100 Subject: rseq: Add statistics for time slice extensions Extend the quick statistics with time slice specific fields. 
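As an illustrative aside (not part of the patch), per-CPU counters like these are typically bumped with this_cpu_inc(); the helper name below is invented, and real code may additionally be guarded by CONFIG_RSEQ_STATS:

  #include <linux/percpu.h>
  #include <linux/rseq_entry.h>   /* DECLARE_PER_CPU(struct rseq_stats, rseq_stats) */

  static inline void rseq_slice_count_grant(void)
  {
      /* Increment the CPU-local counter without atomics or locking. */
      this_cpu_inc(rseq_stats.s_granted);
  }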
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251215155708.795202254@linutronix.de --- include/linux/rseq_entry.h | 5 +++++ kernel/rseq.c | 14 ++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'kernel') diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index d0ec4714ffd6..54d8e338b26e 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -15,6 +15,11 @@ struct rseq_stats { unsigned long cs; unsigned long clear; unsigned long fixup; + unsigned long s_granted; + unsigned long s_expired; + unsigned long s_revoked; + unsigned long s_yielded; + unsigned long s_aborted; }; DECLARE_PER_CPU(struct rseq_stats, rseq_stats); diff --git a/kernel/rseq.c b/kernel/rseq.c index bf75268580ef..415d75b6df2c 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -138,6 +138,13 @@ static int rseq_stats_show(struct seq_file *m, void *p) stats.cs += data_race(per_cpu(rseq_stats.cs, cpu)); stats.clear += data_race(per_cpu(rseq_stats.clear, cpu)); stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu)); + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { + stats.s_granted += data_race(per_cpu(rseq_stats.s_granted, cpu)); + stats.s_expired += data_race(per_cpu(rseq_stats.s_expired, cpu)); + stats.s_revoked += data_race(per_cpu(rseq_stats.s_revoked, cpu)); + stats.s_yielded += data_race(per_cpu(rseq_stats.s_yielded, cpu)); + stats.s_aborted += data_race(per_cpu(rseq_stats.s_aborted, cpu)); + } } seq_printf(m, "exit: %16lu\n", stats.exit); @@ -148,6 +155,13 @@ static int rseq_stats_show(struct seq_file *m, void *p) seq_printf(m, "cs: %16lu\n", stats.cs); seq_printf(m, "clear: %16lu\n", stats.clear); seq_printf(m, "fixup: %16lu\n", stats.fixup); + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { + seq_printf(m, "sgrant: %16lu\n", stats.s_granted); + seq_printf(m, "sexpir: %16lu\n", stats.s_expired); + seq_printf(m, "srevok: %16lu\n", stats.s_revoked); + seq_printf(m, "syield: %16lu\n", stats.s_yielded); + seq_printf(m, "sabort: %16lu\n", stats.s_aborted); + } return 0; } -- cgit v1.2.3 From 28621ec2d46c6adf7d33a6facbd83e2fa566bd34 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 15 Dec 2025 17:52:12 +0100 Subject: rseq: Add prctl() to enable time slice extensions Implement a prctl() so that tasks can enable the time slice extension mechanism. This fails, when time slice extensions are disabled at compile time or on the kernel command line and when no rseq pointer is registered in the kernel. That allows to implement a single trivial check in the exit to user mode hotpath, to decide whether the whole mechanism needs to be invoked. 
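As an illustrative aside (not part of the patch), the userspace call to opt a thread in; the constant values are taken from the uapi additions in this series and are defined locally in case the installed headers do not carry them yet:

  #include <stdio.h>
  #include <sys/prctl.h>

  #ifndef PR_RSEQ_SLICE_EXTENSION
  # define PR_RSEQ_SLICE_EXTENSION      79
  # define PR_RSEQ_SLICE_EXTENSION_SET  2
  # define PR_RSEQ_SLICE_EXT_ENABLE     0x01
  #endif

  static int enable_slice_extension(void)
  {
      /* arg4 and arg5 must be zero, see the prctl() handler above. */
      int ret = prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET,
                      PR_RSEQ_SLICE_EXT_ENABLE, 0, 0);

      if (ret)
          perror("PR_RSEQ_SLICE_EXTENSION");
      return ret;
  }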
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251215155708.858717691@linutronix.de --- include/linux/rseq.h | 9 ++++++++ include/uapi/linux/prctl.h | 10 +++++++++ kernel/rseq.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/sys.c | 6 ++++++ 4 files changed, 77 insertions(+) (limited to 'kernel') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 2266f4dc77b6..3c194a02ad0a 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -163,4 +163,13 @@ void rseq_syscall(struct pt_regs *regs); static inline void rseq_syscall(struct pt_regs *regs) { } #endif /* !CONFIG_DEBUG_RSEQ */ +#ifdef CONFIG_RSEQ_SLICE_EXTENSION +int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3); +#else /* CONFIG_RSEQ_SLICE_EXTENSION */ +static inline int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3) +{ + return -ENOTSUPP; +} +#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ + #endif /* _LINUX_RSEQ_H */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 51c4e8c82b1e..79944b7ae50a 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -386,4 +386,14 @@ struct prctl_mm_map { # define PR_FUTEX_HASH_SET_SLOTS 1 # define PR_FUTEX_HASH_GET_SLOTS 2 +/* RSEQ time slice extensions */ +#define PR_RSEQ_SLICE_EXTENSION 79 +# define PR_RSEQ_SLICE_EXTENSION_GET 1 +# define PR_RSEQ_SLICE_EXTENSION_SET 2 +/* + * Bits for RSEQ_SLICE_EXTENSION_GET/SET + * PR_RSEQ_SLICE_EXT_ENABLE: Enable + */ +# define PR_RSEQ_SLICE_EXT_ENABLE 0x01 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/rseq.c b/kernel/rseq.c index 415d75b6df2c..09848bb14ec2 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -71,6 +71,7 @@ #define RSEQ_BUILD_SLOW_PATH #include +#include #include #include #include @@ -501,6 +502,57 @@ efault: #ifdef CONFIG_RSEQ_SLICE_EXTENSION DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key); +int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3) +{ + switch (arg2) { + case PR_RSEQ_SLICE_EXTENSION_GET: + if (arg3) + return -EINVAL; + return current->rseq.slice.state.enabled ? PR_RSEQ_SLICE_EXT_ENABLE : 0; + + case PR_RSEQ_SLICE_EXTENSION_SET: { + u32 rflags, valid = RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + bool enable = !!(arg3 & PR_RSEQ_SLICE_EXT_ENABLE); + + if (arg3 & ~PR_RSEQ_SLICE_EXT_ENABLE) + return -EINVAL; + if (!rseq_slice_extension_enabled()) + return -ENOTSUPP; + if (!current->rseq.usrptr) + return -ENXIO; + + /* No change? 
*/ + if (enable == !!current->rseq.slice.state.enabled) + return 0; + + if (get_user(rflags, ¤t->rseq.usrptr->flags)) + goto die; + + if (current->rseq.slice.state.enabled) + valid |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + + if ((rflags & valid) != valid) + goto die; + + rflags &= ~RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + rflags |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + if (enable) + rflags |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + + if (put_user(rflags, ¤t->rseq.usrptr->flags)) + goto die; + + current->rseq.slice.state.enabled = enable; + return 0; + } + default: + return -EINVAL; + } +die: + force_sig(SIGSEGV); + return -EFAULT; +} + static int __init rseq_slice_cmdline(char *str) { bool on; diff --git a/kernel/sys.c b/kernel/sys.c index 8b58eece4e58..af71987df81c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include @@ -2868,6 +2869,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_FUTEX_HASH: error = futex_hash_prctl(arg2, arg3, arg4); break; + case PR_RSEQ_SLICE_EXTENSION: + if (arg4 || arg5) + return -EINVAL; + error = rseq_slice_extension_prctl(arg2, arg3); + break; default: trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5); error = -EINVAL; -- cgit v1.2.3 From 99d2592023e5d0a31f5f5a83c694df48239a1e6c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 15 Dec 2025 17:52:15 +0100 Subject: rseq: Implement sys_rseq_slice_yield() Provide a new syscall which has the only purpose to yield the CPU after the kernel granted a time slice extension. sched_yield() is not suitable for that because it unconditionally schedules, but the end of the time slice extension is not required to schedule when the task was already preempted. This also allows to have a strict check for termination to catch user space invoking random syscalls including sched_yield() from a time slice extension region. 
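[ Editor's note: a minimal sketch of invoking the new syscall from userspace. There is no libc wrapper, so syscall(2) is used directly; the number 471 is taken from the syscall table updates in this patch and the local fallback define is only needed when building against older headers. ]

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_rseq_slice_yield
# define __NR_rseq_slice_yield 471	/* value from the syscall tables in this patch */
#endif

int main(void)
{
	/*
	 * Returns 1 when a granted extension was still active and the CPU
	 * has now been relinquished, 0 when there was nothing to yield.
	 */
	long yielded = syscall(__NR_rseq_slice_yield);

	printf("yielded: %ld\n", yielded);
	return 0;
}
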
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mathieu Desnoyers Acked-by: Arnd Bergmann Link: https://patch.msgid.link/20251215155708.929634896@linutronix.de --- arch/alpha/kernel/syscalls/syscall.tbl | 1 + arch/arm/tools/syscall.tbl | 1 + arch/arm64/tools/syscall_32.tbl | 1 + arch/m68k/kernel/syscalls/syscall.tbl | 1 + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + arch/parisc/kernel/syscalls/syscall.tbl | 1 + arch/powerpc/kernel/syscalls/syscall.tbl | 1 + arch/s390/kernel/syscalls/syscall.tbl | 1 + arch/sh/kernel/syscalls/syscall.tbl | 1 + arch/sparc/kernel/syscalls/syscall.tbl | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + include/linux/rseq_types.h | 2 ++ include/linux/syscalls.h | 1 + include/uapi/asm-generic/unistd.h | 5 ++++- kernel/rseq.c | 21 +++++++++++++++++++++ kernel/sys_ni.c | 1 + scripts/syscall.tbl | 1 + 22 files changed, 46 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 3fed97478058..f31b7afffc34 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -510,3 +510,4 @@ 578 common file_getattr sys_file_getattr 579 common file_setattr sys_file_setattr 580 common listns sys_listns +581 common rseq_slice_yield sys_rseq_slice_yield diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index fd09afae72a2..94351e22bfcf 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -485,3 +485,4 @@ 468 common file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr 470 common listns sys_listns +471 common rseq_slice_yield sys_rseq_slice_yield diff --git a/arch/arm64/tools/syscall_32.tbl b/arch/arm64/tools/syscall_32.tbl index 8cdfe5d4dac9..62d93d88e0fe 100644 --- a/arch/arm64/tools/syscall_32.tbl +++ b/arch/arm64/tools/syscall_32.tbl @@ -482,3 +482,4 @@ 468 common file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr 470 common listns sys_listns +471 common rseq_slice_yield sys_rseq_slice_yield diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 871a5d67bf41..248934257101 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -470,3 +470,4 @@ 468 common file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr 470 common listns sys_listns +471 common rseq_slice_yield sys_rseq_slice_yield diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 022fc85d94b3..223d26303627 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -476,3 +476,4 @@ 468 common file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr 470 common listns sys_listns +471 common rseq_slice_yield sys_rseq_slice_yield diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 8cedc83c3266..7430714e2b8f 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -409,3 +409,4 @@ 468 n32 file_getattr sys_file_getattr 469 n32 file_setattr sys_file_setattr 470 n32 listns sys_listns +471 n32 rseq_slice_yield sys_rseq_slice_yield diff 
--git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 9b92bddf06b5..630aab9e5425 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -385,3 +385,4 @@ 468 n64 file_getattr sys_file_getattr 469 n64 file_setattr sys_file_setattr 470 n64 listns sys_listns +471 n64 rseq_slice_yield sys_rseq_slice_yield diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index f810b8a55716..128653112284 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -458,3 +458,4 @@ 468 o32 file_getattr sys_file_getattr 469 o32 file_setattr sys_file_setattr 470 o32 listns sys_listns +471 o32 rseq_slice_yield sys_rseq_slice_yield diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 39bdacaa530b..f6e2d0379d57 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -469,3 +469,4 @@ 468 common file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr 470 common listns sys_listns +471 common rseq_slice_yield sys_rseq_slice_yield diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index ec4458cdb97b..4fcc7c58a105 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -561,3 +561,4 @@ 468 common file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr 470 common listns sys_listns +471 nospu rseq_slice_yield sys_rseq_slice_yield diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 417ed16b3c63..09a7ef04d979 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -397,3 +397,4 @@ 468 common file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr 470 common listns sys_listns +471 common rseq_slice_yield sys_rseq_slice_yield diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 969c11325ade..70b315cbe710 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -474,3 +474,4 @@ 468 common file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr 470 common listns sys_listns +471 common rseq_slice_yield sys_rseq_slice_yield diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 39aa26b6a50b..d5b1a7198410 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -516,3 +516,4 @@ 468 common file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr 470 common listns sys_listns +471 common rseq_slice_yield sys_rseq_slice_yield diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index e979a3eac7a3..f832ebd2d79b 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -476,3 +476,4 @@ 468 i386 file_getattr sys_file_getattr 469 i386 file_setattr sys_file_setattr 470 i386 listns sys_listns +471 i386 rseq_slice_yield sys_rseq_slice_yield diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 8a4ac4841be6..524155d655da 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -395,6 +395,7 @@ 468 common file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr 470 common 
listns sys_listns +471 common rseq_slice_yield sys_rseq_slice_yield # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 438a3b170402..a9bca4e484de 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -441,3 +441,4 @@ 468 common file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr 470 common listns sys_listns +471 common rseq_slice_yield sys_rseq_slice_yield diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 67e40c059b1b..8c540e775161 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -89,9 +89,11 @@ union rseq_slice_state { /** * struct rseq_slice - Status information for rseq time slice extension * @state: Time slice extension state + * @yielded: Indicator for rseq_slice_yield() */ struct rseq_slice { union rseq_slice_state state; + u8 yielded; }; /** diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index cf84d98964b2..6c8a570cf44a 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -961,6 +961,7 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, unsigned mask, struct statx __user *buffer); asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len, int flags, uint32_t sig); +asmlinkage long sys_rseq_slice_yield(void); asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags); asmlinkage long sys_open_tree_attr(int dfd, const char __user *path, unsigned flags, diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 942370b3f5d2..a627acc8fb5f 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -860,8 +860,11 @@ __SYSCALL(__NR_file_setattr, sys_file_setattr) #define __NR_listns 470 __SYSCALL(__NR_listns, sys_listns) +#define __NR_rseq_slice_yield 471 +__SYSCALL(__NR_rseq_slice_yield, sys_rseq_slice_yield) + #undef __NR_syscalls -#define __NR_syscalls 471 +#define __NR_syscalls 472 /* * 32 bit systems traditionally used different diff --git a/kernel/rseq.c b/kernel/rseq.c index 09848bb14ec2..d8e1992edffa 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -553,6 +553,27 @@ die: return -EFAULT; } +/** + * sys_rseq_slice_yield - yield the current processor side effect free if a + * task granted with a time slice extension is done with + * the critical work before being forced out. + * + * Return: 1 if the task successfully yielded the CPU within the granted slice. + * 0 if the slice extension was either never granted or was revoked by + * going over the granted extension, using a syscall other than this one + * or being scheduled out earlier due to a subsequent interrupt. + * + * The syscall does not schedule because the syscall entry work immediately + * relinquishes the CPU and schedules if required. 
+ */ +SYSCALL_DEFINE0(rseq_slice_yield) +{ + int yielded = !!current->rseq.slice.yielded; + + current->rseq.slice.yielded = 0; + return yielded; +} + static int __init rseq_slice_cmdline(char *str) { bool on; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index bf5d05c635ff..add3032da16f 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -390,6 +390,7 @@ COND_SYSCALL(setuid16); /* restartable sequence */ COND_SYSCALL(rseq); +COND_SYSCALL(rseq_slice_yield); COND_SYSCALL(uretprobe); COND_SYSCALL(uprobe); diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl index e74868be513c..7a42b32b6577 100644 --- a/scripts/syscall.tbl +++ b/scripts/syscall.tbl @@ -411,3 +411,4 @@ 468 common file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr 470 common listns sys_listns +471 common rseq_slice_yield sys_rseq_slice_yield -- cgit v1.2.3 From dd0a04606937af5810e9117d343ee3792635bd3d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 15 Dec 2025 17:52:19 +0100 Subject: rseq: Implement syscall entry work for time slice extensions The kernel sets SYSCALL_WORK_RSEQ_SLICE when it grants a time slice extension. This allows to handle the rseq_slice_yield() syscall, which is used by user space to relinquish the CPU after finishing the critical section for which it requested an extension. In case the kernel state is still GRANTED, the kernel resets both kernel and user space state with a set of sanity checks. If the kernel state is already cleared, then this raced against the timer or some other interrupt and just clears the work bit. Doing it in syscall entry work allows to catch misbehaving user space, which issues an arbitrary syscall, i.e. not rseq_slice_yield(), from the critical section. Contrary to the initial strict requirement to use rseq_slice_yield() arbitrary syscalls are not considered a violation of the ABI contract anymore to allow onion architecture applications, which cannot control the code inside a critical section, to utilize this as well. If the code detects inconsistent user space that result in a SIGSEGV for the application. If the grant was still active and the task was not preempted yet, the work code reschedules immediately before continuing through the syscall. 
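[ Editor's note: a hedged userspace sketch of the protocol this syscall entry work completes, following the request/grant handshake described in Documentation/userspace-api/rseq.rst later in this series. The exact uapi layout of rseq::slice_ctrl is not reproduced in this excerpt and is assumed to expose request and granted members; do_critical_work() is a placeholder, and the fallback syscall number is an assumption for older headers. ]

#include <unistd.h>
#include <sys/syscall.h>
#include <sys/rseq.h>	/* assumed to provide struct rseq with slice_ctrl */

#ifndef __NR_rseq_slice_yield
# define __NR_rseq_slice_yield 471
#endif

extern void do_critical_work(void);	/* placeholder for the actual critical section */

static void slice_extended_section(volatile struct rseq *rs)
{
	rs->slice_ctrl.request = 1;	/* ask for an extension if preempted */
	do_critical_work();
	rs->slice_ctrl.request = 0;	/* withdraw the request once done */

	/*
	 * If the kernel granted an extension in the meantime, hand the CPU
	 * back. Any other syscall issued here would also terminate the
	 * grant, but is accounted as 'aborted' rather than 'yielded'.
	 */
	if (rs->slice_ctrl.granted)
		syscall(__NR_rseq_slice_yield);
}
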
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251215155709.005777059@linutronix.de --- include/linux/entry-common.h | 2 +- include/linux/rseq.h | 2 + include/linux/thread_info.h | 16 ++++---- kernel/entry/syscall-common.c | 11 +++++- kernel/rseq.c | 91 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 112 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 87efb38b7081..026201a44aa2 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -36,8 +36,8 @@ SYSCALL_WORK_SYSCALL_EMU | \ SYSCALL_WORK_SYSCALL_AUDIT | \ SYSCALL_WORK_SYSCALL_USER_DISPATCH | \ + SYSCALL_WORK_SYSCALL_RSEQ_SLICE | \ ARCH_SYSCALL_WORK_ENTER) - #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE | \ SYSCALL_WORK_SYSCALL_AUDIT | \ diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 3c194a02ad0a..7a01a0760405 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -164,8 +164,10 @@ static inline void rseq_syscall(struct pt_regs *regs) { } #endif /* !CONFIG_DEBUG_RSEQ */ #ifdef CONFIG_RSEQ_SLICE_EXTENSION +void rseq_syscall_enter_work(long syscall); int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3); #else /* CONFIG_RSEQ_SLICE_EXTENSION */ +static inline void rseq_syscall_enter_work(long syscall) { } static inline int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3) { return -ENOTSUPP; diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index b40de9bab4b7..051e42902690 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -46,15 +46,17 @@ enum syscall_work_bit { SYSCALL_WORK_BIT_SYSCALL_AUDIT, SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH, SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP, + SYSCALL_WORK_BIT_SYSCALL_RSEQ_SLICE, }; -#define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) -#define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) -#define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) -#define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) -#define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT) -#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH) -#define SYSCALL_WORK_SYSCALL_EXIT_TRAP BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP) +#define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) +#define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) +#define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) +#define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) +#define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT) +#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH) +#define SYSCALL_WORK_SYSCALL_EXIT_TRAP BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP) +#define SYSCALL_WORK_SYSCALL_RSEQ_SLICE BIT(SYSCALL_WORK_BIT_SYSCALL_RSEQ_SLICE) #endif #include diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c index 940a597ded40..f7ee25b9cf27 100644 --- a/kernel/entry/syscall-common.c +++ b/kernel/entry/syscall-common.c @@ -17,8 +17,7 @@ static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) } } -long syscall_trace_enter(struct pt_regs *regs, long syscall, - unsigned long work) +long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work) { long ret = 0; @@ -32,6 +31,14 @@ long 
syscall_trace_enter(struct pt_regs *regs, long syscall, return -1L; } + /* + * User space got a time slice extension granted and relinquishes + * the CPU. The work stops the slice timer to avoid an extra round + * through hrtimer_interrupt(). + */ + if (work & SYSCALL_WORK_SYSCALL_RSEQ_SLICE) + rseq_syscall_enter_work(syscall); + /* Handle ptrace */ if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { ret = ptrace_report_syscall_entry(regs); diff --git a/kernel/rseq.c b/kernel/rseq.c index d8e1992edffa..8aa4821e3979 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -502,6 +502,97 @@ efault: #ifdef CONFIG_RSEQ_SLICE_EXTENSION DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key); +static inline void rseq_slice_set_need_resched(struct task_struct *curr) +{ + /* + * The interrupt guard is required to prevent inconsistent state in + * this case: + * + * set_tsk_need_resched() + * --> Interrupt + * wakeup() + * set_tsk_need_resched() + * set_preempt_need_resched() + * schedule_on_return() + * clear_tsk_need_resched() + * clear_preempt_need_resched() + * set_preempt_need_resched() <- Inconsistent state + * + * This is safe vs. a remote set of TIF_NEED_RESCHED because that + * only sets the already set bit and does not create inconsistent + * state. + */ + scoped_guard(irq) + set_need_resched_current(); +} + +static void rseq_slice_validate_ctrl(u32 expected) +{ + u32 __user *sctrl = ¤t->rseq.usrptr->slice_ctrl.all; + u32 uval; + + if (get_user(uval, sctrl) || uval != expected) + force_sig(SIGSEGV); +} + +/* + * Invoked from syscall entry if a time slice extension was granted and the + * kernel did not clear it before user space left the critical section. + * + * While the recommended way to relinquish the CPU side effect free is + * rseq_slice_yield(2), any syscall within a granted slice terminates the + * grant and immediately reschedules if required. This supports onion layer + * applications, where the code requesting the grant cannot control the + * code within the critical section. + */ +void rseq_syscall_enter_work(long syscall) +{ + struct task_struct *curr = current; + struct rseq_slice_ctrl ctrl = { .granted = curr->rseq.slice.state.granted }; + + clear_task_syscall_work(curr, SYSCALL_RSEQ_SLICE); + + if (static_branch_unlikely(&rseq_debug_enabled)) + rseq_slice_validate_ctrl(ctrl.all); + + /* + * The kernel might have raced, revoked the grant and updated + * userspace, but kept the SLICE work set. + */ + if (!ctrl.granted) + return; + + /* + * Required to make set_tsk_need_resched() correct on PREEMPT[RT] + * kernels. Leaving the scope will reschedule on preemption models + * FULL, LAZY and RT if necessary. + */ + scoped_guard(preempt) { + /* + * Now that preemption is disabled, quickly check whether + * the task was already rescheduled before arriving here. 
+ */ + if (!curr->rseq.event.sched_switch) { + rseq_slice_set_need_resched(curr); + + if (syscall == __NR_rseq_slice_yield) { + rseq_stat_inc(rseq_stats.s_yielded); + /* Update the yielded state for syscall return */ + curr->rseq.slice.yielded = 1; + } else { + rseq_stat_inc(rseq_stats.s_aborted); + } + } + } + /* Reschedule on NONE/VOLUNTARY preemption models */ + cond_resched(); + + /* Clear the grant in kernel state and user space */ + curr->rseq.slice.state.granted = false; + if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all)) + force_sig(SIGSEGV); +} + int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3) { switch (arg2) { -- cgit v1.2.3 From 0ac3b5c3dc45085b28a10ee730fb2860841f08ef Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 15 Dec 2025 17:52:22 +0100 Subject: rseq: Implement time slice extension enforcement timer If a time slice extension is granted and the reschedule delayed, the kernel has to ensure that user space cannot abuse the extension and exceed the maximum granted time. It was suggested to implement this via the existing hrtick() timer in the scheduler, but that turned out to be problematic for several reasons: 1) It creates a dependency on CONFIG_SCHED_HRTICK, which can be disabled independently of CONFIG_HIGHRES_TIMERS 2) HRTICK usage in the scheduler can be runtime disabled or is only used for certain aspects of scheduling. 3) The function is calling into the scheduler code and that might have unexpected consequences when this is invoked due to a time slice enforcement expiry. Especially when the task managed to clear the grant via sched_yield(0). It would be possible to address #2 and #3 by storing state in the scheduler, but that is extra complexity and fragility for no value. Implement a dedicated per CPU hrtimer instead, which is solely used for the purpose of time slice enforcement. The timer is armed when an extension was granted right before actually returning to user mode in rseq_exit_to_user_mode_restart(). It is disarmed, when the task relinquishes the CPU. This is expensive as the timer is probably the first expiring timer on the CPU, which means it has to reprogram the hardware. But that's less expensive than going through a full hrtimer interrupt cycle for nothing. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251215155709.068329497@linutronix.de --- Documentation/admin-guide/sysctl/kernel.rst | 11 +++ include/linux/rseq_entry.h | 38 +++++--- include/linux/rseq_types.h | 2 + kernel/rseq.c | 132 +++++++++++++++++++++++++++- 4 files changed, 170 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 239da22c4e28..b09d18e0f75b 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -1248,6 +1248,17 @@ reboot-cmd (SPARC only) ROM/Flash boot loader. Maybe to tell it what to do after rebooting. ??? +rseq_slice_extension_nsec +========================= + +A task can request to delay its scheduling if it is in a critical section +via the prctl(PR_RSEQ_SLICE_EXTENSION_SET) mechanism. This sets the maximum +allowed extension in nanoseconds before scheduling of the task is enforced. +Default value is 10000ns (10us). The possible range is 10000ns (10us) to +50000ns (50us). + +This value has a direct correlation to the worst case scheduling latency; +increment at your own risk. 
sched_energy_aware ================== diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 54d8e338b26e..8d04611056aa 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -87,8 +87,24 @@ static __always_inline bool rseq_slice_extension_enabled(void) { return static_branch_likely(&rseq_slice_extension_key); } + +extern unsigned int rseq_slice_ext_nsecs; +bool __rseq_arm_slice_extension_timer(void); + +static __always_inline bool rseq_arm_slice_extension_timer(void) +{ + if (!rseq_slice_extension_enabled()) + return false; + + if (likely(!current->rseq.slice.state.granted)) + return false; + + return __rseq_arm_slice_extension_timer(); +} + #else /* CONFIG_RSEQ_SLICE_EXTENSION */ static inline bool rseq_slice_extension_enabled(void) { return false; } +static inline bool rseq_arm_slice_extension_timer(void) { return false; } #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); @@ -543,17 +559,19 @@ static __always_inline void clear_tif_rseq(void) { } static __always_inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) { - if (likely(!test_tif_rseq(ti_work))) - return false; - - if (unlikely(__rseq_exit_to_user_mode_restart(regs))) { - current->rseq.event.slowpath = true; - set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); - return true; + if (unlikely(test_tif_rseq(ti_work))) { + if (unlikely(__rseq_exit_to_user_mode_restart(regs))) { + current->rseq.event.slowpath = true; + set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); + return true; + } + clear_tif_rseq(); } - - clear_tif_rseq(); - return false; + /* + * Arm the slice extension timer if nothing to do anymore and the + * task really goes out to user space. + */ + return rseq_arm_slice_extension_timer(); } #else /* CONFIG_GENERIC_ENTRY */ diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 8c540e775161..8a2e76c5d2a8 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -89,10 +89,12 @@ union rseq_slice_state { /** * struct rseq_slice - Status information for rseq time slice extension * @state: Time slice extension state + * @expires: The time when a grant expires * @yielded: Indicator for rseq_slice_yield() */ struct rseq_slice { union rseq_slice_state state; + u64 expires; u8 yielded; }; diff --git a/kernel/rseq.c b/kernel/rseq.c index 8aa4821e3979..275d70114107 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -71,6 +71,8 @@ #define RSEQ_BUILD_SLOW_PATH #include +#include +#include #include #include #include @@ -500,8 +502,91 @@ efault: } #ifdef CONFIG_RSEQ_SLICE_EXTENSION +struct slice_timer { + struct hrtimer timer; + void *cookie; +}; + +unsigned int rseq_slice_ext_nsecs __read_mostly = 10 * NSEC_PER_USEC; +static DEFINE_PER_CPU(struct slice_timer, slice_timer); DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key); +/* + * When the timer expires and the task is still in user space, the return + * from interrupt will revoke the grant and schedule. If the task already + * entered the kernel via a syscall and the timer fires before the syscall + * work was able to cancel it, then depending on the preemption model this + * will either reschedule on return from interrupt or in the syscall work + * below. + */ +static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr) +{ + struct slice_timer *st = container_of(tmr, struct slice_timer, timer); + + /* + * Validate that the task which armed the timer is still on the + * CPU. 
It could have been scheduled out without canceling the + * timer. + */ + if (st->cookie == current && current->rseq.slice.state.granted) { + rseq_stat_inc(rseq_stats.s_expired); + set_need_resched_current(); + } + return HRTIMER_NORESTART; +} + +bool __rseq_arm_slice_extension_timer(void) +{ + struct slice_timer *st = this_cpu_ptr(&slice_timer); + struct task_struct *curr = current; + + lockdep_assert_irqs_disabled(); + + /* + * This check prevents a task, which got a time slice extension + * granted, from exceeding the maximum scheduling latency when the + * grant expired before going out to user space. Don't bother to + * clear the grant here, it will be cleaned up automatically before + * going out to user space after being scheduled back in. + */ + if ((unlikely(curr->rseq.slice.expires < ktime_get_mono_fast_ns()))) { + set_need_resched_current(); + return true; + } + + /* + * Store the task pointer as a cookie for comparison in the timer + * function. This is safe as the timer is CPU local and cannot be + * in the expiry function at this point. + */ + st->cookie = curr; + hrtimer_start(&st->timer, curr->rseq.slice.expires, HRTIMER_MODE_ABS_PINNED_HARD); + /* Arm the syscall entry work */ + set_task_syscall_work(curr, SYSCALL_RSEQ_SLICE); + return false; +} + +static void rseq_cancel_slice_extension_timer(void) +{ + struct slice_timer *st = this_cpu_ptr(&slice_timer); + + /* + * st->cookie can be safely read as preemption is disabled and the + * timer is CPU local. + * + * As this is most probably the first expiring timer, the cancel is + * expensive as it has to reprogram the hardware, but that's less + * expensive than going through a full hrtimer_interrupt() cycle + * for nothing. + * + * hrtimer_try_to_cancel() is sufficient here as the timer is CPU + * local and once the hrtimer code disabled interrupts the timer + * callback cannot be running. + */ + if (st->cookie == current) + hrtimer_try_to_cancel(&st->timer); +} + static inline void rseq_slice_set_need_resched(struct task_struct *curr) { /* @@ -563,11 +648,14 @@ void rseq_syscall_enter_work(long syscall) return; /* - * Required to make set_tsk_need_resched() correct on PREEMPT[RT] - * kernels. Leaving the scope will reschedule on preemption models - * FULL, LAZY and RT if necessary. + * Required to stabilize the per CPU timer pointer and to make + * set_tsk_need_resched() correct on PREEMPT[RT] kernels. + * + * Leaving the scope will reschedule on preemption models FULL, + * LAZY and RT if necessary. */ scoped_guard(preempt) { + rseq_cancel_slice_extension_timer(); /* * Now that preemption is disabled, quickly check whether * the task was already rescheduled before arriving here. 
@@ -665,6 +753,31 @@ SYSCALL_DEFINE0(rseq_slice_yield) return yielded; } +#ifdef CONFIG_SYSCTL +static const unsigned int rseq_slice_ext_nsecs_min = 10 * NSEC_PER_USEC; +static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC; + +static const struct ctl_table rseq_slice_ext_sysctl[] = { + { + .procname = "rseq_slice_extension_nsec", + .data = &rseq_slice_ext_nsecs, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = (unsigned int *)&rseq_slice_ext_nsecs_min, + .extra2 = (unsigned int *)&rseq_slice_ext_nsecs_max, + }, +}; + +static void rseq_slice_sysctl_init(void) +{ + if (rseq_slice_extension_enabled()) + register_sysctl_init("kernel", rseq_slice_ext_sysctl); +} +#else /* CONFIG_SYSCTL */ +static inline void rseq_slice_sysctl_init(void) { } +#endif /* !CONFIG_SYSCTL */ + static int __init rseq_slice_cmdline(char *str) { bool on; @@ -677,4 +790,17 @@ static int __init rseq_slice_cmdline(char *str) return 1; } __setup("rseq_slice_ext=", rseq_slice_cmdline); + +static int __init rseq_slice_init(void) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) { + hrtimer_setup(per_cpu_ptr(&slice_timer.timer, cpu), rseq_slice_expired, + CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD); + } + rseq_slice_sysctl_init(); + return 0; +} +device_initcall(rseq_slice_init); #endif /* CONFIG_RSEQ_SLICE_EXTENSION */ -- cgit v1.2.3 From 3c78aaec19b0621bf952756670c8b066a55202fe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 15 Dec 2025 17:52:31 +0100 Subject: entry: Hook up rseq time slice extension Wire the grant decision function up in exit_to_user_mode_loop() Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251215155709.258157362@linutronix.de --- kernel/entry/common.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 5c792b30c58a..9ef63e414791 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -17,6 +17,27 @@ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } #define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK) #endif +/* TIF bits, which prevent a time slice extension. */ +#ifdef CONFIG_PREEMPT_RT +/* + * Since rseq slice ext has a direct correlation to the worst case + * scheduling latency (schedule is delayed after all), only have it affect + * LAZY reschedules on PREEMPT_RT for now. + * + * However, since this delay is only applicable to userspace, a value + * for rseq_slice_extension_nsec that is strictly less than the worst case + * kernel space preempt_disable() region, should mean the scheduling latency + * is not affected, even for !LAZY. + * + * However, since this value depends on the hardware at hand, it cannot be + * pre-determined in any sensible way. Hence punt on this problem for now. 
+ */ +# define TIF_SLICE_EXT_SCHED (_TIF_NEED_RESCHED_LAZY) +#else +# define TIF_SLICE_EXT_SCHED (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) +#endif +#define TIF_SLICE_EXT_DENY (EXIT_TO_USER_MODE_WORK & ~TIF_SLICE_EXT_SCHED) + static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work) { @@ -28,8 +49,10 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re local_irq_enable_exit_to_user(ti_work); - if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) - schedule(); + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) { + if (!rseq_grant_slice_extension(ti_work & TIF_SLICE_EXT_DENY)) + schedule(); + } if (ti_work & _TIF_UPROBE) uprobe_notify_resume(regs); -- cgit v1.2.3 From d6200245c75e832af2087bc60ba2e6641a90eee9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 19 Jan 2026 11:23:57 +0100 Subject: rseq: Allow registering RSEQ with slice extension Since glibc cares about the number of syscalls required to initialize a new thread, allow initializing rseq with slice extension on. This avoids having to do another prctl(). Requested-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260121143207.814193010@infradead.org --- include/uapi/linux/rseq.h | 3 ++- kernel/rseq.c | 12 ++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index 6afc219d1545..863c4a00a66b 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -19,7 +19,8 @@ enum rseq_cpu_id_state { }; enum rseq_flags { - RSEQ_FLAG_UNREGISTER = (1 << 0), + RSEQ_FLAG_UNREGISTER = (1 << 0), + RSEQ_FLAG_SLICE_EXT_DEFAULT_ON = (1 << 1), }; enum rseq_cs_flags_bit { diff --git a/kernel/rseq.c b/kernel/rseq.c index 275d70114107..1c5490a172a8 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -424,7 +424,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 return 0; } - if (unlikely(flags)) + if (unlikely(flags & ~(RSEQ_FLAG_SLICE_EXT_DEFAULT_ON))) return -EINVAL; if (current->rseq.usrptr) { @@ -459,8 +459,12 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 if (!access_ok(rseq, rseq_len)) return -EFAULT; - if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + if (rseq_slice_extension_enabled() && + (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)) + rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + } scoped_user_write_access(rseq, efault) { /* @@ -488,6 +492,10 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 current->rseq.len = rseq_len; current->rseq.sig = sig; +#ifdef CONFIG_RSEQ_SLICE_EXTENSION + current->rseq.slice.state.enabled = !!(rseqfl & RSEQ_CS_FLAG_SLICE_EXT_ENABLED); +#endif + /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields -- cgit v1.2.3 From e1d7f54900f1e1d3003a85b78cd7105a64203ff7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 21 Jan 2026 14:21:51 +0100 Subject: rseq: Move slice_ext_nsec to debugfs Move changing the slice ext duration to debugfs, a sliglty less permanent interface. 
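[ Editor's note: a minimal sketch of adjusting the relocated knob from userspace. The path assumes debugfs is mounted at /sys/kernel/debug; the directory and file names follow debugfs_create_dir("rseq", ...) and the "slice_ext_nsec" file added below. At this point in the series the accepted range is 10000-50000 ns (a later patch lowers the minimum to 5000 ns). ]

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* debugfs assumed mounted at /sys/kernel/debug */
	FILE *f = fopen("/sys/kernel/debug/rseq/slice_ext_nsec", "w");

	if (!f) {
		perror("slice_ext_nsec");
		return EXIT_FAILURE;
	}
	/* Must stay within the min/max bounds enforced by the write handler */
	fprintf(f, "20000\n");
	if (fclose(f))
		perror("slice_ext_nsec write");
	return 0;
}
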
Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260121143207.923520192@infradead.org --- Documentation/admin-guide/sysctl/kernel.rst | 11 ----- Documentation/userspace-api/rseq.rst | 4 +- kernel/rseq.c | 69 +++++++++++++++++++---------- 3 files changed, 49 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index b09d18e0f75b..239da22c4e28 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -1248,17 +1248,6 @@ reboot-cmd (SPARC only) ROM/Flash boot loader. Maybe to tell it what to do after rebooting. ??? -rseq_slice_extension_nsec -========================= - -A task can request to delay its scheduling if it is in a critical section -via the prctl(PR_RSEQ_SLICE_EXTENSION_SET) mechanism. This sets the maximum -allowed extension in nanoseconds before scheduling of the task is enforced. -Default value is 10000ns (10us). The possible range is 10000ns (10us) to -50000ns (50us). - -This value has a direct correlation to the worst case scheduling latency; -increment at your own risk. sched_energy_aware ================== diff --git a/Documentation/userspace-api/rseq.rst b/Documentation/userspace-api/rseq.rst index e1fdb0d5ce69..29af6c300396 100644 --- a/Documentation/userspace-api/rseq.rst +++ b/Documentation/userspace-api/rseq.rst @@ -79,7 +79,9 @@ slice extension by setting rseq::slice_ctrl::request to 1. If the thread is interrupted and the interrupt results in a reschedule request in the kernel, then the kernel can grant a time slice extension and return to userspace instead of scheduling out. The length of the extension is -determined by the ``rseq_slice_extension_nsec`` sysctl. +determined by debugfs:rseq/slice_ext_nsec. The default value is 10 usec; which +is the minimum value. It can be incremented to 50 usecs, however doing so +can/will affect the minimum scheduling latency. The kernel indicates the grant by clearing rseq::slice_ctrl::request and setting rseq::slice_ctrl::granted to 1. 
If there is a reschedule of the diff --git a/kernel/rseq.c b/kernel/rseq.c index 1c5490a172a8..e423a9bc0a2c 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -123,7 +123,6 @@ void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, } #endif /* CONFIG_TRACEPOINTS */ -#ifdef CONFIG_DEBUG_FS #ifdef CONFIG_RSEQ_STATS DEFINE_PER_CPU(struct rseq_stats, rseq_stats); @@ -222,16 +221,19 @@ static const struct file_operations debug_ops = { .release = single_release, }; +static void rseq_slice_ext_init(struct dentry *root_dir); + static int __init rseq_debugfs_init(void) { struct dentry *root_dir = debugfs_create_dir("rseq", NULL); debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops); rseq_stats_init(root_dir); + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) + rseq_slice_ext_init(root_dir); return 0; } __initcall(rseq_debugfs_init); -#endif /* CONFIG_DEBUG_FS */ static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id) { @@ -515,7 +517,9 @@ struct slice_timer { void *cookie; }; -unsigned int rseq_slice_ext_nsecs __read_mostly = 10 * NSEC_PER_USEC; +static const unsigned int rseq_slice_ext_nsecs_min = 10 * NSEC_PER_USEC; +static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC; +unsigned int rseq_slice_ext_nsecs __read_mostly = rseq_slice_ext_nsecs_min; static DEFINE_PER_CPU(struct slice_timer, slice_timer); DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key); @@ -761,30 +765,48 @@ SYSCALL_DEFINE0(rseq_slice_yield) return yielded; } -#ifdef CONFIG_SYSCTL -static const unsigned int rseq_slice_ext_nsecs_min = 10 * NSEC_PER_USEC; -static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC; +static int rseq_slice_ext_show(struct seq_file *m, void *p) +{ + seq_printf(m, "%d\n", rseq_slice_ext_nsecs); + return 0; +} + +static ssize_t rseq_slice_ext_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + unsigned int nsecs; + + if (kstrtouint_from_user(ubuf, count, 10, &nsecs)) + return -EINVAL; + + if (nsecs < rseq_slice_ext_nsecs_min) + return -ERANGE; -static const struct ctl_table rseq_slice_ext_sysctl[] = { - { - .procname = "rseq_slice_extension_nsec", - .data = &rseq_slice_ext_nsecs, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_douintvec_minmax, - .extra1 = (unsigned int *)&rseq_slice_ext_nsecs_min, - .extra2 = (unsigned int *)&rseq_slice_ext_nsecs_max, - }, + if (nsecs > rseq_slice_ext_nsecs_max) + return -ERANGE; + + rseq_slice_ext_nsecs = nsecs; + + return count; +} + +static int rseq_slice_ext_open(struct inode *inode, struct file *file) +{ + return single_open(file, rseq_slice_ext_show, inode->i_private); +} + +static const struct file_operations slice_ext_ops = { + .open = rseq_slice_ext_open, + .read = seq_read, + .write = rseq_slice_ext_write, + .llseek = seq_lseek, + .release = single_release, }; -static void rseq_slice_sysctl_init(void) +static void rseq_slice_ext_init(struct dentry *root_dir) { - if (rseq_slice_extension_enabled()) - register_sysctl_init("kernel", rseq_slice_ext_sysctl); + debugfs_create_file("slice_ext_nsec", 0644, root_dir, NULL, &slice_ext_ops); } -#else /* CONFIG_SYSCTL */ -static inline void rseq_slice_sysctl_init(void) { } -#endif /* !CONFIG_SYSCTL */ static int __init rseq_slice_cmdline(char *str) { @@ -807,8 +829,9 @@ static int __init rseq_slice_init(void) hrtimer_setup(per_cpu_ptr(&slice_timer.timer, cpu), rseq_slice_expired, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD); } - rseq_slice_sysctl_init(); return 0; } 
device_initcall(rseq_slice_init); +#else +static void rseq_slice_ext_init(struct dentry *root_dir) { } #endif /* CONFIG_RSEQ_SLICE_EXTENSION */ -- cgit v1.2.3 From 21c0e92d0681fbd10ac024311bd09bca439e0bb1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 21 Jan 2026 14:25:04 +0100 Subject: rseq: Lower default slice extension Change the minimum slice extension to 5 usec. Since slice_test selftest reaches a staggering ~350 nsec extension: Task: slice_test Mean: 350.266 ns Latency (us) | Count ------------------------------ EXPIRED | 238 0 us | 143189 1 us | 167 2 us | 26 3 us | 11 4 us | 28 5 us | 31 6 us | 22 7 us | 23 8 us | 32 9 us | 16 10 us | 35 Lower the minimal (and default) value to 5 usecs -- which is still massive. Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260121143208.073200729@infradead.org --- Documentation/userspace-api/rseq.rst | 2 +- kernel/rseq.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/userspace-api/rseq.rst b/Documentation/userspace-api/rseq.rst index 29af6c300396..468f6bbe0e25 100644 --- a/Documentation/userspace-api/rseq.rst +++ b/Documentation/userspace-api/rseq.rst @@ -79,7 +79,7 @@ slice extension by setting rseq::slice_ctrl::request to 1. If the thread is interrupted and the interrupt results in a reschedule request in the kernel, then the kernel can grant a time slice extension and return to userspace instead of scheduling out. The length of the extension is -determined by debugfs:rseq/slice_ext_nsec. The default value is 10 usec; which +determined by debugfs:rseq/slice_ext_nsec. The default value is 5 usec; which is the minimum value. It can be incremented to 50 usecs, however doing so can/will affect the minimum scheduling latency. diff --git a/kernel/rseq.c b/kernel/rseq.c index e423a9bc0a2c..b0973d19f366 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -517,7 +517,7 @@ struct slice_timer { void *cookie; }; -static const unsigned int rseq_slice_ext_nsecs_min = 10 * NSEC_PER_USEC; +static const unsigned int rseq_slice_ext_nsecs_min = 5 * NSEC_PER_USEC; static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC; unsigned int rseq_slice_ext_nsecs __read_mostly = rseq_slice_ext_nsecs_min; static DEFINE_PER_CPU(struct slice_timer, slice_timer); -- cgit v1.2.3 From 5d6446f409da00e5a389125ddb5ce09f5bc404c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 Jan 2026 11:38:34 +0100 Subject: hrtimer: Fix trace oddity It turns out that __run_hrtimer() will trace like: -0 [032] d.h2. 20705.474563: hrtimer_cancel: hrtimer=0xff2db8f77f8226e8 -0 [032] d.h1. 20705.474563: hrtimer_expire_entry: hrtimer=0xff2db8f77f8226e8 now=20699452001850 function=tick_nohz_handler/0x0 Which is a bit nonsensical, the timer doesn't get canceled on expiration. The cause is the use of the incorrect debug helper. 
Fixes: c6a2a1770245 ("hrtimer: Add tracepoint for hrtimers") Reported-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260121143208.219595606@infradead.org --- kernel/time/hrtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f8ea8c8fc895..d8935ab10a2f 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1742,7 +1742,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, lockdep_assert_held(&cpu_base->lock); - debug_deactivate(timer); + debug_hrtimer_deactivate(timer); base->running = timer; /* -- cgit v1.2.3 From 4b603f1551a73e2868b9e7a14b3938c23275cefb Mon Sep 17 00:00:00 2001 From: Shubhang Kaushik Date: Wed, 21 Jan 2026 01:31:53 -0800 Subject: sched: Update rq->avg_idle when a task is moved to an idle CPU Currently, rq->idle_stamp is only used to calculate avg_idle during wakeups. This means other paths that move a task to an idle CPU such as fork/clone, execve, or migrations, do not end the CPU's idle status in the scheduler's eyes, leading to an inaccurate avg_idle. This patch introduces update_rq_avg_idle() to provide a more accurate measurement of CPU idle duration. By invoking this helper in put_prev_task_idle(), we ensure avg_idle is updated whenever a CPU stops being idle, regardless of how the new task arrived. Testing on an 80-core Ampere Altra (ARMv8) with 6.19-rc5 baseline: - Hackbench : +7.2% performance gain at 16 threads. - Schbench: Reduced p99.9 tail latencies at high concurrency. Signed-off-by: Shubhang Kaushik Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Tested-by: Shubhang Kaushik Link: https://patch.msgid.link/20260121-v8-patch-series-v8-1-b7f1cbee5055@os.amperecomputing.com --- kernel/sched/core.c | 24 ++++++++++++------------ kernel/sched/idle.c | 1 + kernel/sched/sched.h | 1 + 3 files changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3cca012d1259..c5431afe23b0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3613,6 +3613,18 @@ static inline void ttwu_do_wakeup(struct task_struct *p) trace_sched_wakeup(p); } +void update_rq_avg_idle(struct rq *rq) +{ + u64 delta = rq_clock(rq) - rq->idle_stamp; + u64 max = 2*rq->max_idle_balance_cost; + + update_avg(&rq->avg_idle, delta); + + if (rq->avg_idle > max) + rq->avg_idle = max; + rq->idle_stamp = 0; +} + static void ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, struct rq_flags *rf) @@ -3648,18 +3660,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, p->sched_class->task_woken(rq, p); rq_repin_lock(rq, rf); } - - if (rq->idle_stamp) { - u64 delta = rq_clock(rq) - rq->idle_stamp; - u64 max = 2*rq->max_idle_balance_cost; - - update_avg(&rq->avg_idle, delta); - - if (rq->avg_idle > max) - rq->avg_idle = max; - - rq->idle_stamp = 0; - } } /* diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 65eb8f8c1a5d..aba5ad53c07d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -460,6 +460,7 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct t { update_curr_idle(rq); scx_update_idle(rq, false, true); + update_rq_avg_idle(rq); } static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 58c9d244f12b..127633b1377b 100644 --- 
a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1670,6 +1670,7 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) #endif /* !CONFIG_FAIR_GROUP_SCHED */ +extern void update_rq_avg_idle(struct rq *rq); extern void update_rq_clock(struct rq *rq); /* -- cgit v1.2.3 From 03150a9f84b328f5c724b8ed9ff8600c2d7e2d7b Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Wed, 28 Jan 2026 11:19:21 +0800 Subject: entry: Remove unused syscall argument from syscall_trace_enter() The 'syscall' argument of syscall_trace_enter() is immediately overwritten before any real use and serves only as a local variable, so drop the parameter. No functional change intended. Signed-off-by: Jinjie Ruan Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260128031934.3906955-2-ruanjinjie@huawei.com --- include/linux/entry-common.h | 4 ++-- kernel/entry/syscall-common.c | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 87efb38b7081..e4a8287af822 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -45,7 +45,7 @@ SYSCALL_WORK_SYSCALL_EXIT_TRAP | \ ARCH_SYSCALL_WORK_EXIT) -long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work); +long syscall_trace_enter(struct pt_regs *regs, unsigned long work); /** * syscall_enter_from_user_mode_work - Check and handle work before invoking @@ -75,7 +75,7 @@ static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *re unsigned long work = READ_ONCE(current_thread_info()->syscall_work); if (work & SYSCALL_WORK_ENTER) - syscall = syscall_trace_enter(regs, syscall, work); + syscall = syscall_trace_enter(regs, work); return syscall; } diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c index 940a597ded40..e6237b536d8b 100644 --- a/kernel/entry/syscall-common.c +++ b/kernel/entry/syscall-common.c @@ -17,10 +17,9 @@ static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) } } -long syscall_trace_enter(struct pt_regs *regs, long syscall, - unsigned long work) +long syscall_trace_enter(struct pt_regs *regs, unsigned long work) { - long ret = 0; + long syscall, ret = 0; /* * Handle Syscall User Dispatch. This must comes first, since -- cgit v1.2.3 From 578b21fd3ab2d9901ce40ed802e428a41a40610d Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Wed, 28 Jan 2026 11:19:30 +0800 Subject: entry: Add arch_ptrace_report_syscall_entry/exit() ARM64 requires a architecture specific ptrace wrapper as it needs to save and restore scratch registers. Provide arch_ptrace_report_syscall_entry/exit() wrappers which fall back to ptrace_report_syscall_entry/exit() if the architecture does not provide them. No functional change intended. 
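[ Editor's note: a hypothetical sketch of how an architecture would hook the wrapper introduced below, placed in its asm-level entry header. The save/restore-around-report pattern follows the arm64 motivation stated above, but the particular register (regs->regs[0]) is purely illustrative; the real arm64 implementation is a separate architecture patch and is not shown here. ]

static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs)
{
	/* Preserve a scratch register the ptrace report may clobber (illustrative) */
	unsigned long saved_reg = regs->regs[0];
	int ret = ptrace_report_syscall_entry(regs);

	regs->regs[0] = saved_reg;
	return ret;
}
#define arch_ptrace_report_syscall_entry arch_ptrace_report_syscall_entry
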
[ tglx: Massaged changelog and comments ] Suggested-by: Mark Rutland Suggested-by: Thomas Gleixner Signed-off-by: Jinjie Ruan Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Brodsky Link: https://patch.msgid.link/20260128031934.3906955-11-ruanjinjie@huawei.com --- include/linux/entry-common.h | 36 ++++++++++++++++++++++++++++++++++++ kernel/entry/syscall-common.c | 4 ++-- 2 files changed, 38 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 5316004940c0..bea207e32c58 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -45,6 +45,24 @@ SYSCALL_WORK_SYSCALL_EXIT_TRAP | \ ARCH_SYSCALL_WORK_EXIT) +/** + * arch_ptrace_report_syscall_entry - Architecture specific ptrace_report_syscall_entry() wrapper + * + * Invoked from syscall_trace_enter() to wrap ptrace_report_syscall_entry(). + * + * This allows architecture specific ptrace_report_syscall_entry() + * implementations. If not defined by the architecture this falls back to + * to ptrace_report_syscall_entry(). + */ +static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs); + +#ifndef arch_ptrace_report_syscall_entry +static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs) +{ + return ptrace_report_syscall_entry(regs); +} +#endif + long syscall_trace_enter(struct pt_regs *regs, unsigned long work); /** @@ -112,6 +130,24 @@ static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, l return ret; } +/** + * arch_ptrace_report_syscall_exit - Architecture specific ptrace_report_syscall_exit() + * + * This allows architecture specific ptrace_report_syscall_exit() + * implementations. If not defined by the architecture this falls back to + * to ptrace_report_syscall_exit(). + */ +static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs, + int step); + +#ifndef arch_ptrace_report_syscall_exit +static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs, + int step) +{ + ptrace_report_syscall_exit(regs, step); +} +#endif + /** * syscall_exit_work - Handle work before returning to user mode * @regs: Pointer to current pt_regs diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c index e6237b536d8b..bb5f61f5629d 100644 --- a/kernel/entry/syscall-common.c +++ b/kernel/entry/syscall-common.c @@ -33,7 +33,7 @@ long syscall_trace_enter(struct pt_regs *regs, unsigned long work) /* Handle ptrace */ if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { - ret = ptrace_report_syscall_entry(regs); + ret = arch_ptrace_report_syscall_entry(regs); if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) return -1L; } @@ -99,5 +99,5 @@ void syscall_exit_work(struct pt_regs *regs, unsigned long work) step = report_single_step(work); if (step || work & SYSCALL_WORK_SYSCALL_TRACE) - ptrace_report_syscall_exit(regs, step); + arch_ptrace_report_syscall_exit(regs, step); } -- cgit v1.2.3 From 31c9387d0d84bc1d643a0c30155b6d92d05c92fc Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Wed, 28 Jan 2026 11:19:33 +0800 Subject: entry: Inline syscall_exit_work() and syscall_trace_enter() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After switching ARM64 to the generic entry code, a syscall_exit_work() appeared as a profiling hotspot because it is not inlined. Inlining both syscall_trace_enter() and syscall_exit_work() provides a performance gain when any of the work items is enabled. 
With audit enabled this results in a ~4% performance gain for perf bench basic syscall on a kunpeng920 system: | Metric | Baseline | Inlined | Change | | ---------- | ----------- | ----------- | ------ | | Total time | 2.353 [sec] | 2.264 [sec] | ↓3.8% | | usecs/op | 0.235374 | 0.226472 | ↓3.8% | | ops/sec | 4,248,588 | 4,415,554 | ↑3.9% | Small gains can be observed on x86 as well, though the generated code optimizes for the work case, which is counterproductive for high performance scenarios where such entry/exit work is usually avoided. Avoid this by marking the work check in syscall_enter_from_user_mode_work() unlikely, which is what the corresponding check in the exit path does already. [ tglx: Massage changelog and add the unlikely() ] Signed-off-by: Jinjie Ruan Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260128031934.3906955-14-ruanjinjie@huawei.com --- include/linux/entry-common.h | 94 ++++++++++++++++++++++++++++++++++- kernel/entry/common.h | 7 --- kernel/entry/syscall-common.c | 96 +++--------------------------------- kernel/entry/syscall_user_dispatch.c | 4 +- 4 files changed, 102 insertions(+), 99 deletions(-) delete mode 100644 kernel/entry/common.h (limited to 'kernel') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index bea207e32c58..e67e3afe39ad 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -2,6 +2,7 @@ #ifndef __LINUX_ENTRYCOMMON_H #define __LINUX_ENTRYCOMMON_H +#include #include #include #include @@ -63,7 +64,58 @@ static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs } #endif -long syscall_trace_enter(struct pt_regs *regs, unsigned long work); +bool syscall_user_dispatch(struct pt_regs *regs); +long trace_syscall_enter(struct pt_regs *regs, long syscall); +void trace_syscall_exit(struct pt_regs *regs, long ret); + +static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) +{ + if (unlikely(audit_context())) { + unsigned long args[6]; + + syscall_get_arguments(current, regs, args); + audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]); + } +} + +static __always_inline long syscall_trace_enter(struct pt_regs *regs, unsigned long work) +{ + long syscall, ret = 0; + + /* + * Handle Syscall User Dispatch. This must comes first, since + * the ABI here can be something that doesn't make sense for + * other syscall_work features. + */ + if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { + if (syscall_user_dispatch(regs)) + return -1L; + } + + /* Handle ptrace */ + if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { + ret = arch_ptrace_report_syscall_entry(regs); + if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) + return -1L; + } + + /* Do seccomp after ptrace, to catch any tracer changes. */ + if (work & SYSCALL_WORK_SECCOMP) { + ret = __secure_computing(); + if (ret == -1L) + return ret; + } + + /* Either of the above might have changed the syscall number */ + syscall = syscall_get_nr(current, regs); + + if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) + syscall = trace_syscall_enter(regs, syscall); + + syscall_enter_audit(regs, syscall); + + return ret ? : syscall; +} /** * syscall_enter_from_user_mode_work - Check and handle work before invoking @@ -130,6 +182,19 @@ static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, l return ret; } +/* + * If SYSCALL_EMU is set, then the only reason to report is when + * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). 
This syscall + * instruction has been already reported in syscall_enter_from_user_mode(). + */ +static __always_inline bool report_single_step(unsigned long work) +{ + if (work & SYSCALL_WORK_SYSCALL_EMU) + return false; + + return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP; +} + /** * arch_ptrace_report_syscall_exit - Architecture specific ptrace_report_syscall_exit() * @@ -155,7 +220,32 @@ static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs * * Do one-time syscall specific work. */ -void syscall_exit_work(struct pt_regs *regs, unsigned long work); +static __always_inline void syscall_exit_work(struct pt_regs *regs, unsigned long work) +{ + bool step; + + /* + * If the syscall was rolled back due to syscall user dispatching, + * then the tracers below are not invoked for the same reason as + * the entry side was not invoked in syscall_trace_enter(): The ABI + * of these syscalls is unknown. + */ + if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { + if (unlikely(current->syscall_dispatch.on_dispatch)) { + current->syscall_dispatch.on_dispatch = false; + return; + } + } + + audit_syscall_exit(regs); + + if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) + trace_syscall_exit(regs, syscall_get_return_value(current, regs)); + + step = report_single_step(work); + if (step || work & SYSCALL_WORK_SYSCALL_TRACE) + arch_ptrace_report_syscall_exit(regs, step); +} /** * syscall_exit_to_user_mode_work - Handle one time work before returning to user mode diff --git a/kernel/entry/common.h b/kernel/entry/common.h deleted file mode 100644 index f6e6d02f07fe..000000000000 --- a/kernel/entry/common.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _COMMON_H -#define _COMMON_H - -bool syscall_user_dispatch(struct pt_regs *regs); - -#endif diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c index bb5f61f5629d..cd4967a9c53e 100644 --- a/kernel/entry/syscall-common.c +++ b/kernel/entry/syscall-common.c @@ -1,103 +1,23 @@ // SPDX-License-Identifier: GPL-2.0 -#include #include -#include "common.h" #define CREATE_TRACE_POINTS #include -static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) -{ - if (unlikely(audit_context())) { - unsigned long args[6]; - - syscall_get_arguments(current, regs, args); - audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]); - } -} +/* Out of line to prevent tracepoint code duplication */ -long syscall_trace_enter(struct pt_regs *regs, unsigned long work) +long trace_syscall_enter(struct pt_regs *regs, long syscall) { - long syscall, ret = 0; - + trace_sys_enter(regs, syscall); /* - * Handle Syscall User Dispatch. This must comes first, since - * the ABI here can be something that doesn't make sense for - * other syscall_work features. + * Probes or BPF hooks in the tracepoint may have changed the + * system call number. Reread it. */ - if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { - if (syscall_user_dispatch(regs)) - return -1L; - } - - /* Handle ptrace */ - if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { - ret = arch_ptrace_report_syscall_entry(regs); - if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) - return -1L; - } - - /* Do seccomp after ptrace, to catch any tracer changes. 
*/ - if (work & SYSCALL_WORK_SECCOMP) { - ret = __secure_computing(); - if (ret == -1L) - return ret; - } - - /* Either of the above might have changed the syscall number */ - syscall = syscall_get_nr(current, regs); - - if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) { - trace_sys_enter(regs, syscall); - /* - * Probes or BPF hooks in the tracepoint may have changed the - * system call number as well. - */ - syscall = syscall_get_nr(current, regs); - } - - syscall_enter_audit(regs, syscall); - - return ret ? : syscall; + return syscall_get_nr(current, regs); } -/* - * If SYSCALL_EMU is set, then the only reason to report is when - * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall - * instruction has been already reported in syscall_enter_from_user_mode(). - */ -static inline bool report_single_step(unsigned long work) +void trace_syscall_exit(struct pt_regs *regs, long ret) { - if (work & SYSCALL_WORK_SYSCALL_EMU) - return false; - - return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP; -} - -void syscall_exit_work(struct pt_regs *regs, unsigned long work) -{ - bool step; - - /* - * If the syscall was rolled back due to syscall user dispatching, - * then the tracers below are not invoked for the same reason as - * the entry side was not invoked in syscall_trace_enter(): The ABI - * of these syscalls is unknown. - */ - if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { - if (unlikely(current->syscall_dispatch.on_dispatch)) { - current->syscall_dispatch.on_dispatch = false; - return; - } - } - - audit_syscall_exit(regs); - - if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) - trace_sys_exit(regs, syscall_get_return_value(current, regs)); - - step = report_single_step(work); - if (step || work & SYSCALL_WORK_SYSCALL_TRACE) - arch_ptrace_report_syscall_exit(regs, step); + trace_sys_exit(regs, ret); } diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c index a9055eccb27e..d89dffcc2d64 100644 --- a/kernel/entry/syscall_user_dispatch.c +++ b/kernel/entry/syscall_user_dispatch.c @@ -2,6 +2,8 @@ /* * Copyright (C) 2020 Collabora Ltd. */ + +#include #include #include #include @@ -15,8 +17,6 @@ #include -#include "common.h" - static void trigger_sigsys(struct pt_regs *regs) { struct kernel_siginfo info; -- cgit v1.2.3 From 3cb3b27693bf30defb16aa096158a3b24583b8d2 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 26 Jan 2026 10:58:59 +0100 Subject: sched/deadline: Clear the defer params The defer params were not cleared in __dl_clear_params. Clear them. Without this is some of my test cases are flaking and the DL timer is not starting correctly AFAICS. 
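The bug pattern being fixed is a reset helper that clears only part of a structure's state, leaving flags that were added later stale when the entity is reused. A minimal sketch of the pitfall and the fix, with illustrative field names rather than the scheduler's:

#include <stdbool.h>
#include <stdio.h>

struct server {
    unsigned long long runtime;
    unsigned long long period;
    bool throttled;
    bool defer;         /* added later; easy to miss in reset paths */
    bool defer_armed;
};

/* Buggy variant: forgets the defer flags, so a reused entity keeps them. */
static void clear_params_buggy(struct server *s)
{
    s->runtime = 0;
    s->period = 0;
    s->throttled = false;
}

/* Fixed variant: every flag the entity can carry is cleared explicitly. */
static void clear_params(struct server *s)
{
    s->runtime = 0;
    s->period = 0;
    s->throttled = false;
    s->defer = false;
    s->defer_armed = false;
}

int main(void)
{
    struct server s = { .runtime = 50, .period = 1000,
                        .defer = true, .defer_armed = true };

    clear_params_buggy(&s);
    printf("buggy reset: defer=%d armed=%d\n", s.defer, s.defer_armed);

    s.defer = s.defer_armed = true;
    clear_params(&s);
    printf("full reset:  defer=%d armed=%d\n", s.defer, s.defer_armed);
    return 0;
}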
Fixes: a110a81c52a9 ("sched/deadline: Deferrable dl server") Signed-off-by: Joel Fernandes Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andrea Righi Acked-by: Juri Lelli Tested-by: Christian Loehle Link: https://patch.msgid.link/20260126100050.3854740-2-arighi@nvidia.com --- kernel/sched/deadline.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 82e7a214b961..7e181ec02220 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -3660,6 +3660,9 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se) dl_se->dl_non_contending = 0; dl_se->dl_overrun = 0; dl_se->dl_server = 0; + dl_se->dl_defer = 0; + dl_se->dl_defer_running = 0; + dl_se->dl_defer_armed = 0; #ifdef CONFIG_RT_MUTEXES dl_se->pi_se = dl_se; -- cgit v1.2.3 From 6080fb211672aec6ce8f2f5a2e0b4eae736f2027 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 26 Jan 2026 10:59:00 +0100 Subject: sched/debug: Fix updating of ppos on server write ops Updating "ppos" on error conditions does not make much sense. The pattern is to return the error code directly without modifying the position, or modify the position on success and return the number of bytes written. Since on success, the return value of apply is 0, there is no point in modifying ppos either. Fix it by removing all this and just returning error code or number of bytes written on success. Signed-off-by: Joel Fernandes Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Reviewed-by: Andrea Righi Acked-by: Tejun Heo Tested-by: Christian Loehle Link: https://patch.msgid.link/20260126100050.3854740-3-arighi@nvidia.com --- kernel/sched/debug.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 929fdf09e8e9..ed9254da5116 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -339,8 +339,8 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu long cpu = (long) ((struct seq_file *) filp->private_data)->private; struct rq *rq = cpu_rq(cpu); u64 runtime, period; + int retval = 0; size_t err; - int retval; u64 value; err = kstrtoull_from_user(ubuf, cnt, 10, &value); @@ -374,8 +374,6 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu dl_server_stop(&rq->fair_server); retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0); - if (retval) - cnt = retval; if (!runtime) printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n", @@ -383,6 +381,9 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu if (rq->cfs.h_nr_queued) dl_server_start(&rq->fair_server); + + if (retval < 0) + return retval; } *ppos += cnt; -- cgit v1.2.3 From 68ec89d0e99156803bdea3c986c0198624e40ea2 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 26 Jan 2026 10:59:01 +0100 Subject: sched/debug: Stop and start server based on if it was active Currently the DL server interface for applying parameters checks CFS-internals to identify if the server is active. This is error-prone and makes it difficult when adding new servers in the future. Fix it, by using dl_server_active() which is also used by the DL server code to determine if the DL server was started. 
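The shape of the resulting update path can be sketched as: remember whether the server was active using its own helper, stop it, apply the new parameters, and restart it only if it was running and still has runtime. A hedged model with stand-in helper names:

#include <stdbool.h>
#include <stdio.h>

struct dl_server {
    unsigned long long runtime;
    unsigned long long period;
    bool active;
};

static bool server_active(const struct dl_server *s) { return s->active; }
static void server_stop(struct dl_server *s)  { s->active = false; }
static void server_start(struct dl_server *s) { s->active = true; }

static int apply_params(struct dl_server *s,
                        unsigned long long runtime, unsigned long long period)
{
    if (runtime > period)
        return -1;      /* reject infeasible reservations */
    s->runtime = runtime;
    s->period = period;
    return 0;
}

/* Update parameters without peeking at another class's internal counters. */
static int update_server(struct dl_server *s,
                         unsigned long long runtime, unsigned long long period)
{
    bool was_active = server_active(s);
    int ret;

    if (was_active)
        server_stop(s);

    ret = apply_params(s, runtime, period);

    if (was_active && s->runtime)
        server_start(s);

    return ret;
}

int main(void)
{
    struct dl_server s = { .runtime = 50, .period = 1000, .active = true };

    printf("update -> %d, active=%d\n", update_server(&s, 100, 1000), s.active);
    return 0;
}

Note that a later patch in this series simplifies this further by making the start path itself refuse to run a zero-runtime server; the sketch reflects the behaviour at this point in the series.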
Signed-off-by: Joel Fernandes Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Reviewed-by: Andrea Righi Acked-by: Tejun Heo Tested-by: Christian Loehle Link: https://patch.msgid.link/20260126100050.3854740-4-arighi@nvidia.com --- kernel/sched/debug.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index ed9254da5116..41e389569d06 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -348,6 +348,8 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu return err; scoped_guard (rq_lock_irqsave, rq) { + bool is_active; + runtime = rq->fair_server.dl_runtime; period = rq->fair_server.dl_period; @@ -370,8 +372,11 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu return -EINVAL; } - update_rq_clock(rq); - dl_server_stop(&rq->fair_server); + is_active = dl_server_active(&rq->fair_server); + if (is_active) { + update_rq_clock(rq); + dl_server_stop(&rq->fair_server); + } retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0); @@ -379,7 +384,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n", cpu_of(rq)); - if (rq->cfs.h_nr_queued) + if (is_active && runtime) dl_server_start(&rq->fair_server); if (retval < 0) -- cgit v1.2.3 From cd959a3562050d1c676be37f1d256a96cb067868 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Mon, 26 Jan 2026 10:59:02 +0100 Subject: sched_ext: Add a DL server for sched_ext tasks sched_ext currently suffers starvation due to RT. The same workload when converted to EXT can get zero runtime if RT is 100% running, causing EXT processes to stall. Fix it by adding a DL server for EXT. 
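The reservation behind the figures in the test output below is a plain runtime/period bandwidth: assuming the same defaults as the fair server (50 ms of runtime every 1 s period, an assumption here, not something this patch states), the server guarantees roughly 5% of the CPU to the otherwise starved class, which is exactly what the kselftest checks. A quick back-of-the-envelope helper:

#include <stdio.h>

/* Bandwidth reserved by a deadline server: runtime / period. */
static double server_share(double runtime_ns, double period_ns)
{
    return runtime_ns / period_ns;
}

int main(void)
{
    /* Assumed defaults: 50 ms of runtime every 1 s. */
    double runtime = 50e6, period = 1e9;

    printf("reserved share: %.2f%%\n", 100.0 * server_share(runtime, period));

    /* Over a 5 s run this is the floor a starved class should still get. */
    printf("guaranteed runtime over 5 s: %.2f s\n",
           5.0 * server_share(runtime, period));
    return 0;
}

That works out to 5.00% and 0.25 s over a 5 second run, matching the per-task runtimes reported by the test below.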
A kselftest is also included later to confirm that both DL servers are functioning correctly: # ./runner -t rt_stall ===== START ===== TEST: rt_stall DESCRIPTION: Verify that RT tasks cannot stall SCHED_EXT tasks OUTPUT: TAP version 13 1..1 # Runtime of FAIR task (PID 1511) is 0.250000 seconds # Runtime of RT task (PID 1512) is 4.750000 seconds # FAIR task got 5.00% of total runtime ok 1 PASS: FAIR task got more than 4.00% of runtime TAP version 13 1..1 # Runtime of EXT task (PID 1514) is 0.250000 seconds # Runtime of RT task (PID 1515) is 4.750000 seconds # EXT task got 5.00% of total runtime ok 2 PASS: EXT task got more than 4.00% of runtime TAP version 13 1..1 # Runtime of FAIR task (PID 1517) is 0.250000 seconds # Runtime of RT task (PID 1518) is 4.750000 seconds # FAIR task got 5.00% of total runtime ok 3 PASS: FAIR task got more than 4.00% of runtime TAP version 13 1..1 # Runtime of EXT task (PID 1521) is 0.250000 seconds # Runtime of RT task (PID 1522) is 4.750000 seconds # EXT task got 5.00% of total runtime ok 4 PASS: EXT task got more than 4.00% of runtime ok 1 rt_stall # ===== END ===== Co-developed-by: Joel Fernandes Signed-off-by: Joel Fernandes Signed-off-by: Andrea Righi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Tested-by: Christian Loehle Link: https://patch.msgid.link/20260126100050.3854740-5-arighi@nvidia.com --- kernel/sched/core.c | 6 ++++ kernel/sched/deadline.c | 83 +++++++++++++++++++++++++++++++++++-------------- kernel/sched/ext.c | 33 ++++++++++++++++++++ kernel/sched/idle.c | 3 ++ kernel/sched/sched.h | 2 ++ kernel/sched/topology.c | 5 +++ 6 files changed, 109 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 260633e6b212..8f2dc0a941ef 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8484,6 +8484,9 @@ int sched_cpu_dying(unsigned int cpu) dump_rq_tasks(rq, KERN_WARNING); } dl_server_stop(&rq->fair_server); +#ifdef CONFIG_SCHED_CLASS_EXT + dl_server_stop(&rq->ext_server); +#endif rq_unlock_irqrestore(rq, &rf); calc_load_migrate(rq); @@ -8689,6 +8692,9 @@ void __init sched_init(void) hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); fair_server_init(rq); +#ifdef CONFIG_SCHED_CLASS_EXT + ext_server_init(rq); +#endif #ifdef CONFIG_SCHED_CORE rq->core = rq; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 7e181ec02220..eae14e57adf1 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1449,8 +1449,8 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 dl_se->dl_defer_idle = 0; /* - * The fair server can consume its runtime while throttled (not queued/ - * running as regular CFS). + * The DL server can consume its runtime while throttled (not + * queued / running as regular CFS). * * If the server consumes its entire runtime in this state. The server * is not required for the current period. Thus, reset the server by @@ -1535,10 +1535,10 @@ throttle: } /* - * The fair server (sole dl_server) does not account for real-time - * workload because it is running fair work. + * The dl_server does not account for real-time workload because it + * is running fair work. */ - if (dl_se == &rq->fair_server) + if (dl_se->dl_server) return; #ifdef CONFIG_RT_GROUP_SCHED @@ -1573,9 +1573,9 @@ throttle: * In the non-defer mode, the idle time is not accounted, as the * server provides a guarantee. 
* - * If the dl_server is in defer mode, the idle time is also considered - * as time available for the fair server, avoiding a penalty for the - * rt scheduler that did not consumed that time. + * If the dl_server is in defer mode, the idle time is also considered as + * time available for the dl_server, avoiding a penalty for the rt + * scheduler that did not consumed that time. */ void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec) { @@ -1860,6 +1860,18 @@ void sched_init_dl_servers(void) dl_se->dl_server = 1; dl_se->dl_defer = 1; setup_new_dl_entity(dl_se); + +#ifdef CONFIG_SCHED_CLASS_EXT + dl_se = &rq->ext_server; + + WARN_ON(dl_server(dl_se)); + + dl_server_apply_params(dl_se, runtime, period, 1); + + dl_se->dl_server = 1; + dl_se->dl_defer = 1; + setup_new_dl_entity(dl_se); +#endif } } @@ -3198,6 +3210,36 @@ void dl_add_task_root_domain(struct task_struct *p) raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); } +static void dl_server_add_bw(struct root_domain *rd, int cpu) +{ + struct sched_dl_entity *dl_se; + + dl_se = &cpu_rq(cpu)->fair_server; + if (dl_server(dl_se) && cpu_active(cpu)) + __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu)); + +#ifdef CONFIG_SCHED_CLASS_EXT + dl_se = &cpu_rq(cpu)->ext_server; + if (dl_server(dl_se) && cpu_active(cpu)) + __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu)); +#endif +} + +static u64 dl_server_read_bw(int cpu) +{ + u64 dl_bw = 0; + + if (cpu_rq(cpu)->fair_server.dl_server) + dl_bw += cpu_rq(cpu)->fair_server.dl_bw; + +#ifdef CONFIG_SCHED_CLASS_EXT + if (cpu_rq(cpu)->ext_server.dl_server) + dl_bw += cpu_rq(cpu)->ext_server.dl_bw; +#endif + + return dl_bw; +} + void dl_clear_root_domain(struct root_domain *rd) { int i; @@ -3216,12 +3258,8 @@ void dl_clear_root_domain(struct root_domain *rd) * dl_servers are not tasks. Since dl_add_task_root_domain ignores * them, we need to account for them here explicitly. */ - for_each_cpu(i, rd->span) { - struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server; - - if (dl_server(dl_se) && cpu_active(i)) - __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(i)); - } + for_each_cpu(i, rd->span) + dl_server_add_bw(rd, i); } void dl_clear_root_domain_cpu(int cpu) @@ -3720,7 +3758,7 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) unsigned long flags, cap; struct dl_bw *dl_b; bool overflow = 0; - u64 fair_server_bw = 0; + u64 dl_server_bw = 0; rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); @@ -3753,27 +3791,26 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) cap -= arch_scale_cpu_capacity(cpu); /* - * cpu is going offline and NORMAL tasks will be moved away - * from it. We can thus discount dl_server bandwidth - * contribution as it won't need to be servicing tasks after - * the cpu is off. + * cpu is going offline and NORMAL and EXT tasks will be + * moved away from it. We can thus discount dl_server + * bandwidth contribution as it won't need to be servicing + * tasks after the cpu is off. */ - if (cpu_rq(cpu)->fair_server.dl_server) - fair_server_bw = cpu_rq(cpu)->fair_server.dl_bw; + dl_server_bw = dl_server_read_bw(cpu); /* * Not much to check if no DEADLINE bandwidth is present. * dl_servers we can discount, as tasks will be moved out the * offlined CPUs anyway. */ - if (dl_b->total_bw - fair_server_bw > 0) { + if (dl_b->total_bw - dl_server_bw > 0) { /* * Leaving at least one CPU for DEADLINE tasks seems a * wise thing to do. As said above, cpu is not offline * yet, so account for that. 
*/ if (dl_bw_cpus(cpu) - 1) - overflow = __dl_overflow(dl_b, cap, fair_server_bw, 0); + overflow = __dl_overflow(dl_b, cap, dl_server_bw, 0); else overflow = 1; } diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index ce5e64ba8dfb..3bc49dc16021 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -958,6 +958,8 @@ static void update_curr_scx(struct rq *rq) if (!curr->scx.slice) touch_core_sched(rq, curr); } + + dl_server_update(&rq->ext_server, delta_exec); } static bool scx_dsq_priq_less(struct rb_node *node_a, @@ -1501,6 +1503,10 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags if (enq_flags & SCX_ENQ_WAKEUP) touch_core_sched(rq, p); + /* Start dl_server if this is the first task being enqueued */ + if (rq->scx.nr_running == 1) + dl_server_start(&rq->ext_server); + do_enqueue_task(rq, p, enq_flags, sticky_cpu); out: rq->scx.flags &= ~SCX_RQ_IN_WAKEUP; @@ -2512,6 +2518,33 @@ static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) return do_pick_task_scx(rq, rf, false); } +/* + * Select the next task to run from the ext scheduling class. + * + * Use do_pick_task_scx() directly with @force_scx enabled, since the + * dl_server must always select a sched_ext task. + */ +static struct task_struct * +ext_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf) +{ + if (!scx_enabled()) + return NULL; + + return do_pick_task_scx(dl_se->rq, rf, true); +} + +/* + * Initialize the ext server deadline entity. + */ +void ext_server_init(struct rq *rq) +{ + struct sched_dl_entity *dl_se = &rq->ext_server; + + init_dl_entity(dl_se); + + dl_server_init(dl_se, rq, ext_server_pick_task); +} + #ifdef CONFIG_SCHED_CORE /** * scx_prio_less - Task ordering for core-sched diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 46a9845735ff..3681b6ad9276 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -537,6 +537,9 @@ static void update_curr_idle(struct rq *rq) se->exec_start = now; dl_server_update_idle(&rq->fair_server, delta_exec); +#ifdef CONFIG_SCHED_CLASS_EXT + dl_server_update_idle(&rq->ext_server, delta_exec); +#endif } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 309101c90239..2aa4251c1520 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -414,6 +414,7 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, extern void sched_init_dl_servers(void); extern void fair_server_init(struct rq *rq); +extern void ext_server_init(struct rq *rq); extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq); extern int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init); @@ -1171,6 +1172,7 @@ struct rq { struct dl_rq dl; #ifdef CONFIG_SCHED_CLASS_EXT struct scx_rq scx; + struct sched_dl_entity ext_server; #endif struct sched_dl_entity fair_server; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index cf643a5ddedd..ac268da91778 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -508,6 +508,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) if (rq->fair_server.dl_server) __dl_server_attach_root(&rq->fair_server, rq); +#ifdef CONFIG_SCHED_CLASS_EXT + if (rq->ext_server.dl_server) + __dl_server_attach_root(&rq->ext_server, rq); +#endif + rq_unlock_irqrestore(rq, &rf); if (old_rd) -- cgit v1.2.3 From 76d12132ba459ab929cb66eb2030c666aacdb69a Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 26 Jan 2026 10:59:03 +0100 Subject: sched/debug: Add support to change 
sched_ext server params When a sched_ext server is loaded, tasks in the fair class are automatically moved to the sched_ext class. Add support to modify the ext server parameters similar to how the fair server parameters are modified. Re-use common code between ext and fair servers as needed. Co-developed-by: Andrea Righi Signed-off-by: Andrea Righi Signed-off-by: Joel Fernandes Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Tested-by: Christian Loehle Link: https://patch.msgid.link/20260126100050.3854740-6-arighi@nvidia.com --- kernel/sched/debug.c | 157 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 133 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 41e389569d06..59e650f9d436 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -330,14 +330,16 @@ enum dl_param { DL_PERIOD, }; -static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */ -static unsigned long fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */ +static unsigned long dl_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */ +static unsigned long dl_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */ -static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos, enum dl_param param) +static ssize_t sched_server_write_common(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos, enum dl_param param, + void *server) { long cpu = (long) ((struct seq_file *) filp->private_data)->private; struct rq *rq = cpu_rq(cpu); + struct sched_dl_entity *dl_se = (struct sched_dl_entity *)server; u64 runtime, period; int retval = 0; size_t err; @@ -350,8 +352,8 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu scoped_guard (rq_lock_irqsave, rq) { bool is_active; - runtime = rq->fair_server.dl_runtime; - period = rq->fair_server.dl_period; + runtime = dl_se->dl_runtime; + period = dl_se->dl_period; switch (param) { case DL_RUNTIME: @@ -367,25 +369,25 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu } if (runtime > period || - period > fair_server_period_max || - period < fair_server_period_min) { + period > dl_server_period_max || + period < dl_server_period_min) { return -EINVAL; } - is_active = dl_server_active(&rq->fair_server); + is_active = dl_server_active(dl_se); if (is_active) { update_rq_clock(rq); - dl_server_stop(&rq->fair_server); + dl_server_stop(dl_se); } - retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0); + retval = dl_server_apply_params(dl_se, runtime, period, 0); if (!runtime) - printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n", - cpu_of(rq)); + printk_deferred("%s server disabled in CPU %d, system may crash due to starvation.\n", + server == &rq->fair_server ? 
"Fair" : "Ext", cpu_of(rq)); if (is_active && runtime) - dl_server_start(&rq->fair_server); + dl_server_start(dl_se); if (retval < 0) return retval; @@ -395,36 +397,42 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu return cnt; } -static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param) +static size_t sched_server_show_common(struct seq_file *m, void *v, enum dl_param param, + void *server) { - unsigned long cpu = (unsigned long) m->private; - struct rq *rq = cpu_rq(cpu); + struct sched_dl_entity *dl_se = (struct sched_dl_entity *)server; u64 value; switch (param) { case DL_RUNTIME: - value = rq->fair_server.dl_runtime; + value = dl_se->dl_runtime; break; case DL_PERIOD: - value = rq->fair_server.dl_period; + value = dl_se->dl_period; break; } seq_printf(m, "%llu\n", value); return 0; - } static ssize_t sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME); + long cpu = (long) ((struct seq_file *) filp->private_data)->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_write_common(filp, ubuf, cnt, ppos, DL_RUNTIME, + &rq->fair_server); } static int sched_fair_server_runtime_show(struct seq_file *m, void *v) { - return sched_fair_server_show(m, v, DL_RUNTIME); + unsigned long cpu = (unsigned long) m->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_show_common(m, v, DL_RUNTIME, &rq->fair_server); } static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp) @@ -440,16 +448,57 @@ static const struct file_operations fair_server_runtime_fops = { .release = single_release, }; +#ifdef CONFIG_SCHED_CLASS_EXT +static ssize_t +sched_ext_server_runtime_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + long cpu = (long) ((struct seq_file *) filp->private_data)->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_write_common(filp, ubuf, cnt, ppos, DL_RUNTIME, + &rq->ext_server); +} + +static int sched_ext_server_runtime_show(struct seq_file *m, void *v) +{ + unsigned long cpu = (unsigned long) m->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_show_common(m, v, DL_RUNTIME, &rq->ext_server); +} + +static int sched_ext_server_runtime_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_ext_server_runtime_show, inode->i_private); +} + +static const struct file_operations ext_server_runtime_fops = { + .open = sched_ext_server_runtime_open, + .write = sched_ext_server_runtime_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_SCHED_CLASS_EXT */ + static ssize_t sched_fair_server_period_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD); + long cpu = (long) ((struct seq_file *) filp->private_data)->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_write_common(filp, ubuf, cnt, ppos, DL_PERIOD, + &rq->fair_server); } static int sched_fair_server_period_show(struct seq_file *m, void *v) { - return sched_fair_server_show(m, v, DL_PERIOD); + unsigned long cpu = (unsigned long) m->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_show_common(m, v, DL_PERIOD, &rq->fair_server); } static int sched_fair_server_period_open(struct inode *inode, struct file *filp) @@ -465,6 +514,40 @@ static const struct file_operations 
fair_server_period_fops = { .release = single_release, }; +#ifdef CONFIG_SCHED_CLASS_EXT +static ssize_t +sched_ext_server_period_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + long cpu = (long) ((struct seq_file *) filp->private_data)->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_write_common(filp, ubuf, cnt, ppos, DL_PERIOD, + &rq->ext_server); +} + +static int sched_ext_server_period_show(struct seq_file *m, void *v) +{ + unsigned long cpu = (unsigned long) m->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_show_common(m, v, DL_PERIOD, &rq->ext_server); +} + +static int sched_ext_server_period_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_ext_server_period_show, inode->i_private); +} + +static const struct file_operations ext_server_period_fops = { + .open = sched_ext_server_period_open, + .write = sched_ext_server_period_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_SCHED_CLASS_EXT */ + static struct dentry *debugfs_sched; static void debugfs_fair_server_init(void) @@ -488,6 +571,29 @@ static void debugfs_fair_server_init(void) } } +#ifdef CONFIG_SCHED_CLASS_EXT +static void debugfs_ext_server_init(void) +{ + struct dentry *d_ext; + unsigned long cpu; + + d_ext = debugfs_create_dir("ext_server", debugfs_sched); + if (!d_ext) + return; + + for_each_possible_cpu(cpu) { + struct dentry *d_cpu; + char buf[32]; + + snprintf(buf, sizeof(buf), "cpu%lu", cpu); + d_cpu = debugfs_create_dir(buf, d_ext); + + debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &ext_server_runtime_fops); + debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &ext_server_period_fops); + } +} +#endif /* CONFIG_SCHED_CLASS_EXT */ + static __init int sched_init_debug(void) { struct dentry __maybe_unused *numa; @@ -526,6 +632,9 @@ static __init int sched_init_debug(void) debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); debugfs_fair_server_init(); +#ifdef CONFIG_SCHED_CLASS_EXT + debugfs_ext_server_init(); +#endif return 0; } -- cgit v1.2.3 From 5a40a9bb56d455e7548ba4b6d7787918323cbaf0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 3 Feb 2026 11:05:12 +0100 Subject: sched/debug: Fix dl_server (re)start conditions There are two problems with sched_server_write_common() that can cause the dl_server to malfunction upon attempting to change the parameters: 1) when, after having disabled the dl_server by setting runtime=0, it is enabled again while tasks are already enqueued. In this case is_active would still be 0 and dl_server_start() would not be called. 2) when dl_server_apply_params() would fail, runtime is not applied and does not reflect the new state. Instead have dl_server_start() check its actual dl_runtime, and have sched_server_write_common() unconditionally (re)start the dl_server. It will automatically stop if there isn't anything to do, so spurious activation is harmless -- while failing to start it is a problem. While there, move the printk out of the locked region and make it symmetric, also printing on enable. 
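Two small idioms in this fix are worth spelling out: dl_server_start() now refuses to start a server whose runtime is zero, so callers can (re)start unconditionally, and the log message fires only when the enabled/disabled state actually flips, detected by normalizing both runtimes to booleans and XOR-ing them. A standalone sketch of the transition check:

#include <stdio.h>

/* True only when exactly one of old/new runtime is zero, i.e. on a flip. */
static int state_changed(unsigned long long old_rt, unsigned long long new_rt)
{
    return !!old_rt ^ !!new_rt;
}

int main(void)
{
    struct { unsigned long long old_rt, new_rt; } cases[] = {
        { 0,     50000 },   /* disabled -> enabled : report */
        { 50000, 0     },   /* enabled  -> disabled: report */
        { 50000, 20000 },   /* still enabled       : silent */
        { 0,     0     },   /* still disabled      : silent */
    };

    for (unsigned i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
        printf("old=%llu new=%llu changed=%d\n",
               cases[i].old_rt, cases[i].new_rt,
               state_changed(cases[i].old_rt, cases[i].new_rt));
    return 0;
}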
Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260203103407.GK1282955@noisy.programming.kicks-ass.net --- kernel/sched/deadline.c | 5 ++--- kernel/sched/debug.c | 32 ++++++++++++++------------------ 2 files changed, 16 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index eae14e57adf1..d08b00429323 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1799,7 +1799,7 @@ void dl_server_start(struct sched_dl_entity *dl_se) struct rq *rq = dl_se->rq; dl_se->dl_defer_idle = 0; - if (!dl_server(dl_se) || dl_se->dl_server_active) + if (!dl_server(dl_se) || dl_se->dl_server_active || !dl_se->dl_runtime) return; /* @@ -1898,7 +1898,6 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio int cpu = cpu_of(rq); struct dl_bw *dl_b; unsigned long cap; - int retval = 0; int cpus; dl_b = dl_bw_of(cpu); @@ -1930,7 +1929,7 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); - return retval; + return 0; } /* diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 59e650f9d436..b24f40f05019 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -338,9 +338,9 @@ static ssize_t sched_server_write_common(struct file *filp, const char __user *u void *server) { long cpu = (long) ((struct seq_file *) filp->private_data)->private; - struct rq *rq = cpu_rq(cpu); struct sched_dl_entity *dl_se = (struct sched_dl_entity *)server; - u64 runtime, period; + u64 old_runtime, runtime, period; + struct rq *rq = cpu_rq(cpu); int retval = 0; size_t err; u64 value; @@ -350,9 +350,7 @@ static ssize_t sched_server_write_common(struct file *filp, const char __user *u return err; scoped_guard (rq_lock_irqsave, rq) { - bool is_active; - - runtime = dl_se->dl_runtime; + old_runtime = runtime = dl_se->dl_runtime; period = dl_se->dl_period; switch (param) { @@ -374,25 +372,23 @@ static ssize_t sched_server_write_common(struct file *filp, const char __user *u return -EINVAL; } - is_active = dl_server_active(dl_se); - if (is_active) { - update_rq_clock(rq); - dl_server_stop(dl_se); - } - + update_rq_clock(rq); + dl_server_stop(dl_se); retval = dl_server_apply_params(dl_se, runtime, period, 0); - - if (!runtime) - printk_deferred("%s server disabled in CPU %d, system may crash due to starvation.\n", - server == &rq->fair_server ? "Fair" : "Ext", cpu_of(rq)); - - if (is_active && runtime) - dl_server_start(dl_se); + dl_server_start(dl_se); if (retval < 0) return retval; } + if (!!old_runtime ^ !!runtime) { + pr_info("%s server %sabled on CPU %d%s.\n", + server == &rq->fair_server ? "Fair" : "Ext", + runtime ? "en" : "dis", + cpu_of(rq), + runtime ? "" : ", system may malfunction due to starvation"); + } + *ppos += cnt; return cnt; } -- cgit v1.2.3 From 505da6689305b1103e9a8ab6636c6a7cf74cd5b1 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 27 Jan 2026 15:25:09 +0800 Subject: sched/clock: Avoid false sharing for sched_clock_irqtime Read-mostly sched_clock_irqtime may share the same cacheline with frequently updated nohz struct. Make it as static_key to avoid false sharing issue. The only user of disable_sched_clock_irqtime() is tsc_.*mark_unstable() which may be invoked under atomic context and require a workqueue to disable static_key. 
But both of them calls clear_sched_clock_stable() just before doing disable_sched_clock_irqtime(). We can reuse "sched_clock_work" to also disable sched_clock_irqtime(). One additional case need to handle is if the tsc is marked unstable before late_initcall() phase, sched_clock_work will not be invoked and sched_clock_irqtime will stay enabled although clock is unstable: tsc_init() enable_sched_clock_irqtime() # irqtime accounting is enabled here ... if (unsynchronized_tsc()) # true mark_tsc_unstable() clear_sched_clock_stable() __sched_clock_stable_early = 0; ... if (static_key_count(&sched_clock_running.key) == 2) # Only happens at sched_clock_init_late() __clear_sched_clock_stable(); # Never executed ... # late_initcall() phase sched_clock_init_late() if (__sched_clock_stable_early) # Already false __set_sched_clock_stable(); # sched_clock is never marked stable # TSC unstable, but sched_clock_work won't run to disable irqtime So we need to disable_sched_clock_irqtime() in sched_clock_init_late() if clock is unstable. Reported-by: Benjamin Lei Suggested-by: K Prateek Nayak Suggested-by: Peter Zijlstra Suggested-by: Shrikanth Hegde Signed-off-by: Wangyang Guo Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Reviewed-by: Tim Chen Reviewed-by: Tianyou Li Reviewed-by: Shrikanth Hegde Tested-by: K Prateek Nayak Link: https://patch.msgid.link/20260127072509.2627346-1-wangyang.guo@intel.com --- arch/x86/kernel/tsc.c | 2 -- kernel/sched/clock.c | 3 +++ kernel/sched/cputime.c | 9 +++++---- kernel/sched/sched.h | 4 ++-- 4 files changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 7d3e13e14eab..7be44b5198cf 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1143,7 +1143,6 @@ static void tsc_cs_mark_unstable(struct clocksource *cs) tsc_unstable = 1; if (using_native_sched_clock()) clear_sched_clock_stable(); - disable_sched_clock_irqtime(); pr_info("Marking TSC unstable due to clocksource watchdog\n"); } @@ -1213,7 +1212,6 @@ void mark_tsc_unstable(char *reason) tsc_unstable = 1; if (using_native_sched_clock()) clear_sched_clock_stable(); - disable_sched_clock_irqtime(); pr_info("Marking TSC unstable due to %s\n", reason); clocksource_mark_unstable(&clocksource_tsc_early); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index f5e6dd6a6b3a..2ae4fbf13431 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -173,6 +173,7 @@ notrace static void __sched_clock_work(struct work_struct *work) scd->tick_gtod, __gtod_offset, scd->tick_raw, __sched_clock_offset); + disable_sched_clock_irqtime(); static_branch_disable(&__sched_clock_stable); } @@ -238,6 +239,8 @@ static int __init sched_clock_init_late(void) if (__sched_clock_stable_early) __set_sched_clock_stable(); + else + disable_sched_clock_irqtime(); /* disable if clock unstable. */ return 0; } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 4f97896887ec..ff0dfca95420 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -12,6 +12,8 @@ #ifdef CONFIG_IRQ_TIME_ACCOUNTING +DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime); + /* * There are no locks covering percpu hardirq/softirq time. 
* They are only modified in vtime_account, on corresponding CPU @@ -25,16 +27,15 @@ */ DEFINE_PER_CPU(struct irqtime, cpu_irqtime); -int sched_clock_irqtime; - void enable_sched_clock_irqtime(void) { - sched_clock_irqtime = 1; + static_branch_enable(&sched_clock_irqtime); } void disable_sched_clock_irqtime(void) { - sched_clock_irqtime = 0; + if (irqtime_enabled()) + static_branch_disable(&sched_clock_irqtime); } static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2aa4251c1520..a821cc8b2dd8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3333,11 +3333,11 @@ struct irqtime { }; DECLARE_PER_CPU(struct irqtime, cpu_irqtime); -extern int sched_clock_irqtime; +DECLARE_STATIC_KEY_FALSE(sched_clock_irqtime); static inline int irqtime_enabled(void) { - return sched_clock_irqtime; + return static_branch_likely(&sched_clock_irqtime); } /* -- cgit v1.2.3 From 94894c9c477e53bcea052e075c53f89df3d2a33e Mon Sep 17 00:00:00 2001 From: Chen Jinghuang Date: Thu, 22 Jan 2026 01:25:33 +0000 Subject: sched/rt: Skip currently executing CPU in rto_next_cpu() CPU0 becomes overloaded when hosting a CPU-bound RT task, a non-CPU-bound RT task, and a CFS task stuck in kernel space. When other CPUs switch from RT to non-RT tasks, RT load balancing (LB) is triggered; with HAVE_RT_PUSH_IPI enabled, they send IPIs to CPU0 to drive the execution of rto_push_irq_work_func. During push_rt_task on CPU0, if next_task->prio < rq->donor->prio, resched_curr() sets NEED_RESCHED and after the push operation completes, CPU0 calls rto_next_cpu(). Since only CPU0 is overloaded in this scenario, rto_next_cpu() should ideally return -1 (no further IPI needed). However, multiple CPUs invoking tell_cpu_to_push() during LB increments rd->rto_loop_next. Even when rd->rto_cpu is set to -1, the mismatch between rd->rto_loop and rd->rto_loop_next forces rto_next_cpu() to restart its search from -1. With CPU0 remaining overloaded (satisfying rt_nr_migratory && rt_nr_total > 1), it gets reselected, causing CPU0 to queue irq_work to itself and send self-IPIs repeatedly. As long as CPU0 stays overloaded and other CPUs run pull_rt_tasks(), it falls into an infinite self-IPI loop, which triggers a CPU hardlockup due to continuous self-interrupts. The trigging scenario is as follows: cpu0 cpu1 cpu2 pull_rt_task tell_cpu_to_push <------------irq_work_queue_on rto_push_irq_work_func push_rt_task resched_curr(rq) pull_rt_task rto_next_cpu tell_cpu_to_push <-------------------------- atomic_inc(rto_loop_next) rd->rto_loop != next rto_next_cpu irq_work_queue_on rto_push_irq_work_func Fix redundant self-IPI by filtering the initiating CPU in rto_next_cpu(). This solution has been verified to effectively eliminate spurious self-IPIs and prevent CPU hardlockup scenarios. 
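The essence of the fix is that the IPI rotor walking the overloaded-CPU mask must never hand back the CPU that is doing the walking, otherwise that CPU keeps queueing irq_work to itself. A simplified userspace model of such a rotor (an array instead of a cpumask, and without the rto_loop/rto_loop_next retry logic of the real code):

#include <stdio.h>

#define NR_CPUS 4

static int overloaded[NR_CPUS] = { 1, 0, 0, 0 };    /* only CPU0 overloaded */

/*
 * Return the next overloaded CPU after *rotor, skipping @this_cpu so the
 * caller never sends an IPI to itself; -1 when nothing else needs a kick.
 */
static int next_push_cpu(int *rotor, int this_cpu)
{
    for (int cpu = *rotor + 1; cpu < NR_CPUS; cpu++) {
        if (!overloaded[cpu])
            continue;
        *rotor = cpu;
        if (cpu == this_cpu)
            continue;       /* do not IPI ourselves */
        return cpu;
    }
    *rotor = -1;
    return -1;
}

int main(void)
{
    int rotor = -1;

    /*
     * CPU0 runs the push work itself: with only CPU0 overloaded, the
     * rotor must terminate instead of re-selecting CPU0.
     */
    printf("next target for CPU0: %d\n", next_push_cpu(&rotor, 0));
    return 0;
}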
Fixes: 4bdced5c9a29 ("sched/rt: Simplify the IPI based RT balancing logic") Suggested-by: Steven Rostedt (Google) Suggested-by: K Prateek Nayak Signed-off-by: Chen Jinghuang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (Google) Reviewed-by: Valentin Schneider Link: https://patch.msgid.link/20260122012533.673768-1-chenjinghuang2@huawei.com --- kernel/sched/rt.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0a9b2cd6da72..a7680477fa6f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2106,6 +2106,7 @@ static void push_rt_tasks(struct rq *rq) */ static int rto_next_cpu(struct root_domain *rd) { + int this_cpu = smp_processor_id(); int next; int cpu; @@ -2129,6 +2130,10 @@ static int rto_next_cpu(struct root_domain *rd) rd->rto_cpu = cpu; + /* Do not send IPI to self */ + if (cpu == this_cpu) + continue; + if (cpu < nr_cpu_ids) return cpu; -- cgit v1.2.3 From 742fe830b7d9c01b5c36add9f664a5267caca4f5 Mon Sep 17 00:00:00 2001 From: zenghongling Date: Tue, 20 Jan 2026 16:33:33 +0800 Subject: sched/cpufreq: Use %pe format for PTR_ERR() printing Use %pe format specifier for printing PTR_ERR() error values to make error messages more readable. Found by Coccinelle: ./cpufreq_schedutil.c:685:49-56: WARNING: Consider using %pe to print PTR_ERR() Signed-off-by: zenghongling Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260120083333.148385-1-zenghongling@kylinos.cn --- kernel/sched/cpufreq_schedutil.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 0ab5f9d4bc59..cfc40181f66e 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -682,7 +682,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) "sugov:%d", cpumask_first(policy->related_cpus)); if (IS_ERR(thread)) { - pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread)); + pr_err("failed to create sugov thread: %pe\n", thread); return PTR_ERR(thread); } -- cgit v1.2.3 From e34881c84c255bc300f24d9fe685324be20da3d1 Mon Sep 17 00:00:00 2001 From: Zicheng Qu Date: Fri, 30 Jan 2026 08:34:38 +0000 Subject: sched: Re-evaluate scheduling when migrating queued tasks out of throttled cgroups Consider the following sequence on a CPU configured with nohz_full: 1) A task P runs in cgroup A, and cgroup A becomes throttled due to CFS bandwidth control. The gse (cgroup A) where the task P attached is dequeued and the CPU switches to idle. 2) Before cgroup A is unthrottled, task P is migrated from cgroup A to another cgroup B (not throttled). During sched_move_task(), the task P is observed as queued but not running, and therefore no resched_curr() is triggered. 3) Since the CPU is nohz_full, it remains in do_idle() waiting for an explicit scheduling event, i.e., resched_curr(). 4) For kernel <= 5.10: Later, cgroup A is unthrottled. However, the task P has already been migrated out of cgroup A, so unthrottle_cfs_rq() may observe load_weight == 0 and return early without resched_curr() called. For kernel >= 6.6: The unthrottling path normally triggers `resched_curr()` almost cases even when no runnable tasks remain in the unthrottled cgroup, preventing the idle stall described above. However, if cgroup A is removed before it gets unthrottled, the unthrottling path for cgroup A is never executed. In a result, no `resched_curr()` can be called. 
5) At this point, the task P is runnable in cgroup B (not throttled), but the
CPU remains in do_idle() with no pending reschedule point. The system stays
in this state until an unrelated event (e.g. a new task wakeup or a similar
event) that can trigger a resched_curr() breaks the nohz_full idle state,
and only then does the task P finally get scheduled.

The root cause is that sched_move_task() may classify the task as only
queued, not running, and therefore fails to trigger a resched_curr(), while
the later unthrottling path no longer has visibility of the migrated task.

Preserve the existing behavior for running tasks by issuing resched_curr(),
and explicitly invoke wakeup_preempt() for tasks that were queued at the
time of migration. This ensures that runnable tasks are reconsidered for
scheduling even when nohz_full suppresses periodic ticks.

Fixes: 29f59db3a74b ("sched: group-scheduler core")
Signed-off-by: Zicheng Qu
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: K Prateek Nayak
Reviewed-by: Aaron Lu
Tested-by: Aaron Lu
Link: https://patch.msgid.link/20260130083438.1122457-1-quzicheng@huawei.com
---
 kernel/sched/core.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8f2dc0a941ef..b411e4feff7f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9126,6 +9126,7 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup)
 {
 	unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
 	bool resched = false;
+	bool queued = false;
 	struct rq *rq;
 
 	CLASS(task_rq_lock, rq_guard)(tsk);
@@ -9137,10 +9138,13 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup)
 		scx_cgroup_move_task(tsk);
 		if (scope->running)
 			resched = true;
+		queued = scope->queued;
 	}
 
 	if (resched)
 		resched_curr(rq);
+	else if (queued)
+		wakeup_preempt(rq, tsk, 0);
 
 	__balance_callbacks(rq, &rq_guard.rf);
 }
--
cgit v1.2.3
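The resulting decision in sched_move_task() can be summarized as: a running task forces a reschedule of its CPU, while a task that was merely queued gets a preemption check so a tickless idle CPU notices it. A compact model of that decision, with stand-in types and helpers rather than the real rq interfaces:

#include <stdbool.h>
#include <stdio.h>

struct cpu_rq {
    bool need_resched;      /* models TIF_NEED_RESCHED on that CPU */
};

static void resched_curr(struct cpu_rq *rq) { rq->need_resched = true; }

/* Preemption check: the moved task may now beat whatever the CPU runs. */
static void wakeup_preempt(struct cpu_rq *rq, bool task_preempts)
{
    if (task_preempts)
        rq->need_resched = true;
}

static void move_task(struct cpu_rq *rq, bool running, bool queued,
                      bool task_preempts)
{
    if (running)
        resched_curr(rq);
    else if (queued)
        wakeup_preempt(rq, task_preempts);
    /* if neither, the task is blocked and nothing needs to happen now */
}

int main(void)
{
    struct cpu_rq rq = { .need_resched = false };

    /*
     * Queued task moved out of a throttled group while the CPU idles in
     * nohz_full mode: without the wakeup_preempt() leg, need_resched
     * would stay false and the CPU could sit in do_idle() indefinitely.
     */
    move_task(&rq, /*running=*/false, /*queued=*/true, /*preempts=*/true);
    printf("need_resched = %d\n", rq.need_resched);
    return 0;
}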