diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-03-01 11:09:24 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-03-01 11:09:24 -0800 |
| commit | 61706251492eff650e91c58507bc77e1a12c7fbb (patch) | |
| tree | 0c6be9a5c2a100a548a2c6fc683b6671df762b22 /kernel | |
| parent | cb36eabcaf28acaeb14b236fc090340f78b2cc48 (diff) | |
| parent | 3b68df978133ac3d46d570af065a73debbb68248 (diff) | |
Merge tag 'sched-urgent-2026-03-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:
- Fix zero_vruntime tracking when there's a single task running
- Fix slice protection logic
- Fix the ->vprot logic for reniced tasks
- Fix lag clamping in mixed slice workloads
- Fix objtool uaccess warning (and bug) in the
!CONFIG_RSEQ_SLICE_EXTENSION case caused by unexpected un-inlining,
which triggers with older compilers
- Fix a comment in the rseq registration rseq_size bound check code
- Fix a legacy RSEQ ABI quirk that handled 32-byte area sizes
differently, which special size we now reached naturally and want to
avoid. The visible ugliness of the new reserved field will be avoided
the next time the RSEQ area is extended.
* tag 'sched-urgent-2026-03-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
rseq: slice ext: Ensure rseq feature size differs from original rseq size
rseq: Clarify rseq registration rseq_size bound check comment
sched/core: Fix wakeup_preempt's next_class tracking
rseq: Mark rseq_arm_slice_extension_timer() __always_inline
sched/fair: Fix lag clamp
sched/eevdf: Update se->vprot in reweight_entity()
sched/fair: Only set slice protection at pick time
sched/fair: Fix zero_vruntime tracking
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/rseq.c | 8 | ||||
| -rw-r--r-- | kernel/sched/core.c | 1 | ||||
| -rw-r--r-- | kernel/sched/ext.c | 4 | ||||
| -rw-r--r-- | kernel/sched/fair.c | 150 | ||||
| -rw-r--r-- | kernel/sched/sched.h | 11 |
5 files changed, 131 insertions, 43 deletions
diff --git a/kernel/rseq.c b/kernel/rseq.c index b0973d19f366..38d3ef540760 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -80,6 +80,7 @@ #include <linux/syscalls.h> #include <linux/uaccess.h> #include <linux/types.h> +#include <linux/rseq.h> #include <asm/ptrace.h> #define CREATE_TRACE_POINTS @@ -449,13 +450,14 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq * size, the required alignment is the original struct rseq alignment. * - * In order to be valid, rseq_len is either the original rseq size, or - * large enough to contain all supported fields, as communicated to + * The rseq_len is required to be greater or equal to the original rseq + * size. In order to be valid, rseq_len is either the original rseq size, + * or large enough to contain all supported fields, as communicated to * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. */ if (rseq_len < ORIG_RSEQ_SIZE || (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) || - (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || + (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) || rseq_len < offsetof(struct rseq, end)))) return -EINVAL; if (!access_ok(rseq, rseq_len)) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 759777694c78..b7f77c165a6e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6830,6 +6830,7 @@ static void __sched notrace __schedule(int sched_mode) /* SCX must consult the BPF scheduler to tell if rq is empty */ if (!rq->nr_running && !scx_enabled()) { next = prev; + rq->next_class = &idle_sched_class; goto picked; } } else if (!preempt && prev_state) { diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 62b1f3ac5630..06cc0a4aec66 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2460,7 +2460,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) /* see kick_cpus_irq_workfn() */ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); - rq->next_class = &ext_sched_class; + rq_modified_begin(rq, &ext_sched_class); rq_unpin_lock(rq, rf); balance_one(rq, prev); @@ -2475,7 +2475,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) * If @force_scx is true, always try to pick a SCHED_EXT task, * regardless of any higher-priority sched classes activity. */ - if (!force_scx && sched_class_above(rq->next_class, &ext_sched_class)) + if (!force_scx && rq_modified_above(rq, &ext_sched_class)) return RETRY_TASK; keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eea99ec01a3f..bf948db905ed 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -589,6 +589,21 @@ static inline bool entity_before(const struct sched_entity *a, return vruntime_cmp(a->deadline, "<", b->deadline); } +/* + * Per avg_vruntime() below, cfs_rq::zero_vruntime is only slightly stale + * and this value should be no more than two lag bounds. Which puts it in the + * general order of: + * + * (slice + TICK_NSEC) << NICE_0_LOAD_SHIFT + * + * which is around 44 bits in size (on 64bit); that is 20 for + * NICE_0_LOAD_SHIFT, another 20 for NSEC_PER_MSEC and then a handful for + * however many msec the actual slice+tick ends up begin. + * + * (disregarding the actual divide-by-weight part makes for the worst case + * weight of 2, which nicely cancels vs the fuzz in zero_vruntime not actually + * being the zero-lag point). + */ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { return vruntime_op(se->vruntime, "-", cfs_rq->zero_vruntime); @@ -676,41 +691,65 @@ sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) } static inline -void sum_w_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) +void update_zero_vruntime(struct cfs_rq *cfs_rq, s64 delta) { /* - * v' = v + d ==> sum_w_vruntime' = sum_runtime - d*sum_weight + * v' = v + d ==> sum_w_vruntime' = sum_w_vruntime - d*sum_weight */ cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta; + cfs_rq->zero_vruntime += delta; } /* - * Specifically: avg_runtime() + 0 must result in entity_eligible() := true + * Specifically: avg_vruntime() + 0 must result in entity_eligible() := true * For this to be so, the result of this function must have a left bias. + * + * Called in: + * - place_entity() -- before enqueue + * - update_entity_lag() -- before dequeue + * - entity_tick() + * + * This means it is one entry 'behind' but that puts it close enough to where + * the bound on entity_key() is at most two lag bounds. */ u64 avg_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - s64 avg = cfs_rq->sum_w_vruntime; - long load = cfs_rq->sum_weight; + long weight = cfs_rq->sum_weight; + s64 delta = 0; - if (curr && curr->on_rq) { - unsigned long weight = scale_load_down(curr->load.weight); + if (curr && !curr->on_rq) + curr = NULL; - avg += entity_key(cfs_rq, curr) * weight; - load += weight; - } + if (weight) { + s64 runtime = cfs_rq->sum_w_vruntime; + + if (curr) { + unsigned long w = scale_load_down(curr->load.weight); + + runtime += entity_key(cfs_rq, curr) * w; + weight += w; + } - if (load) { /* sign flips effective floor / ceiling */ - if (avg < 0) - avg -= (load - 1); - avg = div_s64(avg, load); + if (runtime < 0) + runtime -= (weight - 1); + + delta = div_s64(runtime, weight); + } else if (curr) { + /* + * When there is but one element, it is the average. + */ + delta = curr->vruntime - cfs_rq->zero_vruntime; } - return cfs_rq->zero_vruntime + avg; + update_zero_vruntime(cfs_rq, delta); + + return cfs_rq->zero_vruntime; } +static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq); + /* * lag_i = S - s_i = w_i * (V - v_i) * @@ -724,17 +763,16 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) * EEVDF gives the following limit for a steady state system: * * -r_max < lag < max(r_max, q) - * - * XXX could add max_slice to the augmented data to track this. */ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) { + u64 max_slice = cfs_rq_max_slice(cfs_rq) + TICK_NSEC; s64 vlag, limit; WARN_ON_ONCE(!se->on_rq); vlag = avg_vruntime(cfs_rq) - se->vruntime; - limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); + limit = calc_delta_fair(max_slice, se); se->vlag = clamp(vlag, -limit, limit); } @@ -777,16 +815,6 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) return vruntime_eligible(cfs_rq, se->vruntime); } -static void update_zero_vruntime(struct cfs_rq *cfs_rq) -{ - u64 vruntime = avg_vruntime(cfs_rq); - s64 delta = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime); - - sum_w_vruntime_update(cfs_rq, delta); - - cfs_rq->zero_vruntime = vruntime; -} - static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) { struct sched_entity *root = __pick_root_entity(cfs_rq); @@ -802,6 +830,21 @@ static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) return min_slice; } +static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq) +{ + struct sched_entity *root = __pick_root_entity(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; + u64 max_slice = 0ULL; + + if (curr && curr->on_rq) + max_slice = curr->slice; + + if (root) + max_slice = max(max_slice, root->max_slice); + + return max_slice; +} + static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) { return entity_before(__node_2_se(a), __node_2_se(b)); @@ -826,6 +869,15 @@ static inline void __min_slice_update(struct sched_entity *se, struct rb_node *n } } +static inline void __max_slice_update(struct sched_entity *se, struct rb_node *node) +{ + if (node) { + struct sched_entity *rse = __node_2_se(node); + if (rse->max_slice > se->max_slice) + se->max_slice = rse->max_slice; + } +} + /* * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime) */ @@ -833,6 +885,7 @@ static inline bool min_vruntime_update(struct sched_entity *se, bool exit) { u64 old_min_vruntime = se->min_vruntime; u64 old_min_slice = se->min_slice; + u64 old_max_slice = se->max_slice; struct rb_node *node = &se->run_node; se->min_vruntime = se->vruntime; @@ -843,8 +896,13 @@ static inline bool min_vruntime_update(struct sched_entity *se, bool exit) __min_slice_update(se, node->rb_right); __min_slice_update(se, node->rb_left); + se->max_slice = se->slice; + __max_slice_update(se, node->rb_right); + __max_slice_update(se, node->rb_left); + return se->min_vruntime == old_min_vruntime && - se->min_slice == old_min_slice; + se->min_slice == old_min_slice && + se->max_slice == old_max_slice; } RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, @@ -856,7 +914,6 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { sum_w_vruntime_add(cfs_rq, se); - update_zero_vruntime(cfs_rq); se->min_vruntime = se->vruntime; se->min_slice = se->slice; rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, @@ -868,7 +925,6 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, &min_vruntime_cb); sum_w_vruntime_sub(cfs_rq, se); - update_zero_vruntime(cfs_rq); } struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq) @@ -3790,6 +3846,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { bool curr = cfs_rq->curr == se; + bool rel_vprot = false; + u64 vprot; if (se->on_rq) { /* commit outstanding execution time */ @@ -3797,6 +3855,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, update_entity_lag(cfs_rq, se); se->deadline -= se->vruntime; se->rel_deadline = 1; + if (curr && protect_slice(se)) { + vprot = se->vprot - se->vruntime; + rel_vprot = true; + } + cfs_rq->nr_queued--; if (!curr) __dequeue_entity(cfs_rq, se); @@ -3812,6 +3875,9 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, if (se->rel_deadline) se->deadline = div_s64(se->deadline * se->load.weight, weight); + if (rel_vprot) + vprot = div_s64(vprot * se->load.weight, weight); + update_load_set(&se->load, weight); do { @@ -3823,6 +3889,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, enqueue_load_avg(cfs_rq, se); if (se->on_rq) { place_entity(cfs_rq, se, 0); + if (rel_vprot) + se->vprot = se->vruntime + vprot; update_load_add(&cfs_rq->load, se->load.weight); if (!curr) __enqueue_entity(cfs_rq, se); @@ -5420,7 +5488,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) } static void -set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, bool first) { clear_buddies(cfs_rq, se); @@ -5435,7 +5503,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); - set_protect_slice(cfs_rq, se); + if (first) + set_protect_slice(cfs_rq, se); } update_stats_curr_start(cfs_rq, se); @@ -5524,6 +5593,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) update_load_avg(cfs_rq, curr, UPDATE_TG); update_cfs_group(curr); + /* + * Pulls along cfs_rq::zero_vruntime. + */ + avg_vruntime(cfs_rq); + #ifdef CONFIG_SCHED_HRTICK /* * queued ticks are scheduled to match the slice, so don't bother @@ -8948,13 +9022,13 @@ again: pse = parent_entity(pse); } if (se_depth >= pse_depth) { - set_next_entity(cfs_rq_of(se), se); + set_next_entity(cfs_rq_of(se), se, true); se = parent_entity(se); } } put_prev_entity(cfs_rq, pse); - set_next_entity(cfs_rq, se); + set_next_entity(cfs_rq, se, true); __set_next_task_fair(rq, p, true); } @@ -12908,7 +12982,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) t0 = sched_clock_cpu(this_cpu); __sched_balance_update_blocked_averages(this_rq); - this_rq->next_class = &fair_sched_class; + rq_modified_begin(this_rq, &fair_sched_class); raw_spin_rq_unlock(this_rq); for_each_domain(this_cpu, sd) { @@ -12975,7 +13049,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) pulled_task = 1; /* If a higher prio class was modified, restart the pick */ - if (sched_class_above(this_rq->next_class, &fair_sched_class)) + if (rq_modified_above(this_rq, &fair_sched_class)) pulled_task = -1; out: @@ -13568,7 +13642,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - set_next_entity(cfs_rq, se); + set_next_entity(cfs_rq, se, first); /* ensure bandwidth has been allocated on our new cfs_rq */ account_cfs_rq_runtime(cfs_rq, 0); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b82fb70a9d54..43bbf0693cca 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2748,6 +2748,17 @@ static inline const struct sched_class *next_active_class(const struct sched_cla #define sched_class_above(_a, _b) ((_a) < (_b)) +static inline void rq_modified_begin(struct rq *rq, const struct sched_class *class) +{ + if (sched_class_above(rq->next_class, class)) + rq->next_class = class; +} + +static inline bool rq_modified_above(struct rq *rq, const struct sched_class *class) +{ + return sched_class_above(rq->next_class, class); +} + static inline bool sched_stop_runnable(struct rq *rq) { return rq->stop && task_on_rq_queued(rq->stop); |
