| field | value | date |
|---|---|---|
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-02-07 09:10:42 -0800 |
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-02-07 09:10:42 -0800 |
| commit | dda5df9823630a26ed24ca9150b33a7f56ba4546 (patch) | |
| tree | d2454c9a2b01b47dfd33cb35d66322e6be58a38b | |
| parent | 7e0b172c80ad797061dfa32e18bf908c81ceab0e (diff) | |
| parent | 4463c7aa11a6e67169ae48c6804968960c4bffea (diff) | |
Merge tag 'sched-urgent-2026-02-07' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:
"Miscellaneous MMCID fixes to address bugs and performance regressions
in the recent rewrite of the SCHED_MM_CID management code:
- Fix livelock triggered by BPF CI testing
- Fix hard lockup on weakly ordered systems
- Simplify the dropping of CIDs in the exit path by removing an
unintended transition phase
- Fix performance/scalability regression on a thread-pool benchmark
by optimizing transitional CIDs when scheduling out"
* tag 'sched-urgent-2026-02-07' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/mmcid: Optimize transitional CIDs when scheduling out
sched/mmcid: Drop per CPU CID immediately when switching to per task mode
sched/mmcid: Protect transition on weakly ordered systems
sched/mmcid: Prevent live lock on task to CPU mode transition
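
For orientation, the two-phase ownership-mode switch that these fixes harden can be sketched in plain C11. This is a minimal user-space model, not kernel code: the names `begin_transition()`/`complete_transition()` and the bit values are illustrative assumptions, and `atomic_thread_fence()` stands in for the kernel's `smp_mb()`.

```c
#include <stdatomic.h>
#include <stdio.h>

/* Illustrative bit values; the kernel's MM_CID_* encoding may differ. */
#define MODE_ONCPU   0x1u
#define MODE_TRANSIT 0x2u

static _Atomic unsigned int mode;	/* stands in for mm::mm_cid::mode */

/* Phase 1: publish the new ownership mode with the TRANSIT bit set. */
static void begin_transition(void)
{
	unsigned int m = atomic_load_explicit(&mode, memory_order_relaxed);

	atomic_store_explicit(&mode, m ^ (MODE_ONCPU | MODE_TRANSIT),
			      memory_order_relaxed);
	/* Full fence so the store is visible before the fixup walk starts. */
	atomic_thread_fence(memory_order_seq_cst);
}

/* Phase 2: after fixing up all CIDs, drop the TRANSIT bit. */
static void complete_transition(unsigned int final_mode)
{
	/* Full fence so the fixups cannot be reordered after this store. */
	atomic_thread_fence(memory_order_seq_cst);
	atomic_store_explicit(&mode, final_mode, memory_order_relaxed);
}

int main(void)
{
	begin_transition();		/* task -> CPU mode: ONCPU | TRANSIT */
	/* ... walk tasks/CPUs and fix up or drop CIDs here ... */
	complete_transition(MODE_ONCPU);
	printf("mode = %#x\n", atomic_load(&mode));
	return 0;
}
```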
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | include/linux/rseq_types.h | 6 |
| -rw-r--r-- | kernel/sched/core.c | 184 |
| -rw-r--r-- | kernel/sched/sched.h | 44 |
3 files changed, 163 insertions, 71 deletions
```diff
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index 332dc14b81c9..ef0811379c54 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -121,8 +121,7 @@ struct mm_cid_pcpu {
 /**
  * struct mm_mm_cid - Storage for per MM CID data
  * @pcpu:	Per CPU storage for CIDs associated to a CPU
- * @percpu:	Set, when CIDs are in per CPU mode
- * @transit:	Set to MM_CID_TRANSIT during a mode change transition phase
+ * @mode:	Indicates per CPU and transition mode
  * @max_cids:	The exclusive maximum CID value for allocation and convergence
  * @irq_work:	irq_work to handle the affinity mode change case
  * @work:	Regular work to handle the affinity mode change case
@@ -139,8 +138,7 @@ struct mm_cid_pcpu {
 struct mm_mm_cid {
 	/* Hotpath read mostly members */
 	struct mm_cid_pcpu	__percpu *pcpu;
-	unsigned int		percpu;
-	unsigned int		transit;
+	unsigned int		mode;
 	unsigned int		max_cids;
 
 	/* Rarely used. Moves @lock and @mutex into the second cacheline */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 045f83ad261e..854984967fe2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10269,7 +10269,8 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
  * Serialization rules:
  *
  * mm::mm_cid::mutex:	Serializes fork() and exit() and therefore
- *			protects mm::mm_cid::users.
+ *			protects mm::mm_cid::users and mode switch
+ *			transitions
  *
  * mm::mm_cid::lock:	Serializes mm_update_max_cids() and
  *			mm_update_cpus_allowed(). Nests in mm_cid::mutex
@@ -10285,14 +10286,70 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
  *
  * A CID is either owned by a task (stored in task_struct::mm_cid.cid) or
  * by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the
- * MM_CID_ONCPU bit set. During transition from CPU to task ownership mode,
- * MM_CID_TRANSIT is set on the per task CIDs. When this bit is set the
- * task needs to drop the CID into the pool when scheduling out. Both bits
- * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is
- * actually handed over to user space in the RSEQ memory.
+ * MM_CID_ONCPU bit set.
+ *
+ * During the transition of ownership mode, the MM_CID_TRANSIT bit is set
+ * on the CIDs. When this bit is set the tasks drop the CID back into the
+ * pool when scheduling out.
+ *
+ * Both bits (ONCPU and TRANSIT) are filtered out by task_cid() when the
+ * CID is actually handed over to user space in the RSEQ memory.
  *
  * Mode switching:
  *
+ * The ownership mode is per process and stored in mm:mm_cid::mode with the
+ * following possible states:
+ *
+ *   0:					Per task ownership
+ *   0 | MM_CID_TRANSIT:		Transition from per CPU to per task
+ *   MM_CID_ONCPU:			Per CPU ownership
+ *   MM_CID_ONCPU | MM_CID_TRANSIT:	Transition from per task to per CPU
+ *
+ * All transitions of ownership mode happen in two phases:
+ *
+ * 1) mm:mm_cid::mode has the MM_CID_TRANSIT bit set. This is OR'ed on the
+ *    CIDs and denotes that the CID is only temporarily owned by a
+ *    task. When the task schedules out it drops the CID back into the
+ *    pool if this bit is set.
+ *
+ * 2) The initiating context walks the per CPU space or the tasks to fixup
+ *    or drop the CIDs and after completion it clears MM_CID_TRANSIT in
+ *    mm:mm_cid::mode. After that point the CIDs are strictly task or CPU
+ *    owned again.
+ *
+ * This two phase transition is required to prevent CID space exhaustion
+ * during the transition as a direct transfer of ownership would fail:
+ *
+ * - On task to CPU mode switch if a task is scheduled in on one CPU and
+ *   then migrated to another CPU before the fixup freed enough per task
+ *   CIDs.
+ *
+ * - On CPU to task mode switch if two tasks are scheduled in on the same
+ *   CPU before the fixup freed per CPU CIDs.
+ *
+ * Both scenarios can result in a live lock because sched_in() is invoked
+ * with runqueue lock held and loops in search of a CID and the fixup
+ * thread can't make progress freeing them up because it is stuck on the
+ * same runqueue lock.
+ *
+ * While MM_CID_TRANSIT is active during the transition phase the MM_CID
+ * bitmap can be contended, but that's a temporary contention bound to the
+ * transition period. After that everything goes back into steady state and
+ * nothing except fork() and exit() will touch the bitmap. This is an
+ * acceptable tradeoff as it completely avoids complex serialization,
+ * memory barriers and atomic operations for the common case.
+ *
+ * Aside of that this mechanism also ensures RT compability:
+ *
+ * - The task which runs the fixup is fully preemptible except for the
+ *   short runqueue lock held sections.
+ *
+ * - The transient impact of the bitmap contention is only problematic
+ *   when there is a thundering herd scenario of tasks scheduling in and
+ *   out concurrently. There is not much which can be done about that
+ *   except for avoiding mode switching by a proper overall system
+ *   configuration.
+ *
  * Switching to per CPU mode happens when the user count becomes greater
  * than the maximum number of CIDs, which is calculated by:
  *
@@ -10306,12 +10363,13 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
  *
  * At the point of switching to per CPU mode the new user is not yet
  * visible in the system, so the task which initiated the fork() runs the
- * fixup function: mm_cid_fixup_tasks_to_cpu() walks the thread list and
- * either transfers each tasks owned CID to the CPU the task runs on or
- * drops it into the CID pool if a task is not on a CPU at that point in
- * time. Tasks which schedule in before the task walk reaches them do the
- * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes
- * it's guaranteed that no task related to that MM owns a CID anymore.
+ * fixup function. mm_cid_fixup_tasks_to_cpu() walks the thread list and
+ * either marks each task owned CID with MM_CID_TRANSIT if the task is
+ * running on a CPU or drops it into the CID pool if a task is not on a
+ * CPU. Tasks which schedule in before the task walk reaches them do the
+ * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus()
+ * completes it is guaranteed that no task related to that MM owns a CID
+ * anymore.
  *
  * Switching back to task mode happens when the user count goes below the
  * threshold which was recorded on the per CPU mode switch:
@@ -10327,28 +10385,11 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
  * run either in the deferred update function in context of a workqueue or
  * by a task which forks a new one or by a task which exits. Whatever
  * happens first. mm_cid_fixup_cpus_to_task() walks through the possible
- * CPUs and either transfers the CPU owned CIDs to a related task which
- * runs on the CPU or drops it into the pool. Tasks which schedule in on a
- * CPU which the walk did not cover yet do the handover themself.
- *
- * This transition from CPU to per task ownership happens in two phases:
- *
- * 1) mm:mm_cid.transit contains MM_CID_TRANSIT This is OR'ed on the task
- *    CID and denotes that the CID is only temporarily owned by the
- *    task. When it schedules out the task drops the CID back into the
- *    pool if this bit is set.
- *
- * 2) The initiating context walks the per CPU space and after completion
- *    clears mm:mm_cid.transit. So after that point the CIDs are strictly
- *    task owned again.
- *
- * This two phase transition is required to prevent CID space exhaustion
- * during the transition as a direct transfer of ownership would fail if
- * two tasks are scheduled in on the same CPU before the fixup freed per
- * CPU CIDs.
- *
- * When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID
- * related to that MM is owned by a CPU anymore.
+ * CPUs and either marks the CPU owned CIDs with MM_CID_TRANSIT if a
+ * related task is running on the CPU or drops it into the pool. Tasks
+ * which are scheduled in before the fixup covered them do the handover
+ * themself. When mm_cid_fixup_cpus_to_tasks() completes it is guaranteed
+ * that no CID related to that MM is owned by a CPU anymore.
  */
 
 /*
@@ -10379,6 +10420,7 @@ static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc)
 static bool mm_update_max_cids(struct mm_struct *mm)
 {
 	struct mm_mm_cid *mc = &mm->mm_cid;
+	bool percpu = cid_on_cpu(mc->mode);
 
 	lockdep_assert_held(&mm->mm_cid.lock);
 
@@ -10387,7 +10429,7 @@ static bool mm_update_max_cids(struct mm_struct *mm)
 	__mm_update_max_cids(mc);
 
 	/* Check whether owner mode must be changed */
-	if (!mc->percpu) {
+	if (!percpu) {
 		/* Enable per CPU mode when the number of users is above max_cids */
 		if (mc->users > mc->max_cids)
 			mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
@@ -10398,12 +10440,17 @@ static bool mm_update_max_cids(struct mm_struct *mm)
 	/* Mode change required? */
-	if (!!mc->percpu == !!mc->pcpu_thrs)
+	if (percpu == !!mc->pcpu_thrs)
 		return false;
-	/* When switching back to per TASK mode, set the transition flag */
-	if (!mc->pcpu_thrs)
-		WRITE_ONCE(mc->transit, MM_CID_TRANSIT);
-	WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs);
+
+	/* Flip the mode and set the transition flag to bridge the transfer */
+	WRITE_ONCE(mc->mode, mc->mode ^ (MM_CID_TRANSIT | MM_CID_ONCPU));
+	/*
+	 * Order the store against the subsequent fixups so that
+	 * acquire(rq::lock) cannot be reordered by the CPU before the
+	 * store.
+	 */
+	smp_mb();
 	return true;
 }
@@ -10428,7 +10475,7 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu
 	WRITE_ONCE(mc->nr_cpus_allowed, weight);
 	__mm_update_max_cids(mc);
 
-	if (!mc->percpu)
+	if (!cid_on_cpu(mc->mode))
 		return;
 
 	/* Adjust the threshold to the wider set */
@@ -10446,6 +10493,16 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu
 	irq_work_queue(&mc->irq_work);
 }
 
+static inline void mm_cid_complete_transit(struct mm_struct *mm, unsigned int mode)
+{
+	/*
+	 * Ensure that the store removing the TRANSIT bit cannot be
+	 * reordered by the CPU before the fixups have been completed.
+	 */
+	smp_mb();
+	WRITE_ONCE(mm->mm_cid.mode, mode);
+}
+
 static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp)
 {
 	if (cid_on_cpu(t->mm_cid.cid)) {
@@ -10489,14 +10546,13 @@ static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
 			}
 		}
 	}
-	/* Clear the transition bit */
-	WRITE_ONCE(mm->mm_cid.transit, 0);
+	mm_cid_complete_transit(mm, 0);
 }
 
-static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
+static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
 {
 	if (cid_on_task(t->mm_cid.cid)) {
-		t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid);
+		t->mm_cid.cid = cid_to_transit_cid(t->mm_cid.cid);
 		pcp->cid = t->mm_cid.cid;
 	}
 }
@@ -10509,18 +10565,17 @@ static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm
 	if (!t->mm_cid.active)
 		return false;
 	if (cid_on_task(t->mm_cid.cid)) {
-		/* If running on the CPU, transfer the CID, otherwise drop it */
+		/* If running on the CPU, put the CID in transit mode, otherwise drop it */
 		if (task_rq(t)->curr == t)
-			mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
+			mm_cid_transit_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
 		else
 			mm_unset_cid_on_task(t);
 	}
 	return true;
 }
 
-static void mm_cid_fixup_tasks_to_cpus(void)
+static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm)
 {
-	struct mm_struct *mm = current->mm;
 	struct task_struct *p, *t;
 	unsigned int users;
 
@@ -10558,6 +10613,14 @@ static void mm_cid_fixup_tasks_to_cpus(void)
 	}
 }
 
+static void mm_cid_fixup_tasks_to_cpus(void)
+{
+	struct mm_struct *mm = current->mm;
+
+	mm_cid_do_fixup_tasks_to_cpus(mm);
+	mm_cid_complete_transit(mm, MM_CID_ONCPU);
+}
+
 static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
 {
 	t->mm_cid.active = 1;
@@ -10586,17 +10649,17 @@ void sched_mm_cid_fork(struct task_struct *t)
 		}
 
 		if (!sched_mm_cid_add_user(t, mm)) {
-			if (!mm->mm_cid.percpu)
+			if (!cid_on_cpu(mm->mm_cid.mode))
 				t->mm_cid.cid = mm_get_cid(mm);
 			return;
 		}
 
 		/* Handle the mode change and transfer current's CID */
-		percpu = !!mm->mm_cid.percpu;
+		percpu = cid_on_cpu(mm->mm_cid.mode);
 		if (!percpu)
 			mm_cid_transit_to_task(current, pcp);
 		else
-			mm_cid_transfer_to_cpu(current, pcp);
+			mm_cid_transit_to_cpu(current, pcp);
 	}
 
 	if (percpu) {
@@ -10631,7 +10694,7 @@ static bool __sched_mm_cid_exit(struct task_struct *t)
 	 * affinity change increased the number of allowed CPUs and the
 	 * deferred fixup did not run yet.
 	 */
-	if (WARN_ON_ONCE(mm->mm_cid.percpu))
+	if (WARN_ON_ONCE(cid_on_cpu(mm->mm_cid.mode)))
 		return false;
 	/*
 	 * A failed fork(2) cleanup never gets here, so @current must have
@@ -10664,8 +10727,14 @@ void sched_mm_cid_exit(struct task_struct *t)
 		scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
 			if (!__sched_mm_cid_exit(t))
 				return;
-			/* Mode change required. Transfer currents CID */
-			mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu));
+			/*
+			 * Mode change. The task has the CID unset
+			 * already. The CPU CID is still valid and
+			 * does not have MM_CID_TRANSIT set as the
+			 * mode change has just taken effect under
+			 * mm::mm_cid::lock. Drop it.
+			 */
+			mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu));
 		}
 		mm_cid_fixup_cpus_to_tasks(mm);
 		return;
@@ -10722,7 +10791,7 @@ static void mm_cid_work_fn(struct work_struct *work)
 		if (!mm_update_max_cids(mm))
 			return;
 		/* Affinity changes can only switch back to task mode */
-		if (WARN_ON_ONCE(mm->mm_cid.percpu))
+		if (WARN_ON_ONCE(cid_on_cpu(mm->mm_cid.mode)))
 			return;
 	}
 	mm_cid_fixup_cpus_to_tasks(mm);
@@ -10743,8 +10812,7 @@ static void mm_cid_irq_work(struct irq_work *work)
 void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
 {
 	mm->mm_cid.max_cids = 0;
-	mm->mm_cid.percpu = 0;
-	mm->mm_cid.transit = 0;
+	mm->mm_cid.mode = 0;
 	mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
 	mm->mm_cid.users = 0;
 	mm->mm_cid.pcpu_thrs = 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 93fce4bbff5e..bd350e40859d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3816,7 +3816,8 @@ static __always_inline void mm_cid_update_pcpu_cid(struct mm_struct *mm, unsigne
 	__this_cpu_write(mm->mm_cid.pcpu->cid, cid);
 }
 
-static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid)
+static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid,
+					    unsigned int mode)
 {
 	unsigned int max_cids, tcid = t->mm_cid.cid;
 	struct mm_struct *mm = t->mm;
@@ -3841,12 +3842,17 @@ static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int
 		/* Still nothing, allocate a new one */
 		if (!cid_on_cpu(cpu_cid))
 			cpu_cid = cid_to_cpu_cid(mm_get_cid(mm));
+
+		/* Handle the transition mode flag if required */
+		if (mode & MM_CID_TRANSIT)
+			cpu_cid = cpu_cid_to_cid(cpu_cid) | MM_CID_TRANSIT;
 	}
 	mm_cid_update_pcpu_cid(mm, cpu_cid);
 	mm_cid_update_task_cid(t, cpu_cid);
 }
 
-static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid)
+static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid,
+					     unsigned int mode)
 {
 	unsigned int max_cids, tcid = t->mm_cid.cid;
 	struct mm_struct *mm = t->mm;
@@ -3872,7 +3878,7 @@ static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int
 		if (!cid_on_task(tcid))
 			tcid = mm_get_cid(mm);
 		/* Set the transition mode flag if required */
-		tcid |= READ_ONCE(mm->mm_cid.transit);
+		tcid |= mode & MM_CID_TRANSIT;
 	}
 	mm_cid_update_pcpu_cid(mm, tcid);
 	mm_cid_update_task_cid(t, tcid);
 }
@@ -3881,26 +3887,46 @@ static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int
 static __always_inline void mm_cid_schedin(struct task_struct *next)
 {
 	struct mm_struct *mm = next->mm;
-	unsigned int cpu_cid;
+	unsigned int cpu_cid, mode;
 
 	if (!next->mm_cid.active)
 		return;
 
 	cpu_cid = __this_cpu_read(mm->mm_cid.pcpu->cid);
-	if (likely(!READ_ONCE(mm->mm_cid.percpu)))
-		mm_cid_from_task(next, cpu_cid);
+	mode = READ_ONCE(mm->mm_cid.mode);
+	if (likely(!cid_on_cpu(mode)))
+		mm_cid_from_task(next, cpu_cid, mode);
 	else
-		mm_cid_from_cpu(next, cpu_cid);
+		mm_cid_from_cpu(next, cpu_cid, mode);
 }
 
 static __always_inline void mm_cid_schedout(struct task_struct *prev)
 {
+	struct mm_struct *mm = prev->mm;
+	unsigned int mode, cid;
+
 	/* During mode transitions CIDs are temporary and need to be dropped */
 	if (likely(!cid_in_transit(prev->mm_cid.cid)))
 		return;
-	mm_drop_cid(prev->mm, cid_from_transit_cid(prev->mm_cid.cid));
-	prev->mm_cid.cid = MM_CID_UNSET;
+
+	mode = READ_ONCE(mm->mm_cid.mode);
+	cid = cid_from_transit_cid(prev->mm_cid.cid);
+
+	/*
+	 * If transition mode is done, transfer ownership when the CID is
+	 * within the convergence range to optimize the next schedule in.
+	 */
+	if (!cid_in_transit(mode) && cid < READ_ONCE(mm->mm_cid.max_cids)) {
+		if (cid_on_cpu(mode))
+			cid = cid_to_cpu_cid(cid);
+
+		/* Update both so that the next schedule in goes into the fast path */
+		mm_cid_update_pcpu_cid(mm, cid);
+		prev->mm_cid.cid = cid;
+	} else {
+		mm_drop_cid(mm, cid);
+		prev->mm_cid.cid = MM_CID_UNSET;
+	}
 }
 
 static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next)
```
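
As a rough mental model of the CID flag handling in the hunks above, here is a stand-alone sketch. The helper names mirror those used in the diff (cid_on_cpu(), cid_in_transit(), task_cid()), but the bit positions are assumptions for illustration, not the kernel's actual encoding.

```c
#include <assert.h>
#include <stdio.h>

/* Assumed flag bits; the kernel reserves bits of the CID word for these. */
#define MM_CID_ONCPU   (1u << 31)
#define MM_CID_TRANSIT (1u << 30)

/* Counterparts of the helpers used in the diff, reimplemented for illustration. */
static inline int cid_on_cpu(unsigned int cid)      { return !!(cid & MM_CID_ONCPU); }
static inline int cid_in_transit(unsigned int cid)  { return !!(cid & MM_CID_TRANSIT); }
static inline unsigned int cid_to_cpu_cid(unsigned int cid)     { return cid | MM_CID_ONCPU; }
static inline unsigned int cid_to_transit_cid(unsigned int cid) { return cid | MM_CID_TRANSIT; }

static inline unsigned int task_cid(unsigned int cid)
{
	/* Both flags are stripped before the CID reaches user space via RSEQ. */
	return cid & ~(MM_CID_ONCPU | MM_CID_TRANSIT);
}

int main(void)
{
	unsigned int cid = 5;

	/* Mark the CID as CPU owned and in transit, as the fixup walks do. */
	cid = cid_to_transit_cid(cid_to_cpu_cid(cid));
	assert(cid_on_cpu(cid) && cid_in_transit(cid));
	printf("raw %#x -> user visible %u\n", cid, task_cid(cid));
	return 0;
}
```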
