Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.preempt               |   3
-rw-r--r--  kernel/entry/common.c                |  27
-rw-r--r--  kernel/entry/common.h                |   7
-rw-r--r--  kernel/entry/syscall-common.c        |  97
-rw-r--r--  kernel/entry/syscall_user_dispatch.c |   4
-rw-r--r--  kernel/rseq.c                        | 365
-rw-r--r--  kernel/sched/clock.c                 |   3
-rw-r--r--  kernel/sched/core.c                  |  86
-rw-r--r--  kernel/sched/cpufreq_schedutil.c     |   2
-rw-r--r--  kernel/sched/cputime.c               |   9
-rw-r--r--  kernel/sched/deadline.c              | 105
-rw-r--r--  kernel/sched/debug.c                 | 187
-rw-r--r--  kernel/sched/ext.c                   |  42
-rw-r--r--  kernel/sched/fair.c                  | 411
-rw-r--r--  kernel/sched/idle.c                  |   7
-rw-r--r--  kernel/sched/rt.c                    |  14
-rw-r--r--  kernel/sched/sched.h                 | 142
-rw-r--r--  kernel/sched/stop_task.c             |   3
-rw-r--r--  kernel/sched/topology.c              |   5
-rw-r--r--  kernel/sys.c                         |   6
-rw-r--r--  kernel/sys_ni.c                      |   1
-rw-r--r--  kernel/time/hrtimer.c                |   2
22 files changed, 1036 insertions, 492 deletions
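Before the patch body, a rough illustration of how the new rseq time slice extension ABI introduced below is meant to be driven from user space. This is a minimal sketch inferred from the rseq.c hunks, not part of the patch: it assumes UAPI headers from a kernel carrying this series (the PR_RSEQ_SLICE_EXTENSION_* / PR_RSEQ_SLICE_EXT_ENABLE prctl constants and __NR_rseq_slice_yield are new), it assumes the top-level prctl option wired up in kernel/sys.c (only visible in the diffstat above) is named PR_RSEQ_SLICE_EXTENSION, and it relies on glibc having registered struct rseq for the thread (otherwise the prctl returns -ENXIO).

/*
 * Minimal sketch, not part of the patch: enable rseq time slice
 * extensions for the calling task, then relinquish a granted
 * extension side effect free with rseq_slice_yield(2).
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/prctl.h>

int main(void)
{
	/*
	 * Ask the kernel to set RSEQ_CS_FLAG_SLICE_EXT_ENABLED in the
	 * registered struct rseq. PR_RSEQ_SLICE_EXTENSION is an assumed
	 * name for the prctl option added in kernel/sys.c.
	 */
	if (prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET,
		  PR_RSEQ_SLICE_EXT_ENABLE, 0, 0)) {
		perror("PR_RSEQ_SLICE_EXTENSION_SET");
		return 1;
	}

	/*
	 * ... rseq-based critical section; instead of preempting in the
	 * middle of it, the kernel may grant a short slice extension on
	 * return to user space ...
	 */

	/*
	 * Done with the critical work: give the CPU back side effect
	 * free. The syscall returns 1 if the task yielded within a
	 * granted slice, 0 if no grant was active or it was revoked.
	 * Any other syscall also terminates a grant (see
	 * rseq_syscall_enter_work() below), but not side effect free.
	 */
	long yielded = syscall(__NR_rseq_slice_yield);
	printf("yielded within granted slice: %ld\n", yielded);
	return 0;
}

Note that the per-task enable defaults to off unless RSEQ_FLAG_SLICE_EXT_DEFAULT_ON was passed at rseq registration, and the whole facility can be disabled globally with the rseq_slice_ext= boot parameter; the grant length is tunable via debugfs (rseq/slice_ext_nsec, 5-50 us).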
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index da326800c1c9..88c594c6d7fc 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -16,11 +16,13 @@ config ARCH_HAS_PREEMPT_LAZY choice prompt "Preemption Model" + default PREEMPT_LAZY if ARCH_HAS_PREEMPT_LAZY default PREEMPT_NONE config PREEMPT_NONE bool "No Forced Preemption (Server)" depends on !PREEMPT_RT + depends on ARCH_NO_PREEMPT select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC help This is the traditional Linux preemption model, geared towards @@ -35,6 +37,7 @@ config PREEMPT_NONE config PREEMPT_VOLUNTARY bool "Voluntary Kernel Preemption (Desktop)" + depends on !ARCH_HAS_PREEMPT_LAZY depends on !ARCH_NO_PREEMPT depends on !PREEMPT_RT select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 5c792b30c58a..9ef63e414791 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -17,6 +17,27 @@ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } #define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK) #endif +/* TIF bits, which prevent a time slice extension. */ +#ifdef CONFIG_PREEMPT_RT +/* + * Since rseq slice ext has a direct correlation to the worst case + * scheduling latency (schedule is delayed after all), only have it affect + * LAZY reschedules on PREEMPT_RT for now. + * + * However, since this delay is only applicable to userspace, a value + * for rseq_slice_extension_nsec that is strictly less than the worst case + * kernel space preempt_disable() region, should mean the scheduling latency + * is not affected, even for !LAZY. + * + * However, since this value depends on the hardware at hand, it cannot be + * pre-determined in any sensible way. Hence punt on this problem for now. 
+ */ +# define TIF_SLICE_EXT_SCHED (_TIF_NEED_RESCHED_LAZY) +#else +# define TIF_SLICE_EXT_SCHED (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) +#endif +#define TIF_SLICE_EXT_DENY (EXIT_TO_USER_MODE_WORK & ~TIF_SLICE_EXT_SCHED) + static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work) { @@ -28,8 +49,10 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re local_irq_enable_exit_to_user(ti_work); - if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) - schedule(); + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) { + if (!rseq_grant_slice_extension(ti_work & TIF_SLICE_EXT_DENY)) + schedule(); + } if (ti_work & _TIF_UPROBE) uprobe_notify_resume(regs); diff --git a/kernel/entry/common.h b/kernel/entry/common.h deleted file mode 100644 index f6e6d02f07fe..000000000000 --- a/kernel/entry/common.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _COMMON_H -#define _COMMON_H - -bool syscall_user_dispatch(struct pt_regs *regs); - -#endif diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c index 940a597ded40..cd4967a9c53e 100644 --- a/kernel/entry/syscall-common.c +++ b/kernel/entry/syscall-common.c @@ -1,104 +1,23 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/audit.h> #include <linux/entry-common.h> -#include "common.h" #define CREATE_TRACE_POINTS #include <trace/events/syscalls.h> -static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) -{ - if (unlikely(audit_context())) { - unsigned long args[6]; - - syscall_get_arguments(current, regs, args); - audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]); - } -} +/* Out of line to prevent tracepoint code duplication */ -long syscall_trace_enter(struct pt_regs *regs, long syscall, - unsigned long work) +long trace_syscall_enter(struct pt_regs *regs, long syscall) { - long ret = 0; - + trace_sys_enter(regs, syscall); /* - * Handle Syscall User Dispatch. This must comes first, since - * the ABI here can be something that doesn't make sense for - * other syscall_work features. + * Probes or BPF hooks in the tracepoint may have changed the + * system call number. Reread it. */ - if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { - if (syscall_user_dispatch(regs)) - return -1L; - } - - /* Handle ptrace */ - if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { - ret = ptrace_report_syscall_entry(regs); - if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) - return -1L; - } - - /* Do seccomp after ptrace, to catch any tracer changes. */ - if (work & SYSCALL_WORK_SECCOMP) { - ret = __secure_computing(); - if (ret == -1L) - return ret; - } - - /* Either of the above might have changed the syscall number */ - syscall = syscall_get_nr(current, regs); - - if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) { - trace_sys_enter(regs, syscall); - /* - * Probes or BPF hooks in the tracepoint may have changed the - * system call number as well. - */ - syscall = syscall_get_nr(current, regs); - } - - syscall_enter_audit(regs, syscall); - - return ret ? : syscall; + return syscall_get_nr(current, regs); } -/* - * If SYSCALL_EMU is set, then the only reason to report is when - * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall - * instruction has been already reported in syscall_enter_from_user_mode(). 
- */ -static inline bool report_single_step(unsigned long work) +void trace_syscall_exit(struct pt_regs *regs, long ret) { - if (work & SYSCALL_WORK_SYSCALL_EMU) - return false; - - return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP; -} - -void syscall_exit_work(struct pt_regs *regs, unsigned long work) -{ - bool step; - - /* - * If the syscall was rolled back due to syscall user dispatching, - * then the tracers below are not invoked for the same reason as - * the entry side was not invoked in syscall_trace_enter(): The ABI - * of these syscalls is unknown. - */ - if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { - if (unlikely(current->syscall_dispatch.on_dispatch)) { - current->syscall_dispatch.on_dispatch = false; - return; - } - } - - audit_syscall_exit(regs); - - if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) - trace_sys_exit(regs, syscall_get_return_value(current, regs)); - - step = report_single_step(work); - if (step || work & SYSCALL_WORK_SYSCALL_TRACE) - ptrace_report_syscall_exit(regs, step); + trace_sys_exit(regs, ret); } diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c index a9055eccb27e..d89dffcc2d64 100644 --- a/kernel/entry/syscall_user_dispatch.c +++ b/kernel/entry/syscall_user_dispatch.c @@ -2,6 +2,8 @@ /* * Copyright (C) 2020 Collabora Ltd. */ + +#include <linux/entry-common.h> #include <linux/sched.h> #include <linux/prctl.h> #include <linux/ptrace.h> @@ -15,8 +17,6 @@ #include <asm/syscall.h> -#include "common.h" - static void trigger_sigsys(struct pt_regs *regs) { struct kernel_siginfo info; diff --git a/kernel/rseq.c b/kernel/rseq.c index 395d8b002350..b0973d19f366 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -71,6 +71,9 @@ #define RSEQ_BUILD_SLOW_PATH #include <linux/debugfs.h> +#include <linux/hrtimer.h> +#include <linux/percpu.h> +#include <linux/prctl.h> #include <linux/ratelimit.h> #include <linux/rseq_entry.h> #include <linux/sched.h> @@ -120,7 +123,6 @@ void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, } #endif /* CONFIG_TRACEPOINTS */ -#ifdef CONFIG_DEBUG_FS #ifdef CONFIG_RSEQ_STATS DEFINE_PER_CPU(struct rseq_stats, rseq_stats); @@ -138,6 +140,13 @@ static int rseq_stats_show(struct seq_file *m, void *p) stats.cs += data_race(per_cpu(rseq_stats.cs, cpu)); stats.clear += data_race(per_cpu(rseq_stats.clear, cpu)); stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu)); + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { + stats.s_granted += data_race(per_cpu(rseq_stats.s_granted, cpu)); + stats.s_expired += data_race(per_cpu(rseq_stats.s_expired, cpu)); + stats.s_revoked += data_race(per_cpu(rseq_stats.s_revoked, cpu)); + stats.s_yielded += data_race(per_cpu(rseq_stats.s_yielded, cpu)); + stats.s_aborted += data_race(per_cpu(rseq_stats.s_aborted, cpu)); + } } seq_printf(m, "exit: %16lu\n", stats.exit); @@ -148,6 +157,13 @@ static int rseq_stats_show(struct seq_file *m, void *p) seq_printf(m, "cs: %16lu\n", stats.cs); seq_printf(m, "clear: %16lu\n", stats.clear); seq_printf(m, "fixup: %16lu\n", stats.fixup); + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { + seq_printf(m, "sgrant: %16lu\n", stats.s_granted); + seq_printf(m, "sexpir: %16lu\n", stats.s_expired); + seq_printf(m, "srevok: %16lu\n", stats.s_revoked); + seq_printf(m, "syield: %16lu\n", stats.s_yielded); + seq_printf(m, "sabort: %16lu\n", stats.s_aborted); + } return 0; } @@ -205,16 +221,19 @@ static const struct file_operations debug_ops = { .release = single_release, }; +static void rseq_slice_ext_init(struct dentry *root_dir); + static int __init 
rseq_debugfs_init(void) { struct dentry *root_dir = debugfs_create_dir("rseq", NULL); debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops); rseq_stats_init(root_dir); + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) + rseq_slice_ext_init(root_dir); return 0; } __initcall(rseq_debugfs_init); -#endif /* CONFIG_DEBUG_FS */ static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id) { @@ -389,6 +408,8 @@ static bool rseq_reset_ids(void) */ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { + u32 rseqfl = 0; + if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) return -EINVAL; @@ -405,7 +426,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 return 0; } - if (unlikely(flags)) + if (unlikely(flags & ~(RSEQ_FLAG_SLICE_EXT_DEFAULT_ON))) return -EINVAL; if (current->rseq.usrptr) { @@ -440,6 +461,13 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 if (!access_ok(rseq, rseq_len)) return -EFAULT; + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { + rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + if (rseq_slice_extension_enabled() && + (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)) + rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + } + scoped_user_write_access(rseq, efault) { /* * If the rseq_cs pointer is non-NULL on registration, clear it to @@ -449,11 +477,13 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 * clearing the fields. Don't bother reading it, just reset it. */ unsafe_put_user(0UL, &rseq->rseq_cs, efault); + unsafe_put_user(rseqfl, &rseq->flags, efault); /* Initialize IDs in user space */ unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault); unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); unsafe_put_user(0U, &rseq->node_id, efault); unsafe_put_user(0U, &rseq->mm_cid, efault); + unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); } /* @@ -464,6 +494,10 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 current->rseq.len = rseq_len; current->rseq.sig = sig; +#ifdef CONFIG_RSEQ_SLICE_EXTENSION + current->rseq.slice.state.enabled = !!(rseqfl & RSEQ_CS_FLAG_SLICE_EXT_ENABLED); +#endif + /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields @@ -476,3 +510,328 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 efault: return -EFAULT; } + +#ifdef CONFIG_RSEQ_SLICE_EXTENSION +struct slice_timer { + struct hrtimer timer; + void *cookie; +}; + +static const unsigned int rseq_slice_ext_nsecs_min = 5 * NSEC_PER_USEC; +static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC; +unsigned int rseq_slice_ext_nsecs __read_mostly = rseq_slice_ext_nsecs_min; +static DEFINE_PER_CPU(struct slice_timer, slice_timer); +DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key); + +/* + * When the timer expires and the task is still in user space, the return + * from interrupt will revoke the grant and schedule. If the task already + * entered the kernel via a syscall and the timer fires before the syscall + * work was able to cancel it, then depending on the preemption model this + * will either reschedule on return from interrupt or in the syscall work + * below. 
+ */ +static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr) +{ + struct slice_timer *st = container_of(tmr, struct slice_timer, timer); + + /* + * Validate that the task which armed the timer is still on the + * CPU. It could have been scheduled out without canceling the + * timer. + */ + if (st->cookie == current && current->rseq.slice.state.granted) { + rseq_stat_inc(rseq_stats.s_expired); + set_need_resched_current(); + } + return HRTIMER_NORESTART; +} + +bool __rseq_arm_slice_extension_timer(void) +{ + struct slice_timer *st = this_cpu_ptr(&slice_timer); + struct task_struct *curr = current; + + lockdep_assert_irqs_disabled(); + + /* + * This check prevents a task, which got a time slice extension + * granted, from exceeding the maximum scheduling latency when the + * grant expired before going out to user space. Don't bother to + * clear the grant here, it will be cleaned up automatically before + * going out to user space after being scheduled back in. + */ + if ((unlikely(curr->rseq.slice.expires < ktime_get_mono_fast_ns()))) { + set_need_resched_current(); + return true; + } + + /* + * Store the task pointer as a cookie for comparison in the timer + * function. This is safe as the timer is CPU local and cannot be + * in the expiry function at this point. + */ + st->cookie = curr; + hrtimer_start(&st->timer, curr->rseq.slice.expires, HRTIMER_MODE_ABS_PINNED_HARD); + /* Arm the syscall entry work */ + set_task_syscall_work(curr, SYSCALL_RSEQ_SLICE); + return false; +} + +static void rseq_cancel_slice_extension_timer(void) +{ + struct slice_timer *st = this_cpu_ptr(&slice_timer); + + /* + * st->cookie can be safely read as preemption is disabled and the + * timer is CPU local. + * + * As this is most probably the first expiring timer, the cancel is + * expensive as it has to reprogram the hardware, but that's less + * expensive than going through a full hrtimer_interrupt() cycle + * for nothing. + * + * hrtimer_try_to_cancel() is sufficient here as the timer is CPU + * local and once the hrtimer code disabled interrupts the timer + * callback cannot be running. + */ + if (st->cookie == current) + hrtimer_try_to_cancel(&st->timer); +} + +static inline void rseq_slice_set_need_resched(struct task_struct *curr) +{ + /* + * The interrupt guard is required to prevent inconsistent state in + * this case: + * + * set_tsk_need_resched() + * --> Interrupt + * wakeup() + * set_tsk_need_resched() + * set_preempt_need_resched() + * schedule_on_return() + * clear_tsk_need_resched() + * clear_preempt_need_resched() + * set_preempt_need_resched() <- Inconsistent state + * + * This is safe vs. a remote set of TIF_NEED_RESCHED because that + * only sets the already set bit and does not create inconsistent + * state. + */ + scoped_guard(irq) + set_need_resched_current(); +} + +static void rseq_slice_validate_ctrl(u32 expected) +{ + u32 __user *sctrl = ¤t->rseq.usrptr->slice_ctrl.all; + u32 uval; + + if (get_user(uval, sctrl) || uval != expected) + force_sig(SIGSEGV); +} + +/* + * Invoked from syscall entry if a time slice extension was granted and the + * kernel did not clear it before user space left the critical section. + * + * While the recommended way to relinquish the CPU side effect free is + * rseq_slice_yield(2), any syscall within a granted slice terminates the + * grant and immediately reschedules if required. This supports onion layer + * applications, where the code requesting the grant cannot control the + * code within the critical section. 
+ */ +void rseq_syscall_enter_work(long syscall) +{ + struct task_struct *curr = current; + struct rseq_slice_ctrl ctrl = { .granted = curr->rseq.slice.state.granted }; + + clear_task_syscall_work(curr, SYSCALL_RSEQ_SLICE); + + if (static_branch_unlikely(&rseq_debug_enabled)) + rseq_slice_validate_ctrl(ctrl.all); + + /* + * The kernel might have raced, revoked the grant and updated + * userspace, but kept the SLICE work set. + */ + if (!ctrl.granted) + return; + + /* + * Required to stabilize the per CPU timer pointer and to make + * set_tsk_need_resched() correct on PREEMPT[RT] kernels. + * + * Leaving the scope will reschedule on preemption models FULL, + * LAZY and RT if necessary. + */ + scoped_guard(preempt) { + rseq_cancel_slice_extension_timer(); + /* + * Now that preemption is disabled, quickly check whether + * the task was already rescheduled before arriving here. + */ + if (!curr->rseq.event.sched_switch) { + rseq_slice_set_need_resched(curr); + + if (syscall == __NR_rseq_slice_yield) { + rseq_stat_inc(rseq_stats.s_yielded); + /* Update the yielded state for syscall return */ + curr->rseq.slice.yielded = 1; + } else { + rseq_stat_inc(rseq_stats.s_aborted); + } + } + } + /* Reschedule on NONE/VOLUNTARY preemption models */ + cond_resched(); + + /* Clear the grant in kernel state and user space */ + curr->rseq.slice.state.granted = false; + if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all)) + force_sig(SIGSEGV); +} + +int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3) +{ + switch (arg2) { + case PR_RSEQ_SLICE_EXTENSION_GET: + if (arg3) + return -EINVAL; + return current->rseq.slice.state.enabled ? PR_RSEQ_SLICE_EXT_ENABLE : 0; + + case PR_RSEQ_SLICE_EXTENSION_SET: { + u32 rflags, valid = RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + bool enable = !!(arg3 & PR_RSEQ_SLICE_EXT_ENABLE); + + if (arg3 & ~PR_RSEQ_SLICE_EXT_ENABLE) + return -EINVAL; + if (!rseq_slice_extension_enabled()) + return -ENOTSUPP; + if (!current->rseq.usrptr) + return -ENXIO; + + /* No change? */ + if (enable == !!current->rseq.slice.state.enabled) + return 0; + + if (get_user(rflags, ¤t->rseq.usrptr->flags)) + goto die; + + if (current->rseq.slice.state.enabled) + valid |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + + if ((rflags & valid) != valid) + goto die; + + rflags &= ~RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + rflags |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + if (enable) + rflags |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + + if (put_user(rflags, ¤t->rseq.usrptr->flags)) + goto die; + + current->rseq.slice.state.enabled = enable; + return 0; + } + default: + return -EINVAL; + } +die: + force_sig(SIGSEGV); + return -EFAULT; +} + +/** + * sys_rseq_slice_yield - yield the current processor side effect free if a + * task granted with a time slice extension is done with + * the critical work before being forced out. + * + * Return: 1 if the task successfully yielded the CPU within the granted slice. + * 0 if the slice extension was either never granted or was revoked by + * going over the granted extension, using a syscall other than this one + * or being scheduled out earlier due to a subsequent interrupt. + * + * The syscall does not schedule because the syscall entry work immediately + * relinquishes the CPU and schedules if required. 
+ */ +SYSCALL_DEFINE0(rseq_slice_yield) +{ + int yielded = !!current->rseq.slice.yielded; + + current->rseq.slice.yielded = 0; + return yielded; +} + +static int rseq_slice_ext_show(struct seq_file *m, void *p) +{ + seq_printf(m, "%d\n", rseq_slice_ext_nsecs); + return 0; +} + +static ssize_t rseq_slice_ext_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + unsigned int nsecs; + + if (kstrtouint_from_user(ubuf, count, 10, &nsecs)) + return -EINVAL; + + if (nsecs < rseq_slice_ext_nsecs_min) + return -ERANGE; + + if (nsecs > rseq_slice_ext_nsecs_max) + return -ERANGE; + + rseq_slice_ext_nsecs = nsecs; + + return count; +} + +static int rseq_slice_ext_open(struct inode *inode, struct file *file) +{ + return single_open(file, rseq_slice_ext_show, inode->i_private); +} + +static const struct file_operations slice_ext_ops = { + .open = rseq_slice_ext_open, + .read = seq_read, + .write = rseq_slice_ext_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static void rseq_slice_ext_init(struct dentry *root_dir) +{ + debugfs_create_file("slice_ext_nsec", 0644, root_dir, NULL, &slice_ext_ops); +} + +static int __init rseq_slice_cmdline(char *str) +{ + bool on; + + if (kstrtobool(str, &on)) + return 0; + + if (!on) + static_branch_disable(&rseq_slice_extension_key); + return 1; +} +__setup("rseq_slice_ext=", rseq_slice_cmdline); + +static int __init rseq_slice_init(void) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) { + hrtimer_setup(per_cpu_ptr(&slice_timer.timer, cpu), rseq_slice_expired, + CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD); + } + return 0; +} +device_initcall(rseq_slice_init); +#else +static void rseq_slice_ext_init(struct dentry *root_dir) { } +#endif /* CONFIG_RSEQ_SLICE_EXTENSION */ diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index f5e6dd6a6b3a..2ae4fbf13431 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -173,6 +173,7 @@ notrace static void __sched_clock_work(struct work_struct *work) scd->tick_gtod, __gtod_offset, scd->tick_raw, __sched_clock_offset); + disable_sched_clock_irqtime(); static_branch_disable(&__sched_clock_stable); } @@ -238,6 +239,8 @@ static int __init sched_clock_init_late(void) if (__sched_clock_stable_early) __set_sched_clock_stable(); + else + disable_sched_clock_irqtime(); /* disable if clock unstable. 
*/ return 0; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d78d7b6d30bb..23406f037dde 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -119,6 +119,9 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_entry_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_exit_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_set_need_resched_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); DEFINE_PER_CPU(struct rnd_state, sched_rnd_state); @@ -1141,6 +1144,7 @@ void __trace_set_need_resched(struct task_struct *curr, int tif) { trace_sched_set_need_resched_tp(curr, smp_processor_id(), tif); } +EXPORT_SYMBOL_GPL(__trace_set_need_resched); void resched_curr(struct rq *rq) { @@ -2095,7 +2099,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) */ uclamp_rq_inc(rq, p, flags); - rq->queue_mask |= p->sched_class->queue_mask; p->sched_class->enqueue_task(rq, p, flags); psi_enqueue(p, flags); @@ -2128,7 +2131,6 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) * and mark the task ->sched_delayed. */ uclamp_rq_dec(rq, p); - rq->queue_mask |= p->sched_class->queue_mask; return p->sched_class->dequeue_task(rq, p, flags); } @@ -2179,10 +2181,14 @@ void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *donor = rq->donor; - if (p->sched_class == donor->sched_class) - donor->sched_class->wakeup_preempt(rq, p, flags); - else if (sched_class_above(p->sched_class, donor->sched_class)) + if (p->sched_class == rq->next_class) { + rq->next_class->wakeup_preempt(rq, p, flags); + + } else if (sched_class_above(p->sched_class, rq->next_class)) { + rq->next_class->wakeup_preempt(rq, p, flags); resched_curr(rq); + rq->next_class = p->sched_class; + } /* * A queue event has occurred, and we're going to schedule. 
In @@ -3620,6 +3626,18 @@ static inline void ttwu_do_wakeup(struct task_struct *p) trace_sched_wakeup(p); } +void update_rq_avg_idle(struct rq *rq) +{ + u64 delta = rq_clock(rq) - rq->idle_stamp; + u64 max = 2*rq->max_idle_balance_cost; + + update_avg(&rq->avg_idle, delta); + + if (rq->avg_idle > max) + rq->avg_idle = max; + rq->idle_stamp = 0; +} + static void ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, struct rq_flags *rf) @@ -3655,18 +3673,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, p->sched_class->task_woken(rq, p); rq_repin_lock(rq, rf); } - - if (rq->idle_stamp) { - u64 delta = rq_clock(rq) - rq->idle_stamp; - u64 max = 2*rq->max_idle_balance_cost; - - update_avg(&rq->avg_idle, delta); - - if (rq->avg_idle > max) - rq->avg_idle = max; - - rq->idle_stamp = 0; - } } /* @@ -6836,6 +6842,7 @@ static void __sched notrace __schedule(int sched_mode) pick_again: next = pick_next_task(rq, rq->donor, &rf); rq_set_donor(rq, next); + rq->next_class = next->sched_class; if (unlikely(task_is_blocked(next))) { next = find_proxy_task(rq, next, &rf); if (!next) @@ -7580,7 +7587,7 @@ int preempt_dynamic_mode = preempt_dynamic_undefined; int sched_dynamic_mode(const char *str) { -# ifndef CONFIG_PREEMPT_RT +# if !(defined(CONFIG_PREEMPT_RT) || defined(CONFIG_ARCH_HAS_PREEMPT_LAZY)) if (!strcmp(str, "none")) return preempt_dynamic_none; @@ -8512,6 +8519,9 @@ int sched_cpu_dying(unsigned int cpu) dump_rq_tasks(rq, KERN_WARNING); } dl_server_stop(&rq->fair_server); +#ifdef CONFIG_SCHED_CLASS_EXT + dl_server_stop(&rq->ext_server); +#endif rq_unlock_irqrestore(rq, &rf); calc_load_migrate(rq); @@ -8687,6 +8697,8 @@ void __init sched_init(void) rq->rt.rt_runtime = global_rt_runtime(); init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif + rq->next_class = &idle_sched_class; + rq->sd = NULL; rq->rd = NULL; rq->cpu_capacity = SCHED_CAPACITY_SCALE; @@ -8715,6 +8727,9 @@ void __init sched_init(void) hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); fair_server_init(rq); +#ifdef CONFIG_SCHED_CLASS_EXT + ext_server_init(rq); +#endif #ifdef CONFIG_SCHED_CORE rq->core = rq; @@ -9146,6 +9161,7 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup) { unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; bool resched = false; + bool queued = false; struct rq *rq; CLASS(task_rq_lock, rq_guard)(tsk); @@ -9157,10 +9173,13 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup) scx_cgroup_move_task(tsk); if (scope->running) resched = true; + queued = scope->queued; } if (resched) resched_curr(rq); + else if (queued) + wakeup_preempt(rq, tsk, 0); __balance_callbacks(rq, &rq_guard.rf); } @@ -10883,13 +10902,12 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags |= DEQUEUE_NOCLOCK; } - if (flags & DEQUEUE_CLASS) { - if (p->sched_class->switching_from) - p->sched_class->switching_from(rq, p); - } + if ((flags & DEQUEUE_CLASS) && p->sched_class->switching_from) + p->sched_class->switching_from(rq, p); *ctx = (struct sched_change_ctx){ .p = p, + .class = p->sched_class, .flags = flags, .queued = task_on_rq_queued(p), .running = task_current_donor(rq, p), @@ -10920,6 +10938,11 @@ void sched_change_end(struct sched_change_ctx *ctx) lockdep_assert_rq_held(rq); + /* + * Changing class without *QUEUE_CLASS is bad. 
+ */ + WARN_ON_ONCE(p->sched_class != ctx->class && !(ctx->flags & ENQUEUE_CLASS)); + if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to) p->sched_class->switching_to(rq, p); @@ -10931,6 +10954,25 @@ void sched_change_end(struct sched_change_ctx *ctx) if (ctx->flags & ENQUEUE_CLASS) { if (p->sched_class->switched_to) p->sched_class->switched_to(rq, p); + + if (ctx->running) { + /* + * If this was a class promotion; let the old class + * know it got preempted. Note that none of the + * switch*_from() methods know the new class and none + * of the switch*_to() methods know the old class. + */ + if (sched_class_above(p->sched_class, ctx->class)) { + rq->next_class->wakeup_preempt(rq, p, 0); + rq->next_class = p->sched_class; + } + /* + * If this was a degradation in class; make sure to + * reschedule. + */ + if (sched_class_above(ctx->class, p->sched_class)) + resched_curr(rq); + } } else { p->sched_class->prio_changed(rq, p, ctx->prio); } diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 0ab5f9d4bc59..cfc40181f66e 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -682,7 +682,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) "sugov:%d", cpumask_first(policy->related_cpus)); if (IS_ERR(thread)) { - pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread)); + pr_err("failed to create sugov thread: %pe\n", thread); return PTR_ERR(thread); } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 4f97896887ec..ff0dfca95420 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -12,6 +12,8 @@ #ifdef CONFIG_IRQ_TIME_ACCOUNTING +DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime); + /* * There are no locks covering percpu hardirq/softirq time. * They are only modified in vtime_account, on corresponding CPU @@ -25,16 +27,15 @@ */ DEFINE_PER_CPU(struct irqtime, cpu_irqtime); -int sched_clock_irqtime; - void enable_sched_clock_irqtime(void) { - sched_clock_irqtime = 1; + static_branch_enable(&sched_clock_irqtime); } void disable_sched_clock_irqtime(void) { - sched_clock_irqtime = 0; + if (irqtime_enabled()) + static_branch_disable(&sched_clock_irqtime); } static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 7bcde7114f1b..d08b00429323 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1449,8 +1449,8 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 dl_se->dl_defer_idle = 0; /* - * The fair server can consume its runtime while throttled (not queued/ - * running as regular CFS). + * The DL server can consume its runtime while throttled (not + * queued / running as regular CFS). * * If the server consumes its entire runtime in this state. The server * is not required for the current period. Thus, reset the server by @@ -1535,10 +1535,10 @@ throttle: } /* - * The fair server (sole dl_server) does not account for real-time - * workload because it is running fair work. + * The dl_server does not account for real-time workload because it + * is running fair work. */ - if (dl_se == &rq->fair_server) + if (dl_se->dl_server) return; #ifdef CONFIG_RT_GROUP_SCHED @@ -1573,9 +1573,9 @@ throttle: * In the non-defer mode, the idle time is not accounted, as the * server provides a guarantee. 
* - * If the dl_server is in defer mode, the idle time is also considered - * as time available for the fair server, avoiding a penalty for the - * rt scheduler that did not consumed that time. + * If the dl_server is in defer mode, the idle time is also considered as + * time available for the dl_server, avoiding a penalty for the rt + * scheduler that did not consumed that time. */ void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec) { @@ -1799,7 +1799,7 @@ void dl_server_start(struct sched_dl_entity *dl_se) struct rq *rq = dl_se->rq; dl_se->dl_defer_idle = 0; - if (!dl_server(dl_se) || dl_se->dl_server_active) + if (!dl_server(dl_se) || dl_se->dl_server_active || !dl_se->dl_runtime) return; /* @@ -1860,6 +1860,18 @@ void sched_init_dl_servers(void) dl_se->dl_server = 1; dl_se->dl_defer = 1; setup_new_dl_entity(dl_se); + +#ifdef CONFIG_SCHED_CLASS_EXT + dl_se = &rq->ext_server; + + WARN_ON(dl_server(dl_se)); + + dl_server_apply_params(dl_se, runtime, period, 1); + + dl_se->dl_server = 1; + dl_se->dl_defer = 1; + setup_new_dl_entity(dl_se); +#endif } } @@ -1886,7 +1898,6 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio int cpu = cpu_of(rq); struct dl_bw *dl_b; unsigned long cap; - int retval = 0; int cpus; dl_b = dl_bw_of(cpu); @@ -1918,7 +1929,7 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); - return retval; + return 0; } /* @@ -2515,9 +2526,16 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) * Only called when both the current and waking task are -deadline * tasks. */ -static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, - int flags) +static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags) { + /* + * Can only get preempted by stop-class, and those should be + * few and short lived, doesn't really make sense to push + * anything away for that. + */ + if (p->sched_class != &dl_sched_class) + return; + if (dl_entity_preempt(&p->dl, &rq->donor->dl)) { resched_curr(rq); return; @@ -3191,6 +3209,36 @@ void dl_add_task_root_domain(struct task_struct *p) raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); } +static void dl_server_add_bw(struct root_domain *rd, int cpu) +{ + struct sched_dl_entity *dl_se; + + dl_se = &cpu_rq(cpu)->fair_server; + if (dl_server(dl_se) && cpu_active(cpu)) + __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu)); + +#ifdef CONFIG_SCHED_CLASS_EXT + dl_se = &cpu_rq(cpu)->ext_server; + if (dl_server(dl_se) && cpu_active(cpu)) + __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu)); +#endif +} + +static u64 dl_server_read_bw(int cpu) +{ + u64 dl_bw = 0; + + if (cpu_rq(cpu)->fair_server.dl_server) + dl_bw += cpu_rq(cpu)->fair_server.dl_bw; + +#ifdef CONFIG_SCHED_CLASS_EXT + if (cpu_rq(cpu)->ext_server.dl_server) + dl_bw += cpu_rq(cpu)->ext_server.dl_bw; +#endif + + return dl_bw; +} + void dl_clear_root_domain(struct root_domain *rd) { int i; @@ -3209,12 +3257,8 @@ void dl_clear_root_domain(struct root_domain *rd) * dl_servers are not tasks. Since dl_add_task_root_domain ignores * them, we need to account for them here explicitly. 
*/ - for_each_cpu(i, rd->span) { - struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server; - - if (dl_server(dl_se) && cpu_active(i)) - __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(i)); - } + for_each_cpu(i, rd->span) + dl_server_add_bw(rd, i); } void dl_clear_root_domain_cpu(int cpu) @@ -3360,9 +3404,6 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu) #endif DEFINE_SCHED_CLASS(dl) = { - - .queue_mask = 8, - .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, @@ -3656,6 +3697,9 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se) dl_se->dl_non_contending = 0; dl_se->dl_overrun = 0; dl_se->dl_server = 0; + dl_se->dl_defer = 0; + dl_se->dl_defer_running = 0; + dl_se->dl_defer_armed = 0; #ifdef CONFIG_RT_MUTEXES dl_se->pi_se = dl_se; @@ -3713,7 +3757,7 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) unsigned long flags, cap; struct dl_bw *dl_b; bool overflow = 0; - u64 fair_server_bw = 0; + u64 dl_server_bw = 0; rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); @@ -3746,27 +3790,26 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) cap -= arch_scale_cpu_capacity(cpu); /* - * cpu is going offline and NORMAL tasks will be moved away - * from it. We can thus discount dl_server bandwidth - * contribution as it won't need to be servicing tasks after - * the cpu is off. + * cpu is going offline and NORMAL and EXT tasks will be + * moved away from it. We can thus discount dl_server + * bandwidth contribution as it won't need to be servicing + * tasks after the cpu is off. */ - if (cpu_rq(cpu)->fair_server.dl_server) - fair_server_bw = cpu_rq(cpu)->fair_server.dl_bw; + dl_server_bw = dl_server_read_bw(cpu); /* * Not much to check if no DEADLINE bandwidth is present. * dl_servers we can discount, as tasks will be moved out the * offlined CPUs anyway. */ - if (dl_b->total_bw - fair_server_bw > 0) { + if (dl_b->total_bw - dl_server_bw > 0) { /* * Leaving at least one CPU for DEADLINE tasks seems a * wise thing to do. As said above, cpu is not offline * yet, so account for that. 
*/ if (dl_bw_cpus(cpu) - 1) - overflow = __dl_overflow(dl_b, cap, fair_server_bw, 0); + overflow = __dl_overflow(dl_b, cap, dl_server_bw, 0); else overflow = 1; } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 41caa22e0680..b24f40f05019 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -172,18 +172,12 @@ static const struct file_operations sched_feat_fops = { static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - char buf[16]; unsigned int scaling; + int ret; - if (cnt > 15) - cnt = 15; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - buf[cnt] = '\0'; - - if (kstrtouint(buf, 10, &scaling)) - return -EINVAL; + ret = kstrtouint_from_user(ubuf, cnt, 10, &scaling); + if (ret) + return ret; if (scaling >= SCHED_TUNABLESCALING_END) return -EINVAL; @@ -243,7 +237,7 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, static int sched_dynamic_show(struct seq_file *m, void *v) { - int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2; + int i = (IS_ENABLED(CONFIG_PREEMPT_RT) || IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY)) * 2; int j; /* Count entries in NULL terminated preempt_modes */ @@ -336,17 +330,19 @@ enum dl_param { DL_PERIOD, }; -static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */ -static unsigned long fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */ +static unsigned long dl_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */ +static unsigned long dl_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */ -static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos, enum dl_param param) +static ssize_t sched_server_write_common(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos, enum dl_param param, + void *server) { long cpu = (long) ((struct seq_file *) filp->private_data)->private; + struct sched_dl_entity *dl_se = (struct sched_dl_entity *)server; + u64 old_runtime, runtime, period; struct rq *rq = cpu_rq(cpu); - u64 runtime, period; + int retval = 0; size_t err; - int retval; u64 value; err = kstrtoull_from_user(ubuf, cnt, 10, &value); @@ -354,8 +350,8 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu return err; scoped_guard (rq_lock_irqsave, rq) { - runtime = rq->fair_server.dl_runtime; - period = rq->fair_server.dl_period; + old_runtime = runtime = dl_se->dl_runtime; + period = dl_se->dl_period; switch (param) { case DL_RUNTIME: @@ -371,60 +367,68 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu } if (runtime > period || - period > fair_server_period_max || - period < fair_server_period_min) { + period > dl_server_period_max || + period < dl_server_period_min) { return -EINVAL; } update_rq_clock(rq); - dl_server_stop(&rq->fair_server); - - retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0); - if (retval) - cnt = retval; + dl_server_stop(dl_se); + retval = dl_server_apply_params(dl_se, runtime, period, 0); + dl_server_start(dl_se); - if (!runtime) - printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n", - cpu_of(rq)); + if (retval < 0) + return retval; + } - if (rq->cfs.h_nr_queued) - dl_server_start(&rq->fair_server); + if (!!old_runtime ^ !!runtime) { + pr_info("%s server %sabled on CPU %d%s.\n", + server == &rq->fair_server ? "Fair" : "Ext", + runtime ? "en" : "dis", + cpu_of(rq), + runtime ? 
"" : ", system may malfunction due to starvation"); } *ppos += cnt; return cnt; } -static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param) +static size_t sched_server_show_common(struct seq_file *m, void *v, enum dl_param param, + void *server) { - unsigned long cpu = (unsigned long) m->private; - struct rq *rq = cpu_rq(cpu); + struct sched_dl_entity *dl_se = (struct sched_dl_entity *)server; u64 value; switch (param) { case DL_RUNTIME: - value = rq->fair_server.dl_runtime; + value = dl_se->dl_runtime; break; case DL_PERIOD: - value = rq->fair_server.dl_period; + value = dl_se->dl_period; break; } seq_printf(m, "%llu\n", value); return 0; - } static ssize_t sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME); + long cpu = (long) ((struct seq_file *) filp->private_data)->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_write_common(filp, ubuf, cnt, ppos, DL_RUNTIME, + &rq->fair_server); } static int sched_fair_server_runtime_show(struct seq_file *m, void *v) { - return sched_fair_server_show(m, v, DL_RUNTIME); + unsigned long cpu = (unsigned long) m->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_show_common(m, v, DL_RUNTIME, &rq->fair_server); } static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp) @@ -440,16 +444,57 @@ static const struct file_operations fair_server_runtime_fops = { .release = single_release, }; +#ifdef CONFIG_SCHED_CLASS_EXT +static ssize_t +sched_ext_server_runtime_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + long cpu = (long) ((struct seq_file *) filp->private_data)->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_write_common(filp, ubuf, cnt, ppos, DL_RUNTIME, + &rq->ext_server); +} + +static int sched_ext_server_runtime_show(struct seq_file *m, void *v) +{ + unsigned long cpu = (unsigned long) m->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_show_common(m, v, DL_RUNTIME, &rq->ext_server); +} + +static int sched_ext_server_runtime_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_ext_server_runtime_show, inode->i_private); +} + +static const struct file_operations ext_server_runtime_fops = { + .open = sched_ext_server_runtime_open, + .write = sched_ext_server_runtime_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_SCHED_CLASS_EXT */ + static ssize_t sched_fair_server_period_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD); + long cpu = (long) ((struct seq_file *) filp->private_data)->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_write_common(filp, ubuf, cnt, ppos, DL_PERIOD, + &rq->fair_server); } static int sched_fair_server_period_show(struct seq_file *m, void *v) { - return sched_fair_server_show(m, v, DL_PERIOD); + unsigned long cpu = (unsigned long) m->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_show_common(m, v, DL_PERIOD, &rq->fair_server); } static int sched_fair_server_period_open(struct inode *inode, struct file *filp) @@ -465,6 +510,40 @@ static const struct file_operations fair_server_period_fops = { .release = single_release, }; +#ifdef CONFIG_SCHED_CLASS_EXT +static ssize_t +sched_ext_server_period_write(struct file *filp, const char __user *ubuf, + size_t cnt, 
loff_t *ppos) +{ + long cpu = (long) ((struct seq_file *) filp->private_data)->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_write_common(filp, ubuf, cnt, ppos, DL_PERIOD, + &rq->ext_server); +} + +static int sched_ext_server_period_show(struct seq_file *m, void *v) +{ + unsigned long cpu = (unsigned long) m->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_show_common(m, v, DL_PERIOD, &rq->ext_server); +} + +static int sched_ext_server_period_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_ext_server_period_show, inode->i_private); +} + +static const struct file_operations ext_server_period_fops = { + .open = sched_ext_server_period_open, + .write = sched_ext_server_period_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_SCHED_CLASS_EXT */ + static struct dentry *debugfs_sched; static void debugfs_fair_server_init(void) @@ -488,6 +567,29 @@ static void debugfs_fair_server_init(void) } } +#ifdef CONFIG_SCHED_CLASS_EXT +static void debugfs_ext_server_init(void) +{ + struct dentry *d_ext; + unsigned long cpu; + + d_ext = debugfs_create_dir("ext_server", debugfs_sched); + if (!d_ext) + return; + + for_each_possible_cpu(cpu) { + struct dentry *d_cpu; + char buf[32]; + + snprintf(buf, sizeof(buf), "cpu%lu", cpu); + d_cpu = debugfs_create_dir(buf, d_ext); + + debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &ext_server_runtime_fops); + debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &ext_server_period_fops); + } +} +#endif /* CONFIG_SCHED_CLASS_EXT */ + static __init int sched_init_debug(void) { struct dentry __maybe_unused *numa; @@ -526,6 +628,9 @@ static __init int sched_init_debug(void) debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); debugfs_fair_server_init(); +#ifdef CONFIG_SCHED_CLASS_EXT + debugfs_ext_server_init(); +#endif return 0; } diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7ccd84c17792..e6bf73456176 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -959,6 +959,8 @@ static void update_curr_scx(struct rq *rq) if (!curr->scx.slice) touch_core_sched(rq, curr); } + + dl_server_update(&rq->ext_server, delta_exec); } static bool scx_dsq_priq_less(struct rb_node *node_a, @@ -1502,6 +1504,10 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags if (enq_flags & SCX_ENQ_WAKEUP) touch_core_sched(rq, p); + /* Start dl_server if this is the first task being enqueued */ + if (rq->scx.nr_running == 1) + dl_server_start(&rq->ext_server); + do_enqueue_task(rq, p, enq_flags, sticky_cpu); out: rq->scx.flags &= ~SCX_RQ_IN_WAKEUP; @@ -2454,7 +2460,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) /* see kick_cpus_irq_workfn() */ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); - rq_modified_clear(rq); + rq->next_class = &ext_sched_class; rq_unpin_lock(rq, rf); balance_one(rq, prev); @@ -2469,7 +2475,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) * If @force_scx is true, always try to pick a SCHED_EXT task, * regardless of any higher-priority sched classes activity. 
*/ - if (!force_scx && rq_modified_above(rq, &ext_sched_class)) + if (!force_scx && sched_class_above(rq->next_class, &ext_sched_class)) return RETRY_TASK; keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; @@ -2513,6 +2519,33 @@ static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) return do_pick_task_scx(rq, rf, false); } +/* + * Select the next task to run from the ext scheduling class. + * + * Use do_pick_task_scx() directly with @force_scx enabled, since the + * dl_server must always select a sched_ext task. + */ +static struct task_struct * +ext_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf) +{ + if (!scx_enabled()) + return NULL; + + return do_pick_task_scx(dl_se->rq, rf, true); +} + +/* + * Initialize the ext server deadline entity. + */ +void ext_server_init(struct rq *rq) +{ + struct sched_dl_entity *dl_se = &rq->ext_server; + + init_dl_entity(dl_se); + + dl_server_init(dl_se, rq, ext_server_pick_task); +} + #ifdef CONFIG_SCHED_CORE /** * scx_prio_less - Task ordering for core-sched @@ -3141,7 +3174,8 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p) scx_disable_task(p); } -static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} +static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {} + static void switched_to_scx(struct rq *rq, struct task_struct *p) {} int scx_check_setscheduler(struct task_struct *p, int policy) @@ -3402,8 +3436,6 @@ static void scx_cgroup_unlock(void) {} * their current sched_class. Call them directly from sched core instead. */ DEFINE_SCHED_CLASS(ext) = { - .queue_mask = 1, - .enqueue_task = enqueue_task_scx, .dequeue_task = dequeue_task_scx, .yield_task = yield_task_scx, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c3502c3c7b88..1e22b7fadd70 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -524,10 +524,48 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); * Scheduling class tree data structure manipulation methods: */ +extern void __BUILD_BUG_vruntime_cmp(void); + +/* Use __builtin_strcmp() because of __HAVE_ARCH_STRCMP: */ + +#define vruntime_cmp(A, CMP_STR, B) ({ \ + int __res = 0; \ + \ + if (!__builtin_strcmp(CMP_STR, "<")) { \ + __res = ((s64)((A)-(B)) < 0); \ + } else if (!__builtin_strcmp(CMP_STR, "<=")) { \ + __res = ((s64)((A)-(B)) <= 0); \ + } else if (!__builtin_strcmp(CMP_STR, ">")) { \ + __res = ((s64)((A)-(B)) > 0); \ + } else if (!__builtin_strcmp(CMP_STR, ">=")) { \ + __res = ((s64)((A)-(B)) >= 0); \ + } else { \ + /* Unknown operator throws linker error: */ \ + __BUILD_BUG_vruntime_cmp(); \ + } \ + \ + __res; \ +}) + +extern void __BUILD_BUG_vruntime_op(void); + +#define vruntime_op(A, OP_STR, B) ({ \ + s64 __res = 0; \ + \ + if (!__builtin_strcmp(OP_STR, "-")) { \ + __res = (s64)((A)-(B)); \ + } else { \ + /* Unknown operator throws linker error: */ \ + __BUILD_BUG_vruntime_op(); \ + } \ + \ + __res; \ +}) + + static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime) { - s64 delta = (s64)(vruntime - max_vruntime); - if (delta > 0) + if (vruntime_cmp(vruntime, ">", max_vruntime)) max_vruntime = vruntime; return max_vruntime; @@ -535,8 +573,7 @@ static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime) static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime) { - s64 delta = (s64)(vruntime - min_vruntime); - if (delta < 0) + if (vruntime_cmp(vruntime, "<", min_vruntime)) min_vruntime = vruntime; return 
min_vruntime; @@ -549,12 +586,12 @@ static inline bool entity_before(const struct sched_entity *a, * Tiebreak on vruntime seems unnecessary since it can * hardly happen. */ - return (s64)(a->deadline - b->deadline) < 0; + return vruntime_cmp(a->deadline, "<", b->deadline); } static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { - return (s64)(se->vruntime - cfs_rq->zero_vruntime); + return vruntime_op(se->vruntime, "-", cfs_rq->zero_vruntime); } #define __node_2_se(node) \ @@ -576,7 +613,7 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * * \Sum lag_i = 0 * \Sum w_i * (V - v_i) = 0 - * \Sum w_i * V - w_i * v_i = 0 + * \Sum (w_i * V - w_i * v_i) = 0 * * From which we can solve an expression for V in v_i (which we have in * se->vruntime): @@ -607,11 +644,11 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * Which we track using: * * v0 := cfs_rq->zero_vruntime - * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime - * \Sum w_i := cfs_rq->avg_load + * \Sum (v_i - v0) * w_i := cfs_rq->sum_w_vruntime + * \Sum w_i := cfs_rq->sum_weight * * Since zero_vruntime closely tracks the per-task service, these - * deltas: (v_i - v), will be in the order of the maximal (virtual) lag + * deltas: (v_i - v0), will be in the order of the maximal (virtual) lag * induced in the system due to quantisation. * * Also, we use scale_load_down() to reduce the size. @@ -619,32 +656,32 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * As measured, the max (key * weight) value was ~44 bits for a kernel build. */ static void -avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned long weight = scale_load_down(se->load.weight); s64 key = entity_key(cfs_rq, se); - cfs_rq->avg_vruntime += key * weight; - cfs_rq->avg_load += weight; + cfs_rq->sum_w_vruntime += key * weight; + cfs_rq->sum_weight += weight; } static void -avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) +sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned long weight = scale_load_down(se->load.weight); s64 key = entity_key(cfs_rq, se); - cfs_rq->avg_vruntime -= key * weight; - cfs_rq->avg_load -= weight; + cfs_rq->sum_w_vruntime -= key * weight; + cfs_rq->sum_weight -= weight; } static inline -void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) +void sum_w_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) { /* - * v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load + * v' = v + d ==> sum_w_vruntime' = sum_runtime - d*sum_weight */ - cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta; + cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta; } /* @@ -654,8 +691,8 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) u64 avg_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - s64 avg = cfs_rq->avg_vruntime; - long load = cfs_rq->avg_load; + s64 avg = cfs_rq->sum_w_vruntime; + long load = cfs_rq->sum_weight; if (curr && curr->on_rq) { unsigned long weight = scale_load_down(curr->load.weight); @@ -722,8 +759,8 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) { struct sched_entity *curr = cfs_rq->curr; - s64 avg = cfs_rq->avg_vruntime; - long load = cfs_rq->avg_load; + s64 avg = cfs_rq->sum_w_vruntime; + long load = cfs_rq->sum_weight; if (curr && curr->on_rq) { unsigned long weight = 
scale_load_down(curr->load.weight); @@ -732,7 +769,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) load += weight; } - return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load; + return avg >= vruntime_op(vruntime, "-", cfs_rq->zero_vruntime) * load; } int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -743,9 +780,9 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) static void update_zero_vruntime(struct cfs_rq *cfs_rq) { u64 vruntime = avg_vruntime(cfs_rq); - s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime); + s64 delta = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime); - avg_vruntime_update(cfs_rq, delta); + sum_w_vruntime_update(cfs_rq, delta); cfs_rq->zero_vruntime = vruntime; } @@ -770,13 +807,12 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) return entity_before(__node_2_se(a), __node_2_se(b)); } -#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; }) - static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node) { if (node) { struct sched_entity *rse = __node_2_se(node); - if (vruntime_gt(min_vruntime, se, rse)) + + if (vruntime_cmp(se->min_vruntime, ">", rse->min_vruntime)) se->min_vruntime = rse->min_vruntime; } } @@ -819,7 +855,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - avg_vruntime_add(cfs_rq, se); + sum_w_vruntime_add(cfs_rq, se); update_zero_vruntime(cfs_rq); se->min_vruntime = se->vruntime; se->min_slice = se->slice; @@ -831,7 +867,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, &min_vruntime_cb); - avg_vruntime_sub(cfs_rq, se); + sum_w_vruntime_sub(cfs_rq, se); update_zero_vruntime(cfs_rq); } @@ -887,7 +923,7 @@ static inline void update_protect_slice(struct cfs_rq *cfs_rq, struct sched_enti static inline bool protect_slice(struct sched_entity *se) { - return ((s64)(se->vprot - se->vruntime) > 0); + return vruntime_cmp(se->vruntime, "<", se->vprot); } static inline void cancel_protect_slice(struct sched_entity *se) @@ -1024,7 +1060,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); */ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if ((s64)(se->vruntime - se->deadline) < 0) + if (vruntime_cmp(se->vruntime, "<", se->deadline)) return false; /* @@ -1513,7 +1549,7 @@ static unsigned int task_scan_start(struct task_struct *p) /* Scale the maximum scan period with the amount of shared memory. 
*/ rcu_read_lock(); - ng = rcu_dereference(p->numa_group); + ng = rcu_dereference_all(p->numa_group); if (ng) { unsigned long shared = group_faults_shared(ng); unsigned long private = group_faults_priv(ng); @@ -1580,7 +1616,7 @@ pid_t task_numa_group_id(struct task_struct *p) pid_t gid = 0; rcu_read_lock(); - ng = rcu_dereference(p->numa_group); + ng = rcu_dereference_all(p->numa_group); if (ng) gid = ng->gid; rcu_read_unlock(); @@ -2239,7 +2275,7 @@ static bool task_numa_compare(struct task_numa_env *env, return false; rcu_read_lock(); - cur = rcu_dereference(dst_rq->curr); + cur = rcu_dereference_all(dst_rq->curr); if (cur && ((cur->flags & (PF_EXITING | PF_KTHREAD)) || !cur->mm)) cur = NULL; @@ -2284,7 +2320,7 @@ static bool task_numa_compare(struct task_numa_env *env, * If dst and source tasks are in the same NUMA group, or not * in any group then look only at task weights. */ - cur_ng = rcu_dereference(cur->numa_group); + cur_ng = rcu_dereference_all(cur->numa_group); if (cur_ng == p_ng) { /* * Do not swap within a group or between tasks that have @@ -2458,11 +2494,8 @@ static void task_numa_find_cpu(struct task_numa_env *env, maymove = !load_too_imbalanced(src_load, dst_load, env); } - for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { - /* Skip this CPU if the source task cannot migrate */ - if (!cpumask_test_cpu(cpu, env->p->cpus_ptr)) - continue; - + /* Skip CPUs if the source task cannot migrate */ + for_each_cpu_and(cpu, cpumask_of_node(env->dst_nid), env->p->cpus_ptr) { env->dst_cpu = cpu; if (task_numa_compare(env, taskimp, groupimp, maymove)) break; @@ -2499,7 +2532,7 @@ static int task_numa_migrate(struct task_struct *p) * to satisfy here. */ rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); + sd = rcu_dereference_all(per_cpu(sd_numa, env.src_cpu)); if (sd) { env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; env.imb_numa_nr = sd->imb_numa_nr; @@ -3023,7 +3056,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (!cpupid_match_pid(tsk, cpupid)) goto no_join; - grp = rcu_dereference(tsk->numa_group); + grp = rcu_dereference_all(tsk->numa_group); if (!grp) goto no_join; @@ -3694,7 +3727,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) */ #define add_positive(_ptr, _val) do { \ typeof(_ptr) ptr = (_ptr); \ - typeof(_val) val = (_val); \ + __signed_scalar_typeof(*ptr) val = (_val); \ typeof(*ptr) res, var = READ_ONCE(*ptr); \ \ res = var + val; \ @@ -3706,23 +3739,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } while (0) /* - * Unsigned subtract and clamp on underflow. - * - * Explicitly do a load-store to ensure the intermediate value never hits - * memory. This allows lockless observations without ever seeing the negative - * values. - */ -#define sub_positive(_ptr, _val) do { \ - typeof(_ptr) ptr = (_ptr); \ - typeof(*ptr) val = (_val); \ - typeof(*ptr) res, var = READ_ONCE(*ptr); \ - res = var - val; \ - if (res > var) \ - res = 0; \ - WRITE_ONCE(*ptr, res); \ -} while (0) - -/* * Remove and clamp on negative, from a local variable. * * A variant of sub_positive(), which does not use explicit load-store @@ -3733,21 +3749,39 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) *ptr -= min_t(typeof(*ptr), *ptr, _val); \ } while (0) + +/* + * Because of rounding, se->util_sum might ends up being +1 more than + * cfs->util_sum. 
Although this is not a problem by itself, detaching + * a lot of tasks with the rounding problem between 2 updates of + * util_avg (~1ms) can make cfs->util_sum becoming null whereas + * cfs_util_avg is not. + * + * Check that util_sum is still above its lower bound for the new + * util_avg. Given that period_contrib might have moved since the last + * sync, we are only sure that util_sum must be above or equal to + * util_avg * minimum possible divider + */ +#define __update_sa(sa, name, delta_avg, delta_sum) do { \ + add_positive(&(sa)->name##_avg, delta_avg); \ + add_positive(&(sa)->name##_sum, delta_sum); \ + (sa)->name##_sum = max_t(typeof((sa)->name##_sum), \ + (sa)->name##_sum, \ + (sa)->name##_avg * PELT_MIN_DIVIDER); \ +} while (0) + static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - cfs_rq->avg.load_avg += se->avg.load_avg; - cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; + __update_sa(&cfs_rq->avg, load, se->avg.load_avg, + se_weight(se) * se->avg.load_sum); } static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); - sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, - cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, load, -se->avg.load_avg, + se_weight(se) * -se->avg.load_sum); } static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); @@ -4243,7 +4277,6 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq */ divider = get_pelt_divider(&cfs_rq->avg); - /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; new_sum = se->avg.util_avg * divider; @@ -4251,12 +4284,7 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq se->avg.util_sum = new_sum; /* Update parent cfs_rq utilization */ - add_positive(&cfs_rq->avg.util_avg, delta_avg); - add_positive(&cfs_rq->avg.util_sum, delta_sum); - - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, - cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, util, delta_avg, delta_sum); } static inline void @@ -4282,11 +4310,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf se->avg.runnable_sum = new_sum; /* Update parent cfs_rq runnable */ - add_positive(&cfs_rq->avg.runnable_avg, delta_avg); - add_positive(&cfs_rq->avg.runnable_sum, delta_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, - cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, runnable, delta_avg, delta_sum); } static inline void @@ -4350,11 +4374,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq se->avg.load_sum = runnable_sum; se->avg.load_avg = load_avg; - add_positive(&cfs_rq->avg.load_avg, delta_avg); - add_positive(&cfs_rq->avg.load_sum, delta_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, - cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, load, delta_avg, delta_sum); } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -4451,7 +4471,7 @@ static inline void migrate_se_pelt_lag(struct sched_entity *se) rq = rq_of(cfs_rq); rcu_read_lock(); - is_idle = 
is_idle_task(rcu_dereference(rq->curr)); + is_idle = is_idle_task(rcu_dereference_all(rq->curr)); rcu_read_unlock(); /* @@ -4553,33 +4573,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) raw_spin_unlock(&cfs_rq->removed.lock); r = removed_load; - sub_positive(&sa->load_avg, r); - sub_positive(&sa->load_sum, r * divider); - /* See sa->util_sum below */ - sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER); + __update_sa(sa, load, -r, -r*divider); r = removed_util; - sub_positive(&sa->util_avg, r); - sub_positive(&sa->util_sum, r * divider); - /* - * Because of rounding, se->util_sum might ends up being +1 more than - * cfs->util_sum. Although this is not a problem by itself, detaching - * a lot of tasks with the rounding problem between 2 updates of - * util_avg (~1ms) can make cfs->util_sum becoming null whereas - * cfs_util_avg is not. - * Check that util_sum is still above its lower bound for the new - * util_avg. Given that period_contrib might have moved since the last - * sync, we are only sure that util_sum must be above or equal to - * util_avg * minimum possible divider - */ - sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER); + __update_sa(sa, util, -r, -r*divider); r = removed_runnable; - sub_positive(&sa->runnable_avg, r); - sub_positive(&sa->runnable_sum, r * divider); - /* See sa->util_sum above */ - sa->runnable_sum = max_t(u32, sa->runnable_sum, - sa->runnable_avg * PELT_MIN_DIVIDER); + __update_sa(sa, runnable, -r, -r*divider); /* * removed_runnable is the unweighted version of removed_load so we @@ -4664,17 +4664,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { dequeue_load_avg(cfs_rq, se); - sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); - sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, - cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); - - sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); - sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, - cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, util, -se->avg.util_avg, -se->avg.util_sum); + __update_sa(&cfs_rq->avg, runnable, -se->avg.runnable_avg, -se->avg.runnable_sum); add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); @@ -5177,7 +5168,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * * vl_i = (W + w_i)*vl'_i / W */ - load = cfs_rq->avg_load; + load = cfs_rq->sum_weight; if (curr && curr->on_rq) load += scale_load_down(curr->load.weight); @@ -7150,8 +7141,7 @@ static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask); static struct { cpumask_var_t idle_cpus_mask; - atomic_t nr_cpus; - int has_blocked; /* Idle CPUS has blocked load */ + int has_blocked_load; /* Idle CPUS has blocked load */ int needs_update; /* Newly idle CPUs need their next_balance collated */ unsigned long next_balance; /* in jiffy units */ unsigned long next_blocked; /* Next update of blocked load in jiffies */ @@ -7509,7 +7499,7 @@ static inline void set_idle_cores(int cpu, int val) { struct sched_domain_shared *sds; - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); if (sds) WRITE_ONCE(sds->has_idle_cores, val); } @@ -7518,7 +7508,7 
@@ static inline bool test_idle_cores(int cpu) { struct sched_domain_shared *sds; - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); if (sds) return READ_ONCE(sds->has_idle_cores); @@ -7647,7 +7637,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); if (sched_feat(SIS_UTIL)) { - sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); + sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, target)); if (sd_share) { /* because !--nr is the condition to stop scan */ nr = READ_ONCE(sd_share->nr_idle_scan) + 1; @@ -7853,7 +7843,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * sd_asym_cpucapacity rather than sd_llc. */ if (sched_asym_cpucap_active()) { - sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); + sd = rcu_dereference_all(per_cpu(sd_asym_cpucapacity, target)); /* * On an asymmetric CPU capacity system where an exclusive * cpuset defines a symmetric island (i.e. one unique @@ -7868,7 +7858,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) } } - sd = rcu_dereference(per_cpu(sd_llc, target)); + sd = rcu_dereference_all(per_cpu(sd_llc, target)); if (!sd) return target; @@ -8337,7 +8327,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) struct energy_env eenv; rcu_read_lock(); - pd = rcu_dereference(rd->pd); + pd = rcu_dereference_all(rd->pd); if (!pd) goto unlock; @@ -8345,7 +8335,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) * Energy-aware wake-up happens on the lowest sched_domain starting * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu. */ - sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity)); + sd = rcu_dereference_all(*this_cpu_ptr(&sd_asym_cpucapacity)); while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) sd = sd->parent; if (!sd) @@ -8368,9 +8358,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) int max_spare_cap_cpu = -1; int fits, max_fits = -1; - cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); - - if (cpumask_empty(cpus)) + if (!cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask)) continue; /* Account external pressure for the energy estimation */ @@ -8747,7 +8735,7 @@ preempt_sync(struct rq *rq, int wake_flags, /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) +static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_flags) { enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK; struct task_struct *donor = rq->donor; @@ -8755,6 +8743,12 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int struct cfs_rq *cfs_rq = task_cfs_rq(donor); int cse_is_idle, pse_is_idle; + /* + * XXX Getting preempted by higher class, try and find idle CPU? 
+ */ + if (p->sched_class != &fair_sched_class) + return; + if (unlikely(se == pse)) return; @@ -9337,7 +9331,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) */ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { - struct numa_group *numa_group = rcu_dereference(p->numa_group); + struct numa_group *numa_group = rcu_dereference_all(p->numa_group); unsigned long src_weight, dst_weight; int src_nid, dst_nid, dist; @@ -9766,7 +9760,7 @@ static void attach_tasks(struct lb_env *env) } #ifdef CONFIG_NO_HZ_COMMON -static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) +static inline bool cfs_rq_has_blocked_load(struct cfs_rq *cfs_rq) { if (cfs_rq->avg.load_avg) return true; @@ -9799,16 +9793,16 @@ static inline void update_blocked_load_tick(struct rq *rq) WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies); } -static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) +static inline void update_has_blocked_load_status(struct rq *rq, bool has_blocked_load) { - if (!has_blocked) + if (!has_blocked_load) rq->has_blocked_load = 0; } #else /* !CONFIG_NO_HZ_COMMON: */ -static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } +static inline bool cfs_rq_has_blocked_load(struct cfs_rq *cfs_rq) { return false; } static inline bool others_have_blocked(struct rq *rq) { return false; } static inline void update_blocked_load_tick(struct rq *rq) {} -static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} +static inline void update_has_blocked_load_status(struct rq *rq, bool has_blocked_load) {} #endif /* !CONFIG_NO_HZ_COMMON */ static bool __update_blocked_others(struct rq *rq, bool *done) @@ -9865,7 +9859,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) list_del_leaf_cfs_rq(cfs_rq); /* Don't need periodic decay once load/util_avg are null */ - if (cfs_rq_has_blocked(cfs_rq)) + if (cfs_rq_has_blocked_load(cfs_rq)) *done = false; } @@ -9925,7 +9919,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) bool decayed; decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); - if (cfs_rq_has_blocked(cfs_rq)) + if (cfs_rq_has_blocked_load(cfs_rq)) *done = false; return decayed; @@ -9937,23 +9931,27 @@ static unsigned long task_h_load(struct task_struct *p) } #endif /* !CONFIG_FAIR_GROUP_SCHED */ -static void sched_balance_update_blocked_averages(int cpu) +static void __sched_balance_update_blocked_averages(struct rq *rq) { bool decayed = false, done = true; - struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; - rq_lock_irqsave(rq, &rf); update_blocked_load_tick(rq); - update_rq_clock(rq); decayed |= __update_blocked_others(rq, &done); decayed |= __update_blocked_fair(rq, &done); - update_blocked_load_status(rq, !done); + update_has_blocked_load_status(rq, !done); if (decayed) cpufreq_update_util(rq, 0); - rq_unlock_irqrestore(rq, &rf); +} + +static void sched_balance_update_blocked_averages(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + guard(rq_lock_irqsave)(rq); + update_rq_clock(rq); + __sched_balance_update_blocked_averages(rq); } /********** Helpers for sched_balance_find_src_group ************************/ @@ -10963,10 +10961,9 @@ sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int * take care of it. 
*/ if (p->nr_cpus_allowed != NR_CPUS) { - struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); - - cpumask_and(cpus, sched_group_span(local), p->cpus_ptr); - imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr); + unsigned int w = cpumask_weight_and(p->cpus_ptr, + sched_group_span(local)); + imb_numa_nr = min(w, sd->imb_numa_nr); } imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus); @@ -11013,7 +11010,7 @@ static void update_idle_cpu_scan(struct lb_env *env, if (env->sd->span_weight != llc_weight) return; - sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu)); + sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, env->dst_cpu)); if (!sd_share) return; @@ -11363,7 +11360,7 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env) goto force_balance; if (!is_rd_overutilized(env->dst_rq->rd) && - rcu_dereference(env->dst_rq->rd->pd)) + rcu_dereference_all(env->dst_rq->rd->pd)) goto out_balanced; /* ASYM feature bypasses nice load balance check */ @@ -12431,20 +12428,29 @@ static void nohz_balancer_kick(struct rq *rq) */ nohz_balance_exit_idle(rq); - /* - * None are in tickless mode and hence no need for NOHZ idle load - * balancing: - */ - if (likely(!atomic_read(&nohz.nr_cpus))) - return; - - if (READ_ONCE(nohz.has_blocked) && + if (READ_ONCE(nohz.has_blocked_load) && time_after(now, READ_ONCE(nohz.next_blocked))) flags = NOHZ_STATS_KICK; + /* + * Most of the time system is not 100% busy. i.e nohz.nr_cpus > 0 + * Skip the read if time is not due. + * + * If none are in tickless mode, there maybe a narrow window + * (28 jiffies, HZ=1000) where flags maybe set and kick_ilb called. + * But idle load balancing is not done as find_new_ilb fails. + * That's very rare. So read nohz.nr_cpus only if time is due. 
+ */ if (time_before(now, nohz.next_balance)) goto out; + /* + * None are in tickless mode and hence no need for NOHZ idle load + * balancing + */ + if (unlikely(cpumask_empty(nohz.idle_cpus_mask))) + return; + if (rq->nr_running >= 2) { flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto out; @@ -12452,7 +12458,7 @@ static void nohz_balancer_kick(struct rq *rq) rcu_read_lock(); - sd = rcu_dereference(rq->sd); + sd = rcu_dereference_all(rq->sd); if (sd) { /* * If there's a runnable CFS task and the current CPU has reduced @@ -12464,7 +12470,7 @@ static void nohz_balancer_kick(struct rq *rq) } } - sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); + sd = rcu_dereference_all(per_cpu(sd_asym_packing, cpu)); if (sd) { /* * When ASYM_PACKING; see if there's a more preferred CPU @@ -12482,7 +12488,7 @@ static void nohz_balancer_kick(struct rq *rq) } } - sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu)); + sd = rcu_dereference_all(per_cpu(sd_asym_cpucapacity, cpu)); if (sd) { /* * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU @@ -12503,7 +12509,7 @@ static void nohz_balancer_kick(struct rq *rq) goto unlock; } - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); if (sds) { /* * If there is an imbalance between LLC domains (IOW we could @@ -12535,7 +12541,7 @@ static void set_cpu_sd_state_busy(int cpu) struct sched_domain *sd; rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, cpu)); + sd = rcu_dereference_all(per_cpu(sd_llc, cpu)); if (!sd || !sd->nohz_idle) goto unlock; @@ -12555,7 +12561,6 @@ void nohz_balance_exit_idle(struct rq *rq) rq->nohz_tick_stopped = 0; cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); set_cpu_sd_state_busy(rq->cpu); } @@ -12565,7 +12570,7 @@ static void set_cpu_sd_state_idle(int cpu) struct sched_domain *sd; rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, cpu)); + sd = rcu_dereference_all(per_cpu(sd_llc, cpu)); if (!sd || sd->nohz_idle) goto unlock; @@ -12599,9 +12604,9 @@ void nohz_balance_enter_idle(int cpu) /* * The tick is still stopped but load could have been added in the - * meantime. We set the nohz.has_blocked flag to trig a check of the + * meantime. We set the nohz.has_blocked_load flag to trig a check of the * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear - * of nohz.has_blocked can only happen after checking the new load + * of nohz.has_blocked_load can only happen after checking the new load */ if (rq->nohz_tick_stopped) goto out; @@ -12613,11 +12618,10 @@ void nohz_balance_enter_idle(int cpu) rq->nohz_tick_stopped = 1; cpumask_set_cpu(cpu, nohz.idle_cpus_mask); - atomic_inc(&nohz.nr_cpus); /* * Ensures that if nohz_idle_balance() fails to observe our - * @idle_cpus_mask store, it must observe the @has_blocked + * @idle_cpus_mask store, it must observe the @has_blocked_load * and @needs_update stores. */ smp_mb__after_atomic(); @@ -12630,7 +12634,7 @@ out: * Each time a cpu enter idle, we assume that it has blocked load and * enable the periodic update of the load of idle CPUs */ - WRITE_ONCE(nohz.has_blocked, 1); + WRITE_ONCE(nohz.has_blocked_load, 1); } static bool update_nohz_stats(struct rq *rq) @@ -12671,8 +12675,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) /* * We assume there will be no idle load after this update and clear - * the has_blocked flag. If a cpu enters idle in the mean time, it will - * set the has_blocked flag and trigger another update of idle load. 
+ * the has_blocked_load flag. If a cpu enters idle in the mean time, it will + * set the has_blocked_load flag and trigger another update of idle load. * Because a cpu that becomes idle, is added to idle_cpus_mask before * setting the flag, we are sure to not clear the state and not * check the load of an idle cpu. @@ -12680,12 +12684,12 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) * Same applies to idle_cpus_mask vs needs_update. */ if (flags & NOHZ_STATS_KICK) - WRITE_ONCE(nohz.has_blocked, 0); + WRITE_ONCE(nohz.has_blocked_load, 0); if (flags & NOHZ_NEXT_KICK) WRITE_ONCE(nohz.needs_update, 0); /* - * Ensures that if we miss the CPU, we must see the has_blocked + * Ensures that if we miss the CPU, we must see the has_blocked_load * store from nohz_balance_enter_idle(). */ smp_mb(); @@ -12752,7 +12756,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) abort: /* There is still blocked load, enable periodic update */ if (has_blocked_load) - WRITE_ONCE(nohz.has_blocked, 1); + WRITE_ONCE(nohz.has_blocked_load, 1); } /* @@ -12814,7 +12818,7 @@ static void nohz_newidle_balance(struct rq *this_rq) return; /* Don't need to update blocked load of idle CPUs*/ - if (!READ_ONCE(nohz.has_blocked) || + if (!READ_ONCE(nohz.has_blocked_load) || time_before(jiffies, READ_ONCE(nohz.next_blocked))) return; @@ -12885,29 +12889,28 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) */ rq_unpin_lock(this_rq, rf); - rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(this_rq->sd); - if (!sd) { - rcu_read_unlock(); + sd = rcu_dereference_sched_domain(this_rq->sd); + if (!sd) goto out; - } if (!get_rd_overloaded(this_rq->rd) || this_rq->avg_idle < sd->max_newidle_lb_cost) { update_next_balance(sd, &next_balance); - rcu_read_unlock(); goto out; } - rcu_read_unlock(); - - rq_modified_clear(this_rq); - raw_spin_rq_unlock(this_rq); + /* + * Include sched_balance_update_blocked_averages() in the cost + * calculation because it can be quite costly -- this ensures we skip + * it when avg_idle gets to be very low. + */ t0 = sched_clock_cpu(this_cpu); - sched_balance_update_blocked_averages(this_cpu); + __sched_balance_update_blocked_averages(this_rq); + + this_rq->next_class = &fair_sched_class; + raw_spin_rq_unlock(this_rq); - rcu_read_lock(); for_each_domain(this_cpu, sd) { u64 domain_cost; @@ -12957,7 +12960,6 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) if (pulled_task || !continue_balancing) break; } - rcu_read_unlock(); raw_spin_rq_lock(this_rq); @@ -12973,7 +12975,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) pulled_task = 1; /* If a higher prio class was modified, restart the pick */ - if (rq_modified_above(this_rq, &fair_sched_class)) + if (sched_class_above(this_rq->next_class, &fair_sched_class)) pulled_task = -1; out: @@ -13324,8 +13326,8 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, * zero_vruntime_fi, which would have been updated in prior calls * to se_fi_update(). 
*/ - delta = (s64)(sea->vruntime - seb->vruntime) + - (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi); + delta = vruntime_op(sea->vruntime, "-", seb->vruntime) + + vruntime_op(cfs_rqb->zero_vruntime_fi, "-", cfs_rqa->zero_vruntime_fi); return delta > 0; } @@ -13363,6 +13365,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) entity_tick(cfs_rq, se, queued); } + if (queued) { + if (!need_resched()) + hrtick_start_fair(rq, curr); + return; + } + if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); @@ -13871,15 +13879,12 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task * All the scheduling class methods: */ DEFINE_SCHED_CLASS(fair) = { - - .queue_mask = 2, - .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, .yield_to_task = yield_to_task_fair, - .wakeup_preempt = check_preempt_wakeup_fair, + .wakeup_preempt = wakeup_preempt_fair, .pick_task = pick_task_fair, .pick_next_task = pick_next_task_fair, @@ -13939,7 +13944,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) struct numa_group *ng; rcu_read_lock(); - ng = rcu_dereference(p->numa_group); + ng = rcu_dereference_all(p->numa_group); for_each_online_node(node) { if (p->numa_faults) { tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)]; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index abf8f15d60c9..3681b6ad9276 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -460,6 +460,7 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct t { update_curr_idle(rq); scx_update_idle(rq, false, true); + update_rq_avg_idle(rq); } static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) @@ -536,15 +537,15 @@ static void update_curr_idle(struct rq *rq) se->exec_start = now; dl_server_update_idle(&rq->fair_server, delta_exec); +#ifdef CONFIG_SCHED_CLASS_EXT + dl_server_update_idle(&rq->ext_server, delta_exec); +#endif } /* * Simple, special scheduling class for the per-CPU idle tasks: */ DEFINE_SCHED_CLASS(idle) = { - - .queue_mask = 0, - /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f1867fe8e5c5..a7680477fa6f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1615,6 +1615,12 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *donor = rq->donor; + /* + * XXX If we're preempted by DL, queue a push? 
+ */ + if (p->sched_class != &rt_sched_class) + return; + if (p->prio < donor->prio) { resched_curr(rq); return; @@ -2100,6 +2106,7 @@ static void push_rt_tasks(struct rq *rq) */ static int rto_next_cpu(struct root_domain *rd) { + int this_cpu = smp_processor_id(); int next; int cpu; @@ -2123,6 +2130,10 @@ static int rto_next_cpu(struct root_domain *rd) rd->rto_cpu = cpu; + /* Do not send IPI to self */ + if (cpu == this_cpu) + continue; + if (cpu < nr_cpu_ids) return cpu; @@ -2568,9 +2579,6 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu) #endif /* CONFIG_SCHED_CORE */ DEFINE_SCHED_CLASS(rt) = { - - .queue_mask = 4, - .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 275370854481..62f9278b1663 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -418,6 +418,7 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, extern void sched_init_dl_servers(void); extern void fair_server_init(struct rq *rq); +extern void ext_server_init(struct rq *rq); extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq); extern int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init); @@ -674,16 +675,16 @@ struct balance_callback { void (*func)(struct rq *rq); }; -/* CFS-related fields in a runqueue */ +/* Fair scheduling SCHED_{NORMAL,BATCH,IDLE} related fields in a runqueue: */ struct cfs_rq { struct load_weight load; unsigned int nr_queued; - unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */ - unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */ - unsigned int h_nr_idle; /* SCHED_IDLE */ + unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_idle; /* SCHED_IDLE */ - s64 avg_vruntime; - u64 avg_load; + s64 sum_w_vruntime; + u64 sum_weight; u64 zero_vruntime; #ifdef CONFIG_SCHED_CORE @@ -694,7 +695,7 @@ struct cfs_rq { struct rb_root_cached tasks_timeline; /* - * 'curr' points to currently running entity on this cfs_rq. + * 'curr' points to the currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ struct sched_entity *curr; @@ -730,9 +731,7 @@ struct cfs_rq { unsigned long h_load; u64 last_h_load_update; struct sched_entity *h_load_next; -#endif /* CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ /* @@ -745,19 +744,19 @@ struct cfs_rq { */ int on_list; struct list_head leaf_cfs_rq_list; - struct task_group *tg; /* group that "owns" this runqueue */ + struct task_group *tg; /* Group that "owns" this runqueue */ /* Locally cached copy of our task_group's idle value */ int idle; -#ifdef CONFIG_CFS_BANDWIDTH +# ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; s64 runtime_remaining; u64 throttled_pelt_idle; -#ifndef CONFIG_64BIT +# ifndef CONFIG_64BIT u64 throttled_pelt_idle_copy; -#endif +# endif u64 throttled_clock; u64 throttled_clock_pelt; u64 throttled_clock_pelt_time; @@ -769,7 +768,7 @@ struct cfs_rq { struct list_head throttled_list; struct list_head throttled_csd_list; struct list_head throttled_limbo_list; -#endif /* CONFIG_CFS_BANDWIDTH */ +# endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; @@ -1121,28 +1120,50 @@ DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); * acquire operations must be ordered by ascending &runqueue. 
*/ struct rq { - /* runqueue lock: */ - raw_spinlock_t __lock; - - /* Per class runqueue modification mask; bits in class order. */ - unsigned int queue_mask; + /* + * The following members are loaded together, without holding the + * rq->lock, in an extremely hot loop in update_sg_lb_stats() + * (called from pick_next_task()). To reduce cache pollution from + * this operation, they are placed together on this dedicated cache + * line. Even though some of them are frequently modified, they are + * loaded much more frequently than they are stored. + */ unsigned int nr_running; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; - unsigned int numa_migrate_on; #endif + unsigned int ttwu_pending; + unsigned long cpu_capacity; +#ifdef CONFIG_SCHED_PROXY_EXEC + struct task_struct __rcu *donor; /* Scheduling context */ + struct task_struct __rcu *curr; /* Execution context */ +#else + union { + struct task_struct __rcu *donor; /* Scheduler context */ + struct task_struct __rcu *curr; /* Execution context */ + }; +#endif + struct task_struct *idle; + /* padding left here deliberately */ + + /* + * The next cacheline holds the (hot) runqueue lock, as well as + * some other less performance-critical fields. + */ + u64 nr_switches ____cacheline_aligned; + + /* runqueue lock: */ + raw_spinlock_t __lock; + #ifdef CONFIG_NO_HZ_COMMON - unsigned long last_blocked_load_update_tick; - unsigned int has_blocked_load; - call_single_data_t nohz_csd; unsigned int nohz_tick_stopped; atomic_t nohz_flags; + unsigned int has_blocked_load; + unsigned long last_blocked_load_update_tick; + call_single_data_t nohz_csd; #endif /* CONFIG_NO_HZ_COMMON */ - unsigned int ttwu_pending; - u64 nr_switches; - #ifdef CONFIG_UCLAMP_TASK /* Utilization clamp values based on CPU's RUNNABLE tasks */ struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned; @@ -1155,6 +1176,7 @@ struct rq { struct dl_rq dl; #ifdef CONFIG_SCHED_CLASS_EXT struct scx_rq scx; + struct sched_dl_entity ext_server; #endif struct sched_dl_entity fair_server; @@ -1165,6 +1187,9 @@ struct rq { struct list_head *tmp_alone_branch; #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_NUMA_BALANCING + unsigned int numa_migrate_on; +#endif /* * This is part of a global counter where only the total sum * over all CPUs matters. A task can increase this counter on @@ -1173,36 +1198,29 @@ struct rq { */ unsigned long nr_uninterruptible; -#ifdef CONFIG_SCHED_PROXY_EXEC - struct task_struct __rcu *donor; /* Scheduling context */ - struct task_struct __rcu *curr; /* Execution context */ -#else - union { - struct task_struct __rcu *donor; /* Scheduler context */ - struct task_struct __rcu *curr; /* Execution context */ - }; -#endif struct sched_dl_entity *dl_server; - struct task_struct *idle; struct task_struct *stop; + const struct sched_class *next_class; unsigned long next_balance; struct mm_struct *prev_mm; - unsigned int clock_update_flags; - u64 clock; - /* Ensure that all clocks are in the same cache line */ + /* + * The following fields of clock data are frequently referenced + * and updated together, and should go on their own cache line. 
+ */ u64 clock_task ____cacheline_aligned; u64 clock_pelt; + u64 clock; unsigned long lost_idle_time; + unsigned int clock_update_flags; u64 clock_pelt_idle; u64 clock_idle; + #ifndef CONFIG_64BIT u64 clock_pelt_idle_copy; u64 clock_idle_copy; #endif - atomic_t nr_iowait; - u64 last_seen_need_resched_ns; int ticks_without_resched; @@ -1213,8 +1231,6 @@ struct rq { struct root_domain *rd; struct sched_domain __rcu *sd; - unsigned long cpu_capacity; - struct balance_callback *balance_callback; unsigned char nohz_idle_balance; @@ -1324,7 +1340,9 @@ struct rq { call_single_data_t cfsb_csd; struct list_head cfsb_csd_list; #endif -}; + + atomic_t nr_iowait; +} __no_randomize_layout; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1698,6 +1716,7 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) #endif /* !CONFIG_FAIR_GROUP_SCHED */ +extern void update_rq_avg_idle(struct rq *rq); extern void update_rq_clock(struct rq *rq); /* @@ -2062,8 +2081,8 @@ queue_balance_callback(struct rq *rq, rq->balance_callback = head; } -#define rcu_dereference_check_sched_domain(p) \ - rcu_dereference_check((p), lockdep_is_held(&sched_domains_mutex)) +#define rcu_dereference_sched_domain(p) \ + rcu_dereference_all_check((p), lockdep_is_held(&sched_domains_mutex)) /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. @@ -2073,7 +2092,7 @@ queue_balance_callback(struct rq *rq, * preempt-disabled sections. */ #define for_each_domain(cpu, __sd) \ - for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ + for (__sd = rcu_dereference_sched_domain(cpu_rq(cpu)->sd); \ __sd; __sd = __sd->parent) /* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */ @@ -2481,15 +2500,6 @@ struct sched_class { #ifdef CONFIG_UCLAMP_TASK int uclamp_enabled; #endif - /* - * idle: 0 - * ext: 1 - * fair: 2 - * rt: 4 - * dl: 8 - * stop: 16 - */ - unsigned int queue_mask; /* * move_queued_task/activate_task/enqueue_task: rq->lock @@ -2648,20 +2658,6 @@ struct sched_class { #endif }; -/* - * Does not nest; only used around sched_class::pick_task() rq-lock-breaks. 
- */ -static inline void rq_modified_clear(struct rq *rq) -{ - rq->queue_mask = 0; -} - -static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class) -{ - unsigned int mask = class->queue_mask; - return rq->queue_mask & ~((mask << 1) - 1); -} - static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { WARN_ON_ONCE(rq->donor != prev); @@ -3397,11 +3393,11 @@ struct irqtime { }; DECLARE_PER_CPU(struct irqtime, cpu_irqtime); -extern int sched_clock_irqtime; +DECLARE_STATIC_KEY_FALSE(sched_clock_irqtime); static inline int irqtime_enabled(void) { - return sched_clock_irqtime; + return static_branch_likely(&sched_clock_irqtime); } /* @@ -4010,6 +4006,7 @@ void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_s deactivate_task(src_rq, task, 0); set_task_cpu(task, dst_rq->cpu); activate_task(dst_rq, task, 0); + wakeup_preempt(dst_rq, task, 0); } static inline @@ -4079,6 +4076,7 @@ extern void balance_callbacks(struct rq *rq, struct balance_callback *head); struct sched_change_ctx { u64 prio; struct task_struct *p; + const struct sched_class *class; int flags; bool queued; bool running; diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 4f9192be4b5b..f95798baddeb 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -97,9 +97,6 @@ static void update_curr_stop(struct rq *rq) * Simple, special scheduling class for the per-CPU stop tasks: */ DEFINE_SCHED_CLASS(stop) = { - - .queue_mask = 16, - .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, .yield_task = yield_task_stop, diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index cf643a5ddedd..ac268da91778 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -508,6 +508,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) if (rq->fair_server.dl_server) __dl_server_attach_root(&rq->fair_server, rq); +#ifdef CONFIG_SCHED_CLASS_EXT + if (rq->ext_server.dl_server) + __dl_server_attach_root(&rq->ext_server, rq); +#endif + rq_unlock_irqrestore(rq, &rf); if (old_rd) diff --git a/kernel/sys.c b/kernel/sys.c index 8b58eece4e58..af71987df81c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -53,6 +53,7 @@ #include <linux/time_namespace.h> #include <linux/binfmts.h> #include <linux/futex.h> +#include <linux/rseq.h> #include <linux/sched.h> #include <linux/sched/autogroup.h> @@ -2868,6 +2869,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_FUTEX_HASH: error = futex_hash_prctl(arg2, arg3, arg4); break; + case PR_RSEQ_SLICE_EXTENSION: + if (arg4 || arg5) + return -EINVAL; + error = rseq_slice_extension_prctl(arg2, arg3); + break; default: trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5); error = -EINVAL; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index bf5d05c635ff..add3032da16f 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -390,6 +390,7 @@ COND_SYSCALL(setuid16); /* restartable sequence */ COND_SYSCALL(rseq); +COND_SYSCALL(rseq_slice_yield); COND_SYSCALL(uretprobe); COND_SYSCALL(uprobe); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 1caf02a72ba8..59d22f1bd0a8 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1742,7 +1742,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, lockdep_assert_held(&cpu_base->lock); - debug_deactivate(timer); + debug_hrtimer_deactivate(timer); base->running = timer; /* |
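
The fair.c hunks above convert the open-coded (s64)(a - b) vruntime casts to vruntime_cmp()/vruntime_op() helpers whose definitions are not visible in this section. Below is a minimal stand-alone sketch of the wrap-safe semantics those casts encode; the names, stdint types and helper structure are illustrative assumptions, not the patch's actual helpers.

#include <stdint.h>
#include <stdbool.h>

/*
 * Illustrative only: virtual runtimes are free-running 64-bit counters
 * that may wrap, so ordering must be decided on the signed difference,
 * never by comparing the raw values directly.
 */
static inline int64_t vruntime_delta(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b);	/* well defined modulo 2^64 */
}

static inline bool vruntime_before(uint64_t a, uint64_t b)
{
	return vruntime_delta(a, b) < 0;
}

/* e.g. the protect_slice() change above is equivalent to: */
static inline bool slice_is_protected(uint64_t vruntime, uint64_t vprot)
{
	return vruntime_before(vruntime, vprot);
}

The entity_eligible(), update_deadline() and protect_slice() sites in the hunks above all reduce to this signed-difference test; naming the operation keeps the wraparound intent visible at each call site.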
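
The repeated add_positive()/sub_positive() plus PELT_MIN_DIVIDER clamp sequences are folded into the __update_sa() macro in the fair.c hunks above. The following stand-alone C model shows the effect of one invocation on a util_avg/util_sum pair; the struct and function names are made up for illustration, and it deliberately ignores the READ_ONCE()/WRITE_ONCE() discipline the real add_positive() uses so lockless readers never observe intermediate negative values.

#include <stdint.h>

/* Stand-alone model of one __update_sa(&sa, util, d_avg, d_sum) step. */
struct sa_model {
	uint64_t	util_avg;
	uint32_t	util_sum;
};

static void sa_model_update(struct sa_model *sa, int64_t d_avg, int64_t d_sum,
			    uint32_t min_divider)
{
	int64_t avg = (int64_t)sa->util_avg + d_avg;
	int64_t sum = (int64_t)sa->util_sum + d_sum;

	/* add_positive(): a negative delta never drives the value below 0 */
	if (avg < 0)
		avg = 0;
	if (sum < 0)
		sum = 0;

	/*
	 * Rounding can leave the _sum one step short of what the _avg
	 * implies; keep util_sum at or above util_avg times the smallest
	 * possible PELT divider so the pair stays consistent.
	 */
	if (sum < avg * min_divider)
		sum = avg * min_divider;

	sa->util_avg = (uint64_t)avg;
	sa->util_sum = (uint32_t)sum;
}

The final clamp matters because, per the comment carried into the macro, detaching many entities between two PELT updates can otherwise leave util_sum at zero while util_avg is still non-zero.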