path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.preempt                 3
-rw-r--r--  kernel/entry/common.c                 27
-rw-r--r--  kernel/entry/common.h                  7
-rw-r--r--  kernel/entry/syscall-common.c         97
-rw-r--r--  kernel/entry/syscall_user_dispatch.c   4
-rw-r--r--  kernel/rseq.c                        365
-rw-r--r--  kernel/sched/clock.c                   3
-rw-r--r--  kernel/sched/core.c                   86
-rw-r--r--  kernel/sched/cpufreq_schedutil.c       2
-rw-r--r--  kernel/sched/cputime.c                 9
-rw-r--r--  kernel/sched/deadline.c              105
-rw-r--r--  kernel/sched/debug.c                 187
-rw-r--r--  kernel/sched/ext.c                    42
-rw-r--r--  kernel/sched/fair.c                  411
-rw-r--r--  kernel/sched/idle.c                    7
-rw-r--r--  kernel/sched/rt.c                     14
-rw-r--r--  kernel/sched/sched.h                 142
-rw-r--r--  kernel/sched/stop_task.c               3
-rw-r--r--  kernel/sched/topology.c                5
-rw-r--r--  kernel/sys.c                           6
-rw-r--r--  kernel/sys_ni.c                        1
-rw-r--r--  kernel/time/hrtimer.c                  2
22 files changed, 1036 insertions, 492 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index da326800c1c9..88c594c6d7fc 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -16,11 +16,13 @@ config ARCH_HAS_PREEMPT_LAZY
choice
prompt "Preemption Model"
+ default PREEMPT_LAZY if ARCH_HAS_PREEMPT_LAZY
default PREEMPT_NONE
config PREEMPT_NONE
bool "No Forced Preemption (Server)"
depends on !PREEMPT_RT
+ depends on ARCH_NO_PREEMPT
select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC
help
This is the traditional Linux preemption model, geared towards
@@ -35,6 +37,7 @@ config PREEMPT_NONE
config PREEMPT_VOLUNTARY
bool "Voluntary Kernel Preemption (Desktop)"
+ depends on !ARCH_HAS_PREEMPT_LAZY
depends on !ARCH_NO_PREEMPT
depends on !PREEMPT_RT
select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 5c792b30c58a..9ef63e414791 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -17,6 +17,27 @@ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK)
#endif
+/* TIF bits, which prevent a time slice extension. */
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * Since rseq slice ext has a direct correlation to the worst case
+ * scheduling latency (schedule is delayed after all), only have it affect
+ * LAZY reschedules on PREEMPT_RT for now.
+ *
+ * However, since this delay only applies to userspace, a value for
+ * rseq_slice_extension_nsec that is strictly less than the worst case
+ * kernel space preempt_disable() region should mean the scheduling latency
+ * is not affected, even for !LAZY.
+ *
+ * That said, since this value depends on the hardware at hand, it cannot
+ * be pre-determined in any sensible way. Hence punt on this problem for now.
+ */
+# define TIF_SLICE_EXT_SCHED (_TIF_NEED_RESCHED_LAZY)
+#else
+# define TIF_SLICE_EXT_SCHED (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
+#endif
+#define TIF_SLICE_EXT_DENY (EXIT_TO_USER_MODE_WORK & ~TIF_SLICE_EXT_SCHED)
+
static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs,
unsigned long ti_work)
{
@@ -28,8 +49,10 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re
local_irq_enable_exit_to_user(ti_work);
- if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
- schedule();
+ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
+ if (!rseq_grant_slice_extension(ti_work & TIF_SLICE_EXT_DENY))
+ schedule();
+ }
if (ti_work & _TIF_UPROBE)
uprobe_notify_resume(regs);
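Only a pure (lazy) reschedule request leaves a grant possible; any other pending exit work lands in TIF_SLICE_EXT_DENY and forces a normal schedule(). A minimal stand-alone sketch of that mask logic, using made-up bit values in place of the real _TIF_* constants (the values below are illustrative only, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

/* Made-up stand-ins for the real _TIF_* bits. */
#define TIF_NEED_RESCHED	0x01
#define TIF_NEED_RESCHED_LAZY	0x02
#define TIF_SIGPENDING		0x04
#define TIF_UPROBE		0x08

#define EXIT_TO_USER_MODE_WORK	(TIF_NEED_RESCHED | TIF_NEED_RESCHED_LAZY | \
				 TIF_SIGPENDING | TIF_UPROBE)
/* Non-RT variant: both reschedule flavours may be deferred by a grant. */
#define TIF_SLICE_EXT_SCHED	(TIF_NEED_RESCHED | TIF_NEED_RESCHED_LAZY)
#define TIF_SLICE_EXT_DENY	(EXIT_TO_USER_MODE_WORK & ~TIF_SLICE_EXT_SCHED)

static bool may_grant(unsigned long ti_work)
{
	/* Only a pure (lazy) resched request is eligible for an extension. */
	return !(ti_work & TIF_SLICE_EXT_DENY);
}

int main(void)
{
	printf("resched only:   %d\n", may_grant(TIF_NEED_RESCHED_LAZY));		/* 1 */
	printf("resched+signal: %d\n", may_grant(TIF_NEED_RESCHED | TIF_SIGPENDING));	/* 0 */
	return 0;
}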
diff --git a/kernel/entry/common.h b/kernel/entry/common.h
deleted file mode 100644
index f6e6d02f07fe..000000000000
--- a/kernel/entry/common.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _COMMON_H
-#define _COMMON_H
-
-bool syscall_user_dispatch(struct pt_regs *regs);
-
-#endif
diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c
index 940a597ded40..cd4967a9c53e 100644
--- a/kernel/entry/syscall-common.c
+++ b/kernel/entry/syscall-common.c
@@ -1,104 +1,23 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/audit.h>
#include <linux/entry-common.h>
-#include "common.h"
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
-static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
-{
- if (unlikely(audit_context())) {
- unsigned long args[6];
-
- syscall_get_arguments(current, regs, args);
- audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
- }
-}
+/* Out of line to prevent tracepoint code duplication */
-long syscall_trace_enter(struct pt_regs *regs, long syscall,
- unsigned long work)
+long trace_syscall_enter(struct pt_regs *regs, long syscall)
{
- long ret = 0;
-
+ trace_sys_enter(regs, syscall);
/*
- * Handle Syscall User Dispatch. This must comes first, since
- * the ABI here can be something that doesn't make sense for
- * other syscall_work features.
+ * Probes or BPF hooks in the tracepoint may have changed the
+ * system call number. Reread it.
*/
- if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
- if (syscall_user_dispatch(regs))
- return -1L;
- }
-
- /* Handle ptrace */
- if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
- ret = ptrace_report_syscall_entry(regs);
- if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
- return -1L;
- }
-
- /* Do seccomp after ptrace, to catch any tracer changes. */
- if (work & SYSCALL_WORK_SECCOMP) {
- ret = __secure_computing();
- if (ret == -1L)
- return ret;
- }
-
- /* Either of the above might have changed the syscall number */
- syscall = syscall_get_nr(current, regs);
-
- if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) {
- trace_sys_enter(regs, syscall);
- /*
- * Probes or BPF hooks in the tracepoint may have changed the
- * system call number as well.
- */
- syscall = syscall_get_nr(current, regs);
- }
-
- syscall_enter_audit(regs, syscall);
-
- return ret ? : syscall;
+ return syscall_get_nr(current, regs);
}
-/*
- * If SYSCALL_EMU is set, then the only reason to report is when
- * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
- * instruction has been already reported in syscall_enter_from_user_mode().
- */
-static inline bool report_single_step(unsigned long work)
+void trace_syscall_exit(struct pt_regs *regs, long ret)
{
- if (work & SYSCALL_WORK_SYSCALL_EMU)
- return false;
-
- return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
-}
-
-void syscall_exit_work(struct pt_regs *regs, unsigned long work)
-{
- bool step;
-
- /*
- * If the syscall was rolled back due to syscall user dispatching,
- * then the tracers below are not invoked for the same reason as
- * the entry side was not invoked in syscall_trace_enter(): The ABI
- * of these syscalls is unknown.
- */
- if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
- if (unlikely(current->syscall_dispatch.on_dispatch)) {
- current->syscall_dispatch.on_dispatch = false;
- return;
- }
- }
-
- audit_syscall_exit(regs);
-
- if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
- trace_sys_exit(regs, syscall_get_return_value(current, regs));
-
- step = report_single_step(work);
- if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
- ptrace_report_syscall_exit(regs, step);
+ trace_sys_exit(regs, ret);
}
diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c
index a9055eccb27e..d89dffcc2d64 100644
--- a/kernel/entry/syscall_user_dispatch.c
+++ b/kernel/entry/syscall_user_dispatch.c
@@ -2,6 +2,8 @@
/*
* Copyright (C) 2020 Collabora Ltd.
*/
+
+#include <linux/entry-common.h>
#include <linux/sched.h>
#include <linux/prctl.h>
#include <linux/ptrace.h>
@@ -15,8 +17,6 @@
#include <asm/syscall.h>
-#include "common.h"
-
static void trigger_sigsys(struct pt_regs *regs)
{
struct kernel_siginfo info;
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 395d8b002350..b0973d19f366 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -71,6 +71,9 @@
#define RSEQ_BUILD_SLOW_PATH
#include <linux/debugfs.h>
+#include <linux/hrtimer.h>
+#include <linux/percpu.h>
+#include <linux/prctl.h>
#include <linux/ratelimit.h>
#include <linux/rseq_entry.h>
#include <linux/sched.h>
@@ -120,7 +123,6 @@ void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
}
#endif /* CONFIG_TRACEPOINTS */
-#ifdef CONFIG_DEBUG_FS
#ifdef CONFIG_RSEQ_STATS
DEFINE_PER_CPU(struct rseq_stats, rseq_stats);
@@ -138,6 +140,13 @@ static int rseq_stats_show(struct seq_file *m, void *p)
stats.cs += data_race(per_cpu(rseq_stats.cs, cpu));
stats.clear += data_race(per_cpu(rseq_stats.clear, cpu));
stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu));
+ if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
+ stats.s_granted += data_race(per_cpu(rseq_stats.s_granted, cpu));
+ stats.s_expired += data_race(per_cpu(rseq_stats.s_expired, cpu));
+ stats.s_revoked += data_race(per_cpu(rseq_stats.s_revoked, cpu));
+ stats.s_yielded += data_race(per_cpu(rseq_stats.s_yielded, cpu));
+ stats.s_aborted += data_race(per_cpu(rseq_stats.s_aborted, cpu));
+ }
}
seq_printf(m, "exit: %16lu\n", stats.exit);
@@ -148,6 +157,13 @@ static int rseq_stats_show(struct seq_file *m, void *p)
seq_printf(m, "cs: %16lu\n", stats.cs);
seq_printf(m, "clear: %16lu\n", stats.clear);
seq_printf(m, "fixup: %16lu\n", stats.fixup);
+ if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
+ seq_printf(m, "sgrant: %16lu\n", stats.s_granted);
+ seq_printf(m, "sexpir: %16lu\n", stats.s_expired);
+ seq_printf(m, "srevok: %16lu\n", stats.s_revoked);
+ seq_printf(m, "syield: %16lu\n", stats.s_yielded);
+ seq_printf(m, "sabort: %16lu\n", stats.s_aborted);
+ }
return 0;
}
@@ -205,16 +221,19 @@ static const struct file_operations debug_ops = {
.release = single_release,
};
+static void rseq_slice_ext_init(struct dentry *root_dir);
+
static int __init rseq_debugfs_init(void)
{
struct dentry *root_dir = debugfs_create_dir("rseq", NULL);
debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops);
rseq_stats_init(root_dir);
+ if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION))
+ rseq_slice_ext_init(root_dir);
return 0;
}
__initcall(rseq_debugfs_init);
-#endif /* CONFIG_DEBUG_FS */
static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id)
{
@@ -389,6 +408,8 @@ static bool rseq_reset_ids(void)
*/
SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
{
+ u32 rseqfl = 0;
+
if (flags & RSEQ_FLAG_UNREGISTER) {
if (flags & ~RSEQ_FLAG_UNREGISTER)
return -EINVAL;
@@ -405,7 +426,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
return 0;
}
- if (unlikely(flags))
+ if (unlikely(flags & ~(RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)))
return -EINVAL;
if (current->rseq.usrptr) {
@@ -440,6 +461,13 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
if (!access_ok(rseq, rseq_len))
return -EFAULT;
+ if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
+ rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
+ if (rseq_slice_extension_enabled() &&
+ (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON))
+ rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
+ }
+
scoped_user_write_access(rseq, efault) {
/*
* If the rseq_cs pointer is non-NULL on registration, clear it to
@@ -449,11 +477,13 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
* clearing the fields. Don't bother reading it, just reset it.
*/
unsafe_put_user(0UL, &rseq->rseq_cs, efault);
+ unsafe_put_user(rseqfl, &rseq->flags, efault);
/* Initialize IDs in user space */
unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault);
unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
unsafe_put_user(0U, &rseq->node_id, efault);
unsafe_put_user(0U, &rseq->mm_cid, efault);
+ unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
}
/*
@@ -464,6 +494,10 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
current->rseq.len = rseq_len;
current->rseq.sig = sig;
+#ifdef CONFIG_RSEQ_SLICE_EXTENSION
+ current->rseq.slice.state.enabled = !!(rseqfl & RSEQ_CS_FLAG_SLICE_EXT_ENABLED);
+#endif
+
/*
* If rseq was previously inactive, and has just been
* registered, ensure the cpu_id_start and cpu_id fields
@@ -476,3 +510,328 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
efault:
return -EFAULT;
}
+
+#ifdef CONFIG_RSEQ_SLICE_EXTENSION
+struct slice_timer {
+ struct hrtimer timer;
+ void *cookie;
+};
+
+static const unsigned int rseq_slice_ext_nsecs_min = 5 * NSEC_PER_USEC;
+static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC;
+unsigned int rseq_slice_ext_nsecs __read_mostly = rseq_slice_ext_nsecs_min;
+static DEFINE_PER_CPU(struct slice_timer, slice_timer);
+DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key);
+
+/*
+ * When the timer expires and the task is still in user space, the return
+ * from interrupt will revoke the grant and schedule. If the task already
+ * entered the kernel via a syscall and the timer fires before the syscall
+ * work was able to cancel it, then depending on the preemption model this
+ * will either reschedule on return from interrupt or in the syscall work
+ * below.
+ */
+static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr)
+{
+ struct slice_timer *st = container_of(tmr, struct slice_timer, timer);
+
+ /*
+ * Validate that the task which armed the timer is still on the
+ * CPU. It could have been scheduled out without canceling the
+ * timer.
+ */
+ if (st->cookie == current && current->rseq.slice.state.granted) {
+ rseq_stat_inc(rseq_stats.s_expired);
+ set_need_resched_current();
+ }
+ return HRTIMER_NORESTART;
+}
+
+bool __rseq_arm_slice_extension_timer(void)
+{
+ struct slice_timer *st = this_cpu_ptr(&slice_timer);
+ struct task_struct *curr = current;
+
+ lockdep_assert_irqs_disabled();
+
+ /*
+ * This check prevents a task, which got a time slice extension
+ * granted, from exceeding the maximum scheduling latency when the
+ * grant expired before going out to user space. Don't bother to
+ * clear the grant here, it will be cleaned up automatically before
+ * going out to user space after being scheduled back in.
+ */
+	if (unlikely(curr->rseq.slice.expires < ktime_get_mono_fast_ns())) {
+ set_need_resched_current();
+ return true;
+ }
+
+ /*
+ * Store the task pointer as a cookie for comparison in the timer
+ * function. This is safe as the timer is CPU local and cannot be
+ * in the expiry function at this point.
+ */
+ st->cookie = curr;
+ hrtimer_start(&st->timer, curr->rseq.slice.expires, HRTIMER_MODE_ABS_PINNED_HARD);
+ /* Arm the syscall entry work */
+ set_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);
+ return false;
+}
+
+static void rseq_cancel_slice_extension_timer(void)
+{
+ struct slice_timer *st = this_cpu_ptr(&slice_timer);
+
+ /*
+ * st->cookie can be safely read as preemption is disabled and the
+ * timer is CPU local.
+ *
+ * As this is most probably the first expiring timer, the cancel is
+ * expensive as it has to reprogram the hardware, but that's less
+ * expensive than going through a full hrtimer_interrupt() cycle
+ * for nothing.
+ *
+ * hrtimer_try_to_cancel() is sufficient here as the timer is CPU
+ * local and once the hrtimer code disabled interrupts the timer
+ * callback cannot be running.
+ */
+ if (st->cookie == current)
+ hrtimer_try_to_cancel(&st->timer);
+}
+
+static inline void rseq_slice_set_need_resched(struct task_struct *curr)
+{
+ /*
+ * The interrupt guard is required to prevent inconsistent state in
+ * this case:
+ *
+ * set_tsk_need_resched()
+ * --> Interrupt
+ * wakeup()
+ * set_tsk_need_resched()
+ * set_preempt_need_resched()
+ * schedule_on_return()
+ * clear_tsk_need_resched()
+ * clear_preempt_need_resched()
+ * set_preempt_need_resched() <- Inconsistent state
+ *
+ * This is safe vs. a remote set of TIF_NEED_RESCHED because that
+ * only sets the already set bit and does not create inconsistent
+ * state.
+ */
+ scoped_guard(irq)
+ set_need_resched_current();
+}
+
+static void rseq_slice_validate_ctrl(u32 expected)
+{
+ u32 __user *sctrl = &current->rseq.usrptr->slice_ctrl.all;
+ u32 uval;
+
+ if (get_user(uval, sctrl) || uval != expected)
+ force_sig(SIGSEGV);
+}
+
+/*
+ * Invoked from syscall entry if a time slice extension was granted and the
+ * kernel did not clear it before user space left the critical section.
+ *
+ * While the recommended way to relinquish the CPU free of side effects is
+ * rseq_slice_yield(2), any syscall within a granted slice terminates the
+ * grant and immediately reschedules if required. This supports onion layer
+ * applications, where the code requesting the grant cannot control the
+ * code within the critical section.
+ */
+void rseq_syscall_enter_work(long syscall)
+{
+ struct task_struct *curr = current;
+ struct rseq_slice_ctrl ctrl = { .granted = curr->rseq.slice.state.granted };
+
+ clear_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);
+
+ if (static_branch_unlikely(&rseq_debug_enabled))
+ rseq_slice_validate_ctrl(ctrl.all);
+
+ /*
+ * The kernel might have raced, revoked the grant and updated
+ * userspace, but kept the SLICE work set.
+ */
+ if (!ctrl.granted)
+ return;
+
+ /*
+ * Required to stabilize the per CPU timer pointer and to make
+ * set_tsk_need_resched() correct on PREEMPT[RT] kernels.
+ *
+ * Leaving the scope will reschedule on preemption models FULL,
+ * LAZY and RT if necessary.
+ */
+ scoped_guard(preempt) {
+ rseq_cancel_slice_extension_timer();
+ /*
+ * Now that preemption is disabled, quickly check whether
+ * the task was already rescheduled before arriving here.
+ */
+ if (!curr->rseq.event.sched_switch) {
+ rseq_slice_set_need_resched(curr);
+
+ if (syscall == __NR_rseq_slice_yield) {
+ rseq_stat_inc(rseq_stats.s_yielded);
+ /* Update the yielded state for syscall return */
+ curr->rseq.slice.yielded = 1;
+ } else {
+ rseq_stat_inc(rseq_stats.s_aborted);
+ }
+ }
+ }
+ /* Reschedule on NONE/VOLUNTARY preemption models */
+ cond_resched();
+
+ /* Clear the grant in kernel state and user space */
+ curr->rseq.slice.state.granted = false;
+ if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all))
+ force_sig(SIGSEGV);
+}
+
+int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
+{
+ switch (arg2) {
+ case PR_RSEQ_SLICE_EXTENSION_GET:
+ if (arg3)
+ return -EINVAL;
+ return current->rseq.slice.state.enabled ? PR_RSEQ_SLICE_EXT_ENABLE : 0;
+
+ case PR_RSEQ_SLICE_EXTENSION_SET: {
+ u32 rflags, valid = RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
+ bool enable = !!(arg3 & PR_RSEQ_SLICE_EXT_ENABLE);
+
+ if (arg3 & ~PR_RSEQ_SLICE_EXT_ENABLE)
+ return -EINVAL;
+ if (!rseq_slice_extension_enabled())
+ return -ENOTSUPP;
+ if (!current->rseq.usrptr)
+ return -ENXIO;
+
+ /* No change? */
+ if (enable == !!current->rseq.slice.state.enabled)
+ return 0;
+
+ if (get_user(rflags, &current->rseq.usrptr->flags))
+ goto die;
+
+ if (current->rseq.slice.state.enabled)
+ valid |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
+
+ if ((rflags & valid) != valid)
+ goto die;
+
+ rflags &= ~RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
+ rflags |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
+ if (enable)
+ rflags |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
+
+ if (put_user(rflags, &current->rseq.usrptr->flags))
+ goto die;
+
+ current->rseq.slice.state.enabled = enable;
+ return 0;
+ }
+ default:
+ return -EINVAL;
+ }
+die:
+ force_sig(SIGSEGV);
+ return -EFAULT;
+}
+
+/**
+ * sys_rseq_slice_yield - yield the current processor, free of side effects, if
+ * a task that was granted a time slice extension finishes
+ * its critical work before being forced out.
+ *
+ * Return: 1 if the task successfully yielded the CPU within the granted slice.
+ * 0 if the slice extension was never granted, or was revoked because the task
+ * ran past the granted extension, issued a syscall other than this one,
+ * or was scheduled out earlier due to a subsequent interrupt.
+ *
+ * The syscall does not schedule because the syscall entry work immediately
+ * relinquishes the CPU and schedules if required.
+ */
+SYSCALL_DEFINE0(rseq_slice_yield)
+{
+ int yielded = !!current->rseq.slice.yielded;
+
+ current->rseq.slice.yielded = 0;
+ return yielded;
+}
+
+static int rseq_slice_ext_show(struct seq_file *m, void *p)
+{
+ seq_printf(m, "%d\n", rseq_slice_ext_nsecs);
+ return 0;
+}
+
+static ssize_t rseq_slice_ext_write(struct file *file, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ unsigned int nsecs;
+
+ if (kstrtouint_from_user(ubuf, count, 10, &nsecs))
+ return -EINVAL;
+
+ if (nsecs < rseq_slice_ext_nsecs_min)
+ return -ERANGE;
+
+ if (nsecs > rseq_slice_ext_nsecs_max)
+ return -ERANGE;
+
+ rseq_slice_ext_nsecs = nsecs;
+
+ return count;
+}
+
+static int rseq_slice_ext_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rseq_slice_ext_show, inode->i_private);
+}
+
+static const struct file_operations slice_ext_ops = {
+ .open = rseq_slice_ext_open,
+ .read = seq_read,
+ .write = rseq_slice_ext_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void rseq_slice_ext_init(struct dentry *root_dir)
+{
+ debugfs_create_file("slice_ext_nsec", 0644, root_dir, NULL, &slice_ext_ops);
+}
+
+static int __init rseq_slice_cmdline(char *str)
+{
+ bool on;
+
+ if (kstrtobool(str, &on))
+ return 0;
+
+ if (!on)
+ static_branch_disable(&rseq_slice_extension_key);
+ return 1;
+}
+__setup("rseq_slice_ext=", rseq_slice_cmdline);
+
+static int __init rseq_slice_init(void)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ hrtimer_setup(per_cpu_ptr(&slice_timer.timer, cpu), rseq_slice_expired,
+ CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD);
+ }
+ return 0;
+}
+device_initcall(rseq_slice_init);
+#else
+static void rseq_slice_ext_init(struct dentry *root_dir) { }
+#endif /* CONFIG_RSEQ_SLICE_EXTENSION */
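How user space would opt in and yield, as far as this patch shows: extension support is enabled either at rseq registration time (RSEQ_FLAG_SLICE_EXT_DEFAULT_ON) or later via prctl(), and rseq_slice_yield() reports whether the task actually yielded inside a granted slice. A minimal user-space sketch; the outer prctl option (called PR_RSEQ_SLICE_EXTENSION here) routes to rseq_slice_extension_prctl() but is defined outside this hunk, so its name is an assumption, and the request-side slice_ctrl protocol is not visible here and is left out:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Assumes uapi headers exporting PR_RSEQ_SLICE_EXTENSION (assumed name),
 * PR_RSEQ_SLICE_EXTENSION_{GET,SET}, PR_RSEQ_SLICE_EXT_ENABLE and
 * __NR_rseq_slice_yield. rseq itself is assumed to be registered already
 * (e.g. by glibc).
 */
int main(void)
{
	if (prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_GET, 0, 0, 0) !=
	    PR_RSEQ_SLICE_EXT_ENABLE) {
		if (prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET,
			  PR_RSEQ_SLICE_EXT_ENABLE, 0, 0))
			perror("enable slice extension");
	}

	/*
	 * ... critical section runs here. If the kernel granted a slice
	 * extension and the work is done, relinquish the CPU free of side
	 * effects; the return value says whether the grant was still active.
	 */
	long yielded = syscall(__NR_rseq_slice_yield);

	printf("yielded within granted slice: %ld\n", yielded);
	return 0;
}

The grant length can be tuned between 5 and 50 microseconds through the new debugfs file (rseq/slice_ext_nsec), and booting with rseq_slice_ext=off disables the feature via the static key.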
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index f5e6dd6a6b3a..2ae4fbf13431 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -173,6 +173,7 @@ notrace static void __sched_clock_work(struct work_struct *work)
scd->tick_gtod, __gtod_offset,
scd->tick_raw, __sched_clock_offset);
+ disable_sched_clock_irqtime();
static_branch_disable(&__sched_clock_stable);
}
@@ -238,6 +239,8 @@ static int __init sched_clock_init_late(void)
if (__sched_clock_stable_early)
__set_sched_clock_stable();
+ else
+ disable_sched_clock_irqtime(); /* disable if clock unstable. */
return 0;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d78d7b6d30bb..23406f037dde 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -119,6 +119,9 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_entry_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_exit_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_set_need_resched_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
@@ -1141,6 +1144,7 @@ void __trace_set_need_resched(struct task_struct *curr, int tif)
{
trace_sched_set_need_resched_tp(curr, smp_processor_id(), tif);
}
+EXPORT_SYMBOL_GPL(__trace_set_need_resched);
void resched_curr(struct rq *rq)
{
@@ -2095,7 +2099,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
*/
uclamp_rq_inc(rq, p, flags);
- rq->queue_mask |= p->sched_class->queue_mask;
p->sched_class->enqueue_task(rq, p, flags);
psi_enqueue(p, flags);
@@ -2128,7 +2131,6 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
* and mark the task ->sched_delayed.
*/
uclamp_rq_dec(rq, p);
- rq->queue_mask |= p->sched_class->queue_mask;
return p->sched_class->dequeue_task(rq, p, flags);
}
@@ -2179,10 +2181,14 @@ void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *donor = rq->donor;
- if (p->sched_class == donor->sched_class)
- donor->sched_class->wakeup_preempt(rq, p, flags);
- else if (sched_class_above(p->sched_class, donor->sched_class))
+ if (p->sched_class == rq->next_class) {
+ rq->next_class->wakeup_preempt(rq, p, flags);
+
+ } else if (sched_class_above(p->sched_class, rq->next_class)) {
+ rq->next_class->wakeup_preempt(rq, p, flags);
resched_curr(rq);
+ rq->next_class = p->sched_class;
+ }
/*
* A queue event has occurred, and we're going to schedule. In
@@ -3620,6 +3626,18 @@ static inline void ttwu_do_wakeup(struct task_struct *p)
trace_sched_wakeup(p);
}
+void update_rq_avg_idle(struct rq *rq)
+{
+ u64 delta = rq_clock(rq) - rq->idle_stamp;
+ u64 max = 2*rq->max_idle_balance_cost;
+
+ update_avg(&rq->avg_idle, delta);
+
+ if (rq->avg_idle > max)
+ rq->avg_idle = max;
+ rq->idle_stamp = 0;
+}
+
static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
struct rq_flags *rf)
@@ -3655,18 +3673,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
p->sched_class->task_woken(rq, p);
rq_repin_lock(rq, rf);
}
-
- if (rq->idle_stamp) {
- u64 delta = rq_clock(rq) - rq->idle_stamp;
- u64 max = 2*rq->max_idle_balance_cost;
-
- update_avg(&rq->avg_idle, delta);
-
- if (rq->avg_idle > max)
- rq->avg_idle = max;
-
- rq->idle_stamp = 0;
- }
}
/*
@@ -6836,6 +6842,7 @@ static void __sched notrace __schedule(int sched_mode)
pick_again:
next = pick_next_task(rq, rq->donor, &rf);
rq_set_donor(rq, next);
+ rq->next_class = next->sched_class;
if (unlikely(task_is_blocked(next))) {
next = find_proxy_task(rq, next, &rf);
if (!next)
@@ -7580,7 +7587,7 @@ int preempt_dynamic_mode = preempt_dynamic_undefined;
int sched_dynamic_mode(const char *str)
{
-# ifndef CONFIG_PREEMPT_RT
+# if !(defined(CONFIG_PREEMPT_RT) || defined(CONFIG_ARCH_HAS_PREEMPT_LAZY))
if (!strcmp(str, "none"))
return preempt_dynamic_none;
@@ -8512,6 +8519,9 @@ int sched_cpu_dying(unsigned int cpu)
dump_rq_tasks(rq, KERN_WARNING);
}
dl_server_stop(&rq->fair_server);
+#ifdef CONFIG_SCHED_CLASS_EXT
+ dl_server_stop(&rq->ext_server);
+#endif
rq_unlock_irqrestore(rq, &rf);
calc_load_migrate(rq);
@@ -8687,6 +8697,8 @@ void __init sched_init(void)
rq->rt.rt_runtime = global_rt_runtime();
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
+ rq->next_class = &idle_sched_class;
+
rq->sd = NULL;
rq->rd = NULL;
rq->cpu_capacity = SCHED_CAPACITY_SCALE;
@@ -8715,6 +8727,9 @@ void __init sched_init(void)
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
fair_server_init(rq);
+#ifdef CONFIG_SCHED_CLASS_EXT
+ ext_server_init(rq);
+#endif
#ifdef CONFIG_SCHED_CORE
rq->core = rq;
@@ -9146,6 +9161,7 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
bool resched = false;
+ bool queued = false;
struct rq *rq;
CLASS(task_rq_lock, rq_guard)(tsk);
@@ -9157,10 +9173,13 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup)
scx_cgroup_move_task(tsk);
if (scope->running)
resched = true;
+ queued = scope->queued;
}
if (resched)
resched_curr(rq);
+ else if (queued)
+ wakeup_preempt(rq, tsk, 0);
__balance_callbacks(rq, &rq_guard.rf);
}
@@ -10883,13 +10902,12 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int
flags |= DEQUEUE_NOCLOCK;
}
- if (flags & DEQUEUE_CLASS) {
- if (p->sched_class->switching_from)
- p->sched_class->switching_from(rq, p);
- }
+ if ((flags & DEQUEUE_CLASS) && p->sched_class->switching_from)
+ p->sched_class->switching_from(rq, p);
*ctx = (struct sched_change_ctx){
.p = p,
+ .class = p->sched_class,
.flags = flags,
.queued = task_on_rq_queued(p),
.running = task_current_donor(rq, p),
@@ -10920,6 +10938,11 @@ void sched_change_end(struct sched_change_ctx *ctx)
lockdep_assert_rq_held(rq);
+ /*
+ * Changing class without *QUEUE_CLASS is bad.
+ */
+ WARN_ON_ONCE(p->sched_class != ctx->class && !(ctx->flags & ENQUEUE_CLASS));
+
if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to)
p->sched_class->switching_to(rq, p);
@@ -10931,6 +10954,25 @@ void sched_change_end(struct sched_change_ctx *ctx)
if (ctx->flags & ENQUEUE_CLASS) {
if (p->sched_class->switched_to)
p->sched_class->switched_to(rq, p);
+
+ if (ctx->running) {
+ /*
+ * If this was a class promotion; let the old class
+ * know it got preempted. Note that none of the
+ * switch*_from() methods know the new class and none
+ * of the switch*_to() methods know the old class.
+ */
+ if (sched_class_above(p->sched_class, ctx->class)) {
+ rq->next_class->wakeup_preempt(rq, p, 0);
+ rq->next_class = p->sched_class;
+ }
+ /*
+ * If this was a degradation in class; make sure to
+ * reschedule.
+ */
+ if (sched_class_above(ctx->class, p->sched_class))
+ resched_curr(rq);
+ }
} else {
p->sched_class->prio_changed(rq, p, ctx->prio);
}
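ttwu_do_activate() no longer open-codes the idle-stamp accounting; update_rq_avg_idle() above keeps rq->avg_idle as a clamped running average of observed idle periods. A stand-alone model of that bookkeeping, assuming update_avg() (not part of this hunk) applies a 1/8 exponential moving average:

#include <stdint.h>
#include <stdio.h>

static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = (int64_t)(sample - *avg);

	*avg += diff / 8;	/* assumed smoothing factor */
}

int main(void)
{
	uint64_t avg_idle = 500000, max_idle_balance_cost = 500000;
	uint64_t idle_delta = 3000000;	/* ns the CPU just spent idle */

	update_avg(&avg_idle, idle_delta);
	if (avg_idle > 2 * max_idle_balance_cost)
		avg_idle = 2 * max_idle_balance_cost;	/* clamp as in the patch */

	printf("avg_idle=%llu\n", (unsigned long long)avg_idle);
	return 0;
}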
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 0ab5f9d4bc59..cfc40181f66e 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -682,7 +682,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
"sugov:%d",
cpumask_first(policy->related_cpus));
if (IS_ERR(thread)) {
- pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
+ pr_err("failed to create sugov thread: %pe\n", thread);
return PTR_ERR(thread);
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 4f97896887ec..ff0dfca95420 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -12,6 +12,8 @@
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime);
+
/*
* There are no locks covering percpu hardirq/softirq time.
* They are only modified in vtime_account, on corresponding CPU
@@ -25,16 +27,15 @@
*/
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
-int sched_clock_irqtime;
-
void enable_sched_clock_irqtime(void)
{
- sched_clock_irqtime = 1;
+ static_branch_enable(&sched_clock_irqtime);
}
void disable_sched_clock_irqtime(void)
{
- sched_clock_irqtime = 0;
+ if (irqtime_enabled())
+ static_branch_disable(&sched_clock_irqtime);
}
static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 7bcde7114f1b..d08b00429323 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1449,8 +1449,8 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
dl_se->dl_defer_idle = 0;
/*
- * The fair server can consume its runtime while throttled (not queued/
- * running as regular CFS).
+ * The DL server can consume its runtime while throttled (not
+ * queued / running as regular CFS).
*
* If the server consumes its entire runtime in this state. The server
* is not required for the current period. Thus, reset the server by
@@ -1535,10 +1535,10 @@ throttle:
}
/*
- * The fair server (sole dl_server) does not account for real-time
- * workload because it is running fair work.
+ * The dl_server does not account for real-time workload because it
+ * is running fair work.
*/
- if (dl_se == &rq->fair_server)
+ if (dl_se->dl_server)
return;
#ifdef CONFIG_RT_GROUP_SCHED
@@ -1573,9 +1573,9 @@ throttle:
* In the non-defer mode, the idle time is not accounted, as the
* server provides a guarantee.
*
- * If the dl_server is in defer mode, the idle time is also considered
- * as time available for the fair server, avoiding a penalty for the
- * rt scheduler that did not consumed that time.
+ * If the dl_server is in defer mode, the idle time is also considered as
+ * time available for the dl_server, avoiding a penalty for the rt
+ * scheduler that did not consume that time.
*/
void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec)
{
@@ -1799,7 +1799,7 @@ void dl_server_start(struct sched_dl_entity *dl_se)
struct rq *rq = dl_se->rq;
dl_se->dl_defer_idle = 0;
- if (!dl_server(dl_se) || dl_se->dl_server_active)
+ if (!dl_server(dl_se) || dl_se->dl_server_active || !dl_se->dl_runtime)
return;
/*
@@ -1860,6 +1860,18 @@ void sched_init_dl_servers(void)
dl_se->dl_server = 1;
dl_se->dl_defer = 1;
setup_new_dl_entity(dl_se);
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+ dl_se = &rq->ext_server;
+
+ WARN_ON(dl_server(dl_se));
+
+ dl_server_apply_params(dl_se, runtime, period, 1);
+
+ dl_se->dl_server = 1;
+ dl_se->dl_defer = 1;
+ setup_new_dl_entity(dl_se);
+#endif
}
}
@@ -1886,7 +1898,6 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio
int cpu = cpu_of(rq);
struct dl_bw *dl_b;
unsigned long cap;
- int retval = 0;
int cpus;
dl_b = dl_bw_of(cpu);
@@ -1918,7 +1929,7 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
- return retval;
+ return 0;
}
/*
@@ -2515,9 +2526,16 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
* Only called when both the current and waking task are -deadline
* tasks.
*/
-static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
- int flags)
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags)
{
+ /*
+ * Can only get preempted by stop-class, and those should be
+ * few and short lived, doesn't really make sense to push
+ * anything away for that.
+ */
+ if (p->sched_class != &dl_sched_class)
+ return;
+
if (dl_entity_preempt(&p->dl, &rq->donor->dl)) {
resched_curr(rq);
return;
@@ -3191,6 +3209,36 @@ void dl_add_task_root_domain(struct task_struct *p)
raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
}
+static void dl_server_add_bw(struct root_domain *rd, int cpu)
+{
+ struct sched_dl_entity *dl_se;
+
+ dl_se = &cpu_rq(cpu)->fair_server;
+ if (dl_server(dl_se) && cpu_active(cpu))
+ __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu));
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+ dl_se = &cpu_rq(cpu)->ext_server;
+ if (dl_server(dl_se) && cpu_active(cpu))
+ __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu));
+#endif
+}
+
+static u64 dl_server_read_bw(int cpu)
+{
+ u64 dl_bw = 0;
+
+ if (cpu_rq(cpu)->fair_server.dl_server)
+ dl_bw += cpu_rq(cpu)->fair_server.dl_bw;
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+ if (cpu_rq(cpu)->ext_server.dl_server)
+ dl_bw += cpu_rq(cpu)->ext_server.dl_bw;
+#endif
+
+ return dl_bw;
+}
+
void dl_clear_root_domain(struct root_domain *rd)
{
int i;
@@ -3209,12 +3257,8 @@ void dl_clear_root_domain(struct root_domain *rd)
* dl_servers are not tasks. Since dl_add_task_root_domain ignores
* them, we need to account for them here explicitly.
*/
- for_each_cpu(i, rd->span) {
- struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server;
-
- if (dl_server(dl_se) && cpu_active(i))
- __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(i));
- }
+ for_each_cpu(i, rd->span)
+ dl_server_add_bw(rd, i);
}
void dl_clear_root_domain_cpu(int cpu)
@@ -3360,9 +3404,6 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu)
#endif
DEFINE_SCHED_CLASS(dl) = {
-
- .queue_mask = 8,
-
.enqueue_task = enqueue_task_dl,
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,
@@ -3656,6 +3697,9 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se)
dl_se->dl_non_contending = 0;
dl_se->dl_overrun = 0;
dl_se->dl_server = 0;
+ dl_se->dl_defer = 0;
+ dl_se->dl_defer_running = 0;
+ dl_se->dl_defer_armed = 0;
#ifdef CONFIG_RT_MUTEXES
dl_se->pi_se = dl_se;
@@ -3713,7 +3757,7 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
unsigned long flags, cap;
struct dl_bw *dl_b;
bool overflow = 0;
- u64 fair_server_bw = 0;
+ u64 dl_server_bw = 0;
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
@@ -3746,27 +3790,26 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
cap -= arch_scale_cpu_capacity(cpu);
/*
- * cpu is going offline and NORMAL tasks will be moved away
- * from it. We can thus discount dl_server bandwidth
- * contribution as it won't need to be servicing tasks after
- * the cpu is off.
+ * cpu is going offline and NORMAL and EXT tasks will be
+ * moved away from it. We can thus discount dl_server
+ * bandwidth contribution as it won't need to be servicing
+ * tasks after the cpu is off.
*/
- if (cpu_rq(cpu)->fair_server.dl_server)
- fair_server_bw = cpu_rq(cpu)->fair_server.dl_bw;
+ dl_server_bw = dl_server_read_bw(cpu);
/*
* Not much to check if no DEADLINE bandwidth is present.
* dl_servers we can discount, as tasks will be moved out the
* offlined CPUs anyway.
*/
- if (dl_b->total_bw - fair_server_bw > 0) {
+ if (dl_b->total_bw - dl_server_bw > 0) {
/*
* Leaving at least one CPU for DEADLINE tasks seems a
* wise thing to do. As said above, cpu is not offline
* yet, so account for that.
*/
if (dl_bw_cpus(cpu) - 1)
- overflow = __dl_overflow(dl_b, cap, fair_server_bw, 0);
+ overflow = __dl_overflow(dl_b, cap, dl_server_bw, 0);
else
overflow = 1;
}
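With the ext server added, the hot-unplug path discounts both DL servers' bandwidth before deciding whether a real DEADLINE-bandwidth overflow check is needed. A toy model of that discount with made-up bandwidth values (the real check uses __dl_overflow() and capacity scaling, omitted here):

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	unsigned long long total_bw = 300000;		/* whole root domain, made up */
	unsigned long long fair_server_bw = 50000;	/* fair dl_server share */
	unsigned long long ext_server_bw = 50000;	/* ext dl_server share */
	unsigned long long dl_server_bw = fair_server_bw + ext_server_bw;

	/* Servers are discounted: their tasks move away with the CPU. */
	bool need_overflow_check = (total_bw - dl_server_bw) > 0;

	printf("overflow check needed: %s\n", need_overflow_check ? "yes" : "no");
	return 0;
}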
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 41caa22e0680..b24f40f05019 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -172,18 +172,12 @@ static const struct file_operations sched_feat_fops = {
static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- char buf[16];
unsigned int scaling;
+ int ret;
- if (cnt > 15)
- cnt = 15;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
- buf[cnt] = '\0';
-
- if (kstrtouint(buf, 10, &scaling))
- return -EINVAL;
+ ret = kstrtouint_from_user(ubuf, cnt, 10, &scaling);
+ if (ret)
+ return ret;
if (scaling >= SCHED_TUNABLESCALING_END)
return -EINVAL;
@@ -243,7 +237,7 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
static int sched_dynamic_show(struct seq_file *m, void *v)
{
- int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2;
+ int i = (IS_ENABLED(CONFIG_PREEMPT_RT) || IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY)) * 2;
int j;
/* Count entries in NULL terminated preempt_modes */
@@ -336,17 +330,19 @@ enum dl_param {
DL_PERIOD,
};
-static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */
-static unsigned long fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */
+static unsigned long dl_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */
+static unsigned long dl_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */
-static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos, enum dl_param param)
+static ssize_t sched_server_write_common(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos, enum dl_param param,
+ void *server)
{
long cpu = (long) ((struct seq_file *) filp->private_data)->private;
+ struct sched_dl_entity *dl_se = (struct sched_dl_entity *)server;
+ u64 old_runtime, runtime, period;
struct rq *rq = cpu_rq(cpu);
- u64 runtime, period;
+ int retval = 0;
size_t err;
- int retval;
u64 value;
err = kstrtoull_from_user(ubuf, cnt, 10, &value);
@@ -354,8 +350,8 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
return err;
scoped_guard (rq_lock_irqsave, rq) {
- runtime = rq->fair_server.dl_runtime;
- period = rq->fair_server.dl_period;
+ old_runtime = runtime = dl_se->dl_runtime;
+ period = dl_se->dl_period;
switch (param) {
case DL_RUNTIME:
@@ -371,60 +367,68 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
}
if (runtime > period ||
- period > fair_server_period_max ||
- period < fair_server_period_min) {
+ period > dl_server_period_max ||
+ period < dl_server_period_min) {
return -EINVAL;
}
update_rq_clock(rq);
- dl_server_stop(&rq->fair_server);
-
- retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0);
- if (retval)
- cnt = retval;
+ dl_server_stop(dl_se);
+ retval = dl_server_apply_params(dl_se, runtime, period, 0);
+ dl_server_start(dl_se);
- if (!runtime)
- printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n",
- cpu_of(rq));
+ if (retval < 0)
+ return retval;
+ }
- if (rq->cfs.h_nr_queued)
- dl_server_start(&rq->fair_server);
+ if (!!old_runtime ^ !!runtime) {
+ pr_info("%s server %sabled on CPU %d%s.\n",
+ server == &rq->fair_server ? "Fair" : "Ext",
+ runtime ? "en" : "dis",
+ cpu_of(rq),
+ runtime ? "" : ", system may malfunction due to starvation");
}
*ppos += cnt;
return cnt;
}
-static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param)
+static size_t sched_server_show_common(struct seq_file *m, void *v, enum dl_param param,
+ void *server)
{
- unsigned long cpu = (unsigned long) m->private;
- struct rq *rq = cpu_rq(cpu);
+ struct sched_dl_entity *dl_se = (struct sched_dl_entity *)server;
u64 value;
switch (param) {
case DL_RUNTIME:
- value = rq->fair_server.dl_runtime;
+ value = dl_se->dl_runtime;
break;
case DL_PERIOD:
- value = rq->fair_server.dl_period;
+ value = dl_se->dl_period;
break;
}
seq_printf(m, "%llu\n", value);
return 0;
-
}
static ssize_t
sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME);
+ long cpu = (long) ((struct seq_file *) filp->private_data)->private;
+ struct rq *rq = cpu_rq(cpu);
+
+ return sched_server_write_common(filp, ubuf, cnt, ppos, DL_RUNTIME,
+ &rq->fair_server);
}
static int sched_fair_server_runtime_show(struct seq_file *m, void *v)
{
- return sched_fair_server_show(m, v, DL_RUNTIME);
+ unsigned long cpu = (unsigned long) m->private;
+ struct rq *rq = cpu_rq(cpu);
+
+ return sched_server_show_common(m, v, DL_RUNTIME, &rq->fair_server);
}
static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp)
@@ -440,16 +444,57 @@ static const struct file_operations fair_server_runtime_fops = {
.release = single_release,
};
+#ifdef CONFIG_SCHED_CLASS_EXT
+static ssize_t
+sched_ext_server_runtime_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ long cpu = (long) ((struct seq_file *) filp->private_data)->private;
+ struct rq *rq = cpu_rq(cpu);
+
+ return sched_server_write_common(filp, ubuf, cnt, ppos, DL_RUNTIME,
+ &rq->ext_server);
+}
+
+static int sched_ext_server_runtime_show(struct seq_file *m, void *v)
+{
+ unsigned long cpu = (unsigned long) m->private;
+ struct rq *rq = cpu_rq(cpu);
+
+ return sched_server_show_common(m, v, DL_RUNTIME, &rq->ext_server);
+}
+
+static int sched_ext_server_runtime_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_ext_server_runtime_show, inode->i_private);
+}
+
+static const struct file_operations ext_server_runtime_fops = {
+ .open = sched_ext_server_runtime_open,
+ .write = sched_ext_server_runtime_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif /* CONFIG_SCHED_CLASS_EXT */
+
static ssize_t
sched_fair_server_period_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD);
+ long cpu = (long) ((struct seq_file *) filp->private_data)->private;
+ struct rq *rq = cpu_rq(cpu);
+
+ return sched_server_write_common(filp, ubuf, cnt, ppos, DL_PERIOD,
+ &rq->fair_server);
}
static int sched_fair_server_period_show(struct seq_file *m, void *v)
{
- return sched_fair_server_show(m, v, DL_PERIOD);
+ unsigned long cpu = (unsigned long) m->private;
+ struct rq *rq = cpu_rq(cpu);
+
+ return sched_server_show_common(m, v, DL_PERIOD, &rq->fair_server);
}
static int sched_fair_server_period_open(struct inode *inode, struct file *filp)
@@ -465,6 +510,40 @@ static const struct file_operations fair_server_period_fops = {
.release = single_release,
};
+#ifdef CONFIG_SCHED_CLASS_EXT
+static ssize_t
+sched_ext_server_period_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ long cpu = (long) ((struct seq_file *) filp->private_data)->private;
+ struct rq *rq = cpu_rq(cpu);
+
+ return sched_server_write_common(filp, ubuf, cnt, ppos, DL_PERIOD,
+ &rq->ext_server);
+}
+
+static int sched_ext_server_period_show(struct seq_file *m, void *v)
+{
+ unsigned long cpu = (unsigned long) m->private;
+ struct rq *rq = cpu_rq(cpu);
+
+ return sched_server_show_common(m, v, DL_PERIOD, &rq->ext_server);
+}
+
+static int sched_ext_server_period_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_ext_server_period_show, inode->i_private);
+}
+
+static const struct file_operations ext_server_period_fops = {
+ .open = sched_ext_server_period_open,
+ .write = sched_ext_server_period_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif /* CONFIG_SCHED_CLASS_EXT */
+
static struct dentry *debugfs_sched;
static void debugfs_fair_server_init(void)
@@ -488,6 +567,29 @@ static void debugfs_fair_server_init(void)
}
}
+#ifdef CONFIG_SCHED_CLASS_EXT
+static void debugfs_ext_server_init(void)
+{
+ struct dentry *d_ext;
+ unsigned long cpu;
+
+ d_ext = debugfs_create_dir("ext_server", debugfs_sched);
+ if (!d_ext)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct dentry *d_cpu;
+ char buf[32];
+
+ snprintf(buf, sizeof(buf), "cpu%lu", cpu);
+ d_cpu = debugfs_create_dir(buf, d_ext);
+
+ debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &ext_server_runtime_fops);
+ debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &ext_server_period_fops);
+ }
+}
+#endif /* CONFIG_SCHED_CLASS_EXT */
+
static __init int sched_init_debug(void)
{
struct dentry __maybe_unused *numa;
@@ -526,6 +628,9 @@ static __init int sched_init_debug(void)
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
debugfs_fair_server_init();
+#ifdef CONFIG_SCHED_CLASS_EXT
+ debugfs_ext_server_init();
+#endif
return 0;
}
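The new knobs mirror the fair_server layout: one directory per CPU with runtime/period files in nanoseconds. A minimal sketch of setting the cpu0 ext server runtime from user space, assuming debugfs is mounted at /sys/kernel/debug and that debugfs_sched corresponds to the "sched" directory there:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/sched/ext_server/cpu0/runtime";
	const char *val = "1000000\n";		/* 1 ms of runtime per period */
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, val, strlen(val)) < 0)
		perror(path);
	if (fd >= 0)
		close(fd);
	return 0;
}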
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7ccd84c17792..e6bf73456176 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -959,6 +959,8 @@ static void update_curr_scx(struct rq *rq)
if (!curr->scx.slice)
touch_core_sched(rq, curr);
}
+
+ dl_server_update(&rq->ext_server, delta_exec);
}
static bool scx_dsq_priq_less(struct rb_node *node_a,
@@ -1502,6 +1504,10 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
if (enq_flags & SCX_ENQ_WAKEUP)
touch_core_sched(rq, p);
+ /* Start dl_server if this is the first task being enqueued */
+ if (rq->scx.nr_running == 1)
+ dl_server_start(&rq->ext_server);
+
do_enqueue_task(rq, p, enq_flags, sticky_cpu);
out:
rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
@@ -2454,7 +2460,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
/* see kick_cpus_irq_workfn() */
smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
- rq_modified_clear(rq);
+ rq->next_class = &ext_sched_class;
rq_unpin_lock(rq, rf);
balance_one(rq, prev);
@@ -2469,7 +2475,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
* If @force_scx is true, always try to pick a SCHED_EXT task,
* regardless of any higher-priority sched classes activity.
*/
- if (!force_scx && rq_modified_above(rq, &ext_sched_class))
+ if (!force_scx && sched_class_above(rq->next_class, &ext_sched_class))
return RETRY_TASK;
keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
@@ -2513,6 +2519,33 @@ static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
return do_pick_task_scx(rq, rf, false);
}
+/*
+ * Select the next task to run from the ext scheduling class.
+ *
+ * Use do_pick_task_scx() directly with @force_scx enabled, since the
+ * dl_server must always select a sched_ext task.
+ */
+static struct task_struct *
+ext_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf)
+{
+ if (!scx_enabled())
+ return NULL;
+
+ return do_pick_task_scx(dl_se->rq, rf, true);
+}
+
+/*
+ * Initialize the ext server deadline entity.
+ */
+void ext_server_init(struct rq *rq)
+{
+ struct sched_dl_entity *dl_se = &rq->ext_server;
+
+ init_dl_entity(dl_se);
+
+ dl_server_init(dl_se, rq, ext_server_pick_task);
+}
+
#ifdef CONFIG_SCHED_CORE
/**
* scx_prio_less - Task ordering for core-sched
@@ -3141,7 +3174,8 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p)
scx_disable_task(p);
}
-static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
+static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {}
+
static void switched_to_scx(struct rq *rq, struct task_struct *p) {}
int scx_check_setscheduler(struct task_struct *p, int policy)
@@ -3402,8 +3436,6 @@ static void scx_cgroup_unlock(void) {}
* their current sched_class. Call them directly from sched core instead.
*/
DEFINE_SCHED_CLASS(ext) = {
- .queue_mask = 1,
-
.enqueue_task = enqueue_task_scx,
.dequeue_task = dequeue_task_scx,
.yield_task = yield_task_scx,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c3502c3c7b88..1e22b7fadd70 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -524,10 +524,48 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
* Scheduling class tree data structure manipulation methods:
*/
+extern void __BUILD_BUG_vruntime_cmp(void);
+
+/* Use __builtin_strcmp() because of __HAVE_ARCH_STRCMP: */
+
+#define vruntime_cmp(A, CMP_STR, B) ({ \
+ int __res = 0; \
+ \
+ if (!__builtin_strcmp(CMP_STR, "<")) { \
+ __res = ((s64)((A)-(B)) < 0); \
+ } else if (!__builtin_strcmp(CMP_STR, "<=")) { \
+ __res = ((s64)((A)-(B)) <= 0); \
+ } else if (!__builtin_strcmp(CMP_STR, ">")) { \
+ __res = ((s64)((A)-(B)) > 0); \
+ } else if (!__builtin_strcmp(CMP_STR, ">=")) { \
+ __res = ((s64)((A)-(B)) >= 0); \
+ } else { \
+ /* Unknown operator throws linker error: */ \
+ __BUILD_BUG_vruntime_cmp(); \
+ } \
+ \
+ __res; \
+})
+
+extern void __BUILD_BUG_vruntime_op(void);
+
+#define vruntime_op(A, OP_STR, B) ({ \
+ s64 __res = 0; \
+ \
+ if (!__builtin_strcmp(OP_STR, "-")) { \
+ __res = (s64)((A)-(B)); \
+ } else { \
+ /* Unknown operator throws linker error: */ \
+ __BUILD_BUG_vruntime_op(); \
+ } \
+ \
+ __res; \
+})
+
+
static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
- s64 delta = (s64)(vruntime - max_vruntime);
- if (delta > 0)
+ if (vruntime_cmp(vruntime, ">", max_vruntime))
max_vruntime = vruntime;
return max_vruntime;
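vruntime_cmp() exists because vruntimes are u64 values that may wrap; ordering has to be decided on the signed difference, not on a direct unsigned compare. A stand-alone illustration of why the (s64) cast matters:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t a = UINT64_MAX - 5;	/* just before the wrap point */
	uint64_t b = a + 10;		/* logically later, wrapped to 4 */

	printf("naive a < b:           %d\n", a < b);			/* 0: wrong   */
	printf("signed (a - b) < 0:    %d\n", (int64_t)(a - b) < 0);	/* 1: correct */
	return 0;
}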
@@ -535,8 +573,7 @@ static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime)
static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
- s64 delta = (s64)(vruntime - min_vruntime);
- if (delta < 0)
+ if (vruntime_cmp(vruntime, "<", min_vruntime))
min_vruntime = vruntime;
return min_vruntime;
@@ -549,12 +586,12 @@ static inline bool entity_before(const struct sched_entity *a,
* Tiebreak on vruntime seems unnecessary since it can
* hardly happen.
*/
- return (s64)(a->deadline - b->deadline) < 0;
+ return vruntime_cmp(a->deadline, "<", b->deadline);
}
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return (s64)(se->vruntime - cfs_rq->zero_vruntime);
+ return vruntime_op(se->vruntime, "-", cfs_rq->zero_vruntime);
}
#define __node_2_se(node) \
@@ -576,7 +613,7 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
*
* \Sum lag_i = 0
* \Sum w_i * (V - v_i) = 0
- * \Sum w_i * V - w_i * v_i = 0
+ * \Sum (w_i * V - w_i * v_i) = 0
*
* From which we can solve an expression for V in v_i (which we have in
* se->vruntime):
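A worked form of the identity the comment is building toward, in the same \Sum notation:

	\Sum w_i * (V - v_i) = 0
	V * \Sum w_i = \Sum w_i * v_i
	V = (\Sum w_i * v_i) / (\Sum w_i)
	  = v0 + (\Sum w_i * (v_i - v0)) / (\Sum w_i)
	  = zero_vruntime + sum_w_vruntime / sum_weight

which is what avg_vruntime() computes, with the current task's contribution folded in when it is on the runqueue.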
@@ -607,11 +644,11 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Which we track using:
*
* v0 := cfs_rq->zero_vruntime
- * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
- * \Sum w_i := cfs_rq->avg_load
+ * \Sum (v_i - v0) * w_i := cfs_rq->sum_w_vruntime
+ * \Sum w_i := cfs_rq->sum_weight
*
* Since zero_vruntime closely tracks the per-task service, these
- * deltas: (v_i - v), will be in the order of the maximal (virtual) lag
+ * deltas: (v_i - v0), will be in the order of the maximal (virtual) lag
* induced in the system due to quantisation.
*
* Also, we use scale_load_down() to reduce the size.
@@ -619,32 +656,32 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
* As measured, the max (key * weight) value was ~44 bits for a kernel build.
*/
static void
-avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned long weight = scale_load_down(se->load.weight);
s64 key = entity_key(cfs_rq, se);
- cfs_rq->avg_vruntime += key * weight;
- cfs_rq->avg_load += weight;
+ cfs_rq->sum_w_vruntime += key * weight;
+ cfs_rq->sum_weight += weight;
}
static void
-avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned long weight = scale_load_down(se->load.weight);
s64 key = entity_key(cfs_rq, se);
- cfs_rq->avg_vruntime -= key * weight;
- cfs_rq->avg_load -= weight;
+ cfs_rq->sum_w_vruntime -= key * weight;
+ cfs_rq->sum_weight -= weight;
}
static inline
-void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+void sum_w_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
{
/*
- * v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load
+	 * v' = v + d ==> sum_w_vruntime' = sum_w_vruntime - d*sum_weight
*/
- cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
+ cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta;
}
/*
@@ -654,8 +691,8 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
u64 avg_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
- s64 avg = cfs_rq->avg_vruntime;
- long load = cfs_rq->avg_load;
+ s64 avg = cfs_rq->sum_w_vruntime;
+ long load = cfs_rq->sum_weight;
if (curr && curr->on_rq) {
unsigned long weight = scale_load_down(curr->load.weight);
@@ -722,8 +759,8 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
{
struct sched_entity *curr = cfs_rq->curr;
- s64 avg = cfs_rq->avg_vruntime;
- long load = cfs_rq->avg_load;
+ s64 avg = cfs_rq->sum_w_vruntime;
+ long load = cfs_rq->sum_weight;
if (curr && curr->on_rq) {
unsigned long weight = scale_load_down(curr->load.weight);
@@ -732,7 +769,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
load += weight;
}
- return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load;
+ return avg >= vruntime_op(vruntime, "-", cfs_rq->zero_vruntime) * load;
}
int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -743,9 +780,9 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void update_zero_vruntime(struct cfs_rq *cfs_rq)
{
u64 vruntime = avg_vruntime(cfs_rq);
- s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime);
+ s64 delta = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime);
- avg_vruntime_update(cfs_rq, delta);
+ sum_w_vruntime_update(cfs_rq, delta);
cfs_rq->zero_vruntime = vruntime;
}
@@ -770,13 +807,12 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
return entity_before(__node_2_se(a), __node_2_se(b));
}
-#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
-
static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node)
{
if (node) {
struct sched_entity *rse = __node_2_se(node);
- if (vruntime_gt(min_vruntime, se, rse))
+
+ if (vruntime_cmp(se->min_vruntime, ">", rse->min_vruntime))
se->min_vruntime = rse->min_vruntime;
}
}
@@ -819,7 +855,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- avg_vruntime_add(cfs_rq, se);
+ sum_w_vruntime_add(cfs_rq, se);
update_zero_vruntime(cfs_rq);
se->min_vruntime = se->vruntime;
se->min_slice = se->slice;
@@ -831,7 +867,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
&min_vruntime_cb);
- avg_vruntime_sub(cfs_rq, se);
+ sum_w_vruntime_sub(cfs_rq, se);
update_zero_vruntime(cfs_rq);
}
@@ -887,7 +923,7 @@ static inline void update_protect_slice(struct cfs_rq *cfs_rq, struct sched_enti
static inline bool protect_slice(struct sched_entity *se)
{
- return ((s64)(se->vprot - se->vruntime) > 0);
+ return vruntime_cmp(se->vruntime, "<", se->vprot);
}
static inline void cancel_protect_slice(struct sched_entity *se)
@@ -1024,7 +1060,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
*/
static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if ((s64)(se->vruntime - se->deadline) < 0)
+ if (vruntime_cmp(se->vruntime, "<", se->deadline))
return false;
/*
@@ -1513,7 +1549,7 @@ static unsigned int task_scan_start(struct task_struct *p)
/* Scale the maximum scan period with the amount of shared memory. */
rcu_read_lock();
- ng = rcu_dereference(p->numa_group);
+ ng = rcu_dereference_all(p->numa_group);
if (ng) {
unsigned long shared = group_faults_shared(ng);
unsigned long private = group_faults_priv(ng);
@@ -1580,7 +1616,7 @@ pid_t task_numa_group_id(struct task_struct *p)
pid_t gid = 0;
rcu_read_lock();
- ng = rcu_dereference(p->numa_group);
+ ng = rcu_dereference_all(p->numa_group);
if (ng)
gid = ng->gid;
rcu_read_unlock();
@@ -2239,7 +2275,7 @@ static bool task_numa_compare(struct task_numa_env *env,
return false;
rcu_read_lock();
- cur = rcu_dereference(dst_rq->curr);
+ cur = rcu_dereference_all(dst_rq->curr);
if (cur && ((cur->flags & (PF_EXITING | PF_KTHREAD)) ||
!cur->mm))
cur = NULL;
@@ -2284,7 +2320,7 @@ static bool task_numa_compare(struct task_numa_env *env,
* If dst and source tasks are in the same NUMA group, or not
* in any group then look only at task weights.
*/
- cur_ng = rcu_dereference(cur->numa_group);
+ cur_ng = rcu_dereference_all(cur->numa_group);
if (cur_ng == p_ng) {
/*
* Do not swap within a group or between tasks that have
@@ -2458,11 +2494,8 @@ static void task_numa_find_cpu(struct task_numa_env *env,
maymove = !load_too_imbalanced(src_load, dst_load, env);
}
- for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
- /* Skip this CPU if the source task cannot migrate */
- if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
- continue;
-
+ /* Skip CPUs to which the source task cannot migrate */
+ for_each_cpu_and(cpu, cpumask_of_node(env->dst_nid), env->p->cpus_ptr) {
env->dst_cpu = cpu;
if (task_numa_compare(env, taskimp, groupimp, maymove))
break;
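for_each_cpu_and() walks the intersection of the node mask and the task's affinity mask directly, so the explicit cpumask_test_cpu() filter goes away. A small kernel-style sketch of the equivalence (toy_walk_candidates() and its parameters are placeholders, not part of the patch):

#include <linux/cpumask.h>
#include <linux/printk.h>

/* Both loops visit the same CPUs; the second form never iterates over
 * CPUs outside the allowed mask in the first place. */
static void toy_walk_candidates(const struct cpumask *node_mask,
				const struct cpumask *allowed)
{
	int cpu;

	for_each_cpu(cpu, node_mask) {
		if (!cpumask_test_cpu(cpu, allowed))
			continue;
		pr_debug("candidate CPU %d\n", cpu);
	}

	for_each_cpu_and(cpu, node_mask, allowed)
		pr_debug("candidate CPU %d\n", cpu);
}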
@@ -2499,7 +2532,7 @@ static int task_numa_migrate(struct task_struct *p)
* to satisfy here.
*/
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
+ sd = rcu_dereference_all(per_cpu(sd_numa, env.src_cpu));
if (sd) {
env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
env.imb_numa_nr = sd->imb_numa_nr;
@@ -3023,7 +3056,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (!cpupid_match_pid(tsk, cpupid))
goto no_join;
- grp = rcu_dereference(tsk->numa_group);
+ grp = rcu_dereference_all(tsk->numa_group);
if (!grp)
goto no_join;
@@ -3694,7 +3727,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
#define add_positive(_ptr, _val) do { \
typeof(_ptr) ptr = (_ptr); \
- typeof(_val) val = (_val); \
+ __signed_scalar_typeof(*ptr) val = (_val); \
typeof(*ptr) res, var = READ_ONCE(*ptr); \
\
res = var + val; \
@@ -3706,23 +3739,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
} while (0)
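The switch from typeof(_val) to __signed_scalar_typeof(*ptr) presumably forces the delta into the signed counterpart of the accumulator's type; now that the former sub_positive() callers pass negative deltas through add_positive(), the "val < 0" underflow clamp only works if val really is signed. A userspace toy showing what goes wrong with an unsigned delta (all names here are illustrative):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t var = 100;
	uint64_t uval = (uint64_t)-150;	/* negative delta smuggled in as unsigned */

	/* With an unsigned delta the underflow check can never fire: */
	printf("uval < 0: %d\n", uval < 0);		/* prints 0 */

	/* Reinterpreting it as signed restores the clamp-on-underflow: */
	int64_t val = (int64_t)uval;
	uint64_t res = var + val;			/* wraps, so res > var */
	if (val < 0 && res > var)
		res = 0;
	printf("res: %llu\n", (unsigned long long)res);	/* prints 0 */
	return 0;
}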
/*
- * Unsigned subtract and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define sub_positive(_ptr, _val) do { \
- typeof(_ptr) ptr = (_ptr); \
- typeof(*ptr) val = (_val); \
- typeof(*ptr) res, var = READ_ONCE(*ptr); \
- res = var - val; \
- if (res > var) \
- res = 0; \
- WRITE_ONCE(*ptr, res); \
-} while (0)
-
-/*
* Remove and clamp on negative, from a local variable.
*
* A variant of sub_positive(), which does not use explicit load-store
@@ -3733,21 +3749,39 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
*ptr -= min_t(typeof(*ptr), *ptr, _val); \
} while (0)
+
+/*
+ * Because of rounding, se->util_sum might end up being +1 more than
+ * cfs->util_sum. Although this is not a problem by itself, detaching
+ * a lot of tasks with that rounding error between two updates of
+ * util_avg (~1ms) can make cfs->util_sum become zero while
+ * cfs->util_avg is not.
+ *
+ * Check that util_sum is still above its lower bound for the new
+ * util_avg. Given that period_contrib might have moved since the last
+ * sync, we are only sure that util_sum must be above or equal to
+ * util_avg * the minimum possible divider.
+ */
+#define __update_sa(sa, name, delta_avg, delta_sum) do { \
+ add_positive(&(sa)->name##_avg, delta_avg); \
+ add_positive(&(sa)->name##_sum, delta_sum); \
+ (sa)->name##_sum = max_t(typeof((sa)->name##_sum), \
+ (sa)->name##_sum, \
+ (sa)->name##_avg * PELT_MIN_DIVIDER); \
+} while (0)
+
static inline void
enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- cfs_rq->avg.load_avg += se->avg.load_avg;
- cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
+ __update_sa(&cfs_rq->avg, load, se->avg.load_avg,
+ se_weight(se) * se->avg.load_sum);
}
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
- sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
- cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, load, -se->avg.load_avg,
+ se_weight(se) * -se->avg.load_sum);
}
static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
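Enqueue, dequeue, the propagation paths and the removed-load handling below all funnel through __update_sa() now: a signed add with clamp-at-zero on each of the <avg, sum> pair, followed by re-asserting the sum >= avg * PELT_MIN_DIVIDER invariant. A toy model of that sequence for a single signal, with the divider passed in as a parameter rather than quoting the kernel's constant:

#include <stdint.h>

struct toy_sig { int64_t avg; int64_t sum; };

/* One signal's worth of __update_sa(): signed add, clamp at zero, then
 * keep sum at or above avg * min_divider (stands in for PELT_MIN_DIVIDER). */
static void toy_update_sa(struct toy_sig *s, int64_t delta_avg, int64_t delta_sum,
			  int64_t min_divider)
{
	s->avg += delta_avg;
	if (s->avg < 0)
		s->avg = 0;			/* add_positive() clamp */

	s->sum += delta_sum;
	if (s->sum < 0)
		s->sum = 0;

	if (s->sum < s->avg * min_divider)	/* lower-bound invariant */
		s->sum = s->avg * min_divider;
}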
@@ -4243,7 +4277,6 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
*/
divider = get_pelt_divider(&cfs_rq->avg);
-
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
new_sum = se->avg.util_avg * divider;
@@ -4251,12 +4284,7 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
se->avg.util_sum = new_sum;
/* Update parent cfs_rq utilization */
- add_positive(&cfs_rq->avg.util_avg, delta_avg);
- add_positive(&cfs_rq->avg.util_sum, delta_sum);
-
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
- cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, util, delta_avg, delta_sum);
}
static inline void
@@ -4282,11 +4310,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
se->avg.runnable_sum = new_sum;
/* Update parent cfs_rq runnable */
- add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
- add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
- cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, runnable, delta_avg, delta_sum);
}
static inline void
@@ -4350,11 +4374,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
se->avg.load_sum = runnable_sum;
se->avg.load_avg = load_avg;
- add_positive(&cfs_rq->avg.load_avg, delta_avg);
- add_positive(&cfs_rq->avg.load_sum, delta_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
- cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, load, delta_avg, delta_sum);
}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -4451,7 +4471,7 @@ static inline void migrate_se_pelt_lag(struct sched_entity *se)
rq = rq_of(cfs_rq);
rcu_read_lock();
- is_idle = is_idle_task(rcu_dereference(rq->curr));
+ is_idle = is_idle_task(rcu_dereference_all(rq->curr));
rcu_read_unlock();
/*
@@ -4553,33 +4573,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
raw_spin_unlock(&cfs_rq->removed.lock);
r = removed_load;
- sub_positive(&sa->load_avg, r);
- sub_positive(&sa->load_sum, r * divider);
- /* See sa->util_sum below */
- sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
+ __update_sa(sa, load, -r, -r*divider);
r = removed_util;
- sub_positive(&sa->util_avg, r);
- sub_positive(&sa->util_sum, r * divider);
- /*
- * Because of rounding, se->util_sum might ends up being +1 more than
- * cfs->util_sum. Although this is not a problem by itself, detaching
- * a lot of tasks with the rounding problem between 2 updates of
- * util_avg (~1ms) can make cfs->util_sum becoming null whereas
- * cfs_util_avg is not.
- * Check that util_sum is still above its lower bound for the new
- * util_avg. Given that period_contrib might have moved since the last
- * sync, we are only sure that util_sum must be above or equal to
- * util_avg * minimum possible divider
- */
- sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
+ __update_sa(sa, util, -r, -r*divider);
r = removed_runnable;
- sub_positive(&sa->runnable_avg, r);
- sub_positive(&sa->runnable_sum, r * divider);
- /* See sa->util_sum above */
- sa->runnable_sum = max_t(u32, sa->runnable_sum,
- sa->runnable_avg * PELT_MIN_DIVIDER);
+ __update_sa(sa, runnable, -r, -r*divider);
/*
* removed_runnable is the unweighted version of removed_load so we
@@ -4664,17 +4664,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
dequeue_load_avg(cfs_rq, se);
- sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
- sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
- cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
-
- sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
- sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
- cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, util, -se->avg.util_avg, -se->avg.util_sum);
+ __update_sa(&cfs_rq->avg, runnable, -se->avg.runnable_avg, -se->avg.runnable_sum);
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
@@ -5177,7 +5168,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*
* vl_i = (W + w_i)*vl'_i / W
*/
- load = cfs_rq->avg_load;
+ load = cfs_rq->sum_weight;
if (curr && curr->on_rq)
load += scale_load_down(curr->load.weight);
@@ -7150,8 +7141,7 @@ static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
static struct {
cpumask_var_t idle_cpus_mask;
- atomic_t nr_cpus;
- int has_blocked; /* Idle CPUS has blocked load */
+ int has_blocked_load; /* Idle CPUs have blocked load */
int needs_update; /* Newly idle CPUs need their next_balance collated */
unsigned long next_balance; /* in jiffy units */
unsigned long next_blocked; /* Next update of blocked load in jiffies */
@@ -7509,7 +7499,7 @@ static inline void set_idle_cores(int cpu, int val)
{
struct sched_domain_shared *sds;
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
if (sds)
WRITE_ONCE(sds->has_idle_cores, val);
}
@@ -7518,7 +7508,7 @@ static inline bool test_idle_cores(int cpu)
{
struct sched_domain_shared *sds;
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
if (sds)
return READ_ONCE(sds->has_idle_cores);
@@ -7647,7 +7637,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
if (sched_feat(SIS_UTIL)) {
- sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
+ sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, target));
if (sd_share) {
/* because !--nr is the condition to stop scan */
nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
@@ -7853,7 +7843,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* sd_asym_cpucapacity rather than sd_llc.
*/
if (sched_asym_cpucap_active()) {
- sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
+ sd = rcu_dereference_all(per_cpu(sd_asym_cpucapacity, target));
/*
* On an asymmetric CPU capacity system where an exclusive
* cpuset defines a symmetric island (i.e. one unique
@@ -7868,7 +7858,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
}
}
- sd = rcu_dereference(per_cpu(sd_llc, target));
+ sd = rcu_dereference_all(per_cpu(sd_llc, target));
if (!sd)
return target;
@@ -8337,7 +8327,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
struct energy_env eenv;
rcu_read_lock();
- pd = rcu_dereference(rd->pd);
+ pd = rcu_dereference_all(rd->pd);
if (!pd)
goto unlock;
@@ -8345,7 +8335,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* Energy-aware wake-up happens on the lowest sched_domain starting
* from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
*/
- sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
+ sd = rcu_dereference_all(*this_cpu_ptr(&sd_asym_cpucapacity));
while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
sd = sd->parent;
if (!sd)
@@ -8368,9 +8358,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
int max_spare_cap_cpu = -1;
int fits, max_fits = -1;
- cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
-
- if (cpumask_empty(cpus))
+ if (!cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask))
continue;
/* Account external pressure for the energy estimation */
@@ -8747,7 +8735,7 @@ preempt_sync(struct rq *rq, int wake_flags,
/*
* Preempt the current task with a newly woken task if needed:
*/
-static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
+static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_flags)
{
enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
struct task_struct *donor = rq->donor;
@@ -8755,6 +8743,12 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
struct cfs_rq *cfs_rq = task_cfs_rq(donor);
int cse_is_idle, pse_is_idle;
+ /*
+ * XXX Getting preempted by higher class, try and find idle CPU?
+ */
+ if (p->sched_class != &fair_sched_class)
+ return;
+
if (unlikely(se == pse))
return;
@@ -9337,7 +9331,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
*/
static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
- struct numa_group *numa_group = rcu_dereference(p->numa_group);
+ struct numa_group *numa_group = rcu_dereference_all(p->numa_group);
unsigned long src_weight, dst_weight;
int src_nid, dst_nid, dist;
@@ -9766,7 +9760,7 @@ static void attach_tasks(struct lb_env *env)
}
#ifdef CONFIG_NO_HZ_COMMON
-static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
+static inline bool cfs_rq_has_blocked_load(struct cfs_rq *cfs_rq)
{
if (cfs_rq->avg.load_avg)
return true;
@@ -9799,16 +9793,16 @@ static inline void update_blocked_load_tick(struct rq *rq)
WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
}
-static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+static inline void update_has_blocked_load_status(struct rq *rq, bool has_blocked_load)
{
- if (!has_blocked)
+ if (!has_blocked_load)
rq->has_blocked_load = 0;
}
#else /* !CONFIG_NO_HZ_COMMON: */
-static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
+static inline bool cfs_rq_has_blocked_load(struct cfs_rq *cfs_rq) { return false; }
static inline bool others_have_blocked(struct rq *rq) { return false; }
static inline void update_blocked_load_tick(struct rq *rq) {}
-static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
+static inline void update_has_blocked_load_status(struct rq *rq, bool has_blocked_load) {}
#endif /* !CONFIG_NO_HZ_COMMON */
static bool __update_blocked_others(struct rq *rq, bool *done)
@@ -9865,7 +9859,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
list_del_leaf_cfs_rq(cfs_rq);
/* Don't need periodic decay once load/util_avg are null */
- if (cfs_rq_has_blocked(cfs_rq))
+ if (cfs_rq_has_blocked_load(cfs_rq))
*done = false;
}
@@ -9925,7 +9919,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
bool decayed;
decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
- if (cfs_rq_has_blocked(cfs_rq))
+ if (cfs_rq_has_blocked_load(cfs_rq))
*done = false;
return decayed;
@@ -9937,23 +9931,27 @@ static unsigned long task_h_load(struct task_struct *p)
}
#endif /* !CONFIG_FAIR_GROUP_SCHED */
-static void sched_balance_update_blocked_averages(int cpu)
+static void __sched_balance_update_blocked_averages(struct rq *rq)
{
bool decayed = false, done = true;
- struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
- rq_lock_irqsave(rq, &rf);
update_blocked_load_tick(rq);
- update_rq_clock(rq);
decayed |= __update_blocked_others(rq, &done);
decayed |= __update_blocked_fair(rq, &done);
- update_blocked_load_status(rq, !done);
+ update_has_blocked_load_status(rq, !done);
if (decayed)
cpufreq_update_util(rq, 0);
- rq_unlock_irqrestore(rq, &rf);
+}
+
+static void sched_balance_update_blocked_averages(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ guard(rq_lock_irqsave)(rq);
+ update_rq_clock(rq);
+ __sched_balance_update_blocked_averages(rq);
}
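The rewritten wrapper takes the runqueue lock through the cleanup.h guard() facility: guard(rq_lock_irqsave)(rq) acquires the lock with interrupts saved and releases it automatically when the scope ends, which is what lets the rq_flags and the explicit unlock disappear. The same pattern with a generic raw spinlock, as a sketch (toy_lock/toy_bump are placeholders):

#include <linux/cleanup.h>
#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(toy_lock);
static unsigned long toy_counter;

/* The guard releases toy_lock and restores the IRQ state on every
 * path out of the function, with no explicit unlock needed. */
static void toy_bump(void)
{
	guard(raw_spinlock_irqsave)(&toy_lock);
	toy_counter++;
}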
/********** Helpers for sched_balance_find_src_group ************************/
@@ -10963,10 +10961,9 @@ sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int
* take care of it.
*/
if (p->nr_cpus_allowed != NR_CPUS) {
- struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
-
- cpumask_and(cpus, sched_group_span(local), p->cpus_ptr);
- imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr);
+ unsigned int w = cpumask_weight_and(p->cpus_ptr,
+ sched_group_span(local));
+ imb_numa_nr = min(w, sd->imb_numa_nr);
}
imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus);
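cpumask_weight_and() counts the population of the intersection directly, so the per-CPU select_rq_mask scratch mask and the extra cpumask_and() pass are no longer needed here. A short sketch of the equivalence (toy_* functions are illustrative only):

#include <linux/cpumask.h>

/* Both return the number of allowed CPUs inside the group span; the
 * second form needs no scratch mask. */
static unsigned int toy_weight_with_tmp(struct cpumask *tmp,
					const struct cpumask *span,
					const struct cpumask *allowed)
{
	cpumask_and(tmp, span, allowed);
	return cpumask_weight(tmp);
}

static unsigned int toy_weight_direct(const struct cpumask *span,
				      const struct cpumask *allowed)
{
	return cpumask_weight_and(span, allowed);
}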
@@ -11013,7 +11010,7 @@ static void update_idle_cpu_scan(struct lb_env *env,
if (env->sd->span_weight != llc_weight)
return;
- sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu));
+ sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, env->dst_cpu));
if (!sd_share)
return;
@@ -11363,7 +11360,7 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
goto force_balance;
if (!is_rd_overutilized(env->dst_rq->rd) &&
- rcu_dereference(env->dst_rq->rd->pd))
+ rcu_dereference_all(env->dst_rq->rd->pd))
goto out_balanced;
/* ASYM feature bypasses nice load balance check */
@@ -12431,20 +12428,29 @@ static void nohz_balancer_kick(struct rq *rq)
*/
nohz_balance_exit_idle(rq);
- /*
- * None are in tickless mode and hence no need for NOHZ idle load
- * balancing:
- */
- if (likely(!atomic_read(&nohz.nr_cpus)))
- return;
-
- if (READ_ONCE(nohz.has_blocked) &&
+ if (READ_ONCE(nohz.has_blocked_load) &&
time_after(now, READ_ONCE(nohz.next_blocked)))
flags = NOHZ_STATS_KICK;
+ /*
+ * Most of the time the system is not 100% busy, i.e. some CPUs are
+ * idle (tickless). Skip the check below if the balance time is not
+ * due yet.
+ *
+ * If none are in tickless mode, there may be a narrow window
+ * (28 jiffies at HZ=1000) where flags may be set and kick_ilb() called,
+ * but idle load balancing is not done as find_new_ilb() fails.
+ * That's very rare, so only look at nohz.idle_cpus_mask once the time
+ * is due.
+ */
if (time_before(now, nohz.next_balance))
goto out;
+ /*
+ * None are in tickless mode and hence no need for NOHZ idle load
+ * balancing
+ */
+ if (unlikely(cpumask_empty(nohz.idle_cpus_mask)))
+ return;
+
if (rq->nr_running >= 2) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto out;
@@ -12452,7 +12458,7 @@ static void nohz_balancer_kick(struct rq *rq)
rcu_read_lock();
- sd = rcu_dereference(rq->sd);
+ sd = rcu_dereference_all(rq->sd);
if (sd) {
/*
* If there's a runnable CFS task and the current CPU has reduced
@@ -12464,7 +12470,7 @@ static void nohz_balancer_kick(struct rq *rq)
}
}
- sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
+ sd = rcu_dereference_all(per_cpu(sd_asym_packing, cpu));
if (sd) {
/*
* When ASYM_PACKING; see if there's a more preferred CPU
@@ -12482,7 +12488,7 @@ static void nohz_balancer_kick(struct rq *rq)
}
}
- sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
+ sd = rcu_dereference_all(per_cpu(sd_asym_cpucapacity, cpu));
if (sd) {
/*
* When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
@@ -12503,7 +12509,7 @@ static void nohz_balancer_kick(struct rq *rq)
goto unlock;
}
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
if (sds) {
/*
* If there is an imbalance between LLC domains (IOW we could
@@ -12535,7 +12541,7 @@ static void set_cpu_sd_state_busy(int cpu)
struct sched_domain *sd;
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
if (!sd || !sd->nohz_idle)
goto unlock;
@@ -12555,7 +12561,6 @@ void nohz_balance_exit_idle(struct rq *rq)
rq->nohz_tick_stopped = 0;
cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
- atomic_dec(&nohz.nr_cpus);
set_cpu_sd_state_busy(rq->cpu);
}
@@ -12565,7 +12570,7 @@ static void set_cpu_sd_state_idle(int cpu)
struct sched_domain *sd;
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
if (!sd || sd->nohz_idle)
goto unlock;
@@ -12599,9 +12604,9 @@ void nohz_balance_enter_idle(int cpu)
/*
* The tick is still stopped but load could have been added in the
- * meantime. We set the nohz.has_blocked flag to trig a check of the
+ * meantime. We set the nohz.has_blocked_load flag to trigger a check of the
* *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
- * of nohz.has_blocked can only happen after checking the new load
+ * of nohz.has_blocked_load can only happen after checking the new load
*/
if (rq->nohz_tick_stopped)
goto out;
@@ -12613,11 +12618,10 @@ void nohz_balance_enter_idle(int cpu)
rq->nohz_tick_stopped = 1;
cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
- atomic_inc(&nohz.nr_cpus);
/*
* Ensures that if nohz_idle_balance() fails to observe our
- * @idle_cpus_mask store, it must observe the @has_blocked
+ * @idle_cpus_mask store, it must observe the @has_blocked_load
* and @needs_update stores.
*/
smp_mb__after_atomic();
@@ -12630,7 +12634,7 @@ out:
* Each time a cpu enter idle, we assume that it has blocked load and
* enable the periodic update of the load of idle CPUs
*/
- WRITE_ONCE(nohz.has_blocked, 1);
+ WRITE_ONCE(nohz.has_blocked_load, 1);
}
static bool update_nohz_stats(struct rq *rq)
@@ -12671,8 +12675,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
/*
* We assume there will be no idle load after this update and clear
- * the has_blocked flag. If a cpu enters idle in the mean time, it will
- * set the has_blocked flag and trigger another update of idle load.
+ * the has_blocked_load flag. If a cpu enters idle in the meantime, it will
+ * set the has_blocked_load flag and trigger another update of idle load.
* Because a cpu that becomes idle, is added to idle_cpus_mask before
* setting the flag, we are sure to not clear the state and not
* check the load of an idle cpu.
@@ -12680,12 +12684,12 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
* Same applies to idle_cpus_mask vs needs_update.
*/
if (flags & NOHZ_STATS_KICK)
- WRITE_ONCE(nohz.has_blocked, 0);
+ WRITE_ONCE(nohz.has_blocked_load, 0);
if (flags & NOHZ_NEXT_KICK)
WRITE_ONCE(nohz.needs_update, 0);
/*
- * Ensures that if we miss the CPU, we must see the has_blocked
+ * Ensures that if we miss the CPU, we must see the has_blocked_load
* store from nohz_balance_enter_idle().
*/
smp_mb();
@@ -12752,7 +12756,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
abort:
/* There is still blocked load, enable periodic update */
if (has_blocked_load)
- WRITE_ONCE(nohz.has_blocked, 1);
+ WRITE_ONCE(nohz.has_blocked_load, 1);
}
/*
@@ -12814,7 +12818,7 @@ static void nohz_newidle_balance(struct rq *this_rq)
return;
/* Don't need to update blocked load of idle CPUs*/
- if (!READ_ONCE(nohz.has_blocked) ||
+ if (!READ_ONCE(nohz.has_blocked_load) ||
time_before(jiffies, READ_ONCE(nohz.next_blocked)))
return;
@@ -12885,29 +12889,28 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
*/
rq_unpin_lock(this_rq, rf);
- rcu_read_lock();
- sd = rcu_dereference_check_sched_domain(this_rq->sd);
- if (!sd) {
- rcu_read_unlock();
+ sd = rcu_dereference_sched_domain(this_rq->sd);
+ if (!sd)
goto out;
- }
if (!get_rd_overloaded(this_rq->rd) ||
this_rq->avg_idle < sd->max_newidle_lb_cost) {
update_next_balance(sd, &next_balance);
- rcu_read_unlock();
goto out;
}
- rcu_read_unlock();
-
- rq_modified_clear(this_rq);
- raw_spin_rq_unlock(this_rq);
+ /*
+ * Include sched_balance_update_blocked_averages() in the cost
+ * calculation because it can be quite costly -- this ensures we skip
+ * it when avg_idle gets to be very low.
+ */
t0 = sched_clock_cpu(this_cpu);
- sched_balance_update_blocked_averages(this_cpu);
+ __sched_balance_update_blocked_averages(this_rq);
+
+ this_rq->next_class = &fair_sched_class;
+ raw_spin_rq_unlock(this_rq);
- rcu_read_lock();
for_each_domain(this_cpu, sd) {
u64 domain_cost;
@@ -12957,7 +12960,6 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
if (pulled_task || !continue_balancing)
break;
}
- rcu_read_unlock();
raw_spin_rq_lock(this_rq);
@@ -12973,7 +12975,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
pulled_task = 1;
/* If a higher prio class was modified, restart the pick */
- if (rq_modified_above(this_rq, &fair_sched_class))
+ if (sched_class_above(this_rq->next_class, &fair_sched_class))
pulled_task = -1;
out:
@@ -13324,8 +13326,8 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
* zero_vruntime_fi, which would have been updated in prior calls
* to se_fi_update().
*/
- delta = (s64)(sea->vruntime - seb->vruntime) +
- (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi);
+ delta = vruntime_op(sea->vruntime, "-", seb->vruntime) +
+ vruntime_op(cfs_rqb->zero_vruntime_fi, "-", cfs_rqa->zero_vruntime_fi);
return delta > 0;
}
@@ -13363,6 +13365,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
entity_tick(cfs_rq, se, queued);
}
+ if (queued) {
+ if (!need_resched())
+ hrtick_start_fair(rq, curr);
+ return;
+ }
+
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
@@ -13871,15 +13879,12 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
* All the scheduling class methods:
*/
DEFINE_SCHED_CLASS(fair) = {
-
- .queue_mask = 2,
-
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
.yield_to_task = yield_to_task_fair,
- .wakeup_preempt = check_preempt_wakeup_fair,
+ .wakeup_preempt = wakeup_preempt_fair,
.pick_task = pick_task_fair,
.pick_next_task = pick_next_task_fair,
@@ -13939,7 +13944,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
struct numa_group *ng;
rcu_read_lock();
- ng = rcu_dereference(p->numa_group);
+ ng = rcu_dereference_all(p->numa_group);
for_each_online_node(node) {
if (p->numa_faults) {
tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index abf8f15d60c9..3681b6ad9276 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -460,6 +460,7 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct t
{
update_curr_idle(rq);
scx_update_idle(rq, false, true);
+ update_rq_avg_idle(rq);
}
static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
@@ -536,15 +537,15 @@ static void update_curr_idle(struct rq *rq)
se->exec_start = now;
dl_server_update_idle(&rq->fair_server, delta_exec);
+#ifdef CONFIG_SCHED_CLASS_EXT
+ dl_server_update_idle(&rq->ext_server, delta_exec);
+#endif
}
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
DEFINE_SCHED_CLASS(idle) = {
-
- .queue_mask = 0,
-
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f1867fe8e5c5..a7680477fa6f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1615,6 +1615,12 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *donor = rq->donor;
+ /*
+ * XXX If we're preempted by DL, queue a push?
+ */
+ if (p->sched_class != &rt_sched_class)
+ return;
+
if (p->prio < donor->prio) {
resched_curr(rq);
return;
@@ -2100,6 +2106,7 @@ static void push_rt_tasks(struct rq *rq)
*/
static int rto_next_cpu(struct root_domain *rd)
{
+ int this_cpu = smp_processor_id();
int next;
int cpu;
@@ -2123,6 +2130,10 @@ static int rto_next_cpu(struct root_domain *rd)
rd->rto_cpu = cpu;
+ /* Do not send IPI to self */
+ if (cpu == this_cpu)
+ continue;
+
if (cpu < nr_cpu_ids)
return cpu;
@@ -2568,9 +2579,6 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu)
#endif /* CONFIG_SCHED_CORE */
DEFINE_SCHED_CLASS(rt) = {
-
- .queue_mask = 4,
-
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 275370854481..62f9278b1663 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -418,6 +418,7 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
extern void sched_init_dl_servers(void);
extern void fair_server_init(struct rq *rq);
+extern void ext_server_init(struct rq *rq);
extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
u64 runtime, u64 period, bool init);
@@ -674,16 +675,16 @@ struct balance_callback {
void (*func)(struct rq *rq);
};
-/* CFS-related fields in a runqueue */
+/* Fair scheduling SCHED_{NORMAL,BATCH,IDLE} related fields in a runqueue: */
struct cfs_rq {
struct load_weight load;
unsigned int nr_queued;
- unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */
- unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */
- unsigned int h_nr_idle; /* SCHED_IDLE */
+ unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int h_nr_idle; /* SCHED_IDLE */
- s64 avg_vruntime;
- u64 avg_load;
+ s64 sum_w_vruntime;
+ u64 sum_weight;
u64 zero_vruntime;
#ifdef CONFIG_SCHED_CORE
@@ -694,7 +695,7 @@ struct cfs_rq {
struct rb_root_cached tasks_timeline;
/*
- * 'curr' points to currently running entity on this cfs_rq.
+ * 'curr' points to the currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
struct sched_entity *curr;
@@ -730,9 +731,7 @@ struct cfs_rq {
unsigned long h_load;
u64 last_h_load_update;
struct sched_entity *h_load_next;
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */
/*
@@ -745,19 +744,19 @@ struct cfs_rq {
*/
int on_list;
struct list_head leaf_cfs_rq_list;
- struct task_group *tg; /* group that "owns" this runqueue */
+ struct task_group *tg; /* Group that "owns" this runqueue */
/* Locally cached copy of our task_group's idle value */
int idle;
-#ifdef CONFIG_CFS_BANDWIDTH
+# ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
s64 runtime_remaining;
u64 throttled_pelt_idle;
-#ifndef CONFIG_64BIT
+# ifndef CONFIG_64BIT
u64 throttled_pelt_idle_copy;
-#endif
+# endif
u64 throttled_clock;
u64 throttled_clock_pelt;
u64 throttled_clock_pelt_time;
@@ -769,7 +768,7 @@ struct cfs_rq {
struct list_head throttled_list;
struct list_head throttled_csd_list;
struct list_head throttled_limbo_list;
-#endif /* CONFIG_CFS_BANDWIDTH */
+# endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
@@ -1121,28 +1120,50 @@ DECLARE_STATIC_KEY_FALSE(sched_uclamp_used);
* acquire operations must be ordered by ascending &runqueue.
*/
struct rq {
- /* runqueue lock: */
- raw_spinlock_t __lock;
-
- /* Per class runqueue modification mask; bits in class order. */
- unsigned int queue_mask;
+ /*
+ * The following members are loaded together, without holding the
+ * rq->lock, in an extremely hot loop in update_sg_lb_stats()
+ * (called from pick_next_task()). To reduce cache pollution from
+ * this operation, they are placed together on this dedicated cache
+ * line. Even though some of them are frequently modified, they are
+ * loaded much more frequently than they are stored.
+ */
unsigned int nr_running;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
- unsigned int numa_migrate_on;
#endif
+ unsigned int ttwu_pending;
+ unsigned long cpu_capacity;
+#ifdef CONFIG_SCHED_PROXY_EXEC
+ struct task_struct __rcu *donor; /* Scheduling context */
+ struct task_struct __rcu *curr; /* Execution context */
+#else
+ union {
+ struct task_struct __rcu *donor; /* Scheduler context */
+ struct task_struct __rcu *curr; /* Execution context */
+ };
+#endif
+ struct task_struct *idle;
+ /* padding left here deliberately */
+
+ /*
+ * The next cacheline holds the (hot) runqueue lock, as well as
+ * some other less performance-critical fields.
+ */
+ u64 nr_switches ____cacheline_aligned;
+
+ /* runqueue lock: */
+ raw_spinlock_t __lock;
+
#ifdef CONFIG_NO_HZ_COMMON
- unsigned long last_blocked_load_update_tick;
- unsigned int has_blocked_load;
- call_single_data_t nohz_csd;
unsigned int nohz_tick_stopped;
atomic_t nohz_flags;
+ unsigned int has_blocked_load;
+ unsigned long last_blocked_load_update_tick;
+ call_single_data_t nohz_csd;
#endif /* CONFIG_NO_HZ_COMMON */
- unsigned int ttwu_pending;
- u64 nr_switches;
-
#ifdef CONFIG_UCLAMP_TASK
/* Utilization clamp values based on CPU's RUNNABLE tasks */
struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned;
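The comment above documents the intent of the reshuffle: the fields that the load balancer reads locklessly stay on one read-mostly cache line, while the heavily written lock and nr_switches start the next one via ____cacheline_aligned, and the __no_randomize_layout annotation added at the closing brace keeps structure layout randomization from defeating the placement. In miniature (toy_rq is purely illustrative):

#include <linux/cache.h>
#include <linux/spinlock.h>
#include <linux/types.h>

/* Read-mostly hot fields first, then a fresh cache line for the
 * write-heavy part; layout kept stable under randstruct. */
struct toy_rq {
	unsigned int	nr_running;	/* read locklessly, very hot */
	unsigned long	cpu_capacity;	/* read locklessly, very hot */

	u64		nr_switches ____cacheline_aligned; /* starts a new line */
	raw_spinlock_t	lock;		/* heavily written */
} __no_randomize_layout;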
@@ -1155,6 +1176,7 @@ struct rq {
struct dl_rq dl;
#ifdef CONFIG_SCHED_CLASS_EXT
struct scx_rq scx;
+ struct sched_dl_entity ext_server;
#endif
struct sched_dl_entity fair_server;
@@ -1165,6 +1187,9 @@ struct rq {
struct list_head *tmp_alone_branch;
#endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_NUMA_BALANCING
+ unsigned int numa_migrate_on;
+#endif
/*
* This is part of a global counter where only the total sum
* over all CPUs matters. A task can increase this counter on
@@ -1173,36 +1198,29 @@ struct rq {
*/
unsigned long nr_uninterruptible;
-#ifdef CONFIG_SCHED_PROXY_EXEC
- struct task_struct __rcu *donor; /* Scheduling context */
- struct task_struct __rcu *curr; /* Execution context */
-#else
- union {
- struct task_struct __rcu *donor; /* Scheduler context */
- struct task_struct __rcu *curr; /* Execution context */
- };
-#endif
struct sched_dl_entity *dl_server;
- struct task_struct *idle;
struct task_struct *stop;
+ const struct sched_class *next_class;
unsigned long next_balance;
struct mm_struct *prev_mm;
- unsigned int clock_update_flags;
- u64 clock;
- /* Ensure that all clocks are in the same cache line */
+ /*
+ * The following fields of clock data are frequently referenced
+ * and updated together, and should go on their own cache line.
+ */
u64 clock_task ____cacheline_aligned;
u64 clock_pelt;
+ u64 clock;
unsigned long lost_idle_time;
+ unsigned int clock_update_flags;
u64 clock_pelt_idle;
u64 clock_idle;
+
#ifndef CONFIG_64BIT
u64 clock_pelt_idle_copy;
u64 clock_idle_copy;
#endif
- atomic_t nr_iowait;
-
u64 last_seen_need_resched_ns;
int ticks_without_resched;
@@ -1213,8 +1231,6 @@ struct rq {
struct root_domain *rd;
struct sched_domain __rcu *sd;
- unsigned long cpu_capacity;
-
struct balance_callback *balance_callback;
unsigned char nohz_idle_balance;
@@ -1324,7 +1340,9 @@ struct rq {
call_single_data_t cfsb_csd;
struct list_head cfsb_csd_list;
#endif
-};
+
+ atomic_t nr_iowait;
+} __no_randomize_layout;
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1698,6 +1716,7 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
#endif /* !CONFIG_FAIR_GROUP_SCHED */
+extern void update_rq_avg_idle(struct rq *rq);
extern void update_rq_clock(struct rq *rq);
/*
@@ -2062,8 +2081,8 @@ queue_balance_callback(struct rq *rq,
rq->balance_callback = head;
}
-#define rcu_dereference_check_sched_domain(p) \
- rcu_dereference_check((p), lockdep_is_held(&sched_domains_mutex))
+#define rcu_dereference_sched_domain(p) \
+ rcu_dereference_all_check((p), lockdep_is_held(&sched_domains_mutex))
/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
@@ -2073,7 +2092,7 @@ queue_balance_callback(struct rq *rq,
* preempt-disabled sections.
*/
#define for_each_domain(cpu, __sd) \
- for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
+ for (__sd = rcu_dereference_sched_domain(cpu_rq(cpu)->sd); \
__sd; __sd = __sd->parent)
/* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */
@@ -2481,15 +2500,6 @@ struct sched_class {
#ifdef CONFIG_UCLAMP_TASK
int uclamp_enabled;
#endif
- /*
- * idle: 0
- * ext: 1
- * fair: 2
- * rt: 4
- * dl: 8
- * stop: 16
- */
- unsigned int queue_mask;
/*
* move_queued_task/activate_task/enqueue_task: rq->lock
@@ -2648,20 +2658,6 @@ struct sched_class {
#endif
};
-/*
- * Does not nest; only used around sched_class::pick_task() rq-lock-breaks.
- */
-static inline void rq_modified_clear(struct rq *rq)
-{
- rq->queue_mask = 0;
-}
-
-static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class)
-{
- unsigned int mask = class->queue_mask;
- return rq->queue_mask & ~((mask << 1) - 1);
-}
-
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
WARN_ON_ONCE(rq->donor != prev);
@@ -3397,11 +3393,11 @@ struct irqtime {
};
DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
-extern int sched_clock_irqtime;
+DECLARE_STATIC_KEY_FALSE(sched_clock_irqtime);
static inline int irqtime_enabled(void)
{
- return sched_clock_irqtime;
+ return static_branch_likely(&sched_clock_irqtime);
}
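Converting sched_clock_irqtime from a plain int to a static key means irqtime_enabled() compiles down to a patched jump/NOP rather than a memory load on every accounting path; the key now has to be flipped through the static_branch API instead of a plain store. A generic sketch of the pattern, with toy names standing in for the real key:

#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(toy_feature_key);

static inline bool toy_feature_enabled(void)
{
	/* Patched at runtime: a NOP when disabled, a jump when enabled. */
	return static_branch_likely(&toy_feature_key);
}

static void toy_feature_set(bool on)
{
	if (on)
		static_branch_enable(&toy_feature_key);
	else
		static_branch_disable(&toy_feature_key);
}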
/*
@@ -4010,6 +4006,7 @@ void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_s
deactivate_task(src_rq, task, 0);
set_task_cpu(task, dst_rq->cpu);
activate_task(dst_rq, task, 0);
+ wakeup_preempt(dst_rq, task, 0);
}
static inline
@@ -4079,6 +4076,7 @@ extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
struct sched_change_ctx {
u64 prio;
struct task_struct *p;
+ const struct sched_class *class;
int flags;
bool queued;
bool running;
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 4f9192be4b5b..f95798baddeb 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -97,9 +97,6 @@ static void update_curr_stop(struct rq *rq)
* Simple, special scheduling class for the per-CPU stop tasks:
*/
DEFINE_SCHED_CLASS(stop) = {
-
- .queue_mask = 16,
-
.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,
.yield_task = yield_task_stop,
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index cf643a5ddedd..ac268da91778 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -508,6 +508,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
if (rq->fair_server.dl_server)
__dl_server_attach_root(&rq->fair_server, rq);
+#ifdef CONFIG_SCHED_CLASS_EXT
+ if (rq->ext_server.dl_server)
+ __dl_server_attach_root(&rq->ext_server, rq);
+#endif
+
rq_unlock_irqrestore(rq, &rf);
if (old_rd)
diff --git a/kernel/sys.c b/kernel/sys.c
index 8b58eece4e58..af71987df81c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -53,6 +53,7 @@
#include <linux/time_namespace.h>
#include <linux/binfmts.h>
#include <linux/futex.h>
+#include <linux/rseq.h>
#include <linux/sched.h>
#include <linux/sched/autogroup.h>
@@ -2868,6 +2869,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_FUTEX_HASH:
error = futex_hash_prctl(arg2, arg3, arg4);
break;
+ case PR_RSEQ_SLICE_EXTENSION:
+ if (arg4 || arg5)
+ return -EINVAL;
+ error = rseq_slice_extension_prctl(arg2, arg3);
+ break;
default:
trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);
error = -EINVAL;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index bf5d05c635ff..add3032da16f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -390,6 +390,7 @@ COND_SYSCALL(setuid16);
/* restartable sequence */
COND_SYSCALL(rseq);
+COND_SYSCALL(rseq_slice_yield);
COND_SYSCALL(uretprobe);
COND_SYSCALL(uprobe);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 1caf02a72ba8..59d22f1bd0a8 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1742,7 +1742,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
lockdep_assert_held(&cpu_base->lock);
- debug_deactivate(timer);
+ debug_hrtimer_deactivate(timer);
base->running = timer;
/*