summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt5
-rw-r--r--Documentation/userspace-api/index.rst1
-rw-r--r--Documentation/userspace-api/rseq.rst140
-rw-r--r--arch/alpha/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/arm/tools/syscall.tbl1
-rw-r--r--arch/arm64/tools/syscall_32.tbl1
-rw-r--r--arch/m68k/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/microblaze/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/mips/kernel/syscalls/syscall_n32.tbl1
-rw-r--r--arch/mips/kernel/syscalls/syscall_n64.tbl1
-rw-r--r--arch/mips/kernel/syscalls/syscall_o32.tbl1
-rw-r--r--arch/parisc/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/powerpc/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/s390/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/sh/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/sparc/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl1
-rw-r--r--arch/x86/kernel/tsc.c2
-rw-r--r--arch/xtensa/kernel/syscalls/syscall.tbl1
-rw-r--r--include/linux/compiler_types.h19
-rw-r--r--include/linux/entry-common.h167
-rw-r--r--include/linux/rseq.h11
-rw-r--r--include/linux/rseq_entry.h192
-rw-r--r--include/linux/rseq_types.h32
-rw-r--r--include/linux/sched.h14
-rw-r--r--include/linux/syscalls.h1
-rw-r--r--include/linux/thread_info.h16
-rw-r--r--include/uapi/asm-generic/unistd.h5
-rw-r--r--include/uapi/linux/prctl.h10
-rw-r--r--include/uapi/linux/rseq.h41
-rw-r--r--init/Kconfig12
-rw-r--r--init/init_task.c1
-rw-r--r--kernel/Kconfig.preempt3
-rw-r--r--kernel/entry/common.c27
-rw-r--r--kernel/entry/common.h7
-rw-r--r--kernel/entry/syscall-common.c97
-rw-r--r--kernel/entry/syscall_user_dispatch.c4
-rw-r--r--kernel/rseq.c365
-rw-r--r--kernel/sched/clock.c3
-rw-r--r--kernel/sched/core.c86
-rw-r--r--kernel/sched/cpufreq_schedutil.c2
-rw-r--r--kernel/sched/cputime.c9
-rw-r--r--kernel/sched/deadline.c105
-rw-r--r--kernel/sched/debug.c187
-rw-r--r--kernel/sched/ext.c42
-rw-r--r--kernel/sched/fair.c411
-rw-r--r--kernel/sched/idle.c7
-rw-r--r--kernel/sched/rt.c14
-rw-r--r--kernel/sched/sched.h142
-rw-r--r--kernel/sched/stop_task.c3
-rw-r--r--kernel/sched/topology.c5
-rw-r--r--kernel/sys.c6
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/time/hrtimer.c2
-rw-r--r--scripts/syscall.tbl1
-rw-r--r--tools/testing/selftests/rseq/.gitignore1
-rw-r--r--tools/testing/selftests/rseq/Makefile5
-rw-r--r--tools/testing/selftests/rseq/rseq-abi.h27
-rw-r--r--tools/testing/selftests/rseq/rseq-slice-hist.py132
-rw-r--r--tools/testing/selftests/rseq/slice_test.c219
-rw-r--r--tools/testing/selftests/sched_ext/Makefile2
-rw-r--r--tools/testing/selftests/sched_ext/rt_stall.bpf.c23
-rw-r--r--tools/testing/selftests/sched_ext/rt_stall.c240
-rw-r--r--tools/testing/selftests/sched_ext/total_bw.c281
65 files changed, 2598 insertions, 546 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index fc18080365ff..55f5c71c5d3b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6627,6 +6627,11 @@ Kernel parameters
rootflags= [KNL] Set root filesystem mount option string
+ rseq_slice_ext= [KNL] RSEQ based time slice extension
+ Format: boolean
+ Control enablement of RSEQ based time slice extension.
+ Default is 'on'.
+
initramfs_options= [KNL]
Specify mount options for for the initramfs mount.
diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst
index 6f0235ecc572..a68b1bea57a8 100644
--- a/Documentation/userspace-api/index.rst
+++ b/Documentation/userspace-api/index.rst
@@ -21,6 +21,7 @@ System calls
ebpf/index
ioctl/index
mseal
+ rseq
Security-related interfaces
===========================
diff --git a/Documentation/userspace-api/rseq.rst b/Documentation/userspace-api/rseq.rst
new file mode 100644
index 000000000000..3cd27a3c7c7e
--- /dev/null
+++ b/Documentation/userspace-api/rseq.rst
@@ -0,0 +1,140 @@
+=====================
+Restartable Sequences
+=====================
+
+Restartable Sequences allow to register a per thread userspace memory area
+to be used as an ABI between kernel and userspace for three purposes:
+
+ * userspace restartable sequences
+
+ * quick access to read the current CPU number, node ID from userspace
+
+ * scheduler time slice extensions
+
+Restartable sequences (per-cpu atomics)
+---------------------------------------
+
+Restartable sequences allow userspace to perform update operations on
+per-cpu data without requiring heavyweight atomic operations. The actual
+ABI is unfortunately only available in the code and selftests.
+
+Quick access to CPU number, node ID
+-----------------------------------
+
+Allows to implement per CPU data efficiently. Documentation is in code and
+selftests. :(
+
+Scheduler time slice extensions
+-------------------------------
+
+This allows a thread to request a time slice extension when it enters a
+critical section to avoid contention on a resource when the thread is
+scheduled out inside of the critical section.
+
+The prerequisites for this functionality are:
+
+ * Enabled in Kconfig
+
+ * Enabled at boot time (default is enabled)
+
+ * A rseq userspace pointer has been registered for the thread
+
+The thread has to enable the functionality via prctl(2)::
+
+ prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET,
+ PR_RSEQ_SLICE_EXT_ENABLE, 0, 0);
+
+prctl() returns 0 on success or otherwise with the following error codes:
+
+========= ==============================================================
+Errorcode Meaning
+========= ==============================================================
+EINVAL Functionality not available or invalid function arguments.
+ Note: arg4 and arg5 must be zero
+ENOTSUPP Functionality was disabled on the kernel command line
+ENXIO Available, but no rseq user struct registered
+========= ==============================================================
+
+The state can be also queried via prctl(2)::
+
+ prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_GET, 0, 0, 0);
+
+prctl() returns ``PR_RSEQ_SLICE_EXT_ENABLE`` when it is enabled or 0 if
+disabled. Otherwise it returns with the following error codes:
+
+========= ==============================================================
+Errorcode Meaning
+========= ==============================================================
+EINVAL Functionality not available or invalid function arguments.
+ Note: arg3 and arg4 and arg5 must be zero
+========= ==============================================================
+
+The availability and status is also exposed via the rseq ABI struct flags
+field via the ``RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE_BIT`` and the
+``RSEQ_CS_FLAG_SLICE_EXT_ENABLED_BIT``. These bits are read-only for user
+space and only for informational purposes.
+
+If the mechanism was enabled via prctl(), the thread can request a time
+slice extension by setting rseq::slice_ctrl::request to 1. If the thread is
+interrupted and the interrupt results in a reschedule request in the
+kernel, then the kernel can grant a time slice extension and return to
+userspace instead of scheduling out. The length of the extension is
+determined by debugfs:rseq/slice_ext_nsec. The default value is 5 usec; which
+is the minimum value. It can be incremented to 50 usecs, however doing so
+can/will affect the minimum scheduling latency.
+
+Any proposed changes to this default will have to come with a selftest and
+rseq-slice-hist.py output that shows the new value has merrit.
+
+The kernel indicates the grant by clearing rseq::slice_ctrl::request and
+setting rseq::slice_ctrl::granted to 1. If there is a reschedule of the
+thread after granting the extension, the kernel clears the granted bit to
+indicate that to userspace.
+
+If the request bit is still set when the leaving the critical section,
+userspace can clear it and continue.
+
+If the granted bit is set, then userspace invokes rseq_slice_yield(2) when
+leaving the critical section to relinquish the CPU. The kernel enforces
+this by arming a timer to prevent misbehaving userspace from abusing this
+mechanism.
+
+If both the request bit and the granted bit are false when leaving the
+critical section, then this indicates that a grant was revoked and no
+further action is required by userspace.
+
+The required code flow is as follows::
+
+ rseq->slice_ctrl.request = 1;
+ barrier(); // Prevent compiler reordering
+ critical_section();
+ barrier(); // Prevent compiler reordering
+ rseq->slice_ctrl.request = 0;
+ if (rseq->slice_ctrl.granted)
+ rseq_slice_yield();
+
+As all of this is strictly CPU local, there are no atomicity requirements.
+Checking the granted state is racy, but that cannot be avoided at all::
+
+ if (rseq->slice_ctrl.granted)
+ -> Interrupt results in schedule and grant revocation
+ rseq_slice_yield();
+
+So there is no point in pretending that this might be solved by an atomic
+operation.
+
+If the thread issues a syscall other than rseq_slice_yield(2) within the
+granted timeslice extension, the grant is also revoked and the CPU is
+relinquished immediately when entering the kernel. This is required as
+syscalls might consume arbitrary CPU time until they reach a scheduling
+point when the preemption model is either NONE or VOLUNTARY and therefore
+might exceed the grant by far.
+
+The preferred solution for user space is to use rseq_slice_yield(2) which
+is side effect free. The support for arbitrary syscalls is required to
+support onion layer architectured applications, where the code handling the
+critical section and requesting the time slice extension has no control
+over the code within the critical section.
+
+The kernel enforces flag consistency and terminates the thread with SIGSEGV
+if it detects a violation.
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 3fed97478058..f31b7afffc34 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -510,3 +510,4 @@
578 common file_getattr sys_file_getattr
579 common file_setattr sys_file_setattr
580 common listns sys_listns
+581 common rseq_slice_yield sys_rseq_slice_yield
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index fd09afae72a2..94351e22bfcf 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -485,3 +485,4 @@
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
470 common listns sys_listns
+471 common rseq_slice_yield sys_rseq_slice_yield
diff --git a/arch/arm64/tools/syscall_32.tbl b/arch/arm64/tools/syscall_32.tbl
index 8cdfe5d4dac9..62d93d88e0fe 100644
--- a/arch/arm64/tools/syscall_32.tbl
+++ b/arch/arm64/tools/syscall_32.tbl
@@ -482,3 +482,4 @@
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
470 common listns sys_listns
+471 common rseq_slice_yield sys_rseq_slice_yield
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 871a5d67bf41..248934257101 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -470,3 +470,4 @@
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
470 common listns sys_listns
+471 common rseq_slice_yield sys_rseq_slice_yield
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 022fc85d94b3..223d26303627 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -476,3 +476,4 @@
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
470 common listns sys_listns
+471 common rseq_slice_yield sys_rseq_slice_yield
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 8cedc83c3266..7430714e2b8f 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -409,3 +409,4 @@
468 n32 file_getattr sys_file_getattr
469 n32 file_setattr sys_file_setattr
470 n32 listns sys_listns
+471 n32 rseq_slice_yield sys_rseq_slice_yield
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 9b92bddf06b5..630aab9e5425 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -385,3 +385,4 @@
468 n64 file_getattr sys_file_getattr
469 n64 file_setattr sys_file_setattr
470 n64 listns sys_listns
+471 n64 rseq_slice_yield sys_rseq_slice_yield
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index f810b8a55716..128653112284 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -458,3 +458,4 @@
468 o32 file_getattr sys_file_getattr
469 o32 file_setattr sys_file_setattr
470 o32 listns sys_listns
+471 o32 rseq_slice_yield sys_rseq_slice_yield
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 39bdacaa530b..f6e2d0379d57 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -469,3 +469,4 @@
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
470 common listns sys_listns
+471 common rseq_slice_yield sys_rseq_slice_yield
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index ec4458cdb97b..4fcc7c58a105 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -561,3 +561,4 @@
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
470 common listns sys_listns
+471 nospu rseq_slice_yield sys_rseq_slice_yield
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 417ed16b3c63..09a7ef04d979 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -397,3 +397,4 @@
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
470 common listns sys_listns
+471 common rseq_slice_yield sys_rseq_slice_yield
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 969c11325ade..70b315cbe710 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -474,3 +474,4 @@
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
470 common listns sys_listns
+471 common rseq_slice_yield sys_rseq_slice_yield
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index c0307bb09892..7e71bf7fcd14 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -516,3 +516,4 @@
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
470 common listns sys_listns
+471 common rseq_slice_yield sys_rseq_slice_yield
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index e979a3eac7a3..f832ebd2d79b 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -476,3 +476,4 @@
468 i386 file_getattr sys_file_getattr
469 i386 file_setattr sys_file_setattr
470 i386 listns sys_listns
+471 i386 rseq_slice_yield sys_rseq_slice_yield
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 8a4ac4841be6..524155d655da 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -395,6 +395,7 @@
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
470 common listns sys_listns
+471 common rseq_slice_yield sys_rseq_slice_yield
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7d3e13e14eab..7be44b5198cf 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1143,7 +1143,6 @@ static void tsc_cs_mark_unstable(struct clocksource *cs)
tsc_unstable = 1;
if (using_native_sched_clock())
clear_sched_clock_stable();
- disable_sched_clock_irqtime();
pr_info("Marking TSC unstable due to clocksource watchdog\n");
}
@@ -1213,7 +1212,6 @@ void mark_tsc_unstable(char *reason)
tsc_unstable = 1;
if (using_native_sched_clock())
clear_sched_clock_stable();
- disable_sched_clock_irqtime();
pr_info("Marking TSC unstable due to %s\n", reason);
clocksource_mark_unstable(&clocksource_tsc_early);
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 438a3b170402..a9bca4e484de 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -441,3 +441,4 @@
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
470 common listns sys_listns
+471 common rseq_slice_yield sys_rseq_slice_yield
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index ea96a4466d82..26d322d43224 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -620,6 +620,25 @@ struct ftrace_likely_data {
__scalar_type_to_expr_cases(long long), \
default: (x)))
+/*
+ * __signed_scalar_typeof(x) - Declare a signed scalar type, leaving
+ * non-scalar types unchanged.
+ */
+
+#define __scalar_type_to_signed_cases(type) \
+ unsigned type: (signed type)0, \
+ signed type: (signed type)0
+
+#define __signed_scalar_typeof(x) typeof( \
+ _Generic((x), \
+ char: (signed char)0, \
+ __scalar_type_to_signed_cases(char), \
+ __scalar_type_to_signed_cases(short), \
+ __scalar_type_to_signed_cases(int), \
+ __scalar_type_to_signed_cases(long), \
+ __scalar_type_to_signed_cases(long long), \
+ default: (x)))
+
/* Is this type a native word size -- useful for atomic operations */
#define __native_word(t) \
(sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 87efb38b7081..f83ca0abf2cd 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -2,6 +2,7 @@
#ifndef __LINUX_ENTRYCOMMON_H
#define __LINUX_ENTRYCOMMON_H
+#include <linux/audit.h>
#include <linux/irq-entry-common.h>
#include <linux/livepatch.h>
#include <linux/ptrace.h>
@@ -36,8 +37,8 @@
SYSCALL_WORK_SYSCALL_EMU | \
SYSCALL_WORK_SYSCALL_AUDIT | \
SYSCALL_WORK_SYSCALL_USER_DISPATCH | \
+ SYSCALL_WORK_SYSCALL_RSEQ_SLICE | \
ARCH_SYSCALL_WORK_ENTER)
-
#define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \
SYSCALL_WORK_SYSCALL_TRACE | \
SYSCALL_WORK_SYSCALL_AUDIT | \
@@ -45,7 +46,84 @@
SYSCALL_WORK_SYSCALL_EXIT_TRAP | \
ARCH_SYSCALL_WORK_EXIT)
-long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work);
+/**
+ * arch_ptrace_report_syscall_entry - Architecture specific ptrace_report_syscall_entry() wrapper
+ *
+ * Invoked from syscall_trace_enter() to wrap ptrace_report_syscall_entry().
+ *
+ * This allows architecture specific ptrace_report_syscall_entry()
+ * implementations. If not defined by the architecture this falls back to
+ * to ptrace_report_syscall_entry().
+ */
+static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs);
+
+#ifndef arch_ptrace_report_syscall_entry
+static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs)
+{
+ return ptrace_report_syscall_entry(regs);
+}
+#endif
+
+bool syscall_user_dispatch(struct pt_regs *regs);
+long trace_syscall_enter(struct pt_regs *regs, long syscall);
+void trace_syscall_exit(struct pt_regs *regs, long ret);
+
+static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
+{
+ if (unlikely(audit_context())) {
+ unsigned long args[6];
+
+ syscall_get_arguments(current, regs, args);
+ audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
+ }
+}
+
+static __always_inline long syscall_trace_enter(struct pt_regs *regs, unsigned long work)
+{
+ long syscall, ret = 0;
+
+ /*
+ * Handle Syscall User Dispatch. This must comes first, since
+ * the ABI here can be something that doesn't make sense for
+ * other syscall_work features.
+ */
+ if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+ if (syscall_user_dispatch(regs))
+ return -1L;
+ }
+
+ /*
+ * User space got a time slice extension granted and relinquishes
+ * the CPU. The work stops the slice timer to avoid an extra round
+ * through hrtimer_interrupt().
+ */
+ if (work & SYSCALL_WORK_SYSCALL_RSEQ_SLICE)
+ rseq_syscall_enter_work(syscall_get_nr(current, regs));
+
+ /* Handle ptrace */
+ if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
+ ret = arch_ptrace_report_syscall_entry(regs);
+ if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
+ return -1L;
+ }
+
+ /* Do seccomp after ptrace, to catch any tracer changes. */
+ if (work & SYSCALL_WORK_SECCOMP) {
+ ret = __secure_computing();
+ if (ret == -1L)
+ return ret;
+ }
+
+ /* Either of the above might have changed the syscall number */
+ syscall = syscall_get_nr(current, regs);
+
+ if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
+ syscall = trace_syscall_enter(regs, syscall);
+
+ syscall_enter_audit(regs, syscall);
+
+ return ret ? : syscall;
+}
/**
* syscall_enter_from_user_mode_work - Check and handle work before invoking
@@ -75,7 +153,7 @@ static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *re
unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
if (work & SYSCALL_WORK_ENTER)
- syscall = syscall_trace_enter(regs, syscall, work);
+ syscall = syscall_trace_enter(regs, work);
return syscall;
}
@@ -112,6 +190,37 @@ static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, l
return ret;
}
+/*
+ * If SYSCALL_EMU is set, then the only reason to report is when
+ * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
+ * instruction has been already reported in syscall_enter_from_user_mode().
+ */
+static __always_inline bool report_single_step(unsigned long work)
+{
+ if (work & SYSCALL_WORK_SYSCALL_EMU)
+ return false;
+
+ return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
+}
+
+/**
+ * arch_ptrace_report_syscall_exit - Architecture specific ptrace_report_syscall_exit()
+ *
+ * This allows architecture specific ptrace_report_syscall_exit()
+ * implementations. If not defined by the architecture this falls back to
+ * to ptrace_report_syscall_exit().
+ */
+static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
+ int step);
+
+#ifndef arch_ptrace_report_syscall_exit
+static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
+ int step)
+{
+ ptrace_report_syscall_exit(regs, step);
+}
+#endif
+
/**
* syscall_exit_work - Handle work before returning to user mode
* @regs: Pointer to current pt_regs
@@ -119,20 +228,40 @@ static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, l
*
* Do one-time syscall specific work.
*/
-void syscall_exit_work(struct pt_regs *regs, unsigned long work);
+static __always_inline void syscall_exit_work(struct pt_regs *regs, unsigned long work)
+{
+ bool step;
+
+ /*
+ * If the syscall was rolled back due to syscall user dispatching,
+ * then the tracers below are not invoked for the same reason as
+ * the entry side was not invoked in syscall_trace_enter(): The ABI
+ * of these syscalls is unknown.
+ */
+ if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+ if (unlikely(current->syscall_dispatch.on_dispatch)) {
+ current->syscall_dispatch.on_dispatch = false;
+ return;
+ }
+ }
+
+ audit_syscall_exit(regs);
+
+ if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
+ trace_syscall_exit(regs, syscall_get_return_value(current, regs));
+
+ step = report_single_step(work);
+ if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
+ arch_ptrace_report_syscall_exit(regs, step);
+}
/**
- * syscall_exit_to_user_mode_work - Handle work before returning to user mode
+ * syscall_exit_to_user_mode_work - Handle one time work before returning to user mode
* @regs: Pointer to currents pt_regs
*
- * Same as step 1 and 2 of syscall_exit_to_user_mode() but without calling
- * exit_to_user_mode() to perform the final transition to user mode.
+ * Step 1 of syscall_exit_to_user_mode() with the same calling convention.
*
- * Calling convention is the same as for syscall_exit_to_user_mode() and it
- * returns with all work handled and interrupts disabled. The caller must
- * invoke exit_to_user_mode() before actually switching to user mode to
- * make the final state transitions. Interrupts must stay disabled between
- * return from this function and the invocation of exit_to_user_mode().
+ * The caller must invoke steps 2-3 of syscall_exit_to_user_mode() afterwards.
*/
static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
@@ -155,15 +284,13 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
*/
if (unlikely(work & SYSCALL_WORK_EXIT))
syscall_exit_work(regs, work);
- local_irq_disable_exit_to_user();
- syscall_exit_to_user_mode_prepare(regs);
}
/**
* syscall_exit_to_user_mode - Handle work before returning to user mode
* @regs: Pointer to currents pt_regs
*
- * Invoked with interrupts enabled and fully valid regs. Returns with all
+ * Invoked with interrupts enabled and fully valid @regs. Returns with all
* work handled, interrupts disabled such that the caller can immediately
* switch to user mode. Called from architecture specific syscall and ret
* from fork code.
@@ -176,6 +303,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
* - ptrace (single stepping)
*
* 2) Preparatory work
+ * - Disable interrupts
* - Exit to user mode loop (common TIF handling). Invokes
* arch_exit_to_user_mode_work() for architecture specific TIF work
* - Architecture specific one time work arch_exit_to_user_mode_prepare()
@@ -184,14 +312,17 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
* 3) Final transition (lockdep, tracing, context tracking, RCU), i.e. the
* functionality in exit_to_user_mode().
*
- * This is a combination of syscall_exit_to_user_mode_work() (1,2) and
- * exit_to_user_mode(). This function is preferred unless there is a
- * compelling architectural reason to use the separate functions.
+ * This is a combination of syscall_exit_to_user_mode_work() (1), disabling
+ * interrupts followed by syscall_exit_to_user_mode_prepare() (2) and
+ * exit_to_user_mode() (3). This function is preferred unless there is a
+ * compelling architectural reason to invoke the functions separately.
*/
static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs)
{
instrumentation_begin();
syscall_exit_to_user_mode_work(regs);
+ local_irq_disable_exit_to_user();
+ syscall_exit_to_user_mode_prepare(regs);
instrumentation_end();
exit_to_user_mode();
}
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index 2266f4dc77b6..7a01a0760405 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -163,4 +163,15 @@ void rseq_syscall(struct pt_regs *regs);
static inline void rseq_syscall(struct pt_regs *regs) { }
#endif /* !CONFIG_DEBUG_RSEQ */
+#ifdef CONFIG_RSEQ_SLICE_EXTENSION
+void rseq_syscall_enter_work(long syscall);
+int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3);
+#else /* CONFIG_RSEQ_SLICE_EXTENSION */
+static inline void rseq_syscall_enter_work(long syscall) { }
+static inline int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
+{
+ return -ENOTSUPP;
+}
+#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
+
#endif /* _LINUX_RSEQ_H */
diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index a36b472627de..cbc4a791618b 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -15,6 +15,11 @@ struct rseq_stats {
unsigned long cs;
unsigned long clear;
unsigned long fixup;
+ unsigned long s_granted;
+ unsigned long s_expired;
+ unsigned long s_revoked;
+ unsigned long s_yielded;
+ unsigned long s_aborted;
};
DECLARE_PER_CPU(struct rseq_stats, rseq_stats);
@@ -37,6 +42,7 @@ DECLARE_PER_CPU(struct rseq_stats, rseq_stats);
#ifdef CONFIG_RSEQ
#include <linux/jump_label.h>
#include <linux/rseq.h>
+#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/tracepoint-defs.h>
@@ -75,6 +81,147 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
#define rseq_inline __always_inline
#endif
+#ifdef CONFIG_RSEQ_SLICE_EXTENSION
+DECLARE_STATIC_KEY_TRUE(rseq_slice_extension_key);
+
+static __always_inline bool rseq_slice_extension_enabled(void)
+{
+ return static_branch_likely(&rseq_slice_extension_key);
+}
+
+extern unsigned int rseq_slice_ext_nsecs;
+bool __rseq_arm_slice_extension_timer(void);
+
+static __always_inline bool rseq_arm_slice_extension_timer(void)
+{
+ if (!rseq_slice_extension_enabled())
+ return false;
+
+ if (likely(!current->rseq.slice.state.granted))
+ return false;
+
+ return __rseq_arm_slice_extension_timer();
+}
+
+static __always_inline void rseq_slice_clear_grant(struct task_struct *t)
+{
+ if (IS_ENABLED(CONFIG_RSEQ_STATS) && t->rseq.slice.state.granted)
+ rseq_stat_inc(rseq_stats.s_revoked);
+ t->rseq.slice.state.granted = false;
+}
+
+static __always_inline bool rseq_grant_slice_extension(bool work_pending)
+{
+ struct task_struct *curr = current;
+ struct rseq_slice_ctrl usr_ctrl;
+ union rseq_slice_state state;
+ struct rseq __user *rseq;
+
+ if (!rseq_slice_extension_enabled())
+ return false;
+
+ /* If not enabled or not a return from interrupt, nothing to do. */
+ state = curr->rseq.slice.state;
+ state.enabled &= curr->rseq.event.user_irq;
+ if (likely(!state.state))
+ return false;
+
+ rseq = curr->rseq.usrptr;
+ scoped_user_rw_access(rseq, efault) {
+
+ /*
+ * Quick check conditions where a grant is not possible or
+ * needs to be revoked.
+ *
+ * 1) Any TIF bit which needs to do extra work aside of
+ * rescheduling prevents a grant.
+ *
+ * 2) A previous rescheduling request resulted in a slice
+ * extension grant.
+ */
+ if (unlikely(work_pending || state.granted)) {
+ /* Clear user control unconditionally. No point for checking */
+ unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
+ rseq_slice_clear_grant(curr);
+ return false;
+ }
+
+ unsafe_get_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
+ if (likely(!(usr_ctrl.request)))
+ return false;
+
+ /* Grant the slice extention */
+ usr_ctrl.request = 0;
+ usr_ctrl.granted = 1;
+ unsafe_put_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
+ }
+
+ rseq_stat_inc(rseq_stats.s_granted);
+
+ curr->rseq.slice.state.granted = true;
+ /* Store expiry time for arming the timer on the way out */
+ curr->rseq.slice.expires = data_race(rseq_slice_ext_nsecs) + ktime_get_mono_fast_ns();
+ /*
+ * This is racy against a remote CPU setting TIF_NEED_RESCHED in
+ * several ways:
+ *
+ * 1)
+ * CPU0 CPU1
+ * clear_tsk()
+ * set_tsk()
+ * clear_preempt()
+ * Raise scheduler IPI on CPU0
+ * --> IPI
+ * fold_need_resched() -> Folds correctly
+ * 2)
+ * CPU0 CPU1
+ * set_tsk()
+ * clear_tsk()
+ * clear_preempt()
+ * Raise scheduler IPI on CPU0
+ * --> IPI
+ * fold_need_resched() <- NOOP as TIF_NEED_RESCHED is false
+ *
+ * #1 is not any different from a regular remote reschedule as it
+ * sets the previously not set bit and then raises the IPI which
+ * folds it into the preempt counter
+ *
+ * #2 is obviously incorrect from a scheduler POV, but it's not
+ * differently incorrect than the code below clearing the
+ * reschedule request with the safety net of the timer.
+ *
+ * The important part is that the clearing is protected against the
+ * scheduler IPI and also against any other interrupt which might
+ * end up waking up a task and setting the bits in the middle of
+ * the operation:
+ *
+ * clear_tsk()
+ * ---> Interrupt
+ * wakeup_on_this_cpu()
+ * set_tsk()
+ * set_preempt()
+ * clear_preempt()
+ *
+ * which would be inconsistent state.
+ */
+ scoped_guard(irq) {
+ clear_tsk_need_resched(curr);
+ clear_preempt_need_resched();
+ }
+ return true;
+
+efault:
+ force_sig(SIGSEGV);
+ return false;
+}
+
+#else /* CONFIG_RSEQ_SLICE_EXTENSION */
+static inline bool rseq_slice_extension_enabled(void) { return false; }
+static inline bool rseq_arm_slice_extension_timer(void) { return false; }
+static inline void rseq_slice_clear_grant(struct task_struct *t) { }
+static inline bool rseq_grant_slice_extension(bool work_pending) { return false; }
+#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
+
bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
bool rseq_debug_validate_ids(struct task_struct *t);
@@ -359,8 +506,15 @@ bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
if (csaddr)
unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);
+
+ /* Open coded, so it's in the same user access region */
+ if (rseq_slice_extension_enabled()) {
+ /* Unconditionally clear it, no point in conditionals */
+ unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
+ }
}
+ rseq_slice_clear_grant(t);
/* Cache the new values */
t->rseq.ids.cpu_cid = ids->cpu_cid;
rseq_stat_inc(rseq_stats.ids);
@@ -456,8 +610,17 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t
*/
u64 csaddr;
- if (unlikely(get_user_inline(csaddr, &rseq->rseq_cs)))
- return false;
+ scoped_user_rw_access(rseq, efault) {
+ unsafe_get_user(csaddr, &rseq->rseq_cs, efault);
+
+ /* Open coded, so it's in the same user access region */
+ if (rseq_slice_extension_enabled()) {
+ /* Unconditionally clear it, no point in conditionals */
+ unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
+ }
+ }
+
+ rseq_slice_clear_grant(t);
if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) {
if (unlikely(!rseq_update_user_cs(t, regs, csaddr)))
@@ -473,6 +636,8 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t
u32 node_id = cpu_to_node(ids.cpu_id);
return rseq_update_usr(t, regs, &ids, node_id);
+efault:
+ return false;
}
static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs)
@@ -527,17 +692,19 @@ static __always_inline void clear_tif_rseq(void) { }
static __always_inline bool
rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
- if (likely(!test_tif_rseq(ti_work)))
- return false;
-
- if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
- current->rseq.event.slowpath = true;
- set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
- return true;
+ if (unlikely(test_tif_rseq(ti_work))) {
+ if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
+ current->rseq.event.slowpath = true;
+ set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+ return true;
+ }
+ clear_tif_rseq();
}
-
- clear_tif_rseq();
- return false;
+ /*
+ * Arm the slice extension timer if nothing to do anymore and the
+ * task really goes out to user space.
+ */
+ return rseq_arm_slice_extension_timer();
}
#else /* CONFIG_GENERIC_ENTRY */
@@ -611,6 +778,7 @@ static inline void rseq_syscall_exit_to_user_mode(void) { }
static inline void rseq_irqentry_exit_to_user_mode(void) { }
static inline void rseq_exit_to_user_mode_legacy(void) { }
static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
+static inline bool rseq_grant_slice_extension(bool work_pending) { return false; }
#endif /* !CONFIG_RSEQ */
#endif /* _LINUX_RSEQ_ENTRY_H */
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index ef0811379c54..da5fa6f40294 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -73,12 +73,39 @@ struct rseq_ids {
};
/**
+ * union rseq_slice_state - Status information for rseq time slice extension
+ * @state: Compound to access the overall state
+ * @enabled: Time slice extension is enabled for the task
+ * @granted: Time slice extension was granted to the task
+ */
+union rseq_slice_state {
+ u16 state;
+ struct {
+ u8 enabled;
+ u8 granted;
+ };
+};
+
+/**
+ * struct rseq_slice - Status information for rseq time slice extension
+ * @state: Time slice extension state
+ * @expires: The time when a grant expires
+ * @yielded: Indicator for rseq_slice_yield()
+ */
+struct rseq_slice {
+ union rseq_slice_state state;
+ u64 expires;
+ u8 yielded;
+};
+
+/**
* struct rseq_data - Storage for all rseq related data
* @usrptr: Pointer to the registered user space RSEQ memory
* @len: Length of the RSEQ region
- * @sig: Signature of critial section abort IPs
+ * @sig: Signature of critical section abort IPs
* @event: Storage for event management
* @ids: Storage for cached CPU ID and MM CID
+ * @slice: Storage for time slice extension data
*/
struct rseq_data {
struct rseq __user *usrptr;
@@ -86,6 +113,9 @@ struct rseq_data {
u32 sig;
struct rseq_event event;
struct rseq_ids ids;
+#ifdef CONFIG_RSEQ_SLICE_EXTENSION
+ struct rseq_slice slice;
+#endif
};
#else /* CONFIG_RSEQ */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8faf653803a1..36ae08ca0c62 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -586,15 +586,10 @@ struct sched_entity {
u64 sum_exec_runtime;
u64 prev_sum_exec_runtime;
u64 vruntime;
- union {
- /*
- * When !@on_rq this field is vlag.
- * When cfs_rq->curr == se (which implies @on_rq)
- * this field is vprot. See protect_slice().
- */
- s64 vlag;
- u64 vprot;
- };
+ /* Approximated virtual lag: */
+ s64 vlag;
+ /* 'Protected' deadline, to give out minimum quantums: */
+ u64 vprot;
u64 slice;
u64 nr_migrations;
@@ -956,7 +951,6 @@ struct task_struct {
struct mm_struct *mm;
struct mm_struct *active_mm;
- struct address_space *faults_disabled_mapping;
int exit_state;
int exit_code;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index cf84d98964b2..6c8a570cf44a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -961,6 +961,7 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
unsigned mask, struct statx __user *buffer);
asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
int flags, uint32_t sig);
+asmlinkage long sys_rseq_slice_yield(void);
asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags);
asmlinkage long sys_open_tree_attr(int dfd, const char __user *path,
unsigned flags,
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index b40de9bab4b7..051e42902690 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -46,15 +46,17 @@ enum syscall_work_bit {
SYSCALL_WORK_BIT_SYSCALL_AUDIT,
SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH,
SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP,
+ SYSCALL_WORK_BIT_SYSCALL_RSEQ_SLICE,
};
-#define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP)
-#define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT)
-#define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE)
-#define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
-#define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
-#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
-#define SYSCALL_WORK_SYSCALL_EXIT_TRAP BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP)
+#define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP)
+#define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT)
+#define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE)
+#define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
+#define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
+#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
+#define SYSCALL_WORK_SYSCALL_EXIT_TRAP BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP)
+#define SYSCALL_WORK_SYSCALL_RSEQ_SLICE BIT(SYSCALL_WORK_BIT_SYSCALL_RSEQ_SLICE)
#endif
#include <asm/thread_info.h>
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 942370b3f5d2..a627acc8fb5f 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -860,8 +860,11 @@ __SYSCALL(__NR_file_setattr, sys_file_setattr)
#define __NR_listns 470
__SYSCALL(__NR_listns, sys_listns)
+#define __NR_rseq_slice_yield 471
+__SYSCALL(__NR_rseq_slice_yield, sys_rseq_slice_yield)
+
#undef __NR_syscalls
-#define __NR_syscalls 471
+#define __NR_syscalls 472
/*
* 32 bit systems traditionally used different
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 51c4e8c82b1e..79944b7ae50a 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -386,4 +386,14 @@ struct prctl_mm_map {
# define PR_FUTEX_HASH_SET_SLOTS 1
# define PR_FUTEX_HASH_GET_SLOTS 2
+/* RSEQ time slice extensions */
+#define PR_RSEQ_SLICE_EXTENSION 79
+# define PR_RSEQ_SLICE_EXTENSION_GET 1
+# define PR_RSEQ_SLICE_EXTENSION_SET 2
+/*
+ * Bits for RSEQ_SLICE_EXTENSION_GET/SET
+ * PR_RSEQ_SLICE_EXT_ENABLE: Enable
+ */
+# define PR_RSEQ_SLICE_EXT_ENABLE 0x01
+
#endif /* _LINUX_PRCTL_H */
diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index 1b76d508400c..863c4a00a66b 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -19,13 +19,20 @@ enum rseq_cpu_id_state {
};
enum rseq_flags {
- RSEQ_FLAG_UNREGISTER = (1 << 0),
+ RSEQ_FLAG_UNREGISTER = (1 << 0),
+ RSEQ_FLAG_SLICE_EXT_DEFAULT_ON = (1 << 1),
};
enum rseq_cs_flags_bit {
+ /* Historical and unsupported bits */
RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0,
RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1,
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2,
+ /* (3) Intentional gap to put new bits into a separate byte */
+
+ /* User read only feature flags */
+ RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE_BIT = 4,
+ RSEQ_CS_FLAG_SLICE_EXT_ENABLED_BIT = 5,
};
enum rseq_cs_flags {
@@ -35,6 +42,11 @@ enum rseq_cs_flags {
(1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT),
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE =
(1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
+
+ RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE =
+ (1U << RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE_BIT),
+ RSEQ_CS_FLAG_SLICE_EXT_ENABLED =
+ (1U << RSEQ_CS_FLAG_SLICE_EXT_ENABLED_BIT),
};
/*
@@ -53,6 +65,27 @@ struct rseq_cs {
__u64 abort_ip;
} __attribute__((aligned(4 * sizeof(__u64))));
+/**
+ * rseq_slice_ctrl - Time slice extension control structure
+ * @all: Compound value
+ * @request: Request for a time slice extension
+ * @granted: Granted time slice extension
+ *
+ * @request is set by user space and can be cleared by user space or kernel
+ * space. @granted is set and cleared by the kernel and must only be read
+ * by user space.
+ */
+struct rseq_slice_ctrl {
+ union {
+ __u32 all;
+ struct {
+ __u8 request;
+ __u8 granted;
+ __u16 __reserved;
+ };
+ };
+};
+
/*
* struct rseq is aligned on 4 * 8 bytes to ensure it is always
* contained within a single cache-line.
@@ -142,6 +175,12 @@ struct rseq {
__u32 mm_cid;
/*
+ * Time slice extension control structure. CPU local updates from
+ * kernel and user space.
+ */
+ struct rseq_slice_ctrl slice_ctrl;
+
+ /*
* Flexible array member at end of structure, after last feature field.
*/
char end[];
diff --git a/init/Kconfig b/init/Kconfig
index 66f02480368b..04008331bff3 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1949,6 +1949,18 @@ config RSEQ
If unsure, say Y.
+config RSEQ_SLICE_EXTENSION
+ bool "Enable rseq-based time slice extension mechanism"
+ depends on RSEQ && HIGH_RES_TIMERS && GENERIC_ENTRY && HAVE_GENERIC_TIF_BITS
+ help
+ Allows userspace to request a limited time slice extension when
+ returning from an interrupt to user space via the RSEQ shared
+ data ABI. If granted, that allows to complete a critical section,
+ so that other threads are not stuck on a conflicted resource,
+ while the task is scheduled out.
+
+ If unsure, say N.
+
config RSEQ_STATS
default n
bool "Enable lightweight statistics of restartable sequences" if EXPERT
diff --git a/init/init_task.c b/init/init_task.c
index db92c404d59a..5c838757fc10 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -113,7 +113,6 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
.nr_cpus_allowed= NR_CPUS,
.mm = NULL,
.active_mm = &init_mm,
- .faults_disabled_mapping = NULL,
.restart_block = {
.fn = do_no_restart_syscall,
},
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index da326800c1c9..88c594c6d7fc 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -16,11 +16,13 @@ config ARCH_HAS_PREEMPT_LAZY
choice
prompt "Preemption Model"
+ default PREEMPT_LAZY if ARCH_HAS_PREEMPT_LAZY
default PREEMPT_NONE
config PREEMPT_NONE
bool "No Forced Preemption (Server)"
depends on !PREEMPT_RT
+ depends on ARCH_NO_PREEMPT
select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC
help
This is the traditional Linux preemption model, geared towards
@@ -35,6 +37,7 @@ config PREEMPT_NONE
config PREEMPT_VOLUNTARY
bool "Voluntary Kernel Preemption (Desktop)"
+ depends on !ARCH_HAS_PREEMPT_LAZY
depends on !ARCH_NO_PREEMPT
depends on !PREEMPT_RT
select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 5c792b30c58a..9ef63e414791 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -17,6 +17,27 @@ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK)
#endif
+/* TIF bits, which prevent a time slice extension. */
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * Since rseq slice ext has a direct correlation to the worst case
+ * scheduling latency (schedule is delayed after all), only have it affect
+ * LAZY reschedules on PREEMPT_RT for now.
+ *
+ * However, since this delay is only applicable to userspace, a value
+ * for rseq_slice_extension_nsec that is strictly less than the worst case
+ * kernel space preempt_disable() region, should mean the scheduling latency
+ * is not affected, even for !LAZY.
+ *
+ * However, since this value depends on the hardware at hand, it cannot be
+ * pre-determined in any sensible way. Hence punt on this problem for now.
+ */
+# define TIF_SLICE_EXT_SCHED (_TIF_NEED_RESCHED_LAZY)
+#else
+# define TIF_SLICE_EXT_SCHED (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
+#endif
+#define TIF_SLICE_EXT_DENY (EXIT_TO_USER_MODE_WORK & ~TIF_SLICE_EXT_SCHED)
+
static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs,
unsigned long ti_work)
{
@@ -28,8 +49,10 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re
local_irq_enable_exit_to_user(ti_work);
- if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
- schedule();
+ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
+ if (!rseq_grant_slice_extension(ti_work & TIF_SLICE_EXT_DENY))
+ schedule();
+ }
if (ti_work & _TIF_UPROBE)
uprobe_notify_resume(regs);
diff --git a/kernel/entry/common.h b/kernel/entry/common.h
deleted file mode 100644
index f6e6d02f07fe..000000000000
--- a/kernel/entry/common.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _COMMON_H
-#define _COMMON_H
-
-bool syscall_user_dispatch(struct pt_regs *regs);
-
-#endif
diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c
index 940a597ded40..cd4967a9c53e 100644
--- a/kernel/entry/syscall-common.c
+++ b/kernel/entry/syscall-common.c
@@ -1,104 +1,23 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/audit.h>
#include <linux/entry-common.h>
-#include "common.h"
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
-static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
-{
- if (unlikely(audit_context())) {
- unsigned long args[6];
-
- syscall_get_arguments(current, regs, args);
- audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
- }
-}
+/* Out of line to prevent tracepoint code duplication */
-long syscall_trace_enter(struct pt_regs *regs, long syscall,
- unsigned long work)
+long trace_syscall_enter(struct pt_regs *regs, long syscall)
{
- long ret = 0;
-
+ trace_sys_enter(regs, syscall);
/*
- * Handle Syscall User Dispatch. This must comes first, since
- * the ABI here can be something that doesn't make sense for
- * other syscall_work features.
+ * Probes or BPF hooks in the tracepoint may have changed the
+ * system call number. Reread it.
*/
- if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
- if (syscall_user_dispatch(regs))
- return -1L;
- }
-
- /* Handle ptrace */
- if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
- ret = ptrace_report_syscall_entry(regs);
- if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
- return -1L;
- }
-
- /* Do seccomp after ptrace, to catch any tracer changes. */
- if (work & SYSCALL_WORK_SECCOMP) {
- ret = __secure_computing();
- if (ret == -1L)
- return ret;
- }
-
- /* Either of the above might have changed the syscall number */
- syscall = syscall_get_nr(current, regs);
-
- if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) {
- trace_sys_enter(regs, syscall);
- /*
- * Probes or BPF hooks in the tracepoint may have changed the
- * system call number as well.
- */
- syscall = syscall_get_nr(current, regs);
- }
-
- syscall_enter_audit(regs, syscall);
-
- return ret ? : syscall;
+ return syscall_get_nr(current, regs);
}
-/*
- * If SYSCALL_EMU is set, then the only reason to report is when
- * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
- * instruction has been already reported in syscall_enter_from_user_mode().
- */
-static inline bool report_single_step(unsigned long work)
+void trace_syscall_exit(struct pt_regs *regs, long ret)
{
- if (work & SYSCALL_WORK_SYSCALL_EMU)
- return false;
-
- return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
-}
-
-void syscall_exit_work(struct pt_regs *regs, unsigned long work)
-{
- bool step;
-
- /*
- * If the syscall was rolled back due to syscall user dispatching,
- * then the tracers below are not invoked for the same reason as
- * the entry side was not invoked in syscall_trace_enter(): The ABI
- * of these syscalls is unknown.
- */
- if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
- if (unlikely(current->syscall_dispatch.on_dispatch)) {
- current->syscall_dispatch.on_dispatch = false;
- return;
- }
- }
-
- audit_syscall_exit(regs);
-
- if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
- trace_sys_exit(regs, syscall_get_return_value(current, regs));
-
- step = report_single_step(work);
- if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
- ptrace_report_syscall_exit(regs, step);
+ trace_sys_exit(regs, ret);
}
diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c
index a9055eccb27e..d89dffcc2d64 100644
--- a/kernel/entry/syscall_user_dispatch.c
+++ b/kernel/entry/syscall_user_dispatch.c
@@ -2,6 +2,8 @@
/*
* Copyright (C) 2020 Collabora Ltd.
*/
+
+#include <linux/entry-common.h>
#include <linux/sched.h>
#include <linux/prctl.h>
#include <linux/ptrace.h>
@@ -15,8 +17,6 @@
#include <asm/syscall.h>
-#include "common.h"
-
static void trigger_sigsys(struct pt_regs *regs)
{
struct kernel_siginfo info;
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 395d8b002350..b0973d19f366 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -71,6 +71,9 @@
#define RSEQ_BUILD_SLOW_PATH
#include <linux/debugfs.h>
+#include <linux/hrtimer.h>
+#include <linux/percpu.h>
+#include <linux/prctl.h>
#include <linux/ratelimit.h>
#include <linux/rseq_entry.h>
#include <linux/sched.h>
@@ -120,7 +123,6 @@ void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
}
#endif /* CONFIG_TRACEPOINTS */
-#ifdef CONFIG_DEBUG_FS
#ifdef CONFIG_RSEQ_STATS
DEFINE_PER_CPU(struct rseq_stats, rseq_stats);
@@ -138,6 +140,13 @@ static int rseq_stats_show(struct seq_file *m, void *p)
stats.cs += data_race(per_cpu(rseq_stats.cs, cpu));
stats.clear += data_race(per_cpu(rseq_stats.clear, cpu));
stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu));
+ if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
+ stats.s_granted += data_race(per_cpu(rseq_stats.s_granted, cpu));
+ stats.s_expired += data_race(per_cpu(rseq_stats.s_expired, cpu));
+ stats.s_revoked += data_race(per_cpu(rseq_stats.s_revoked, cpu));
+ stats.s_yielded += data_race(per_cpu(rseq_stats.s_yielded, cpu));
+ stats.s_aborted += data_race(per_cpu(rseq_stats.s_aborted, cpu));
+ }
}
seq_printf(m, "exit: %16lu\n", stats.exit);
@@ -148,6 +157,13 @@ static int rseq_stats_show(struct seq_file *m, void *p)
seq_printf(m, "cs: %16lu\n", stats.cs);
seq_printf(m, "clear: %16lu\n", stats.clear);
seq_printf(m, "fixup: %16lu\n", stats.fixup);
+ if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
+ seq_printf(m, "sgrant: %16lu\n", stats.s_granted);
+ seq_printf(m, "sexpir: %16lu\n", stats.s_expired);
+ seq_printf(m, "srevok: %16lu\n", stats.s_revoked);
+ seq_printf(m, "syield: %16lu\n", stats.s_yielded);
+ seq_printf(m, "sabort: %16lu\n", stats.s_aborted);
+ }
return 0;
}
@@ -205,16 +221,19 @@ static const struct file_operations debug_ops = {
.release = single_release,
};
+static void rseq_slice_ext_init(struct dentry *root_dir);
+
static int __init rseq_debugfs_init(void)
{
struct dentry *root_dir = debugfs_create_dir("rseq", NULL);
debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops);
rseq_stats_init(root_dir);
+ if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION))
+ rseq_slice_ext_init(root_dir);
return 0;
}
__initcall(rseq_debugfs_init);
-#endif /* CONFIG_DEBUG_FS */
static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id)
{
@@ -389,6 +408,8 @@ static bool rseq_reset_ids(void)
*/
SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
{
+ u32 rseqfl = 0;
+
if (flags & RSEQ_FLAG_UNREGISTER) {
if (flags & ~RSEQ_FLAG_UNREGISTER)
return -EINVAL;
@@ -405,7 +426,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
return 0;
}
- if (unlikely(flags))
+ if (unlikely(flags & ~(RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)))
return -EINVAL;
if (current->rseq.usrptr) {
@@ -440,6 +461,13 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
if (!access_ok(rseq, rseq_len))
return -EFAULT;
+ if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) {
+ rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
+ if (rseq_slice_extension_enabled() &&
+ (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON))
+ rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
+ }
+
scoped_user_write_access(rseq, efault) {
/*
* If the rseq_cs pointer is non-NULL on registration, clear it to
@@ -449,11 +477,13 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
* clearing the fields. Don't bother reading it, just reset it.
*/
unsafe_put_user(0UL, &rseq->rseq_cs, efault);
+ unsafe_put_user(rseqfl, &rseq->flags, efault);
/* Initialize IDs in user space */
unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault);
unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
unsafe_put_user(0U, &rseq->node_id, efault);
unsafe_put_user(0U, &rseq->mm_cid, efault);
+ unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
}
/*
@@ -464,6 +494,10 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
current->rseq.len = rseq_len;
current->rseq.sig = sig;
+#ifdef CONFIG_RSEQ_SLICE_EXTENSION
+ current->rseq.slice.state.enabled = !!(rseqfl & RSEQ_CS_FLAG_SLICE_EXT_ENABLED);
+#endif
+
/*
* If rseq was previously inactive, and has just been
* registered, ensure the cpu_id_start and cpu_id fields
@@ -476,3 +510,328 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
efault:
return -EFAULT;
}
+
+#ifdef CONFIG_RSEQ_SLICE_EXTENSION
+struct slice_timer {
+ struct hrtimer timer;
+ void *cookie;
+};
+
+static const unsigned int rseq_slice_ext_nsecs_min = 5 * NSEC_PER_USEC;
+static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC;
+unsigned int rseq_slice_ext_nsecs __read_mostly = rseq_slice_ext_nsecs_min;
+static DEFINE_PER_CPU(struct slice_timer, slice_timer);
+DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key);
+
+/*
+ * When the timer expires and the task is still in user space, the return
+ * from interrupt will revoke the grant and schedule. If the task already
+ * entered the kernel via a syscall and the timer fires before the syscall
+ * work was able to cancel it, then depending on the preemption model this
+ * will either reschedule on return from interrupt or in the syscall work
+ * below.
+ */
+static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr)
+{
+ struct slice_timer *st = container_of(tmr, struct slice_timer, timer);
+
+ /*
+ * Validate that the task which armed the timer is still on the
+ * CPU. It could have been scheduled out without canceling the
+ * timer.
+ */
+ if (st->cookie == current && current->rseq.slice.state.granted) {
+ rseq_stat_inc(rseq_stats.s_expired);
+ set_need_resched_current();
+ }
+ return HRTIMER_NORESTART;
+}
+
+bool __rseq_arm_slice_extension_timer(void)
+{
+ struct slice_timer *st = this_cpu_ptr(&slice_timer);
+ struct task_struct *curr = current;
+
+ lockdep_assert_irqs_disabled();
+
+ /*
+ * This check prevents a task, which got a time slice extension
+ * granted, from exceeding the maximum scheduling latency when the
+ * grant expired before going out to user space. Don't bother to
+ * clear the grant here, it will be cleaned up automatically before
+ * going out to user space after being scheduled back in.
+ */
+ if ((unlikely(curr->rseq.slice.expires < ktime_get_mono_fast_ns()))) {
+ set_need_resched_current();
+ return true;
+ }
+
+ /*
+ * Store the task pointer as a cookie for comparison in the timer
+ * function. This is safe as the timer is CPU local and cannot be
+ * in the expiry function at this point.
+ */
+ st->cookie = curr;
+ hrtimer_start(&st->timer, curr->rseq.slice.expires, HRTIMER_MODE_ABS_PINNED_HARD);
+ /* Arm the syscall entry work */
+ set_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);
+ return false;
+}
+
+static void rseq_cancel_slice_extension_timer(void)
+{
+ struct slice_timer *st = this_cpu_ptr(&slice_timer);
+
+ /*
+ * st->cookie can be safely read as preemption is disabled and the
+ * timer is CPU local.
+ *
+ * As this is most probably the first expiring timer, the cancel is
+ * expensive as it has to reprogram the hardware, but that's less
+ * expensive than going through a full hrtimer_interrupt() cycle
+ * for nothing.
+ *
+ * hrtimer_try_to_cancel() is sufficient here as the timer is CPU
+ * local and once the hrtimer code disabled interrupts the timer
+ * callback cannot be running.
+ */
+ if (st->cookie == current)
+ hrtimer_try_to_cancel(&st->timer);
+}
+
+static inline void rseq_slice_set_need_resched(struct task_struct *curr)
+{
+ /*
+ * The interrupt guard is required to prevent inconsistent state in
+ * this case:
+ *
+ * set_tsk_need_resched()
+ * --> Interrupt
+ * wakeup()
+ * set_tsk_need_resched()
+ * set_preempt_need_resched()
+ * schedule_on_return()
+ * clear_tsk_need_resched()
+ * clear_preempt_need_resched()
+ * set_preempt_need_resched() <- Inconsistent state
+ *
+ * This is safe vs. a remote set of TIF_NEED_RESCHED because that
+ * only sets the already set bit and does not create inconsistent
+ * state.
+ */
+ scoped_guard(irq)
+ set_need_resched_current();
+}
+
+static void rseq_slice_validate_ctrl(u32 expected)
+{
+ u32 __user *sctrl = &current->rseq.usrptr->slice_ctrl.all;
+ u32 uval;
+
+ if (get_user(uval, sctrl) || uval != expected)
+ force_sig(SIGSEGV);
+}
+
+/*
+ * Invoked from syscall entry if a time slice extension was granted and the
+ * kernel did not clear it before user space left the critical section.
+ *
+ * While the recommended way to relinquish the CPU side effect free is
+ * rseq_slice_yield(2), any syscall within a granted slice terminates the
+ * grant and immediately reschedules if required. This supports onion layer
+ * applications, where the code requesting the grant cannot control the
+ * code within the critical section.
+ */
+void rseq_syscall_enter_work(long syscall)
+{
+ struct task_struct *curr = current;
+ struct rseq_slice_ctrl ctrl = { .granted = curr->rseq.slice.state.granted };
+
+ clear_task_syscall_work(curr, SYSCALL_RSEQ_SLICE);
+
+ if (static_branch_unlikely(&rseq_debug_enabled))
+ rseq_slice_validate_ctrl(ctrl.all);
+
+ /*
+ * The kernel might have raced, revoked the grant and updated
+ * userspace, but kept the SLICE work set.
+ */
+ if (!ctrl.granted)
+ return;
+
+ /*
+ * Required to stabilize the per CPU timer pointer and to make
+ * set_tsk_need_resched() correct on PREEMPT[RT] kernels.
+ *
+ * Leaving the scope will reschedule on preemption models FULL,
+ * LAZY and RT if necessary.
+ */
+ scoped_guard(preempt) {
+ rseq_cancel_slice_extension_timer();
+ /*
+ * Now that preemption is disabled, quickly check whether
+ * the task was already rescheduled before arriving here.
+ */
+ if (!curr->rseq.event.sched_switch) {
+ rseq_slice_set_need_resched(curr);
+
+ if (syscall == __NR_rseq_slice_yield) {
+ rseq_stat_inc(rseq_stats.s_yielded);
+ /* Update the yielded state for syscall return */
+ curr->rseq.slice.yielded = 1;
+ } else {
+ rseq_stat_inc(rseq_stats.s_aborted);
+ }
+ }
+ }
+ /* Reschedule on NONE/VOLUNTARY preemption models */
+ cond_resched();
+
+ /* Clear the grant in kernel state and user space */
+ curr->rseq.slice.state.granted = false;
+ if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all))
+ force_sig(SIGSEGV);
+}
+
+int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
+{
+ switch (arg2) {
+ case PR_RSEQ_SLICE_EXTENSION_GET:
+ if (arg3)
+ return -EINVAL;
+ return current->rseq.slice.state.enabled ? PR_RSEQ_SLICE_EXT_ENABLE : 0;
+
+ case PR_RSEQ_SLICE_EXTENSION_SET: {
+ u32 rflags, valid = RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
+ bool enable = !!(arg3 & PR_RSEQ_SLICE_EXT_ENABLE);
+
+ if (arg3 & ~PR_RSEQ_SLICE_EXT_ENABLE)
+ return -EINVAL;
+ if (!rseq_slice_extension_enabled())
+ return -ENOTSUPP;
+ if (!current->rseq.usrptr)
+ return -ENXIO;
+
+ /* No change? */
+ if (enable == !!current->rseq.slice.state.enabled)
+ return 0;
+
+ if (get_user(rflags, &current->rseq.usrptr->flags))
+ goto die;
+
+ if (current->rseq.slice.state.enabled)
+ valid |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
+
+ if ((rflags & valid) != valid)
+ goto die;
+
+ rflags &= ~RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
+ rflags |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE;
+ if (enable)
+ rflags |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED;
+
+ if (put_user(rflags, &current->rseq.usrptr->flags))
+ goto die;
+
+ current->rseq.slice.state.enabled = enable;
+ return 0;
+ }
+ default:
+ return -EINVAL;
+ }
+die:
+ force_sig(SIGSEGV);
+ return -EFAULT;
+}
+
+/**
+ * sys_rseq_slice_yield - yield the current processor side effect free if a
+ * task granted with a time slice extension is done with
+ * the critical work before being forced out.
+ *
+ * Return: 1 if the task successfully yielded the CPU within the granted slice.
+ * 0 if the slice extension was either never granted or was revoked by
+ * going over the granted extension, using a syscall other than this one
+ * or being scheduled out earlier due to a subsequent interrupt.
+ *
+ * The syscall does not schedule because the syscall entry work immediately
+ * relinquishes the CPU and schedules if required.
+ */
+SYSCALL_DEFINE0(rseq_slice_yield)
+{
+ int yielded = !!current->rseq.slice.yielded;
+
+ current->rseq.slice.yielded = 0;
+ return yielded;
+}
+
+static int rseq_slice_ext_show(struct seq_file *m, void *p)
+{
+ seq_printf(m, "%d\n", rseq_slice_ext_nsecs);
+ return 0;
+}
+
+static ssize_t rseq_slice_ext_write(struct file *file, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ unsigned int nsecs;
+
+ if (kstrtouint_from_user(ubuf, count, 10, &nsecs))
+ return -EINVAL;
+
+ if (nsecs < rseq_slice_ext_nsecs_min)
+ return -ERANGE;
+
+ if (nsecs > rseq_slice_ext_nsecs_max)
+ return -ERANGE;
+
+ rseq_slice_ext_nsecs = nsecs;
+
+ return count;
+}
+
+static int rseq_slice_ext_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rseq_slice_ext_show, inode->i_private);
+}
+
+static const struct file_operations slice_ext_ops = {
+ .open = rseq_slice_ext_open,
+ .read = seq_read,
+ .write = rseq_slice_ext_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void rseq_slice_ext_init(struct dentry *root_dir)
+{
+ debugfs_create_file("slice_ext_nsec", 0644, root_dir, NULL, &slice_ext_ops);
+}
+
+static int __init rseq_slice_cmdline(char *str)
+{
+ bool on;
+
+ if (kstrtobool(str, &on))
+ return 0;
+
+ if (!on)
+ static_branch_disable(&rseq_slice_extension_key);
+ return 1;
+}
+__setup("rseq_slice_ext=", rseq_slice_cmdline);
+
+static int __init rseq_slice_init(void)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ hrtimer_setup(per_cpu_ptr(&slice_timer.timer, cpu), rseq_slice_expired,
+ CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD);
+ }
+ return 0;
+}
+device_initcall(rseq_slice_init);
+#else
+static void rseq_slice_ext_init(struct dentry *root_dir) { }
+#endif /* CONFIG_RSEQ_SLICE_EXTENSION */
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index f5e6dd6a6b3a..2ae4fbf13431 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -173,6 +173,7 @@ notrace static void __sched_clock_work(struct work_struct *work)
scd->tick_gtod, __gtod_offset,
scd->tick_raw, __sched_clock_offset);
+ disable_sched_clock_irqtime();
static_branch_disable(&__sched_clock_stable);
}
@@ -238,6 +239,8 @@ static int __init sched_clock_init_late(void)
if (__sched_clock_stable_early)
__set_sched_clock_stable();
+ else
+ disable_sched_clock_irqtime(); /* disable if clock unstable. */
return 0;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d78d7b6d30bb..23406f037dde 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -119,6 +119,9 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_entry_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_exit_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_set_need_resched_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
@@ -1141,6 +1144,7 @@ void __trace_set_need_resched(struct task_struct *curr, int tif)
{
trace_sched_set_need_resched_tp(curr, smp_processor_id(), tif);
}
+EXPORT_SYMBOL_GPL(__trace_set_need_resched);
void resched_curr(struct rq *rq)
{
@@ -2095,7 +2099,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
*/
uclamp_rq_inc(rq, p, flags);
- rq->queue_mask |= p->sched_class->queue_mask;
p->sched_class->enqueue_task(rq, p, flags);
psi_enqueue(p, flags);
@@ -2128,7 +2131,6 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
* and mark the task ->sched_delayed.
*/
uclamp_rq_dec(rq, p);
- rq->queue_mask |= p->sched_class->queue_mask;
return p->sched_class->dequeue_task(rq, p, flags);
}
@@ -2179,10 +2181,14 @@ void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *donor = rq->donor;
- if (p->sched_class == donor->sched_class)
- donor->sched_class->wakeup_preempt(rq, p, flags);
- else if (sched_class_above(p->sched_class, donor->sched_class))
+ if (p->sched_class == rq->next_class) {
+ rq->next_class->wakeup_preempt(rq, p, flags);
+
+ } else if (sched_class_above(p->sched_class, rq->next_class)) {
+ rq->next_class->wakeup_preempt(rq, p, flags);
resched_curr(rq);
+ rq->next_class = p->sched_class;
+ }
/*
* A queue event has occurred, and we're going to schedule. In
@@ -3620,6 +3626,18 @@ static inline void ttwu_do_wakeup(struct task_struct *p)
trace_sched_wakeup(p);
}
+void update_rq_avg_idle(struct rq *rq)
+{
+ u64 delta = rq_clock(rq) - rq->idle_stamp;
+ u64 max = 2*rq->max_idle_balance_cost;
+
+ update_avg(&rq->avg_idle, delta);
+
+ if (rq->avg_idle > max)
+ rq->avg_idle = max;
+ rq->idle_stamp = 0;
+}
+
static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
struct rq_flags *rf)
@@ -3655,18 +3673,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
p->sched_class->task_woken(rq, p);
rq_repin_lock(rq, rf);
}
-
- if (rq->idle_stamp) {
- u64 delta = rq_clock(rq) - rq->idle_stamp;
- u64 max = 2*rq->max_idle_balance_cost;
-
- update_avg(&rq->avg_idle, delta);
-
- if (rq->avg_idle > max)
- rq->avg_idle = max;
-
- rq->idle_stamp = 0;
- }
}
/*
@@ -6836,6 +6842,7 @@ static void __sched notrace __schedule(int sched_mode)
pick_again:
next = pick_next_task(rq, rq->donor, &rf);
rq_set_donor(rq, next);
+ rq->next_class = next->sched_class;
if (unlikely(task_is_blocked(next))) {
next = find_proxy_task(rq, next, &rf);
if (!next)
@@ -7580,7 +7587,7 @@ int preempt_dynamic_mode = preempt_dynamic_undefined;
int sched_dynamic_mode(const char *str)
{
-# ifndef CONFIG_PREEMPT_RT
+# if !(defined(CONFIG_PREEMPT_RT) || defined(CONFIG_ARCH_HAS_PREEMPT_LAZY))
if (!strcmp(str, "none"))
return preempt_dynamic_none;
@@ -8512,6 +8519,9 @@ int sched_cpu_dying(unsigned int cpu)
dump_rq_tasks(rq, KERN_WARNING);
}
dl_server_stop(&rq->fair_server);
+#ifdef CONFIG_SCHED_CLASS_EXT
+ dl_server_stop(&rq->ext_server);
+#endif
rq_unlock_irqrestore(rq, &rf);
calc_load_migrate(rq);
@@ -8687,6 +8697,8 @@ void __init sched_init(void)
rq->rt.rt_runtime = global_rt_runtime();
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
+ rq->next_class = &idle_sched_class;
+
rq->sd = NULL;
rq->rd = NULL;
rq->cpu_capacity = SCHED_CAPACITY_SCALE;
@@ -8715,6 +8727,9 @@ void __init sched_init(void)
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
fair_server_init(rq);
+#ifdef CONFIG_SCHED_CLASS_EXT
+ ext_server_init(rq);
+#endif
#ifdef CONFIG_SCHED_CORE
rq->core = rq;
@@ -9146,6 +9161,7 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
bool resched = false;
+ bool queued = false;
struct rq *rq;
CLASS(task_rq_lock, rq_guard)(tsk);
@@ -9157,10 +9173,13 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup)
scx_cgroup_move_task(tsk);
if (scope->running)
resched = true;
+ queued = scope->queued;
}
if (resched)
resched_curr(rq);
+ else if (queued)
+ wakeup_preempt(rq, tsk, 0);
__balance_callbacks(rq, &rq_guard.rf);
}
@@ -10883,13 +10902,12 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int
flags |= DEQUEUE_NOCLOCK;
}
- if (flags & DEQUEUE_CLASS) {
- if (p->sched_class->switching_from)
- p->sched_class->switching_from(rq, p);
- }
+ if ((flags & DEQUEUE_CLASS) && p->sched_class->switching_from)
+ p->sched_class->switching_from(rq, p);
*ctx = (struct sched_change_ctx){
.p = p,
+ .class = p->sched_class,
.flags = flags,
.queued = task_on_rq_queued(p),
.running = task_current_donor(rq, p),
@@ -10920,6 +10938,11 @@ void sched_change_end(struct sched_change_ctx *ctx)
lockdep_assert_rq_held(rq);
+ /*
+ * Changing class without *QUEUE_CLASS is bad.
+ */
+ WARN_ON_ONCE(p->sched_class != ctx->class && !(ctx->flags & ENQUEUE_CLASS));
+
if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to)
p->sched_class->switching_to(rq, p);
@@ -10931,6 +10954,25 @@ void sched_change_end(struct sched_change_ctx *ctx)
if (ctx->flags & ENQUEUE_CLASS) {
if (p->sched_class->switched_to)
p->sched_class->switched_to(rq, p);
+
+ if (ctx->running) {
+ /*
+ * If this was a class promotion; let the old class
+ * know it got preempted. Note that none of the
+ * switch*_from() methods know the new class and none
+ * of the switch*_to() methods know the old class.
+ */
+ if (sched_class_above(p->sched_class, ctx->class)) {
+ rq->next_class->wakeup_preempt(rq, p, 0);
+ rq->next_class = p->sched_class;
+ }
+ /*
+ * If this was a degradation in class; make sure to
+ * reschedule.
+ */
+ if (sched_class_above(ctx->class, p->sched_class))
+ resched_curr(rq);
+ }
} else {
p->sched_class->prio_changed(rq, p, ctx->prio);
}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 0ab5f9d4bc59..cfc40181f66e 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -682,7 +682,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
"sugov:%d",
cpumask_first(policy->related_cpus));
if (IS_ERR(thread)) {
- pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
+ pr_err("failed to create sugov thread: %pe\n", thread);
return PTR_ERR(thread);
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 4f97896887ec..ff0dfca95420 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -12,6 +12,8 @@
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime);
+
/*
* There are no locks covering percpu hardirq/softirq time.
* They are only modified in vtime_account, on corresponding CPU
@@ -25,16 +27,15 @@
*/
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
-int sched_clock_irqtime;
-
void enable_sched_clock_irqtime(void)
{
- sched_clock_irqtime = 1;
+ static_branch_enable(&sched_clock_irqtime);
}
void disable_sched_clock_irqtime(void)
{
- sched_clock_irqtime = 0;
+ if (irqtime_enabled())
+ static_branch_disable(&sched_clock_irqtime);
}
static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 7bcde7114f1b..d08b00429323 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1449,8 +1449,8 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
dl_se->dl_defer_idle = 0;
/*
- * The fair server can consume its runtime while throttled (not queued/
- * running as regular CFS).
+ * The DL server can consume its runtime while throttled (not
+ * queued / running as regular CFS).
*
* If the server consumes its entire runtime in this state. The server
* is not required for the current period. Thus, reset the server by
@@ -1535,10 +1535,10 @@ throttle:
}
/*
- * The fair server (sole dl_server) does not account for real-time
- * workload because it is running fair work.
+ * The dl_server does not account for real-time workload because it
+ * is running fair work.
*/
- if (dl_se == &rq->fair_server)
+ if (dl_se->dl_server)
return;
#ifdef CONFIG_RT_GROUP_SCHED
@@ -1573,9 +1573,9 @@ throttle:
* In the non-defer mode, the idle time is not accounted, as the
* server provides a guarantee.
*
- * If the dl_server is in defer mode, the idle time is also considered
- * as time available for the fair server, avoiding a penalty for the
- * rt scheduler that did not consumed that time.
+ * If the dl_server is in defer mode, the idle time is also considered as
+ * time available for the dl_server, avoiding a penalty for the rt
+ * scheduler that did not consumed that time.
*/
void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec)
{
@@ -1799,7 +1799,7 @@ void dl_server_start(struct sched_dl_entity *dl_se)
struct rq *rq = dl_se->rq;
dl_se->dl_defer_idle = 0;
- if (!dl_server(dl_se) || dl_se->dl_server_active)
+ if (!dl_server(dl_se) || dl_se->dl_server_active || !dl_se->dl_runtime)
return;
/*
@@ -1860,6 +1860,18 @@ void sched_init_dl_servers(void)
dl_se->dl_server = 1;
dl_se->dl_defer = 1;
setup_new_dl_entity(dl_se);
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+ dl_se = &rq->ext_server;
+
+ WARN_ON(dl_server(dl_se));
+
+ dl_server_apply_params(dl_se, runtime, period, 1);
+
+ dl_se->dl_server = 1;
+ dl_se->dl_defer = 1;
+ setup_new_dl_entity(dl_se);
+#endif
}
}
@@ -1886,7 +1898,6 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio
int cpu = cpu_of(rq);
struct dl_bw *dl_b;
unsigned long cap;
- int retval = 0;
int cpus;
dl_b = dl_bw_of(cpu);
@@ -1918,7 +1929,7 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
- return retval;
+ return 0;
}
/*
@@ -2515,9 +2526,16 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
* Only called when both the current and waking task are -deadline
* tasks.
*/
-static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
- int flags)
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags)
{
+ /*
+ * Can only get preempted by stop-class, and those should be
+ * few and short lived, doesn't really make sense to push
+ * anything away for that.
+ */
+ if (p->sched_class != &dl_sched_class)
+ return;
+
if (dl_entity_preempt(&p->dl, &rq->donor->dl)) {
resched_curr(rq);
return;
@@ -3191,6 +3209,36 @@ void dl_add_task_root_domain(struct task_struct *p)
raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
}
+static void dl_server_add_bw(struct root_domain *rd, int cpu)
+{
+ struct sched_dl_entity *dl_se;
+
+ dl_se = &cpu_rq(cpu)->fair_server;
+ if (dl_server(dl_se) && cpu_active(cpu))
+ __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu));
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+ dl_se = &cpu_rq(cpu)->ext_server;
+ if (dl_server(dl_se) && cpu_active(cpu))
+ __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu));
+#endif
+}
+
+static u64 dl_server_read_bw(int cpu)
+{
+ u64 dl_bw = 0;
+
+ if (cpu_rq(cpu)->fair_server.dl_server)
+ dl_bw += cpu_rq(cpu)->fair_server.dl_bw;
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+ if (cpu_rq(cpu)->ext_server.dl_server)
+ dl_bw += cpu_rq(cpu)->ext_server.dl_bw;
+#endif
+
+ return dl_bw;
+}
+
void dl_clear_root_domain(struct root_domain *rd)
{
int i;
@@ -3209,12 +3257,8 @@ void dl_clear_root_domain(struct root_domain *rd)
* dl_servers are not tasks. Since dl_add_task_root_domain ignores
* them, we need to account for them here explicitly.
*/
- for_each_cpu(i, rd->span) {
- struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server;
-
- if (dl_server(dl_se) && cpu_active(i))
- __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(i));
- }
+ for_each_cpu(i, rd->span)
+ dl_server_add_bw(rd, i);
}
void dl_clear_root_domain_cpu(int cpu)
@@ -3360,9 +3404,6 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu)
#endif
DEFINE_SCHED_CLASS(dl) = {
-
- .queue_mask = 8,
-
.enqueue_task = enqueue_task_dl,
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,
@@ -3656,6 +3697,9 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se)
dl_se->dl_non_contending = 0;
dl_se->dl_overrun = 0;
dl_se->dl_server = 0;
+ dl_se->dl_defer = 0;
+ dl_se->dl_defer_running = 0;
+ dl_se->dl_defer_armed = 0;
#ifdef CONFIG_RT_MUTEXES
dl_se->pi_se = dl_se;
@@ -3713,7 +3757,7 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
unsigned long flags, cap;
struct dl_bw *dl_b;
bool overflow = 0;
- u64 fair_server_bw = 0;
+ u64 dl_server_bw = 0;
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
@@ -3746,27 +3790,26 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
cap -= arch_scale_cpu_capacity(cpu);
/*
- * cpu is going offline and NORMAL tasks will be moved away
- * from it. We can thus discount dl_server bandwidth
- * contribution as it won't need to be servicing tasks after
- * the cpu is off.
+ * cpu is going offline and NORMAL and EXT tasks will be
+ * moved away from it. We can thus discount dl_server
+ * bandwidth contribution as it won't need to be servicing
+ * tasks after the cpu is off.
*/
- if (cpu_rq(cpu)->fair_server.dl_server)
- fair_server_bw = cpu_rq(cpu)->fair_server.dl_bw;
+ dl_server_bw = dl_server_read_bw(cpu);
/*
* Not much to check if no DEADLINE bandwidth is present.
* dl_servers we can discount, as tasks will be moved out the
* offlined CPUs anyway.
*/
- if (dl_b->total_bw - fair_server_bw > 0) {
+ if (dl_b->total_bw - dl_server_bw > 0) {
/*
* Leaving at least one CPU for DEADLINE tasks seems a
* wise thing to do. As said above, cpu is not offline
* yet, so account for that.
*/
if (dl_bw_cpus(cpu) - 1)
- overflow = __dl_overflow(dl_b, cap, fair_server_bw, 0);
+ overflow = __dl_overflow(dl_b, cap, dl_server_bw, 0);
else
overflow = 1;
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 41caa22e0680..b24f40f05019 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -172,18 +172,12 @@ static const struct file_operations sched_feat_fops = {
static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- char buf[16];
unsigned int scaling;
+ int ret;
- if (cnt > 15)
- cnt = 15;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
- buf[cnt] = '\0';
-
- if (kstrtouint(buf, 10, &scaling))
- return -EINVAL;
+ ret = kstrtouint_from_user(ubuf, cnt, 10, &scaling);
+ if (ret)
+ return ret;
if (scaling >= SCHED_TUNABLESCALING_END)
return -EINVAL;
@@ -243,7 +237,7 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
static int sched_dynamic_show(struct seq_file *m, void *v)
{
- int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2;
+ int i = (IS_ENABLED(CONFIG_PREEMPT_RT) || IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY)) * 2;
int j;
/* Count entries in NULL terminated preempt_modes */
@@ -336,17 +330,19 @@ enum dl_param {
DL_PERIOD,
};
-static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */
-static unsigned long fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */
+static unsigned long dl_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */
+static unsigned long dl_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */
-static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos, enum dl_param param)
+static ssize_t sched_server_write_common(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos, enum dl_param param,
+ void *server)
{
long cpu = (long) ((struct seq_file *) filp->private_data)->private;
+ struct sched_dl_entity *dl_se = (struct sched_dl_entity *)server;
+ u64 old_runtime, runtime, period;
struct rq *rq = cpu_rq(cpu);
- u64 runtime, period;
+ int retval = 0;
size_t err;
- int retval;
u64 value;
err = kstrtoull_from_user(ubuf, cnt, 10, &value);
@@ -354,8 +350,8 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
return err;
scoped_guard (rq_lock_irqsave, rq) {
- runtime = rq->fair_server.dl_runtime;
- period = rq->fair_server.dl_period;
+ old_runtime = runtime = dl_se->dl_runtime;
+ period = dl_se->dl_period;
switch (param) {
case DL_RUNTIME:
@@ -371,60 +367,68 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
}
if (runtime > period ||
- period > fair_server_period_max ||
- period < fair_server_period_min) {
+ period > dl_server_period_max ||
+ period < dl_server_period_min) {
return -EINVAL;
}
update_rq_clock(rq);
- dl_server_stop(&rq->fair_server);
-
- retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0);
- if (retval)
- cnt = retval;
+ dl_server_stop(dl_se);
+ retval = dl_server_apply_params(dl_se, runtime, period, 0);
+ dl_server_start(dl_se);
- if (!runtime)
- printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n",
- cpu_of(rq));
+ if (retval < 0)
+ return retval;
+ }
- if (rq->cfs.h_nr_queued)
- dl_server_start(&rq->fair_server);
+ if (!!old_runtime ^ !!runtime) {
+ pr_info("%s server %sabled on CPU %d%s.\n",
+ server == &rq->fair_server ? "Fair" : "Ext",
+ runtime ? "en" : "dis",
+ cpu_of(rq),
+ runtime ? "" : ", system may malfunction due to starvation");
}
*ppos += cnt;
return cnt;
}
-static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param)
+static size_t sched_server_show_common(struct seq_file *m, void *v, enum dl_param param,
+ void *server)
{
- unsigned long cpu = (unsigned long) m->private;
- struct rq *rq = cpu_rq(cpu);
+ struct sched_dl_entity *dl_se = (struct sched_dl_entity *)server;
u64 value;
switch (param) {
case DL_RUNTIME:
- value = rq->fair_server.dl_runtime;
+ value = dl_se->dl_runtime;
break;
case DL_PERIOD:
- value = rq->fair_server.dl_period;
+ value = dl_se->dl_period;
break;
}
seq_printf(m, "%llu\n", value);
return 0;
-
}
static ssize_t
sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME);
+ long cpu = (long) ((struct seq_file *) filp->private_data)->private;
+ struct rq *rq = cpu_rq(cpu);
+
+ return sched_server_write_common(filp, ubuf, cnt, ppos, DL_RUNTIME,
+ &rq->fair_server);
}
static int sched_fair_server_runtime_show(struct seq_file *m, void *v)
{
- return sched_fair_server_show(m, v, DL_RUNTIME);
+ unsigned long cpu = (unsigned long) m->private;
+ struct rq *rq = cpu_rq(cpu);
+
+ return sched_server_show_common(m, v, DL_RUNTIME, &rq->fair_server);
}
static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp)
@@ -440,16 +444,57 @@ static const struct file_operations fair_server_runtime_fops = {
.release = single_release,
};
+#ifdef CONFIG_SCHED_CLASS_EXT
+static ssize_t
+sched_ext_server_runtime_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ long cpu = (long) ((struct seq_file *) filp->private_data)->private;
+ struct rq *rq = cpu_rq(cpu);
+
+ return sched_server_write_common(filp, ubuf, cnt, ppos, DL_RUNTIME,
+ &rq->ext_server);
+}
+
+static int sched_ext_server_runtime_show(struct seq_file *m, void *v)
+{
+ unsigned long cpu = (unsigned long) m->private;
+ struct rq *rq = cpu_rq(cpu);
+
+ return sched_server_show_common(m, v, DL_RUNTIME, &rq->ext_server);
+}
+
+static int sched_ext_server_runtime_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_ext_server_runtime_show, inode->i_private);
+}
+
+static const struct file_operations ext_server_runtime_fops = {
+ .open = sched_ext_server_runtime_open,
+ .write = sched_ext_server_runtime_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif /* CONFIG_SCHED_CLASS_EXT */
+
static ssize_t
sched_fair_server_period_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD);
+ long cpu = (long) ((struct seq_file *) filp->private_data)->private;
+ struct rq *rq = cpu_rq(cpu);
+
+ return sched_server_write_common(filp, ubuf, cnt, ppos, DL_PERIOD,
+ &rq->fair_server);
}
static int sched_fair_server_period_show(struct seq_file *m, void *v)
{
- return sched_fair_server_show(m, v, DL_PERIOD);
+ unsigned long cpu = (unsigned long) m->private;
+ struct rq *rq = cpu_rq(cpu);
+
+ return sched_server_show_common(m, v, DL_PERIOD, &rq->fair_server);
}
static int sched_fair_server_period_open(struct inode *inode, struct file *filp)
@@ -465,6 +510,40 @@ static const struct file_operations fair_server_period_fops = {
.release = single_release,
};
+#ifdef CONFIG_SCHED_CLASS_EXT
+static ssize_t
+sched_ext_server_period_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ long cpu = (long) ((struct seq_file *) filp->private_data)->private;
+ struct rq *rq = cpu_rq(cpu);
+
+ return sched_server_write_common(filp, ubuf, cnt, ppos, DL_PERIOD,
+ &rq->ext_server);
+}
+
+static int sched_ext_server_period_show(struct seq_file *m, void *v)
+{
+ unsigned long cpu = (unsigned long) m->private;
+ struct rq *rq = cpu_rq(cpu);
+
+ return sched_server_show_common(m, v, DL_PERIOD, &rq->ext_server);
+}
+
+static int sched_ext_server_period_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_ext_server_period_show, inode->i_private);
+}
+
+static const struct file_operations ext_server_period_fops = {
+ .open = sched_ext_server_period_open,
+ .write = sched_ext_server_period_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif /* CONFIG_SCHED_CLASS_EXT */
+
static struct dentry *debugfs_sched;
static void debugfs_fair_server_init(void)
@@ -488,6 +567,29 @@ static void debugfs_fair_server_init(void)
}
}
+#ifdef CONFIG_SCHED_CLASS_EXT
+static void debugfs_ext_server_init(void)
+{
+ struct dentry *d_ext;
+ unsigned long cpu;
+
+ d_ext = debugfs_create_dir("ext_server", debugfs_sched);
+ if (!d_ext)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct dentry *d_cpu;
+ char buf[32];
+
+ snprintf(buf, sizeof(buf), "cpu%lu", cpu);
+ d_cpu = debugfs_create_dir(buf, d_ext);
+
+ debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &ext_server_runtime_fops);
+ debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &ext_server_period_fops);
+ }
+}
+#endif /* CONFIG_SCHED_CLASS_EXT */
+
static __init int sched_init_debug(void)
{
struct dentry __maybe_unused *numa;
@@ -526,6 +628,9 @@ static __init int sched_init_debug(void)
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
debugfs_fair_server_init();
+#ifdef CONFIG_SCHED_CLASS_EXT
+ debugfs_ext_server_init();
+#endif
return 0;
}
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7ccd84c17792..e6bf73456176 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -959,6 +959,8 @@ static void update_curr_scx(struct rq *rq)
if (!curr->scx.slice)
touch_core_sched(rq, curr);
}
+
+ dl_server_update(&rq->ext_server, delta_exec);
}
static bool scx_dsq_priq_less(struct rb_node *node_a,
@@ -1502,6 +1504,10 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
if (enq_flags & SCX_ENQ_WAKEUP)
touch_core_sched(rq, p);
+ /* Start dl_server if this is the first task being enqueued */
+ if (rq->scx.nr_running == 1)
+ dl_server_start(&rq->ext_server);
+
do_enqueue_task(rq, p, enq_flags, sticky_cpu);
out:
rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
@@ -2454,7 +2460,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
/* see kick_cpus_irq_workfn() */
smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
- rq_modified_clear(rq);
+ rq->next_class = &ext_sched_class;
rq_unpin_lock(rq, rf);
balance_one(rq, prev);
@@ -2469,7 +2475,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
* If @force_scx is true, always try to pick a SCHED_EXT task,
* regardless of any higher-priority sched classes activity.
*/
- if (!force_scx && rq_modified_above(rq, &ext_sched_class))
+ if (!force_scx && sched_class_above(rq->next_class, &ext_sched_class))
return RETRY_TASK;
keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
@@ -2513,6 +2519,33 @@ static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
return do_pick_task_scx(rq, rf, false);
}
+/*
+ * Select the next task to run from the ext scheduling class.
+ *
+ * Use do_pick_task_scx() directly with @force_scx enabled, since the
+ * dl_server must always select a sched_ext task.
+ */
+static struct task_struct *
+ext_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf)
+{
+ if (!scx_enabled())
+ return NULL;
+
+ return do_pick_task_scx(dl_se->rq, rf, true);
+}
+
+/*
+ * Initialize the ext server deadline entity.
+ */
+void ext_server_init(struct rq *rq)
+{
+ struct sched_dl_entity *dl_se = &rq->ext_server;
+
+ init_dl_entity(dl_se);
+
+ dl_server_init(dl_se, rq, ext_server_pick_task);
+}
+
#ifdef CONFIG_SCHED_CORE
/**
* scx_prio_less - Task ordering for core-sched
@@ -3141,7 +3174,8 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p)
scx_disable_task(p);
}
-static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
+static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {}
+
static void switched_to_scx(struct rq *rq, struct task_struct *p) {}
int scx_check_setscheduler(struct task_struct *p, int policy)
@@ -3402,8 +3436,6 @@ static void scx_cgroup_unlock(void) {}
* their current sched_class. Call them directly from sched core instead.
*/
DEFINE_SCHED_CLASS(ext) = {
- .queue_mask = 1,
-
.enqueue_task = enqueue_task_scx,
.dequeue_task = dequeue_task_scx,
.yield_task = yield_task_scx,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c3502c3c7b88..1e22b7fadd70 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -524,10 +524,48 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
* Scheduling class tree data structure manipulation methods:
*/
+extern void __BUILD_BUG_vruntime_cmp(void);
+
+/* Use __builtin_strcmp() because of __HAVE_ARCH_STRCMP: */
+
+#define vruntime_cmp(A, CMP_STR, B) ({ \
+ int __res = 0; \
+ \
+ if (!__builtin_strcmp(CMP_STR, "<")) { \
+ __res = ((s64)((A)-(B)) < 0); \
+ } else if (!__builtin_strcmp(CMP_STR, "<=")) { \
+ __res = ((s64)((A)-(B)) <= 0); \
+ } else if (!__builtin_strcmp(CMP_STR, ">")) { \
+ __res = ((s64)((A)-(B)) > 0); \
+ } else if (!__builtin_strcmp(CMP_STR, ">=")) { \
+ __res = ((s64)((A)-(B)) >= 0); \
+ } else { \
+ /* Unknown operator throws linker error: */ \
+ __BUILD_BUG_vruntime_cmp(); \
+ } \
+ \
+ __res; \
+})
+
+extern void __BUILD_BUG_vruntime_op(void);
+
+#define vruntime_op(A, OP_STR, B) ({ \
+ s64 __res = 0; \
+ \
+ if (!__builtin_strcmp(OP_STR, "-")) { \
+ __res = (s64)((A)-(B)); \
+ } else { \
+ /* Unknown operator throws linker error: */ \
+ __BUILD_BUG_vruntime_op(); \
+ } \
+ \
+ __res; \
+})
+
+
static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
- s64 delta = (s64)(vruntime - max_vruntime);
- if (delta > 0)
+ if (vruntime_cmp(vruntime, ">", max_vruntime))
max_vruntime = vruntime;
return max_vruntime;
@@ -535,8 +573,7 @@ static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime)
static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
- s64 delta = (s64)(vruntime - min_vruntime);
- if (delta < 0)
+ if (vruntime_cmp(vruntime, "<", min_vruntime))
min_vruntime = vruntime;
return min_vruntime;
@@ -549,12 +586,12 @@ static inline bool entity_before(const struct sched_entity *a,
* Tiebreak on vruntime seems unnecessary since it can
* hardly happen.
*/
- return (s64)(a->deadline - b->deadline) < 0;
+ return vruntime_cmp(a->deadline, "<", b->deadline);
}
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return (s64)(se->vruntime - cfs_rq->zero_vruntime);
+ return vruntime_op(se->vruntime, "-", cfs_rq->zero_vruntime);
}
#define __node_2_se(node) \
@@ -576,7 +613,7 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
*
* \Sum lag_i = 0
* \Sum w_i * (V - v_i) = 0
- * \Sum w_i * V - w_i * v_i = 0
+ * \Sum (w_i * V - w_i * v_i) = 0
*
* From which we can solve an expression for V in v_i (which we have in
* se->vruntime):
@@ -607,11 +644,11 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Which we track using:
*
* v0 := cfs_rq->zero_vruntime
- * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
- * \Sum w_i := cfs_rq->avg_load
+ * \Sum (v_i - v0) * w_i := cfs_rq->sum_w_vruntime
+ * \Sum w_i := cfs_rq->sum_weight
*
* Since zero_vruntime closely tracks the per-task service, these
- * deltas: (v_i - v), will be in the order of the maximal (virtual) lag
+ * deltas: (v_i - v0), will be in the order of the maximal (virtual) lag
* induced in the system due to quantisation.
*
* Also, we use scale_load_down() to reduce the size.
@@ -619,32 +656,32 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
* As measured, the max (key * weight) value was ~44 bits for a kernel build.
*/
static void
-avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned long weight = scale_load_down(se->load.weight);
s64 key = entity_key(cfs_rq, se);
- cfs_rq->avg_vruntime += key * weight;
- cfs_rq->avg_load += weight;
+ cfs_rq->sum_w_vruntime += key * weight;
+ cfs_rq->sum_weight += weight;
}
static void
-avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned long weight = scale_load_down(se->load.weight);
s64 key = entity_key(cfs_rq, se);
- cfs_rq->avg_vruntime -= key * weight;
- cfs_rq->avg_load -= weight;
+ cfs_rq->sum_w_vruntime -= key * weight;
+ cfs_rq->sum_weight -= weight;
}
static inline
-void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+void sum_w_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
{
/*
- * v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load
+ * v' = v + d ==> sum_w_vruntime' = sum_runtime - d*sum_weight
*/
- cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
+ cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta;
}
/*
@@ -654,8 +691,8 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
u64 avg_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
- s64 avg = cfs_rq->avg_vruntime;
- long load = cfs_rq->avg_load;
+ s64 avg = cfs_rq->sum_w_vruntime;
+ long load = cfs_rq->sum_weight;
if (curr && curr->on_rq) {
unsigned long weight = scale_load_down(curr->load.weight);
@@ -722,8 +759,8 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
{
struct sched_entity *curr = cfs_rq->curr;
- s64 avg = cfs_rq->avg_vruntime;
- long load = cfs_rq->avg_load;
+ s64 avg = cfs_rq->sum_w_vruntime;
+ long load = cfs_rq->sum_weight;
if (curr && curr->on_rq) {
unsigned long weight = scale_load_down(curr->load.weight);
@@ -732,7 +769,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
load += weight;
}
- return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load;
+ return avg >= vruntime_op(vruntime, "-", cfs_rq->zero_vruntime) * load;
}
int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -743,9 +780,9 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void update_zero_vruntime(struct cfs_rq *cfs_rq)
{
u64 vruntime = avg_vruntime(cfs_rq);
- s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime);
+ s64 delta = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime);
- avg_vruntime_update(cfs_rq, delta);
+ sum_w_vruntime_update(cfs_rq, delta);
cfs_rq->zero_vruntime = vruntime;
}
@@ -770,13 +807,12 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
return entity_before(__node_2_se(a), __node_2_se(b));
}
-#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
-
static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node)
{
if (node) {
struct sched_entity *rse = __node_2_se(node);
- if (vruntime_gt(min_vruntime, se, rse))
+
+ if (vruntime_cmp(se->min_vruntime, ">", rse->min_vruntime))
se->min_vruntime = rse->min_vruntime;
}
}
@@ -819,7 +855,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- avg_vruntime_add(cfs_rq, se);
+ sum_w_vruntime_add(cfs_rq, se);
update_zero_vruntime(cfs_rq);
se->min_vruntime = se->vruntime;
se->min_slice = se->slice;
@@ -831,7 +867,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
&min_vruntime_cb);
- avg_vruntime_sub(cfs_rq, se);
+ sum_w_vruntime_sub(cfs_rq, se);
update_zero_vruntime(cfs_rq);
}
@@ -887,7 +923,7 @@ static inline void update_protect_slice(struct cfs_rq *cfs_rq, struct sched_enti
static inline bool protect_slice(struct sched_entity *se)
{
- return ((s64)(se->vprot - se->vruntime) > 0);
+ return vruntime_cmp(se->vruntime, "<", se->vprot);
}
static inline void cancel_protect_slice(struct sched_entity *se)
@@ -1024,7 +1060,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
*/
static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if ((s64)(se->vruntime - se->deadline) < 0)
+ if (vruntime_cmp(se->vruntime, "<", se->deadline))
return false;
/*
@@ -1513,7 +1549,7 @@ static unsigned int task_scan_start(struct task_struct *p)
/* Scale the maximum scan period with the amount of shared memory. */
rcu_read_lock();
- ng = rcu_dereference(p->numa_group);
+ ng = rcu_dereference_all(p->numa_group);
if (ng) {
unsigned long shared = group_faults_shared(ng);
unsigned long private = group_faults_priv(ng);
@@ -1580,7 +1616,7 @@ pid_t task_numa_group_id(struct task_struct *p)
pid_t gid = 0;
rcu_read_lock();
- ng = rcu_dereference(p->numa_group);
+ ng = rcu_dereference_all(p->numa_group);
if (ng)
gid = ng->gid;
rcu_read_unlock();
@@ -2239,7 +2275,7 @@ static bool task_numa_compare(struct task_numa_env *env,
return false;
rcu_read_lock();
- cur = rcu_dereference(dst_rq->curr);
+ cur = rcu_dereference_all(dst_rq->curr);
if (cur && ((cur->flags & (PF_EXITING | PF_KTHREAD)) ||
!cur->mm))
cur = NULL;
@@ -2284,7 +2320,7 @@ static bool task_numa_compare(struct task_numa_env *env,
* If dst and source tasks are in the same NUMA group, or not
* in any group then look only at task weights.
*/
- cur_ng = rcu_dereference(cur->numa_group);
+ cur_ng = rcu_dereference_all(cur->numa_group);
if (cur_ng == p_ng) {
/*
* Do not swap within a group or between tasks that have
@@ -2458,11 +2494,8 @@ static void task_numa_find_cpu(struct task_numa_env *env,
maymove = !load_too_imbalanced(src_load, dst_load, env);
}
- for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
- /* Skip this CPU if the source task cannot migrate */
- if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
- continue;
-
+ /* Skip CPUs if the source task cannot migrate */
+ for_each_cpu_and(cpu, cpumask_of_node(env->dst_nid), env->p->cpus_ptr) {
env->dst_cpu = cpu;
if (task_numa_compare(env, taskimp, groupimp, maymove))
break;
@@ -2499,7 +2532,7 @@ static int task_numa_migrate(struct task_struct *p)
* to satisfy here.
*/
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
+ sd = rcu_dereference_all(per_cpu(sd_numa, env.src_cpu));
if (sd) {
env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
env.imb_numa_nr = sd->imb_numa_nr;
@@ -3023,7 +3056,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (!cpupid_match_pid(tsk, cpupid))
goto no_join;
- grp = rcu_dereference(tsk->numa_group);
+ grp = rcu_dereference_all(tsk->numa_group);
if (!grp)
goto no_join;
@@ -3694,7 +3727,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
#define add_positive(_ptr, _val) do { \
typeof(_ptr) ptr = (_ptr); \
- typeof(_val) val = (_val); \
+ __signed_scalar_typeof(*ptr) val = (_val); \
typeof(*ptr) res, var = READ_ONCE(*ptr); \
\
res = var + val; \
@@ -3706,23 +3739,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
} while (0)
/*
- * Unsigned subtract and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define sub_positive(_ptr, _val) do { \
- typeof(_ptr) ptr = (_ptr); \
- typeof(*ptr) val = (_val); \
- typeof(*ptr) res, var = READ_ONCE(*ptr); \
- res = var - val; \
- if (res > var) \
- res = 0; \
- WRITE_ONCE(*ptr, res); \
-} while (0)
-
-/*
* Remove and clamp on negative, from a local variable.
*
* A variant of sub_positive(), which does not use explicit load-store
@@ -3733,21 +3749,39 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
*ptr -= min_t(typeof(*ptr), *ptr, _val); \
} while (0)
+
+/*
+ * Because of rounding, se->util_sum might ends up being +1 more than
+ * cfs->util_sum. Although this is not a problem by itself, detaching
+ * a lot of tasks with the rounding problem between 2 updates of
+ * util_avg (~1ms) can make cfs->util_sum becoming null whereas
+ * cfs_util_avg is not.
+ *
+ * Check that util_sum is still above its lower bound for the new
+ * util_avg. Given that period_contrib might have moved since the last
+ * sync, we are only sure that util_sum must be above or equal to
+ * util_avg * minimum possible divider
+ */
+#define __update_sa(sa, name, delta_avg, delta_sum) do { \
+ add_positive(&(sa)->name##_avg, delta_avg); \
+ add_positive(&(sa)->name##_sum, delta_sum); \
+ (sa)->name##_sum = max_t(typeof((sa)->name##_sum), \
+ (sa)->name##_sum, \
+ (sa)->name##_avg * PELT_MIN_DIVIDER); \
+} while (0)
+
static inline void
enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- cfs_rq->avg.load_avg += se->avg.load_avg;
- cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
+ __update_sa(&cfs_rq->avg, load, se->avg.load_avg,
+ se_weight(se) * se->avg.load_sum);
}
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
- sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
- cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, load, -se->avg.load_avg,
+ se_weight(se) * -se->avg.load_sum);
}
static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
@@ -4243,7 +4277,6 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
*/
divider = get_pelt_divider(&cfs_rq->avg);
-
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
new_sum = se->avg.util_avg * divider;
@@ -4251,12 +4284,7 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
se->avg.util_sum = new_sum;
/* Update parent cfs_rq utilization */
- add_positive(&cfs_rq->avg.util_avg, delta_avg);
- add_positive(&cfs_rq->avg.util_sum, delta_sum);
-
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
- cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, util, delta_avg, delta_sum);
}
static inline void
@@ -4282,11 +4310,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
se->avg.runnable_sum = new_sum;
/* Update parent cfs_rq runnable */
- add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
- add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
- cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, runnable, delta_avg, delta_sum);
}
static inline void
@@ -4350,11 +4374,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
se->avg.load_sum = runnable_sum;
se->avg.load_avg = load_avg;
- add_positive(&cfs_rq->avg.load_avg, delta_avg);
- add_positive(&cfs_rq->avg.load_sum, delta_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
- cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, load, delta_avg, delta_sum);
}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -4451,7 +4471,7 @@ static inline void migrate_se_pelt_lag(struct sched_entity *se)
rq = rq_of(cfs_rq);
rcu_read_lock();
- is_idle = is_idle_task(rcu_dereference(rq->curr));
+ is_idle = is_idle_task(rcu_dereference_all(rq->curr));
rcu_read_unlock();
/*
@@ -4553,33 +4573,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
raw_spin_unlock(&cfs_rq->removed.lock);
r = removed_load;
- sub_positive(&sa->load_avg, r);
- sub_positive(&sa->load_sum, r * divider);
- /* See sa->util_sum below */
- sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
+ __update_sa(sa, load, -r, -r*divider);
r = removed_util;
- sub_positive(&sa->util_avg, r);
- sub_positive(&sa->util_sum, r * divider);
- /*
- * Because of rounding, se->util_sum might ends up being +1 more than
- * cfs->util_sum. Although this is not a problem by itself, detaching
- * a lot of tasks with the rounding problem between 2 updates of
- * util_avg (~1ms) can make cfs->util_sum becoming null whereas
- * cfs_util_avg is not.
- * Check that util_sum is still above its lower bound for the new
- * util_avg. Given that period_contrib might have moved since the last
- * sync, we are only sure that util_sum must be above or equal to
- * util_avg * minimum possible divider
- */
- sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
+ __update_sa(sa, util, -r, -r*divider);
r = removed_runnable;
- sub_positive(&sa->runnable_avg, r);
- sub_positive(&sa->runnable_sum, r * divider);
- /* See sa->util_sum above */
- sa->runnable_sum = max_t(u32, sa->runnable_sum,
- sa->runnable_avg * PELT_MIN_DIVIDER);
+ __update_sa(sa, runnable, -r, -r*divider);
/*
* removed_runnable is the unweighted version of removed_load so we
@@ -4664,17 +4664,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
dequeue_load_avg(cfs_rq, se);
- sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
- sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
- cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
-
- sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
- sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
- cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, util, -se->avg.util_avg, -se->avg.util_sum);
+ __update_sa(&cfs_rq->avg, runnable, -se->avg.runnable_avg, -se->avg.runnable_sum);
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
@@ -5177,7 +5168,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*
* vl_i = (W + w_i)*vl'_i / W
*/
- load = cfs_rq->avg_load;
+ load = cfs_rq->sum_weight;
if (curr && curr->on_rq)
load += scale_load_down(curr->load.weight);
@@ -7150,8 +7141,7 @@ static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
static struct {
cpumask_var_t idle_cpus_mask;
- atomic_t nr_cpus;
- int has_blocked; /* Idle CPUS has blocked load */
+ int has_blocked_load; /* Idle CPUS has blocked load */
int needs_update; /* Newly idle CPUs need their next_balance collated */
unsigned long next_balance; /* in jiffy units */
unsigned long next_blocked; /* Next update of blocked load in jiffies */
@@ -7509,7 +7499,7 @@ static inline void set_idle_cores(int cpu, int val)
{
struct sched_domain_shared *sds;
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
if (sds)
WRITE_ONCE(sds->has_idle_cores, val);
}
@@ -7518,7 +7508,7 @@ static inline bool test_idle_cores(int cpu)
{
struct sched_domain_shared *sds;
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
if (sds)
return READ_ONCE(sds->has_idle_cores);
@@ -7647,7 +7637,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
if (sched_feat(SIS_UTIL)) {
- sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
+ sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, target));
if (sd_share) {
/* because !--nr is the condition to stop scan */
nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
@@ -7853,7 +7843,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* sd_asym_cpucapacity rather than sd_llc.
*/
if (sched_asym_cpucap_active()) {
- sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
+ sd = rcu_dereference_all(per_cpu(sd_asym_cpucapacity, target));
/*
* On an asymmetric CPU capacity system where an exclusive
* cpuset defines a symmetric island (i.e. one unique
@@ -7868,7 +7858,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
}
}
- sd = rcu_dereference(per_cpu(sd_llc, target));
+ sd = rcu_dereference_all(per_cpu(sd_llc, target));
if (!sd)
return target;
@@ -8337,7 +8327,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
struct energy_env eenv;
rcu_read_lock();
- pd = rcu_dereference(rd->pd);
+ pd = rcu_dereference_all(rd->pd);
if (!pd)
goto unlock;
@@ -8345,7 +8335,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* Energy-aware wake-up happens on the lowest sched_domain starting
* from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
*/
- sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
+ sd = rcu_dereference_all(*this_cpu_ptr(&sd_asym_cpucapacity));
while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
sd = sd->parent;
if (!sd)
@@ -8368,9 +8358,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
int max_spare_cap_cpu = -1;
int fits, max_fits = -1;
- cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
-
- if (cpumask_empty(cpus))
+ if (!cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask))
continue;
/* Account external pressure for the energy estimation */
@@ -8747,7 +8735,7 @@ preempt_sync(struct rq *rq, int wake_flags,
/*
* Preempt the current task with a newly woken task if needed:
*/
-static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
+static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_flags)
{
enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
struct task_struct *donor = rq->donor;
@@ -8755,6 +8743,12 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
struct cfs_rq *cfs_rq = task_cfs_rq(donor);
int cse_is_idle, pse_is_idle;
+ /*
+ * XXX Getting preempted by higher class, try and find idle CPU?
+ */
+ if (p->sched_class != &fair_sched_class)
+ return;
+
if (unlikely(se == pse))
return;
@@ -9337,7 +9331,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
*/
static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
- struct numa_group *numa_group = rcu_dereference(p->numa_group);
+ struct numa_group *numa_group = rcu_dereference_all(p->numa_group);
unsigned long src_weight, dst_weight;
int src_nid, dst_nid, dist;
@@ -9766,7 +9760,7 @@ static void attach_tasks(struct lb_env *env)
}
#ifdef CONFIG_NO_HZ_COMMON
-static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
+static inline bool cfs_rq_has_blocked_load(struct cfs_rq *cfs_rq)
{
if (cfs_rq->avg.load_avg)
return true;
@@ -9799,16 +9793,16 @@ static inline void update_blocked_load_tick(struct rq *rq)
WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
}
-static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+static inline void update_has_blocked_load_status(struct rq *rq, bool has_blocked_load)
{
- if (!has_blocked)
+ if (!has_blocked_load)
rq->has_blocked_load = 0;
}
#else /* !CONFIG_NO_HZ_COMMON: */
-static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
+static inline bool cfs_rq_has_blocked_load(struct cfs_rq *cfs_rq) { return false; }
static inline bool others_have_blocked(struct rq *rq) { return false; }
static inline void update_blocked_load_tick(struct rq *rq) {}
-static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
+static inline void update_has_blocked_load_status(struct rq *rq, bool has_blocked_load) {}
#endif /* !CONFIG_NO_HZ_COMMON */
static bool __update_blocked_others(struct rq *rq, bool *done)
@@ -9865,7 +9859,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
list_del_leaf_cfs_rq(cfs_rq);
/* Don't need periodic decay once load/util_avg are null */
- if (cfs_rq_has_blocked(cfs_rq))
+ if (cfs_rq_has_blocked_load(cfs_rq))
*done = false;
}
@@ -9925,7 +9919,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
bool decayed;
decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
- if (cfs_rq_has_blocked(cfs_rq))
+ if (cfs_rq_has_blocked_load(cfs_rq))
*done = false;
return decayed;
@@ -9937,23 +9931,27 @@ static unsigned long task_h_load(struct task_struct *p)
}
#endif /* !CONFIG_FAIR_GROUP_SCHED */
-static void sched_balance_update_blocked_averages(int cpu)
+static void __sched_balance_update_blocked_averages(struct rq *rq)
{
bool decayed = false, done = true;
- struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
- rq_lock_irqsave(rq, &rf);
update_blocked_load_tick(rq);
- update_rq_clock(rq);
decayed |= __update_blocked_others(rq, &done);
decayed |= __update_blocked_fair(rq, &done);
- update_blocked_load_status(rq, !done);
+ update_has_blocked_load_status(rq, !done);
if (decayed)
cpufreq_update_util(rq, 0);
- rq_unlock_irqrestore(rq, &rf);
+}
+
+static void sched_balance_update_blocked_averages(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ guard(rq_lock_irqsave)(rq);
+ update_rq_clock(rq);
+ __sched_balance_update_blocked_averages(rq);
}
/********** Helpers for sched_balance_find_src_group ************************/
@@ -10963,10 +10961,9 @@ sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int
* take care of it.
*/
if (p->nr_cpus_allowed != NR_CPUS) {
- struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
-
- cpumask_and(cpus, sched_group_span(local), p->cpus_ptr);
- imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr);
+ unsigned int w = cpumask_weight_and(p->cpus_ptr,
+ sched_group_span(local));
+ imb_numa_nr = min(w, sd->imb_numa_nr);
}
imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus);
@@ -11013,7 +11010,7 @@ static void update_idle_cpu_scan(struct lb_env *env,
if (env->sd->span_weight != llc_weight)
return;
- sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu));
+ sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, env->dst_cpu));
if (!sd_share)
return;
@@ -11363,7 +11360,7 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
goto force_balance;
if (!is_rd_overutilized(env->dst_rq->rd) &&
- rcu_dereference(env->dst_rq->rd->pd))
+ rcu_dereference_all(env->dst_rq->rd->pd))
goto out_balanced;
/* ASYM feature bypasses nice load balance check */
@@ -12431,20 +12428,29 @@ static void nohz_balancer_kick(struct rq *rq)
*/
nohz_balance_exit_idle(rq);
- /*
- * None are in tickless mode and hence no need for NOHZ idle load
- * balancing:
- */
- if (likely(!atomic_read(&nohz.nr_cpus)))
- return;
-
- if (READ_ONCE(nohz.has_blocked) &&
+ if (READ_ONCE(nohz.has_blocked_load) &&
time_after(now, READ_ONCE(nohz.next_blocked)))
flags = NOHZ_STATS_KICK;
+ /*
+ * Most of the time system is not 100% busy. i.e nohz.nr_cpus > 0
+ * Skip the read if time is not due.
+ *
+ * If none are in tickless mode, there maybe a narrow window
+ * (28 jiffies, HZ=1000) where flags maybe set and kick_ilb called.
+ * But idle load balancing is not done as find_new_ilb fails.
+ * That's very rare. So read nohz.nr_cpus only if time is due.
+ */
if (time_before(now, nohz.next_balance))
goto out;
+ /*
+ * None are in tickless mode and hence no need for NOHZ idle load
+ * balancing
+ */
+ if (unlikely(cpumask_empty(nohz.idle_cpus_mask)))
+ return;
+
if (rq->nr_running >= 2) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto out;
@@ -12452,7 +12458,7 @@ static void nohz_balancer_kick(struct rq *rq)
rcu_read_lock();
- sd = rcu_dereference(rq->sd);
+ sd = rcu_dereference_all(rq->sd);
if (sd) {
/*
* If there's a runnable CFS task and the current CPU has reduced
@@ -12464,7 +12470,7 @@ static void nohz_balancer_kick(struct rq *rq)
}
}
- sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
+ sd = rcu_dereference_all(per_cpu(sd_asym_packing, cpu));
if (sd) {
/*
* When ASYM_PACKING; see if there's a more preferred CPU
@@ -12482,7 +12488,7 @@ static void nohz_balancer_kick(struct rq *rq)
}
}
- sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
+ sd = rcu_dereference_all(per_cpu(sd_asym_cpucapacity, cpu));
if (sd) {
/*
* When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
@@ -12503,7 +12509,7 @@ static void nohz_balancer_kick(struct rq *rq)
goto unlock;
}
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
if (sds) {
/*
* If there is an imbalance between LLC domains (IOW we could
@@ -12535,7 +12541,7 @@ static void set_cpu_sd_state_busy(int cpu)
struct sched_domain *sd;
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
if (!sd || !sd->nohz_idle)
goto unlock;
@@ -12555,7 +12561,6 @@ void nohz_balance_exit_idle(struct rq *rq)
rq->nohz_tick_stopped = 0;
cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
- atomic_dec(&nohz.nr_cpus);
set_cpu_sd_state_busy(rq->cpu);
}
@@ -12565,7 +12570,7 @@ static void set_cpu_sd_state_idle(int cpu)
struct sched_domain *sd;
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
if (!sd || sd->nohz_idle)
goto unlock;
@@ -12599,9 +12604,9 @@ void nohz_balance_enter_idle(int cpu)
/*
* The tick is still stopped but load could have been added in the
- * meantime. We set the nohz.has_blocked flag to trig a check of the
+ * meantime. We set the nohz.has_blocked_load flag to trig a check of the
* *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
- * of nohz.has_blocked can only happen after checking the new load
+ * of nohz.has_blocked_load can only happen after checking the new load
*/
if (rq->nohz_tick_stopped)
goto out;
@@ -12613,11 +12618,10 @@ void nohz_balance_enter_idle(int cpu)
rq->nohz_tick_stopped = 1;
cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
- atomic_inc(&nohz.nr_cpus);
/*
* Ensures that if nohz_idle_balance() fails to observe our
- * @idle_cpus_mask store, it must observe the @has_blocked
+ * @idle_cpus_mask store, it must observe the @has_blocked_load
* and @needs_update stores.
*/
smp_mb__after_atomic();
@@ -12630,7 +12634,7 @@ out:
* Each time a cpu enter idle, we assume that it has blocked load and
* enable the periodic update of the load of idle CPUs
*/
- WRITE_ONCE(nohz.has_blocked, 1);
+ WRITE_ONCE(nohz.has_blocked_load, 1);
}
static bool update_nohz_stats(struct rq *rq)
@@ -12671,8 +12675,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
/*
* We assume there will be no idle load after this update and clear
- * the has_blocked flag. If a cpu enters idle in the mean time, it will
- * set the has_blocked flag and trigger another update of idle load.
+ * the has_blocked_load flag. If a cpu enters idle in the mean time, it will
+ * set the has_blocked_load flag and trigger another update of idle load.
* Because a cpu that becomes idle, is added to idle_cpus_mask before
* setting the flag, we are sure to not clear the state and not
* check the load of an idle cpu.
@@ -12680,12 +12684,12 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
* Same applies to idle_cpus_mask vs needs_update.
*/
if (flags & NOHZ_STATS_KICK)
- WRITE_ONCE(nohz.has_blocked, 0);
+ WRITE_ONCE(nohz.has_blocked_load, 0);
if (flags & NOHZ_NEXT_KICK)
WRITE_ONCE(nohz.needs_update, 0);
/*
- * Ensures that if we miss the CPU, we must see the has_blocked
+ * Ensures that if we miss the CPU, we must see the has_blocked_load
* store from nohz_balance_enter_idle().
*/
smp_mb();
@@ -12752,7 +12756,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
abort:
/* There is still blocked load, enable periodic update */
if (has_blocked_load)
- WRITE_ONCE(nohz.has_blocked, 1);
+ WRITE_ONCE(nohz.has_blocked_load, 1);
}
/*
@@ -12814,7 +12818,7 @@ static void nohz_newidle_balance(struct rq *this_rq)
return;
/* Don't need to update blocked load of idle CPUs*/
- if (!READ_ONCE(nohz.has_blocked) ||
+ if (!READ_ONCE(nohz.has_blocked_load) ||
time_before(jiffies, READ_ONCE(nohz.next_blocked)))
return;
@@ -12885,29 +12889,28 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
*/
rq_unpin_lock(this_rq, rf);
- rcu_read_lock();
- sd = rcu_dereference_check_sched_domain(this_rq->sd);
- if (!sd) {
- rcu_read_unlock();
+ sd = rcu_dereference_sched_domain(this_rq->sd);
+ if (!sd)
goto out;
- }
if (!get_rd_overloaded(this_rq->rd) ||
this_rq->avg_idle < sd->max_newidle_lb_cost) {
update_next_balance(sd, &next_balance);
- rcu_read_unlock();
goto out;
}
- rcu_read_unlock();
-
- rq_modified_clear(this_rq);
- raw_spin_rq_unlock(this_rq);
+ /*
+ * Include sched_balance_update_blocked_averages() in the cost
+ * calculation because it can be quite costly -- this ensures we skip
+ * it when avg_idle gets to be very low.
+ */
t0 = sched_clock_cpu(this_cpu);
- sched_balance_update_blocked_averages(this_cpu);
+ __sched_balance_update_blocked_averages(this_rq);
+
+ this_rq->next_class = &fair_sched_class;
+ raw_spin_rq_unlock(this_rq);
- rcu_read_lock();
for_each_domain(this_cpu, sd) {
u64 domain_cost;
@@ -12957,7 +12960,6 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
if (pulled_task || !continue_balancing)
break;
}
- rcu_read_unlock();
raw_spin_rq_lock(this_rq);
@@ -12973,7 +12975,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
pulled_task = 1;
/* If a higher prio class was modified, restart the pick */
- if (rq_modified_above(this_rq, &fair_sched_class))
+ if (sched_class_above(this_rq->next_class, &fair_sched_class))
pulled_task = -1;
out:
@@ -13324,8 +13326,8 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
* zero_vruntime_fi, which would have been updated in prior calls
* to se_fi_update().
*/
- delta = (s64)(sea->vruntime - seb->vruntime) +
- (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi);
+ delta = vruntime_op(sea->vruntime, "-", seb->vruntime) +
+ vruntime_op(cfs_rqb->zero_vruntime_fi, "-", cfs_rqa->zero_vruntime_fi);
return delta > 0;
}
@@ -13363,6 +13365,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
entity_tick(cfs_rq, se, queued);
}
+ if (queued) {
+ if (!need_resched())
+ hrtick_start_fair(rq, curr);
+ return;
+ }
+
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
@@ -13871,15 +13879,12 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
* All the scheduling class methods:
*/
DEFINE_SCHED_CLASS(fair) = {
-
- .queue_mask = 2,
-
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
.yield_to_task = yield_to_task_fair,
- .wakeup_preempt = check_preempt_wakeup_fair,
+ .wakeup_preempt = wakeup_preempt_fair,
.pick_task = pick_task_fair,
.pick_next_task = pick_next_task_fair,
@@ -13939,7 +13944,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
struct numa_group *ng;
rcu_read_lock();
- ng = rcu_dereference(p->numa_group);
+ ng = rcu_dereference_all(p->numa_group);
for_each_online_node(node) {
if (p->numa_faults) {
tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index abf8f15d60c9..3681b6ad9276 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -460,6 +460,7 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct t
{
update_curr_idle(rq);
scx_update_idle(rq, false, true);
+ update_rq_avg_idle(rq);
}
static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
@@ -536,15 +537,15 @@ static void update_curr_idle(struct rq *rq)
se->exec_start = now;
dl_server_update_idle(&rq->fair_server, delta_exec);
+#ifdef CONFIG_SCHED_CLASS_EXT
+ dl_server_update_idle(&rq->ext_server, delta_exec);
+#endif
}
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
DEFINE_SCHED_CLASS(idle) = {
-
- .queue_mask = 0,
-
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f1867fe8e5c5..a7680477fa6f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1615,6 +1615,12 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *donor = rq->donor;
+ /*
+ * XXX If we're preempted by DL, queue a push?
+ */
+ if (p->sched_class != &rt_sched_class)
+ return;
+
if (p->prio < donor->prio) {
resched_curr(rq);
return;
@@ -2100,6 +2106,7 @@ static void push_rt_tasks(struct rq *rq)
*/
static int rto_next_cpu(struct root_domain *rd)
{
+ int this_cpu = smp_processor_id();
int next;
int cpu;
@@ -2123,6 +2130,10 @@ static int rto_next_cpu(struct root_domain *rd)
rd->rto_cpu = cpu;
+ /* Do not send IPI to self */
+ if (cpu == this_cpu)
+ continue;
+
if (cpu < nr_cpu_ids)
return cpu;
@@ -2568,9 +2579,6 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu)
#endif /* CONFIG_SCHED_CORE */
DEFINE_SCHED_CLASS(rt) = {
-
- .queue_mask = 4,
-
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 275370854481..62f9278b1663 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -418,6 +418,7 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
extern void sched_init_dl_servers(void);
extern void fair_server_init(struct rq *rq);
+extern void ext_server_init(struct rq *rq);
extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
u64 runtime, u64 period, bool init);
@@ -674,16 +675,16 @@ struct balance_callback {
void (*func)(struct rq *rq);
};
-/* CFS-related fields in a runqueue */
+/* Fair scheduling SCHED_{NORMAL,BATCH,IDLE} related fields in a runqueue: */
struct cfs_rq {
struct load_weight load;
unsigned int nr_queued;
- unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */
- unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */
- unsigned int h_nr_idle; /* SCHED_IDLE */
+ unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int h_nr_idle; /* SCHED_IDLE */
- s64 avg_vruntime;
- u64 avg_load;
+ s64 sum_w_vruntime;
+ u64 sum_weight;
u64 zero_vruntime;
#ifdef CONFIG_SCHED_CORE
@@ -694,7 +695,7 @@ struct cfs_rq {
struct rb_root_cached tasks_timeline;
/*
- * 'curr' points to currently running entity on this cfs_rq.
+ * 'curr' points to the currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
struct sched_entity *curr;
@@ -730,9 +731,7 @@ struct cfs_rq {
unsigned long h_load;
u64 last_h_load_update;
struct sched_entity *h_load_next;
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */
/*
@@ -745,19 +744,19 @@ struct cfs_rq {
*/
int on_list;
struct list_head leaf_cfs_rq_list;
- struct task_group *tg; /* group that "owns" this runqueue */
+ struct task_group *tg; /* Group that "owns" this runqueue */
/* Locally cached copy of our task_group's idle value */
int idle;
-#ifdef CONFIG_CFS_BANDWIDTH
+# ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
s64 runtime_remaining;
u64 throttled_pelt_idle;
-#ifndef CONFIG_64BIT
+# ifndef CONFIG_64BIT
u64 throttled_pelt_idle_copy;
-#endif
+# endif
u64 throttled_clock;
u64 throttled_clock_pelt;
u64 throttled_clock_pelt_time;
@@ -769,7 +768,7 @@ struct cfs_rq {
struct list_head throttled_list;
struct list_head throttled_csd_list;
struct list_head throttled_limbo_list;
-#endif /* CONFIG_CFS_BANDWIDTH */
+# endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
@@ -1121,28 +1120,50 @@ DECLARE_STATIC_KEY_FALSE(sched_uclamp_used);
* acquire operations must be ordered by ascending &runqueue.
*/
struct rq {
- /* runqueue lock: */
- raw_spinlock_t __lock;
-
- /* Per class runqueue modification mask; bits in class order. */
- unsigned int queue_mask;
+ /*
+ * The following members are loaded together, without holding the
+ * rq->lock, in an extremely hot loop in update_sg_lb_stats()
+ * (called from pick_next_task()). To reduce cache pollution from
+ * this operation, they are placed together on this dedicated cache
+ * line. Even though some of them are frequently modified, they are
+ * loaded much more frequently than they are stored.
+ */
unsigned int nr_running;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
- unsigned int numa_migrate_on;
#endif
+ unsigned int ttwu_pending;
+ unsigned long cpu_capacity;
+#ifdef CONFIG_SCHED_PROXY_EXEC
+ struct task_struct __rcu *donor; /* Scheduling context */
+ struct task_struct __rcu *curr; /* Execution context */
+#else
+ union {
+ struct task_struct __rcu *donor; /* Scheduler context */
+ struct task_struct __rcu *curr; /* Execution context */
+ };
+#endif
+ struct task_struct *idle;
+ /* padding left here deliberately */
+
+ /*
+ * The next cacheline holds the (hot) runqueue lock, as well as
+ * some other less performance-critical fields.
+ */
+ u64 nr_switches ____cacheline_aligned;
+
+ /* runqueue lock: */
+ raw_spinlock_t __lock;
+
#ifdef CONFIG_NO_HZ_COMMON
- unsigned long last_blocked_load_update_tick;
- unsigned int has_blocked_load;
- call_single_data_t nohz_csd;
unsigned int nohz_tick_stopped;
atomic_t nohz_flags;
+ unsigned int has_blocked_load;
+ unsigned long last_blocked_load_update_tick;
+ call_single_data_t nohz_csd;
#endif /* CONFIG_NO_HZ_COMMON */
- unsigned int ttwu_pending;
- u64 nr_switches;
-
#ifdef CONFIG_UCLAMP_TASK
/* Utilization clamp values based on CPU's RUNNABLE tasks */
struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned;
@@ -1155,6 +1176,7 @@ struct rq {
struct dl_rq dl;
#ifdef CONFIG_SCHED_CLASS_EXT
struct scx_rq scx;
+ struct sched_dl_entity ext_server;
#endif
struct sched_dl_entity fair_server;
@@ -1165,6 +1187,9 @@ struct rq {
struct list_head *tmp_alone_branch;
#endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_NUMA_BALANCING
+ unsigned int numa_migrate_on;
+#endif
/*
* This is part of a global counter where only the total sum
* over all CPUs matters. A task can increase this counter on
@@ -1173,36 +1198,29 @@ struct rq {
*/
unsigned long nr_uninterruptible;
-#ifdef CONFIG_SCHED_PROXY_EXEC
- struct task_struct __rcu *donor; /* Scheduling context */
- struct task_struct __rcu *curr; /* Execution context */
-#else
- union {
- struct task_struct __rcu *donor; /* Scheduler context */
- struct task_struct __rcu *curr; /* Execution context */
- };
-#endif
struct sched_dl_entity *dl_server;
- struct task_struct *idle;
struct task_struct *stop;
+ const struct sched_class *next_class;
unsigned long next_balance;
struct mm_struct *prev_mm;
- unsigned int clock_update_flags;
- u64 clock;
- /* Ensure that all clocks are in the same cache line */
+ /*
+ * The following fields of clock data are frequently referenced
+ * and updated together, and should go on their own cache line.
+ */
u64 clock_task ____cacheline_aligned;
u64 clock_pelt;
+ u64 clock;
unsigned long lost_idle_time;
+ unsigned int clock_update_flags;
u64 clock_pelt_idle;
u64 clock_idle;
+
#ifndef CONFIG_64BIT
u64 clock_pelt_idle_copy;
u64 clock_idle_copy;
#endif
- atomic_t nr_iowait;
-
u64 last_seen_need_resched_ns;
int ticks_without_resched;
@@ -1213,8 +1231,6 @@ struct rq {
struct root_domain *rd;
struct sched_domain __rcu *sd;
- unsigned long cpu_capacity;
-
struct balance_callback *balance_callback;
unsigned char nohz_idle_balance;
@@ -1324,7 +1340,9 @@ struct rq {
call_single_data_t cfsb_csd;
struct list_head cfsb_csd_list;
#endif
-};
+
+ atomic_t nr_iowait;
+} __no_randomize_layout;
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1698,6 +1716,7 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
#endif /* !CONFIG_FAIR_GROUP_SCHED */
+extern void update_rq_avg_idle(struct rq *rq);
extern void update_rq_clock(struct rq *rq);
/*
@@ -2062,8 +2081,8 @@ queue_balance_callback(struct rq *rq,
rq->balance_callback = head;
}
-#define rcu_dereference_check_sched_domain(p) \
- rcu_dereference_check((p), lockdep_is_held(&sched_domains_mutex))
+#define rcu_dereference_sched_domain(p) \
+ rcu_dereference_all_check((p), lockdep_is_held(&sched_domains_mutex))
/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
@@ -2073,7 +2092,7 @@ queue_balance_callback(struct rq *rq,
* preempt-disabled sections.
*/
#define for_each_domain(cpu, __sd) \
- for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
+ for (__sd = rcu_dereference_sched_domain(cpu_rq(cpu)->sd); \
__sd; __sd = __sd->parent)
/* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */
@@ -2481,15 +2500,6 @@ struct sched_class {
#ifdef CONFIG_UCLAMP_TASK
int uclamp_enabled;
#endif
- /*
- * idle: 0
- * ext: 1
- * fair: 2
- * rt: 4
- * dl: 8
- * stop: 16
- */
- unsigned int queue_mask;
/*
* move_queued_task/activate_task/enqueue_task: rq->lock
@@ -2648,20 +2658,6 @@ struct sched_class {
#endif
};
-/*
- * Does not nest; only used around sched_class::pick_task() rq-lock-breaks.
- */
-static inline void rq_modified_clear(struct rq *rq)
-{
- rq->queue_mask = 0;
-}
-
-static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class)
-{
- unsigned int mask = class->queue_mask;
- return rq->queue_mask & ~((mask << 1) - 1);
-}
-
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
WARN_ON_ONCE(rq->donor != prev);
@@ -3397,11 +3393,11 @@ struct irqtime {
};
DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
-extern int sched_clock_irqtime;
+DECLARE_STATIC_KEY_FALSE(sched_clock_irqtime);
static inline int irqtime_enabled(void)
{
- return sched_clock_irqtime;
+ return static_branch_likely(&sched_clock_irqtime);
}
/*
@@ -4010,6 +4006,7 @@ void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_s
deactivate_task(src_rq, task, 0);
set_task_cpu(task, dst_rq->cpu);
activate_task(dst_rq, task, 0);
+ wakeup_preempt(dst_rq, task, 0);
}
static inline
@@ -4079,6 +4076,7 @@ extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
struct sched_change_ctx {
u64 prio;
struct task_struct *p;
+ const struct sched_class *class;
int flags;
bool queued;
bool running;
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 4f9192be4b5b..f95798baddeb 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -97,9 +97,6 @@ static void update_curr_stop(struct rq *rq)
* Simple, special scheduling class for the per-CPU stop tasks:
*/
DEFINE_SCHED_CLASS(stop) = {
-
- .queue_mask = 16,
-
.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,
.yield_task = yield_task_stop,
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index cf643a5ddedd..ac268da91778 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -508,6 +508,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
if (rq->fair_server.dl_server)
__dl_server_attach_root(&rq->fair_server, rq);
+#ifdef CONFIG_SCHED_CLASS_EXT
+ if (rq->ext_server.dl_server)
+ __dl_server_attach_root(&rq->ext_server, rq);
+#endif
+
rq_unlock_irqrestore(rq, &rf);
if (old_rd)
diff --git a/kernel/sys.c b/kernel/sys.c
index 8b58eece4e58..af71987df81c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -53,6 +53,7 @@
#include <linux/time_namespace.h>
#include <linux/binfmts.h>
#include <linux/futex.h>
+#include <linux/rseq.h>
#include <linux/sched.h>
#include <linux/sched/autogroup.h>
@@ -2868,6 +2869,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_FUTEX_HASH:
error = futex_hash_prctl(arg2, arg3, arg4);
break;
+ case PR_RSEQ_SLICE_EXTENSION:
+ if (arg4 || arg5)
+ return -EINVAL;
+ error = rseq_slice_extension_prctl(arg2, arg3);
+ break;
default:
trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);
error = -EINVAL;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index bf5d05c635ff..add3032da16f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -390,6 +390,7 @@ COND_SYSCALL(setuid16);
/* restartable sequence */
COND_SYSCALL(rseq);
+COND_SYSCALL(rseq_slice_yield);
COND_SYSCALL(uretprobe);
COND_SYSCALL(uprobe);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 1caf02a72ba8..59d22f1bd0a8 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1742,7 +1742,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
lockdep_assert_held(&cpu_base->lock);
- debug_deactivate(timer);
+ debug_hrtimer_deactivate(timer);
base->running = timer;
/*
diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl
index e74868be513c..7a42b32b6577 100644
--- a/scripts/syscall.tbl
+++ b/scripts/syscall.tbl
@@ -411,3 +411,4 @@
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
470 common listns sys_listns
+471 common rseq_slice_yield sys_rseq_slice_yield
diff --git a/tools/testing/selftests/rseq/.gitignore b/tools/testing/selftests/rseq/.gitignore
index 0fda241fa62b..ec01d164c1f0 100644
--- a/tools/testing/selftests/rseq/.gitignore
+++ b/tools/testing/selftests/rseq/.gitignore
@@ -10,3 +10,4 @@ param_test_mm_cid
param_test_mm_cid_benchmark
param_test_mm_cid_compare_twice
syscall_errors_test
+slice_test
diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile
index 0d0a5fae5954..4ef90823b652 100644
--- a/tools/testing/selftests/rseq/Makefile
+++ b/tools/testing/selftests/rseq/Makefile
@@ -17,7 +17,7 @@ OVERRIDE_TARGETS = 1
TEST_GEN_PROGS = basic_test basic_percpu_ops_test basic_percpu_ops_mm_cid_test param_test \
param_test_benchmark param_test_compare_twice param_test_mm_cid \
param_test_mm_cid_benchmark param_test_mm_cid_compare_twice \
- syscall_errors_test
+ syscall_errors_test slice_test
TEST_GEN_PROGS_EXTENDED = librseq.so
@@ -59,3 +59,6 @@ $(OUTPUT)/param_test_mm_cid_compare_twice: param_test.c $(TEST_GEN_PROGS_EXTENDE
$(OUTPUT)/syscall_errors_test: syscall_errors_test.c $(TEST_GEN_PROGS_EXTENDED) \
rseq.h rseq-*.h
$(CC) $(CFLAGS) $< $(LDLIBS) -lrseq -o $@
+
+$(OUTPUT)/slice_test: slice_test.c $(TEST_GEN_PROGS_EXTENDED) rseq.h rseq-*.h
+ $(CC) $(CFLAGS) $< $(LDLIBS) -lrseq -o $@
diff --git a/tools/testing/selftests/rseq/rseq-abi.h b/tools/testing/selftests/rseq/rseq-abi.h
index fb4ec8a75dd4..ecef315204b2 100644
--- a/tools/testing/selftests/rseq/rseq-abi.h
+++ b/tools/testing/selftests/rseq/rseq-abi.h
@@ -53,6 +53,27 @@ struct rseq_abi_cs {
__u64 abort_ip;
} __attribute__((aligned(4 * sizeof(__u64))));
+/**
+ * rseq_abi_slice_ctrl - Time slice extension control structure
+ * @all: Compound value
+ * @request: Request for a time slice extension
+ * @granted: Granted time slice extension
+ *
+ * @request is set by user space and can be cleared by user space or kernel
+ * space. @granted is set and cleared by the kernel and must only be read
+ * by user space.
+ */
+struct rseq_abi_slice_ctrl {
+ union {
+ __u32 all;
+ struct {
+ __u8 request;
+ __u8 granted;
+ __u16 __reserved;
+ };
+ };
+};
+
/*
* struct rseq_abi is aligned on 4 * 8 bytes to ensure it is always
* contained within a single cache-line.
@@ -165,6 +186,12 @@ struct rseq_abi {
__u32 mm_cid;
/*
+ * Time slice extension control structure. CPU local updates from
+ * kernel and user space.
+ */
+ struct rseq_abi_slice_ctrl slice_ctrl;
+
+ /*
* Flexible array member at end of structure, after last feature field.
*/
char end[];
diff --git a/tools/testing/selftests/rseq/rseq-slice-hist.py b/tools/testing/selftests/rseq/rseq-slice-hist.py
new file mode 100644
index 000000000000..b7933eeaefb9
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-slice-hist.py
@@ -0,0 +1,132 @@
+#!/usr/bin/python3
+
+#
+# trace-cmd record -e hrtimer_start -e hrtimer_cancel -e hrtimer_expire_entry -- $cmd
+#
+
+from tracecmd import *
+
+def load_kallsyms(file_path='/proc/kallsyms'):
+ """
+ Parses /proc/kallsyms into a dictionary.
+ Returns: { address_int: symbol_name }
+ """
+ kallsyms_map = {}
+
+ try:
+ with open(file_path, 'r') as f:
+ for line in f:
+ # The format is: [address] [type] [name] [module]
+ parts = line.split()
+ if len(parts) < 3:
+ continue
+
+ addr = int(parts[0], 16)
+ name = parts[2]
+
+ kallsyms_map[addr] = name
+
+ except PermissionError:
+ print(f"Error: Permission denied reading {file_path}. Try running with sudo.")
+ except FileNotFoundError:
+ print(f"Error: {file_path} not found.")
+
+ return kallsyms_map
+
+ksyms = load_kallsyms()
+
+# pending[timer_ptr] = {'ts': timestamp, 'comm': comm}
+pending = {}
+
+# histograms[comm][bucket] = count
+histograms = {}
+
+class OnlineHarmonicMean:
+ def __init__(self):
+ self.n = 0 # Count of elements
+ self.S = 0.0 # Cumulative sum of reciprocals
+
+ def update(self, x):
+ if x == 0:
+ raise ValueError("Harmonic mean is undefined for zero.")
+
+ self.n += 1
+ self.S += 1.0 / x
+ return self.n / self.S
+
+ @property
+ def mean(self):
+ return self.n / self.S if self.n > 0 else 0
+
+ohms = {}
+
+def handle_start(record):
+ func_name = ksyms[record.num_field("function")]
+ if "rseq_slice_expired" in func_name:
+ timer_ptr = record.num_field("hrtimer")
+ pending[timer_ptr] = {
+ 'ts': record.ts,
+ 'comm': record.comm
+ }
+ return None
+
+def handle_cancel(record):
+ timer_ptr = record.num_field("hrtimer")
+
+ if timer_ptr in pending:
+ start_data = pending.pop(timer_ptr)
+ duration_ns = record.ts - start_data['ts']
+ duration_us = duration_ns // 1000
+
+ comm = start_data['comm']
+
+ if comm not in ohms:
+ ohms[comm] = OnlineHarmonicMean()
+
+ ohms[comm].update(duration_ns)
+
+ if comm not in histograms:
+ histograms[comm] = {}
+
+ histograms[comm][duration_us] = histograms[comm].get(duration_us, 0) + 1
+ return None
+
+def handle_expire(record):
+ timer_ptr = record.num_field("hrtimer")
+
+ if timer_ptr in pending:
+ start_data = pending.pop(timer_ptr)
+ comm = start_data['comm']
+
+ if comm not in histograms:
+ histograms[comm] = {}
+
+ # Record -1 bucket for expired (failed to cancel)
+ histograms[comm][-1] = histograms[comm].get(-1, 0) + 1
+ return None
+
+if __name__ == "__main__":
+ t = Trace("trace.dat")
+ for cpu in range(0, t.cpus):
+ ev = t.read_event(cpu)
+ while ev:
+ if "hrtimer_start" in ev.name:
+ handle_start(ev)
+ if "hrtimer_cancel" in ev.name:
+ handle_cancel(ev)
+ if "hrtimer_expire_entry" in ev.name:
+ handle_expire(ev)
+
+ ev = t.read_event(cpu)
+
+ print("\n" + "="*40)
+ print("RSEQ SLICE HISTOGRAM (us)")
+ print("="*40)
+ for comm, buckets in histograms.items():
+ print(f"\nTask: {comm} Mean: {ohms[comm].mean:.3f} ns")
+ print(f" {'Latency (us)':<15} | {'Count'}")
+ print(f" {'-'*30}")
+ # Sort buckets numerically, putting -1 at the top
+ for bucket in sorted(buckets.keys()):
+ label = "EXPIRED" if bucket == -1 else f"{bucket} us"
+ print(f" {label:<15} | {buckets[bucket]}")
diff --git a/tools/testing/selftests/rseq/slice_test.c b/tools/testing/selftests/rseq/slice_test.c
new file mode 100644
index 000000000000..357122dcb487
--- /dev/null
+++ b/tools/testing/selftests/rseq/slice_test.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: LGPL-2.1
+#define _GNU_SOURCE
+#include <assert.h>
+#include <pthread.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <syscall.h>
+#include <unistd.h>
+
+#include <linux/prctl.h>
+#include <sys/prctl.h>
+#include <sys/time.h>
+
+#include "rseq.h"
+
+#include "../kselftest_harness.h"
+
+#ifndef __NR_rseq_slice_yield
+# define __NR_rseq_slice_yield 471
+#endif
+
+#define BITS_PER_INT 32
+#define BITS_PER_BYTE 8
+
+#ifndef PR_RSEQ_SLICE_EXTENSION
+# define PR_RSEQ_SLICE_EXTENSION 79
+# define PR_RSEQ_SLICE_EXTENSION_GET 1
+# define PR_RSEQ_SLICE_EXTENSION_SET 2
+# define PR_RSEQ_SLICE_EXT_ENABLE 0x01
+#endif
+
+#ifndef RSEQ_SLICE_EXT_REQUEST_BIT
+# define RSEQ_SLICE_EXT_REQUEST_BIT 0
+# define RSEQ_SLICE_EXT_GRANTED_BIT 1
+#endif
+
+#ifndef asm_inline
+# define asm_inline asm __inline
+#endif
+
+#define NSEC_PER_SEC 1000000000L
+#define NSEC_PER_USEC 1000L
+
+struct noise_params {
+ int64_t noise_nsecs;
+ int64_t sleep_nsecs;
+ int64_t run;
+};
+
+FIXTURE(slice_ext)
+{
+ pthread_t noise_thread;
+ struct noise_params noise_params;
+};
+
+FIXTURE_VARIANT(slice_ext)
+{
+ int64_t total_nsecs;
+ int64_t slice_nsecs;
+ int64_t noise_nsecs;
+ int64_t sleep_nsecs;
+ bool no_yield;
+};
+
+FIXTURE_VARIANT_ADD(slice_ext, n2_2_50)
+{
+ .total_nsecs = 5LL * NSEC_PER_SEC,
+ .slice_nsecs = 2LL * NSEC_PER_USEC,
+ .noise_nsecs = 2LL * NSEC_PER_USEC,
+ .sleep_nsecs = 50LL * NSEC_PER_USEC,
+};
+
+FIXTURE_VARIANT_ADD(slice_ext, n50_2_50)
+{
+ .total_nsecs = 5LL * NSEC_PER_SEC,
+ .slice_nsecs = 50LL * NSEC_PER_USEC,
+ .noise_nsecs = 2LL * NSEC_PER_USEC,
+ .sleep_nsecs = 50LL * NSEC_PER_USEC,
+};
+
+FIXTURE_VARIANT_ADD(slice_ext, n2_2_50_no_yield)
+{
+ .total_nsecs = 5LL * NSEC_PER_SEC,
+ .slice_nsecs = 2LL * NSEC_PER_USEC,
+ .noise_nsecs = 2LL * NSEC_PER_USEC,
+ .sleep_nsecs = 50LL * NSEC_PER_USEC,
+ .no_yield = true,
+};
+
+
+static inline bool elapsed(struct timespec *start, struct timespec *now,
+ int64_t span)
+{
+ int64_t delta = now->tv_sec - start->tv_sec;
+
+ delta *= NSEC_PER_SEC;
+ delta += now->tv_nsec - start->tv_nsec;
+ return delta >= span;
+}
+
+static void *noise_thread(void *arg)
+{
+ struct noise_params *p = arg;
+
+ while (RSEQ_READ_ONCE(p->run)) {
+ struct timespec ts_start, ts_now;
+
+ clock_gettime(CLOCK_MONOTONIC, &ts_start);
+ do {
+ clock_gettime(CLOCK_MONOTONIC, &ts_now);
+ } while (!elapsed(&ts_start, &ts_now, p->noise_nsecs));
+
+ ts_start.tv_sec = 0;
+ ts_start.tv_nsec = p->sleep_nsecs;
+ clock_nanosleep(CLOCK_MONOTONIC, 0, &ts_start, NULL);
+ }
+ return NULL;
+}
+
+FIXTURE_SETUP(slice_ext)
+{
+ cpu_set_t affinity;
+
+ ASSERT_EQ(sched_getaffinity(0, sizeof(affinity), &affinity), 0);
+
+ /* Pin it on a single CPU. Avoid CPU 0 */
+ for (int i = 1; i < CPU_SETSIZE; i++) {
+ if (!CPU_ISSET(i, &affinity))
+ continue;
+
+ CPU_ZERO(&affinity);
+ CPU_SET(i, &affinity);
+ ASSERT_EQ(sched_setaffinity(0, sizeof(affinity), &affinity), 0);
+ break;
+ }
+
+ ASSERT_EQ(rseq_register_current_thread(), 0);
+
+ ASSERT_EQ(prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET,
+ PR_RSEQ_SLICE_EXT_ENABLE, 0, 0), 0);
+
+ self->noise_params.noise_nsecs = variant->noise_nsecs;
+ self->noise_params.sleep_nsecs = variant->sleep_nsecs;
+ self->noise_params.run = 1;
+
+ ASSERT_EQ(pthread_create(&self->noise_thread, NULL, noise_thread, &self->noise_params), 0);
+}
+
+FIXTURE_TEARDOWN(slice_ext)
+{
+ self->noise_params.run = 0;
+ pthread_join(self->noise_thread, NULL);
+}
+
+TEST_F(slice_ext, slice_test)
+{
+ unsigned long success = 0, yielded = 0, scheduled = 0, raced = 0;
+ unsigned long total = 0, aborted = 0;
+ struct rseq_abi *rs = rseq_get_abi();
+ struct timespec ts_start, ts_now;
+
+ ASSERT_NE(rs, NULL);
+
+ clock_gettime(CLOCK_MONOTONIC, &ts_start);
+ do {
+ struct timespec ts_cs;
+ bool req = false;
+
+ clock_gettime(CLOCK_MONOTONIC, &ts_cs);
+
+ total++;
+ RSEQ_WRITE_ONCE(rs->slice_ctrl.request, 1);
+ do {
+ clock_gettime(CLOCK_MONOTONIC, &ts_now);
+ } while (!elapsed(&ts_cs, &ts_now, variant->slice_nsecs));
+
+ /*
+ * request can be cleared unconditionally, but for making
+ * the stats work this is actually checking it first
+ */
+ if (RSEQ_READ_ONCE(rs->slice_ctrl.request)) {
+ RSEQ_WRITE_ONCE(rs->slice_ctrl.request, 0);
+ /* Race between check and clear! */
+ req = true;
+ success++;
+ }
+
+ if (RSEQ_READ_ONCE(rs->slice_ctrl.granted)) {
+ /* The above raced against a late grant */
+ if (req)
+ success--;
+ if (variant->no_yield) {
+ syscall(__NR_getpid);
+ aborted++;
+ } else {
+ yielded++;
+ if (!syscall(__NR_rseq_slice_yield))
+ raced++;
+ }
+ } else {
+ if (!req)
+ scheduled++;
+ }
+
+ clock_gettime(CLOCK_MONOTONIC, &ts_now);
+ } while (!elapsed(&ts_start, &ts_now, variant->total_nsecs));
+
+ printf("# Total %12ld\n", total);
+ printf("# Success %12ld\n", success);
+ printf("# Yielded %12ld\n", yielded);
+ printf("# Aborted %12ld\n", aborted);
+ printf("# Scheduled %12ld\n", scheduled);
+ printf("# Raced %12ld\n", raced);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
index 5fe45f9c5f8f..2c601a7eaff5 100644
--- a/tools/testing/selftests/sched_ext/Makefile
+++ b/tools/testing/selftests/sched_ext/Makefile
@@ -183,7 +183,9 @@ auto-test-targets := \
select_cpu_dispatch_bad_dsq \
select_cpu_dispatch_dbl_dsp \
select_cpu_vtime \
+ rt_stall \
test_example \
+ total_bw \
testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets)))
diff --git a/tools/testing/selftests/sched_ext/rt_stall.bpf.c b/tools/testing/selftests/sched_ext/rt_stall.bpf.c
new file mode 100644
index 000000000000..80086779dd1e
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/rt_stall.bpf.c
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A scheduler that verified if RT tasks can stall SCHED_EXT tasks.
+ *
+ * Copyright (c) 2025 NVIDIA Corporation.
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+void BPF_STRUCT_OPS(rt_stall_exit, struct scx_exit_info *ei)
+{
+ UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops rt_stall_ops = {
+ .exit = (void *)rt_stall_exit,
+ .name = "rt_stall",
+};
diff --git a/tools/testing/selftests/sched_ext/rt_stall.c b/tools/testing/selftests/sched_ext/rt_stall.c
new file mode 100644
index 000000000000..015200f80f6e
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/rt_stall.c
@@ -0,0 +1,240 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025 NVIDIA Corporation.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sched.h>
+#include <sys/prctl.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <linux/sched.h>
+#include <signal.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <unistd.h>
+#include "rt_stall.bpf.skel.h"
+#include "scx_test.h"
+#include "../kselftest.h"
+
+#define CORE_ID 0 /* CPU to pin tasks to */
+#define RUN_TIME 5 /* How long to run the test in seconds */
+
+/* Simple busy-wait function for test tasks */
+static void process_func(void)
+{
+ while (1) {
+ /* Busy wait */
+ for (volatile unsigned long i = 0; i < 10000000UL; i++)
+ ;
+ }
+}
+
+/* Set CPU affinity to a specific core */
+static void set_affinity(int cpu)
+{
+ cpu_set_t mask;
+
+ CPU_ZERO(&mask);
+ CPU_SET(cpu, &mask);
+ if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
+ perror("sched_setaffinity");
+ exit(EXIT_FAILURE);
+ }
+}
+
+/* Set task scheduling policy and priority */
+static void set_sched(int policy, int priority)
+{
+ struct sched_param param;
+
+ param.sched_priority = priority;
+ if (sched_setscheduler(0, policy, &param) != 0) {
+ perror("sched_setscheduler");
+ exit(EXIT_FAILURE);
+ }
+}
+
+/* Get process runtime from /proc/<pid>/stat */
+static float get_process_runtime(int pid)
+{
+ char path[256];
+ FILE *file;
+ long utime, stime;
+ int fields;
+
+ snprintf(path, sizeof(path), "/proc/%d/stat", pid);
+ file = fopen(path, "r");
+ if (file == NULL) {
+ perror("Failed to open stat file");
+ return -1;
+ }
+
+ /* Skip the first 13 fields and read the 14th and 15th */
+ fields = fscanf(file,
+ "%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %lu %lu",
+ &utime, &stime);
+ fclose(file);
+
+ if (fields != 2) {
+ fprintf(stderr, "Failed to read stat file\n");
+ return -1;
+ }
+
+ /* Calculate the total time spent in the process */
+ long total_time = utime + stime;
+ long ticks_per_second = sysconf(_SC_CLK_TCK);
+ float runtime_seconds = total_time * 1.0 / ticks_per_second;
+
+ return runtime_seconds;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+ struct rt_stall *skel;
+
+ skel = rt_stall__open();
+ SCX_FAIL_IF(!skel, "Failed to open");
+ SCX_ENUM_INIT(skel);
+ SCX_FAIL_IF(rt_stall__load(skel), "Failed to load skel");
+
+ *ctx = skel;
+
+ return SCX_TEST_PASS;
+}
+
+static bool sched_stress_test(bool is_ext)
+{
+ /*
+ * We're expecting the EXT task to get around 5% of CPU time when
+ * competing with the RT task (small 1% fluctuations are expected).
+ *
+ * However, the EXT task should get at least 4% of the CPU to prove
+ * that the EXT deadline server is working correctly. A percentage
+ * less than 4% indicates a bug where RT tasks can potentially
+ * stall SCHED_EXT tasks, causing the test to fail.
+ */
+ const float expected_min_ratio = 0.04; /* 4% */
+ const char *class_str = is_ext ? "EXT" : "FAIR";
+
+ float ext_runtime, rt_runtime, actual_ratio;
+ int ext_pid, rt_pid;
+
+ ksft_print_header();
+ ksft_set_plan(1);
+
+ /* Create and set up a EXT task */
+ ext_pid = fork();
+ if (ext_pid == 0) {
+ set_affinity(CORE_ID);
+ process_func();
+ exit(0);
+ } else if (ext_pid < 0) {
+ perror("fork task");
+ ksft_exit_fail();
+ }
+
+ /* Create an RT task */
+ rt_pid = fork();
+ if (rt_pid == 0) {
+ set_affinity(CORE_ID);
+ set_sched(SCHED_FIFO, 50);
+ process_func();
+ exit(0);
+ } else if (rt_pid < 0) {
+ perror("fork for RT task");
+ ksft_exit_fail();
+ }
+
+ /* Let the processes run for the specified time */
+ sleep(RUN_TIME);
+
+ /* Get runtime for the EXT task */
+ ext_runtime = get_process_runtime(ext_pid);
+ if (ext_runtime == -1)
+ ksft_exit_fail_msg("Error getting runtime for %s task (PID %d)\n",
+ class_str, ext_pid);
+ ksft_print_msg("Runtime of %s task (PID %d) is %f seconds\n",
+ class_str, ext_pid, ext_runtime);
+
+ /* Get runtime for the RT task */
+ rt_runtime = get_process_runtime(rt_pid);
+ if (rt_runtime == -1)
+ ksft_exit_fail_msg("Error getting runtime for RT task (PID %d)\n", rt_pid);
+ ksft_print_msg("Runtime of RT task (PID %d) is %f seconds\n", rt_pid, rt_runtime);
+
+ /* Kill the processes */
+ kill(ext_pid, SIGKILL);
+ kill(rt_pid, SIGKILL);
+ waitpid(ext_pid, NULL, 0);
+ waitpid(rt_pid, NULL, 0);
+
+ /* Verify that the scx task got enough runtime */
+ actual_ratio = ext_runtime / (ext_runtime + rt_runtime);
+ ksft_print_msg("%s task got %.2f%% of total runtime\n",
+ class_str, actual_ratio * 100);
+
+ if (actual_ratio >= expected_min_ratio) {
+ ksft_test_result_pass("PASS: %s task got more than %.2f%% of runtime\n",
+ class_str, expected_min_ratio * 100);
+ return true;
+ }
+ ksft_test_result_fail("FAIL: %s task got less than %.2f%% of runtime\n",
+ class_str, expected_min_ratio * 100);
+ return false;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+ struct rt_stall *skel = ctx;
+ struct bpf_link *link = NULL;
+ bool res;
+ int i;
+
+ /*
+ * Test if the dl_server is working both with and without the
+ * sched_ext scheduler attached.
+ *
+ * This ensures all the scenarios are covered:
+ * - fair_server stop -> ext_server start
+ * - ext_server stop -> fair_server stop
+ */
+ for (i = 0; i < 4; i++) {
+ bool is_ext = i % 2;
+
+ if (is_ext) {
+ memset(&skel->data->uei, 0, sizeof(skel->data->uei));
+ link = bpf_map__attach_struct_ops(skel->maps.rt_stall_ops);
+ SCX_FAIL_IF(!link, "Failed to attach scheduler");
+ }
+ res = sched_stress_test(is_ext);
+ if (is_ext) {
+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE));
+ bpf_link__destroy(link);
+ }
+
+ if (!res)
+ ksft_exit_fail();
+ }
+
+ return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+ struct rt_stall *skel = ctx;
+
+ rt_stall__destroy(skel);
+}
+
+struct scx_test rt_stall = {
+ .name = "rt_stall",
+ .description = "Verify that RT tasks cannot stall SCHED_EXT tasks",
+ .setup = setup,
+ .run = run,
+ .cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&rt_stall)
diff --git a/tools/testing/selftests/sched_ext/total_bw.c b/tools/testing/selftests/sched_ext/total_bw.c
new file mode 100644
index 000000000000..5b0a619bab86
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/total_bw.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test to verify that total_bw value remains consistent across all CPUs
+ * in different BPF program states.
+ *
+ * Copyright (C) 2025 NVIDIA Corporation.
+ */
+#include <bpf/bpf.h>
+#include <errno.h>
+#include <pthread.h>
+#include <scx/common.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "minimal.bpf.skel.h"
+#include "scx_test.h"
+
+#define MAX_CPUS 512
+#define STRESS_DURATION_SEC 5
+
+struct total_bw_ctx {
+ struct minimal *skel;
+ long baseline_bw[MAX_CPUS];
+ int nr_cpus;
+};
+
+static void *cpu_stress_thread(void *arg)
+{
+ volatile int i;
+ time_t end_time = time(NULL) + STRESS_DURATION_SEC;
+
+ while (time(NULL) < end_time)
+ for (i = 0; i < 1000000; i++)
+ ;
+
+ return NULL;
+}
+
+/*
+ * The first enqueue on a CPU causes the DL server to start, for that
+ * reason run stressor threads in the hopes it schedules on all CPUs.
+ */
+static int run_cpu_stress(int nr_cpus)
+{
+ pthread_t *threads;
+ int i, ret = 0;
+
+ threads = calloc(nr_cpus, sizeof(pthread_t));
+ if (!threads)
+ return -ENOMEM;
+
+ /* Create threads to run on each CPU */
+ for (i = 0; i < nr_cpus; i++) {
+ if (pthread_create(&threads[i], NULL, cpu_stress_thread, NULL)) {
+ ret = -errno;
+ fprintf(stderr, "Failed to create thread %d: %s\n", i, strerror(-ret));
+ break;
+ }
+ }
+
+ /* Wait for all threads to complete */
+ for (i = 0; i < nr_cpus; i++) {
+ if (threads[i])
+ pthread_join(threads[i], NULL);
+ }
+
+ free(threads);
+ return ret;
+}
+
+static int read_total_bw_values(long *bw_values, int max_cpus)
+{
+ FILE *fp;
+ char line[256];
+ int cpu_count = 0;
+
+ fp = fopen("/sys/kernel/debug/sched/debug", "r");
+ if (!fp) {
+ SCX_ERR("Failed to open debug file");
+ return -1;
+ }
+
+ while (fgets(line, sizeof(line), fp)) {
+ char *bw_str = strstr(line, "total_bw");
+
+ if (bw_str) {
+ bw_str = strchr(bw_str, ':');
+ if (bw_str) {
+ /* Only store up to max_cpus values */
+ if (cpu_count < max_cpus)
+ bw_values[cpu_count] = atol(bw_str + 1);
+ cpu_count++;
+ }
+ }
+ }
+
+ fclose(fp);
+ return cpu_count;
+}
+
+static bool verify_total_bw_consistency(long *bw_values, int count)
+{
+ int i;
+ long first_value;
+
+ if (count <= 0)
+ return false;
+
+ first_value = bw_values[0];
+
+ for (i = 1; i < count; i++) {
+ if (bw_values[i] != first_value) {
+ SCX_ERR("Inconsistent total_bw: CPU0=%ld, CPU%d=%ld",
+ first_value, i, bw_values[i]);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static int fetch_verify_total_bw(long *bw_values, int nr_cpus)
+{
+ int attempts = 0;
+ int max_attempts = 10;
+ int count;
+
+ /*
+ * The first enqueue on a CPU causes the DL server to start, for that
+ * reason run stressor threads in the hopes it schedules on all CPUs.
+ */
+ if (run_cpu_stress(nr_cpus) < 0) {
+ SCX_ERR("Failed to run CPU stress");
+ return -1;
+ }
+
+ /* Try multiple times to get stable values */
+ while (attempts < max_attempts) {
+ count = read_total_bw_values(bw_values, nr_cpus);
+ fprintf(stderr, "Read %d total_bw values (testing %d CPUs)\n", count, nr_cpus);
+ /* If system has more CPUs than we're testing, that's OK */
+ if (count < nr_cpus) {
+ SCX_ERR("Expected at least %d CPUs, got %d", nr_cpus, count);
+ attempts++;
+ sleep(1);
+ continue;
+ }
+
+ /* Only verify the CPUs we're testing */
+ if (verify_total_bw_consistency(bw_values, nr_cpus)) {
+ fprintf(stderr, "Values are consistent: %ld\n", bw_values[0]);
+ return 0;
+ }
+
+ attempts++;
+ sleep(1);
+ }
+
+ return -1;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+ struct total_bw_ctx *test_ctx;
+
+ if (access("/sys/kernel/debug/sched/debug", R_OK) != 0) {
+ fprintf(stderr, "Skipping test: debugfs sched/debug not accessible\n");
+ return SCX_TEST_SKIP;
+ }
+
+ test_ctx = calloc(1, sizeof(*test_ctx));
+ if (!test_ctx)
+ return SCX_TEST_FAIL;
+
+ test_ctx->nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+ if (test_ctx->nr_cpus <= 0) {
+ free(test_ctx);
+ return SCX_TEST_FAIL;
+ }
+
+ /* If system has more CPUs than MAX_CPUS, just test the first MAX_CPUS */
+ if (test_ctx->nr_cpus > MAX_CPUS)
+ test_ctx->nr_cpus = MAX_CPUS;
+
+ /* Test scenario 1: BPF program not loaded */
+ /* Read and verify baseline total_bw before loading BPF program */
+ fprintf(stderr, "BPF prog initially not loaded, reading total_bw values\n");
+ if (fetch_verify_total_bw(test_ctx->baseline_bw, test_ctx->nr_cpus) < 0) {
+ SCX_ERR("Failed to get stable baseline values");
+ free(test_ctx);
+ return SCX_TEST_FAIL;
+ }
+
+ /* Load the BPF skeleton */
+ test_ctx->skel = minimal__open();
+ if (!test_ctx->skel) {
+ free(test_ctx);
+ return SCX_TEST_FAIL;
+ }
+
+ SCX_ENUM_INIT(test_ctx->skel);
+ if (minimal__load(test_ctx->skel)) {
+ minimal__destroy(test_ctx->skel);
+ free(test_ctx);
+ return SCX_TEST_FAIL;
+ }
+
+ *ctx = test_ctx;
+ return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+ struct total_bw_ctx *test_ctx = ctx;
+ struct bpf_link *link;
+ long loaded_bw[MAX_CPUS];
+ long unloaded_bw[MAX_CPUS];
+ int i;
+
+ /* Test scenario 2: BPF program loaded */
+ link = bpf_map__attach_struct_ops(test_ctx->skel->maps.minimal_ops);
+ if (!link) {
+ SCX_ERR("Failed to attach scheduler");
+ return SCX_TEST_FAIL;
+ }
+
+ fprintf(stderr, "BPF program loaded, reading total_bw values\n");
+ if (fetch_verify_total_bw(loaded_bw, test_ctx->nr_cpus) < 0) {
+ SCX_ERR("Failed to get stable values with BPF loaded");
+ bpf_link__destroy(link);
+ return SCX_TEST_FAIL;
+ }
+ bpf_link__destroy(link);
+
+ /* Test scenario 3: BPF program unloaded */
+ fprintf(stderr, "BPF program unloaded, reading total_bw values\n");
+ if (fetch_verify_total_bw(unloaded_bw, test_ctx->nr_cpus) < 0) {
+ SCX_ERR("Failed to get stable values after BPF unload");
+ return SCX_TEST_FAIL;
+ }
+
+ /* Verify all three scenarios have the same total_bw values */
+ for (i = 0; i < test_ctx->nr_cpus; i++) {
+ if (test_ctx->baseline_bw[i] != loaded_bw[i]) {
+ SCX_ERR("CPU%d: baseline_bw=%ld != loaded_bw=%ld",
+ i, test_ctx->baseline_bw[i], loaded_bw[i]);
+ return SCX_TEST_FAIL;
+ }
+
+ if (test_ctx->baseline_bw[i] != unloaded_bw[i]) {
+ SCX_ERR("CPU%d: baseline_bw=%ld != unloaded_bw=%ld",
+ i, test_ctx->baseline_bw[i], unloaded_bw[i]);
+ return SCX_TEST_FAIL;
+ }
+ }
+
+ fprintf(stderr, "All total_bw values are consistent across all scenarios\n");
+ return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+ struct total_bw_ctx *test_ctx = ctx;
+
+ if (test_ctx) {
+ if (test_ctx->skel)
+ minimal__destroy(test_ctx->skel);
+ free(test_ctx);
+ }
+}
+
+struct scx_test total_bw = {
+ .name = "total_bw",
+ .description = "Verify total_bw consistency across BPF program states",
+ .setup = setup,
+ .run = run,
+ .cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&total_bw)