Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/build_policy.c  |    1
-rw-r--r--  kernel/sched/core.c          |   76
-rw-r--r--  kernel/sched/deadline.c      |  124
-rw-r--r--  kernel/sched/debug.c         |    6
-rw-r--r--  kernel/sched/ext.c           | 1566
-rw-r--r--  kernel/sched/ext.h           |   25
-rw-r--r--  kernel/sched/ext_idle.c      |  174
-rw-r--r--  kernel/sched/ext_internal.h  | 1078
-rw-r--r--  kernel/sched/fair.c          |  498
-rw-r--r--  kernel/sched/pelt.h          |    4
-rw-r--r--  kernel/sched/rq-offsets.c    |   12
-rw-r--r--  kernel/sched/sched.h         |   48
-rw-r--r--  kernel/sched/topology.c      |   75
13 files changed, 2030 insertions, 1657 deletions
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index c4a488e67aa7..755883faf751 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -58,6 +58,7 @@
#include "deadline.c"
#ifdef CONFIG_SCHED_CLASS_EXT
+# include "ext_internal.h"
# include "ext.c"
# include "ext_idle.c"
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index be00629f0ba4..198d2dd45f59 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7,6 +7,8 @@
* Copyright (C) 1991-2002 Linus Torvalds
* Copyright (C) 1998-2024 Ingo Molnar, Red Hat
*/
+#define INSTANTIATE_EXPORTED_MIGRATE_DISABLE
+#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/hrtimer_api.h>
#include <linux/ktime_api.h>
@@ -917,7 +919,7 @@ void hrtick_start(struct rq *rq, u64 delay)
* doesn't make sense and can cause timer DoS.
*/
delta = max_t(s64, delay, 10000LL);
- rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
+ rq->hrtick_time = ktime_add_ns(hrtimer_cb_get_time(timer), delta);
if (rq == this_rq())
__hrtick_restart(rq);
@@ -2381,28 +2383,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
__do_set_cpus_allowed(p, &ac);
}
-void migrate_disable(void)
-{
- struct task_struct *p = current;
-
- if (p->migration_disabled) {
-#ifdef CONFIG_DEBUG_PREEMPT
- /*
- *Warn about overflow half-way through the range.
- */
- WARN_ON_ONCE((s16)p->migration_disabled < 0);
-#endif
- p->migration_disabled++;
- return;
- }
-
- guard(preempt)();
- this_rq()->nr_pinned++;
- p->migration_disabled = 1;
-}
-EXPORT_SYMBOL_GPL(migrate_disable);
-
-void migrate_enable(void)
+void ___migrate_enable(void)
{
struct task_struct *p = current;
struct affinity_context ac = {
@@ -2410,35 +2391,19 @@ void migrate_enable(void)
.flags = SCA_MIGRATE_ENABLE,
};
-#ifdef CONFIG_DEBUG_PREEMPT
- /*
- * Check both overflow from migrate_disable() and superfluous
- * migrate_enable().
- */
- if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
- return;
-#endif
+ __set_cpus_allowed_ptr(p, &ac);
+}
+EXPORT_SYMBOL_GPL(___migrate_enable);
- if (p->migration_disabled > 1) {
- p->migration_disabled--;
- return;
- }
+void migrate_disable(void)
+{
+ __migrate_disable();
+}
+EXPORT_SYMBOL_GPL(migrate_disable);
- /*
- * Ensure stop_task runs either before or after this, and that
- * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
- */
- guard(preempt)();
- if (p->cpus_ptr != &p->cpus_mask)
- __set_cpus_allowed_ptr(p, &ac);
- /*
- * Mustn't clear migration_disabled() until cpus_ptr points back at the
- * regular cpus_mask, otherwise things that race (eg.
- * select_fallback_rq) get confused.
- */
- barrier();
- p->migration_disabled = 0;
- this_rq()->nr_pinned--;
+void migrate_enable(void)
+{
+ __migrate_enable();
}
EXPORT_SYMBOL_GPL(migrate_enable);
@@ -4472,7 +4437,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)
* __sched_fork() is basic setup which is also used by sched_init() to
* initialize the boot CPU's idle task.
*/
-static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
+static void __sched_fork(u64 clone_flags, struct task_struct *p)
{
p->on_rq = 0;
@@ -4490,6 +4455,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
+#ifdef CONFIG_CFS_BANDWIDTH
+ init_cfs_throttle_work(p);
+#endif
#endif
#ifdef CONFIG_SCHEDSTATS
@@ -4707,7 +4675,7 @@ late_initcall(sched_core_sysctl_init);
/*
* fork()/clone()-time setup:
*/
-int sched_fork(unsigned long clone_flags, struct task_struct *p)
+int sched_fork(u64 clone_flags, struct task_struct *p)
{
__sched_fork(clone_flags, p);
/*
@@ -9362,8 +9330,6 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
cgroup_taskset_for_each(task, css, tset)
sched_move_task(task, false);
-
- scx_cgroup_finish_attach();
}
static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset)
@@ -9551,7 +9517,7 @@ static unsigned long tg_weight(struct task_group *tg)
#ifdef CONFIG_FAIR_GROUP_SCHED
return scale_load_down(tg->shares);
#else
- return sched_weight_from_cgroup(tg->scx_weight);
+ return sched_weight_from_cgroup(tg->scx.weight);
#endif
}
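
Note on the core.c changes above: migrate_disable() and migrate_enable() become thin exported wrappers, with the fast paths moving out of this file and only the slow-path affinity restore remaining in ___migrate_enable(). A simplified sketch of what the inline counterpart presumably looks like, reconstructed from the removed lines; the rq->nr_pinned accounting, the debug checks, and the exact header plumbing behind INSTANTIATE_EXPORTED_MIGRATE_DISABLE are elided and assumed, as they are not shown in this diff:

static inline void __migrate_enable(void)
{
	struct task_struct *p = current;

	if (p->migration_disabled > 1) {
		/* still nested, just drop one level */
		p->migration_disabled--;
		return;
	}

	guard(preempt)();
	if (p->cpus_ptr != &p->cpus_mask)
		___migrate_enable();	/* out-of-line affinity restore */
	/*
	 * As in the removed code: migration_disabled must not be cleared
	 * until cpus_ptr points back at the regular cpus_mask.
	 */
	barrier();
	p->migration_disabled = 0;
	/* per-rq nr_pinned bookkeeping elided in this sketch */
}
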
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e2d51f4306b3..615411a0a881 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -875,7 +875,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
*/
if (dl_se->dl_defer && !dl_se->dl_defer_running &&
dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) {
- if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) {
+ if (!is_dl_boosted(dl_se)) {
/*
* Set dl_se->dl_defer_armed and dl_throttled variables to
@@ -1152,8 +1152,6 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
/* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */
static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC;
-static bool dl_server_stopped(struct sched_dl_entity *dl_se);
-
static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se)
{
struct rq *rq = rq_of_dl_se(dl_se);
@@ -1171,12 +1169,6 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_
if (!dl_se->dl_runtime)
return HRTIMER_NORESTART;
- if (!dl_se->server_has_tasks(dl_se)) {
- replenish_dl_entity(dl_se);
- dl_server_stopped(dl_se);
- return HRTIMER_NORESTART;
- }
-
if (dl_se->dl_defer_armed) {
/*
* First check if the server could consume runtime in background.
@@ -1496,10 +1488,12 @@ throttle:
}
if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) {
- if (dl_server(dl_se))
- enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);
- else
+ if (dl_server(dl_se)) {
+ replenish_dl_new_period(dl_se, rq);
+ start_dl_timer(dl_se);
+ } else {
enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH);
+ }
}
if (!is_leftmost(dl_se, &rq->dl))
@@ -1577,10 +1571,8 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p)
void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
{
/* 0 runtime = fair server disabled */
- if (dl_se->dl_runtime) {
- dl_se->dl_server_idle = 0;
+ if (dl_se->dl_runtime)
update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
- }
}
void dl_server_start(struct sched_dl_entity *dl_se)
@@ -1608,26 +1600,10 @@ void dl_server_stop(struct sched_dl_entity *dl_se)
dl_se->dl_server_active = 0;
}
-static bool dl_server_stopped(struct sched_dl_entity *dl_se)
-{
- if (!dl_se->dl_server_active)
- return false;
-
- if (dl_se->dl_server_idle) {
- dl_server_stop(dl_se);
- return true;
- }
-
- dl_se->dl_server_idle = 1;
- return false;
-}
-
void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
- dl_server_has_tasks_f has_tasks,
dl_server_pick_f pick_task)
{
dl_se->rq = rq;
- dl_se->server_has_tasks = has_tasks;
dl_se->server_pick_task = pick_task;
}
@@ -1849,7 +1825,9 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
u64 deadline = dl_se->deadline;
dl_rq->dl_nr_running++;
- add_nr_running(rq_of_dl_rq(dl_rq), 1);
+
+ if (!dl_server(dl_se))
+ add_nr_running(rq_of_dl_rq(dl_rq), 1);
inc_dl_deadline(dl_rq, deadline);
}
@@ -1859,7 +1837,9 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
- sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+
+ if (!dl_server(dl_se))
+ sub_nr_running(rq_of_dl_rq(dl_rq), 1);
dec_dl_deadline(dl_rq, dl_se->deadline);
}
@@ -2388,10 +2368,7 @@ again:
if (dl_server(dl_se)) {
p = dl_se->server_pick_task(dl_se);
if (!p) {
- if (!dl_server_stopped(dl_se)) {
- dl_se->dl_yielded = 1;
- update_curr_dl_se(rq, dl_se, 0);
- }
+ dl_server_stop(dl_se);
goto again;
}
rq->dl_server = dl_se;
@@ -2574,6 +2551,25 @@ static int find_later_rq(struct task_struct *task)
return -1;
}
+static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_dl_tasks(rq))
+ return NULL;
+
+ p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
+
+ WARN_ON_ONCE(rq->cpu != task_cpu(p));
+ WARN_ON_ONCE(task_current(rq, p));
+ WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
+
+ WARN_ON_ONCE(!task_on_rq_queued(p));
+ WARN_ON_ONCE(!dl_task(p));
+
+ return p;
+}
+
/* Locks the rq it finds */
static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
{
@@ -2601,12 +2597,37 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
- if (unlikely(task_rq(task) != rq ||
+ /*
+ * double_lock_balance had to release rq->lock, in the
+ * meantime, task may no longer be fit to be migrated.
+ * Check the following to ensure that the task is
+ * still suitable for migration:
+ * 1. It is possible the task was scheduled,
+ * migrate_disabled was set and then got preempted,
+ * so we must check the task migration disable
+ * flag.
+ * 2. The CPU picked is in the task's affinity.
+ * 3. For throttled task (dl_task_offline_migration),
+ * check the following:
+ * - the task is not on the rq anymore (it was
+ * migrated)
+ * - the task is not on CPU anymore
+ * - the task is still a dl task
+ * - the task is not queued on the rq anymore
+ * 4. For the non-throttled task (push_dl_task), the
+ * check to ensure that this task is still at the
+ * head of the pushable tasks list is enough.
+ */
+ if (unlikely(is_migration_disabled(task) ||
!cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
- task_on_cpu(rq, task) ||
- !dl_task(task) ||
- is_migration_disabled(task) ||
- !task_on_rq_queued(task))) {
+ (task->dl.dl_throttled &&
+ (task_rq(task) != rq ||
+ task_on_cpu(rq, task) ||
+ !dl_task(task) ||
+ !task_on_rq_queued(task))) ||
+ (!task->dl.dl_throttled &&
+ task != pick_next_pushable_dl_task(rq)))) {
+
double_unlock_balance(rq, later_rq);
later_rq = NULL;
break;
@@ -2629,25 +2650,6 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
return later_rq;
}
-static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
-{
- struct task_struct *p;
-
- if (!has_pushable_dl_tasks(rq))
- return NULL;
-
- p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
-
- WARN_ON_ONCE(rq->cpu != task_cpu(p));
- WARN_ON_ONCE(task_current(rq, p));
- WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
-
- WARN_ON_ONCE(!task_on_rq_queued(p));
- WARN_ON_ONCE(!dl_task(p));
-
- return p;
-}
-
/*
* See if the non running -deadline tasks on this rq
* can be sent to some other CPU where they can preempt
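
Note on the deadline.c changes above: the server_has_tasks callback and the idle-based dl_server_stopped() self-stop are removed; a server whose pick callback returns no task is now stopped directly from the pick path, and dl-server entities no longer contribute to the rq's nr_running. Callers of dl_server_init() therefore lose one argument. A sketch of how the fair-server setup in fair.c (part of this commit but not shown in this excerpt) would presumably be adjusted; the function names here are assumptions based on the existing fair server code:

void fair_server_init(struct rq *rq)
{
	struct sched_dl_entity *dl_se = &rq->fair_server;

	init_dl_entity(dl_se);
	dl_server_init(dl_se, rq, fair_server_pick_task);
}
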
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 3f06ab84d53f..02e16b70a790 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -376,10 +376,8 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
return -EINVAL;
}
- if (rq->cfs.h_nr_queued) {
- update_rq_clock(rq);
- dl_server_stop(&rq->fair_server);
- }
+ update_rq_clock(rq);
+ dl_server_stop(&rq->fair_server);
retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0);
if (retval)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7dedc9a16281..2b0e88206d07 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -9,1040 +9,6 @@
#include <linux/btf_ids.h>
#include "ext_idle.h"
-#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
-
-enum scx_consts {
- SCX_DSP_DFL_MAX_BATCH = 32,
- SCX_DSP_MAX_LOOPS = 32,
- SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
-
- SCX_EXIT_BT_LEN = 64,
- SCX_EXIT_MSG_LEN = 1024,
- SCX_EXIT_DUMP_DFL_LEN = 32768,
-
- SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE,
-
- /*
- * Iterating all tasks may take a while. Periodically drop
- * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
- */
- SCX_TASK_ITER_BATCH = 32,
-};
-
-enum scx_exit_kind {
- SCX_EXIT_NONE,
- SCX_EXIT_DONE,
-
- SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */
- SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */
- SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */
- SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */
-
- SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */
- SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */
- SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */
-};
-
-/*
- * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(),
- * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes
- * are 64bit of the format:
- *
- * Bits: [63 .. 48 47 .. 32 31 .. 0]
- * [ SYS ACT ] [ SYS RSN ] [ USR ]
- *
- * SYS ACT: System-defined exit actions
- * SYS RSN: System-defined exit reasons
- * USR : User-defined exit codes and reasons
- *
- * Using the above, users may communicate intention and context by ORing system
- * actions and/or system reasons with a user-defined exit code.
- */
-enum scx_exit_code {
- /* Reasons */
- SCX_ECODE_RSN_HOTPLUG = 1LLU << 32,
-
- /* Actions */
- SCX_ECODE_ACT_RESTART = 1LLU << 48,
-};
-
-/*
- * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
- * being disabled.
- */
-struct scx_exit_info {
- /* %SCX_EXIT_* - broad category of the exit reason */
- enum scx_exit_kind kind;
-
- /* exit code if gracefully exiting */
- s64 exit_code;
-
- /* textual representation of the above */
- const char *reason;
-
- /* backtrace if exiting due to an error */
- unsigned long *bt;
- u32 bt_len;
-
- /* informational message */
- char *msg;
-
- /* debug dump */
- char *dump;
-};
-
-/* sched_ext_ops.flags */
-enum scx_ops_flags {
- /*
- * Keep built-in idle tracking even if ops.update_idle() is implemented.
- */
- SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
-
- /*
- * By default, if there are no other task to run on the CPU, ext core
- * keeps running the current task even after its slice expires. If this
- * flag is specified, such tasks are passed to ops.enqueue() with
- * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
- */
- SCX_OPS_ENQ_LAST = 1LLU << 1,
-
- /*
- * An exiting task may schedule after PF_EXITING is set. In such cases,
- * bpf_task_from_pid() may not be able to find the task and if the BPF
- * scheduler depends on pid lookup for dispatching, the task will be
- * lost leading to various issues including RCU grace period stalls.
- *
- * To mask this problem, by default, unhashed tasks are automatically
- * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
- * depend on pid lookups and wants to handle these tasks directly, the
- * following flag can be used.
- */
- SCX_OPS_ENQ_EXITING = 1LLU << 2,
-
- /*
- * If set, only tasks with policy set to SCHED_EXT are attached to
- * sched_ext. If clear, SCHED_NORMAL tasks are also included.
- */
- SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
-
- /*
- * A migration disabled task can only execute on its current CPU. By
- * default, such tasks are automatically put on the CPU's local DSQ with
- * the default slice on enqueue. If this ops flag is set, they also go
- * through ops.enqueue().
- *
- * A migration disabled task never invokes ops.select_cpu() as it can
- * only select the current CPU. Also, p->cpus_ptr will only contain its
- * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
- * and thus may disagree with cpumask_weight(p->cpus_ptr).
- */
- SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
-
- /*
- * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes
- * ops.enqueue() on the ops.select_cpu() selected or the wakee's
- * previous CPU via IPI (inter-processor interrupt) to reduce cacheline
- * transfers. When this optimization is enabled, ops.select_cpu() is
- * skipped in some cases (when racing against the wakee switching out).
- * As the BPF scheduler may depend on ops.select_cpu() being invoked
- * during wakeups, queued wakeup is disabled by default.
- *
- * If this ops flag is set, queued wakeup optimization is enabled and
- * the BPF scheduler must be able to handle ops.enqueue() invoked on the
- * wakee's CPU without preceding ops.select_cpu() even for tasks which
- * may be executed on multiple CPUs.
- */
- SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5,
-
- /*
- * If set, enable per-node idle cpumasks. If clear, use a single global
- * flat idle cpumask.
- */
- SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6,
-
- /*
- * CPU cgroup support flags
- */
- SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */
-
- SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
- SCX_OPS_ENQ_LAST |
- SCX_OPS_ENQ_EXITING |
- SCX_OPS_ENQ_MIGRATION_DISABLED |
- SCX_OPS_ALLOW_QUEUED_WAKEUP |
- SCX_OPS_SWITCH_PARTIAL |
- SCX_OPS_BUILTIN_IDLE_PER_NODE |
- SCX_OPS_HAS_CGROUP_WEIGHT,
-
- /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
- __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56,
-
- SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56,
-};
-
-/* argument container for ops.init_task() */
-struct scx_init_task_args {
- /*
- * Set if ops.init_task() is being invoked on the fork path, as opposed
- * to the scheduler transition path.
- */
- bool fork;
-#ifdef CONFIG_EXT_GROUP_SCHED
- /* the cgroup the task is joining */
- struct cgroup *cgroup;
-#endif
-};
-
-/* argument container for ops.exit_task() */
-struct scx_exit_task_args {
- /* Whether the task exited before running on sched_ext. */
- bool cancelled;
-};
-
-/* argument container for ops->cgroup_init() */
-struct scx_cgroup_init_args {
- /* the weight of the cgroup [1..10000] */
- u32 weight;
-
- /* bandwidth control parameters from cpu.max and cpu.max.burst */
- u64 bw_period_us;
- u64 bw_quota_us;
- u64 bw_burst_us;
-};
-
-enum scx_cpu_preempt_reason {
- /* next task is being scheduled by &sched_class_rt */
- SCX_CPU_PREEMPT_RT,
- /* next task is being scheduled by &sched_class_dl */
- SCX_CPU_PREEMPT_DL,
- /* next task is being scheduled by &sched_class_stop */
- SCX_CPU_PREEMPT_STOP,
- /* unknown reason for SCX being preempted */
- SCX_CPU_PREEMPT_UNKNOWN,
-};
-
-/*
- * Argument container for ops->cpu_acquire(). Currently empty, but may be
- * expanded in the future.
- */
-struct scx_cpu_acquire_args {};
-
-/* argument container for ops->cpu_release() */
-struct scx_cpu_release_args {
- /* the reason the CPU was preempted */
- enum scx_cpu_preempt_reason reason;
-
- /* the task that's going to be scheduled on the CPU */
- struct task_struct *task;
-};
-
-/*
- * Informational context provided to dump operations.
- */
-struct scx_dump_ctx {
- enum scx_exit_kind kind;
- s64 exit_code;
- const char *reason;
- u64 at_ns;
- u64 at_jiffies;
-};
-
-/**
- * struct sched_ext_ops - Operation table for BPF scheduler implementation
- *
- * A BPF scheduler can implement an arbitrary scheduling policy by
- * implementing and loading operations in this table. Note that a userland
- * scheduling policy can also be implemented using the BPF scheduler
- * as a shim layer.
- */
-struct sched_ext_ops {
- /**
- * @select_cpu: Pick the target CPU for a task which is being woken up
- * @p: task being woken up
- * @prev_cpu: the cpu @p was on before sleeping
- * @wake_flags: SCX_WAKE_*
- *
- * Decision made here isn't final. @p may be moved to any CPU while it
- * is getting dispatched for execution later. However, as @p is not on
- * the rq at this point, getting the eventual execution CPU right here
- * saves a small bit of overhead down the line.
- *
- * If an idle CPU is returned, the CPU is kicked and will try to
- * dispatch. While an explicit custom mechanism can be added,
- * select_cpu() serves as the default way to wake up idle CPUs.
- *
- * @p may be inserted into a DSQ directly by calling
- * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped.
- * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ
- * of the CPU returned by this operation.
- *
- * Note that select_cpu() is never called for tasks that can only run
- * on a single CPU or tasks with migration disabled, as they don't have
- * the option to select a different CPU. See select_task_rq() for
- * details.
- */
- s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
-
- /**
- * @enqueue: Enqueue a task on the BPF scheduler
- * @p: task being enqueued
- * @enq_flags: %SCX_ENQ_*
- *
- * @p is ready to run. Insert directly into a DSQ by calling
- * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly
- * inserted, the bpf scheduler owns @p and if it fails to dispatch @p,
- * the task will stall.
- *
- * If @p was inserted into a DSQ from ops.select_cpu(), this callback is
- * skipped.
- */
- void (*enqueue)(struct task_struct *p, u64 enq_flags);
-
- /**
- * @dequeue: Remove a task from the BPF scheduler
- * @p: task being dequeued
- * @deq_flags: %SCX_DEQ_*
- *
- * Remove @p from the BPF scheduler. This is usually called to isolate
- * the task while updating its scheduling properties (e.g. priority).
- *
- * The ext core keeps track of whether the BPF side owns a given task or
- * not and can gracefully ignore spurious dispatches from BPF side,
- * which makes it safe to not implement this method. However, depending
- * on the scheduling logic, this can lead to confusing behaviors - e.g.
- * scheduling position not being updated across a priority change.
- */
- void (*dequeue)(struct task_struct *p, u64 deq_flags);
-
- /**
- * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs
- * @cpu: CPU to dispatch tasks for
- * @prev: previous task being switched out
- *
- * Called when a CPU's local dsq is empty. The operation should dispatch
- * one or more tasks from the BPF scheduler into the DSQs using
- * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ
- * using scx_bpf_dsq_move_to_local().
- *
- * The maximum number of times scx_bpf_dsq_insert() can be called
- * without an intervening scx_bpf_dsq_move_to_local() is specified by
- * ops.dispatch_max_batch. See the comments on top of the two functions
- * for more details.
- *
- * When not %NULL, @prev is an SCX task with its slice depleted. If
- * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in
- * @prev->scx.flags, it is not enqueued yet and will be enqueued after
- * ops.dispatch() returns. To keep executing @prev, return without
- * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST.
- */
- void (*dispatch)(s32 cpu, struct task_struct *prev);
-
- /**
- * @tick: Periodic tick
- * @p: task running currently
- *
- * This operation is called every 1/HZ seconds on CPUs which are
- * executing an SCX task. Setting @p->scx.slice to 0 will trigger an
- * immediate dispatch cycle on the CPU.
- */
- void (*tick)(struct task_struct *p);
-
- /**
- * @runnable: A task is becoming runnable on its associated CPU
- * @p: task becoming runnable
- * @enq_flags: %SCX_ENQ_*
- *
- * This and the following three functions can be used to track a task's
- * execution state transitions. A task becomes ->runnable() on a CPU,
- * and then goes through one or more ->running() and ->stopping() pairs
- * as it runs on the CPU, and eventually becomes ->quiescent() when it's
- * done running on the CPU.
- *
- * @p is becoming runnable on the CPU because it's
- *
- * - waking up (%SCX_ENQ_WAKEUP)
- * - being moved from another CPU
- * - being restored after temporarily taken off the queue for an
- * attribute change.
- *
- * This and ->enqueue() are related but not coupled. This operation
- * notifies @p's state transition and may not be followed by ->enqueue()
- * e.g. when @p is being dispatched to a remote CPU, or when @p is
- * being enqueued on a CPU experiencing a hotplug event. Likewise, a
- * task may be ->enqueue()'d without being preceded by this operation
- * e.g. after exhausting its slice.
- */
- void (*runnable)(struct task_struct *p, u64 enq_flags);
-
- /**
- * @running: A task is starting to run on its associated CPU
- * @p: task starting to run
- *
- * Note that this callback may be called from a CPU other than the
- * one the task is going to run on. This can happen when a task
- * property is changed (i.e., affinity), since scx_next_task_scx(),
- * which triggers this callback, may run on a CPU different from
- * the task's assigned CPU.
- *
- * Therefore, always use scx_bpf_task_cpu(@p) to determine the
- * target CPU the task is going to use.
- *
- * See ->runnable() for explanation on the task state notifiers.
- */
- void (*running)(struct task_struct *p);
-
- /**
- * @stopping: A task is stopping execution
- * @p: task stopping to run
- * @runnable: is task @p still runnable?
- *
- * Note that this callback may be called from a CPU other than the
- * one the task was running on. This can happen when a task
- * property is changed (i.e., affinity), since dequeue_task_scx(),
- * which triggers this callback, may run on a CPU different from
- * the task's assigned CPU.
- *
- * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU
- * the task was running on.
- *
- * See ->runnable() for explanation on the task state notifiers. If
- * !@runnable, ->quiescent() will be invoked after this operation
- * returns.
- */
- void (*stopping)(struct task_struct *p, bool runnable);
-
- /**
- * @quiescent: A task is becoming not runnable on its associated CPU
- * @p: task becoming not runnable
- * @deq_flags: %SCX_DEQ_*
- *
- * See ->runnable() for explanation on the task state notifiers.
- *
- * @p is becoming quiescent on the CPU because it's
- *
- * - sleeping (%SCX_DEQ_SLEEP)
- * - being moved to another CPU
- * - being temporarily taken off the queue for an attribute change
- * (%SCX_DEQ_SAVE)
- *
- * This and ->dequeue() are related but not coupled. This operation
- * notifies @p's state transition and may not be preceded by ->dequeue()
- * e.g. when @p is being dispatched to a remote CPU.
- */
- void (*quiescent)(struct task_struct *p, u64 deq_flags);
-
- /**
- * @yield: Yield CPU
- * @from: yielding task
- * @to: optional yield target task
- *
- * If @to is NULL, @from is yielding the CPU to other runnable tasks.
- * The BPF scheduler should ensure that other available tasks are
- * dispatched before the yielding task. Return value is ignored in this
- * case.
- *
- * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf
- * scheduler can implement the request, return %true; otherwise, %false.
- */
- bool (*yield)(struct task_struct *from, struct task_struct *to);
-
- /**
- * @core_sched_before: Task ordering for core-sched
- * @a: task A
- * @b: task B
- *
- * Used by core-sched to determine the ordering between two tasks. See
- * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on
- * core-sched.
- *
- * Both @a and @b are runnable and may or may not currently be queued on
- * the BPF scheduler. Should return %true if @a should run before @b.
- * %false if there's no required ordering or @b should run before @a.
- *
- * If not specified, the default is ordering them according to when they
- * became runnable.
- */
- bool (*core_sched_before)(struct task_struct *a, struct task_struct *b);
-
- /**
- * @set_weight: Set task weight
- * @p: task to set weight for
- * @weight: new weight [1..10000]
- *
- * Update @p's weight to @weight.
- */
- void (*set_weight)(struct task_struct *p, u32 weight);
-
- /**
- * @set_cpumask: Set CPU affinity
- * @p: task to set CPU affinity for
- * @cpumask: cpumask of cpus that @p can run on
- *
- * Update @p's CPU affinity to @cpumask.
- */
- void (*set_cpumask)(struct task_struct *p,
- const struct cpumask *cpumask);
-
- /**
- * @update_idle: Update the idle state of a CPU
- * @cpu: CPU to update the idle state for
- * @idle: whether entering or exiting the idle state
- *
- * This operation is called when @rq's CPU goes or leaves the idle
- * state. By default, implementing this operation disables the built-in
- * idle CPU tracking and the following helpers become unavailable:
- *
- * - scx_bpf_select_cpu_dfl()
- * - scx_bpf_select_cpu_and()
- * - scx_bpf_test_and_clear_cpu_idle()
- * - scx_bpf_pick_idle_cpu()
- *
- * The user also must implement ops.select_cpu() as the default
- * implementation relies on scx_bpf_select_cpu_dfl().
- *
- * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
- * tracking.
- */
- void (*update_idle)(s32 cpu, bool idle);
-
- /**
- * @cpu_acquire: A CPU is becoming available to the BPF scheduler
- * @cpu: The CPU being acquired by the BPF scheduler.
- * @args: Acquire arguments, see the struct definition.
- *
- * A CPU that was previously released from the BPF scheduler is now once
- * again under its control.
- */
- void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
-
- /**
- * @cpu_release: A CPU is taken away from the BPF scheduler
- * @cpu: The CPU being released by the BPF scheduler.
- * @args: Release arguments, see the struct definition.
- *
- * The specified CPU is no longer under the control of the BPF
- * scheduler. This could be because it was preempted by a higher
- * priority sched_class, though there may be other reasons as well. The
- * caller should consult @args->reason to determine the cause.
- */
- void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
-
- /**
- * @init_task: Initialize a task to run in a BPF scheduler
- * @p: task to initialize for BPF scheduling
- * @args: init arguments, see the struct definition
- *
- * Either we're loading a BPF scheduler or a new task is being forked.
- * Initialize @p for BPF scheduling. This operation may block and can
- * be used for allocations, and is called exactly once for a task.
- *
- * Return 0 for success, -errno for failure. An error return while
- * loading will abort loading of the BPF scheduler. During a fork, it
- * will abort that specific fork.
- */
- s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
-
- /**
- * @exit_task: Exit a previously-running task from the system
- * @p: task to exit
- * @args: exit arguments, see the struct definition
- *
- * @p is exiting or the BPF scheduler is being unloaded. Perform any
- * necessary cleanup for @p.
- */
- void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);
-
- /**
- * @enable: Enable BPF scheduling for a task
- * @p: task to enable BPF scheduling for
- *
- * Enable @p for BPF scheduling. enable() is called on @p any time it
- * enters SCX, and is always paired with a matching disable().
- */
- void (*enable)(struct task_struct *p);
-
- /**
- * @disable: Disable BPF scheduling for a task
- * @p: task to disable BPF scheduling for
- *
- * @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
- * Disable BPF scheduling for @p. A disable() call is always matched
- * with a prior enable() call.
- */
- void (*disable)(struct task_struct *p);
-
- /**
- * @dump: Dump BPF scheduler state on error
- * @ctx: debug dump context
- *
- * Use scx_bpf_dump() to generate BPF scheduler specific debug dump.
- */
- void (*dump)(struct scx_dump_ctx *ctx);
-
- /**
- * @dump_cpu: Dump BPF scheduler state for a CPU on error
- * @ctx: debug dump context
- * @cpu: CPU to generate debug dump for
- * @idle: @cpu is currently idle without any runnable tasks
- *
- * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
- * @cpu. If @idle is %true and this operation doesn't produce any
- * output, @cpu is skipped for dump.
- */
- void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle);
-
- /**
- * @dump_task: Dump BPF scheduler state for a runnable task on error
- * @ctx: debug dump context
- * @p: runnable task to generate debug dump for
- *
- * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
- * @p.
- */
- void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
-
-#ifdef CONFIG_EXT_GROUP_SCHED
- /**
- * @cgroup_init: Initialize a cgroup
- * @cgrp: cgroup being initialized
- * @args: init arguments, see the struct definition
- *
- * Either the BPF scheduler is being loaded or @cgrp created, initialize
- * @cgrp for sched_ext. This operation may block.
- *
- * Return 0 for success, -errno for failure. An error return while
- * loading will abort loading of the BPF scheduler. During cgroup
- * creation, it will abort the specific cgroup creation.
- */
- s32 (*cgroup_init)(struct cgroup *cgrp,
- struct scx_cgroup_init_args *args);
-
- /**
- * @cgroup_exit: Exit a cgroup
- * @cgrp: cgroup being exited
- *
- * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit
- * @cgrp for sched_ext. This operation my block.
- */
- void (*cgroup_exit)(struct cgroup *cgrp);
-
- /**
- * @cgroup_prep_move: Prepare a task to be moved to a different cgroup
- * @p: task being moved
- * @from: cgroup @p is being moved from
- * @to: cgroup @p is being moved to
- *
- * Prepare @p for move from cgroup @from to @to. This operation may
- * block and can be used for allocations.
- *
- * Return 0 for success, -errno for failure. An error return aborts the
- * migration.
- */
- s32 (*cgroup_prep_move)(struct task_struct *p,
- struct cgroup *from, struct cgroup *to);
-
- /**
- * @cgroup_move: Commit cgroup move
- * @p: task being moved
- * @from: cgroup @p is being moved from
- * @to: cgroup @p is being moved to
- *
- * Commit the move. @p is dequeued during this operation.
- */
- void (*cgroup_move)(struct task_struct *p,
- struct cgroup *from, struct cgroup *to);
-
- /**
- * @cgroup_cancel_move: Cancel cgroup move
- * @p: task whose cgroup move is being canceled
- * @from: cgroup @p was being moved from
- * @to: cgroup @p was being moved to
- *
- * @p was cgroup_prep_move()'d but failed before reaching cgroup_move().
- * Undo the preparation.
- */
- void (*cgroup_cancel_move)(struct task_struct *p,
- struct cgroup *from, struct cgroup *to);
-
- /**
- * @cgroup_set_weight: A cgroup's weight is being changed
- * @cgrp: cgroup whose weight is being updated
- * @weight: new weight [1..10000]
- *
- * Update @cgrp's weight to @weight.
- */
- void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
-
- /**
- * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed
- * @cgrp: cgroup whose bandwidth is being updated
- * @period_us: bandwidth control period
- * @quota_us: bandwidth control quota
- * @burst_us: bandwidth control burst
- *
- * Update @cgrp's bandwidth control parameters. This is from the cpu.max
- * cgroup interface.
- *
- * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled
- * to. For example, if @period_us is 1_000_000 and @quota_us is
- * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be
- * interpreted in the same fashion and specifies how much @cgrp can
- * burst temporarily. The specific control mechanism and thus the
- * interpretation of @period_us and burstiness is upto to the BPF
- * scheduler.
- */
- void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
- u64 period_us, u64 quota_us, u64 burst_us);
-
-#endif /* CONFIG_EXT_GROUP_SCHED */
-
- /*
- * All online ops must come before ops.cpu_online().
- */
-
- /**
- * @cpu_online: A CPU became online
- * @cpu: CPU which just came up
- *
- * @cpu just came online. @cpu will not call ops.enqueue() or
- * ops.dispatch(), nor run tasks associated with other CPUs beforehand.
- */
- void (*cpu_online)(s32 cpu);
-
- /**
- * @cpu_offline: A CPU is going offline
- * @cpu: CPU which is going offline
- *
- * @cpu is going offline. @cpu will not call ops.enqueue() or
- * ops.dispatch(), nor run tasks associated with other CPUs afterwards.
- */
- void (*cpu_offline)(s32 cpu);
-
- /*
- * All CPU hotplug ops must come before ops.init().
- */
-
- /**
- * @init: Initialize the BPF scheduler
- */
- s32 (*init)(void);
-
- /**
- * @exit: Clean up after the BPF scheduler
- * @info: Exit info
- *
- * ops.exit() is also called on ops.init() failure, which is a bit
- * unusual. This is to allow rich reporting through @info on how
- * ops.init() failed.
- */
- void (*exit)(struct scx_exit_info *info);
-
- /**
- * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch
- */
- u32 dispatch_max_batch;
-
- /**
- * @flags: %SCX_OPS_* flags
- */
- u64 flags;
-
- /**
- * @timeout_ms: The maximum amount of time, in milliseconds, that a
- * runnable task should be able to wait before being scheduled. The
- * maximum timeout may not exceed the default timeout of 30 seconds.
- *
- * Defaults to the maximum allowed timeout value of 30 seconds.
- */
- u32 timeout_ms;
-
- /**
- * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default
- * value of 32768 is used.
- */
- u32 exit_dump_len;
-
- /**
- * @hotplug_seq: A sequence number that may be set by the scheduler to
- * detect when a hotplug event has occurred during the loading process.
- * If 0, no detection occurs. Otherwise, the scheduler will fail to
- * load if the sequence number does not match @scx_hotplug_seq on the
- * enable path.
- */
- u64 hotplug_seq;
-
- /**
- * @name: BPF scheduler's name
- *
- * Must be a non-zero valid BPF object name including only isalnum(),
- * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the
- * BPF scheduler is enabled.
- */
- char name[SCX_OPS_NAME_LEN];
-
- /* internal use only, must be NULL */
- void *priv;
-};
-
-enum scx_opi {
- SCX_OPI_BEGIN = 0,
- SCX_OPI_NORMAL_BEGIN = 0,
- SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online),
- SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online),
- SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init),
- SCX_OPI_END = SCX_OP_IDX(init),
-};
-
-/*
- * Collection of event counters. Event types are placed in descending order.
- */
-struct scx_event_stats {
- /*
- * If ops.select_cpu() returns a CPU which can't be used by the task,
- * the core scheduler code silently picks a fallback CPU.
- */
- s64 SCX_EV_SELECT_CPU_FALLBACK;
-
- /*
- * When dispatching to a local DSQ, the CPU may have gone offline in
- * the meantime. In this case, the task is bounced to the global DSQ.
- */
- s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
-
- /*
- * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
- * continued to run because there were no other tasks on the CPU.
- */
- s64 SCX_EV_DISPATCH_KEEP_LAST;
-
- /*
- * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
- * is dispatched to a local DSQ when exiting.
- */
- s64 SCX_EV_ENQ_SKIP_EXITING;
-
- /*
- * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
- * migration disabled task skips ops.enqueue() and is dispatched to its
- * local DSQ.
- */
- s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
-
- /*
- * Total number of times a task's time slice was refilled with the
- * default value (SCX_SLICE_DFL).
- */
- s64 SCX_EV_REFILL_SLICE_DFL;
-
- /*
- * The total duration of bypass modes in nanoseconds.
- */
- s64 SCX_EV_BYPASS_DURATION;
-
- /*
- * The number of tasks dispatched in the bypassing mode.
- */
- s64 SCX_EV_BYPASS_DISPATCH;
-
- /*
- * The number of times the bypassing mode has been activated.
- */
- s64 SCX_EV_BYPASS_ACTIVATE;
-};
-
-struct scx_sched {
- struct sched_ext_ops ops;
- DECLARE_BITMAP(has_op, SCX_OPI_END);
-
- /*
- * Dispatch queues.
- *
- * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability.
- * This is to avoid live-locking in bypass mode where all tasks are
- * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If
- * per-node split isn't sufficient, it can be further split.
- */
- struct rhashtable dsq_hash;
- struct scx_dispatch_q **global_dsqs;
-
- /*
- * The event counters are in a per-CPU variable to minimize the
- * accounting overhead. A system-wide view on the event counter is
- * constructed when requested by scx_bpf_events().
- */
- struct scx_event_stats __percpu *event_stats_cpu;
-
- bool warned_zero_slice;
-
- atomic_t exit_kind;
- struct scx_exit_info *exit_info;
-
- struct kobject kobj;
-
- struct kthread_worker *helper;
- struct irq_work error_irq_work;
- struct kthread_work disable_work;
- struct rcu_work rcu_work;
-};
-
-enum scx_wake_flags {
- /* expose select WF_* flags as enums */
- SCX_WAKE_FORK = WF_FORK,
- SCX_WAKE_TTWU = WF_TTWU,
- SCX_WAKE_SYNC = WF_SYNC,
-};
-
-enum scx_enq_flags {
- /* expose select ENQUEUE_* flags as enums */
- SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP,
- SCX_ENQ_HEAD = ENQUEUE_HEAD,
- SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED,
-
- /* high 32bits are SCX specific */
-
- /*
- * Set the following to trigger preemption when calling
- * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the
- * current task is cleared to zero and the CPU is kicked into the
- * scheduling path. Implies %SCX_ENQ_HEAD.
- */
- SCX_ENQ_PREEMPT = 1LLU << 32,
-
- /*
- * The task being enqueued was previously enqueued on the current CPU's
- * %SCX_DSQ_LOCAL, but was removed from it in a call to the
- * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was
- * invoked in a ->cpu_release() callback, and the task is again
- * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the
- * task will not be scheduled on the CPU until at least the next invocation
- * of the ->cpu_acquire() callback.
- */
- SCX_ENQ_REENQ = 1LLU << 40,
-
- /*
- * The task being enqueued is the only task available for the cpu. By
- * default, ext core keeps executing such tasks but when
- * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the
- * %SCX_ENQ_LAST flag set.
- *
- * The BPF scheduler is responsible for triggering a follow-up
- * scheduling event. Otherwise, Execution may stall.
- */
- SCX_ENQ_LAST = 1LLU << 41,
-
- /* high 8 bits are internal */
- __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56,
-
- SCX_ENQ_CLEAR_OPSS = 1LLU << 56,
- SCX_ENQ_DSQ_PRIQ = 1LLU << 57,
-};
-
-enum scx_deq_flags {
- /* expose select DEQUEUE_* flags as enums */
- SCX_DEQ_SLEEP = DEQUEUE_SLEEP,
-
- /* high 32bits are SCX specific */
-
- /*
- * The generic core-sched layer decided to execute the task even though
- * it hasn't been dispatched yet. Dequeue from the BPF side.
- */
- SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32,
-};
-
-enum scx_pick_idle_cpu_flags {
- SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */
- SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */
-};
-
-enum scx_kick_flags {
- /*
- * Kick the target CPU if idle. Guarantees that the target CPU goes
- * through at least one full scheduling cycle before going idle. If the
- * target CPU can be determined to be currently not idle and going to go
- * through a scheduling cycle before going idle, noop.
- */
- SCX_KICK_IDLE = 1LLU << 0,
-
- /*
- * Preempt the current task and execute the dispatch path. If the
- * current task of the target CPU is an SCX task, its ->scx.slice is
- * cleared to zero before the scheduling path is invoked so that the
- * task expires and the dispatch path is invoked.
- */
- SCX_KICK_PREEMPT = 1LLU << 1,
-
- /*
- * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will
- * return after the target CPU finishes picking the next task.
- */
- SCX_KICK_WAIT = 1LLU << 2,
-};
-
-enum scx_tg_flags {
- SCX_TG_ONLINE = 1U << 0,
- SCX_TG_INITED = 1U << 1,
-};
-
-enum scx_enable_state {
- SCX_ENABLING,
- SCX_ENABLED,
- SCX_DISABLING,
- SCX_DISABLED,
-};
-
-static const char *scx_enable_state_str[] = {
- [SCX_ENABLING] = "enabling",
- [SCX_ENABLED] = "enabled",
- [SCX_DISABLING] = "disabling",
- [SCX_DISABLED] = "disabled",
-};
-
-/*
- * sched_ext_entity->ops_state
- *
- * Used to track the task ownership between the SCX core and the BPF scheduler.
- * State transitions look as follows:
- *
- * NONE -> QUEUEING -> QUEUED -> DISPATCHING
- * ^ | |
- * | v v
- * \-------------------------------/
- *
- * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
- * sites for explanations on the conditions being waited upon and why they are
- * safe. Transitions out of them into NONE or QUEUED must store_release and the
- * waiters should load_acquire.
- *
- * Tracking scx_ops_state enables sched_ext core to reliably determine whether
- * any given task can be dispatched by the BPF scheduler at all times and thus
- * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
- * to try to dispatch any task anytime regardless of its state as the SCX core
- * can safely reject invalid dispatches.
- */
-enum scx_ops_state {
- SCX_OPSS_NONE, /* owned by the SCX core */
- SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */
- SCX_OPSS_QUEUED, /* owned by the BPF scheduler */
- SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */
-
- /*
- * QSEQ brands each QUEUED instance so that, when dispatch races
- * dequeue/requeue, the dispatcher can tell whether it still has a claim
- * on the task being dispatched.
- *
- * As some 32bit archs can't do 64bit store_release/load_acquire,
- * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
- * 32bit machines. The dispatch race window QSEQ protects is very narrow
- * and runs with IRQ disabled. 30 bits should be sufficient.
- */
- SCX_OPSS_QSEQ_SHIFT = 2,
-};
-
-/* Use macros to ensure that the type is unsigned long for the masks */
-#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
-#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
-
/*
* NOTE: sched_ext is in the process of growing multiple scheduler support and
* scx_root usage is in a transitional state. Naked dereferences are safe if the
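
Note: the block removed above (the sched_ext_ops table, the exit/enqueue/deq/kick flag enums, struct scx_sched, etc.) is relocated into kernel/sched/ext_internal.h, which build_policy.c now includes ahead of ext.c and ext_idle.c. As a reminder of the API that documentation describes, here is a minimal BPF scheduler sketch against struct sched_ext_ops. The kfuncs and constants (scx_bpf_dsq_insert(), scx_bpf_dsq_move_to_local(), SCX_DSQ_GLOBAL, SCX_SLICE_DFL) come from the text above; the SEC()/BPF_STRUCT_OPS boilerplate follows the usual scx tooling conventions and is an assumption, not part of this patch:

#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

s32 BPF_STRUCT_OPS(minimal_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	return prev_cpu;	/* keep the task on its previous CPU */
}

void BPF_STRUCT_OPS(minimal_enqueue, struct task_struct *p, u64 enq_flags)
{
	/* park all runnable tasks on the shared global DSQ */
	scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}

void BPF_STRUCT_OPS(minimal_dispatch, s32 cpu, struct task_struct *prev)
{
	/* refill this CPU's local DSQ from the global DSQ when it runs dry */
	scx_bpf_dsq_move_to_local(SCX_DSQ_GLOBAL);
}

SEC(".struct_ops.link")
struct sched_ext_ops minimal_ops = {
	.select_cpu	= (void *)minimal_select_cpu,
	.enqueue	= (void *)minimal_enqueue,
	.dispatch	= (void *)minimal_dispatch,
	.name		= "minimal",
};
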
@@ -1170,7 +136,7 @@ static struct kset *scx_kset;
#include <trace/events/sched_ext.h>
static void process_ddsp_deferred_locals(struct rq *rq);
-static void scx_bpf_kick_cpu(s32 cpu, u64 flags);
+static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
s64 exit_code, const char *fmt, va_list args);
@@ -1185,24 +151,7 @@ static __printf(4, 5) void scx_exit(struct scx_sched *sch,
va_end(args);
}
-static __printf(3, 4) void scx_kf_exit(enum scx_exit_kind kind, s64 exit_code,
- const char *fmt, ...)
-{
- struct scx_sched *sch;
- va_list args;
-
- rcu_read_lock();
- sch = rcu_dereference(scx_root);
- if (sch) {
- va_start(args, fmt);
- scx_vexit(sch, kind, exit_code, fmt, args);
- va_end(args);
- }
- rcu_read_unlock();
-}
-
#define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args)
-#define scx_kf_error(fmt, args...) scx_kf_exit(SCX_EXIT_ERROR, 0, fmt, ##args)
#define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op)
@@ -1232,10 +181,9 @@ static bool u32_before(u32 a, u32 b)
return (s32)(a - b) < 0;
}
-static struct scx_dispatch_q *find_global_dsq(struct task_struct *p)
+static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch,
+ struct task_struct *p)
{
- struct scx_sched *sch = scx_root;
-
return sch->global_dsqs[cpu_to_node(task_cpu(p))];
}
@@ -1363,11 +311,11 @@ do { \
})
/* @mask is constant, always inline to cull unnecessary branches */
-static __always_inline bool scx_kf_allowed(u32 mask)
+static __always_inline bool scx_kf_allowed(struct scx_sched *sch, u32 mask)
{
if (unlikely(!(current->scx.kf_mask & mask))) {
- scx_kf_error("kfunc with mask 0x%x called from an operation only allowing 0x%x",
- mask, current->scx.kf_mask);
+ scx_error(sch, "kfunc with mask 0x%x called from an operation only allowing 0x%x",
+ mask, current->scx.kf_mask);
return false;
}
@@ -1380,13 +328,13 @@ static __always_inline bool scx_kf_allowed(u32 mask)
*/
if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
(current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
- scx_kf_error("cpu_release kfunc called from a nested operation");
+ scx_error(sch, "cpu_release kfunc called from a nested operation");
return false;
}
if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
(current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
- scx_kf_error("dispatch kfunc called from a nested operation");
+ scx_error(sch, "dispatch kfunc called from a nested operation");
return false;
}
@@ -1394,15 +342,16 @@ static __always_inline bool scx_kf_allowed(u32 mask)
}
/* see SCX_CALL_OP_TASK() */
-static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask,
+static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch,
+ u32 mask,
struct task_struct *p)
{
- if (!scx_kf_allowed(mask))
+ if (!scx_kf_allowed(sch, mask))
return false;
if (unlikely((p != current->scx.kf_tasks[0] &&
p != current->scx.kf_tasks[1]))) {
- scx_kf_error("called on a task not being operated on");
+ scx_error(sch, "called on a task not being operated on");
return false;
}
@@ -1488,10 +437,11 @@ struct bpf_iter_scx_dsq {
*/
struct scx_task_iter {
struct sched_ext_entity cursor;
- struct task_struct *locked;
+ struct task_struct *locked_task;
struct rq *rq;
struct rq_flags rf;
u32 cnt;
+ bool list_locked;
};
/**
@@ -1519,15 +469,16 @@ static void scx_task_iter_start(struct scx_task_iter *iter)
iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
list_add(&iter->cursor.tasks_node, &scx_tasks);
- iter->locked = NULL;
+ iter->locked_task = NULL;
iter->cnt = 0;
+ iter->list_locked = true;
}
static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
{
- if (iter->locked) {
- task_rq_unlock(iter->rq, iter->locked, &iter->rf);
- iter->locked = NULL;
+ if (iter->locked_task) {
+ task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
+ iter->locked_task = NULL;
}
}
@@ -1537,24 +488,24 @@ static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
*
* If @iter is in the middle of a locked iteration, it may be locking the rq of
* the task currently being visited in addition to scx_tasks_lock. Unlock both.
- * This function can be safely called anytime during an iteration.
+ * This function can be safely called anytime during an iteration. The next
+ * iterator operation will automatically restore the necessary locking.
*/
static void scx_task_iter_unlock(struct scx_task_iter *iter)
{
__scx_task_iter_rq_unlock(iter);
- spin_unlock_irq(&scx_tasks_lock);
+ if (iter->list_locked) {
+ iter->list_locked = false;
+ spin_unlock_irq(&scx_tasks_lock);
+ }
}
-/**
- * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock()
- * @iter: iterator to re-lock
- *
- * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it
- * doesn't re-lock the rq lock. Must be called before other iterator operations.
- */
-static void scx_task_iter_relock(struct scx_task_iter *iter)
+static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter)
{
- spin_lock_irq(&scx_tasks_lock);
+ if (!iter->list_locked) {
+ spin_lock_irq(&scx_tasks_lock);
+ iter->list_locked = true;
+ }
}
/**
@@ -1567,6 +518,7 @@ static void scx_task_iter_relock(struct scx_task_iter *iter)
*/
static void scx_task_iter_stop(struct scx_task_iter *iter)
{
+ __scx_task_iter_maybe_relock(iter);
list_del_init(&iter->cursor.tasks_node);
scx_task_iter_unlock(iter);
}
@@ -1584,10 +536,12 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
struct list_head *cursor = &iter->cursor.tasks_node;
struct sched_ext_entity *pos;
+ __scx_task_iter_maybe_relock(iter);
+
if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) {
scx_task_iter_unlock(iter);
cond_resched();
- scx_task_iter_relock(iter);
+ __scx_task_iter_maybe_relock(iter);
}
list_for_each_entry(pos, cursor, tasks_node) {
@@ -1650,7 +604,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
return NULL;
iter->rq = task_rq_lock(p, &iter->rf);
- iter->locked = p;
+ iter->locked_task = p;
return p;
}
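
Note on the iterator rework above: the explicit scx_task_iter_relock() is replaced by lazy re-locking tracked via ->list_locked, so a caller that drops the locks mid-walk no longer has to pair the unlock with a relock. A sketch of the resulting caller pattern, modeled on how the iterator is used in this file (illustrative, not a hunk from this patch):

struct scx_task_iter iter;
struct task_struct *p;

scx_task_iter_start(&iter);
while ((p = scx_task_iter_next_locked(&iter))) {
	/* @p's rq and scx_tasks_lock are held here; operate on @p */

	scx_task_iter_unlock(&iter);	/* drop both locks if needed... */
	/* ...no explicit relock: the next iteration re-acquires them */
}
scx_task_iter_stop(&iter);
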
@@ -1664,7 +618,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
* This can be used when preemption is not disabled.
*/
#define scx_add_event(sch, name, cnt) do { \
- this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \
+ this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \
trace_sched_ext_event(#name, (cnt)); \
} while(0)
@@ -1677,7 +631,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
* This should be used only when preemption is disabled.
*/
#define __scx_add_event(sch, name, cnt) do { \
- __this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \
+ __this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \
trace_sched_ext_event(#name, cnt); \
} while(0)
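
Note: the event-counter accessors now reach the per-CPU stats through sch->pcpu->event_stats instead of a dedicated event_stats_cpu pointer, which suggests the counters were folded into a per-CPU container on scx_sched, roughly along these lines (an assumption; the actual struct layout is not shown in this excerpt):

struct scx_sched_pcpu {
	struct scx_event_stats		event_stats;
};

struct scx_sched {
	/* ... */
	struct scx_sched_pcpu __percpu	*pcpu;
	/* ... */
};
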
@@ -1766,23 +720,6 @@ static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where)
}
/**
- * kf_cpu_valid - Verify a CPU number, to be used on kfunc input args
- * @cpu: cpu number which came from a BPF ops
- * @where: extra information reported on error
- *
- * The same as ops_cpu_valid() but @sch is implicit.
- */
-static bool kf_cpu_valid(u32 cpu, const char *where)
-{
- if (__cpu_valid(cpu)) {
- return true;
- } else {
- scx_kf_error("invalid CPU %d%s%s", cpu, where ? " " : "", where ?: "");
- return false;
- }
-}
-
-/**
* ops_sanitize_err - Sanitize a -errno value
* @sch: scx_sched to error out on error
* @ops_name: operation to blame on failure
@@ -1942,10 +879,10 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
WRITE_ONCE(dsq->nr, dsq->nr + delta);
}
-static void refill_task_slice_dfl(struct task_struct *p)
+static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
{
p->scx.slice = SCX_SLICE_DFL;
- __scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1);
+ __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);
}
static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
@@ -1963,7 +900,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
scx_error(sch, "attempting to dispatch to a destroyed dsq");
/* fall back to the global dsq */
raw_spin_unlock(&dsq->lock);
- dsq = find_global_dsq(p);
+ dsq = find_global_dsq(sch, p);
raw_spin_lock(&dsq->lock);
}
}
@@ -2142,26 +1079,27 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch,
s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
- return find_global_dsq(p);
+ return find_global_dsq(sch, p);
return &cpu_rq(cpu)->scx.local_dsq;
}
if (dsq_id == SCX_DSQ_GLOBAL)
- dsq = find_global_dsq(p);
+ dsq = find_global_dsq(sch, p);
else
dsq = find_user_dsq(sch, dsq_id);
if (unlikely(!dsq)) {
scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]",
dsq_id, p->comm, p->pid);
- return find_global_dsq(p);
+ return find_global_dsq(sch, p);
}
return dsq;
}
-static void mark_direct_dispatch(struct task_struct *ddsp_task,
+static void mark_direct_dispatch(struct scx_sched *sch,
+ struct task_struct *ddsp_task,
struct task_struct *p, u64 dsq_id,
u64 enq_flags)
{
@@ -2175,10 +1113,10 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,
/* @p must match the task on the enqueue path */
if (unlikely(p != ddsp_task)) {
if (IS_ERR(ddsp_task))
- scx_kf_error("%s[%d] already direct-dispatched",
+ scx_error(sch, "%s[%d] already direct-dispatched",
p->comm, p->pid);
else
- scx_kf_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
+ scx_error(sch, "scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
ddsp_task->comm, ddsp_task->pid,
p->comm, p->pid);
return;
@@ -2333,15 +1271,15 @@ local:
* higher priority it becomes from scx_prio_less()'s POV.
*/
touch_core_sched(rq, p);
- refill_task_slice_dfl(p);
+ refill_task_slice_dfl(sch, p);
local_norefill:
dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags);
return;
global:
touch_core_sched(rq, p); /* see the comment in local: */
- refill_task_slice_dfl(p);
- dispatch_enqueue(sch, find_global_dsq(p), p, enq_flags);
+ refill_task_slice_dfl(sch, p);
+ dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags);
}
static bool task_runnable(const struct task_struct *p)
@@ -2651,8 +1589,7 @@ static bool task_can_run_on_remote_rq(struct scx_sched *sch,
if (!scx_rq_online(rq)) {
if (enforce)
- __scx_add_event(scx_root,
- SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
+ __scx_add_event(sch, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
return false;
}
@@ -2754,7 +1691,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
if (src_rq != dst_rq &&
unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
- dst_dsq = find_global_dsq(p);
+ dst_dsq = find_global_dsq(sch, p);
dst_rq = src_rq;
}
} else {
@@ -2910,7 +1847,7 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
if (src_rq != dst_rq &&
unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
- dispatch_enqueue(sch, find_global_dsq(p), p,
+ dispatch_enqueue(sch, find_global_dsq(sch, p), p,
enq_flags | SCX_ENQ_CLEAR_OPSS);
return;
}
@@ -3155,10 +2092,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* balance(), we want to complete this scheduling cycle and then
* start a new one. IOW, we want to call resched_curr() on the
* next, most likely idle, task, not the current one. Use
- * scx_bpf_kick_cpu() for deferred kicking.
+ * scx_kick_cpu() for deferred kicking.
*/
if (unlikely(!--nr_loops)) {
- scx_bpf_kick_cpu(cpu_of(rq), 0);
+ scx_kick_cpu(sch, cpu_of(rq), 0);
break;
}
} while (dspc->nr_tasks);
@@ -3442,24 +2379,25 @@ static struct task_struct *pick_task_scx(struct rq *rq)
if (keep_prev) {
p = prev;
if (!p->scx.slice)
- refill_task_slice_dfl(p);
+ refill_task_slice_dfl(rcu_dereference_sched(scx_root), p);
} else {
p = first_local_task(rq);
if (!p) {
if (kick_idle)
- scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE);
+ scx_kick_cpu(rcu_dereference_sched(scx_root),
+ cpu_of(rq), SCX_KICK_IDLE);
return NULL;
}
if (unlikely(!p->scx.slice)) {
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch = rcu_dereference_sched(scx_root);
if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) {
printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n",
p->comm, p->pid, __func__);
sch->warned_zero_slice = true;
}
- refill_task_slice_dfl(p);
+ refill_task_slice_dfl(sch, p);
}
}
@@ -3548,7 +2486,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0);
if (cpu >= 0) {
- refill_task_slice_dfl(p);
+ refill_task_slice_dfl(sch, p);
p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
} else {
cpu = prev_cpu;
@@ -4084,7 +3022,7 @@ bool scx_can_stop_tick(struct rq *rq)
#ifdef CONFIG_EXT_GROUP_SCHED
-DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem);
+DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_ops_rwsem);
static bool scx_cgroup_enabled;
void scx_tg_init(struct task_group *tg)
@@ -4101,8 +3039,6 @@ int scx_tg_online(struct task_group *tg)
WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED));
- percpu_down_read(&scx_cgroup_rwsem);
-
if (scx_cgroup_enabled) {
if (SCX_HAS_OP(sch, cgroup_init)) {
struct scx_cgroup_init_args args =
@@ -4122,7 +3058,6 @@ int scx_tg_online(struct task_group *tg)
tg->scx.flags |= SCX_TG_ONLINE;
}
- percpu_up_read(&scx_cgroup_rwsem);
return ret;
}
@@ -4132,15 +3067,11 @@ void scx_tg_offline(struct task_group *tg)
WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE));
- percpu_down_read(&scx_cgroup_rwsem);
-
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) &&
(tg->scx.flags & SCX_TG_INITED))
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
tg->css.cgroup);
tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
-
- percpu_up_read(&scx_cgroup_rwsem);
}
int scx_cgroup_can_attach(struct cgroup_taskset *tset)
@@ -4150,9 +3081,6 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
struct task_struct *p;
int ret;
- /* released in scx_finish/cancel_attach() */
- percpu_down_read(&scx_cgroup_rwsem);
-
if (!scx_cgroup_enabled)
return 0;
@@ -4192,7 +3120,6 @@ err:
p->scx.cgrp_moving_from = NULL;
}
- percpu_up_read(&scx_cgroup_rwsem);
return ops_sanitize_err(sch, "cgroup_prep_move", ret);
}
@@ -4215,11 +3142,6 @@ void scx_cgroup_move_task(struct task_struct *p)
p->scx.cgrp_moving_from = NULL;
}
-void scx_cgroup_finish_attach(void)
-{
- percpu_up_read(&scx_cgroup_rwsem);
-}
-
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
struct scx_sched *sch = scx_root;
@@ -4227,7 +3149,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
struct task_struct *p;
if (!scx_cgroup_enabled)
- goto out_unlock;
+ return;
cgroup_taskset_for_each(p, css, tset) {
if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
@@ -4236,15 +3158,13 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
p, p->scx.cgrp_moving_from, css->cgroup);
p->scx.cgrp_moving_from = NULL;
}
-out_unlock:
- percpu_up_read(&scx_cgroup_rwsem);
}
void scx_group_set_weight(struct task_group *tg, unsigned long weight)
{
struct scx_sched *sch = scx_root;
- percpu_down_read(&scx_cgroup_rwsem);
+ percpu_down_read(&scx_cgroup_ops_rwsem);
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) &&
tg->scx.weight != weight)
@@ -4253,7 +3173,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
tg->scx.weight = weight;
- percpu_up_read(&scx_cgroup_rwsem);
+ percpu_up_read(&scx_cgroup_ops_rwsem);
}
void scx_group_set_idle(struct task_group *tg, bool idle)
@@ -4266,7 +3186,7 @@ void scx_group_set_bandwidth(struct task_group *tg,
{
struct scx_sched *sch = scx_root;
- percpu_down_read(&scx_cgroup_rwsem);
+ percpu_down_read(&scx_cgroup_ops_rwsem);
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) &&
(tg->scx.bw_period_us != period_us ||
@@ -4279,23 +3199,25 @@ void scx_group_set_bandwidth(struct task_group *tg,
tg->scx.bw_quota_us = quota_us;
tg->scx.bw_burst_us = burst_us;
- percpu_up_read(&scx_cgroup_rwsem);
+ percpu_up_read(&scx_cgroup_ops_rwsem);
}
static void scx_cgroup_lock(void)
{
- percpu_down_write(&scx_cgroup_rwsem);
+ percpu_down_write(&scx_cgroup_ops_rwsem);
+ cgroup_lock();
}
static void scx_cgroup_unlock(void)
{
- percpu_up_write(&scx_cgroup_rwsem);
+ cgroup_unlock();
+ percpu_up_write(&scx_cgroup_ops_rwsem);
}
#else /* CONFIG_EXT_GROUP_SCHED */
-static inline void scx_cgroup_lock(void) {}
-static inline void scx_cgroup_unlock(void) {}
+static void scx_cgroup_lock(void) {}
+static void scx_cgroup_unlock(void) {}
#endif /* CONFIG_EXT_GROUP_SCHED */
@@ -4411,15 +3333,12 @@ static void scx_cgroup_exit(struct scx_sched *sch)
{
struct cgroup_subsys_state *css;
- percpu_rwsem_assert_held(&scx_cgroup_rwsem);
-
scx_cgroup_enabled = false;
/*
- * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
+ * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk
* cgroups and exit all the inited ones, all online cgroups are exited.
*/
- rcu_read_lock();
css_for_each_descendant_post(css, &root_task_group.css) {
struct task_group *tg = css_tg(css);
@@ -4430,17 +3349,9 @@ static void scx_cgroup_exit(struct scx_sched *sch)
if (!sch->ops.cgroup_exit)
continue;
- if (WARN_ON_ONCE(!css_tryget(css)))
- continue;
- rcu_read_unlock();
-
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
css->cgroup);
-
- rcu_read_lock();
- css_put(css);
}
- rcu_read_unlock();
}
static int scx_cgroup_init(struct scx_sched *sch)
@@ -4448,13 +3359,10 @@ static int scx_cgroup_init(struct scx_sched *sch)
struct cgroup_subsys_state *css;
int ret;
- percpu_rwsem_assert_held(&scx_cgroup_rwsem);
-
/*
- * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
+ * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk
* cgroups and init, all online cgroups are initialized.
*/
- rcu_read_lock();
css_for_each_descendant_pre(css, &root_task_group.css) {
struct task_group *tg = css_tg(css);
struct scx_cgroup_init_args args = {
@@ -4473,10 +3381,6 @@ static int scx_cgroup_init(struct scx_sched *sch)
continue;
}
- if (WARN_ON_ONCE(!css_tryget(css)))
- continue;
- rcu_read_unlock();
-
ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL,
css->cgroup, &args);
if (ret) {
@@ -4485,11 +3389,7 @@ static int scx_cgroup_init(struct scx_sched *sch)
return ret;
}
tg->scx.flags |= SCX_TG_INITED;
-
- rcu_read_lock();
- css_put(css);
}
- rcu_read_unlock();
WARN_ON_ONCE(scx_cgroup_enabled);
scx_cgroup_enabled = true;
@@ -4572,7 +3472,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
int node;
kthread_stop(sch->helper->task);
- free_percpu(sch->event_stats_cpu);
+ free_percpu(sch->pcpu);
for_each_node_state(node, N_POSSIBLE)
kfree(sch->global_dsqs[node]);
@@ -4671,9 +3571,22 @@ bool task_should_scx(int policy)
bool scx_allow_ttwu_queue(const struct task_struct *p)
{
- return !scx_enabled() ||
- (scx_root->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) ||
- p->sched_class != &ext_sched_class;
+ struct scx_sched *sch;
+
+ if (!scx_enabled())
+ return true;
+
+ sch = rcu_dereference_sched(scx_root);
+ if (unlikely(!sch))
+ return true;
+
+ if (sch->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
+ return true;
+
+ if (unlikely(p->sched_class != &ext_sched_class))
+ return true;
+
+ return false;
}
/**
@@ -4789,7 +3702,7 @@ static void scx_clear_softlockup(void)
*
* - pick_next_task() suppresses zero slice warning.
*
- * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
+ * - scx_kick_cpu() is disabled to avoid irq_work malfunction during PM
* operations.
*
* - scx_prio_less() reverts to the default core_sched_at order.
@@ -5234,7 +4147,8 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u",
p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
- dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
+ dump_line(s, " cpus=%*pb no_mig=%u", cpumask_pr_args(p->cpus_ptr),
+ p->migration_disabled);
if (SCX_HAS_OP(sch, dump_task)) {
ops_dump_init(s, " ");
@@ -5473,13 +4387,13 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
sch->global_dsqs[node] = dsq;
}
- sch->event_stats_cpu = alloc_percpu(struct scx_event_stats);
- if (!sch->event_stats_cpu)
+ sch->pcpu = alloc_percpu(struct scx_sched_pcpu);
+ if (!sch->pcpu)
goto err_free_gdsqs;
sch->helper = kthread_run_worker(0, "sched_ext_helper");
if (!sch->helper)
- goto err_free_event_stats;
+ goto err_free_pcpu;
sched_set_fifo(sch->helper->task);
atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
@@ -5497,8 +4411,8 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
err_stop_helper:
kthread_stop(sch->helper->task);
-err_free_event_stats:
- free_percpu(sch->event_stats_cpu);
+err_free_pcpu:
+ free_percpu(sch->pcpu);
err_free_gdsqs:
for_each_node_state(node, N_POSSIBLE)
kfree(sch->global_dsqs[node]);
@@ -5621,6 +4535,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_error(sch, "ops.init() failed (%d)", ret);
goto err_disable;
}
+ sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
}
for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
@@ -5713,7 +4628,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
ret = scx_init_task(p, task_group(p), false);
if (ret) {
put_task_struct(p);
- scx_task_iter_relock(&sti);
scx_task_iter_stop(&sti);
scx_error(sch, "ops.init_task() failed (%d) for %s[%d]",
ret, p->comm, p->pid);
@@ -5723,7 +4637,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_set_task_state(p, SCX_TASK_READY);
put_task_struct(p);
- scx_task_iter_relock(&sti);
}
scx_task_iter_stop(&sti);
scx_cgroup_unlock();
@@ -5749,6 +4662,9 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
__setscheduler_class(p->policy, p->prio);
struct sched_enq_and_set_ctx ctx;
+ if (!tryget_task_struct(p))
+ continue;
+
if (old_class != new_class && p->se.sched_delayed)
dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
@@ -5761,6 +4677,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
sched_enq_and_set_task(&ctx);
check_class_changed(task_rq(p), p, old_class, p->prio);
+ put_task_struct(p);
}
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
@@ -5791,7 +4708,7 @@ err_unlock:
err_disable_unlock_all:
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
- scx_bypass(false);
+ /* we'll soon enter disable path, keep bypass on */
err_disable:
mutex_unlock(&scx_enable_mutex);
/*
@@ -6324,40 +5241,41 @@ void __init init_sched_ext_class(void)
/********************************************************************************
* Helpers that can be called from the BPF scheduler.
*/
-static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
+static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p,
+ u64 enq_flags)
{
- if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
+ if (!scx_kf_allowed(sch, SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
return false;
lockdep_assert_irqs_disabled();
if (unlikely(!p)) {
- scx_kf_error("called with NULL task");
+ scx_error(sch, "called with NULL task");
return false;
}
if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) {
- scx_kf_error("invalid enq_flags 0x%llx", enq_flags);
+ scx_error(sch, "invalid enq_flags 0x%llx", enq_flags);
return false;
}
return true;
}
-static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id,
- u64 enq_flags)
+static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p,
+ u64 dsq_id, u64 enq_flags)
{
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
struct task_struct *ddsp_task;
ddsp_task = __this_cpu_read(direct_dispatch_task);
if (ddsp_task) {
- mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags);
+ mark_direct_dispatch(sch, ddsp_task, p, dsq_id, enq_flags);
return;
}
if (unlikely(dspc->cursor >= scx_dsp_max_batch)) {
- scx_kf_error("dispatch buffer overflow");
+ scx_error(sch, "dispatch buffer overflow");
return;
}
@@ -6409,7 +5327,14 @@ __bpf_kfunc_start_defs();
__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice,
u64 enq_flags)
{
- if (!scx_dsq_insert_preamble(p, enq_flags))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
+ if (!scx_dsq_insert_preamble(sch, p, enq_flags))
return;
if (slice)
@@ -6417,7 +5342,7 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice
else
p->scx.slice = p->scx.slice ?: 1;
- scx_dsq_insert_commit(p, dsq_id, enq_flags);
+ scx_dsq_insert_commit(sch, p, dsq_id, enq_flags);
}
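A minimal BPF-side ops.enqueue() built on this kfunc might look like the sketch
below; the BPF_STRUCT_OPS() wrapper from the in-tree tools/sched_ext headers is
assumed and MY_SHARED_DSQ is a hypothetical DSQ created by the scheduler in
ops.init():

	#define MY_SHARED_DSQ	0	/* hypothetical user DSQ id */

	void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
	{
		/* hand every runnable task to the shared DSQ with the default slice */
		scx_bpf_dsq_insert(p, MY_SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
	}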
/**
@@ -6444,7 +5369,14 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice
__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
u64 slice, u64 vtime, u64 enq_flags)
{
- if (!scx_dsq_insert_preamble(p, enq_flags))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
+ if (!scx_dsq_insert_preamble(sch, p, enq_flags))
return;
if (slice)
@@ -6454,7 +5386,7 @@ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
p->scx.dsq_vtime = vtime;
- scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+ scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
}
__bpf_kfunc_end_defs();
@@ -6479,7 +5411,8 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
bool in_balance;
unsigned long flags;
- if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH))
+ if (!scx_kf_allowed_if_unlocked() &&
+ !scx_kf_allowed(sch, SCX_KF_DISPATCH))
return false;
/*
@@ -6564,7 +5497,15 @@ __bpf_kfunc_start_defs();
*/
__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void)
{
- if (!scx_kf_allowed(SCX_KF_DISPATCH))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return 0;
+
+ if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
return 0;
return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor);
@@ -6579,14 +5520,21 @@ __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void)
__bpf_kfunc void scx_bpf_dispatch_cancel(void)
{
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+ struct scx_sched *sch;
+
+ guard(rcu)();
- if (!scx_kf_allowed(SCX_KF_DISPATCH))
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
+ if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
return;
if (dspc->cursor > 0)
dspc->cursor--;
else
- scx_kf_error("dispatch buffer underflow");
+ scx_error(sch, "dispatch buffer underflow");
}
/**
@@ -6605,11 +5553,17 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void)
*/
__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
{
- struct scx_sched *sch = scx_root;
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
struct scx_dispatch_q *dsq;
+ struct scx_sched *sch;
- if (!scx_kf_allowed(SCX_KF_DISPATCH))
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return false;
+
+ if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
return false;
flush_dispatch_buf(sch, dspc->rq);
@@ -6756,12 +5710,18 @@ __bpf_kfunc_start_defs();
*/
__bpf_kfunc u32 scx_bpf_reenqueue_local(void)
{
+ struct scx_sched *sch;
LIST_HEAD(tasks);
u32 nr_enqueued = 0;
struct rq *rq;
struct task_struct *p, *n;
- if (!scx_kf_allowed(SCX_KF_CPU_RELEASE))
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return 0;
+
+ if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE))
return 0;
rq = cpu_rq(smp_processor_id());
@@ -6784,12 +5744,8 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void)
* CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
* the current local DSQ for running tasks and thus are not
* visible to the BPF scheduler.
- *
- * Also skip re-enqueueing tasks that can only run on this
- * CPU, as they would just be re-added to the same local
- * DSQ without any benefit.
*/
- if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1)
+ if (p->migration_pending)
continue;
dispatch_dequeue(rq, p);
@@ -6877,22 +5833,12 @@ static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = {
__bpf_kfunc_start_defs();
-/**
- * scx_bpf_kick_cpu - Trigger reschedule on a CPU
- * @cpu: cpu to kick
- * @flags: %SCX_KICK_* flags
- *
- * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or
- * trigger rescheduling on a busy CPU. This can be called from any online
- * scx_ops operation and the actual kicking is performed asynchronously through
- * an irq work.
- */
-__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
+static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags)
{
struct rq *this_rq;
unsigned long irq_flags;
- if (!kf_cpu_valid(cpu, NULL))
+ if (!ops_cpu_valid(sch, cpu, NULL))
return;
local_irq_save(irq_flags);
@@ -6916,7 +5862,7 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
struct rq *target_rq = cpu_rq(cpu);
if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT)))
- scx_kf_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
+ scx_error(sch, "PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
if (raw_spin_rq_trylock(target_rq)) {
if (can_skip_idle_kick(target_rq)) {
@@ -6941,6 +5887,26 @@ out:
}
/**
+ * scx_bpf_kick_cpu - Trigger reschedule on a CPU
+ * @cpu: cpu to kick
+ * @flags: %SCX_KICK_* flags
+ *
+ * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or
+ * trigger rescheduling on a busy CPU. This can be called from any online
+ * scx_ops operation and the actual kicking is performed asynchronously through
+ * an irq work.
+ */
+__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (likely(sch))
+ scx_kick_cpu(sch, cpu, flags);
+}
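As a rough usage sketch (hypothetical follow-up to an enqueue that inserted @p
into a shared DSQ), a BPF scheduler could pair the insertion with a deferred
kick so an idle CPU goes and dispatches the task:

	s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);

	if (cpu >= 0)
		scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);	/* wake it up to dispatch */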
+
+/**
* scx_bpf_dsq_nr_queued - Return the number of queued tasks
* @dsq_id: id of the DSQ
*
@@ -7120,28 +6086,29 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
__bpf_kfunc_end_defs();
-static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
- char *fmt, unsigned long long *data, u32 data__sz)
+static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf,
+ size_t line_size, char *fmt, unsigned long long *data,
+ u32 data__sz)
{
struct bpf_bprintf_data bprintf_data = { .get_bin_args = true };
s32 ret;
if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 ||
(data__sz && !data)) {
- scx_kf_error("invalid data=%p and data__sz=%u", (void *)data, data__sz);
+ scx_error(sch, "invalid data=%p and data__sz=%u", (void *)data, data__sz);
return -EINVAL;
}
ret = copy_from_kernel_nofault(data_buf, data, data__sz);
if (ret < 0) {
- scx_kf_error("failed to read data fields (%d)", ret);
+ scx_error(sch, "failed to read data fields (%d)", ret);
return ret;
}
ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8,
&bprintf_data);
if (ret < 0) {
- scx_kf_error("format preparation failed (%d)", ret);
+ scx_error(sch, "format preparation failed (%d)", ret);
return ret;
}
@@ -7149,17 +6116,17 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
bprintf_data.bin_args);
bpf_bprintf_cleanup(&bprintf_data);
if (ret < 0) {
- scx_kf_error("(\"%s\", %p, %u) failed to format", fmt, data, data__sz);
+ scx_error(sch, "(\"%s\", %p, %u) failed to format", fmt, data, data__sz);
return ret;
}
return ret;
}
-static s32 bstr_format(struct scx_bstr_buf *buf,
+static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf,
char *fmt, unsigned long long *data, u32 data__sz)
{
- return __bstr_format(buf->data, buf->line, sizeof(buf->line),
+ return __bstr_format(sch, buf->data, buf->line, sizeof(buf->line),
fmt, data, data__sz);
}
@@ -7178,11 +6145,14 @@ __bpf_kfunc_start_defs();
__bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
unsigned long long *data, u32 data__sz)
{
+ struct scx_sched *sch;
unsigned long flags;
raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
- if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
- scx_kf_exit(SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line);
+ sch = rcu_dereference_bh(scx_root);
+ if (likely(sch) &&
+ bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
+ scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line);
raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}
@@ -7198,11 +6168,14 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
__bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
u32 data__sz)
{
+ struct scx_sched *sch;
unsigned long flags;
raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
- if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
- scx_kf_exit(SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line);
+ sch = rcu_dereference_bh(scx_root);
+ if (likely(sch) &&
+ bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
+ scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line);
raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}
@@ -7221,17 +6194,24 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
__bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
u32 data__sz)
{
+ struct scx_sched *sch;
struct scx_dump_data *dd = &scx_dump_data;
struct scx_bstr_buf *buf = &dd->buf;
s32 ret;
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
if (raw_smp_processor_id() != dd->cpu) {
- scx_kf_error("scx_bpf_dump() must only be called from ops.dump() and friends");
+ scx_error(sch, "scx_bpf_dump() must only be called from ops.dump() and friends");
return;
}
/* append the formatted string to the line buf */
- ret = __bstr_format(buf->data, buf->line + dd->cursor,
+ ret = __bstr_format(sch, buf->data, buf->line + dd->cursor,
sizeof(buf->line) - dd->cursor, fmt, data, data__sz);
if (ret < 0) {
dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)",
@@ -7267,7 +6247,12 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
*/
__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
{
- if (kf_cpu_valid(cpu, NULL))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
return arch_scale_cpu_capacity(cpu);
else
return SCX_CPUPERF_ONE;
@@ -7289,7 +6274,12 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
*/
__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
{
- if (kf_cpu_valid(cpu, NULL))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
return arch_scale_freq_capacity(cpu);
else
return SCX_CPUPERF_ONE;
@@ -7311,12 +6301,20 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
*/
__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+	sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
if (unlikely(perf > SCX_CPUPERF_ONE)) {
- scx_kf_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
+ scx_error(sch, "Invalid cpuperf target %u for CPU %d", perf, cpu);
return;
}
- if (kf_cpu_valid(cpu, NULL)) {
+ if (ops_cpu_valid(sch, cpu, NULL)) {
struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq();
struct rq_flags rf;
@@ -7325,7 +6323,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
* to the corresponding CPU to prevent ABBA deadlocks.
*/
if (locked_rq && rq != locked_rq) {
- scx_kf_error("Invalid target CPU %d", cpu);
+ scx_error(sch, "Invalid target CPU %d", cpu);
return;
}
@@ -7420,13 +6418,76 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
*/
__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu)
{
- if (!kf_cpu_valid(cpu, NULL))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
return NULL;
+ if (!ops_cpu_valid(sch, cpu, NULL))
+ return NULL;
+
+ if (!sch->warned_deprecated_rq) {
+ printk_deferred(KERN_WARNING "sched_ext: %s() is deprecated; "
+ "use scx_bpf_locked_rq() when holding rq lock "
+ "or scx_bpf_cpu_curr() to read remote curr safely.\n", __func__);
+ sch->warned_deprecated_rq = true;
+ }
+
return cpu_rq(cpu);
}
/**
+ * scx_bpf_locked_rq - Return the rq currently locked by SCX
+ *
+ * Returns the rq if a rq lock is currently held by SCX.
+ * Otherwise emits an error and returns NULL.
+ */
+__bpf_kfunc struct rq *scx_bpf_locked_rq(void)
+{
+ struct scx_sched *sch;
+ struct rq *rq;
+
+ guard(preempt)();
+
+ sch = rcu_dereference_sched(scx_root);
+ if (unlikely(!sch))
+ return NULL;
+
+ rq = scx_locked_rq();
+ if (!rq) {
+ scx_error(sch, "accessing rq without holding rq lock");
+ return NULL;
+ }
+
+ return rq;
+}
+
+/**
+ * scx_bpf_cpu_curr - Return remote CPU's curr task
+ * @cpu: CPU of interest
+ *
+ * Callers must hold RCU read lock (KF_RCU).
+ */
+__bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return NULL;
+
+ if (!ops_cpu_valid(sch, cpu, NULL))
+ return NULL;
+
+ return rcu_dereference(cpu_rq(cpu)->curr);
+}
+
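A minimal sketch of moving a BPF scheduler off the deprecated scx_bpf_cpu_rq()
when only the remote CPU's current task is needed; bpf_rcu_read_lock()/unlock()
and bpf_printk() from the usual BPF headers are assumed:

	struct task_struct *curr;

	/* old pattern: curr = scx_bpf_cpu_rq(cpu)->curr */
	bpf_rcu_read_lock();
	curr = scx_bpf_cpu_curr(cpu);		/* may be NULL */
	if (curr)
		bpf_printk("cpu%d curr %s[%d]", cpu, curr->comm, curr->pid);
	bpf_rcu_read_unlock();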
+/**
* scx_bpf_task_cgroup - Return the sched cgroup of a task
* @p: task of interest
*
@@ -7442,8 +6503,15 @@ __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p)
{
struct task_group *tg = p->sched_task_group;
struct cgroup *cgrp = &cgrp_dfl_root.cgrp;
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ goto out;
- if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p))
+ if (!scx_kf_allowed_on_arg_tasks(sch, __SCX_KF_RQ_LOCKED, p))
goto out;
cgrp = tg_cgrp(tg);
@@ -7524,7 +6592,7 @@ static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *event
/* Aggregate per-CPU event counters into @events. */
memset(events, 0, sizeof(*events));
for_each_possible_cpu(cpu) {
- e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu);
+ e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats;
scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
@@ -7590,6 +6658,8 @@ BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
+BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_RET_NULL)
+BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED)
#ifdef CONFIG_CGROUP_SCHED
BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
#endif
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 292bb41a242e..43429b33e52c 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -8,29 +8,6 @@
*/
#ifdef CONFIG_SCHED_CLASS_EXT
-static inline bool scx_kf_allowed_if_unlocked(void)
-{
- return !current->scx.kf_mask;
-}
-
-static inline bool scx_rq_bypassing(struct rq *rq)
-{
- return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
-}
-
-DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
-
-DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
-
-/*
- * Return the rq currently locked from an scx callback, or NULL if no rq is
- * locked.
- */
-static inline struct rq *scx_locked_rq(void)
-{
- return __this_cpu_read(scx_locked_rq_state);
-}
-
void scx_tick(struct rq *rq);
void init_scx_entity(struct sched_ext_entity *scx);
void scx_pre_fork(struct task_struct *p);
@@ -100,7 +77,6 @@ int scx_tg_online(struct task_group *tg);
void scx_tg_offline(struct task_group *tg);
int scx_cgroup_can_attach(struct cgroup_taskset *tset);
void scx_cgroup_move_task(struct task_struct *p);
-void scx_cgroup_finish_attach(void);
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
void scx_group_set_idle(struct task_group *tg, bool idle);
@@ -111,7 +87,6 @@ static inline int scx_tg_online(struct task_group *tg) { return 0; }
static inline void scx_tg_offline(struct task_group *tg) {}
static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; }
static inline void scx_cgroup_move_task(struct task_struct *p) {}
-static inline void scx_cgroup_finish_attach(void) {}
static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
static inline void scx_group_set_idle(struct task_group *tg, bool idle) {}
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 7174e1c1a392..d2434c954848 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -819,10 +819,10 @@ void scx_idle_disable(void)
* Helpers that can be called from the BPF scheduler.
*/
-static int validate_node(int node)
+static int validate_node(struct scx_sched *sch, int node)
{
if (!static_branch_likely(&scx_builtin_idle_per_node)) {
- scx_kf_error("per-node idle tracking is disabled");
+ scx_error(sch, "per-node idle tracking is disabled");
return -EOPNOTSUPP;
}
@@ -832,13 +832,13 @@ static int validate_node(int node)
/* Make sure node is in a valid range */
if (node < 0 || node >= nr_node_ids) {
- scx_kf_error("invalid node %d", node);
+ scx_error(sch, "invalid node %d", node);
return -EINVAL;
}
/* Make sure the node is part of the set of possible nodes */
if (!node_possible(node)) {
- scx_kf_error("unavailable node %d", node);
+ scx_error(sch, "unavailable node %d", node);
return -EINVAL;
}
@@ -847,26 +847,53 @@ static int validate_node(int node)
__bpf_kfunc_start_defs();
-static bool check_builtin_idle_enabled(void)
+static bool check_builtin_idle_enabled(struct scx_sched *sch)
{
if (static_branch_likely(&scx_builtin_idle_enabled))
return true;
- scx_kf_error("built-in idle tracking is disabled");
+ scx_error(sch, "built-in idle tracking is disabled");
return false;
}
-static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+/*
+ * Determine whether @p is a migration-disabled task in the context of BPF
+ * code.
+ *
+ * We can't simply check whether @p->migration_disabled is set in a
+ * sched_ext callback, because migration is always disabled for the current
+ * task while running BPF code.
+ *
+ * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) respectively
+ * disable and re-enable migration. For this reason, the current task
+ * inside a sched_ext callback is always a migration-disabled task.
+ *
+ * Therefore, when @p->migration_disabled == 1, check whether @p is the
+ * current task or not: if it is, then migration was not disabled before
+ * entering the callback, otherwise migration was disabled.
+ *
+ * Returns true if @p is migration-disabled, false otherwise.
+ */
+static bool is_bpf_migration_disabled(const struct task_struct *p)
+{
+ if (p->migration_disabled == 1)
+ return p != current;
+ else
+ return p->migration_disabled;
+}
+
+static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
+ s32 prev_cpu, u64 wake_flags,
const struct cpumask *allowed, u64 flags)
{
struct rq *rq;
struct rq_flags rf;
s32 cpu;
- if (!kf_cpu_valid(prev_cpu, NULL))
+ if (!ops_cpu_valid(sch, prev_cpu, NULL))
return -EINVAL;
- if (!check_builtin_idle_enabled())
+ if (!check_builtin_idle_enabled(sch))
return -EBUSY;
/*
@@ -879,7 +906,7 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f
if (scx_kf_allowed_if_unlocked()) {
rq = task_rq_lock(p, &rf);
} else {
- if (!scx_kf_allowed(SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE))
+ if (!scx_kf_allowed(sch, SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE))
return -EPERM;
rq = scx_locked_rq();
}
@@ -898,7 +925,7 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f
* selection optimizations and simply check whether the previously
* used CPU is idle and within the allowed cpumask.
*/
- if (p->nr_cpus_allowed == 1 || is_migration_disabled(p)) {
+ if (p->nr_cpus_allowed == 1 || is_bpf_migration_disabled(p)) {
if (cpumask_test_cpu(prev_cpu, allowed ?: p->cpus_ptr) &&
scx_idle_test_and_clear_cpu(prev_cpu))
cpu = prev_cpu;
@@ -922,9 +949,13 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f
*/
__bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
{
- if (!kf_cpu_valid(cpu, NULL))
- return NUMA_NO_NODE;
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch) || !ops_cpu_valid(sch, cpu, NULL))
+ return NUMA_NO_NODE;
return cpu_to_node(cpu);
}
@@ -946,15 +977,21 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
u64 wake_flags, bool *is_idle)
{
+ struct scx_sched *sch;
s32 cpu;
- cpu = select_cpu_from_kfunc(p, prev_cpu, wake_flags, NULL, 0);
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ cpu = select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags, NULL, 0);
if (cpu >= 0) {
*is_idle = true;
return cpu;
}
*is_idle = false;
-
return prev_cpu;
}
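The pattern this helper is meant for, i.e. ops.select_cpu() delegating to the
default idle selection and direct-dispatching when an idle CPU is found, could
look roughly like this hypothetical sketch (BPF_STRUCT_OPS() wrapper assumed):

	s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
			   s32 prev_cpu, u64 wake_flags)
	{
		bool is_idle = false;
		s32 cpu;

		cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
		if (is_idle)	/* idle CPU found: put @p straight on its local DSQ */
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

		return cpu;
	}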
@@ -981,7 +1018,16 @@ __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
__bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
const struct cpumask *cpus_allowed, u64 flags)
{
- return select_cpu_from_kfunc(p, prev_cpu, wake_flags, cpus_allowed, flags);
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ return select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags,
+ cpus_allowed, flags);
}
/**
@@ -995,7 +1041,15 @@ __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
{
- node = validate_node(node);
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return cpu_none_mask;
+
+ node = validate_node(sch, node);
if (node < 0)
return cpu_none_mask;
@@ -1011,12 +1065,20 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return cpu_none_mask;
+
if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
- scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
return cpu_none_mask;
}
- if (!check_builtin_idle_enabled())
+ if (!check_builtin_idle_enabled(sch))
return cpu_none_mask;
return idle_cpumask(NUMA_NO_NODE)->cpu;
@@ -1034,7 +1096,15 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
{
- node = validate_node(node);
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return cpu_none_mask;
+
+ node = validate_node(sch, node);
if (node < 0)
return cpu_none_mask;
@@ -1054,12 +1124,20 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return cpu_none_mask;
+
if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
- scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
return cpu_none_mask;
}
- if (!check_builtin_idle_enabled())
+ if (!check_builtin_idle_enabled(sch))
return cpu_none_mask;
if (sched_smt_active())
@@ -1095,10 +1173,18 @@ __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
*/
__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
{
- if (!check_builtin_idle_enabled())
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
return false;
- if (!kf_cpu_valid(cpu, NULL))
+ if (!check_builtin_idle_enabled(sch))
+ return false;
+
+ if (!ops_cpu_valid(sch, cpu, NULL))
return false;
return scx_idle_test_and_clear_cpu(cpu);
@@ -1126,7 +1212,15 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
__bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed,
int node, u64 flags)
{
- node = validate_node(node);
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ node = validate_node(sch, node);
if (node < 0)
return node;
@@ -1158,12 +1252,20 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed,
__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
u64 flags)
{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
- scx_kf_error("per-node idle tracking is enabled");
+ scx_error(sch, "per-node idle tracking is enabled");
return -EBUSY;
}
- if (!check_builtin_idle_enabled())
+ if (!check_builtin_idle_enabled(sch))
return -EBUSY;
return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags);
@@ -1193,9 +1295,16 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
__bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed,
int node, u64 flags)
{
+ struct scx_sched *sch;
s32 cpu;
- node = validate_node(node);
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ node = validate_node(sch, node);
if (node < 0)
return node;
@@ -1233,10 +1342,17 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed,
__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
u64 flags)
{
+ struct scx_sched *sch;
s32 cpu;
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
- scx_kf_error("per-node idle tracking is enabled");
+ scx_error(sch, "per-node idle tracking is enabled");
return -EBUSY;
}
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
new file mode 100644
index 000000000000..b3617abed510
--- /dev/null
+++ b/kernel/sched/ext_internal.h
@@ -0,0 +1,1078 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
+ */
+#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
+
+enum scx_consts {
+ SCX_DSP_DFL_MAX_BATCH = 32,
+ SCX_DSP_MAX_LOOPS = 32,
+ SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
+
+ SCX_EXIT_BT_LEN = 64,
+ SCX_EXIT_MSG_LEN = 1024,
+ SCX_EXIT_DUMP_DFL_LEN = 32768,
+
+ SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE,
+
+ /*
+ * Iterating all tasks may take a while. Periodically drop
+ * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
+ */
+ SCX_TASK_ITER_BATCH = 32,
+};
+
+enum scx_exit_kind {
+ SCX_EXIT_NONE,
+ SCX_EXIT_DONE,
+
+ SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */
+ SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */
+ SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */
+ SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */
+
+ SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */
+ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */
+ SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */
+};
+
+/*
+ * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(),
+ * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes
+ * are 64bit of the format:
+ *
+ * Bits: [63 .. 48 47 .. 32 31 .. 0]
+ * [ SYS ACT ] [ SYS RSN ] [ USR ]
+ *
+ * SYS ACT: System-defined exit actions
+ * SYS RSN: System-defined exit reasons
+ * USR : User-defined exit codes and reasons
+ *
+ * Using the above, users may communicate intention and context by ORing system
+ * actions and/or system reasons with a user-defined exit code.
+ */
+enum scx_exit_code {
+ /* Reasons */
+ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32,
+
+ /* Actions */
+ SCX_ECODE_ACT_RESTART = 1LLU << 48,
+};
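A small sketch of composing and decoding an exit code with this layout
(MY_USR_CODE is hypothetical):

	#define MY_USR_CODE	42ULL				/* bits 31..0, user-defined */

	u64 code = SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG | MY_USR_CODE;

	bool restart = code & SCX_ECODE_ACT_RESTART;		/* system-defined action */
	bool hotplug = code & SCX_ECODE_RSN_HOTPLUG;		/* system-defined reason */
	u32 usr_code = code & 0xffffffffULL;			/* user portion */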
+
+enum scx_exit_flags {
+ /*
+	 * ops.exit() may be called even if loading fails before ops.init()
+ * finishes successfully. This is because ops.exit() allows rich exit
+ * info communication. The following flag indicates whether ops.init()
+ * finished successfully.
+ */
+	SCX_EFLAG_INITIALIZED		= 1LLU << 0,
+};
+
+/*
+ * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
+ * being disabled.
+ */
+struct scx_exit_info {
+ /* %SCX_EXIT_* - broad category of the exit reason */
+ enum scx_exit_kind kind;
+
+ /* exit code if gracefully exiting */
+ s64 exit_code;
+
+ /* %SCX_EFLAG_* */
+ u64 flags;
+
+ /* textual representation of the above */
+ const char *reason;
+
+ /* backtrace if exiting due to an error */
+ unsigned long *bt;
+ u32 bt_len;
+
+ /* informational message */
+ char *msg;
+
+ /* debug dump */
+ char *dump;
+};
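A BPF-side ops.exit() can use the flags field to tell an init failure apart
from a normal shutdown; a minimal sketch, assuming the BPF_STRUCT_OPS() wrapper
from the in-tree headers:

	void BPF_STRUCT_OPS(example_exit, struct scx_exit_info *ei)
	{
		if (!(ei->flags & SCX_EFLAG_INITIALIZED))
			return;	/* ops.init() never completed, nothing to tear down */

		/* tear down state set up by a successful ops.init() here */
	}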
+
+/* sched_ext_ops.flags */
+enum scx_ops_flags {
+ /*
+ * Keep built-in idle tracking even if ops.update_idle() is implemented.
+ */
+ SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
+
+ /*
+	 * By default, if there are no other tasks to run on the CPU, ext core
+ * keeps running the current task even after its slice expires. If this
+ * flag is specified, such tasks are passed to ops.enqueue() with
+ * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
+ */
+ SCX_OPS_ENQ_LAST = 1LLU << 1,
+
+ /*
+ * An exiting task may schedule after PF_EXITING is set. In such cases,
+ * bpf_task_from_pid() may not be able to find the task and if the BPF
+ * scheduler depends on pid lookup for dispatching, the task will be
+ * lost leading to various issues including RCU grace period stalls.
+ *
+ * To mask this problem, by default, unhashed tasks are automatically
+ * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
+ * depend on pid lookups and wants to handle these tasks directly, the
+ * following flag can be used.
+ */
+ SCX_OPS_ENQ_EXITING = 1LLU << 2,
+
+ /*
+ * If set, only tasks with policy set to SCHED_EXT are attached to
+ * sched_ext. If clear, SCHED_NORMAL tasks are also included.
+ */
+ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
+
+ /*
+ * A migration disabled task can only execute on its current CPU. By
+ * default, such tasks are automatically put on the CPU's local DSQ with
+ * the default slice on enqueue. If this ops flag is set, they also go
+ * through ops.enqueue().
+ *
+ * A migration disabled task never invokes ops.select_cpu() as it can
+ * only select the current CPU. Also, p->cpus_ptr will only contain its
+ * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
+ * and thus may disagree with cpumask_weight(p->cpus_ptr).
+ */
+ SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
+
+ /*
+ * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes
+ * ops.enqueue() on the ops.select_cpu() selected or the wakee's
+ * previous CPU via IPI (inter-processor interrupt) to reduce cacheline
+ * transfers. When this optimization is enabled, ops.select_cpu() is
+ * skipped in some cases (when racing against the wakee switching out).
+ * As the BPF scheduler may depend on ops.select_cpu() being invoked
+ * during wakeups, queued wakeup is disabled by default.
+ *
+ * If this ops flag is set, queued wakeup optimization is enabled and
+ * the BPF scheduler must be able to handle ops.enqueue() invoked on the
+ * wakee's CPU without preceding ops.select_cpu() even for tasks which
+ * may be executed on multiple CPUs.
+ */
+ SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5,
+
+ /*
+ * If set, enable per-node idle cpumasks. If clear, use a single global
+ * flat idle cpumask.
+ */
+ SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6,
+
+ /*
+ * CPU cgroup support flags
+ */
+ SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */
+
+ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
+ SCX_OPS_ENQ_LAST |
+ SCX_OPS_ENQ_EXITING |
+ SCX_OPS_ENQ_MIGRATION_DISABLED |
+ SCX_OPS_ALLOW_QUEUED_WAKEUP |
+ SCX_OPS_SWITCH_PARTIAL |
+ SCX_OPS_BUILTIN_IDLE_PER_NODE |
+ SCX_OPS_HAS_CGROUP_WEIGHT,
+
+ /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
+ __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56,
+
+ SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56,
+};
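These flags are typically set through the BPF scheduler's struct_ops map; a
hypothetical sketch (SEC() from libbpf, other callbacks and fields omitted):

	SEC(".struct_ops.link")
	struct sched_ext_ops example_ops = {
		.flags		= SCX_OPS_ENQ_LAST | SCX_OPS_ALLOW_QUEUED_WAKEUP,
		.enqueue	= (void *)example_enqueue,
		.name		= "example",
	};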
+
+/* argument container for ops.init_task() */
+struct scx_init_task_args {
+ /*
+ * Set if ops.init_task() is being invoked on the fork path, as opposed
+ * to the scheduler transition path.
+ */
+ bool fork;
+#ifdef CONFIG_EXT_GROUP_SCHED
+ /* the cgroup the task is joining */
+ struct cgroup *cgroup;
+#endif
+};
+
+/* argument container for ops.exit_task() */
+struct scx_exit_task_args {
+ /* Whether the task exited before running on sched_ext. */
+ bool cancelled;
+};
+
+/* argument container for ops->cgroup_init() */
+struct scx_cgroup_init_args {
+ /* the weight of the cgroup [1..10000] */
+ u32 weight;
+
+ /* bandwidth control parameters from cpu.max and cpu.max.burst */
+ u64 bw_period_us;
+ u64 bw_quota_us;
+ u64 bw_burst_us;
+};
+
+enum scx_cpu_preempt_reason {
+ /* next task is being scheduled by &sched_class_rt */
+ SCX_CPU_PREEMPT_RT,
+ /* next task is being scheduled by &sched_class_dl */
+ SCX_CPU_PREEMPT_DL,
+ /* next task is being scheduled by &sched_class_stop */
+ SCX_CPU_PREEMPT_STOP,
+ /* unknown reason for SCX being preempted */
+ SCX_CPU_PREEMPT_UNKNOWN,
+};
+
+/*
+ * Argument container for ops->cpu_acquire(). Currently empty, but may be
+ * expanded in the future.
+ */
+struct scx_cpu_acquire_args {};
+
+/* argument container for ops->cpu_release() */
+struct scx_cpu_release_args {
+ /* the reason the CPU was preempted */
+ enum scx_cpu_preempt_reason reason;
+
+ /* the task that's going to be scheduled on the CPU */
+ struct task_struct *task;
+};
+
+/*
+ * Informational context provided to dump operations.
+ */
+struct scx_dump_ctx {
+ enum scx_exit_kind kind;
+ s64 exit_code;
+ const char *reason;
+ u64 at_ns;
+ u64 at_jiffies;
+};
+
+/**
+ * struct sched_ext_ops - Operation table for BPF scheduler implementation
+ *
+ * A BPF scheduler can implement an arbitrary scheduling policy by
+ * implementing and loading operations in this table. Note that a userland
+ * scheduling policy can also be implemented using the BPF scheduler
+ * as a shim layer.
+ */
+struct sched_ext_ops {
+ /**
+ * @select_cpu: Pick the target CPU for a task which is being woken up
+ * @p: task being woken up
+ * @prev_cpu: the cpu @p was on before sleeping
+ * @wake_flags: SCX_WAKE_*
+ *
+ * Decision made here isn't final. @p may be moved to any CPU while it
+ * is getting dispatched for execution later. However, as @p is not on
+ * the rq at this point, getting the eventual execution CPU right here
+ * saves a small bit of overhead down the line.
+ *
+ * If an idle CPU is returned, the CPU is kicked and will try to
+ * dispatch. While an explicit custom mechanism can be added,
+ * select_cpu() serves as the default way to wake up idle CPUs.
+ *
+ * @p may be inserted into a DSQ directly by calling
+ * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped.
+ * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ
+ * of the CPU returned by this operation.
+ *
+ * Note that select_cpu() is never called for tasks that can only run
+ * on a single CPU or tasks with migration disabled, as they don't have
+ * the option to select a different CPU. See select_task_rq() for
+ * details.
+ */
+ s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
+
+ /**
+ * @enqueue: Enqueue a task on the BPF scheduler
+ * @p: task being enqueued
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * @p is ready to run. Insert directly into a DSQ by calling
+ * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly
+ * inserted, the bpf scheduler owns @p and if it fails to dispatch @p,
+ * the task will stall.
+ *
+ * If @p was inserted into a DSQ from ops.select_cpu(), this callback is
+ * skipped.
+ */
+ void (*enqueue)(struct task_struct *p, u64 enq_flags);
+
+ /**
+ * @dequeue: Remove a task from the BPF scheduler
+ * @p: task being dequeued
+ * @deq_flags: %SCX_DEQ_*
+ *
+ * Remove @p from the BPF scheduler. This is usually called to isolate
+ * the task while updating its scheduling properties (e.g. priority).
+ *
+ * The ext core keeps track of whether the BPF side owns a given task or
+ * not and can gracefully ignore spurious dispatches from BPF side,
+ * which makes it safe to not implement this method. However, depending
+ * on the scheduling logic, this can lead to confusing behaviors - e.g.
+ * scheduling position not being updated across a priority change.
+ */
+ void (*dequeue)(struct task_struct *p, u64 deq_flags);
+
+ /**
+ * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs
+ * @cpu: CPU to dispatch tasks for
+ * @prev: previous task being switched out
+ *
+ * Called when a CPU's local dsq is empty. The operation should dispatch
+ * one or more tasks from the BPF scheduler into the DSQs using
+ * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ
+ * using scx_bpf_dsq_move_to_local().
+ *
+ * The maximum number of times scx_bpf_dsq_insert() can be called
+ * without an intervening scx_bpf_dsq_move_to_local() is specified by
+ * ops.dispatch_max_batch. See the comments on top of the two functions
+ * for more details.
+ *
+ * When not %NULL, @prev is an SCX task with its slice depleted. If
+ * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in
+ * @prev->scx.flags, it is not enqueued yet and will be enqueued after
+ * ops.dispatch() returns. To keep executing @prev, return without
+ * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST.
+ */
+ void (*dispatch)(s32 cpu, struct task_struct *prev);
+
+ /**
+ * @tick: Periodic tick
+ * @p: task running currently
+ *
+ * This operation is called every 1/HZ seconds on CPUs which are
+ * executing an SCX task. Setting @p->scx.slice to 0 will trigger an
+ * immediate dispatch cycle on the CPU.
+ */
+ void (*tick)(struct task_struct *p);
+
+ /**
+ * @runnable: A task is becoming runnable on its associated CPU
+ * @p: task becoming runnable
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * This and the following three functions can be used to track a task's
+ * execution state transitions. A task becomes ->runnable() on a CPU,
+ * and then goes through one or more ->running() and ->stopping() pairs
+ * as it runs on the CPU, and eventually becomes ->quiescent() when it's
+ * done running on the CPU.
+ *
+ * @p is becoming runnable on the CPU because it's
+ *
+ * - waking up (%SCX_ENQ_WAKEUP)
+ * - being moved from another CPU
+ * - being restored after temporarily taken off the queue for an
+ * attribute change.
+ *
+ * This and ->enqueue() are related but not coupled. This operation
+ * notifies @p's state transition and may not be followed by ->enqueue()
+ * e.g. when @p is being dispatched to a remote CPU, or when @p is
+ * being enqueued on a CPU experiencing a hotplug event. Likewise, a
+ * task may be ->enqueue()'d without being preceded by this operation
+ * e.g. after exhausting its slice.
+ */
+ void (*runnable)(struct task_struct *p, u64 enq_flags);
+
+ /**
+ * @running: A task is starting to run on its associated CPU
+ * @p: task starting to run
+ *
+ * Note that this callback may be called from a CPU other than the
+ * one the task is going to run on. This can happen when a task
+	 * property is changed (i.e., affinity), since set_next_task_scx(),
+ * which triggers this callback, may run on a CPU different from
+ * the task's assigned CPU.
+ *
+ * Therefore, always use scx_bpf_task_cpu(@p) to determine the
+ * target CPU the task is going to use.
+ *
+ * See ->runnable() for explanation on the task state notifiers.
+ */
+ void (*running)(struct task_struct *p);
+
+ /**
+ * @stopping: A task is stopping execution
+ * @p: task stopping to run
+ * @runnable: is task @p still runnable?
+ *
+ * Note that this callback may be called from a CPU other than the
+ * one the task was running on. This can happen when a task
+ * property is changed (i.e., affinity), since dequeue_task_scx(),
+ * which triggers this callback, may run on a CPU different from
+ * the task's assigned CPU.
+ *
+ * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU
+ * the task was running on.
+ *
+ * See ->runnable() for explanation on the task state notifiers. If
+ * !@runnable, ->quiescent() will be invoked after this operation
+ * returns.
+ */
+ void (*stopping)(struct task_struct *p, bool runnable);
+
+ /**
+ * @quiescent: A task is becoming not runnable on its associated CPU
+ * @p: task becoming not runnable
+ * @deq_flags: %SCX_DEQ_*
+ *
+ * See ->runnable() for explanation on the task state notifiers.
+ *
+ * @p is becoming quiescent on the CPU because it's
+ *
+ * - sleeping (%SCX_DEQ_SLEEP)
+ * - being moved to another CPU
+ * - being temporarily taken off the queue for an attribute change
+ * (%SCX_DEQ_SAVE)
+ *
+ * This and ->dequeue() are related but not coupled. This operation
+ * notifies @p's state transition and may not be preceded by ->dequeue()
+ * e.g. when @p is being dispatched to a remote CPU.
+ */
+ void (*quiescent)(struct task_struct *p, u64 deq_flags);
+
+ /**
+ * @yield: Yield CPU
+ * @from: yielding task
+ * @to: optional yield target task
+ *
+ * If @to is NULL, @from is yielding the CPU to other runnable tasks.
+ * The BPF scheduler should ensure that other available tasks are
+ * dispatched before the yielding task. Return value is ignored in this
+ * case.
+ *
+ * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf
+ * scheduler can implement the request, return %true; otherwise, %false.
+ */
+ bool (*yield)(struct task_struct *from, struct task_struct *to);
+
+ /**
+ * @core_sched_before: Task ordering for core-sched
+ * @a: task A
+ * @b: task B
+ *
+ * Used by core-sched to determine the ordering between two tasks. See
+ * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on
+ * core-sched.
+ *
+ * Both @a and @b are runnable and may or may not currently be queued on
+ * the BPF scheduler. Should return %true if @a should run before @b.
+ * %false if there's no required ordering or @b should run before @a.
+ *
+ * If not specified, the default is ordering them according to when they
+ * became runnable.
+ */
+ bool (*core_sched_before)(struct task_struct *a, struct task_struct *b);
+
+ /**
+ * @set_weight: Set task weight
+ * @p: task to set weight for
+ * @weight: new weight [1..10000]
+ *
+ * Update @p's weight to @weight.
+ */
+ void (*set_weight)(struct task_struct *p, u32 weight);
+
+ /**
+ * @set_cpumask: Set CPU affinity
+ * @p: task to set CPU affinity for
+ * @cpumask: cpumask of cpus that @p can run on
+ *
+ * Update @p's CPU affinity to @cpumask.
+ */
+ void (*set_cpumask)(struct task_struct *p,
+ const struct cpumask *cpumask);
+
+ /**
+ * @update_idle: Update the idle state of a CPU
+ * @cpu: CPU to update the idle state for
+ * @idle: whether entering or exiting the idle state
+ *
+	 * This operation is called when @cpu enters or leaves the idle
+ * state. By default, implementing this operation disables the built-in
+ * idle CPU tracking and the following helpers become unavailable:
+ *
+ * - scx_bpf_select_cpu_dfl()
+ * - scx_bpf_select_cpu_and()
+ * - scx_bpf_test_and_clear_cpu_idle()
+ * - scx_bpf_pick_idle_cpu()
+ *
+ * The user also must implement ops.select_cpu() as the default
+ * implementation relies on scx_bpf_select_cpu_dfl().
+ *
+ * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
+ * tracking.
+ */
+ void (*update_idle)(s32 cpu, bool idle);
+
+ /**
+ * @cpu_acquire: A CPU is becoming available to the BPF scheduler
+ * @cpu: The CPU being acquired by the BPF scheduler.
+ * @args: Acquire arguments, see the struct definition.
+ *
+ * A CPU that was previously released from the BPF scheduler is now once
+ * again under its control.
+ */
+ void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
+
+ /**
+ * @cpu_release: A CPU is taken away from the BPF scheduler
+ * @cpu: The CPU being released by the BPF scheduler.
+ * @args: Release arguments, see the struct definition.
+ *
+ * The specified CPU is no longer under the control of the BPF
+ * scheduler. This could be because it was preempted by a higher
+ * priority sched_class, though there may be other reasons as well. The
+ * caller should consult @args->reason to determine the cause.
+ */
+ void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
+
+ /**
+ * @init_task: Initialize a task to run in a BPF scheduler
+ * @p: task to initialize for BPF scheduling
+ * @args: init arguments, see the struct definition
+ *
+ * Either we're loading a BPF scheduler or a new task is being forked.
+ * Initialize @p for BPF scheduling. This operation may block and can
+ * be used for allocations, and is called exactly once for a task.
+ *
+ * Return 0 for success, -errno for failure. An error return while
+ * loading will abort loading of the BPF scheduler. During a fork, it
+ * will abort that specific fork.
+ */
+ s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
+
+ /**
+ * @exit_task: Exit a previously-running task from the system
+ * @p: task to exit
+ * @args: exit arguments, see the struct definition
+ *
+ * @p is exiting or the BPF scheduler is being unloaded. Perform any
+ * necessary cleanup for @p.
+ */
+ void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);
+
+ /**
+ * @enable: Enable BPF scheduling for a task
+ * @p: task to enable BPF scheduling for
+ *
+ * Enable @p for BPF scheduling. enable() is called on @p any time it
+ * enters SCX, and is always paired with a matching disable().
+ */
+ void (*enable)(struct task_struct *p);
+
+ /**
+ * @disable: Disable BPF scheduling for a task
+ * @p: task to disable BPF scheduling for
+ *
+ * @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
+ * Disable BPF scheduling for @p. A disable() call is always matched
+ * with a prior enable() call.
+ */
+ void (*disable)(struct task_struct *p);
+
+ /**
+ * @dump: Dump BPF scheduler state on error
+ * @ctx: debug dump context
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump.
+ */
+ void (*dump)(struct scx_dump_ctx *ctx);
+
+ /**
+ * @dump_cpu: Dump BPF scheduler state for a CPU on error
+ * @ctx: debug dump context
+ * @cpu: CPU to generate debug dump for
+ * @idle: @cpu is currently idle without any runnable tasks
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
+ * @cpu. If @idle is %true and this operation doesn't produce any
+ * output, @cpu is skipped for dump.
+ */
+ void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle);
+
+ /**
+ * @dump_task: Dump BPF scheduler state for a runnable task on error
+ * @ctx: debug dump context
+ * @p: runnable task to generate debug dump for
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
+ * @p.
+ */
+ void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
+
+#ifdef CONFIG_EXT_GROUP_SCHED
+ /**
+ * @cgroup_init: Initialize a cgroup
+ * @cgrp: cgroup being initialized
+ * @args: init arguments, see the struct definition
+ *
+ * Either the BPF scheduler is being loaded or @cgrp is being created,
+ * initialize @cgrp for sched_ext. This operation may block.
+ *
+ * Return 0 for success, -errno for failure. An error return while
+ * loading will abort loading of the BPF scheduler. During cgroup
+ * creation, it will abort the specific cgroup creation.
+ */
+ s32 (*cgroup_init)(struct cgroup *cgrp,
+ struct scx_cgroup_init_args *args);
+
+ /**
+ * @cgroup_exit: Exit a cgroup
+ * @cgrp: cgroup being exited
+ *
+ * Either the BPF scheduler is being unloaded or @cgrp is being
+ * destroyed, exit @cgrp for sched_ext. This operation may block.
+ */
+ void (*cgroup_exit)(struct cgroup *cgrp);
+
+ /**
+ * @cgroup_prep_move: Prepare a task to be moved to a different cgroup
+ * @p: task being moved
+ * @from: cgroup @p is being moved from
+ * @to: cgroup @p is being moved to
+ *
+ * Prepare @p for move from cgroup @from to @to. This operation may
+ * block and can be used for allocations.
+ *
+ * Return 0 for success, -errno for failure. An error return aborts the
+ * migration.
+ */
+ s32 (*cgroup_prep_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+
+ /**
+ * @cgroup_move: Commit cgroup move
+ * @p: task being moved
+ * @from: cgroup @p is being moved from
+ * @to: cgroup @p is being moved to
+ *
+ * Commit the move. @p is dequeued during this operation.
+ */
+ void (*cgroup_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+
+ /**
+ * @cgroup_cancel_move: Cancel cgroup move
+ * @p: task whose cgroup move is being canceled
+ * @from: cgroup @p was being moved from
+ * @to: cgroup @p was being moved to
+ *
+ * @p was cgroup_prep_move()'d but failed before reaching cgroup_move().
+ * Undo the preparation.
+ */
+ void (*cgroup_cancel_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+
+ /**
+ * @cgroup_set_weight: A cgroup's weight is being changed
+ * @cgrp: cgroup whose weight is being updated
+ * @weight: new weight [1..10000]
+ *
+ * Update @cgrp's weight to @weight.
+ */
+ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
+
+ /**
+ * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed
+ * @cgrp: cgroup whose bandwidth is being updated
+ * @period_us: bandwidth control period
+ * @quota_us: bandwidth control quota
+ * @burst_us: bandwidth control burst
+ *
+ * Update @cgrp's bandwidth control parameters. This is from the cpu.max
+ * cgroup interface.
+ *
+ * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled
+ * to. For example, if @period_us is 1_000_000 and @quota_us is
+ * 2_500_000, @cgrp is entitled to 2.5 CPUs. @burst_us can be
+ * interpreted in the same fashion and specifies how much @cgrp can
+ * burst temporarily. The specific control mechanism and thus the
+ * interpretation of @period_us and burstiness is up to the BPF
+ * scheduler.
+ */
+ void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
+ u64 period_us, u64 quota_us, u64 burst_us);
+
+#endif /* CONFIG_EXT_GROUP_SCHED */
+
+ /*
+ * All online ops must come before ops.cpu_online().
+ */
+
+ /**
+ * @cpu_online: A CPU became online
+ * @cpu: CPU which just came up
+ *
+ * @cpu just came online. @cpu will not call ops.enqueue() or
+ * ops.dispatch(), nor run tasks associated with other CPUs beforehand.
+ */
+ void (*cpu_online)(s32 cpu);
+
+ /**
+ * @cpu_offline: A CPU is going offline
+ * @cpu: CPU which is going offline
+ *
+ * @cpu is going offline. @cpu will not call ops.enqueue() or
+ * ops.dispatch(), nor run tasks associated with other CPUs afterwards.
+ */
+ void (*cpu_offline)(s32 cpu);
+
+ /*
+ * All CPU hotplug ops must come before ops.init().
+ */
+
+ /**
+ * @init: Initialize the BPF scheduler
+ */
+ s32 (*init)(void);
+
+ /**
+ * @exit: Clean up after the BPF scheduler
+ * @info: Exit info
+ *
+ * ops.exit() is also called on ops.init() failure, which is a bit
+ * unusual. This is to allow rich reporting through @info on how
+ * ops.init() failed.
+ */
+ void (*exit)(struct scx_exit_info *info);
+
+ /**
+ * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch
+ */
+ u32 dispatch_max_batch;
+
+ /**
+ * @flags: %SCX_OPS_* flags
+ */
+ u64 flags;
+
+ /**
+ * @timeout_ms: The maximum amount of time, in milliseconds, that a
+ * runnable task should be able to wait before being scheduled. The
+ * maximum timeout may not exceed the default timeout of 30 seconds.
+ *
+ * Defaults to the maximum allowed timeout value of 30 seconds.
+ */
+ u32 timeout_ms;
+
+ /**
+ * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default
+ * value of 32768 is used.
+ */
+ u32 exit_dump_len;
+
+ /**
+ * @hotplug_seq: A sequence number that may be set by the scheduler to
+ * detect when a hotplug event has occurred during the loading process.
+ * If 0, no detection occurs. Otherwise, the scheduler will fail to
+ * load if the sequence number does not match @scx_hotplug_seq on the
+ * enable path.
+ */
+ u64 hotplug_seq;
+
+ /**
+ * @name: BPF scheduler's name
+ *
+ * Must be a non-zero valid BPF object name including only isalnum(),
+ * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the
+ * BPF scheduler is enabled.
+ */
+ char name[SCX_OPS_NAME_LEN];
+
+ /* internal use only, must be NULL */
+ void *priv;
+};
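For reference, a minimal sketch (not part of this patch) of how a BPF scheduler might populate a few of the callbacks and tunables defined above, assuming the BPF_STRUCT_OPS*() and SCX_OPS_DEFINE() helpers from tools/sched_ext/include/scx/common.bpf.h; the sketch_* names are hypothetical.

#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

s32 BPF_STRUCT_OPS_SLEEPABLE(sketch_init)
{
	/* may block; do allocations and other one-time setup here */
	return 0;
}

void BPF_STRUCT_OPS(sketch_exit, struct scx_exit_info *ei)
{
	/* @ei describes why the scheduler is being unloaded */
}

void BPF_STRUCT_OPS(sketch_cpu_online, s32 cpu)
{
	/* (re)initialize any per-CPU bookkeeping for @cpu */
}

SCX_OPS_DEFINE(sketch_ops,
	       .init		= (void *)sketch_init,
	       .exit		= (void *)sketch_exit,
	       .cpu_online	= (void *)sketch_cpu_online,
	       .timeout_ms	= 5000,
	       .name		= "sketch");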
+
+enum scx_opi {
+ SCX_OPI_BEGIN = 0,
+ SCX_OPI_NORMAL_BEGIN = 0,
+ SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online),
+ SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online),
+ SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init),
+ SCX_OPI_END = SCX_OP_IDX(init),
+};
+
+/*
+ * Collection of event counters. Event types are placed in descending order.
+ */
+struct scx_event_stats {
+ /*
+ * If ops.select_cpu() returns a CPU which can't be used by the task,
+ * the core scheduler code silently picks a fallback CPU.
+ */
+ s64 SCX_EV_SELECT_CPU_FALLBACK;
+
+ /*
+ * When dispatching to a local DSQ, the CPU may have gone offline in
+ * the meantime. In this case, the task is bounced to the global DSQ.
+ */
+ s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
+
+ /*
+ * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
+ * continued to run because there were no other tasks on the CPU.
+ */
+ s64 SCX_EV_DISPATCH_KEEP_LAST;
+
+ /*
+ * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
+ * is dispatched to a local DSQ when exiting.
+ */
+ s64 SCX_EV_ENQ_SKIP_EXITING;
+
+ /*
+ * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
+ * migration disabled task skips ops.enqueue() and is dispatched to its
+ * local DSQ.
+ */
+ s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
+
+ /*
+ * Total number of times a task's time slice was refilled with the
+ * default value (SCX_SLICE_DFL).
+ */
+ s64 SCX_EV_REFILL_SLICE_DFL;
+
+ /*
+ * The total duration of bypass modes in nanoseconds.
+ */
+ s64 SCX_EV_BYPASS_DURATION;
+
+ /*
+ * The number of tasks dispatched in the bypassing mode.
+ */
+ s64 SCX_EV_BYPASS_DISPATCH;
+
+ /*
+ * The number of times the bypassing mode has been activated.
+ */
+ s64 SCX_EV_BYPASS_ACTIVATE;
+};
+
+struct scx_sched_pcpu {
+ /*
+ * The event counters are in a per-CPU variable to minimize the
+ * accounting overhead. A system-wide view on the event counter is
+ * constructed when requested by scx_bpf_events().
+ */
+ struct scx_event_stats event_stats;
+};
+
+struct scx_sched {
+ struct sched_ext_ops ops;
+ DECLARE_BITMAP(has_op, SCX_OPI_END);
+
+ /*
+ * Dispatch queues.
+ *
+ * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability.
+ * This is to avoid live-locking in bypass mode where all tasks are
+ * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If
+ * per-node split isn't sufficient, it can be further split.
+ */
+ struct rhashtable dsq_hash;
+ struct scx_dispatch_q **global_dsqs;
+ struct scx_sched_pcpu __percpu *pcpu;
+
+ bool warned_zero_slice:1;
+ bool warned_deprecated_rq:1;
+
+ atomic_t exit_kind;
+ struct scx_exit_info *exit_info;
+
+ struct kobject kobj;
+
+ struct kthread_worker *helper;
+ struct irq_work error_irq_work;
+ struct kthread_work disable_work;
+ struct rcu_work rcu_work;
+};
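As a rough illustration of the per-CPU layout above, a sketch of folding the counters into one system-wide struct scx_event_stats; the helper name is hypothetical, and the in-tree aggregation happens when scx_bpf_events() is called.

static void sketch_sum_events(struct scx_sched *sch, struct scx_event_stats *sum)
{
	int cpu;

	memset(sum, 0, sizeof(*sum));
	for_each_possible_cpu(cpu) {
		struct scx_event_stats *e =
			&per_cpu_ptr(sch->pcpu, cpu)->event_stats;

		sum->SCX_EV_SELECT_CPU_FALLBACK += e->SCX_EV_SELECT_CPU_FALLBACK;
		sum->SCX_EV_DISPATCH_KEEP_LAST += e->SCX_EV_DISPATCH_KEEP_LAST;
		sum->SCX_EV_BYPASS_DISPATCH += e->SCX_EV_BYPASS_DISPATCH;
		/* ...remaining counters are summed the same way... */
	}
}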
+
+enum scx_wake_flags {
+ /* expose select WF_* flags as enums */
+ SCX_WAKE_FORK = WF_FORK,
+ SCX_WAKE_TTWU = WF_TTWU,
+ SCX_WAKE_SYNC = WF_SYNC,
+};
+
+enum scx_enq_flags {
+ /* expose select ENQUEUE_* flags as enums */
+ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP,
+ SCX_ENQ_HEAD = ENQUEUE_HEAD,
+ SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED,
+
+ /* high 32bits are SCX specific */
+
+ /*
+ * Set the following to trigger preemption when calling
+ * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the
+ * current task is cleared to zero and the CPU is kicked into the
+ * scheduling path. Implies %SCX_ENQ_HEAD.
+ */
+ SCX_ENQ_PREEMPT = 1LLU << 32,
+
+ /*
+ * The task being enqueued was previously enqueued on the current CPU's
+ * %SCX_DSQ_LOCAL, but was removed from it in a call to the
+ * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was
+ * invoked in a ->cpu_release() callback, and the task is again
+ * dispatched back to %SCX_DSQ_LOCAL by the current ->enqueue(), the
+ * task will not be scheduled on the CPU until at least the next invocation
+ * of the ->cpu_acquire() callback.
+ */
+ SCX_ENQ_REENQ = 1LLU << 40,
+
+ /*
+ * The task being enqueued is the only task available for the CPU. By
+ * default, ext core keeps executing such tasks but when
+ * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the
+ * %SCX_ENQ_LAST flag set.
+ *
+ * The BPF scheduler is responsible for triggering a follow-up
+ * scheduling event. Otherwise, execution may stall.
+ */
+ SCX_ENQ_LAST = 1LLU << 41,
+
+ /* high 8 bits are internal */
+ __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56,
+
+ SCX_ENQ_CLEAR_OPSS = 1LLU << 56,
+ SCX_ENQ_DSQ_PRIQ = 1LLU << 57,
+};
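For illustration, a sketch of a BPF-side ops.enqueue() that uses %SCX_ENQ_PREEMPT when inserting into the local DSQ; the callback name is hypothetical, and the scx_bpf_dsq_insert() kfunc plus the common.bpf.h helpers are assumed to be available.

void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
{
	/*
	 * Insert at the head of the local DSQ and preempt whatever is
	 * running there; SCX_ENQ_PREEMPT implies SCX_ENQ_HEAD.
	 */
	scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL,
			   enq_flags | SCX_ENQ_PREEMPT);
}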
+
+enum scx_deq_flags {
+ /* expose select DEQUEUE_* flags as enums */
+ SCX_DEQ_SLEEP = DEQUEUE_SLEEP,
+
+ /* high 32bits are SCX specific */
+
+ /*
+ * The generic core-sched layer decided to execute the task even though
+ * it hasn't been dispatched yet. Dequeue from the BPF side.
+ */
+ SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32,
+};
+
+enum scx_pick_idle_cpu_flags {
+ SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */
+ SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */
+};
+
+enum scx_kick_flags {
+ /*
+ * Kick the target CPU if idle. Guarantees that the target CPU goes
+ * through at least one full scheduling cycle before going idle. If the
+ * target CPU can be determined to be currently not idle and going to go
+ * through a scheduling cycle before going idle, noop.
+ */
+ SCX_KICK_IDLE = 1LLU << 0,
+
+ /*
+ * Preempt the current task and execute the dispatch path. If the
+ * current task of the target CPU is an SCX task, its ->scx.slice is
+ * cleared to zero before the scheduling path is invoked so that the
+ * task expires and the dispatch path is invoked.
+ */
+ SCX_KICK_PREEMPT = 1LLU << 1,
+
+ /*
+ * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will
+ * return after the target CPU finishes picking the next task.
+ */
+ SCX_KICK_WAIT = 1LLU << 2,
+};
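A sketch of how the kick flags above are typically paired with the scx_bpf_kick_cpu() kfunc from a BPF callback; the helper is hypothetical and the choice of target CPU is elided.

static void sketch_kick_target(s32 cpu, bool must_preempt)
{
	if (must_preempt)
		/* force the remote CPU through the dispatch path now */
		scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT);
	else
		/* only wake it if it is idle or about to go idle */
		scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
}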
+
+enum scx_tg_flags {
+ SCX_TG_ONLINE = 1U << 0,
+ SCX_TG_INITED = 1U << 1,
+};
+
+enum scx_enable_state {
+ SCX_ENABLING,
+ SCX_ENABLED,
+ SCX_DISABLING,
+ SCX_DISABLED,
+};
+
+static const char *scx_enable_state_str[] = {
+ [SCX_ENABLING] = "enabling",
+ [SCX_ENABLED] = "enabled",
+ [SCX_DISABLING] = "disabling",
+ [SCX_DISABLED] = "disabled",
+};
+
+/*
+ * sched_ext_entity->ops_state
+ *
+ * Used to track the task ownership between the SCX core and the BPF scheduler.
+ * State transitions look as follows:
+ *
+ * NONE -> QUEUEING -> QUEUED -> DISPATCHING
+ * ^ | |
+ * | v v
+ * \-------------------------------/
+ *
+ * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
+ * sites for explanations on the conditions being waited upon and why they are
+ * safe. Transitions out of them into NONE or QUEUED must store_release and the
+ * waiters should load_acquire.
+ *
+ * Tracking scx_ops_state enables sched_ext core to reliably determine whether
+ * any given task can be dispatched by the BPF scheduler at all times and thus
+ * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
+ * to try to dispatch any task anytime regardless of its state as the SCX core
+ * can safely reject invalid dispatches.
+ */
+enum scx_ops_state {
+ SCX_OPSS_NONE, /* owned by the SCX core */
+ SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */
+ SCX_OPSS_QUEUED, /* owned by the BPF scheduler */
+ SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */
+
+ /*
+ * QSEQ brands each QUEUED instance so that, when dispatch races
+ * dequeue/requeue, the dispatcher can tell whether it still has a claim
+ * on the task being dispatched.
+ *
+ * As some 32bit archs can't do 64bit store_release/load_acquire,
+ * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
+ * 32bit machines. The dispatch race window QSEQ protects is very narrow
+ * and runs with IRQ disabled. 30 bits should be sufficient.
+ */
+ SCX_OPSS_QSEQ_SHIFT = 2,
+};
+
+/* Use macros to ensure that the type is unsigned long for the masks */
+#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
+#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
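A sketch of how a combined state + qseq word can be composed and decomposed with the masks above; the helper names are hypothetical, the in-tree code open-codes the same layout.

static inline unsigned long sketch_opss_pack(enum scx_ops_state state,
					     unsigned long qseq)
{
	return state | (qseq << SCX_OPSS_QSEQ_SHIFT);
}

static inline enum scx_ops_state sketch_opss_state(unsigned long opss)
{
	return opss & SCX_OPSS_STATE_MASK;
}

static inline unsigned long sketch_opss_qseq(unsigned long opss)
{
	return (opss & SCX_OPSS_QSEQ_MASK) >> SCX_OPSS_QSEQ_SHIFT;
}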
+
+DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
+
+/*
+ * Return the rq currently locked from an scx callback, or NULL if no rq is
+ * locked.
+ */
+static inline struct rq *scx_locked_rq(void)
+{
+ return __this_cpu_read(scx_locked_rq_state);
+}
+
+static inline bool scx_kf_allowed_if_unlocked(void)
+{
+ return !current->scx.kf_mask;
+}
+
+static inline bool scx_rq_bypassing(struct rq *rq)
+{
+ return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
+}
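A sketch of how a kfunc could use the two helpers above to accept calls both from ops callbacks that hold an rq lock and from unlocked (e.g. syscall) context; the kfunc itself is hypothetical.

__bpf_kfunc void scx_bpf_sketch(void)
{
	struct rq *rq = scx_locked_rq();

	if (rq) {
		/* invoked from an rq-locked ops callback */
		lockdep_assert_rq_held(rq);
	} else {
		/* invoked from an unlocked context, e.g. a syscall program */
		WARN_ON_ONCE(!scx_kf_allowed_if_unlocked());
	}
}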
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b173a059315c..3a89f949e307 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3542,7 +3542,7 @@ out:
}
}
-void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+void init_numa_balancing(u64 clone_flags, struct task_struct *p)
{
int mm_users = 0;
struct mm_struct *mm = p->mm;
@@ -3957,9 +3957,6 @@ static void update_cfs_group(struct sched_entity *se)
if (!gcfs_rq || !gcfs_rq->load.weight)
return;
- if (throttled_hierarchy(gcfs_rq))
- return;
-
shares = calc_group_shares(gcfs_rq);
if (unlikely(se->load.weight != shares))
reweight_entity(cfs_rq_of(se), se, shares);
@@ -5291,18 +5288,16 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (cfs_rq->nr_queued == 1) {
check_enqueue_throttle(cfs_rq);
- if (!throttled_hierarchy(cfs_rq)) {
- list_add_leaf_cfs_rq(cfs_rq);
- } else {
+ list_add_leaf_cfs_rq(cfs_rq);
#ifdef CONFIG_CFS_BANDWIDTH
+ if (cfs_rq->pelt_clock_throttled) {
struct rq *rq = rq_of(cfs_rq);
- if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
- cfs_rq->throttled_clock = rq_clock(rq);
- if (!cfs_rq->throttled_clock_self)
- cfs_rq->throttled_clock_self = rq_clock(rq);
-#endif
+ cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+ cfs_rq->throttled_clock_pelt;
+ cfs_rq->pelt_clock_throttled = 0;
}
+#endif
}
}
@@ -5341,8 +5336,6 @@ static void set_delayed(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_runnable--;
- if (cfs_rq_throttled(cfs_rq))
- break;
}
}
@@ -5363,8 +5356,6 @@ static void clear_delayed(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_runnable++;
- if (cfs_rq_throttled(cfs_rq))
- break;
}
}
@@ -5392,7 +5383,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* DELAY_DEQUEUE relies on spurious wakeups, special task
* states must not suffer spurious wakeups, excempt them.
*/
- if (flags & DEQUEUE_SPECIAL)
+ if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE))
delay = false;
WARN_ON_ONCE(delay && se->sched_delayed);
@@ -5450,8 +5441,18 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & DEQUEUE_DELAYED)
finish_delayed_dequeue_entity(se);
- if (cfs_rq->nr_queued == 0)
+ if (cfs_rq->nr_queued == 0) {
update_idle_cfs_rq_clock_pelt(cfs_rq);
+#ifdef CONFIG_CFS_BANDWIDTH
+ if (throttled_hierarchy(cfs_rq)) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ list_del_leaf_cfs_rq(cfs_rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+ cfs_rq->pelt_clock_throttled = 1;
+ }
+#endif
+ }
return true;
}
@@ -5725,74 +5726,253 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
return cfs_bandwidth_used() && cfs_rq->throttled;
}
+static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
+{
+ return cfs_bandwidth_used() && cfs_rq->pelt_clock_throttled;
+}
+
/* check whether cfs_rq, or any parent, is throttled */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
return cfs_bandwidth_used() && cfs_rq->throttle_count;
}
+static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
+{
+ return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]);
+}
+
+static inline bool task_is_throttled(struct task_struct *p)
+{
+ return cfs_bandwidth_used() && p->throttled;
+}
+
+static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
+static void throttle_cfs_rq_work(struct callback_head *work)
+{
+ struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
+ struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
+
+ WARN_ON_ONCE(p != current);
+ p->sched_throttle_work.next = &p->sched_throttle_work;
+
+ /*
+ * If task is exiting, then there won't be a return to userspace, so we
+ * don't have to bother with any of this.
+ */
+ if ((p->flags & PF_EXITING))
+ return;
+
+ scoped_guard(task_rq_lock, p) {
+ se = &p->se;
+ cfs_rq = cfs_rq_of(se);
+
+ /* Raced, forget */
+ if (p->sched_class != &fair_sched_class)
+ return;
+
+ /*
+ * If not in limbo, then either replenish has happened or this
+ * task got migrated out of the throttled cfs_rq, move along.
+ */
+ if (!cfs_rq->throttle_count)
+ return;
+ rq = scope.rq;
+ update_rq_clock(rq);
+ WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node));
+ dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE);
+ list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ /*
+ * Must not set throttled before dequeue or dequeue will
+ * mistakenly regard this task as an already throttled one.
+ */
+ p->throttled = true;
+ resched_curr(rq);
+ }
+}
+
+void init_cfs_throttle_work(struct task_struct *p)
+{
+ init_task_work(&p->sched_throttle_work, throttle_cfs_rq_work);
+ /* Protect against double add, see throttle_cfs_rq() and throttle_cfs_rq_work() */
+ p->sched_throttle_work.next = &p->sched_throttle_work;
+ INIT_LIST_HEAD(&p->throttle_node);
+}
+
/*
- * Ensure that neither of the group entities corresponding to src_cpu or
- * dest_cpu are members of a throttled hierarchy when performing group
- * load-balance operations.
+ * The task is throttled and someone wants to dequeue it again:
+ * either sched/core needs to do something like an affinity change,
+ * a task group change or a sched class change, in which case
+ * DEQUEUE_SLEEP is not set; or the task blocks after being
+ * throttled (e.g. due to the freezer), in which case DEQUEUE_SLEEP
+ * is set.
*/
-static inline int throttled_lb_pair(struct task_group *tg,
- int src_cpu, int dest_cpu)
+static void detach_task_cfs_rq(struct task_struct *p);
+static void dequeue_throttled_task(struct task_struct *p, int flags)
+{
+ WARN_ON_ONCE(p->se.on_rq);
+ list_del_init(&p->throttle_node);
+
+ /* task blocked after throttled */
+ if (flags & DEQUEUE_SLEEP) {
+ p->throttled = false;
+ return;
+ }
+
+ /*
+ * task is migrating off its old cfs_rq, detach
+ * the task's load from its old cfs_rq.
+ */
+ if (task_on_rq_migrating(p))
+ detach_task_cfs_rq(p);
+}
+
+static bool enqueue_throttled_task(struct task_struct *p)
{
- struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
+ struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
- src_cfs_rq = tg->cfs_rq[src_cpu];
- dest_cfs_rq = tg->cfs_rq[dest_cpu];
+ /* @p should have gone through dequeue_throttled_task() first */
+ WARN_ON_ONCE(!list_empty(&p->throttle_node));
+
+ /*
+ * If the throttled task @p is enqueued to a throttled cfs_rq,
+ * take the fast path by directly putting the task on the
+ * target cfs_rq's limbo list.
+ *
+ * Do not do that when @p is current because the following race can
+ * cause @p's group_node to be incorrectly re-inserted in its rq's
+ * cfs_tasks list, despite being throttled:
+ *
+ * cpuX cpuY
+ * p ret2user
+ * throttle_cfs_rq_work() sched_move_task(p)
+ * LOCK task_rq_lock
+ * dequeue_task_fair(p)
+ * UNLOCK task_rq_lock
+ * LOCK task_rq_lock
+ * task_current_donor(p) == true
+ * task_on_rq_queued(p) == true
+ * dequeue_task(p)
+ * put_prev_task(p)
+ * sched_change_group()
+ * enqueue_task(p) -> p's new cfs_rq
+ * is throttled, go
+ * fast path and skip
+ * actual enqueue
+ * set_next_task(p)
+ * list_move(&se->group_node, &rq->cfs_tasks); // bug
+ * schedule()
+ *
+ * In the above race, @p's current cfs_rq is in the same rq as
+ * its previous cfs_rq because sched_move_task() only moves a task
+ * to a different group within the same rq, so we can use its current
+ * cfs_rq to derive the rq and test whether the task is current.
+ */
+ if (throttled_hierarchy(cfs_rq) &&
+ !task_current_donor(rq_of(cfs_rq), p)) {
+ list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ return true;
+ }
- return throttled_hierarchy(src_cfs_rq) ||
- throttled_hierarchy(dest_cfs_rq);
+ /* we can't take the fast path, do an actual enqueue */
+ p->throttled = false;
+ return false;
}
+static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ struct task_struct *p, *tmp;
+
+ if (--cfs_rq->throttle_count)
+ return 0;
- cfs_rq->throttle_count--;
- if (!cfs_rq->throttle_count) {
+ if (cfs_rq->pelt_clock_throttled) {
cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
cfs_rq->throttled_clock_pelt;
+ cfs_rq->pelt_clock_throttled = 0;
+ }
- /* Add cfs_rq with load or one or more already running entities to the list */
- if (!cfs_rq_is_decayed(cfs_rq))
- list_add_leaf_cfs_rq(cfs_rq);
+ if (cfs_rq->throttled_clock_self) {
+ u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
- if (cfs_rq->throttled_clock_self) {
- u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+ cfs_rq->throttled_clock_self = 0;
- cfs_rq->throttled_clock_self = 0;
+ if (WARN_ON_ONCE((s64)delta < 0))
+ delta = 0;
- if (WARN_ON_ONCE((s64)delta < 0))
- delta = 0;
+ cfs_rq->throttled_clock_self_time += delta;
+ }
- cfs_rq->throttled_clock_self_time += delta;
- }
+ /* Re-enqueue the tasks that have been throttled at this level. */
+ list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
+ list_del_init(&p->throttle_node);
+ p->throttled = false;
+ enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
}
+ /* Add cfs_rq with load or one or more already running entities to the list */
+ if (!cfs_rq_is_decayed(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
+
return 0;
}
+static inline bool task_has_throttle_work(struct task_struct *p)
+{
+ return p->sched_throttle_work.next != &p->sched_throttle_work;
+}
+
+static inline void task_throttle_setup_work(struct task_struct *p)
+{
+ if (task_has_throttle_work(p))
+ return;
+
+ /*
+ * Kthreads and exiting tasks don't return to userspace, so adding the
+ * work is pointless
+ */
+ if ((p->flags & (PF_EXITING | PF_KTHREAD)))
+ return;
+
+ task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
+}
+
+static void record_throttle_clock(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+
+ if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
+ cfs_rq->throttled_clock = rq_clock(rq);
+
+ if (!cfs_rq->throttled_clock_self)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
+}
+
static int tg_throttle_down(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
- /* group is entering throttled state, stop time */
- if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
- list_del_leaf_cfs_rq(cfs_rq);
+ if (cfs_rq->throttle_count++)
+ return 0;
- WARN_ON_ONCE(cfs_rq->throttled_clock_self);
- if (cfs_rq->nr_queued)
- cfs_rq->throttled_clock_self = rq_clock(rq);
+ /*
+ * For cfs_rqs that still have entities enqueued, PELT clock
+ * stop happens at dequeue time when all entities are dequeued.
+ */
+ if (!cfs_rq->nr_queued) {
+ list_del_leaf_cfs_rq(cfs_rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+ cfs_rq->pelt_clock_throttled = 1;
}
- cfs_rq->throttle_count++;
+ WARN_ON_ONCE(cfs_rq->throttled_clock_self);
+ WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
return 0;
}
@@ -5800,8 +5980,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- long queued_delta, runnable_delta, idle_delta, dequeue = 1;
+ int dequeue = 1;
raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
@@ -5824,76 +6003,17 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (!dequeue)
return false; /* Throttle no longer required. */
- se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
-
/* freeze hierarchy runnable averages while throttled */
rcu_read_lock();
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
- queued_delta = cfs_rq->h_nr_queued;
- runnable_delta = cfs_rq->h_nr_runnable;
- idle_delta = cfs_rq->h_nr_idle;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- int flags;
-
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- goto done;
-
- /*
- * Abuse SPECIAL to avoid delayed dequeue in this instance.
- * This avoids teaching dequeue_entities() about throttled
- * entities and keeps things relatively simple.
- */
- flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
- if (se->sched_delayed)
- flags |= DEQUEUE_DELAYED;
- dequeue_entity(qcfs_rq, se, flags);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued -= queued_delta;
- qcfs_rq->h_nr_runnable -= runnable_delta;
- qcfs_rq->h_nr_idle -= idle_delta;
-
- if (qcfs_rq->load.weight) {
- /* Avoid re-evaluating load for this entity: */
- se = parent_entity(se);
- break;
- }
- }
-
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- goto done;
-
- update_load_avg(qcfs_rq, se, 0);
- se_update_runnable(se);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued -= queued_delta;
- qcfs_rq->h_nr_runnable -= runnable_delta;
- qcfs_rq->h_nr_idle -= idle_delta;
- }
-
- /* At this point se is NULL and we are at root level*/
- sub_nr_running(rq, queued_delta);
-done:
/*
* Note: distribution will already see us throttled via the
* throttled-list. rq->lock protects completion.
*/
cfs_rq->throttled = 1;
WARN_ON_ONCE(cfs_rq->throttled_clock);
- if (cfs_rq->nr_queued)
- cfs_rq->throttled_clock = rq_clock(rq);
return true;
}
@@ -5901,9 +6021,20 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- long queued_delta, runnable_delta, idle_delta;
- long rq_h_nr_queued = rq->cfs.h_nr_queued;
+ struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
+
+ /*
+ * It's possible we are called with !runtime_remaining, e.g. because
+ * the user changed the quota setting (see tg_set_cfs_bandwidth()), or
+ * because an async unthrottle gave us a positive runtime_remaining but
+ * other still-running entities consumed that runtime before we got here.
+ *
+ * Either way, we can't unthrottle this cfs_rq without any runtime
+ * remaining because any enqueue in tg_unthrottle_up() would immediately
+ * trigger a throttle, which is not supposed to happen on the unthrottle
+ * path.
+ */
+ if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0)
+ return;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -5933,62 +6064,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
break;
}
- goto unthrottle_throttle;
- }
-
- queued_delta = cfs_rq->h_nr_queued;
- runnable_delta = cfs_rq->h_nr_runnable;
- idle_delta = cfs_rq->h_nr_idle;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- /* Handle any unfinished DELAY_DEQUEUE business first. */
- if (se->sched_delayed) {
- int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED;
-
- dequeue_entity(qcfs_rq, se, flags);
- } else if (se->on_rq)
- break;
- enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued += queued_delta;
- qcfs_rq->h_nr_runnable += runnable_delta;
- qcfs_rq->h_nr_idle += idle_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
- }
-
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- update_load_avg(qcfs_rq, se, UPDATE_TG);
- se_update_runnable(se);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued += queued_delta;
- qcfs_rq->h_nr_runnable += runnable_delta;
- qcfs_rq->h_nr_idle += idle_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
}
- /* Start the fair server if un-throttling resulted in new runnable tasks */
- if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
- dl_server_start(&rq->fair_server);
-
- /* At this point se is NULL and we are at root level*/
- add_nr_running(rq, queued_delta);
-
-unthrottle_throttle:
assert_list_leaf_cfs_rq(rq);
/* Determine whether we need to wake up potentially idle CPU: */
@@ -6472,6 +6549,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
+ INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list);
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -6639,19 +6717,28 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void task_throttle_setup_work(struct task_struct *p) {}
+static bool task_is_throttled(struct task_struct *p) { return false; }
+static void dequeue_throttled_task(struct task_struct *p, int flags) {}
+static bool enqueue_throttled_task(struct task_struct *p) { return false; }
+static void record_throttle_clock(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
return 0;
}
+static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
+{
+ return false;
+}
+
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
return 0;
}
-static inline int throttled_lb_pair(struct task_group *tg,
- int src_cpu, int dest_cpu)
+static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
{
return 0;
}
@@ -6831,6 +6918,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int rq_h_nr_queued = rq->cfs.h_nr_queued;
u64 slice = 0;
+ if (task_is_throttled(p) && enqueue_throttled_task(p))
+ return;
+
/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
@@ -6883,10 +6973,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = 1;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
-
flags = ENQUEUE_WAKEUP;
}
@@ -6908,10 +6994,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = 1;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
}
if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
@@ -6941,7 +7023,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!task_new)
check_update_overutilized_status(rq);
-enqueue_throttle:
assert_list_leaf_cfs_rq(rq);
hrtick_update(rq);
@@ -6963,6 +7044,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
bool was_sched_idle = sched_idle_rq(rq);
bool task_sleep = flags & DEQUEUE_SLEEP;
bool task_delayed = flags & DEQUEUE_DELAYED;
+ bool task_throttled = flags & DEQUEUE_THROTTLE;
struct task_struct *p = NULL;
int h_nr_idle = 0;
int h_nr_queued = 0;
@@ -6996,9 +7078,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = h_nr_queued;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- return 0;
+ if (throttled_hierarchy(cfs_rq) && task_throttled)
+ record_throttle_clock(cfs_rq);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -7010,7 +7091,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
* Bias pick_next to pick a task from this cfs_rq, as
* p is sleeping when it is within its sched_slice.
*/
- if (task_sleep && se && !throttled_hierarchy(cfs_rq))
+ if (task_sleep && se)
set_next_buddy(se);
break;
}
@@ -7037,9 +7118,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = h_nr_queued;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- return 0;
+ if (throttled_hierarchy(cfs_rq) && task_throttled)
+ record_throttle_clock(cfs_rq);
}
sub_nr_running(rq, h_nr_queued);
@@ -7073,6 +7153,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
*/
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
+ if (task_is_throttled(p)) {
+ dequeue_throttled_task(p, flags);
+ return true;
+ }
+
if (!p->se.sched_delayed)
util_est_dequeue(&rq->cfs, p);
@@ -8660,7 +8745,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
* lead to a throttle). This both saves work and prevents false
* next-buddy nomination below.
*/
- if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
+ if (task_is_throttled(p))
return;
if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) {
@@ -8741,19 +8826,22 @@ static struct task_struct *pick_task_fair(struct rq *rq)
{
struct sched_entity *se;
struct cfs_rq *cfs_rq;
+ struct task_struct *p;
+ bool throttled;
again:
cfs_rq = &rq->cfs;
if (!cfs_rq->nr_queued)
return NULL;
+ throttled = false;
+
do {
/* Might not have done put_prev_entity() */
if (cfs_rq->curr && cfs_rq->curr->on_rq)
update_curr(cfs_rq);
- if (unlikely(check_cfs_rq_runtime(cfs_rq)))
- goto again;
+ throttled |= check_cfs_rq_runtime(cfs_rq);
se = pick_next_entity(rq, cfs_rq);
if (!se)
@@ -8761,7 +8849,10 @@ again:
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
- return task_of(se);
+ p = task_of(se);
+ if (unlikely(throttled))
+ task_throttle_setup_work(p);
+ return p;
}
static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
@@ -8859,11 +8950,6 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_stru
return pick_next_task_fair(rq, prev, NULL);
}
-static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
-{
- return !!dl_se->rq->cfs.nr_queued;
-}
-
static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
{
return pick_task_fair(dl_se->rq);
@@ -8875,7 +8961,7 @@ void fair_server_init(struct rq *rq)
init_dl_entity(dl_se);
- dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task);
+ dl_server_init(dl_se, rq, fair_server_pick_task);
}
/*
@@ -8928,8 +9014,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
- /* throttled hierarchies are not runnable */
- if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
+ /* !se->on_rq also covers throttled task */
+ if (!se->on_rq)
return false;
/* Tell the scheduler that we'd really like se to run next. */
@@ -9288,7 +9374,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/*
* We do not migrate tasks that are:
* 1) delayed dequeued unless we migrate load, or
- * 2) throttled_lb_pair, or
+ * 2) target cfs_rq is in throttled hierarchy, or
* 3) cannot be migrated to this CPU due to cpus_ptr, or
* 4) running (obviously), or
* 5) are cache-hot on their current CPU, or
@@ -9297,7 +9383,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
return 0;
- if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+ if (lb_throttled_hierarchy(p, env->dst_cpu))
return 0;
/*
@@ -13081,10 +13167,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq_throttled(cfs_rq))
- return;
-
- if (!throttled_hierarchy(cfs_rq))
+ /*
+ * If a task gets attached to this cfs_rq and, before being queued,
+ * gets migrated to another CPU due to reasons like an affinity
+ * change, make sure this cfs_rq stays on the leaf cfs_rq list so
+ * that the removed load is decayed; otherwise it can cause a
+ * fairness problem.
+ */
+ if (!cfs_rq_pelt_clock_throttled(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
/* Start to propagate at parent */
@@ -13095,10 +13184,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
update_load_avg(cfs_rq, se, UPDATE_TG);
- if (cfs_rq_throttled(cfs_rq))
- break;
-
- if (!throttled_hierarchy(cfs_rq))
+ if (!cfs_rq_pelt_clock_throttled(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
}
}
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 62c3fa543c0f..f921302dc40f 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -162,7 +162,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
u64 throttled;
- if (unlikely(cfs_rq->throttle_count))
+ if (unlikely(cfs_rq->pelt_clock_throttled))
throttled = U64_MAX;
else
throttled = cfs_rq->throttled_clock_pelt_time;
@@ -173,7 +173,7 @@ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
- if (unlikely(cfs_rq->throttle_count))
+ if (unlikely(cfs_rq->pelt_clock_throttled))
return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;
return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
diff --git a/kernel/sched/rq-offsets.c b/kernel/sched/rq-offsets.c
new file mode 100644
index 000000000000..a23747bbe25b
--- /dev/null
+++ b/kernel/sched/rq-offsets.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#define COMPILE_OFFSETS
+#include <linux/kbuild.h>
+#include <linux/types.h>
+#include "sched.h"
+
+int main(void)
+{
+ DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned));
+
+ return 0;
+}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index be9745d104f7..1f5d07067f60 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -365,25 +365,50 @@ extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s6
*
* dl_se::rq -- runqueue we belong to.
*
- * dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the
- * server when it runs out of tasks to run.
- *
* dl_se::server_pick() -- nested pick_next_task(); we yield the period if this
* returns NULL.
*
* dl_server_update() -- called from update_curr_common(), propagates runtime
* to the server.
*
- * dl_server_start()
- * dl_server_stop() -- start/stop the server when it has (no) tasks.
+ * dl_server_start() -- start the server when it has tasks; it will stop
+ * automatically when there are no more tasks, per
+ * dl_se::server_pick() returning NULL.
+ *
+ * dl_server_stop() -- (force) stop the server; use when updating
+ * parameters.
*
* dl_server_init() -- initializes the server.
+ *
+ * When started the dl_server will (per dl_defer) schedule a timer for its
+ * zero-laxity point -- that is, unlike regular EDF tasks which run ASAP, a
+ * server will run at the very end of its period.
+ *
+ * This is done such that any runtime from the target class can be accounted
+ * against the server -- through dl_server_update() above -- such that when it
+ * becomes time to run, it might already be out of runtime and get deferred
+ * until the next period. In this case dl_server_timer() will alternate
+ * between defer and replenish but never actually enqueue the server.
+ *
+ * Only when the target class does not manage to exhaust the server's runtime
+ * (there's actually starvation in the given period) will the dl_server get on
+ * the runqueue. Once queued it will pick tasks from the target class and run
+ * them until either its runtime is exhausted, at which point it's back to
+ * dl_server_timer(), or until there are no more tasks to run, at which point
+ * the dl_server stops itself.
+ *
+ * By stopping at this point the dl_server retains bandwidth, which, if a new
+ * task wakes up imminently (starting the server again), can be used --
+ * subject to CBS wakeup rules -- without having to wait for the next period.
+ *
+ * Additionally, because of the dl_defer behaviour, the start/stop cycle is
+ * naturally throttled to once per period, which keeps high context switch
+ * workloads from spamming the hrtimer program/cancel paths.
*/
extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec);
extern void dl_server_start(struct sched_dl_entity *dl_se);
extern void dl_server_stop(struct sched_dl_entity *dl_se);
extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
- dl_server_has_tasks_f has_tasks,
dl_server_pick_f pick_task);
extern void sched_init_dl_servers(void);
@@ -735,10 +760,12 @@ struct cfs_rq {
u64 throttled_clock_pelt_time;
u64 throttled_clock_self;
u64 throttled_clock_self_time;
- int throttled;
+ bool throttled:1;
+ bool pelt_clock_throttled:1;
int throttle_count;
struct list_head throttled_list;
struct list_head throttled_csd_list;
+ struct list_head throttled_limbo_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
@@ -1935,12 +1962,12 @@ extern void sched_setnuma(struct task_struct *p, int node);
extern int migrate_task_to(struct task_struct *p, int cpu);
extern int migrate_swap(struct task_struct *p, struct task_struct *t,
int cpu, int scpu);
-extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
+extern void init_numa_balancing(u64 clone_flags, struct task_struct *p);
#else /* !CONFIG_NUMA_BALANCING: */
static inline void
-init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+init_numa_balancing(u64 clone_flags, struct task_struct *p)
{
}
@@ -2342,6 +2369,7 @@ extern const u32 sched_prio_to_wmult[40];
#define DEQUEUE_SPECIAL 0x10
#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */
#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */
+#define DEQUEUE_THROTTLE 0x800
#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02
@@ -2658,6 +2686,8 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
extern void init_dl_entity(struct sched_dl_entity *dl_se);
+extern void init_cfs_throttle_work(struct task_struct *p);
+
#define BW_SHIFT 20
#define BW_UNIT (1 << BW_SHIFT)
#define RATIO_SHIFT 8
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 977e133bb8a4..444bdfdab731 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1591,7 +1591,6 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
enum numa_topology_type sched_numa_topology_type;
static int sched_domains_numa_levels;
-static int sched_domains_curr_level;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
@@ -1632,14 +1631,7 @@ sd_init(struct sched_domain_topology_level *tl,
int sd_id, sd_weight, sd_flags = 0;
struct cpumask *sd_span;
-#ifdef CONFIG_NUMA
- /*
- * Ugly hack to pass state to sd_numa_mask()...
- */
- sched_domains_curr_level = tl->numa_level;
-#endif
-
- sd_weight = cpumask_weight(tl->mask(cpu));
+ sd_weight = cpumask_weight(tl->mask(tl, cpu));
if (tl->sd_flags)
sd_flags = (*tl->sd_flags)();
@@ -1677,7 +1669,7 @@ sd_init(struct sched_domain_topology_level *tl,
};
sd_span = sched_domain_span(sd);
- cpumask_and(sd_span, cpu_map, tl->mask(cpu));
+ cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu));
sd_id = cpumask_first(sd_span);
sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
@@ -1732,22 +1724,63 @@ sd_init(struct sched_domain_topology_level *tl,
return sd;
}
+#ifdef CONFIG_SCHED_SMT
+int cpu_smt_flags(void)
+{
+ return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC;
+}
+
+const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_smt_mask(cpu);
+}
+#endif
+
+#ifdef CONFIG_SCHED_CLUSTER
+int cpu_cluster_flags(void)
+{
+ return SD_CLUSTER | SD_SHARE_LLC;
+}
+
+const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_clustergroup_mask(cpu);
+}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+int cpu_core_flags(void)
+{
+ return SD_SHARE_LLC;
+}
+
+const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_coregroup_mask(cpu);
+}
+#endif
+
+const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_node_mask(cpu);
+}
+
/*
* Topology list, bottom-up.
*/
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
- SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT),
+ SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT),
#endif
#ifdef CONFIG_SCHED_CLUSTER
- SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS),
+ SDTL_INIT(tl_cls_mask, cpu_cluster_flags, CLS),
#endif
#ifdef CONFIG_SCHED_MC
- SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC),
+ SDTL_INIT(tl_mc_mask, cpu_core_flags, MC),
#endif
- SDTL_INIT(cpu_cpu_mask, NULL, PKG),
+ SDTL_INIT(tl_pkg_mask, NULL, PKG),
{ NULL, },
};
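As a usage sketch of the new tl->mask(tl, cpu) signature, an architecture override could now be declared like this (the custom level and names are hypothetical; the array would be handed to set_sched_topology() as before):

static const struct cpumask *
tl_custom_mask(struct sched_domain_topology_level *tl, int cpu)
{
	return cpu_coregroup_mask(cpu);
}

static struct sched_domain_topology_level custom_topology[] = {
#ifdef CONFIG_SCHED_SMT
	SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT),
#endif
	SDTL_INIT(tl_custom_mask, cpu_core_flags, MC),
	SDTL_INIT(tl_pkg_mask, NULL, PKG),
	{ NULL, },
};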
@@ -1768,10 +1801,14 @@ void __init set_sched_topology(struct sched_domain_topology_level *tl)
}
#ifdef CONFIG_NUMA
+static int cpu_numa_flags(void)
+{
+ return SD_NUMA;
+}
-static const struct cpumask *sd_numa_mask(int cpu)
+static const struct cpumask *sd_numa_mask(struct sched_domain_topology_level *tl, int cpu)
{
- return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+ return sched_domains_numa_masks[tl->numa_level][cpu_to_node(cpu)];
}
static void sched_numa_warn(const char *str)
@@ -2201,6 +2238,8 @@ int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
goto unlock;
hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp);
+ if (!hop_masks)
+ goto unlock;
hop = hop_masks - k.masks;
ret = hop ?
@@ -2411,7 +2450,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
* breaks the linking done for an earlier span.
*/
for_each_cpu(cpu, cpu_map) {
- const struct cpumask *tl_cpu_mask = tl->mask(cpu);
+ const struct cpumask *tl_cpu_mask = tl->mask(tl, cpu);
int id;
/* lowest bit set in this mask is used as a unique id */
@@ -2419,7 +2458,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
if (cpumask_test_cpu(id, id_seen)) {
/* First CPU has already been seen, ensure identical spans */
- if (!cpumask_equal(tl->mask(id), tl_cpu_mask))
+ if (!cpumask_equal(tl->mask(tl, id), tl_cpu_mask))
return false;
} else {
/* First CPU hasn't been seen before, ensure it's a completely new span */