Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/autogroup.c          |   2
-rw-r--r--  kernel/sched/core.c               | 132
-rw-r--r--  kernel/sched/cpufreq_schedutil.c  |  43
-rw-r--r--  kernel/sched/cputime.c            |  16
-rw-r--r--  kernel/sched/deadline.c           | 129
-rw-r--r--  kernel/sched/debug.c              |  24
-rw-r--r--  kernel/sched/ext.c                | 426
-rw-r--r--  kernel/sched/ext.h                |   8
-rw-r--r--  kernel/sched/fair.c               | 630
-rw-r--r--  kernel/sched/features.h           |   9
-rw-r--r--  kernel/sched/idle.c               |   5
-rw-r--r--  kernel/sched/isolation.c          |  22
-rw-r--r--  kernel/sched/pelt.c               |   4
-rw-r--r--  kernel/sched/psi.c                |   7
-rw-r--r--  kernel/sched/rt.c                 |   2
-rw-r--r--  kernel/sched/sched.h              |  87
-rw-r--r--  kernel/sched/stats.c              |  11
-rw-r--r--  kernel/sched/stats.h              |  13
-rw-r--r--  kernel/sched/syscalls.c           |  25
-rw-r--r--  kernel/sched/topology.c           |  14
20 files changed, 953 insertions, 656 deletions
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index db68a964e34e..83d46b9b8ec8 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -9,7 +9,7 @@ static struct autogroup autogroup_default;
static atomic_t autogroup_seq_nr;
#ifdef CONFIG_SYSCTL
-static struct ctl_table sched_autogroup_sysctls[] = {
+static const struct ctl_table sched_autogroup_sysctls[] = {
{
.procname = "sched_autogroup_enabled",
.data = &sysctl_sched_autogroup_enabled,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c6d8232ad9ee..165c90ba64ea 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -740,39 +740,43 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
s64 __maybe_unused steal = 0, irq_delta = 0;
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+ if (irqtime_enabled()) {
+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
- /*
- * Since irq_time is only updated on {soft,}irq_exit, we might run into
- * this case when a previous update_rq_clock() happened inside a
- * {soft,}IRQ region.
- *
- * When this happens, we stop ->clock_task and only update the
- * prev_irq_time stamp to account for the part that fit, so that a next
- * update will consume the rest. This ensures ->clock_task is
- * monotonic.
- *
- * It does however cause some slight miss-attribution of {soft,}IRQ
- * time, a more accurate solution would be to update the irq_time using
- * the current rq->clock timestamp, except that would require using
- * atomic ops.
- */
- if (irq_delta > delta)
- irq_delta = delta;
+ /*
+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ * this case when a previous update_rq_clock() happened inside a
+ * {soft,}IRQ region.
+ *
+ * When this happens, we stop ->clock_task and only update the
+ * prev_irq_time stamp to account for the part that fit, so that a next
+ * update will consume the rest. This ensures ->clock_task is
+ * monotonic.
+ *
+ * It does however cause some slight miss-attribution of {soft,}IRQ
+ * time, a more accurate solution would be to update the irq_time using
+ * the current rq->clock timestamp, except that would require using
+ * atomic ops.
+ */
+ if (irq_delta > delta)
+ irq_delta = delta;
- rq->prev_irq_time += irq_delta;
- delta -= irq_delta;
- delayacct_irq(rq->curr, irq_delta);
+ rq->prev_irq_time += irq_delta;
+ delta -= irq_delta;
+ delayacct_irq(rq->curr, irq_delta);
+ }
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((&paravirt_steal_rq_enabled))) {
- steal = paravirt_steal_clock(cpu_of(rq));
+ u64 prev_steal;
+
+ steal = prev_steal = paravirt_steal_clock(cpu_of(rq));
steal -= rq->prev_steal_time_rq;
if (unlikely(steal > delta))
steal = delta;
- rq->prev_steal_time_rq += steal;
+ rq->prev_steal_time_rq = prev_steal;
delta -= steal;
}
#endif
@@ -789,6 +793,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
void update_rq_clock(struct rq *rq)
{
s64 delta;
+ u64 clock;
lockdep_assert_rq_held(rq);
@@ -800,11 +805,14 @@ void update_rq_clock(struct rq *rq)
SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
rq->clock_update_flags |= RQCF_UPDATED;
#endif
+ clock = sched_clock_cpu(cpu_of(rq));
+ scx_rq_clock_update(rq, clock);
- delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+ delta = clock - rq->clock;
if (delta < 0)
return;
rq->clock += delta;
+
update_rq_clock_task(rq, delta);
}
@@ -1168,13 +1176,13 @@ int get_nohz_timer_target(void)
struct sched_domain *sd;
const struct cpumask *hk_mask;
- if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) {
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) {
if (!idle_cpu(cpu))
return cpu;
default_cpu = cpu;
}
- hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);
+ hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
guard(rcu)();
@@ -1189,7 +1197,7 @@ int get_nohz_timer_target(void)
}
if (default_cpu == -1)
- default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
+ default_cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE);
return default_cpu;
}
@@ -1341,7 +1349,7 @@ bool sched_can_stop_tick(struct rq *rq)
if (scx_enabled() && !scx_can_stop_tick(rq))
return false;
- if (rq->cfs.nr_running > 1)
+ if (rq->cfs.h_nr_queued > 1)
return false;
/*
@@ -3534,7 +3542,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
*
* More yuck to audit.
*/
- do_set_cpus_allowed(p, task_cpu_possible_mask(p));
+ do_set_cpus_allowed(p, task_cpu_fallback_mask(p));
state = fail;
break;
case fail:
@@ -4646,7 +4654,7 @@ static int sysctl_schedstats(const struct ctl_table *table, int write, void *buf
#endif /* CONFIG_SCHEDSTATS */
#ifdef CONFIG_SYSCTL
-static struct ctl_table sched_core_sysctls[] = {
+static const struct ctl_table sched_core_sysctls[] = {
#ifdef CONFIG_SCHEDSTATS
{
.procname = "sched_schedstats",
@@ -5632,7 +5640,7 @@ void sched_tick(void)
unsigned long hw_pressure;
u64 resched_latency;
- if (housekeeping_cpu(cpu, HK_TYPE_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
arch_scale_freq_tick();
sched_clock_tick();
@@ -5771,7 +5779,7 @@ static void sched_tick_start(int cpu)
int os;
struct tick_work *twork;
- if (housekeeping_cpu(cpu, HK_TYPE_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
return;
WARN_ON_ONCE(!tick_work_cpu);
@@ -5792,7 +5800,7 @@ static void sched_tick_stop(int cpu)
struct tick_work *twork;
int os;
- if (housekeeping_cpu(cpu, HK_TYPE_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
return;
WARN_ON_ONCE(!tick_work_cpu);
@@ -6018,7 +6026,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* opportunity to pull in more work from other CPUs.
*/
if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
- rq->nr_running == rq->cfs.h_nr_running)) {
+ rq->nr_running == rq->cfs.h_nr_queued)) {
p = pick_next_task_fair(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
@@ -6641,7 +6649,6 @@ static void __sched notrace __schedule(int sched_mode)
* as a preemption by schedule_debug() and RCU.
*/
bool preempt = sched_mode > SM_NONE;
- bool block = false;
unsigned long *switch_count;
unsigned long prev_state;
struct rq_flags rf;
@@ -6702,7 +6709,7 @@ static void __sched notrace __schedule(int sched_mode)
goto picked;
}
} else if (!preempt && prev_state) {
- block = try_to_block_task(rq, prev, prev_state);
+ try_to_block_task(rq, prev, prev_state);
switch_count = &prev->nvcsw;
}
@@ -6748,7 +6755,8 @@ picked:
migrate_disable_switch(rq, prev);
psi_account_irqtime(rq, prev, next);
- psi_sched_switch(prev, next, block);
+ psi_sched_switch(prev, next, !task_on_rq_queued(prev) ||
+ prev->se.sched_delayed);
trace_sched_switch(preempt, prev, next, prev_state);
@@ -7701,9 +7709,9 @@ void sched_show_task(struct task_struct *p)
if (pid_alive(p))
ppid = task_pid_nr(rcu_dereference(p->real_parent));
rcu_read_unlock();
- pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d flags:0x%08lx\n",
+ pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d task_flags:0x%04x flags:0x%08lx\n",
free, task_pid_nr(p), task_tgid_nr(p),
- ppid, read_task_thread_flags(p));
+ ppid, p->flags, read_task_thread_flags(p));
print_worker_info(KERN_INFO, p);
print_stop_info(KERN_INFO, p);
@@ -7930,19 +7938,26 @@ void sched_setnuma(struct task_struct *p, int nid)
#ifdef CONFIG_HOTPLUG_CPU
/*
- * Ensure that the idle task is using init_mm right before its CPU goes
- * offline.
+ * Invoked on the outgoing CPU in context of the CPU hotplug thread
+ * after ensuring that there are no user space tasks left on the CPU.
+ *
+ * If there is a lazy mm in use on the hotplug thread, drop it and
+ * switch to init_mm.
+ *
+ * The reference count on init_mm is dropped in finish_cpu().
*/
-void idle_task_exit(void)
+static void sched_force_init_mm(void)
{
struct mm_struct *mm = current->active_mm;
- BUG_ON(cpu_online(smp_processor_id()));
- BUG_ON(current != this_rq()->idle);
-
if (mm != &init_mm) {
- switch_mm(mm, &init_mm, current);
+ mmgrab_lazy_tlb(&init_mm);
+ local_irq_disable();
+ current->active_mm = &init_mm;
+ switch_mm_irqs_off(mm, &init_mm, current);
+ local_irq_enable();
finish_arch_post_lock_switch();
+ mmdrop_lazy_tlb(mm);
}
/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
@@ -8180,19 +8195,14 @@ static void cpuset_cpu_active(void)
cpuset_update_active_cpus();
}
-static int cpuset_cpu_inactive(unsigned int cpu)
+static void cpuset_cpu_inactive(unsigned int cpu)
{
if (!cpuhp_tasks_frozen) {
- int ret = dl_bw_check_overflow(cpu);
-
- if (ret)
- return ret;
cpuset_update_active_cpus();
} else {
num_cpus_frozen++;
partition_sched_domains(1, NULL, NULL);
}
- return 0;
}
static inline void sched_smt_present_inc(int cpu)
@@ -8254,6 +8264,11 @@ int sched_cpu_deactivate(unsigned int cpu)
struct rq *rq = cpu_rq(cpu);
int ret;
+ ret = dl_bw_deactivate(cpu);
+
+ if (ret)
+ return ret;
+
/*
* Remove CPU from nohz.idle_cpus_mask to prevent participating in
* load balancing when not active
@@ -8299,15 +8314,7 @@ int sched_cpu_deactivate(unsigned int cpu)
return 0;
sched_update_numa(cpu, false);
- ret = cpuset_cpu_inactive(cpu);
- if (ret) {
- sched_smt_present_inc(cpu);
- sched_set_rq_online(rq, cpu);
- balance_push_set(cpu, false);
- set_cpu_active(cpu, true);
- sched_update_numa(cpu, true);
- return ret;
- }
+ cpuset_cpu_inactive(cpu);
sched_domains_numa_masks_clear(cpu);
return 0;
}
@@ -8344,6 +8351,7 @@ int sched_cpu_starting(unsigned int cpu)
int sched_cpu_wait_empty(unsigned int cpu)
{
balance_hotplug_wait();
+ sched_force_init_mm();
return 0;
}
@@ -10590,7 +10598,7 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
return;
/* No page allocation under rq lock */
- task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC);
+ task_work_add(curr, work, TWA_RESUME);
}
void sched_mm_cid_exit_signals(struct task_struct *t)
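
Note on the first core.c hunk above: the in-tree comment already explains why irq_delta is clamped, but the arithmetic is easy to miss. Below is a minimal stand-alone sketch of the same rule, with illustrative names (charge_irq_time() is not a kernel function):

#include <linux/types.h>

/*
 * Illustrative only: charge at most `delta` of the pending IRQ time now.
 * Only the charged part is folded into *prev_irq_time, so the remainder
 * is consumed by the next update and ->clock_task never moves backwards.
 */
static s64 charge_irq_time(s64 delta, s64 irq_delta, u64 *prev_irq_time)
{
	if (irq_delta > delta)
		irq_delta = delta;		/* charge only what fits */
	*prev_irq_time += irq_delta;		/* rest deferred to the next update */
	return delta - irq_delta;		/* time left for the task clock */
}
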
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 28c77904ea74..1a19d69b91ed 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -83,7 +83,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
if (unlikely(sg_policy->limits_changed)) {
sg_policy->limits_changed = false;
- sg_policy->need_freq_update = true;
+ sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
return true;
}
@@ -96,7 +96,7 @@ static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
unsigned int next_freq)
{
if (sg_policy->need_freq_update)
- sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
+ sg_policy->need_freq_update = false;
else if (sg_policy->next_freq == next_freq)
return false;
@@ -604,31 +604,6 @@ static const struct kobj_type sugov_tunables_ktype = {
/********************** cpufreq governor interface *********************/
-#ifdef CONFIG_ENERGY_MODEL
-static void rebuild_sd_workfn(struct work_struct *work)
-{
- rebuild_sched_domains_energy();
-}
-
-static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
-
-/*
- * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
- * on governor changes to make sure the scheduler knows about it.
- */
-static void sugov_eas_rebuild_sd(void)
-{
- /*
- * When called from the cpufreq_register_driver() path, the
- * cpu_hotplug_lock is already held, so use a work item to
- * avoid nested locking in rebuild_sched_domains().
- */
- schedule_work(&rebuild_sd_work);
-}
-#else
-static inline void sugov_eas_rebuild_sd(void) { };
-#endif
-
struct cpufreq_governor schedutil_gov;
static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
@@ -691,7 +666,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
}
sg_policy->thread = thread;
- kthread_bind_mask(thread, policy->related_cpus);
+ if (policy->dvfs_possible_from_any_cpu)
+ set_cpus_allowed_ptr(thread, policy->related_cpus);
+ else
+ kthread_bind_mask(thread, policy->related_cpus);
+
init_irq_work(&sg_policy->irq_work, sugov_irq_work);
mutex_init(&sg_policy->work_lock);
@@ -784,7 +763,11 @@ static int sugov_init(struct cpufreq_policy *policy)
goto fail;
out:
- sugov_eas_rebuild_sd();
+ /*
+ * Schedutil is the preferred governor for EAS, so rebuild sched domains
+ * on governor changes to make sure the scheduler knows about them.
+ */
+ em_rebuild_sched_domains();
mutex_unlock(&global_tunables_lock);
return 0;
@@ -826,7 +809,7 @@ static void sugov_exit(struct cpufreq_policy *policy)
sugov_policy_free(sg_policy);
cpufreq_disable_fast_switch(policy);
- sugov_eas_rebuild_sd();
+ em_rebuild_sched_domains();
}
static int sugov_start(struct cpufreq_policy *policy)
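
On the kthread binding change above, a compact restatement as a helper, illustrative only (sugov_bind_worker() is not part of the patch). The presumed intent: when the driver can perform DVFS from any CPU, a plain affinity suffices and can later be relaxed (for example to keep isolated CPUs free of the worker), whereas otherwise the thread must stay hard-bound to the policy's CPUs.

static void sugov_bind_worker(struct task_struct *thread,
			      struct cpufreq_policy *policy)
{
	if (policy->dvfs_possible_from_any_cpu)
		set_cpus_allowed_ptr(thread, policy->related_cpus);	/* soft affinity */
	else
		kthread_bind_mask(thread, policy->related_cpus);	/* hard binding */
}
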
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 0bed0fa1acd9..5d9143dd0879 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -9,6 +9,8 @@
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime);
+
/*
* There are no locks covering percpu hardirq/softirq time.
* They are only modified in vtime_account, on corresponding CPU
@@ -22,16 +24,14 @@
*/
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
-static int sched_clock_irqtime;
-
void enable_sched_clock_irqtime(void)
{
- sched_clock_irqtime = 1;
+ static_branch_enable(&sched_clock_irqtime);
}
void disable_sched_clock_irqtime(void)
{
- sched_clock_irqtime = 0;
+ static_branch_disable(&sched_clock_irqtime);
}
static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
@@ -57,7 +57,7 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
s64 delta;
int cpu;
- if (!sched_clock_irqtime)
+ if (!irqtime_enabled())
return;
cpu = smp_processor_id();
@@ -90,8 +90,6 @@ static u64 irqtime_tick_accounted(u64 maxtime)
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-#define sched_clock_irqtime (0)
-
static u64 irqtime_tick_accounted(u64 dummy)
{
return 0;
@@ -478,7 +476,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
if (vtime_accounting_enabled_this_cpu())
return;
- if (sched_clock_irqtime) {
+ if (irqtime_enabled()) {
irqtime_account_process_tick(p, user_tick, 1);
return;
}
@@ -507,7 +505,7 @@ void account_idle_ticks(unsigned long ticks)
{
u64 cputime, steal;
- if (sched_clock_irqtime) {
+ if (irqtime_enabled()) {
irqtime_account_idle_ticks(ticks);
return;
}
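
The irqtime_enabled() helper used throughout this file is not defined in these hunks; since the diffstat also touches sched.h, it presumably lives next to the other irq-time helpers there. A plausible definition under that assumption, including the stub for !CONFIG_IRQ_TIME_ACCOUNTING:

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
DECLARE_STATIC_KEY_FALSE(sched_clock_irqtime);

static inline int irqtime_enabled(void)
{
	return static_branch_likely(&sched_clock_irqtime);
}
#else
static inline int irqtime_enabled(void)
{
	return 0;
}
#endif
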
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index db47f33cb7d2..38e4537790af 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -26,7 +26,7 @@
static unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
static unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */
#ifdef CONFIG_SYSCTL
-static struct ctl_table sched_dl_sysctls[] = {
+static const struct ctl_table sched_dl_sysctls[] = {
{
.procname = "sched_deadline_period_max_us",
.data = &sysctl_sched_dl_period_max,
@@ -342,6 +342,29 @@ static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_s
__add_rq_bw(new_bw, &rq->dl);
}
+static __always_inline
+void cancel_dl_timer(struct sched_dl_entity *dl_se, struct hrtimer *timer)
+{
+ /*
+ * If the timer callback was running (hrtimer_try_to_cancel == -1),
+ * it will eventually call put_task_struct().
+ */
+ if (hrtimer_try_to_cancel(timer) == 1 && !dl_server(dl_se))
+ put_task_struct(dl_task_of(dl_se));
+}
+
+static __always_inline
+void cancel_replenish_timer(struct sched_dl_entity *dl_se)
+{
+ cancel_dl_timer(dl_se, &dl_se->dl_timer);
+}
+
+static __always_inline
+void cancel_inactive_timer(struct sched_dl_entity *dl_se)
+{
+ cancel_dl_timer(dl_se, &dl_se->inactive_timer);
+}
+
static void dl_change_utilization(struct task_struct *p, u64 new_bw)
{
WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV);
@@ -495,10 +518,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
* will not touch the rq's active utilization,
* so we are still safe.
*/
- if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) {
- if (!dl_server(dl_se))
- put_task_struct(dl_task_of(dl_se));
- }
+ cancel_inactive_timer(dl_se);
} else {
/*
* Since "dl_non_contending" is not set, the
@@ -1647,6 +1667,7 @@ void dl_server_start(struct sched_dl_entity *dl_se)
if (!dl_se->dl_runtime)
return;
+ dl_se->dl_server_active = 1;
enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP);
if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl))
resched_curr(dl_se->rq);
@@ -1661,6 +1682,7 @@ void dl_server_stop(struct sched_dl_entity *dl_se)
hrtimer_try_to_cancel(&dl_se->dl_timer);
dl_se->dl_defer_armed = 0;
dl_se->dl_throttled = 0;
+ dl_se->dl_server_active = 0;
}
void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
@@ -2113,13 +2135,8 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* The replenish timer needs to be canceled. No
* problem if it fires concurrently: boosted threads
* are ignored in dl_task_timer().
- *
- * If the timer callback was running (hrtimer_try_to_cancel == -1),
- * it will eventually call put_task_struct().
*/
- if (hrtimer_try_to_cancel(&p->dl.dl_timer) == 1 &&
- !dl_server(&p->dl))
- put_task_struct(p);
+ cancel_replenish_timer(&p->dl);
p->dl.dl_throttled = 0;
}
} else if (!dl_prio(p->normal_prio)) {
@@ -2287,8 +2304,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
* will not touch the rq's active utilization,
* so we are still safe.
*/
- if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
- put_task_struct(p);
+ cancel_inactive_timer(&p->dl);
}
sub_rq_bw(&p->dl, &rq->dl);
rq_unlock(rq, &rf);
@@ -2421,8 +2437,10 @@ again:
if (dl_server(dl_se)) {
p = dl_se->server_pick_task(dl_se);
if (!p) {
- dl_se->dl_yielded = 1;
- update_curr_dl_se(rq, dl_se, 0);
+ if (dl_server_active(dl_se)) {
+ dl_se->dl_yielded = 1;
+ update_curr_dl_se(rq, dl_se, 0);
+ }
goto again;
}
rq->dl_server = dl_se;
@@ -2502,16 +2520,13 @@ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu
return NULL;
next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root);
-
-next_node:
- if (next_node) {
+ while (next_node) {
p = __node_2_pdl(next_node);
if (task_is_pushable(rq, p, cpu))
return p;
next_node = rb_next(next_node);
- goto next_node;
}
return NULL;
@@ -2960,11 +2975,22 @@ void dl_add_task_root_domain(struct task_struct *p)
void dl_clear_root_domain(struct root_domain *rd)
{
- unsigned long flags;
+ int i;
- raw_spin_lock_irqsave(&rd->dl_bw.lock, flags);
+ guard(raw_spinlock_irqsave)(&rd->dl_bw.lock);
rd->dl_bw.total_bw = 0;
- raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags);
+
+ /*
+ * dl_server bandwidth is only restored when CPUs are attached to root
+ * domains (after domains are created or CPUs moved back to the
+ * default root domain).
+ */
+ for_each_cpu(i, rd->span) {
+ struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server;
+
+ if (dl_server(dl_se) && cpu_active(i))
+ rd->dl_bw.total_bw += dl_se->dl_bw;
+ }
}
#endif /* CONFIG_SMP */
@@ -3025,8 +3051,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
*/
static void switched_to_dl(struct rq *rq, struct task_struct *p)
{
- if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
- put_task_struct(p);
+ cancel_inactive_timer(&p->dl);
/*
* In case a task is setscheduled to SCHED_DEADLINE we need to keep
@@ -3449,29 +3474,31 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
}
enum dl_bw_request {
- dl_bw_req_check_overflow = 0,
+ dl_bw_req_deactivate = 0,
dl_bw_req_alloc,
dl_bw_req_free
};
static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
{
- unsigned long flags;
+ unsigned long flags, cap;
struct dl_bw *dl_b;
bool overflow = 0;
+ u64 fair_server_bw = 0;
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
- if (req == dl_bw_req_free) {
+ cap = dl_bw_capacity(cpu);
+ switch (req) {
+ case dl_bw_req_free:
__dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu));
- } else {
- unsigned long cap = dl_bw_capacity(cpu);
-
+ break;
+ case dl_bw_req_alloc:
overflow = __dl_overflow(dl_b, cap, 0, dl_bw);
- if (req == dl_bw_req_alloc && !overflow) {
+ if (!overflow) {
/*
* We reserve space in the destination
* root_domain, as we can't fail after this point.
@@ -3480,6 +3507,42 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
*/
__dl_add(dl_b, dl_bw, dl_bw_cpus(cpu));
}
+ break;
+ case dl_bw_req_deactivate:
+ /*
+ * cpu is not off yet, but we need to do the math by
+ * considering it off already (i.e., what would happen if we
+ * turn cpu off?).
+ */
+ cap -= arch_scale_cpu_capacity(cpu);
+
+ /*
+ * cpu is going offline and NORMAL tasks will be moved away
+ * from it. We can thus discount dl_server bandwidth
+ * contribution as it won't need to be servicing tasks after
+ * the cpu is off.
+ */
+ if (cpu_rq(cpu)->fair_server.dl_server)
+ fair_server_bw = cpu_rq(cpu)->fair_server.dl_bw;
+
+ /*
+ * Not much to check if no DEADLINE bandwidth is present.
+ * dl_servers we can discount, as tasks will be moved out the
+ * offlined CPUs anyway.
+ */
+ if (dl_b->total_bw - fair_server_bw > 0) {
+ /*
+ * Leaving at least one CPU for DEADLINE tasks seems a
+ * wise thing to do. As said above, cpu is not offline
+ * yet, so account for that.
+ */
+ if (dl_bw_cpus(cpu) - 1)
+ overflow = __dl_overflow(dl_b, cap, fair_server_bw, 0);
+ else
+ overflow = 1;
+ }
+
+ break;
}
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
@@ -3488,9 +3551,9 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
return overflow ? -EBUSY : 0;
}
-int dl_bw_check_overflow(int cpu)
+int dl_bw_deactivate(int cpu)
{
- return dl_bw_manage(dl_bw_req_check_overflow, cpu, 0);
+ return dl_bw_manage(dl_bw_req_deactivate, cpu, 0);
}
int dl_bw_alloc(int cpu, u64 dl_bw)
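
Background for the new cancel_dl_timer() helper family: hrtimer_try_to_cancel() returns 1 when it dequeued an armed timer, 0 when the timer was not queued (it either was never armed or has already fired and released its reference), and -1 when the callback is currently executing and will drop the reference itself. Only the first case must put the task reference, as in this illustrative sketch (cancel_pinned_timer() is not a kernel function):

static void cancel_pinned_timer(struct hrtimer *timer, struct task_struct *p)
{
	switch (hrtimer_try_to_cancel(timer)) {
	case 1:		/* we dequeued the armed timer: drop its reference */
		put_task_struct(p);
		break;
	case -1:	/* callback in flight: it drops the reference itself */
	case 0:		/* timer already fired or was never armed */
		break;
	}
}
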
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index a48b2a701ec2..fd7e85220715 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -379,7 +379,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
return -EINVAL;
}
- if (rq->cfs.h_nr_running) {
+ if (rq->cfs.h_nr_queued) {
update_rq_clock(rq);
dl_server_stop(&rq->fair_server);
}
@@ -392,7 +392,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n",
cpu_of(rq));
- if (rq->cfs.h_nr_running)
+ if (rq->cfs.h_nr_queued)
dl_server_start(&rq->fair_server);
}
@@ -843,12 +843,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(right_vruntime));
spread = right_vruntime - left_vruntime;
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
- SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
- SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
- SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running",
- cfs_rq->idle_nr_running);
- SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running",
- cfs_rq->idle_h_nr_running);
+ SEQ_printf(m, " .%-30s: %d\n", "nr_queued", cfs_rq->nr_queued);
+ SEQ_printf(m, " .%-30s: %d\n", "h_nr_runnable", cfs_rq->h_nr_runnable);
+ SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued);
+ SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
@@ -1294,8 +1292,10 @@ void resched_latency_warn(int cpu, u64 latency)
{
static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1);
- WARN(__ratelimit(&latency_check_ratelimit),
- "sched: CPU %d need_resched set for > %llu ns (%d ticks) "
- "without schedule\n",
- cpu, latency, cpu_rq(cpu)->ticks_without_resched);
+ if (likely(!__ratelimit(&latency_check_ratelimit)))
+ return;
+
+ pr_err("sched: CPU %d need_resched set for > %llu ns (%d ticks) without schedule\n",
+ cpu, latency, cpu_rq(cpu)->ticks_without_resched);
+ dump_stack();
}
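
resched_latency_warn() now uses the standard printk ratelimit idiom instead of abusing WARN()'s condition argument. A minimal stand-alone sketch of that idiom (names are illustrative): at most one report per hour, with __ratelimit() returning true while printing is still allowed in the current interval.

#include <linux/ratelimit.h>

static DEFINE_RATELIMIT_STATE(example_rs, 60 * 60 * HZ, 1);

static void report_stall(int cpu, u64 latency)
{
	if (!__ratelimit(&example_rs))
		return;

	pr_err("example: CPU %d stalled for %llu ns\n", cpu, latency);
	dump_stack();
}
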
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7fff1d045477..8857c0709bdd 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -206,7 +206,7 @@ struct scx_dump_ctx {
*/
struct sched_ext_ops {
/**
- * select_cpu - Pick the target CPU for a task which is being woken up
+ * @select_cpu: Pick the target CPU for a task which is being woken up
* @p: task being woken up
* @prev_cpu: the cpu @p was on before sleeping
* @wake_flags: SCX_WAKE_*
@@ -233,7 +233,7 @@ struct sched_ext_ops {
s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
/**
- * enqueue - Enqueue a task on the BPF scheduler
+ * @enqueue: Enqueue a task on the BPF scheduler
* @p: task being enqueued
* @enq_flags: %SCX_ENQ_*
*
@@ -248,7 +248,7 @@ struct sched_ext_ops {
void (*enqueue)(struct task_struct *p, u64 enq_flags);
/**
- * dequeue - Remove a task from the BPF scheduler
+ * @dequeue: Remove a task from the BPF scheduler
* @p: task being dequeued
* @deq_flags: %SCX_DEQ_*
*
@@ -264,7 +264,7 @@ struct sched_ext_ops {
void (*dequeue)(struct task_struct *p, u64 deq_flags);
/**
- * dispatch - Dispatch tasks from the BPF scheduler and/or user DSQs
+ * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs
* @cpu: CPU to dispatch tasks for
* @prev: previous task being switched out
*
@@ -287,7 +287,7 @@ struct sched_ext_ops {
void (*dispatch)(s32 cpu, struct task_struct *prev);
/**
- * tick - Periodic tick
+ * @tick: Periodic tick
* @p: task running currently
*
* This operation is called every 1/HZ seconds on CPUs which are
@@ -297,7 +297,7 @@ struct sched_ext_ops {
void (*tick)(struct task_struct *p);
/**
- * runnable - A task is becoming runnable on its associated CPU
+ * @runnable: A task is becoming runnable on its associated CPU
* @p: task becoming runnable
* @enq_flags: %SCX_ENQ_*
*
@@ -324,7 +324,7 @@ struct sched_ext_ops {
void (*runnable)(struct task_struct *p, u64 enq_flags);
/**
- * running - A task is starting to run on its associated CPU
+ * @running: A task is starting to run on its associated CPU
* @p: task starting to run
*
* See ->runnable() for explanation on the task state notifiers.
@@ -332,7 +332,7 @@ struct sched_ext_ops {
void (*running)(struct task_struct *p);
/**
- * stopping - A task is stopping execution
+ * @stopping: A task is stopping execution
* @p: task stopping to run
* @runnable: is task @p still runnable?
*
@@ -343,7 +343,7 @@ struct sched_ext_ops {
void (*stopping)(struct task_struct *p, bool runnable);
/**
- * quiescent - A task is becoming not runnable on its associated CPU
+ * @quiescent: A task is becoming not runnable on its associated CPU
* @p: task becoming not runnable
* @deq_flags: %SCX_DEQ_*
*
@@ -363,7 +363,7 @@ struct sched_ext_ops {
void (*quiescent)(struct task_struct *p, u64 deq_flags);
/**
- * yield - Yield CPU
+ * @yield: Yield CPU
* @from: yielding task
* @to: optional yield target task
*
@@ -378,7 +378,7 @@ struct sched_ext_ops {
bool (*yield)(struct task_struct *from, struct task_struct *to);
/**
- * core_sched_before - Task ordering for core-sched
+ * @core_sched_before: Task ordering for core-sched
* @a: task A
* @b: task B
*
@@ -396,7 +396,7 @@ struct sched_ext_ops {
bool (*core_sched_before)(struct task_struct *a, struct task_struct *b);
/**
- * set_weight - Set task weight
+ * @set_weight: Set task weight
* @p: task to set weight for
* @weight: new weight [1..10000]
*
@@ -405,7 +405,7 @@ struct sched_ext_ops {
void (*set_weight)(struct task_struct *p, u32 weight);
/**
- * set_cpumask - Set CPU affinity
+ * @set_cpumask: Set CPU affinity
* @p: task to set CPU affinity for
* @cpumask: cpumask of cpus that @p can run on
*
@@ -415,7 +415,7 @@ struct sched_ext_ops {
const struct cpumask *cpumask);
/**
- * update_idle - Update the idle state of a CPU
+ * @update_idle: Update the idle state of a CPU
* @cpu: CPU to update the idle state for
* @idle: whether entering or exiting the idle state
*
@@ -436,7 +436,7 @@ struct sched_ext_ops {
void (*update_idle)(s32 cpu, bool idle);
/**
- * cpu_acquire - A CPU is becoming available to the BPF scheduler
+ * @cpu_acquire: A CPU is becoming available to the BPF scheduler
* @cpu: The CPU being acquired by the BPF scheduler.
* @args: Acquire arguments, see the struct definition.
*
@@ -446,7 +446,7 @@ struct sched_ext_ops {
void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
/**
- * cpu_release - A CPU is taken away from the BPF scheduler
+ * @cpu_release: A CPU is taken away from the BPF scheduler
* @cpu: The CPU being released by the BPF scheduler.
* @args: Release arguments, see the struct definition.
*
@@ -458,7 +458,7 @@ struct sched_ext_ops {
void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
/**
- * init_task - Initialize a task to run in a BPF scheduler
+ * @init_task: Initialize a task to run in a BPF scheduler
* @p: task to initialize for BPF scheduling
* @args: init arguments, see the struct definition
*
@@ -473,8 +473,9 @@ struct sched_ext_ops {
s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
/**
- * exit_task - Exit a previously-running task from the system
+ * @exit_task: Exit a previously-running task from the system
* @p: task to exit
+ * @args: exit arguments, see the struct definition
*
* @p is exiting or the BPF scheduler is being unloaded. Perform any
* necessary cleanup for @p.
@@ -482,7 +483,7 @@ struct sched_ext_ops {
void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);
/**
- * enable - Enable BPF scheduling for a task
+ * @enable: Enable BPF scheduling for a task
* @p: task to enable BPF scheduling for
*
* Enable @p for BPF scheduling. enable() is called on @p any time it
@@ -491,7 +492,7 @@ struct sched_ext_ops {
void (*enable)(struct task_struct *p);
/**
- * disable - Disable BPF scheduling for a task
+ * @disable: Disable BPF scheduling for a task
* @p: task to disable BPF scheduling for
*
* @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
@@ -501,7 +502,7 @@ struct sched_ext_ops {
void (*disable)(struct task_struct *p);
/**
- * dump - Dump BPF scheduler state on error
+ * @dump: Dump BPF scheduler state on error
* @ctx: debug dump context
*
* Use scx_bpf_dump() to generate BPF scheduler specific debug dump.
@@ -509,7 +510,7 @@ struct sched_ext_ops {
void (*dump)(struct scx_dump_ctx *ctx);
/**
- * dump_cpu - Dump BPF scheduler state for a CPU on error
+ * @dump_cpu: Dump BPF scheduler state for a CPU on error
* @ctx: debug dump context
* @cpu: CPU to generate debug dump for
* @idle: @cpu is currently idle without any runnable tasks
@@ -521,7 +522,7 @@ struct sched_ext_ops {
void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle);
/**
- * dump_task - Dump BPF scheduler state for a runnable task on error
+ * @dump_task: Dump BPF scheduler state for a runnable task on error
* @ctx: debug dump context
* @p: runnable task to generate debug dump for
*
@@ -532,7 +533,7 @@ struct sched_ext_ops {
#ifdef CONFIG_EXT_GROUP_SCHED
/**
- * cgroup_init - Initialize a cgroup
+ * @cgroup_init: Initialize a cgroup
* @cgrp: cgroup being initialized
* @args: init arguments, see the struct definition
*
@@ -547,7 +548,7 @@ struct sched_ext_ops {
struct scx_cgroup_init_args *args);
/**
- * cgroup_exit - Exit a cgroup
+ * @cgroup_exit: Exit a cgroup
* @cgrp: cgroup being exited
*
* Either the BPF scheduler is being unloaded or @cgrp destroyed, exit
@@ -556,7 +557,7 @@ struct sched_ext_ops {
void (*cgroup_exit)(struct cgroup *cgrp);
/**
- * cgroup_prep_move - Prepare a task to be moved to a different cgroup
+ * @cgroup_prep_move: Prepare a task to be moved to a different cgroup
* @p: task being moved
* @from: cgroup @p is being moved from
* @to: cgroup @p is being moved to
@@ -571,7 +572,7 @@ struct sched_ext_ops {
struct cgroup *from, struct cgroup *to);
/**
- * cgroup_move - Commit cgroup move
+ * @cgroup_move: Commit cgroup move
* @p: task being moved
* @from: cgroup @p is being moved from
* @to: cgroup @p is being moved to
@@ -582,7 +583,7 @@ struct sched_ext_ops {
struct cgroup *from, struct cgroup *to);
/**
- * cgroup_cancel_move - Cancel cgroup move
+ * @cgroup_cancel_move: Cancel cgroup move
* @p: task whose cgroup move is being canceled
* @from: cgroup @p was being moved from
* @to: cgroup @p was being moved to
@@ -594,7 +595,7 @@ struct sched_ext_ops {
struct cgroup *from, struct cgroup *to);
/**
- * cgroup_set_weight - A cgroup's weight is being changed
+ * @cgroup_set_weight: A cgroup's weight is being changed
* @cgrp: cgroup whose weight is being updated
* @weight: new weight [1..10000]
*
@@ -608,7 +609,7 @@ struct sched_ext_ops {
*/
/**
- * cpu_online - A CPU became online
+ * @cpu_online: A CPU became online
* @cpu: CPU which just came up
*
* @cpu just came online. @cpu will not call ops.enqueue() or
@@ -617,7 +618,7 @@ struct sched_ext_ops {
void (*cpu_online)(s32 cpu);
/**
- * cpu_offline - A CPU is going offline
+ * @cpu_offline: A CPU is going offline
* @cpu: CPU which is going offline
*
* @cpu is going offline. @cpu will not call ops.enqueue() or
@@ -630,12 +631,12 @@ struct sched_ext_ops {
*/
/**
- * init - Initialize the BPF scheduler
+ * @init: Initialize the BPF scheduler
*/
s32 (*init)(void);
/**
- * exit - Clean up after the BPF scheduler
+ * @exit: Clean up after the BPF scheduler
* @info: Exit info
*
* ops.exit() is also called on ops.init() failure, which is a bit
@@ -645,17 +646,17 @@ struct sched_ext_ops {
void (*exit)(struct scx_exit_info *info);
/**
- * dispatch_max_batch - Max nr of tasks that dispatch() can dispatch
+ * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch
*/
u32 dispatch_max_batch;
/**
- * flags - %SCX_OPS_* flags
+ * @flags: %SCX_OPS_* flags
*/
u64 flags;
/**
- * timeout_ms - The maximum amount of time, in milliseconds, that a
+ * @timeout_ms: The maximum amount of time, in milliseconds, that a
* runnable task should be able to wait before being scheduled. The
* maximum timeout may not exceed the default timeout of 30 seconds.
*
@@ -664,13 +665,13 @@ struct sched_ext_ops {
u32 timeout_ms;
/**
- * exit_dump_len - scx_exit_info.dump buffer length. If 0, the default
+ * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default
* value of 32768 is used.
*/
u32 exit_dump_len;
/**
- * hotplug_seq - A sequence number that may be set by the scheduler to
+ * @hotplug_seq: A sequence number that may be set by the scheduler to
* detect when a hotplug event has occurred during the loading process.
* If 0, no detection occurs. Otherwise, the scheduler will fail to
* load if the sequence number does not match @scx_hotplug_seq on the
@@ -679,7 +680,7 @@ struct sched_ext_ops {
u64 hotplug_seq;
/**
- * name - BPF scheduler's name
+ * @name: BPF scheduler's name
*
* Must be a non-zero valid BPF object name including only isalnum(),
* '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the
@@ -960,7 +961,7 @@ static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
static struct scx_dispatch_q **global_dsqs;
static const struct rhashtable_params dsq_hash_params = {
- .key_len = 8,
+ .key_len = sizeof_field(struct scx_dispatch_q, id),
.key_offset = offsetof(struct scx_dispatch_q, id),
.head_offset = offsetof(struct scx_dispatch_q, hash_node),
};
@@ -1408,7 +1409,6 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
/**
* scx_task_iter_next_locked - Next non-idle task with its rq locked
* @iter: iterator to walk
- * @include_dead: Whether we should include dead tasks in the iteration
*
* Visit the non-idle task with its rq lock held. Allows callers to specify
* whether they would like to filter out dead tasks. See scx_task_iter_start()
@@ -2747,6 +2747,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
{
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
bool prev_on_scx = prev->sched_class == &ext_sched_class;
+ bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED;
int nr_loops = SCX_DSP_MAX_LOOPS;
lockdep_assert_rq_held(rq);
@@ -2779,8 +2780,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* See scx_ops_disable_workfn() for the explanation on the
* bypassing test.
*/
- if ((prev->scx.flags & SCX_TASK_QUEUED) &&
- prev->scx.slice && !scx_rq_bypassing(rq)) {
+ if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
goto has_tasks;
}
@@ -2813,6 +2813,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
flush_dispatch_buf(rq);
+ if (prev_on_rq && prev->scx.slice) {
+ rq->scx.flags |= SCX_RQ_BAL_KEEP;
+ goto has_tasks;
+ }
if (rq->scx.local_dsq.nr)
goto has_tasks;
if (consume_global_dsq(rq))
@@ -2838,8 +2842,7 @@ no_tasks:
* Didn't find another task to run. Keep running @prev unless
* %SCX_OPS_ENQ_LAST is in effect.
*/
- if ((prev->scx.flags & SCX_TASK_QUEUED) &&
- (!static_branch_unlikely(&scx_ops_enq_last) ||
+ if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) ||
scx_rq_bypassing(rq))) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
goto has_tasks;
@@ -3034,7 +3037,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
*/
if (p->scx.slice && !scx_rq_bypassing(rq)) {
dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
- return;
+ goto switch_class;
}
/*
@@ -3051,6 +3054,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
}
}
+switch_class:
if (next && next->sched_class != &ext_sched_class)
switch_class(rq, next);
}
@@ -3132,6 +3136,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
* scx_prio_less - Task ordering for core-sched
* @a: task A
* @b: task B
+ * @in_fi: in forced idle state
*
* Core-sched is implemented as an additional scheduling layer on top of the
* usual sched_class'es and needs to find out the expected task ordering. For
@@ -3180,6 +3185,10 @@ static bool test_and_clear_cpu_idle(int cpu)
* scx_pick_idle_cpu() can get caught in an infinite loop as
* @cpu is never cleared from idle_masks.smt. Ensure that @cpu
* is eventually cleared.
+ *
+ * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to
+ * reduce memory writes, which may help alleviate cache
+ * coherence pressure.
*/
if (cpumask_intersects(smt, idle_masks.smt))
cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
@@ -3216,6 +3225,74 @@ found:
}
/*
+ * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC
+ * domain is not defined).
+ */
+static unsigned int llc_weight(s32 cpu)
+{
+ struct sched_domain *sd;
+
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ if (!sd)
+ return 0;
+
+ return sd->span_weight;
+}
+
+/*
+ * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC
+ * domain is not defined).
+ */
+static struct cpumask *llc_span(s32 cpu)
+{
+ struct sched_domain *sd;
+
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ if (!sd)
+ return 0;
+
+ return sched_domain_span(sd);
+}
+
+/*
+ * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the
+ * NUMA domain is not defined).
+ */
+static unsigned int numa_weight(s32 cpu)
+{
+ struct sched_domain *sd;
+ struct sched_group *sg;
+
+ sd = rcu_dereference(per_cpu(sd_numa, cpu));
+ if (!sd)
+ return 0;
+ sg = sd->groups;
+ if (!sg)
+ return 0;
+
+ return sg->group_weight;
+}
+
+/*
+ * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA
+ * domain is not defined).
+ */
+static struct cpumask *numa_span(s32 cpu)
+{
+ struct sched_domain *sd;
+ struct sched_group *sg;
+
+ sd = rcu_dereference(per_cpu(sd_numa, cpu));
+ if (!sd)
+ return NULL;
+ sg = sd->groups;
+ if (!sg)
+ return NULL;
+
+ return sched_group_span(sg);
+}
+
+/*
* Return true if the LLC domains do not perfectly overlap with the NUMA
* domains, false otherwise.
*/
@@ -3246,19 +3323,10 @@ static bool llc_numa_mismatch(void)
* overlapping, which is incorrect (as NUMA 1 has two distinct LLC
* domains).
*/
- for_each_online_cpu(cpu) {
- const struct cpumask *numa_cpus;
- struct sched_domain *sd;
-
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
- if (!sd)
+ for_each_online_cpu(cpu)
+ if (llc_weight(cpu) != numa_weight(cpu))
return true;
- numa_cpus = cpumask_of_node(cpu_to_node(cpu));
- if (sd->span_weight != cpumask_weight(numa_cpus))
- return true;
- }
-
return false;
}
@@ -3276,8 +3344,7 @@ static bool llc_numa_mismatch(void)
static void update_selcpu_topology(void)
{
bool enable_llc = false, enable_numa = false;
- struct sched_domain *sd;
- const struct cpumask *cpus;
+ unsigned int nr_cpus;
s32 cpu = cpumask_first(cpu_online_mask);
/*
@@ -3291,10 +3358,12 @@ static void update_selcpu_topology(void)
* CPUs.
*/
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
- if (sd) {
- if (sd->span_weight < num_online_cpus())
+ nr_cpus = llc_weight(cpu);
+ if (nr_cpus > 0) {
+ if (nr_cpus < num_online_cpus())
enable_llc = true;
+ pr_debug("sched_ext: LLC=%*pb weight=%u\n",
+ cpumask_pr_args(llc_span(cpu)), llc_weight(cpu));
}
/*
@@ -3306,15 +3375,19 @@ static void update_selcpu_topology(void)
* enabling both NUMA and LLC optimizations is unnecessary, as checking
* for an idle CPU in the same domain twice is redundant.
*/
- cpus = cpumask_of_node(cpu_to_node(cpu));
- if ((cpumask_weight(cpus) < num_online_cpus()) && llc_numa_mismatch())
- enable_numa = true;
+ nr_cpus = numa_weight(cpu);
+ if (nr_cpus > 0) {
+ if (nr_cpus < num_online_cpus() && llc_numa_mismatch())
+ enable_numa = true;
+ pr_debug("sched_ext: NUMA=%*pb weight=%u\n",
+ cpumask_pr_args(numa_span(cpu)), numa_weight(cpu));
+ }
rcu_read_unlock();
pr_debug("sched_ext: LLC idle selection %s\n",
- enable_llc ? "enabled" : "disabled");
+ str_enabled_disabled(enable_llc));
pr_debug("sched_ext: NUMA idle selection %s\n",
- enable_numa ? "enabled" : "disabled");
+ str_enabled_disabled(enable_numa));
if (enable_llc)
static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
@@ -3344,6 +3417,8 @@ static void update_selcpu_topology(void)
* 4. Pick a CPU within the same NUMA node, if enabled:
* - choose a CPU from the same NUMA node to reduce memory access latency.
*
+ * 5. Pick any idle CPU usable by the task.
+ *
* Step 3 and 4 are performed only if the system has, respectively, multiple
* LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and
* scx_selcpu_topo_numa).
@@ -3360,7 +3435,6 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
*found = false;
-
/*
* This is necessary to protect llc_cpus.
*/
@@ -3379,15 +3453,10 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
*/
if (p->nr_cpus_allowed >= num_possible_cpus()) {
if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
- numa_cpus = cpumask_of_node(cpu_to_node(prev_cpu));
+ numa_cpus = numa_span(prev_cpu);
- if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) {
- struct sched_domain *sd;
-
- sd = rcu_dereference(per_cpu(sd_llc, prev_cpu));
- if (sd)
- llc_cpus = sched_domain_span(sd);
- }
+ if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
+ llc_cpus = llc_span(prev_cpu);
}
/*
@@ -3586,20 +3655,9 @@ static void reset_idle_masks(void)
cpumask_copy(idle_masks.smt, cpu_online_mask);
}
-void __scx_update_idle(struct rq *rq, bool idle)
+static void update_builtin_idle(int cpu, bool idle)
{
- int cpu = cpu_of(rq);
-
- if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) {
- SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
- if (!static_branch_unlikely(&scx_builtin_idle_enabled))
- return;
- }
-
- if (idle)
- cpumask_set_cpu(cpu, idle_masks.cpu);
- else
- cpumask_clear_cpu(cpu, idle_masks.cpu);
+ assign_cpu(cpu, idle_masks.cpu, idle);
#ifdef CONFIG_SCHED_SMT
if (sched_smt_active()) {
@@ -3610,10 +3668,8 @@ void __scx_update_idle(struct rq *rq, bool idle)
* idle_masks.smt handling is racy but that's fine as
* it's only for optimization and self-correcting.
*/
- for_each_cpu(cpu, smt) {
- if (!cpumask_test_cpu(cpu, idle_masks.cpu))
- return;
- }
+ if (!cpumask_subset(smt, idle_masks.cpu))
+ return;
cpumask_or(idle_masks.smt, idle_masks.smt, smt);
} else {
cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
@@ -3622,6 +3678,57 @@ void __scx_update_idle(struct rq *rq, bool idle)
#endif
}
+/*
+ * Update the idle state of a CPU to @idle.
+ *
+ * If @do_notify is true, ops.update_idle() is invoked to notify the scx
+ * scheduler of an actual idle state transition (idle to busy or vice
+ * versa). If @do_notify is false, only the idle state in the idle masks is
+ * refreshed without invoking ops.update_idle().
+ *
+ * This distinction is necessary, because an idle CPU can be "reserved" and
+ * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
+ * busy even if no tasks are dispatched. In this case, the CPU may return
+ * to idle without a true state transition. Refreshing the idle masks
+ * without invoking ops.update_idle() ensures accurate idle state tracking
+ * while avoiding unnecessary updates and maintaining balanced state
+ * transitions.
+ */
+void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
+{
+ int cpu = cpu_of(rq);
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * Trigger ops.update_idle() only when transitioning from a task to
+ * the idle thread and vice versa.
+ *
+ * Idle transitions are indicated by do_notify being set to true,
+ * managed by put_prev_task_idle()/set_next_task_idle().
+ */
+ if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
+ SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
+
+ /*
+ * Update the idle masks:
+ * - for real idle transitions (do_notify == true)
+ * - for idle-to-idle transitions (indicated by the previous task
+ * being the idle thread, managed by pick_task_idle())
+ *
+ * Skip updating idle masks if the previous task is not the idle
+ * thread, since set_next_task_idle() has already handled it when
+ * transitioning from a task to the idle thread (calling this
+ * function with do_notify == true).
+ *
+ * In this way we can avoid updating the idle masks twice,
+ * unnecessarily.
+ */
+ if (static_branch_likely(&scx_builtin_idle_enabled))
+ if (do_notify || is_idle_task(rq->curr))
+ update_builtin_idle(cpu, idle);
+}
+
static void handle_hotplug(struct rq *rq, bool online)
{
int cpu = cpu_of(rq);
@@ -4641,6 +4748,7 @@ bool task_should_scx(int policy)
/**
* scx_softlockup - sched_ext softlockup handler
+ * @dur_s: number of seconds of CPU stuck due to soft lockup
*
* On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can
* live-lock the system by making many CPUs target the same DSQ to the point
@@ -4684,6 +4792,7 @@ static void scx_clear_softlockup(void)
/**
* scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress
+ * @bypass: true for bypass, false for unbypass
*
* Bypassing guarantees that all runnable tasks make forward progress without
* trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might
@@ -4744,10 +4853,9 @@ static void scx_ops_bypass(bool bypass)
*/
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
struct task_struct *p, *n;
- rq_lock(rq, &rf);
+ raw_spin_rq_lock(rq);
if (bypass) {
WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
@@ -4763,7 +4871,7 @@ static void scx_ops_bypass(bool bypass)
* sees scx_rq_bypassing() before moving tasks to SCX.
*/
if (!scx_enabled()) {
- rq_unlock_irqrestore(rq, &rf);
+ raw_spin_rq_unlock(rq);
continue;
}
@@ -4783,10 +4891,11 @@ static void scx_ops_bypass(bool bypass)
sched_enq_and_set_task(&ctx);
}
- rq_unlock(rq, &rf);
-
/* resched to restore ticks and idle state */
- resched_cpu(cpu);
+ if (cpu_online(cpu) || cpu == smp_processor_id())
+ resched_curr(rq);
+
+ raw_spin_rq_unlock(rq);
}
atomic_dec(&scx_ops_breather_depth);
@@ -4852,7 +4961,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
struct task_struct *p;
struct rhashtable_iter rht_iter;
struct scx_dispatch_q *dsq;
- int i, kind;
+ int i, kind, cpu;
kind = atomic_read(&scx_exit_kind);
while (true) {
@@ -4935,6 +5044,15 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
+ /*
+ * Invalidate all the rq clocks to prevent getting outdated
+ * rq clocks from a previous scx scheduler.
+ */
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ scx_rq_clock_invalidate(rq);
+ }
+
/* no task is on scx, turn off all the switches and flush in-progress calls */
static_branch_disable(&__scx_ops_enabled);
for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
@@ -5159,9 +5277,9 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK,
p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK,
ops_state >> SCX_OPSS_QSEQ_SHIFT);
- dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu",
+ dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu slice=%llu",
p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf,
- p->scx.dsq_vtime);
+ p->scx.dsq_vtime, p->scx.slice);
dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
if (SCX_HAS_OP(dump_task)) {
@@ -5352,7 +5470,7 @@ static struct kthread_worker *scx_create_rt_helper(const char *name)
{
struct kthread_worker *helper;
- helper = kthread_create_worker(0, name);
+ helper = kthread_run_worker(0, name);
if (helper)
sched_set_fifo(helper->task);
return helper;
@@ -6236,6 +6354,15 @@ void __init init_sched_ext_class(void)
__bpf_kfunc_start_defs();
+static bool check_builtin_idle_enabled(void)
+{
+ if (static_branch_likely(&scx_builtin_idle_enabled))
+ return true;
+
+ scx_ops_error("built-in idle tracking is disabled");
+ return false;
+}
+
/**
* scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
* @p: task_struct to select a CPU for
@@ -6253,10 +6380,8 @@ __bpf_kfunc_start_defs();
__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
u64 wake_flags, bool *is_idle)
{
- if (!static_branch_likely(&scx_builtin_idle_enabled)) {
- scx_ops_error("built-in idle tracking is disabled");
+ if (!check_builtin_idle_enabled())
goto prev_cpu;
- }
if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
goto prev_cpu;
@@ -6340,9 +6465,7 @@ __bpf_kfunc_start_defs();
* ops.select_cpu(), and ops.dispatch().
*
* When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch
- * and @p must match the task being enqueued. Also, %SCX_DSQ_LOCAL_ON can't be
- * used to target the local DSQ of a CPU other than the enqueueing one. Use
- * ops.select_cpu() to be on the target CPU in the first place.
+ * and @p must match the task being enqueued.
*
* When called from ops.select_cpu(), @enq_flags and @dsp_id are stored, and @p
* will be directly inserted into the corresponding dispatch queue after
@@ -7013,7 +7136,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
return -ENOENT;
INIT_LIST_HEAD(&kit->cursor.node);
- kit->cursor.flags |= SCX_DSQ_LNODE_ITER_CURSOR | flags;
+ kit->cursor.flags = SCX_DSQ_LNODE_ITER_CURSOR | flags;
kit->cursor.priv = READ_ONCE(kit->dsq->seq);
return 0;
@@ -7181,7 +7304,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
}
/**
- * scx_bpf_dump - Generate extra debug dump specific to the BPF scheduler
+ * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler
* @fmt: format string
* @data: format string parameters packaged using ___bpf_fill() macro
* @data__sz: @data len, must end in '__sz' for the verifier
@@ -7273,7 +7396,6 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
* scx_bpf_cpuperf_set - Set the relative performance target of a CPU
* @cpu: CPU of interest
* @perf: target performance level [0, %SCX_CPUPERF_ONE]
- * @flags: %SCX_CPUPERF_* flags
*
* Set the target performance level of @cpu to @perf. @perf is in linear
* relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the
@@ -7350,10 +7472,8 @@ __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
{
- if (!static_branch_likely(&scx_builtin_idle_enabled)) {
- scx_ops_error("built-in idle tracking is disabled");
+ if (!check_builtin_idle_enabled())
return cpu_none_mask;
- }
#ifdef CONFIG_SMP
return idle_masks.cpu;
@@ -7371,10 +7491,8 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
{
- if (!static_branch_likely(&scx_builtin_idle_enabled)) {
- scx_ops_error("built-in idle tracking is disabled");
+ if (!check_builtin_idle_enabled())
return cpu_none_mask;
- }
#ifdef CONFIG_SMP
if (sched_smt_active())
@@ -7389,6 +7507,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
/**
* scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
* either the percpu, or SMT idle-tracking cpumask.
+ * @idle_mask: &cpumask to use
*/
__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
{
@@ -7412,10 +7531,8 @@ __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
*/
__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
{
- if (!static_branch_likely(&scx_builtin_idle_enabled)) {
- scx_ops_error("built-in idle tracking is disabled");
+ if (!check_builtin_idle_enabled())
return false;
- }
if (ops_cpu_valid(cpu, NULL))
return test_and_clear_cpu_idle(cpu);
@@ -7445,10 +7562,8 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
u64 flags)
{
- if (!static_branch_likely(&scx_builtin_idle_enabled)) {
- scx_ops_error("built-in idle tracking is disabled");
+ if (!check_builtin_idle_enabled())
return -EBUSY;
- }
return scx_pick_idle_cpu(cpus_allowed, flags);
}
@@ -7543,6 +7658,68 @@ out:
}
#endif
+/**
+ * scx_bpf_now - Returns a high-performance monotonically non-decreasing
+ * clock for the current CPU. The clock returned is in nanoseconds.
+ *
+ * It provides the following properties:
+ *
+ * 1) High performance: Many BPF schedulers call bpf_ktime_get_ns() frequently
+ * to account for execution time and track tasks' runtime properties.
+ * Unfortunately, in some hardware platforms, bpf_ktime_get_ns() -- which
+ * eventually reads a hardware timestamp counter -- is neither performant nor
+ * scalable. scx_bpf_now() aims to provide a high-performance clock by
+ * using the rq clock in the scheduler core whenever possible.
+ *
+ * 2) High enough resolution for the BPF scheduler use cases: In most BPF
+ * scheduler use cases, the required clock resolution is lower than the most
+ * accurate hardware clock (e.g., rdtsc in x86). scx_bpf_now() basically
+ * uses the rq clock in the scheduler core whenever it is valid. It considers
+ * that the rq clock is valid from the time the rq clock is updated
+ * (update_rq_clock) until the rq is unlocked (rq_unpin_lock).
+ *
+ * 3) Monotonically non-decreasing clock for the same CPU: scx_bpf_now()
+ * guarantees the clock never goes backward when comparing them in the same
+ * CPU. On the other hand, when comparing clocks in different CPUs, there
+ * is no such guarantee -- the clock can go backward. It provides a
+ * monotonically *non-decreasing* clock so that it would provide the same
+ * clock values in two different scx_bpf_now() calls in the same CPU
+ * during the same period of when the rq clock is valid.
+ */
+__bpf_kfunc u64 scx_bpf_now(void)
+{
+ struct rq *rq;
+ u64 clock;
+
+ preempt_disable();
+
+ rq = this_rq();
+ if (smp_load_acquire(&rq->scx.flags) & SCX_RQ_CLK_VALID) {
+ /*
+ * If the rq clock is valid, use the cached rq clock.
+ *
+ * Note that scx_bpf_now() is re-entrant between a process
+ * context and an interrupt context (e.g., timer interrupt).
+ * However, we don't need to consider the race between them
+ * because such race is not observable from a caller.
+ */
+ clock = READ_ONCE(rq->scx.clock);
+ } else {
+ /*
+ * Otherwise, return a fresh rq clock.
+ *
+ * The rq clock is updated outside of the rq lock.
+ * In this case, keep the updated rq clock invalid so the next
+ * kfunc call outside the rq lock gets a fresh rq clock.
+ */
+ clock = sched_clock_cpu(cpu_of(rq));
+ }
+
+ preempt_enable();
+
+ return clock;
+}
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_any)
@@ -7574,6 +7751,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
#ifdef CONFIG_CGROUP_SCHED
BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
#endif
+BTF_ID_FLAGS(func, scx_bpf_now)
BTF_KFUNCS_END(scx_kfunc_ids_any)
static const struct btf_kfunc_id_set scx_kfunc_set_any = {
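
A hedged sketch of how a BPF scheduler might consume the new scx_bpf_now() kfunc, assuming the usual sched_ext BPF scaffolding (vmlinux.h plus the common scx headers under tools/sched_ext). The op names and the global variable are illustrative; a real scheduler would track the timestamp per CPU or per task.

u64 scx_bpf_now(void) __ksym;

static u64 running_since;	/* illustrative: global for brevity */

void BPF_STRUCT_OPS(sketch_running, struct task_struct *p)
{
	running_since = scx_bpf_now();
}

void BPF_STRUCT_OPS(sketch_stopping, struct task_struct *p, bool runnable)
{
	/*
	 * scx_bpf_now() never goes backwards on a given CPU, so the delta
	 * is well defined as long as both reads happen on the same CPU.
	 */
	bpf_printk("ran for %llu ns", scx_bpf_now() - running_since);
}
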
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index b1675bb59fc4..4d022d17ac7d 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -57,15 +57,15 @@ static inline void init_sched_ext_class(void) {}
#endif /* CONFIG_SCHED_CLASS_EXT */
#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP)
-void __scx_update_idle(struct rq *rq, bool idle);
+void __scx_update_idle(struct rq *rq, bool idle, bool do_notify);
-static inline void scx_update_idle(struct rq *rq, bool idle)
+static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify)
{
if (scx_enabled())
- __scx_update_idle(rq, idle);
+ __scx_update_idle(rq, idle, do_notify);
}
#else
-static inline void scx_update_idle(struct rq *rq, bool idle) {}
+static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {}
#endif
#ifdef CONFIG_CGROUP_SCHED
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index aa0238ee4857..ce2e94ccad0c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -37,6 +37,7 @@
#include <linux/sched/cputime.h>
#include <linux/sched/isolation.h>
#include <linux/sched/nohz.h>
+#include <linux/sched/prio.h>
#include <linux/cpuidle.h>
#include <linux/interrupt.h>
@@ -51,6 +52,8 @@
#include <asm/switch_to.h>
+#include <uapi/linux/sched/types.h>
+
#include "sched.h"
#include "stats.h"
#include "autogroup.h"
@@ -130,7 +133,7 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
#endif
#ifdef CONFIG_SYSCTL
-static struct ctl_table sched_fair_sysctls[] = {
+static const struct ctl_table sched_fair_sysctls[] = {
#ifdef CONFIG_CFS_BANDWIDTH
{
.procname = "sched_cfs_bandwidth_slice_us",
@@ -523,7 +526,7 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
* Scheduling class tree data structure manipulation methods:
*/
-static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
+static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - max_vruntime);
if (delta > 0)
@@ -532,7 +535,7 @@ static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
return max_vruntime;
}
-static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
+static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - min_vruntime);
if (delta < 0)
@@ -689,21 +692,16 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
*
* XXX could add max_slice to the augmented data to track this.
*/
-static s64 entity_lag(u64 avruntime, struct sched_entity *se)
+static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
s64 vlag, limit;
- vlag = avruntime - se->vruntime;
- limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
-
- return clamp(vlag, -limit, limit);
-}
-
-static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
SCHED_WARN_ON(!se->on_rq);
- se->vlag = entity_lag(avg_vruntime(cfs_rq), se);
+ vlag = avg_vruntime(cfs_rq) - se->vruntime;
+ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+
+ se->vlag = clamp(vlag, -limit, limit);
}
/*
@@ -915,7 +913,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
* We can safely skip eligibility check if there is only one entity
* in this cfs_rq, saving some cycles.
*/
- if (cfs_rq->nr_running == 1)
+ if (cfs_rq->nr_queued == 1)
return curr && curr->on_rq ? curr : se;
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
@@ -1159,8 +1157,6 @@ static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
trace_sched_stat_runtime(p, delta_exec);
account_group_exec_runtime(p, delta_exec);
cgroup_account_cputime(p, delta_exec);
- if (p->dl_server)
- dl_server_update(p->dl_server, delta_exec);
}
static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr)
@@ -1237,17 +1233,22 @@ static void update_curr(struct cfs_rq *cfs_rq)
update_curr_task(p, delta_exec);
/*
- * Any fair task that runs outside of fair_server should
- * account against fair_server such that it can account for
- * this time and possibly avoid running this period.
+ * If the fair_server is active, we need to account for the
+	 * fair_server time whether or not the task is running on
+	 * behalf of the fair_server:
+ * - If the task is running on behalf of fair_server, we need
+ * to limit its time based on the assigned runtime.
+ * - Fair task that runs outside of fair_server should account
+ * against fair_server such that it can account for this time
+ * and possibly avoid running this period.
*/
- if (p->dl_server != &rq->fair_server)
+ if (dl_server_active(&rq->fair_server))
dl_server_update(&rq->fair_server, delta_exec);
}
account_cfs_rq_runtime(cfs_rq, delta_exec);
- if (cfs_rq->nr_running == 1)
+ if (cfs_rq->nr_queued == 1)
return;
if (resched || did_preempt_short(cfs_rq, curr)) {
@@ -2128,7 +2129,7 @@ static void update_numa_stats(struct task_numa_env *env,
ns->load += cpu_load(rq);
ns->runnable += cpu_runnable(rq);
ns->util += cpu_util_cfs(cpu);
- ns->nr_running += rq->cfs.h_nr_running;
+ ns->nr_running += rq->cfs.h_nr_runnable;
ns->compute_capacity += capacity_of(cpu);
if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) {
@@ -3679,9 +3680,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
list_add(&se->group_node, &rq->cfs_tasks);
}
#endif
- cfs_rq->nr_running++;
- if (se_is_idle(se))
- cfs_rq->idle_nr_running++;
+ cfs_rq->nr_queued++;
}
static void
@@ -3694,9 +3693,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
list_del_init(&se->group_node);
}
#endif
- cfs_rq->nr_running--;
- if (se_is_idle(se))
- cfs_rq->idle_nr_running--;
+ cfs_rq->nr_queued--;
}
/*
@@ -3771,137 +3768,32 @@ static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
#endif
-static void reweight_eevdf(struct sched_entity *se, u64 avruntime,
- unsigned long weight)
-{
- unsigned long old_weight = se->load.weight;
- s64 vlag, vslice;
-
- /*
- * VRUNTIME
- * --------
- *
- * COROLLARY #1: The virtual runtime of the entity needs to be
- * adjusted if re-weight at !0-lag point.
- *
- * Proof: For contradiction assume this is not true, so we can
- * re-weight without changing vruntime at !0-lag point.
- *
- * Weight VRuntime Avg-VRuntime
- * before w v V
- * after w' v' V'
- *
- * Since lag needs to be preserved through re-weight:
- *
- * lag = (V - v)*w = (V'- v')*w', where v = v'
- * ==> V' = (V - v)*w/w' + v (1)
- *
- * Let W be the total weight of the entities before reweight,
- * since V' is the new weighted average of entities:
- *
- * V' = (WV + w'v - wv) / (W + w' - w) (2)
- *
- * by using (1) & (2) we obtain:
- *
- * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
- * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
- * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
- * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
- *
- * Since we are doing at !0-lag point which means V != v, we
- * can simplify (3):
- *
- * ==> W / (W + w' - w) = w / w'
- * ==> Ww' = Ww + ww' - ww
- * ==> W * (w' - w) = w * (w' - w)
- * ==> W = w (re-weight indicates w' != w)
- *
- * So the cfs_rq contains only one entity, hence vruntime of
- * the entity @v should always equal to the cfs_rq's weighted
- * average vruntime @V, which means we will always re-weight
- * at 0-lag point, thus breach assumption. Proof completed.
- *
- *
- * COROLLARY #2: Re-weight does NOT affect weighted average
- * vruntime of all the entities.
- *
- * Proof: According to corollary #1, Eq. (1) should be:
- *
- * (V - v)*w = (V' - v')*w'
- * ==> v' = V' - (V - v)*w/w' (4)
- *
- * According to the weighted average formula, we have:
- *
- * V' = (WV - wv + w'v') / (W - w + w')
- * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
- * = (WV - wv + w'V' - Vw + wv) / (W - w + w')
- * = (WV + w'V' - Vw) / (W - w + w')
- *
- * ==> V'*(W - w + w') = WV + w'V' - Vw
- * ==> V' * (W - w) = (W - w) * V (5)
- *
- * If the entity is the only one in the cfs_rq, then reweight
- * always occurs at 0-lag point, so V won't change. Or else
- * there are other entities, hence W != w, then Eq. (5) turns
- * into V' = V. So V won't change in either case, proof done.
- *
- *
- * So according to corollary #1 & #2, the effect of re-weight
- * on vruntime should be:
- *
- * v' = V' - (V - v) * w / w' (4)
- * = V - (V - v) * w / w'
- * = V - vl * w / w'
- * = V - vl'
- */
- if (avruntime != se->vruntime) {
- vlag = entity_lag(avruntime, se);
- vlag = div_s64(vlag * old_weight, weight);
- se->vruntime = avruntime - vlag;
- }
-
- /*
- * DEADLINE
- * --------
- *
- * When the weight changes, the virtual time slope changes and
- * we should adjust the relative virtual deadline accordingly.
- *
- * d' = v' + (d - v)*w/w'
- * = V' - (V - v)*w/w' + (d - v)*w/w'
- * = V - (V - v)*w/w' + (d - v)*w/w'
- * = V + (d - V)*w/w'
- */
- vslice = (s64)(se->deadline - avruntime);
- vslice = div_s64(vslice * old_weight, weight);
- se->deadline = avruntime + vslice;
-}
+static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
bool curr = cfs_rq->curr == se;
- u64 avruntime;
if (se->on_rq) {
/* commit outstanding execution time */
update_curr(cfs_rq);
- avruntime = avg_vruntime(cfs_rq);
+ update_entity_lag(cfs_rq, se);
+ se->deadline -= se->vruntime;
+ se->rel_deadline = 1;
if (!curr)
__dequeue_entity(cfs_rq, se);
update_load_sub(&cfs_rq->load, se->load.weight);
}
dequeue_load_avg(cfs_rq, se);
- if (se->on_rq) {
- reweight_eevdf(se, avruntime, weight);
- } else {
- /*
- * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
- * we need to scale se->vlag when w_i changes.
- */
- se->vlag = div_s64(se->vlag * se->load.weight, weight);
- }
+ /*
+ * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
+ * we need to scale se->vlag when w_i changes.
+ */
+ se->vlag = div_s64(se->vlag * se->load.weight, weight);
+ if (se->rel_deadline)
+ se->deadline = div_s64(se->deadline * se->load.weight, weight);
update_load_set(&se->load, weight);
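To make the rescaling above concrete, a worked example with illustrative numbers (not taken from the patch):

/*
 * Stored lag is se->vlag = V - v_i, while the true lag is w_i * (V - v_i).
 * Example: V = 100, v_i = 80, so the stored vlag is 20.  At weight 2048
 * the true lag is 2048 * 20.  Reweighting to 1024 preserves that true lag
 * by rescaling the stored value:
 *
 *	vlag' = div_s64(20 * 2048, 1024) = 40,  and  1024 * 40 == 2048 * 20.
 *
 * The relative deadline stored in se->deadline is rescaled the same way,
 * since the virtual slice scales with 1/weight.
 */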
@@ -3916,6 +3808,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
enqueue_load_avg(cfs_rq, se);
if (se->on_rq) {
update_load_add(&cfs_rq->load, se->load.weight);
+ place_entity(cfs_rq, se, 0);
if (!curr)
__enqueue_entity(cfs_rq, se);
@@ -4062,7 +3955,11 @@ static void update_cfs_group(struct sched_entity *se)
struct cfs_rq *gcfs_rq = group_cfs_rq(se);
long shares;
- if (!gcfs_rq)
+ /*
+ * When a group becomes empty, preserve its weight. This matters for
+ * DELAY_DEQUEUE.
+ */
+ if (!gcfs_rq || !gcfs_rq->load.weight)
return;
if (throttled_hierarchy(gcfs_rq))
@@ -5230,7 +5127,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
{
- return !cfs_rq->nr_running;
+ return !cfs_rq->nr_queued;
}
#define UPDATE_TG 0x0
@@ -5268,6 +5165,22 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
#endif /* CONFIG_SMP */
+void __setparam_fair(struct task_struct *p, const struct sched_attr *attr)
+{
+ struct sched_entity *se = &p->se;
+
+ p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+ if (attr->sched_runtime) {
+ se->custom_slice = 1;
+ se->slice = clamp_t(u64, attr->sched_runtime,
+ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */
+ NSEC_PER_MSEC*100); /* HZ=100 / 10 */
+ } else {
+ se->custom_slice = 0;
+ se->slice = sysctl_sched_base_slice;
+ }
+}
+
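For reference, a hedged userspace sketch of how the custom slice handled by __setparam_fair() could be requested; the raw syscall wrapper is an assumption since glibc does not expose sched_setattr():

#include <linux/sched/types.h>	/* uapi struct sched_attr */
#include <sched.h>		/* SCHED_OTHER */
#include <sys/syscall.h>
#include <unistd.h>

/* For fair-policy tasks, sched_runtime is interpreted as the requested
 * slice; __setparam_fair() clamps it to the [100us, 100ms] range above. */
static int set_custom_slice(pid_t pid, unsigned long long slice_ns)
{
	struct sched_attr attr = {
		.size		= sizeof(attr),
		.sched_policy	= SCHED_OTHER,
		.sched_runtime	= slice_ns,
	};

	return syscall(SYS_sched_setattr, pid, &attr, 0);
}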
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
@@ -5286,7 +5199,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*
* EEVDF: placement strategy #1 / #2
*/
- if (sched_feat(PLACE_LAG) && cfs_rq->nr_running && se->vlag) {
+ if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
struct sched_entity *curr = cfs_rq->curr;
unsigned long load;
@@ -5356,7 +5269,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
se->vruntime = vruntime - lag;
- if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
+ if (se->rel_deadline) {
se->deadline += se->vruntime;
se->rel_deadline = 0;
return;
@@ -5379,8 +5292,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
-static inline bool cfs_bandwidth_used(void);
-
static void
requeue_delayed_entity(struct sched_entity *se);
@@ -5402,7 +5313,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* When enqueuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - For group_entity, update its runnable_weight to reflect the new
- * h_nr_running of its group cfs_rq.
+ * h_nr_runnable of its group cfs_rq.
* - For group_entity, update its weight to reflect the new share of
* its group cfs_rq
* - Add its new weight to cfs_rq->load.weight
@@ -5435,7 +5346,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
- if (cfs_rq->nr_running == 1) {
+ if (cfs_rq->nr_queued == 1) {
check_enqueue_throttle(cfs_rq);
if (!throttled_hierarchy(cfs_rq)) {
list_add_leaf_cfs_rq(cfs_rq);
@@ -5471,9 +5382,33 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
-static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
+static void set_delayed(struct sched_entity *se)
+{
+ se->sched_delayed = 1;
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ cfs_rq->h_nr_runnable--;
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+}
+
+static void clear_delayed(struct sched_entity *se)
{
se->sched_delayed = 0;
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ cfs_rq->h_nr_runnable++;
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+}
+
+static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
+{
+ clear_delayed(se);
if (sched_feat(DELAY_ZERO) && se->vlag > 0)
se->vlag = 0;
}
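To illustrate the hierarchical accounting above with an assumed two-level cgroup layout (not from the patch):

/*
 * root cfs_rq <- tg_A's cfs_rq <- task se (sched_delayed)
 *
 * set_delayed(task_se) walks task_se and then tg_A's se, decrementing
 * h_nr_runnable on tg_A's cfs_rq and on the root cfs_rq, stopping early
 * if it meets a throttled cfs_rq.  clear_delayed() is the exact inverse.
 * A delayed-dequeued task therefore still counts in h_nr_queued but no
 * longer in h_nr_runnable anywhere up the hierarchy.
 */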
@@ -5482,8 +5417,10 @@ static bool
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
bool sleep = flags & DEQUEUE_SLEEP;
+ int action = UPDATE_TG;
update_curr(cfs_rq);
+ clear_buddies(cfs_rq, se);
if (flags & DEQUEUE_DELAYED) {
SCHED_WARN_ON(!se->sched_delayed);
@@ -5500,15 +5437,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (sched_feat(DELAY_DEQUEUE) && delay &&
!entity_eligible(cfs_rq, se)) {
- if (cfs_rq->next == se)
- cfs_rq->next = NULL;
update_load_avg(cfs_rq, se, 0);
- se->sched_delayed = 1;
+ set_delayed(se);
return false;
}
}
- int action = UPDATE_TG;
if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
action |= DO_DETACH;
@@ -5516,7 +5450,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* When dequeuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - For group_entity, update its runnable_weight to reflect the new
- * h_nr_running of its group cfs_rq.
+ * h_nr_runnable of its group cfs_rq.
* - Subtract its previous weight from cfs_rq->load.weight.
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
@@ -5526,8 +5460,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_stats_dequeue_fair(cfs_rq, se, flags);
- clear_buddies(cfs_rq, se);
-
update_entity_lag(cfs_rq, se);
if (sched_feat(PLACE_REL_DEADLINE) && !sleep) {
se->deadline -= se->vruntime;
@@ -5556,7 +5488,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & DEQUEUE_DELAYED)
finish_delayed_dequeue_entity(se);
- if (cfs_rq->nr_running == 0)
+ if (cfs_rq->nr_queued == 0)
update_idle_cfs_rq_clock_pelt(cfs_rq);
return true;
@@ -5618,17 +5550,19 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
static struct sched_entity *
pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
{
+ struct sched_entity *se;
+
/*
- * Enabling NEXT_BUDDY will affect latency but not fairness.
+ * Picking the ->next buddy will affect latency but not fairness.
*/
- if (sched_feat(NEXT_BUDDY) &&
+ if (sched_feat(PICK_BUDDY) &&
cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
/* ->next will never be delayed */
SCHED_WARN_ON(cfs_rq->next->sched_delayed);
return cfs_rq->next;
}
- struct sched_entity *se = pick_eevdf(cfs_rq);
+ se = pick_eevdf(cfs_rq);
if (se->sched_delayed) {
dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
/*
@@ -5904,7 +5838,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
list_del_leaf_cfs_rq(cfs_rq);
SCHED_WARN_ON(cfs_rq->throttled_clock_self);
- if (cfs_rq->nr_running)
+ if (cfs_rq->nr_queued)
cfs_rq->throttled_clock_self = rq_clock(rq);
}
cfs_rq->throttle_count++;
@@ -5917,8 +5851,8 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, idle_task_delta, dequeue = 1;
- long rq_h_nr_running = rq->cfs.h_nr_running;
+ long queued_delta, runnable_delta, idle_delta, dequeue = 1;
+ long rq_h_nr_queued = rq->cfs.h_nr_queued;
raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
@@ -5948,8 +5882,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
- task_delta = cfs_rq->h_nr_running;
- idle_task_delta = cfs_rq->idle_h_nr_running;
+ queued_delta = cfs_rq->h_nr_queued;
+ runnable_delta = cfs_rq->h_nr_runnable;
+ idle_delta = cfs_rq->h_nr_idle;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
int flags;
@@ -5969,10 +5904,11 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
dequeue_entity(qcfs_rq, se, flags);
if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_task_delta = cfs_rq->h_nr_running;
+ idle_delta = cfs_rq->h_nr_queued;
- qcfs_rq->h_nr_running -= task_delta;
- qcfs_rq->idle_h_nr_running -= idle_task_delta;
+ qcfs_rq->h_nr_queued -= queued_delta;
+ qcfs_rq->h_nr_runnable -= runnable_delta;
+ qcfs_rq->h_nr_idle -= idle_delta;
if (qcfs_rq->load.weight) {
/* Avoid re-evaluating load for this entity: */
@@ -5991,17 +5927,18 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
se_update_runnable(se);
if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_task_delta = cfs_rq->h_nr_running;
+ idle_delta = cfs_rq->h_nr_queued;
- qcfs_rq->h_nr_running -= task_delta;
- qcfs_rq->idle_h_nr_running -= idle_task_delta;
+ qcfs_rq->h_nr_queued -= queued_delta;
+ qcfs_rq->h_nr_runnable -= runnable_delta;
+ qcfs_rq->h_nr_idle -= idle_delta;
}
/* At this point se is NULL and we are at root level*/
- sub_nr_running(rq, task_delta);
+ sub_nr_running(rq, queued_delta);
/* Stop the fair server if throttling resulted in no runnable tasks */
- if (rq_h_nr_running && !rq->cfs.h_nr_running)
+ if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
dl_server_stop(&rq->fair_server);
done:
/*
@@ -6010,7 +5947,7 @@ done:
*/
cfs_rq->throttled = 1;
SCHED_WARN_ON(cfs_rq->throttled_clock);
- if (cfs_rq->nr_running)
+ if (cfs_rq->nr_queued)
cfs_rq->throttled_clock = rq_clock(rq);
return true;
}
@@ -6020,8 +5957,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, idle_task_delta;
- long rq_h_nr_running = rq->cfs.h_nr_running;
+ long queued_delta, runnable_delta, idle_delta;
+ long rq_h_nr_queued = rq->cfs.h_nr_queued;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -6054,8 +5991,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
goto unthrottle_throttle;
}
- task_delta = cfs_rq->h_nr_running;
- idle_task_delta = cfs_rq->idle_h_nr_running;
+ queued_delta = cfs_rq->h_nr_queued;
+ runnable_delta = cfs_rq->h_nr_runnable;
+ idle_delta = cfs_rq->h_nr_idle;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
@@ -6069,10 +6007,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_task_delta = cfs_rq->h_nr_running;
+ idle_delta = cfs_rq->h_nr_queued;
- qcfs_rq->h_nr_running += task_delta;
- qcfs_rq->idle_h_nr_running += idle_task_delta;
+ qcfs_rq->h_nr_queued += queued_delta;
+ qcfs_rq->h_nr_runnable += runnable_delta;
+ qcfs_rq->h_nr_idle += idle_delta;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(qcfs_rq))
@@ -6086,10 +6025,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
se_update_runnable(se);
if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_task_delta = cfs_rq->h_nr_running;
+ idle_delta = cfs_rq->h_nr_queued;
- qcfs_rq->h_nr_running += task_delta;
- qcfs_rq->idle_h_nr_running += idle_task_delta;
+ qcfs_rq->h_nr_queued += queued_delta;
+ qcfs_rq->h_nr_runnable += runnable_delta;
+ qcfs_rq->h_nr_idle += idle_delta;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(qcfs_rq))
@@ -6097,17 +6037,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
}
/* Start the fair server if un-throttling resulted in new runnable tasks */
- if (!rq_h_nr_running && rq->cfs.h_nr_running)
+ if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
dl_server_start(&rq->fair_server);
/* At this point se is NULL and we are at root level*/
- add_nr_running(rq, task_delta);
+ add_nr_running(rq, queued_delta);
unthrottle_throttle:
assert_list_leaf_cfs_rq(rq);
/* Determine whether we need to wake up potentially idle CPU: */
- if (rq->curr == rq->idle && rq->cfs.nr_running)
+ if (rq->curr == rq->idle && rq->cfs.nr_queued)
resched_curr(rq);
}
@@ -6408,7 +6348,7 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
if (!cfs_bandwidth_used())
return;
- if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
+ if (!cfs_rq->runtime_enabled || cfs_rq->nr_queued)
return;
__return_cfs_rq_runtime(cfs_rq);
@@ -6679,6 +6619,10 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
lockdep_assert_rq_held(rq);
+	/* Do not unthrottle for an active CPU */
+ if (cpumask_test_cpu(cpu_of(rq), cpu_active_mask))
+ return;
+
/*
* The rq clock has already been updated in the
* set_rq_offline(), so we should skip updating
@@ -6694,18 +6638,20 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
continue;
/*
- * clock_task is not advancing so we just need to make sure
- * there's some valid quota amount
- */
- cfs_rq->runtime_remaining = 1;
- /*
* Offline rq is schedulable till CPU is completely disabled
* in take_cpu_down(), so we prevent new cfs throttling here.
*/
cfs_rq->runtime_enabled = 0;
- if (cfs_rq_throttled(cfs_rq))
- unthrottle_cfs_rq(cfs_rq);
+ if (!cfs_rq_throttled(cfs_rq))
+ continue;
+
+ /*
+ * clock_task is not advancing so we just need to make sure
+ * there's some valid quota amount
+ */
+ cfs_rq->runtime_remaining = 1;
+ unthrottle_cfs_rq(cfs_rq);
}
rcu_read_unlock();
@@ -6754,11 +6700,6 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
#else /* CONFIG_CFS_BANDWIDTH */
-static inline bool cfs_bandwidth_used(void)
-{
- return false;
-}
-
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
@@ -6816,7 +6757,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
SCHED_WARN_ON(task_rq(p) != rq);
- if (rq->cfs.h_nr_running > 1) {
+ if (rq->cfs.h_nr_queued > 1) {
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
u64 slice = se->slice;
s64 delta = slice - ran;
@@ -6904,7 +6845,7 @@ static inline void check_update_overutilized_status(struct rq *rq) { }
/* Runqueue only has SCHED_IDLE tasks enqueued */
static int sched_idle_rq(struct rq *rq)
{
- return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
+ return unlikely(rq->nr_running == rq->cfs.h_nr_idle &&
rq->nr_running);
}
@@ -6931,19 +6872,19 @@ requeue_delayed_entity(struct sched_entity *se)
if (sched_feat(DELAY_ZERO)) {
update_entity_lag(cfs_rq, se);
if (se->vlag > 0) {
- cfs_rq->nr_running--;
+ cfs_rq->nr_queued--;
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->vlag = 0;
place_entity(cfs_rq, se, 0);
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
- cfs_rq->nr_running++;
+ cfs_rq->nr_queued++;
}
}
update_load_avg(cfs_rq, se, 0);
- se->sched_delayed = 0;
+ clear_delayed(se);
}
/*
@@ -6956,9 +6897,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
- int idle_h_nr_running = task_has_idle_policy(p);
+ int h_nr_idle = task_has_idle_policy(p);
+ int h_nr_runnable = 1;
int task_new = !(flags & ENQUEUE_WAKEUP);
- int rq_h_nr_running = rq->cfs.h_nr_running;
+ int rq_h_nr_queued = rq->cfs.h_nr_queued;
u64 slice = 0;
/*
@@ -6983,6 +6925,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (p->in_iowait)
cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
+ if (task_new && se->sched_delayed)
+ h_nr_runnable = 0;
+
for_each_sched_entity(se) {
if (se->on_rq) {
if (se->sched_delayed)
@@ -7003,11 +6948,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
enqueue_entity(cfs_rq, se, flags);
slice = cfs_rq_min_slice(cfs_rq);
- cfs_rq->h_nr_running++;
- cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ cfs_rq->h_nr_runnable += h_nr_runnable;
+ cfs_rq->h_nr_queued++;
+ cfs_rq->h_nr_idle += h_nr_idle;
if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = 1;
+ h_nr_idle = 1;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
@@ -7026,18 +6972,19 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
se->slice = slice;
slice = cfs_rq_min_slice(cfs_rq);
- cfs_rq->h_nr_running++;
- cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ cfs_rq->h_nr_runnable += h_nr_runnable;
+ cfs_rq->h_nr_queued++;
+ cfs_rq->h_nr_idle += h_nr_idle;
if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = 1;
+ h_nr_idle = 1;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle;
}
- if (!rq_h_nr_running && rq->cfs.h_nr_running) {
+ if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
/* Account for idle runtime */
if (!rq->nr_running)
dl_server_update_idle_time(rq, rq->curr);
@@ -7084,19 +7031,22 @@ static void set_next_buddy(struct sched_entity *se);
static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
{
bool was_sched_idle = sched_idle_rq(rq);
- int rq_h_nr_running = rq->cfs.h_nr_running;
+ int rq_h_nr_queued = rq->cfs.h_nr_queued;
bool task_sleep = flags & DEQUEUE_SLEEP;
bool task_delayed = flags & DEQUEUE_DELAYED;
struct task_struct *p = NULL;
- int idle_h_nr_running = 0;
- int h_nr_running = 0;
+ int h_nr_idle = 0;
+ int h_nr_queued = 0;
+ int h_nr_runnable = 0;
struct cfs_rq *cfs_rq;
u64 slice = 0;
if (entity_is_task(se)) {
p = task_of(se);
- h_nr_running = 1;
- idle_h_nr_running = task_has_idle_policy(p);
+ h_nr_queued = 1;
+ h_nr_idle = task_has_idle_policy(p);
+ if (task_sleep || task_delayed || !se->sched_delayed)
+ h_nr_runnable = 1;
} else {
cfs_rq = group_cfs_rq(se);
slice = cfs_rq_min_slice(cfs_rq);
@@ -7112,11 +7062,12 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
break;
}
- cfs_rq->h_nr_running -= h_nr_running;
- cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ cfs_rq->h_nr_runnable -= h_nr_runnable;
+ cfs_rq->h_nr_queued -= h_nr_queued;
+ cfs_rq->h_nr_idle -= h_nr_idle;
if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = h_nr_running;
+ h_nr_idle = h_nr_queued;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
@@ -7150,20 +7101,21 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
se->slice = slice;
slice = cfs_rq_min_slice(cfs_rq);
- cfs_rq->h_nr_running -= h_nr_running;
- cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ cfs_rq->h_nr_runnable -= h_nr_runnable;
+ cfs_rq->h_nr_queued -= h_nr_queued;
+ cfs_rq->h_nr_idle -= h_nr_idle;
if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = h_nr_running;
+ h_nr_idle = h_nr_queued;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
return 0;
}
- sub_nr_running(rq, h_nr_running);
+ sub_nr_running(rq, h_nr_queued);
- if (rq_h_nr_running && !rq->cfs.h_nr_running)
+ if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
dl_server_stop(&rq->fair_server);
/* balance early to pull high priority tasks */
@@ -8780,7 +8732,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
return;
- if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) {
+ if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) {
set_next_buddy(pse);
}
@@ -8852,7 +8804,7 @@ static struct task_struct *pick_task_fair(struct rq *rq)
again:
cfs_rq = &rq->cfs;
- if (!cfs_rq->nr_running)
+ if (!cfs_rq->nr_queued)
return NULL;
do {
@@ -8969,7 +8921,7 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_stru
static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
{
- return !!dl_se->rq->cfs.nr_running;
+ return !!dl_se->rq->cfs.nr_queued;
}
static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
@@ -9300,43 +9252,43 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
#ifdef CONFIG_NUMA_BALANCING
/*
- * Returns 1, if task migration degrades locality
- * Returns 0, if task migration improves locality i.e migration preferred.
- * Returns -1, if task migration is not affected by locality.
+ * Returns a positive value, if task migration degrades locality.
+ * Returns 0, if task migration is not affected by locality.
+ * Returns a negative value, if task migration improves locality, i.e. migration is preferred.
*/
-static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
+static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
struct numa_group *numa_group = rcu_dereference(p->numa_group);
unsigned long src_weight, dst_weight;
int src_nid, dst_nid, dist;
if (!static_branch_likely(&sched_numa_balancing))
- return -1;
+ return 0;
if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
- return -1;
+ return 0;
src_nid = cpu_to_node(env->src_cpu);
dst_nid = cpu_to_node(env->dst_cpu);
if (src_nid == dst_nid)
- return -1;
+ return 0;
/* Migrating away from the preferred node is always bad. */
if (src_nid == p->numa_preferred_nid) {
if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
return 1;
else
- return -1;
+ return 0;
}
/* Encourage migration to the preferred node. */
if (dst_nid == p->numa_preferred_nid)
- return 0;
+ return -1;
/* Leaving a core idle is often worse than degrading locality. */
if (env->idle == CPU_IDLE)
- return -1;
+ return 0;
dist = node_distance(src_nid, dst_nid);
if (numa_group) {
@@ -9347,37 +9299,77 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
dst_weight = task_weight(p, dst_nid, dist);
}
- return dst_weight < src_weight;
+ return src_weight - dst_weight;
}
#else
-static inline int migrate_degrades_locality(struct task_struct *p,
+static inline long migrate_degrades_locality(struct task_struct *p,
struct lb_env *env)
{
- return -1;
+ return 0;
}
#endif
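A quick illustration of the new sign convention, using made-up weights:

/*
 * Example: task_weight(p, src_nid) = 120, task_weight(p, dst_nid) = 150,
 * so migrate_degrades_locality() returns 120 - 150 = -30.  In
 * can_migrate_task() below, "hot = degrades > 0" is then false, so the
 * locality-improving migration is allowed without falling back to
 * task_hot(); a positive return instead marks the task as cache-hot.
 */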
/*
+ * Check whether the task is ineligible on the destination cpu
+ *
+ * When the PLACE_LAG scheduling feature is enabled and
+ * dst_cfs_rq->nr_queued is non-zero, if the task
+ * is ineligible, it will also be ineligible when
+ * it is migrated to the destination cpu.
+ */
+static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_cpu)
+{
+ struct cfs_rq *dst_cfs_rq;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ dst_cfs_rq = task_group(p)->cfs_rq[dest_cpu];
+#else
+ dst_cfs_rq = &cpu_rq(dest_cpu)->cfs;
+#endif
+ if (sched_feat(PLACE_LAG) && dst_cfs_rq->nr_queued &&
+ !entity_eligible(task_cfs_rq(p), &p->se))
+ return 1;
+
+ return 0;
+}
+
+/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
*/
static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
- int tsk_cache_hot;
+ long degrades, hot;
lockdep_assert_rq_held(env->src_rq);
+ if (p->sched_task_hot)
+ p->sched_task_hot = 0;
/*
* We do not migrate tasks that are:
- * 1) throttled_lb_pair, or
- * 2) cannot be migrated to this CPU due to cpus_ptr, or
- * 3) running (obviously), or
- * 4) are cache-hot on their current CPU.
+ * 1) delayed dequeued unless we migrate load, or
+ * 2) throttled_lb_pair, or
+ * 3) cannot be migrated to this CPU due to cpus_ptr, or
+ * 4) running (obviously), or
+ * 5) are cache-hot on their current CPU.
*/
+ if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
+ return 0;
+
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
+ /*
+ * We want to prioritize the migration of eligible tasks.
+ * For ineligible tasks we soft-limit them and only allow
+ * them to migrate when nr_balance_failed is non-zero to
+ * avoid load-balancing trying very hard to balance the load.
+ */
+ if (!env->sd->nr_balance_failed &&
+ task_is_ineligible_on_dst_cpu(p, env->dst_cpu))
+ return 0;
+
/* Disregard percpu kthreads; they are where they need to be. */
if (kthread_is_per_cpu(p))
return 0;
@@ -9433,16 +9425,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (env->flags & LBF_ACTIVE_LB)
return 1;
- tsk_cache_hot = migrate_degrades_locality(p, env);
- if (tsk_cache_hot == -1)
- tsk_cache_hot = task_hot(p, env);
+ degrades = migrate_degrades_locality(p, env);
+ if (!degrades)
+ hot = task_hot(p, env);
+ else
+ hot = degrades > 0;
- if (tsk_cache_hot <= 0 ||
- env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
- if (tsk_cache_hot == 1) {
- schedstat_inc(env->sd->lb_hot_gained[env->idle]);
- schedstat_inc(p->stats.nr_forced_migrations);
- }
+ if (!hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
+ if (hot)
+ p->sched_task_hot = 1;
return 1;
}
@@ -9457,6 +9448,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
{
lockdep_assert_rq_held(env->src_rq);
+ if (p->sched_task_hot) {
+ p->sched_task_hot = 0;
+ schedstat_inc(env->sd->lb_hot_gained[env->idle]);
+ schedstat_inc(p->stats.nr_forced_migrations);
+ }
+
deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, env->dst_cpu);
}
@@ -9617,6 +9614,9 @@ static int detach_tasks(struct lb_env *env)
continue;
next:
+ if (p->sched_task_hot)
+ schedstat_inc(p->stats.nr_failed_migrations_hot);
+
list_move(&p->se.group_node, tasks);
}
@@ -9759,7 +9759,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
update_tg_load_avg(cfs_rq);
- if (cfs_rq->nr_running == 0)
+ if (cfs_rq->nr_queued == 0)
update_idle_cfs_rq_clock_pelt(cfs_rq);
if (cfs_rq == &rq->cfs)
@@ -10291,7 +10291,7 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
* When there is more than 1 task, the group_overloaded case already
* takes care of cpu with reduced capacity
*/
- if (rq->cfs.h_nr_running != 1)
+ if (rq->cfs.h_nr_runnable != 1)
return false;
return check_cpu_capacity(rq, sd);
@@ -10313,7 +10313,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
bool *sg_overloaded,
bool *sg_overutilized)
{
- int i, nr_running, local_group;
+ int i, nr_running, local_group, sd_flags = env->sd->flags;
+ bool balancing_at_rd = !env->sd->parent;
memset(sgs, 0, sizeof(*sgs));
@@ -10326,21 +10327,14 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_load += load;
sgs->group_util += cpu_util_cfs(i);
sgs->group_runnable += cpu_runnable(rq);
- sgs->sum_h_nr_running += rq->cfs.h_nr_running;
+ sgs->sum_h_nr_running += rq->cfs.h_nr_runnable;
nr_running = rq->nr_running;
sgs->sum_nr_running += nr_running;
- if (nr_running > 1)
- *sg_overloaded = 1;
-
if (cpu_overutilized(i))
*sg_overutilized = 1;
-#ifdef CONFIG_NUMA_BALANCING
- sgs->nr_numa_running += rq->nr_numa_running;
- sgs->nr_preferred_running += rq->nr_preferred_running;
-#endif
/*
* No need to call idle_cpu() if nr_running is not 0
*/
@@ -10350,10 +10344,21 @@ static inline void update_sg_lb_stats(struct lb_env *env,
continue;
}
+ /* Overload indicator is only updated at root domain */
+ if (balancing_at_rd && nr_running > 1)
+ *sg_overloaded = 1;
+
+#ifdef CONFIG_NUMA_BALANCING
+ /* Only fbq_classify_group() uses this to classify NUMA groups */
+ if (sd_flags & SD_NUMA) {
+ sgs->nr_numa_running += rq->nr_numa_running;
+ sgs->nr_preferred_running += rq->nr_preferred_running;
+ }
+#endif
if (local_group)
continue;
- if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
+ if (sd_flags & SD_ASYM_CPUCAPACITY) {
/* Check for a misfit task on the cpu */
if (sgs->group_misfit_task_load < rq->misfit_task_load) {
sgs->group_misfit_task_load = rq->misfit_task_load;
@@ -10641,7 +10646,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
sgs->group_util += cpu_util_without(i, p);
sgs->group_runnable += cpu_runnable_without(rq, p);
local = task_running_on_cpu(i, p);
- sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
+ sgs->sum_h_nr_running += rq->cfs.h_nr_runnable - local;
nr_running = rq->nr_running - local;
sgs->sum_nr_running += nr_running;
@@ -11423,7 +11428,7 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env,
if (rt > env->fbq_type)
continue;
- nr_running = rq->cfs.h_nr_running;
+ nr_running = rq->cfs.h_nr_runnable;
if (!nr_running)
continue;
@@ -11582,7 +11587,7 @@ static int need_active_balance(struct lb_env *env)
* available on dst_cpu.
*/
if (env->idle &&
- (env->src_rq->cfs.h_nr_running == 1)) {
+ (env->src_rq->cfs.h_nr_runnable == 1)) {
if ((check_cpu_capacity(env->src_rq, sd)) &&
(capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
return 1;
@@ -11662,6 +11667,28 @@ static int should_we_balance(struct lb_env *env)
return group_balance_cpu(sg) == env->dst_cpu;
}
+static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd,
+ enum cpu_idle_type idle)
+{
+ if (!schedstat_enabled())
+ return;
+
+ switch (env->migration_type) {
+ case migrate_load:
+ __schedstat_add(sd->lb_imbalance_load[idle], env->imbalance);
+ break;
+ case migrate_util:
+ __schedstat_add(sd->lb_imbalance_util[idle], env->imbalance);
+ break;
+ case migrate_task:
+ __schedstat_add(sd->lb_imbalance_task[idle], env->imbalance);
+ break;
+ case migrate_misfit:
+ __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance);
+ break;
+ }
+}
+
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
@@ -11712,7 +11739,7 @@ redo:
WARN_ON_ONCE(busiest == env.dst_rq);
- schedstat_add(sd->lb_imbalance[idle], env.imbalance);
+ update_lb_imbalance_stat(&env, sd, idle);
env.src_cpu = busiest->cpu;
env.src_rq = busiest;
@@ -12210,16 +12237,13 @@ static inline int on_null_domain(struct rq *rq)
* - When one of the busy CPUs notices that there may be an idle rebalancing
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
- *
- * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set
- * anywhere yet.
*/
static inline int find_new_ilb(void)
{
const struct cpumask *hk_mask;
int ilb_cpu;
- hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
+ hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
@@ -12237,7 +12261,8 @@ static inline int find_new_ilb(void)
* Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU
* SMP function call (IPI).
*
- * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
+ * We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set
+ * (if there is one).
*/
static void kick_ilb(unsigned int flags)
{
@@ -12325,7 +12350,7 @@ static void nohz_balancer_kick(struct rq *rq)
* If there's a runnable CFS task and the current CPU has reduced
* capacity, kick the ILB to see if there's a better CPU to run on:
*/
- if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
+ if (rq->cfs.h_nr_runnable >= 1 && check_cpu_capacity(rq, sd)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
@@ -12457,10 +12482,6 @@ void nohz_balance_enter_idle(int cpu)
if (!cpu_active(cpu))
return;
- /* Spare idle load balancing on CPUs that don't want to be disturbed: */
- if (!housekeeping_cpu(cpu, HK_TYPE_SCHED))
- return;
-
/*
* Can be set safely without rq->lock held
* If a clear happens, it will have evaluated last additions because
@@ -12680,13 +12701,6 @@ static void nohz_newidle_balance(struct rq *this_rq)
{
int this_cpu = this_rq->cpu;
- /*
- * This CPU doesn't want to be disturbed by scheduler
- * housekeeping
- */
- if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED))
- return;
-
/* Will wake up very soon. No time for doing anything else*/
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;
@@ -12823,11 +12837,11 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
* have been enqueued in the meantime. Since we're not going idle,
* pretend we pulled a task.
*/
- if (this_rq->cfs.h_nr_running && !pulled_task)
+ if (this_rq->cfs.h_nr_queued && !pulled_task)
pulled_task = 1;
/* Is there a task of a high priority class? */
- if (this_rq->nr_running != this_rq->cfs.h_nr_running)
+ if (this_rq->nr_running != this_rq->cfs.h_nr_queued)
pulled_task = -1;
out:
@@ -12848,9 +12862,9 @@ out:
/*
* This softirq handler is triggered via SCHED_SOFTIRQ from two places:
*
- * - directly from the local scheduler_tick() for periodic load balancing
+ * - directly from the local sched_tick() for periodic load balancing
*
- * - indirectly from a remote scheduler_tick() for NOHZ idle balancing
+ * - indirectly from a remote sched_tick() for NOHZ idle balancing
* through the SMP cross-call nohz_csd_func()
*/
static __latent_entropy void sched_balance_softirq(void)
@@ -12941,7 +12955,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
* if we need to give up the CPU.
*/
- if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
+ if (rq->core->core_forceidle_count && rq->cfs.nr_queued == 1 &&
__entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
resched_curr(rq);
}
@@ -13085,7 +13099,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
if (!task_on_rq_queued(p))
return;
- if (rq->cfs.nr_running == 1)
+ if (rq->cfs.nr_queued == 1)
return;
/*
@@ -13495,7 +13509,7 @@ int sched_group_set_idle(struct task_group *tg, long idle)
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
struct sched_entity *se = tg->se[i];
- struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i];
+ struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
long idle_task_delta;
struct rq_flags rf;
@@ -13506,16 +13520,8 @@ int sched_group_set_idle(struct task_group *tg, long idle)
if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
goto next_cpu;
- if (se->on_rq) {
- parent_cfs_rq = cfs_rq_of(se);
- if (cfs_rq_is_idle(grp_cfs_rq))
- parent_cfs_rq->idle_nr_running++;
- else
- parent_cfs_rq->idle_nr_running--;
- }
-
- idle_task_delta = grp_cfs_rq->h_nr_running -
- grp_cfs_rq->idle_h_nr_running;
+ idle_task_delta = grp_cfs_rq->h_nr_queued -
+ grp_cfs_rq->h_nr_idle;
if (!cfs_rq_is_idle(grp_cfs_rq))
idle_task_delta *= -1;
@@ -13525,7 +13531,7 @@ int sched_group_set_idle(struct task_group *tg, long idle)
if (!se->on_rq)
break;
- cfs_rq->idle_h_nr_running += idle_task_delta;
+ cfs_rq->h_nr_idle += idle_task_delta;
/* Already accounted at parent level and above. */
if (cfs_rq_is_idle(cfs_rq))
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index a3d331dd2d8f..3c12d9f93331 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -32,6 +32,15 @@ SCHED_FEAT(PREEMPT_SHORT, true)
SCHED_FEAT(NEXT_BUDDY, false)
/*
+ * Allow completely ignoring cfs_rq->next, which can be set from various
+ * places:
+ * - NEXT_BUDDY (wakeup preemption)
+ * - yield_to_task()
+ * - cgroup dequeue / pick
+ */
+SCHED_FEAT(PICK_BUDDY, true)
+
+/*
* Consider buddies to be cache hot, decreases the likeliness of a
* cache buddy being migrated away, increases cache locality.
*/
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 621696269584..2c85c86b455f 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -452,19 +452,20 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
dl_server_update_idle_time(rq, prev);
- scx_update_idle(rq, false);
+ scx_update_idle(rq, false, true);
}
static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
{
update_idle_core(rq);
- scx_update_idle(rq, true);
+ scx_update_idle(rq, true, true);
schedstat_inc(rq->sched_goidle);
next->se.exec_start = rq_clock_task(rq);
}
struct task_struct *pick_task_idle(struct rq *rq)
{
+ scx_update_idle(rq, true, false);
return rq->idle;
}
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 5891e715f00d..81bc8b329ef1 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -9,15 +9,9 @@
*/
enum hk_flags {
- HK_FLAG_TIMER = BIT(HK_TYPE_TIMER),
- HK_FLAG_RCU = BIT(HK_TYPE_RCU),
- HK_FLAG_MISC = BIT(HK_TYPE_MISC),
- HK_FLAG_SCHED = BIT(HK_TYPE_SCHED),
- HK_FLAG_TICK = BIT(HK_TYPE_TICK),
HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN),
- HK_FLAG_WQ = BIT(HK_TYPE_WQ),
HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ),
- HK_FLAG_KTHREAD = BIT(HK_TYPE_KTHREAD),
+ HK_FLAG_KERNEL_NOISE = BIT(HK_TYPE_KERNEL_NOISE),
};
DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
@@ -97,7 +91,7 @@ void __init housekeeping_init(void)
static_branch_enable(&housekeeping_overridden);
- if (housekeeping.flags & HK_FLAG_TICK)
+ if (housekeeping.flags & HK_FLAG_KERNEL_NOISE)
sched_tick_offload_init();
for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
@@ -121,7 +115,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
unsigned int first_cpu;
int err = 0;
- if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) {
+ if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE)) {
if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) {
pr_warn("Housekeeping: nohz unsupported."
" Build with CONFIG_NO_HZ_FULL\n");
@@ -177,7 +171,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
housekeeping_setup_type(type, housekeeping_staging);
}
- if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK))
+ if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE))
tick_nohz_full_setup(non_housekeeping_mask);
housekeeping.flags |= flags;
@@ -195,8 +189,7 @@ static int __init housekeeping_nohz_full_setup(char *str)
{
unsigned long flags;
- flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU |
- HK_FLAG_MISC | HK_FLAG_KTHREAD;
+ flags = HK_FLAG_KERNEL_NOISE;
return housekeeping_setup(str, flags);
}
@@ -210,9 +203,12 @@ static int __init housekeeping_isolcpus_setup(char *str)
int len;
while (isalpha(*str)) {
+ /*
+ * isolcpus=nohz is equivalent to nohz_full.
+ */
if (!strncmp(str, "nohz,", 5)) {
str += 5;
- flags |= HK_FLAG_TICK;
+ flags |= HK_FLAG_KERNEL_NOISE;
continue;
}
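As an illustrative example (not part of the patch), the "nohz" keyword now maps to the consolidated flag, so booting with

	isolcpus=nohz,2-7

tags CPUs 2-7 with HK_FLAG_KERNEL_NOISE, which per the comment above is equivalent to nohz_full=2-7; before this change the keyword only set HK_FLAG_TICK.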
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index fc07382361a8..7a8534a2deff 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -275,7 +275,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
*
* group: [ see update_cfs_group() ]
* se_weight() = tg->weight * grq->load_avg / tg->load_avg
- * se_runnable() = grq->h_nr_running
+ * se_runnable() = grq->h_nr_runnable
*
* runnable_sum = se_runnable() * runnable = grq->runnable_sum
* runnable_avg = runnable_sum
@@ -321,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
{
if (___update_load_sum(now, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
- cfs_rq->h_nr_running,
+ cfs_rq->h_nr_runnable,
cfs_rq->curr != NULL)) {
___update_load_avg(&cfs_rq->avg, 1);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 84dad1511d1e..bb56805e3d47 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -998,7 +998,7 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
s64 delta;
u64 irq;
- if (static_branch_likely(&psi_disabled))
+ if (static_branch_likely(&psi_disabled) || !irqtime_enabled())
return;
if (!curr->pid)
@@ -1240,6 +1240,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
if (static_branch_likely(&psi_disabled))
return -EOPNOTSUPP;
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ if (!irqtime_enabled() && res == PSI_IRQ)
+ return -EOPNOTSUPP;
+#endif
+
/* Update averages before reporting them */
mutex_lock(&group->avgs_lock);
now = sched_clock();
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index bd66a46b06ac..4b8e33c615b1 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -26,7 +26,7 @@ static int sched_rt_handler(const struct ctl_table *table, int write, void *buff
size_t *lenp, loff_t *ppos);
static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos);
-static struct ctl_table sched_rt_sysctls[] = {
+static const struct ctl_table sched_rt_sysctls[] = {
{
.procname = "sched_rt_period_us",
.data = &sysctl_sched_rt_period,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 76f5f53a645f..38e0e323dda2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -362,7 +362,7 @@ extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
extern bool __checkparam_dl(const struct sched_attr *attr);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
-extern int dl_bw_check_overflow(int cpu);
+extern int dl_bw_deactivate(int cpu);
extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec);
/*
* SCHED_DEADLINE supports servers (nested scheduling) with the following
@@ -398,6 +398,11 @@ extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq
extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
u64 runtime, u64 period, bool init);
+static inline bool dl_server_active(struct sched_dl_entity *dl_se)
+{
+ return dl_se->dl_server_active;
+}
+
#ifdef CONFIG_CGROUP_SCHED
extern struct list_head task_groups;
@@ -645,10 +650,10 @@ struct balance_callback {
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
- unsigned int nr_running;
- unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
- unsigned int idle_nr_running; /* SCHED_IDLE */
- unsigned int idle_h_nr_running; /* SCHED_IDLE */
+ unsigned int nr_queued;
+ unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int h_nr_idle; /* SCHED_IDLE */
s64 avg_vruntime;
u64 avg_load;
@@ -754,6 +759,7 @@ enum scx_rq_flags {
SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */
SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */
SCX_RQ_BYPASSING = 1 << 4,
+ SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */
SCX_RQ_IN_WAKEUP = 1 << 16,
SCX_RQ_IN_BALANCE = 1 << 17,
@@ -766,9 +772,10 @@ struct scx_rq {
unsigned long ops_qseq;
u64 extra_enq_flags; /* see move_task_to_local_dsq() */
u32 nr_running;
- u32 flags;
u32 cpuperf_target; /* [0, SCHED_CAPACITY_SCALE] */
bool cpu_released;
+ u32 flags;
+ u64 clock; /* current per-rq clock -- see scx_bpf_now() */
cpumask_var_t cpus_to_kick;
cpumask_var_t cpus_to_kick_if_idle;
cpumask_var_t cpus_to_preempt;
@@ -899,7 +906,7 @@ struct dl_rq {
static inline void se_update_runnable(struct sched_entity *se)
{
if (!entity_is_task(se))
- se->runnable_weight = se->my_q->h_nr_running;
+ se->runnable_weight = se->my_q->h_nr_runnable;
}
static inline long se_runnable(struct sched_entity *se)
@@ -1717,6 +1724,38 @@ struct rq_flags {
extern struct balance_callback balance_push_callback;
+#ifdef CONFIG_SCHED_CLASS_EXT
+extern const struct sched_class ext_sched_class;
+
+DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); /* SCX BPF scheduler loaded */
+DECLARE_STATIC_KEY_FALSE(__scx_switched_all); /* all fair class tasks on SCX */
+
+#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled)
+#define scx_switched_all() static_branch_unlikely(&__scx_switched_all)
+
+static inline void scx_rq_clock_update(struct rq *rq, u64 clock)
+{
+ if (!scx_enabled())
+ return;
+ WRITE_ONCE(rq->scx.clock, clock);
+ smp_store_release(&rq->scx.flags, rq->scx.flags | SCX_RQ_CLK_VALID);
+}
+
+static inline void scx_rq_clock_invalidate(struct rq *rq)
+{
+ if (!scx_enabled())
+ return;
+ WRITE_ONCE(rq->scx.flags, rq->scx.flags & ~SCX_RQ_CLK_VALID);
+}
+
+#else /* !CONFIG_SCHED_CLASS_EXT */
+#define scx_enabled() false
+#define scx_switched_all() false
+
+static inline void scx_rq_clock_update(struct rq *rq, u64 clock) {}
+static inline void scx_rq_clock_invalidate(struct rq *rq) {}
+#endif /* !CONFIG_SCHED_CLASS_EXT */
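A compact sketch (not new code) of the ordering contract these two helpers establish with the scx_bpf_now() reader added in ext.c above:

/*
 *   writer (rq lock held)                    reader (scx_bpf_now())
 *   ---------------------                    ----------------------
 *   WRITE_ONCE(rq->scx.clock, clk);          flags = smp_load_acquire(&rq->scx.flags);
 *   smp_store_release(&rq->scx.flags,        if (flags & SCX_RQ_CLK_VALID)
 *             flags | SCX_RQ_CLK_VALID);             clk = READ_ONCE(rq->scx.clock);
 *
 * The release/acquire pair guarantees that a reader observing
 * SCX_RQ_CLK_VALID also observes the clock value published before it;
 * rq_unpin_lock() below clears the flag via scx_rq_clock_invalidate(),
 * after which readers fall back to sched_clock_cpu().
 */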
+
/*
* Lockdep annotation that avoids accidental unlocks; it's like a
* sticky/continuous lockdep_assert_held().
@@ -1746,7 +1785,7 @@ static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
if (rq->clock_update_flags > RQCF_ACT_SKIP)
rf->clock_update_flags = RQCF_UPDATED;
#endif
-
+ scx_rq_clock_invalidate(rq);
lockdep_unpin_lock(__rq_lockp(rq), rf->cookie);
}
@@ -2271,7 +2310,7 @@ static inline int task_on_cpu(struct rq *rq, struct task_struct *p)
static inline int task_on_rq_queued(struct task_struct *p)
{
- return p->on_rq == TASK_ON_RQ_QUEUED;
+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_QUEUED;
}
static inline int task_on_rq_migrating(struct task_struct *p)
@@ -2505,19 +2544,6 @@ extern const struct sched_class rt_sched_class;
extern const struct sched_class fair_sched_class;
extern const struct sched_class idle_sched_class;
-#ifdef CONFIG_SCHED_CLASS_EXT
-extern const struct sched_class ext_sched_class;
-
-DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); /* SCX BPF scheduler loaded */
-DECLARE_STATIC_KEY_FALSE(__scx_switched_all); /* all fair class tasks on SCX */
-
-#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled)
-#define scx_switched_all() static_branch_unlikely(&__scx_switched_all)
-#else /* !CONFIG_SCHED_CLASS_EXT */
-#define scx_enabled() false
-#define scx_switched_all() false
-#endif /* !CONFIG_SCHED_CLASS_EXT */
-
/*
* Iterate only active classes. SCX can take over all fair tasks or be
* completely disabled. If the former, skip fair. If the latter, skip SCX.
@@ -2565,7 +2591,7 @@ static inline bool sched_rt_runnable(struct rq *rq)
static inline bool sched_fair_runnable(struct rq *rq)
{
- return rq->cfs.nr_running > 0;
+ return rq->cfs.nr_queued > 0;
}
extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
@@ -3233,6 +3259,12 @@ struct irqtime {
};
DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
+DECLARE_STATIC_KEY_FALSE(sched_clock_irqtime);
+
+static inline int irqtime_enabled(void)
+{
+ return static_branch_likely(&sched_clock_irqtime);
+}
/*
* Returns the irqtime minus the softirq time computed by ksoftirqd.
@@ -3253,6 +3285,13 @@ static inline u64 irq_time_read(int cpu)
return total;
}
+#else
+
+static inline int irqtime_enabled(void)
+{
+ return 0;
+}
+
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
#ifdef CONFIG_CPU_FREQ
@@ -3500,6 +3539,8 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
#endif /* !CONFIG_HAVE_SCHED_AVG_IRQ */
+extern void __setparam_fair(struct task_struct *p, const struct sched_attr *attr);
+
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus)))
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index eb0cdcd4d921..4346fd81c31f 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -103,7 +103,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
* Bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort)
*/
-#define SCHEDSTAT_VERSION 16
+#define SCHEDSTAT_VERSION 17
static int show_schedstat(struct seq_file *seq, void *v)
{
@@ -138,14 +138,17 @@ static int show_schedstat(struct seq_file *seq, void *v)
for_each_domain(cpu, sd) {
enum cpu_idle_type itype;
- seq_printf(seq, "domain%d %*pb", dcount++,
+ seq_printf(seq, "domain%d %s %*pb", dcount++, sd->name,
cpumask_pr_args(sched_domain_span(sd)));
for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) {
- seq_printf(seq, " %u %u %u %u %u %u %u %u",
+ seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u",
sd->lb_count[itype],
sd->lb_balanced[itype],
sd->lb_failed[itype],
- sd->lb_imbalance[itype],
+ sd->lb_imbalance_load[itype],
+ sd->lb_imbalance_util[itype],
+ sd->lb_imbalance_task[itype],
+ sd->lb_imbalance_misfit[itype],
sd->lb_gained[itype],
sd->lb_hot_gained[itype],
sd->lb_nobusyq[itype],
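Version 17 adds the domain name after "domainN" and splits lb_imbalance into lb_imbalance_{load,util,task,misfit}, so the load-balance block printed above grows from 8 to 11 fields per idle type. Tools should therefore check the version header before parsing; a minimal user-space sketch, assuming only the existing "version NN" first line of /proc/schedstat:

#include <stdio.h>

/* Minimal sketch: refuse to parse /proc/schedstat unless it reports the
 * expected format version (17 adds the domain name and the split
 * lb_imbalance_{load,util,task,misfit} counters shown above). */
int main(void)
{
	FILE *f = fopen("/proc/schedstat", "r");
	unsigned int version = 0;

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "version %u", &version) != 1 || version != 17) {
		fprintf(stderr, "unsupported schedstat version %u\n", version);
		fclose(f);
		return 1;
	}
	/* ... parse "cpuN ..." and "domainN <name> <mask> ..." lines here ... */
	fclose(f);
	return 0;
}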
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 8ee0add5a48a..19cdbe96f93d 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -138,6 +138,10 @@ static inline void psi_enqueue(struct task_struct *p, int flags)
if (flags & ENQUEUE_RESTORE)
return;
+ /* psi_sched_switch() will handle the flags */
+ if (task_on_cpu(task_rq(p), p))
+ return;
+
if (p->se.sched_delayed) {
/* CPU migration of "sleeping" task */
SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED));
@@ -244,7 +248,10 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
delta = rq_clock(rq) - t->sched_info.last_queued;
t->sched_info.last_queued = 0;
t->sched_info.run_delay += delta;
-
+ if (delta > t->sched_info.max_run_delay)
+ t->sched_info.max_run_delay = delta;
+ if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
+ t->sched_info.min_run_delay = delta;
rq_sched_info_dequeue(rq, delta);
}
@@ -266,6 +273,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
t->sched_info.run_delay += delta;
t->sched_info.last_arrival = now;
t->sched_info.pcount++;
+ if (delta > t->sched_info.max_run_delay)
+ t->sched_info.max_run_delay = delta;
+ if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
+ t->sched_info.min_run_delay = delta;
rq_sched_info_arrive(rq, delta);
}
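The added bookkeeping keeps max_run_delay monotonically non-decreasing and treats a zero min_run_delay as "not yet recorded", which is why zero deltas are skipped. A self-contained sketch of the same update rule (plain C, not kernel code):

#include <stdio.h>

/* Mirrors the min/max run_delay bookkeeping added above: max only grows,
 * min starts at 0 meaning "unset" and ignores zero deltas. */
struct delay_stats {
	unsigned long long max_run_delay;
	unsigned long long min_run_delay;
};

static void account_run_delay(struct delay_stats *s, unsigned long long delta)
{
	if (delta > s->max_run_delay)
		s->max_run_delay = delta;
	if (delta && (!s->min_run_delay || delta < s->min_run_delay))
		s->min_run_delay = delta;
}

int main(void)
{
	struct delay_stats s = { 0, 0 };

	account_run_delay(&s, 300);
	account_run_delay(&s, 0);	/* ignored for min */
	account_run_delay(&s, 120);
	printf("min=%llu max=%llu\n", s.min_run_delay, s.max_run_delay); /* min=120 max=300 */
	return 0;
}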
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index ff0e5ab4e37c..456d339be98f 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -300,20 +300,10 @@ static void __setscheduler_params(struct task_struct *p,
p->policy = policy;
- if (dl_policy(policy)) {
+ if (dl_policy(policy))
__setparam_dl(p, attr);
- } else if (fair_policy(policy)) {
- p->static_prio = NICE_TO_PRIO(attr->sched_nice);
- if (attr->sched_runtime) {
- p->se.custom_slice = 1;
- p->se.slice = clamp_t(u64, attr->sched_runtime,
- NSEC_PER_MSEC/10, /* HZ=1000 * 10 */
- NSEC_PER_MSEC*100); /* HZ=100 / 10 */
- } else {
- p->se.custom_slice = 0;
- p->se.slice = sysctl_sched_base_slice;
- }
- }
+ else if (fair_policy(policy))
+ __setparam_fair(p, attr);
/* rt-policy tasks do not have a timerslack */
if (rt_or_dl_task_policy(p)) {
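The fair-policy branch removed above is factored out into __setparam_fair(), declared in sched.h earlier in this patch. A sketch of the helper, assuming it carries the removed logic over unchanged:

/* Kernel-internal sketch only: assumes __setparam_fair() simply keeps the
 * logic removed from __setscheduler_params() above. */
void __setparam_fair(struct task_struct *p, const struct sched_attr *attr)
{
	p->static_prio = NICE_TO_PRIO(attr->sched_nice);
	if (attr->sched_runtime) {
		p->se.custom_slice = 1;
		p->se.slice = clamp_t(u64, attr->sched_runtime,
				      NSEC_PER_MSEC/10,	  /* floor: 0.1 ms */
				      NSEC_PER_MSEC*100); /* ceiling: 100 ms */
	} else {
		p->se.custom_slice = 0;
		p->se.slice = sysctl_sched_base_slice;
	}
}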
@@ -1140,6 +1130,13 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
return 0;
/*
+ * The special/sugov task isn't part of regular bandwidth/admission
+ * control so let userspace change affinities.
+ */
+ if (dl_entity_is_special(&p->dl))
+ return 0;
+
+ /*
* Since bandwidth control happens on root_domain basis,
* if admission test is enabled, we only admit -deadline
* tasks allowed to run on all the CPUs in the task's
@@ -1433,7 +1430,7 @@ int __sched yield_to(struct task_struct *p, bool preempt)
struct rq *rq, *p_rq;
int yielded = 0;
- scoped_guard (irqsave) {
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
rq = this_rq();
again:
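Guarding the retry loop with p->pi_lock rather than a bare irqsave stabilizes the task's runqueue association, and scoped_guard() drops the lock and restores IRQ state on every exit from the block. A minimal usage sketch of that guard with illustrative names; the guard definitions come from <linux/spinlock.h> and <linux/cleanup.h>:

#include <linux/spinlock.h>
#include <linux/cleanup.h>
#include <linux/errno.h>

static DEFINE_RAW_SPINLOCK(example_lock);
static int example_counter;

/* Sketch of the scoped_guard (raw_spinlock_irqsave, ...) pattern used in
 * yield_to() above: the lock is released and IRQ state restored on every
 * exit path of the block, including the early return. */
static int example_bump(void)
{
	scoped_guard (raw_spinlock_irqsave, &example_lock) {
		if (example_counter < 0)
			return -EINVAL;	/* guard still unlocks here */
		example_counter++;
	}
	return 0;
}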
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 9748a4c8d668..c49aea8c1025 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -312,7 +312,7 @@ static int sched_energy_aware_handler(const struct ctl_table *table, int write,
return ret;
}
-static struct ctl_table sched_energy_aware_sysctls[] = {
+static const struct ctl_table sched_energy_aware_sysctls[] = {
{
.procname = "sched_energy_aware",
.data = &sysctl_sched_energy_aware,
@@ -1635,9 +1635,7 @@ sd_init(struct sched_domain_topology_level *tl,
.max_newidle_lb_cost = 0,
.last_decay_max_lb_cost = jiffies,
.child = child,
-#ifdef CONFIG_SCHED_DEBUG
.name = tl->name,
-#endif
};
sd_span = sched_domain_span(sd);
@@ -2338,10 +2336,8 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
if (!cpumask_subset(sched_domain_span(child),
sched_domain_span(sd))) {
pr_err("BUG: arch topology borken\n");
-#ifdef CONFIG_SCHED_DEBUG
pr_err(" the %s domain not a subset of the %s domain\n",
child->name, sd->name);
-#endif
/* Fixup, ensure @sd has at least @child CPUs. */
cpumask_or(sched_domain_span(sd),
sched_domain_span(sd),
@@ -2721,9 +2717,11 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
/*
* This domain won't be destroyed and as such
- * its dl_bw->total_bw needs to be cleared. It
- * will be recomputed in function
- * update_tasks_root_domain().
+ * its dl_bw->total_bw needs to be cleared.
+ * The tasks' contribution will then be recomputed
+ * in dl_update_tasks_root_domain() and the
+ * dl_servers' contribution in
+ * dl_restore_server_root_domain().
*/
rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
dl_clear_root_domain(rd);
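The reworked comment says total_bw is rebuilt from two sources once cleared: the admitted -deadline tasks and the per-CPU dl_server reservations. A schematic sketch of that clear-then-reaccumulate flow; the example_* names are hypothetical, not the kernel's helpers:

/* Schematic only: illustrates the flow the comment above describes. */
struct example_dl_bw { unsigned long long total_bw; };

static void example_rebuild_total_bw(struct example_dl_bw *bw,
				     const unsigned long long *task_bw, int nr_tasks,
				     const unsigned long long *server_bw, int nr_cpus)
{
	int i;

	bw->total_bw = 0;			/* dl_clear_root_domain()           */
	for (i = 0; i < nr_tasks; i++)		/* dl_update_tasks_root_domain()    */
		bw->total_bw += task_bw[i];
	for (i = 0; i < nr_cpus; i++)		/* dl_restore_server_root_domain()  */
		bw->total_bw += server_bw[i];
}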