diff options
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 498 |
1 files changed, 292 insertions, 206 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b173a059315c..3a89f949e307 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3542,7 +3542,7 @@ out: } } -void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +void init_numa_balancing(u64 clone_flags, struct task_struct *p) { int mm_users = 0; struct mm_struct *mm = p->mm; @@ -3957,9 +3957,6 @@ static void update_cfs_group(struct sched_entity *se) if (!gcfs_rq || !gcfs_rq->load.weight) return; - if (throttled_hierarchy(gcfs_rq)) - return; - shares = calc_group_shares(gcfs_rq); if (unlikely(se->load.weight != shares)) reweight_entity(cfs_rq_of(se), se, shares); @@ -5291,18 +5288,16 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (cfs_rq->nr_queued == 1) { check_enqueue_throttle(cfs_rq); - if (!throttled_hierarchy(cfs_rq)) { - list_add_leaf_cfs_rq(cfs_rq); - } else { + list_add_leaf_cfs_rq(cfs_rq); #ifdef CONFIG_CFS_BANDWIDTH + if (cfs_rq->pelt_clock_throttled) { struct rq *rq = rq_of(cfs_rq); - if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) - cfs_rq->throttled_clock = rq_clock(rq); - if (!cfs_rq->throttled_clock_self) - cfs_rq->throttled_clock_self = rq_clock(rq); -#endif + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - + cfs_rq->throttled_clock_pelt; + cfs_rq->pelt_clock_throttled = 0; } +#endif } } @@ -5341,8 +5336,6 @@ static void set_delayed(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_runnable--; - if (cfs_rq_throttled(cfs_rq)) - break; } } @@ -5363,8 +5356,6 @@ static void clear_delayed(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_runnable++; - if (cfs_rq_throttled(cfs_rq)) - break; } } @@ -5392,7 +5383,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * DELAY_DEQUEUE relies on spurious wakeups, special task * states must not suffer spurious wakeups, excempt them. */ - if (flags & DEQUEUE_SPECIAL) + if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE)) delay = false; WARN_ON_ONCE(delay && se->sched_delayed); @@ -5450,8 +5441,18 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (flags & DEQUEUE_DELAYED) finish_delayed_dequeue_entity(se); - if (cfs_rq->nr_queued == 0) + if (cfs_rq->nr_queued == 0) { update_idle_cfs_rq_clock_pelt(cfs_rq); +#ifdef CONFIG_CFS_BANDWIDTH + if (throttled_hierarchy(cfs_rq)) { + struct rq *rq = rq_of(cfs_rq); + + list_del_leaf_cfs_rq(cfs_rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); + cfs_rq->pelt_clock_throttled = 1; + } +#endif + } return true; } @@ -5725,74 +5726,253 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) return cfs_bandwidth_used() && cfs_rq->throttled; } +static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq) +{ + return cfs_bandwidth_used() && cfs_rq->pelt_clock_throttled; +} + /* check whether cfs_rq, or any parent, is throttled */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { return cfs_bandwidth_used() && cfs_rq->throttle_count; } +static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu) +{ + return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]); +} + +static inline bool task_is_throttled(struct task_struct *p) +{ + return cfs_bandwidth_used() && p->throttled; +} + +static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags); +static void throttle_cfs_rq_work(struct callback_head *work) +{ + struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work); + struct sched_entity *se; + struct cfs_rq *cfs_rq; + struct rq *rq; + + WARN_ON_ONCE(p != current); + p->sched_throttle_work.next = &p->sched_throttle_work; + + /* + * If task is exiting, then there won't be a return to userspace, so we + * don't have to bother with any of this. + */ + if ((p->flags & PF_EXITING)) + return; + + scoped_guard(task_rq_lock, p) { + se = &p->se; + cfs_rq = cfs_rq_of(se); + + /* Raced, forget */ + if (p->sched_class != &fair_sched_class) + return; + + /* + * If not in limbo, then either replenish has happened or this + * task got migrated out of the throttled cfs_rq, move along. + */ + if (!cfs_rq->throttle_count) + return; + rq = scope.rq; + update_rq_clock(rq); + WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node)); + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE); + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); + /* + * Must not set throttled before dequeue or dequeue will + * mistakenly regard this task as an already throttled one. + */ + p->throttled = true; + resched_curr(rq); + } +} + +void init_cfs_throttle_work(struct task_struct *p) +{ + init_task_work(&p->sched_throttle_work, throttle_cfs_rq_work); + /* Protect against double add, see throttle_cfs_rq() and throttle_cfs_rq_work() */ + p->sched_throttle_work.next = &p->sched_throttle_work; + INIT_LIST_HEAD(&p->throttle_node); +} + /* - * Ensure that neither of the group entities corresponding to src_cpu or - * dest_cpu are members of a throttled hierarchy when performing group - * load-balance operations. + * Task is throttled and someone wants to dequeue it again: + * it could be sched/core when core needs to do things like + * task affinity change, task group change, task sched class + * change etc. and in these cases, DEQUEUE_SLEEP is not set; + * or the task is blocked after throttled due to freezer etc. + * and in these cases, DEQUEUE_SLEEP is set. */ -static inline int throttled_lb_pair(struct task_group *tg, - int src_cpu, int dest_cpu) +static void detach_task_cfs_rq(struct task_struct *p); +static void dequeue_throttled_task(struct task_struct *p, int flags) +{ + WARN_ON_ONCE(p->se.on_rq); + list_del_init(&p->throttle_node); + + /* task blocked after throttled */ + if (flags & DEQUEUE_SLEEP) { + p->throttled = false; + return; + } + + /* + * task is migrating off its old cfs_rq, detach + * the task's load from its old cfs_rq. + */ + if (task_on_rq_migrating(p)) + detach_task_cfs_rq(p); +} + +static bool enqueue_throttled_task(struct task_struct *p) { - struct cfs_rq *src_cfs_rq, *dest_cfs_rq; + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); - src_cfs_rq = tg->cfs_rq[src_cpu]; - dest_cfs_rq = tg->cfs_rq[dest_cpu]; + /* @p should have gone through dequeue_throttled_task() first */ + WARN_ON_ONCE(!list_empty(&p->throttle_node)); + + /* + * If the throttled task @p is enqueued to a throttled cfs_rq, + * take the fast path by directly putting the task on the + * target cfs_rq's limbo list. + * + * Do not do that when @p is current because the following race can + * cause @p's group_node to be incorectly re-insterted in its rq's + * cfs_tasks list, despite being throttled: + * + * cpuX cpuY + * p ret2user + * throttle_cfs_rq_work() sched_move_task(p) + * LOCK task_rq_lock + * dequeue_task_fair(p) + * UNLOCK task_rq_lock + * LOCK task_rq_lock + * task_current_donor(p) == true + * task_on_rq_queued(p) == true + * dequeue_task(p) + * put_prev_task(p) + * sched_change_group() + * enqueue_task(p) -> p's new cfs_rq + * is throttled, go + * fast path and skip + * actual enqueue + * set_next_task(p) + * list_move(&se->group_node, &rq->cfs_tasks); // bug + * schedule() + * + * In the above race case, @p current cfs_rq is in the same rq as + * its previous cfs_rq because sched_move_task() only moves a task + * to a different group from the same rq, so we can use its current + * cfs_rq to derive rq and test if the task is current. + */ + if (throttled_hierarchy(cfs_rq) && + !task_current_donor(rq_of(cfs_rq), p)) { + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); + return true; + } - return throttled_hierarchy(src_cfs_rq) || - throttled_hierarchy(dest_cfs_rq); + /* we can't take the fast path, do an actual enqueue*/ + p->throttled = false; + return false; } +static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags); static int tg_unthrottle_up(struct task_group *tg, void *data) { struct rq *rq = data; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct task_struct *p, *tmp; + + if (--cfs_rq->throttle_count) + return 0; - cfs_rq->throttle_count--; - if (!cfs_rq->throttle_count) { + if (cfs_rq->pelt_clock_throttled) { cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - cfs_rq->throttled_clock_pelt; + cfs_rq->pelt_clock_throttled = 0; + } - /* Add cfs_rq with load or one or more already running entities to the list */ - if (!cfs_rq_is_decayed(cfs_rq)) - list_add_leaf_cfs_rq(cfs_rq); + if (cfs_rq->throttled_clock_self) { + u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; - if (cfs_rq->throttled_clock_self) { - u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; + cfs_rq->throttled_clock_self = 0; - cfs_rq->throttled_clock_self = 0; + if (WARN_ON_ONCE((s64)delta < 0)) + delta = 0; - if (WARN_ON_ONCE((s64)delta < 0)) - delta = 0; + cfs_rq->throttled_clock_self_time += delta; + } - cfs_rq->throttled_clock_self_time += delta; - } + /* Re-enqueue the tasks that have been throttled at this level. */ + list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) { + list_del_init(&p->throttle_node); + p->throttled = false; + enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP); } + /* Add cfs_rq with load or one or more already running entities to the list */ + if (!cfs_rq_is_decayed(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); + return 0; } +static inline bool task_has_throttle_work(struct task_struct *p) +{ + return p->sched_throttle_work.next != &p->sched_throttle_work; +} + +static inline void task_throttle_setup_work(struct task_struct *p) +{ + if (task_has_throttle_work(p)) + return; + + /* + * Kthreads and exiting tasks don't return to userspace, so adding the + * work is pointless + */ + if ((p->flags & (PF_EXITING | PF_KTHREAD))) + return; + + task_work_add(p, &p->sched_throttle_work, TWA_RESUME); +} + +static void record_throttle_clock(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + + if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) + cfs_rq->throttled_clock = rq_clock(rq); + + if (!cfs_rq->throttled_clock_self) + cfs_rq->throttled_clock_self = rq_clock(rq); +} + static int tg_throttle_down(struct task_group *tg, void *data) { struct rq *rq = data; struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - /* group is entering throttled state, stop time */ - if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); - list_del_leaf_cfs_rq(cfs_rq); + if (cfs_rq->throttle_count++) + return 0; - WARN_ON_ONCE(cfs_rq->throttled_clock_self); - if (cfs_rq->nr_queued) - cfs_rq->throttled_clock_self = rq_clock(rq); + /* + * For cfs_rqs that still have entities enqueued, PELT clock + * stop happens at dequeue time when all entities are dequeued. + */ + if (!cfs_rq->nr_queued) { + list_del_leaf_cfs_rq(cfs_rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); + cfs_rq->pelt_clock_throttled = 1; } - cfs_rq->throttle_count++; + WARN_ON_ONCE(cfs_rq->throttled_clock_self); + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list)); return 0; } @@ -5800,8 +5980,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct sched_entity *se; - long queued_delta, runnable_delta, idle_delta, dequeue = 1; + int dequeue = 1; raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -5824,76 +6003,17 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) if (!dequeue) return false; /* Throttle no longer required. */ - se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - /* freeze hierarchy runnable averages while throttled */ rcu_read_lock(); walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); - queued_delta = cfs_rq->h_nr_queued; - runnable_delta = cfs_rq->h_nr_runnable; - idle_delta = cfs_rq->h_nr_idle; - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - int flags; - - /* throttled entity or throttle-on-deactivate */ - if (!se->on_rq) - goto done; - - /* - * Abuse SPECIAL to avoid delayed dequeue in this instance. - * This avoids teaching dequeue_entities() about throttled - * entities and keeps things relatively simple. - */ - flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL; - if (se->sched_delayed) - flags |= DEQUEUE_DELAYED; - dequeue_entity(qcfs_rq, se, flags); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued -= queued_delta; - qcfs_rq->h_nr_runnable -= runnable_delta; - qcfs_rq->h_nr_idle -= idle_delta; - - if (qcfs_rq->load.weight) { - /* Avoid re-evaluating load for this entity: */ - se = parent_entity(se); - break; - } - } - - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - /* throttled entity or throttle-on-deactivate */ - if (!se->on_rq) - goto done; - - update_load_avg(qcfs_rq, se, 0); - se_update_runnable(se); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued -= queued_delta; - qcfs_rq->h_nr_runnable -= runnable_delta; - qcfs_rq->h_nr_idle -= idle_delta; - } - - /* At this point se is NULL and we are at root level*/ - sub_nr_running(rq, queued_delta); -done: /* * Note: distribution will already see us throttled via the * throttled-list. rq->lock protects completion. */ cfs_rq->throttled = 1; WARN_ON_ONCE(cfs_rq->throttled_clock); - if (cfs_rq->nr_queued) - cfs_rq->throttled_clock = rq_clock(rq); return true; } @@ -5901,9 +6021,20 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct sched_entity *se; - long queued_delta, runnable_delta, idle_delta; - long rq_h_nr_queued = rq->cfs.h_nr_queued; + struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; + + /* + * It's possible we are called with !runtime_remaining due to things + * like user changed quota setting(see tg_set_cfs_bandwidth()) or async + * unthrottled us with a positive runtime_remaining but other still + * running entities consumed those runtime before we reached here. + * + * Anyway, we can't unthrottle this cfs_rq without any runtime remaining + * because any enqueue in tg_unthrottle_up() will immediately trigger a + * throttle, which is not supposed to happen on unthrottle path. + */ + if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0) + return; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -5933,62 +6064,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) if (list_add_leaf_cfs_rq(cfs_rq_of(se))) break; } - goto unthrottle_throttle; - } - - queued_delta = cfs_rq->h_nr_queued; - runnable_delta = cfs_rq->h_nr_runnable; - idle_delta = cfs_rq->h_nr_idle; - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - - /* Handle any unfinished DELAY_DEQUEUE business first. */ - if (se->sched_delayed) { - int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED; - - dequeue_entity(qcfs_rq, se, flags); - } else if (se->on_rq) - break; - enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued += queued_delta; - qcfs_rq->h_nr_runnable += runnable_delta; - qcfs_rq->h_nr_idle += idle_delta; - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(qcfs_rq)) - goto unthrottle_throttle; - } - - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - - update_load_avg(qcfs_rq, se, UPDATE_TG); - se_update_runnable(se); - - if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_delta = cfs_rq->h_nr_queued; - - qcfs_rq->h_nr_queued += queued_delta; - qcfs_rq->h_nr_runnable += runnable_delta; - qcfs_rq->h_nr_idle += idle_delta; - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(qcfs_rq)) - goto unthrottle_throttle; } - /* Start the fair server if un-throttling resulted in new runnable tasks */ - if (!rq_h_nr_queued && rq->cfs.h_nr_queued) - dl_server_start(&rq->fair_server); - - /* At this point se is NULL and we are at root level*/ - add_nr_running(rq, queued_delta); - -unthrottle_throttle: assert_list_leaf_cfs_rq(rq); /* Determine whether we need to wake up potentially idle CPU: */ @@ -6472,6 +6549,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); + INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list); } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -6639,19 +6717,28 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static void task_throttle_setup_work(struct task_struct *p) {} +static bool task_is_throttled(struct task_struct *p) { return false; } +static void dequeue_throttled_task(struct task_struct *p, int flags) {} +static bool enqueue_throttled_task(struct task_struct *p) { return false; } +static void record_throttle_clock(struct cfs_rq *cfs_rq) {} static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { return 0; } +static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq) +{ + return false; +} + static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { return 0; } -static inline int throttled_lb_pair(struct task_group *tg, - int src_cpu, int dest_cpu) +static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu) { return 0; } @@ -6831,6 +6918,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) int rq_h_nr_queued = rq->cfs.h_nr_queued; u64 slice = 0; + if (task_is_throttled(p) && enqueue_throttled_task(p)) + return; + /* * The code below (indirectly) updates schedutil which looks at * the cfs_rq utilization to select a frequency. @@ -6883,10 +6973,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = 1; - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - goto enqueue_throttle; - flags = ENQUEUE_WAKEUP; } @@ -6908,10 +6994,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = 1; - - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - goto enqueue_throttle; } if (!rq_h_nr_queued && rq->cfs.h_nr_queued) { @@ -6941,7 +7023,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!task_new) check_update_overutilized_status(rq); -enqueue_throttle: assert_list_leaf_cfs_rq(rq); hrtick_update(rq); @@ -6963,6 +7044,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) bool was_sched_idle = sched_idle_rq(rq); bool task_sleep = flags & DEQUEUE_SLEEP; bool task_delayed = flags & DEQUEUE_DELAYED; + bool task_throttled = flags & DEQUEUE_THROTTLE; struct task_struct *p = NULL; int h_nr_idle = 0; int h_nr_queued = 0; @@ -6996,9 +7078,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = h_nr_queued; - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - return 0; + if (throttled_hierarchy(cfs_rq) && task_throttled) + record_throttle_clock(cfs_rq); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -7010,7 +7091,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) * Bias pick_next to pick a task from this cfs_rq, as * p is sleeping when it is within its sched_slice. */ - if (task_sleep && se && !throttled_hierarchy(cfs_rq)) + if (task_sleep && se) set_next_buddy(se); break; } @@ -7037,9 +7118,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (cfs_rq_is_idle(cfs_rq)) h_nr_idle = h_nr_queued; - /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) - return 0; + if (throttled_hierarchy(cfs_rq) && task_throttled) + record_throttle_clock(cfs_rq); } sub_nr_running(rq, h_nr_queued); @@ -7073,6 +7153,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) */ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { + if (task_is_throttled(p)) { + dequeue_throttled_task(p, flags); + return true; + } + if (!p->se.sched_delayed) util_est_dequeue(&rq->cfs, p); @@ -8660,7 +8745,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. */ - if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) + if (task_is_throttled(p)) return; if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) { @@ -8741,19 +8826,22 @@ static struct task_struct *pick_task_fair(struct rq *rq) { struct sched_entity *se; struct cfs_rq *cfs_rq; + struct task_struct *p; + bool throttled; again: cfs_rq = &rq->cfs; if (!cfs_rq->nr_queued) return NULL; + throttled = false; + do { /* Might not have done put_prev_entity() */ if (cfs_rq->curr && cfs_rq->curr->on_rq) update_curr(cfs_rq); - if (unlikely(check_cfs_rq_runtime(cfs_rq))) - goto again; + throttled |= check_cfs_rq_runtime(cfs_rq); se = pick_next_entity(rq, cfs_rq); if (!se) @@ -8761,7 +8849,10 @@ again: cfs_rq = group_cfs_rq(se); } while (cfs_rq); - return task_of(se); + p = task_of(se); + if (unlikely(throttled)) + task_throttle_setup_work(p); + return p; } static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); @@ -8859,11 +8950,6 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_stru return pick_next_task_fair(rq, prev, NULL); } -static bool fair_server_has_tasks(struct sched_dl_entity *dl_se) -{ - return !!dl_se->rq->cfs.nr_queued; -} - static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) { return pick_task_fair(dl_se->rq); @@ -8875,7 +8961,7 @@ void fair_server_init(struct rq *rq) init_dl_entity(dl_se); - dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task); + dl_server_init(dl_se, rq, fair_server_pick_task); } /* @@ -8928,8 +9014,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; - /* throttled hierarchies are not runnable */ - if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) + /* !se->on_rq also covers throttled task */ + if (!se->on_rq) return false; /* Tell the scheduler that we'd really like se to run next. */ @@ -9288,7 +9374,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * We do not migrate tasks that are: * 1) delayed dequeued unless we migrate load, or - * 2) throttled_lb_pair, or + * 2) target cfs_rq is in throttled hierarchy, or * 3) cannot be migrated to this CPU due to cpus_ptr, or * 4) running (obviously), or * 5) are cache-hot on their current CPU, or @@ -9297,7 +9383,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) return 0; - if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) + if (lb_throttled_hierarchy(p, env->dst_cpu)) return 0; /* @@ -13081,10 +13167,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq_throttled(cfs_rq)) - return; - - if (!throttled_hierarchy(cfs_rq)) + /* + * If a task gets attached to this cfs_rq and before being queued, + * it gets migrated to another CPU due to reasons like affinity + * change, make sure this cfs_rq stays on leaf cfs_rq list to have + * that removed load decayed or it can cause faireness problem. + */ + if (!cfs_rq_pelt_clock_throttled(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); /* Start to propagate at parent */ @@ -13095,10 +13184,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) update_load_avg(cfs_rq, se, UPDATE_TG); - if (cfs_rq_throttled(cfs_rq)) - break; - - if (!throttled_hierarchy(cfs_rq)) + if (!cfs_rq_pelt_clock_throttled(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); } } |