diff options
Diffstat (limited to 'kernel/sched/fair.c')
| -rw-r--r-- | kernel/sched/fair.c | 709 | 
1 files changed, 165 insertions, 544 deletions
| diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1866e64792a7..b39fb596f6c1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -255,9 +255,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)  	return cfs_rq->rq;  } -/* An entity is a task if it doesn't "own" a runqueue */ -#define entity_is_task(se)	(!se->my_q) -  static inline struct task_struct *task_of(struct sched_entity *se)  {  	SCHED_WARN_ON(!entity_is_task(se)); @@ -419,7 +416,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)  	return container_of(cfs_rq, struct rq, cfs);  } -#define entity_is_task(se)	1  #define for_each_sched_entity(se) \  		for (; se; se = NULL) @@ -692,7 +688,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)  }  #ifdef CONFIG_SMP - +#include "pelt.h"  #include "sched-pelt.h"  static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); @@ -735,11 +731,12 @@ static void attach_entity_cfs_rq(struct sched_entity *se);   * To solve this problem, we also cap the util_avg of successive tasks to   * only 1/2 of the left utilization budget:   * - *   util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n + *   util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n   * - * where n denotes the nth task. + * where n denotes the nth task and cpu_scale the CPU capacity.   * - * For example, a simplest series from the beginning would be like: + * For example, for a CPU with 1024 of capacity, a simplest series from + * the beginning would be like:   *   *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...   * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... @@ -751,7 +748,8 @@ void post_init_entity_util_avg(struct sched_entity *se)  {  	struct cfs_rq *cfs_rq = cfs_rq_of(se);  	struct sched_avg *sa = &se->avg; -	long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; +	long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); +	long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;  	if (cap > 0) {  		if (cfs_rq->avg.util_avg != 0) { @@ -1314,7 +1312,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,  		 * of each group. Skip other nodes.  		 */  		if (sched_numa_topology_type == NUMA_BACKPLANE && -					dist > maxdist) +					dist >= maxdist)  			continue;  		/* Add up the faults from nearby nodes. */ @@ -1452,15 +1450,12 @@ static unsigned long capacity_of(int cpu);  /* Cached statistics for all CPUs within a node */  struct numa_stats { -	unsigned long nr_running;  	unsigned long load;  	/* Total compute capacity of CPUs on a node */  	unsigned long compute_capacity; -	/* Approximate capacity in terms of runnable tasks on a node */ -	unsigned long task_capacity; -	int has_free_capacity; +	unsigned int nr_running;  };  /* @@ -1487,8 +1482,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)  	 * the @ns structure is NULL'ed and task_numa_compare() will  	 * not find this node attractive.  	 * -	 * We'll either bail at !has_free_capacity, or we'll detect a huge -	 * imbalance and bail there. +	 * We'll detect a huge imbalance and bail there.  	 */  	if (!cpus)  		return; @@ -1497,9 +1491,8 @@ static void update_numa_stats(struct numa_stats *ns, int nid)  	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);  	capacity = cpus / smt; /* cores */ -	ns->task_capacity = min_t(unsigned, capacity, +	capacity = min_t(unsigned, capacity,  		DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); -	ns->has_free_capacity = (ns->nr_running < ns->task_capacity);  }  struct task_numa_env { @@ -1548,28 +1541,12 @@ static bool load_too_imbalanced(long src_load, long dst_load,  	src_capacity = env->src_stats.compute_capacity;  	dst_capacity = env->dst_stats.compute_capacity; -	/* We care about the slope of the imbalance, not the direction. */ -	if (dst_load < src_load) -		swap(dst_load, src_load); +	imb = abs(dst_load * src_capacity - src_load * dst_capacity); -	/* Is the difference below the threshold? */ -	imb = dst_load * src_capacity * 100 - -	      src_load * dst_capacity * env->imbalance_pct; -	if (imb <= 0) -		return false; - -	/* -	 * The imbalance is above the allowed threshold. -	 * Compare it with the old imbalance. -	 */  	orig_src_load = env->src_stats.load;  	orig_dst_load = env->dst_stats.load; -	if (orig_dst_load < orig_src_load) -		swap(orig_dst_load, orig_src_load); - -	old_imb = orig_dst_load * src_capacity * 100 - -		  orig_src_load * dst_capacity * env->imbalance_pct; +	old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);  	/* Would this change make things worse? */  	return (imb > old_imb); @@ -1582,9 +1559,8 @@ static bool load_too_imbalanced(long src_load, long dst_load,   * be exchanged with the source task   */  static void task_numa_compare(struct task_numa_env *env, -			      long taskimp, long groupimp) +			      long taskimp, long groupimp, bool maymove)  { -	struct rq *src_rq = cpu_rq(env->src_cpu);  	struct rq *dst_rq = cpu_rq(env->dst_cpu);  	struct task_struct *cur;  	long src_load, dst_load; @@ -1605,97 +1581,73 @@ static void task_numa_compare(struct task_numa_env *env,  	if (cur == env->p)  		goto unlock; +	if (!cur) { +		if (maymove || imp > env->best_imp) +			goto assign; +		else +			goto unlock; +	} +  	/*  	 * "imp" is the fault differential for the source task between the  	 * source and destination node. Calculate the total differential for  	 * the source task and potential destination task. The more negative -	 * the value is, the more rmeote accesses that would be expected to +	 * the value is, the more remote accesses that would be expected to  	 * be incurred if the tasks were swapped.  	 */ -	if (cur) { -		/* Skip this swap candidate if cannot move to the source CPU: */ -		if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) -			goto unlock; +	/* Skip this swap candidate if cannot move to the source cpu */ +	if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) +		goto unlock; +	/* +	 * If dst and source tasks are in the same NUMA group, or not +	 * in any group then look only at task weights. +	 */ +	if (cur->numa_group == env->p->numa_group) { +		imp = taskimp + task_weight(cur, env->src_nid, dist) - +		      task_weight(cur, env->dst_nid, dist);  		/* -		 * If dst and source tasks are in the same NUMA group, or not -		 * in any group then look only at task weights. +		 * Add some hysteresis to prevent swapping the +		 * tasks within a group over tiny differences.  		 */ -		if (cur->numa_group == env->p->numa_group) { -			imp = taskimp + task_weight(cur, env->src_nid, dist) - -			      task_weight(cur, env->dst_nid, dist); -			/* -			 * Add some hysteresis to prevent swapping the -			 * tasks within a group over tiny differences. -			 */ -			if (cur->numa_group) -				imp -= imp/16; -		} else { -			/* -			 * Compare the group weights. If a task is all by -			 * itself (not part of a group), use the task weight -			 * instead. -			 */ -			if (cur->numa_group) -				imp += group_weight(cur, env->src_nid, dist) - -				       group_weight(cur, env->dst_nid, dist); -			else -				imp += task_weight(cur, env->src_nid, dist) - -				       task_weight(cur, env->dst_nid, dist); -		} +		if (cur->numa_group) +			imp -= imp / 16; +	} else { +		/* +		 * Compare the group weights. If a task is all by itself +		 * (not part of a group), use the task weight instead. +		 */ +		if (cur->numa_group && env->p->numa_group) +			imp += group_weight(cur, env->src_nid, dist) - +			       group_weight(cur, env->dst_nid, dist); +		else +			imp += task_weight(cur, env->src_nid, dist) - +			       task_weight(cur, env->dst_nid, dist);  	} -	if (imp <= env->best_imp && moveimp <= env->best_imp) +	if (imp <= env->best_imp)  		goto unlock; -	if (!cur) { -		/* Is there capacity at our destination? */ -		if (env->src_stats.nr_running <= env->src_stats.task_capacity && -		    !env->dst_stats.has_free_capacity) -			goto unlock; - -		goto balance; -	} - -	/* Balance doesn't matter much if we're running a task per CPU: */ -	if (imp > env->best_imp && src_rq->nr_running == 1 && -			dst_rq->nr_running == 1) +	if (maymove && moveimp > imp && moveimp > env->best_imp) { +		imp = moveimp - 1; +		cur = NULL;  		goto assign; +	}  	/*  	 * In the overloaded case, try and keep the load balanced.  	 */ -balance: -	load = task_h_load(env->p); +	load = task_h_load(env->p) - task_h_load(cur); +	if (!load) +		goto assign; +  	dst_load = env->dst_stats.load + load;  	src_load = env->src_stats.load - load; -	if (moveimp > imp && moveimp > env->best_imp) { -		/* -		 * If the improvement from just moving env->p direction is -		 * better than swapping tasks around, check if a move is -		 * possible. Store a slightly smaller score than moveimp, -		 * so an actually idle CPU will win. -		 */ -		if (!load_too_imbalanced(src_load, dst_load, env)) { -			imp = moveimp - 1; -			cur = NULL; -			goto assign; -		} -	} - -	if (imp <= env->best_imp) -		goto unlock; - -	if (cur) { -		load = task_h_load(cur); -		dst_load -= load; -		src_load += load; -	} -  	if (load_too_imbalanced(src_load, dst_load, env))  		goto unlock; +assign:  	/*  	 * One idle CPU per node is evaluated for a task numa move.  	 * Call select_idle_sibling to maybe find a better one. @@ -1711,7 +1663,6 @@ balance:  		local_irq_enable();  	} -assign:  	task_numa_assign(env, cur, imp);  unlock:  	rcu_read_unlock(); @@ -1720,43 +1671,30 @@ unlock:  static void task_numa_find_cpu(struct task_numa_env *env,  				long taskimp, long groupimp)  { +	long src_load, dst_load, load; +	bool maymove = false;  	int cpu; +	load = task_h_load(env->p); +	dst_load = env->dst_stats.load + load; +	src_load = env->src_stats.load - load; + +	/* +	 * If the improvement from just moving env->p direction is better +	 * than swapping tasks around, check if a move is possible. +	 */ +	maymove = !load_too_imbalanced(src_load, dst_load, env); +  	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {  		/* Skip this CPU if the source task cannot migrate */  		if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))  			continue;  		env->dst_cpu = cpu; -		task_numa_compare(env, taskimp, groupimp); +		task_numa_compare(env, taskimp, groupimp, maymove);  	}  } -/* Only move tasks to a NUMA node less busy than the current node. */ -static bool numa_has_capacity(struct task_numa_env *env) -{ -	struct numa_stats *src = &env->src_stats; -	struct numa_stats *dst = &env->dst_stats; - -	if (src->has_free_capacity && !dst->has_free_capacity) -		return false; - -	/* -	 * Only consider a task move if the source has a higher load -	 * than the destination, corrected for CPU capacity on each node. -	 * -	 *      src->load                dst->load -	 * --------------------- vs --------------------- -	 * src->compute_capacity    dst->compute_capacity -	 */ -	if (src->load * dst->compute_capacity * env->imbalance_pct > - -	    dst->load * src->compute_capacity * 100) -		return true; - -	return false; -} -  static int task_numa_migrate(struct task_struct *p)  {  	struct task_numa_env env = { @@ -1797,7 +1735,7 @@ static int task_numa_migrate(struct task_struct *p)  	 * elsewhere, so there is no point in (re)trying.  	 */  	if (unlikely(!sd)) { -		p->numa_preferred_nid = task_node(p); +		sched_setnuma(p, task_node(p));  		return -EINVAL;  	} @@ -1811,8 +1749,7 @@ static int task_numa_migrate(struct task_struct *p)  	update_numa_stats(&env.dst_stats, env.dst_nid);  	/* Try to find a spot on the preferred nid. */ -	if (numa_has_capacity(&env)) -		task_numa_find_cpu(&env, taskimp, groupimp); +	task_numa_find_cpu(&env, taskimp, groupimp);  	/*  	 * Look at other nodes in these cases: @@ -1842,8 +1779,7 @@ static int task_numa_migrate(struct task_struct *p)  			env.dist = dist;  			env.dst_nid = nid;  			update_numa_stats(&env.dst_stats, env.dst_nid); -			if (numa_has_capacity(&env)) -				task_numa_find_cpu(&env, taskimp, groupimp); +			task_numa_find_cpu(&env, taskimp, groupimp);  		}  	} @@ -1856,15 +1792,13 @@ static int task_numa_migrate(struct task_struct *p)  	 * trying for a better one later. Do not set the preferred node here.  	 */  	if (p->numa_group) { -		struct numa_group *ng = p->numa_group; -  		if (env.best_cpu == -1)  			nid = env.src_nid;  		else -			nid = env.dst_nid; +			nid = cpu_to_node(env.best_cpu); -		if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng)) -			sched_setnuma(p, env.dst_nid); +		if (nid != p->numa_preferred_nid) +			sched_setnuma(p, nid);  	}  	/* No better CPU than the current one was found. */ @@ -1884,7 +1818,8 @@ static int task_numa_migrate(struct task_struct *p)  		return ret;  	} -	ret = migrate_swap(p, env.best_task); +	ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu); +  	if (ret != 0)  		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));  	put_task_struct(env.best_task); @@ -2144,8 +2079,8 @@ static int preferred_group_nid(struct task_struct *p, int nid)  static void task_numa_placement(struct task_struct *p)  { -	int seq, nid, max_nid = -1, max_group_nid = -1; -	unsigned long max_faults = 0, max_group_faults = 0; +	int seq, nid, max_nid = -1; +	unsigned long max_faults = 0;  	unsigned long fault_types[2] = { 0, 0 };  	unsigned long total_faults;  	u64 runtime, period; @@ -2224,33 +2159,30 @@ static void task_numa_placement(struct task_struct *p)  			}  		} -		if (faults > max_faults) { -			max_faults = faults; +		if (!p->numa_group) { +			if (faults > max_faults) { +				max_faults = faults; +				max_nid = nid; +			} +		} else if (group_faults > max_faults) { +			max_faults = group_faults;  			max_nid = nid;  		} - -		if (group_faults > max_group_faults) { -			max_group_faults = group_faults; -			max_group_nid = nid; -		}  	} -	update_task_scan_period(p, fault_types[0], fault_types[1]); -  	if (p->numa_group) {  		numa_group_count_active_nodes(p->numa_group);  		spin_unlock_irq(group_lock); -		max_nid = preferred_group_nid(p, max_group_nid); +		max_nid = preferred_group_nid(p, max_nid);  	}  	if (max_faults) {  		/* Set the new preferred node */  		if (max_nid != p->numa_preferred_nid)  			sched_setnuma(p, max_nid); - -		if (task_node(p) != p->numa_preferred_nid) -			numa_migrate_preferred(p);  	} + +	update_task_scan_period(p, fault_types[0], fault_types[1]);  }  static inline int get_numa_group(struct numa_group *grp) @@ -2450,14 +2382,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)  				numa_is_active_node(mem_node, ng))  		local = 1; -	task_numa_placement(p); -  	/*  	 * Retry task to preferred node migration periodically, in case it  	 * case it previously failed, or the scheduler moved us.  	 */ -	if (time_after(jiffies, p->numa_migrate_retry)) +	if (time_after(jiffies, p->numa_migrate_retry)) { +		task_numa_placement(p);  		numa_migrate_preferred(p); +	}  	if (migrated)  		p->numa_pages_migrated += pages; @@ -2749,19 +2681,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)  } while (0)  #ifdef CONFIG_SMP -/* - * XXX we want to get rid of these helpers and use the full load resolution. - */ -static inline long se_weight(struct sched_entity *se) -{ -	return scale_load_down(se->load.weight); -} - -static inline long se_runnable(struct sched_entity *se) -{ -	return scale_load_down(se->runnable_weight); -} -  static inline void  enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)  { @@ -3062,314 +2981,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)  }  #ifdef CONFIG_SMP -/* - * Approximate: - *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period) - */ -static u64 decay_load(u64 val, u64 n) -{ -	unsigned int local_n; - -	if (unlikely(n > LOAD_AVG_PERIOD * 63)) -		return 0; - -	/* after bounds checking we can collapse to 32-bit */ -	local_n = n; - -	/* -	 * As y^PERIOD = 1/2, we can combine -	 *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD) -	 * With a look-up table which covers y^n (n<PERIOD) -	 * -	 * To achieve constant time decay_load. -	 */ -	if (unlikely(local_n >= LOAD_AVG_PERIOD)) { -		val >>= local_n / LOAD_AVG_PERIOD; -		local_n %= LOAD_AVG_PERIOD; -	} - -	val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32); -	return val; -} - -static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) -{ -	u32 c1, c2, c3 = d3; /* y^0 == 1 */ - -	/* -	 * c1 = d1 y^p -	 */ -	c1 = decay_load((u64)d1, periods); - -	/* -	 *            p-1 -	 * c2 = 1024 \Sum y^n -	 *            n=1 -	 * -	 *              inf        inf -	 *    = 1024 ( \Sum y^n - \Sum y^n - y^0 ) -	 *              n=0        n=p -	 */ -	c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024; - -	return c1 + c2 + c3; -} - -/* - * Accumulate the three separate parts of the sum; d1 the remainder - * of the last (incomplete) period, d2 the span of full periods and d3 - * the remainder of the (incomplete) current period. - * - *           d1          d2           d3 - *           ^           ^            ^ - *           |           |            | - *         |<->|<----------------->|<--->| - * ... |---x---|------| ... |------|-----x (now) - * - *                           p-1 - * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0 - *                           n=1 - * - *    = u y^p +					(Step 1) - * - *                     p-1 - *      d1 y^p + 1024 \Sum y^n + d3 y^0		(Step 2) - *                     n=1 - */ -static __always_inline u32 -accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, -	       unsigned long load, unsigned long runnable, int running) -{ -	unsigned long scale_freq, scale_cpu; -	u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ -	u64 periods; - -	scale_freq = arch_scale_freq_capacity(cpu); -	scale_cpu = arch_scale_cpu_capacity(NULL, cpu); - -	delta += sa->period_contrib; -	periods = delta / 1024; /* A period is 1024us (~1ms) */ - -	/* -	 * Step 1: decay old *_sum if we crossed period boundaries. -	 */ -	if (periods) { -		sa->load_sum = decay_load(sa->load_sum, periods); -		sa->runnable_load_sum = -			decay_load(sa->runnable_load_sum, periods); -		sa->util_sum = decay_load((u64)(sa->util_sum), periods); - -		/* -		 * Step 2 -		 */ -		delta %= 1024; -		contrib = __accumulate_pelt_segments(periods, -				1024 - sa->period_contrib, delta); -	} -	sa->period_contrib = delta; - -	contrib = cap_scale(contrib, scale_freq); -	if (load) -		sa->load_sum += load * contrib; -	if (runnable) -		sa->runnable_load_sum += runnable * contrib; -	if (running) -		sa->util_sum += contrib * scale_cpu; - -	return periods; -} - -/* - * We can represent the historical contribution to runnable average as the - * coefficients of a geometric series.  To do this we sub-divide our runnable - * history into segments of approximately 1ms (1024us); label the segment that - * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. - * - * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... - *      p0            p1           p2 - *     (now)       (~1ms ago)  (~2ms ago) - * - * Let u_i denote the fraction of p_i that the entity was runnable. - * - * We then designate the fractions u_i as our co-efficients, yielding the - * following representation of historical load: - *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... - * - * We choose y based on the with of a reasonably scheduling period, fixing: - *   y^32 = 0.5 - * - * This means that the contribution to load ~32ms ago (u_32) will be weighted - * approximately half as much as the contribution to load within the last ms - * (u_0). - * - * When a period "rolls over" and we have new u_0`, multiplying the previous - * sum again by y is sufficient to update: - *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) - *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] - */ -static __always_inline int -___update_load_sum(u64 now, int cpu, struct sched_avg *sa, -		  unsigned long load, unsigned long runnable, int running) -{ -	u64 delta; - -	delta = now - sa->last_update_time; -	/* -	 * This should only happen when time goes backwards, which it -	 * unfortunately does during sched clock init when we swap over to TSC. -	 */ -	if ((s64)delta < 0) { -		sa->last_update_time = now; -		return 0; -	} - -	/* -	 * Use 1024ns as the unit of measurement since it's a reasonable -	 * approximation of 1us and fast to compute. -	 */ -	delta >>= 10; -	if (!delta) -		return 0; - -	sa->last_update_time += delta << 10; - -	/* -	 * running is a subset of runnable (weight) so running can't be set if -	 * runnable is clear. But there are some corner cases where the current -	 * se has been already dequeued but cfs_rq->curr still points to it. -	 * This means that weight will be 0 but not running for a sched_entity -	 * but also for a cfs_rq if the latter becomes idle. As an example, -	 * this happens during idle_balance() which calls -	 * update_blocked_averages() -	 */ -	if (!load) -		runnable = running = 0; - -	/* -	 * Now we know we crossed measurement unit boundaries. The *_avg -	 * accrues by two steps: -	 * -	 * Step 1: accumulate *_sum since last_update_time. If we haven't -	 * crossed period boundaries, finish. -	 */ -	if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) -		return 0; - -	return 1; -} - -static __always_inline void -___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) -{ -	u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; - -	/* -	 * Step 2: update *_avg. -	 */ -	sa->load_avg = div_u64(load * sa->load_sum, divider); -	sa->runnable_load_avg =	div_u64(runnable * sa->runnable_load_sum, divider); -	sa->util_avg = sa->util_sum / divider; -} - -/* - * When a task is dequeued, its estimated utilization should not be update if - * its util_avg has not been updated at least once. - * This flag is used to synchronize util_avg updates with util_est updates. - * We map this information into the LSB bit of the utilization saved at - * dequeue time (i.e. util_est.dequeued). - */ -#define UTIL_AVG_UNCHANGED 0x1 - -static inline void cfs_se_util_change(struct sched_avg *avg) -{ -	unsigned int enqueued; - -	if (!sched_feat(UTIL_EST)) -		return; - -	/* Avoid store if the flag has been already set */ -	enqueued = avg->util_est.enqueued; -	if (!(enqueued & UTIL_AVG_UNCHANGED)) -		return; - -	/* Reset flag to report util_avg has been updated */ -	enqueued &= ~UTIL_AVG_UNCHANGED; -	WRITE_ONCE(avg->util_est.enqueued, enqueued); -} - -/* - * sched_entity: - * - *   task: - *     se_runnable() == se_weight() - * - *   group: [ see update_cfs_group() ] - *     se_weight()   = tg->weight * grq->load_avg / tg->load_avg - *     se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg - * - *   load_sum := runnable_sum - *   load_avg = se_weight(se) * runnable_avg - * - *   runnable_load_sum := runnable_sum - *   runnable_load_avg = se_runnable(se) * runnable_avg - * - * XXX collapse load_sum and runnable_load_sum - * - * cfq_rs: - * - *   load_sum = \Sum se_weight(se) * se->avg.load_sum - *   load_avg = \Sum se->avg.load_avg - * - *   runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum - *   runnable_load_avg = \Sum se->avg.runable_load_avg - */ - -static int -__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) -{ -	if (entity_is_task(se)) -		se->runnable_weight = se->load.weight; - -	if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { -		___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); -		return 1; -	} - -	return 0; -} - -static int -__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -	if (entity_is_task(se)) -		se->runnable_weight = se->load.weight; - -	if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, -				cfs_rq->curr == se)) { - -		___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); -		cfs_se_util_change(&se->avg); -		return 1; -	} - -	return 0; -} - -static int -__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) -{ -	if (___update_load_sum(now, cpu, &cfs_rq->avg, -				scale_load_down(cfs_rq->load.weight), -				scale_load_down(cfs_rq->runnable_weight), -				cfs_rq->curr != NULL)) { - -		___update_load_avg(&cfs_rq->avg, 1, 1); -		return 1; -	} - -	return 0; -} -  #ifdef CONFIG_FAIR_GROUP_SCHED  /**   * update_tg_load_avg - update the tg's load avg @@ -3982,18 +3593,10 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)  	if (!sched_feat(UTIL_EST))  		return; -	/* -	 * Update root cfs_rq's estimated utilization -	 * -	 * If *p is the last task then the root cfs_rq's estimated utilization -	 * of a CPU is 0 by definition. -	 */ -	ue.enqueued = 0; -	if (cfs_rq->nr_running) { -		ue.enqueued  = cfs_rq->avg.util_est.enqueued; -		ue.enqueued -= min_t(unsigned int, ue.enqueued, -				     (_task_util_est(p) | UTIL_AVG_UNCHANGED)); -	} +	/* Update root cfs_rq's estimated utilization */ +	ue.enqueued  = cfs_rq->avg.util_est.enqueued; +	ue.enqueued -= min_t(unsigned int, ue.enqueued, +			     (_task_util_est(p) | UTIL_AVG_UNCHANGED));  	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);  	/* @@ -4045,12 +3648,6 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)  #else /* CONFIG_SMP */ -static inline int -update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) -{ -	return 0; -} -  #define UPDATE_TG	0x0  #define SKIP_AGE_LOAD	0x0  #define DO_ATTACH	0x0 @@ -4590,6 +4187,7 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)  	now = sched_clock_cpu(smp_processor_id());  	cfs_b->runtime = cfs_b->quota;  	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); +	cfs_b->expires_seq++;  }  static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -4612,6 +4210,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)  	struct task_group *tg = cfs_rq->tg;  	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);  	u64 amount = 0, min_amount, expires; +	int expires_seq;  	/* note: this is a positive sum as runtime_remaining <= 0 */  	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; @@ -4628,6 +4227,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)  			cfs_b->idle = 0;  		}  	} +	expires_seq = cfs_b->expires_seq;  	expires = cfs_b->runtime_expires;  	raw_spin_unlock(&cfs_b->lock); @@ -4637,8 +4237,10 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)  	 * spread between our sched_clock and the one on which runtime was  	 * issued.  	 */ -	if ((s64)(expires - cfs_rq->runtime_expires) > 0) +	if (cfs_rq->expires_seq != expires_seq) { +		cfs_rq->expires_seq = expires_seq;  		cfs_rq->runtime_expires = expires; +	}  	return cfs_rq->runtime_remaining > 0;  } @@ -4664,12 +4266,9 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)  	 * has not truly expired.  	 *  	 * Fortunately we can check determine whether this the case by checking -	 * whether the global deadline has advanced. It is valid to compare -	 * cfs_b->runtime_expires without any locks since we only care about -	 * exact equality, so a partial write will still work. +	 * whether the global deadline(cfs_b->expires_seq) has advanced.  	 */ - -	if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { +	if (cfs_rq->expires_seq == cfs_b->expires_seq) {  		/* extend local deadline, drift is bounded above by 2 ticks */  		cfs_rq->runtime_expires += TICK_NSEC;  	} else { @@ -4732,7 +4331,6 @@ static inline int throttled_lb_pair(struct task_group *tg,  	       throttled_hierarchy(dest_cfs_rq);  } -/* updated child weight may affect parent so we have to do this bottom up */  static int tg_unthrottle_up(struct task_group *tg, void *data)  {  	struct rq *rq = data; @@ -5202,13 +4800,18 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)  void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)  { +	u64 overrun; +  	lockdep_assert_held(&cfs_b->lock); -	if (!cfs_b->period_active) { -		cfs_b->period_active = 1; -		hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); -		hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); -	} +	if (cfs_b->period_active) +		return; + +	cfs_b->period_active = 1; +	overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); +	cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period); +	cfs_b->expires_seq++; +	hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);  }  static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -5654,8 +5257,6 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load,  		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;  	} - -	sched_avg_update(this_rq);  }  /* Used instead of source_load when we know the type == 0 */ @@ -6238,6 +5839,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p  }  #ifdef CONFIG_SCHED_SMT +DEFINE_STATIC_KEY_FALSE(sched_smt_present);  static inline void set_idle_cores(int cpu, int val)  { @@ -7295,8 +6897,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env)  static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)  {  	struct numa_group *numa_group = rcu_dereference(p->numa_group); -	unsigned long src_faults, dst_faults; -	int src_nid, dst_nid; +	unsigned long src_weight, dst_weight; +	int src_nid, dst_nid, dist;  	if (!static_branch_likely(&sched_numa_balancing))  		return -1; @@ -7323,18 +6925,19 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)  		return 0;  	/* Leaving a core idle is often worse than degrading locality. */ -	if (env->idle != CPU_NOT_IDLE) +	if (env->idle == CPU_IDLE)  		return -1; +	dist = node_distance(src_nid, dst_nid);  	if (numa_group) { -		src_faults = group_faults(p, src_nid); -		dst_faults = group_faults(p, dst_nid); +		src_weight = group_weight(p, src_nid, dist); +		dst_weight = group_weight(p, dst_nid, dist);  	} else { -		src_faults = task_faults(p, src_nid); -		dst_faults = task_faults(p, dst_nid); +		src_weight = task_weight(p, src_nid, dist); +		dst_weight = task_weight(p, dst_nid, dist);  	} -	return dst_faults < src_faults; +	return dst_weight < src_weight;  }  #else @@ -7621,6 +7224,22 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)  	return false;  } +static inline bool others_have_blocked(struct rq *rq) +{ +	if (READ_ONCE(rq->avg_rt.util_avg)) +		return true; + +	if (READ_ONCE(rq->avg_dl.util_avg)) +		return true; + +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +	if (READ_ONCE(rq->avg_irq.util_avg)) +		return true; +#endif + +	return false; +} +  #ifdef CONFIG_FAIR_GROUP_SCHED  static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) @@ -7680,6 +7299,12 @@ static void update_blocked_averages(int cpu)  		if (cfs_rq_has_blocked(cfs_rq))  			done = false;  	} +	update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); +	update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); +	update_irq_load_avg(rq, 0); +	/* Don't need periodic decay once load/util_avg are null */ +	if (others_have_blocked(rq)) +		done = false;  #ifdef CONFIG_NO_HZ_COMMON  	rq->last_blocked_load_update_tick = jiffies; @@ -7745,9 +7370,12 @@ static inline void update_blocked_averages(int cpu)  	rq_lock_irqsave(rq, &rf);  	update_rq_clock(rq);  	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); +	update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); +	update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); +	update_irq_load_avg(rq, 0);  #ifdef CONFIG_NO_HZ_COMMON  	rq->last_blocked_load_update_tick = jiffies; -	if (!cfs_rq_has_blocked(cfs_rq)) +	if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))  		rq->has_blocked_load = 0;  #endif  	rq_unlock_irqrestore(rq, &rf); @@ -7857,39 +7485,32 @@ static inline int get_sd_load_idx(struct sched_domain *sd,  static unsigned long scale_rt_capacity(int cpu)  {  	struct rq *rq = cpu_rq(cpu); -	u64 total, used, age_stamp, avg; -	s64 delta; +	unsigned long max = arch_scale_cpu_capacity(NULL, cpu); +	unsigned long used, free; +	unsigned long irq; -	/* -	 * Since we're reading these variables without serialization make sure -	 * we read them once before doing sanity checks on them. -	 */ -	age_stamp = READ_ONCE(rq->age_stamp); -	avg = READ_ONCE(rq->rt_avg); -	delta = __rq_clock_broken(rq) - age_stamp; +	irq = cpu_util_irq(rq); -	if (unlikely(delta < 0)) -		delta = 0; +	if (unlikely(irq >= max)) +		return 1; -	total = sched_avg_period() + delta; +	used = READ_ONCE(rq->avg_rt.util_avg); +	used += READ_ONCE(rq->avg_dl.util_avg); -	used = div_u64(avg, total); +	if (unlikely(used >= max)) +		return 1; -	if (likely(used < SCHED_CAPACITY_SCALE)) -		return SCHED_CAPACITY_SCALE - used; +	free = max - used; -	return 1; +	return scale_irq_capacity(free, irq, max);  }  static void update_cpu_capacity(struct sched_domain *sd, int cpu)  { -	unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); +	unsigned long capacity = scale_rt_capacity(cpu);  	struct sched_group *sdg = sd->groups; -	cpu_rq(cpu)->cpu_capacity_orig = capacity; - -	capacity *= scale_rt_capacity(cpu); -	capacity >>= SCHED_CAPACITY_SHIFT; +	cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu);  	if (!capacity)  		capacity = 1; | 
