From 18f08dae19990f5fffde92e3a63e0d90cda0f1a8 Mon Sep 17 00:00:00 2001
From: Cheng Jian <cj.chengjian@huawei.com>
Date: Fri, 4 Aug 2017 17:19:37 +0800
Subject: sched/core: Remove unnecessary initialization init_idle_bootup_task()

init_idle_bootup_task( ) is called in rest_init( ) to switch
the scheduling class of the boot thread to the idle class.

the function only sets:

    idle->sched_class = &idle_sched_class;

which has been set in init_idle() called by sched_init():

    /*
     * The idle tasks have their own, simple scheduling class:
     */
    idle->sched_class = &idle_sched_class;

We've already set the boot thread to idle class in
start_kernel()->sched_init()->init_idle()
so it's unnecessary to set it again in
start_kernel()->rest_init()->init_idle_bootup_task()

Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <akpm@linux-foundation.org>
Cc: <huawei.libin@huawei.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1501838377-109720-1-git-send-email-cj.chengjian@huawei.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/task.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index c97e5f096927..79a2a744648d 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -30,7 +30,6 @@ extern int lockdep_tasklist_lock_is_held(void);
 
 extern asmlinkage void schedule_tail(struct task_struct *prev);
 extern void init_idle(struct task_struct *idle, int cpu);
-extern void init_idle_bootup_task(struct task_struct *idle);
 
 extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
 extern void sched_dead(struct task_struct *p);
-- 
cgit v1.2.3


From 74dc3384fc7983b78cc46ebb1824968a3db85eb1 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <asarai@suse.com>
Date: Sun, 6 Aug 2017 14:41:41 +1000
Subject: sched/debug: Use task_pid_nr_ns in /proc/$pid/sched

It appears as though the addition of the PID namespace did not update
the output code for /proc/*/sched, which resulted in it providing PIDs
that were not self-consistent with the /proc mount. This additionally
made it trivial to detect whether a process was inside &init_pid_ns from
userspace, making container detection trivial:

   https://github.com/jessfraz/amicontained

This leads to situations such as:

  % unshare -pmf
  % mount -t proc proc /proc
  % head -n1 /proc/1/sched
  head (10047, #threads: 1)

Fix this by just using task_pid_nr_ns for the output of /proc/*/sched.
All of the other uses of task_pid_nr in kernel/sched/debug.c are from a
sysctl context and thus don't need to be namespaced.

Signed-off-by: Aleksa Sarai <asarai@suse.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Jess Frazelle <acidburn@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: cyphar@cyphar.com
Link: http://lkml.kernel.org/r/20170806044141.5093-1-asarai@suse.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 fs/proc/base.c              | 3 ++-
 include/linux/sched/debug.h | 4 +++-
 kernel/sched/debug.c        | 5 +++--
 3 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 719c2e943ea1..98fd8f6df851 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1408,12 +1408,13 @@ static const struct file_operations proc_fail_nth_operations = {
 static int sched_show(struct seq_file *m, void *v)
 {
 	struct inode *inode = m->private;
+	struct pid_namespace *ns = inode->i_sb->s_fs_info;
 	struct task_struct *p;
 
 	p = get_proc_task(inode);
 	if (!p)
 		return -ESRCH;
-	proc_sched_show_task(p, m);
+	proc_sched_show_task(p, ns, m);
 
 	put_task_struct(p);
 
diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h
index e0eaee54c5a4..5d58d49e9f87 100644
--- a/include/linux/sched/debug.h
+++ b/include/linux/sched/debug.h
@@ -6,6 +6,7 @@
  */
 
 struct task_struct;
+struct pid_namespace;
 
 extern void dump_cpu_task(int cpu);
 
@@ -34,7 +35,8 @@ extern void sched_show_task(struct task_struct *p);
 
 #ifdef CONFIG_SCHED_DEBUG
 struct seq_file;
-extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
+extern void proc_sched_show_task(struct task_struct *p,
+				 struct pid_namespace *ns, struct seq_file *m);
 extern void proc_sched_set_task(struct task_struct *p);
 #endif
 
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4fa66de52bd6..ac345115877b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -872,11 +872,12 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
 #endif
 }
 
-void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
+						  struct seq_file *m)
 {
 	unsigned long nr_switches;
 
-	SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p),
+	SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns),
 						get_nr_threads(p));
 	SEQ_printf(m,
 		"---------------------------------------------------------"
-- 
cgit v1.2.3


From 20435d84e5f2041c64c792399ab6f2948a2c2252 Mon Sep 17 00:00:00 2001
From: Xie XiuQi <xiexiuqi@huawei.com>
Date: Mon, 7 Aug 2017 16:44:23 +0800
Subject: sched/debug: Intruduce task_state_to_char() helper function

Now that we have more than one place to get the task state,
intruduce the task_state_to_char() helper function to save some code.

No functionality changed.

Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <cj.chengjian@huawei.com>
Cc: <huawei.libin@huawei.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1502095463-160172-3-git-send-email-xiexiuqi@huawei.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 13 +++++++++++++
 kernel/sched/core.c   | 15 ++++-----------
 kernel/sched/debug.c  | 10 +++-------
 3 files changed, 20 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8337e2db0bb2..c28b182c9833 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1229,6 +1229,19 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
 	return task_pgrp_nr_ns(tsk, &init_pid_ns);
 }
 
+static inline char task_state_to_char(struct task_struct *task)
+{
+	const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
+	unsigned long state = task->state;
+
+	state = state ? __ffs(state) + 1 : 0;
+
+	/* Make sure the string lines up properly with the number of task states: */
+	BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1);
+
+	return state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?';
+}
+
 /**
  * is_global_init - check if a task structure is init. Since init
  * is free to have sub-threads we need to check tgid.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6d91c10b1814..f9f9948e2470 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5103,24 +5103,17 @@ out_unlock:
 	return retval;
 }
 
-static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
-
 void sched_show_task(struct task_struct *p)
 {
 	unsigned long free = 0;
 	int ppid;
-	unsigned long state = p->state;
-
-	/* Make sure the string lines up properly with the number of task states: */
-	BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1);
 
 	if (!try_get_task_stack(p))
 		return;
-	if (state)
-		state = __ffs(state) + 1;
-	printk(KERN_INFO "%-15.15s %c", p->comm,
-		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
-	if (state == TASK_RUNNING)
+
+	printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
+
+	if (p->state == TASK_RUNNING)
 		printk(KERN_CONT "  running task    ");
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	free = stack_not_used(p);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index d8d2ea215b85..cfd84f79e075 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -426,14 +426,10 @@ static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
 static void
 print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 {
-	unsigned long state;
-
-	if (rq->curr == p) {
+	if (rq->curr == p)
 		SEQ_printf(m, ">R");
-	} else {
-		state = p->state ? __ffs(p->state) + 1 : 0;
-		SEQ_printf(m, " %c", state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
-	}
+	else
+		SEQ_printf(m, " %c", task_state_to_char(p));
 
 	SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
 		p->comm, task_pid_nr(p),
-- 
cgit v1.2.3


From 90001d67be2fa2acbe3510d1f64fa6533efa30ef Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 31 Jul 2017 17:50:05 +0200
Subject: sched/fair: Fix wake_affine() for !NUMA_BALANCING

In commit:

  3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")

Rik changed wake_affine to consider NUMA information when balancing
between LLC domains.

There are a number of problems here which this patch tries to address:

 - LLC < NODE; in this case we'd use the wrong information to balance
 - !NUMA_BALANCING: in this case, the new code doesn't do any
   balancing at all
 - re-computes the NUMA data for every wakeup, this can mean iterating
   up to 64 CPUs for every wakeup.
 - default affine wakeups inside a cache

We address these by saving the load/capacity values for each
sched_domain during regular load-balance and using these values in
wake_affine_llc(). The obvious down-side to using cached values is
that they can be too old and poorly reflect reality.

But this way we can use LLC wide information and thus not rely on
assuming LLC matches NODE. We also don't rely on NUMA_BALANCING nor do
we have to aggegate two nodes (or even cache domains) worth of CPUs
for each wakeup.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Fixes: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")
[ Minor readability improvements. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/topology.h |   8 ++
 kernel/sched/fair.c            | 190 ++++++++++++++++++++++++++---------------
 2 files changed, 130 insertions(+), 68 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 7d065abc7a47..d7b6dab956ec 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -71,6 +71,14 @@ struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
+
+	/*
+	 * Some variables from the most recent sd_lb_stats for this domain,
+	 * used by wake_affine().
+	 */
+	unsigned long	nr_running;
+	unsigned long	load;
+	unsigned long	capacity;
 };
 
 struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a7f1c3b797f8..8d5868771cb3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2658,59 +2658,6 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 	}
 }
 
-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- */
-static inline bool numa_wake_affine(struct sched_domain *sd,
-				    struct task_struct *p, int this_cpu,
-				    int prev_cpu, int sync)
-{
-	struct numa_stats prev_load, this_load;
-	s64 this_eff_load, prev_eff_load;
-
-	update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
-	update_numa_stats(&this_load, cpu_to_node(this_cpu));
-
-	/*
-	 * If sync wakeup then subtract the (maximum possible)
-	 * effect of the currently running task from the load
-	 * of the current CPU:
-	 */
-	if (sync) {
-		unsigned long current_load = task_h_load(current);
-
-		if (this_load.load > current_load)
-			this_load.load -= current_load;
-		else
-			this_load.load = 0;
-	}
-
-	/*
-	 * In low-load situations, where this_cpu's node is idle due to the
-	 * sync cause above having dropped this_load.load to 0, move the task.
-	 * Moving to an idle socket will not create a bad imbalance.
-	 *
-	 * Otherwise check if the nodes are near enough in load to allow this
-	 * task to be woken on this_cpu's node.
-	 */
-	if (this_load.load > 0) {
-		unsigned long task_load = task_h_load(p);
-
-		this_eff_load = 100;
-		this_eff_load *= prev_load.compute_capacity;
-
-		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-		prev_eff_load *= this_load.compute_capacity;
-
-		this_eff_load *= this_load.load + task_load;
-		prev_eff_load *= prev_load.load - task_load;
-
-		return this_eff_load <= prev_eff_load;
-	}
-
-	return true;
-}
 #else
 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 {
@@ -2724,14 +2671,6 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 {
 }
 
-#ifdef CONFIG_SMP
-static inline bool numa_wake_affine(struct sched_domain *sd,
-				    struct task_struct *p, int this_cpu,
-				    int prev_cpu, int sync)
-{
-	return true;
-}
-#endif /* !SMP */
 #endif /* CONFIG_NUMA_BALANCING */
 
 static void
@@ -5428,20 +5367,115 @@ static int wake_wide(struct task_struct *p)
 	return 1;
 }
 
+struct llc_stats {
+	unsigned long	nr_running;
+	unsigned long	load;
+	unsigned long	capacity;
+	int		has_capacity;
+};
+
+static bool get_llc_stats(struct llc_stats *stats, int cpu)
+{
+	struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+
+	if (!sds)
+		return false;
+
+	stats->nr_running	= READ_ONCE(sds->nr_running);
+	stats->load		= READ_ONCE(sds->load);
+	stats->capacity		= READ_ONCE(sds->capacity);
+	stats->has_capacity	= stats->nr_running < per_cpu(sd_llc_size, cpu);
+
+	return true;
+}
+
+/*
+ * Can a task be moved from prev_cpu to this_cpu without causing a load
+ * imbalance that would trigger the load balancer?
+ *
+ * Since we're running on 'stale' values, we might in fact create an imbalance
+ * but recomputing these values is expensive, as that'd mean iteration 2 cache
+ * domains worth of CPUs.
+ */
+static bool
+wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
+		int this_cpu, int prev_cpu, int sync)
+{
+	struct llc_stats prev_stats, this_stats;
+	s64 this_eff_load, prev_eff_load;
+	unsigned long task_load;
+
+	if (!get_llc_stats(&prev_stats, prev_cpu) ||
+	    !get_llc_stats(&this_stats, this_cpu))
+		return false;
+
+	/*
+	 * If sync wakeup then subtract the (maximum possible)
+	 * effect of the currently running task from the load
+	 * of the current LLC.
+	 */
+	if (sync) {
+		unsigned long current_load = task_h_load(current);
+
+		/* in this case load hits 0 and this LLC is considered 'idle' */
+		if (current_load > this_stats.load)
+			return true;
+
+		this_stats.load -= current_load;
+	}
+
+	/*
+	 * The has_capacity stuff is not SMT aware, but by trying to balance
+	 * the nr_running on both ends we try and fill the domain at equal
+	 * rates, thereby first consuming cores before siblings.
+	 */
+
+	/* if the old cache has capacity, stay there */
+	if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
+		return false;
+
+	/* if this cache has capacity, come here */
+	if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1)
+		return true;
+
+	/*
+	 * Check to see if we can move the load without causing too much
+	 * imbalance.
+	 */
+	task_load = task_h_load(p);
+
+	this_eff_load = 100;
+	this_eff_load *= prev_stats.capacity;
+
+	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+	prev_eff_load *= this_stats.capacity;
+
+	this_eff_load *= this_stats.load + task_load;
+	prev_eff_load *= prev_stats.load - task_load;
+
+	return this_eff_load <= prev_eff_load;
+}
+
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int prev_cpu, int sync)
 {
 	int this_cpu = smp_processor_id();
-	bool affine = false;
+	bool affine;
 
 	/*
-	 * Common case: CPUs are in the same socket, and select_idle_sibling()
-	 * will do its thing regardless of what we return:
+	 * Default to no affine wakeups; wake_affine() should not effect a task
+	 * placement the load-balancer feels inclined to undo. The conservative
+	 * option is therefore to not move tasks when they wake up.
 	 */
-	if (cpus_share_cache(prev_cpu, this_cpu))
-		affine = true;
-	else
-		affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);
+	affine = false;
+
+	/*
+	 * If the wakeup is across cache domains, try to evaluate if movement
+	 * makes sense, otherwise rely on select_idle_siblings() to do
+	 * placement inside the cache domain.
+	 */
+	if (!cpus_share_cache(prev_cpu, this_cpu))
+		affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
 
 	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
 	if (affine) {
@@ -7121,6 +7155,7 @@ struct sg_lb_stats {
 struct sd_lb_stats {
 	struct sched_group *busiest;	/* Busiest group in this sd */
 	struct sched_group *local;	/* Local group in this sd */
+	unsigned long total_running;
 	unsigned long total_load;	/* Total load of all groups in sd */
 	unsigned long total_capacity;	/* Total capacity of all groups in sd */
 	unsigned long avg_load;	/* Average load across all groups in sd */
@@ -7140,6 +7175,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
 	*sds = (struct sd_lb_stats){
 		.busiest = NULL,
 		.local = NULL,
+		.total_running = 0UL,
 		.total_load = 0UL,
 		.total_capacity = 0UL,
 		.busiest_stat = {
@@ -7575,6 +7611,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
+	struct sched_domain_shared *shared = env->sd->shared;
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
@@ -7631,6 +7668,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 
 next_group:
 		/* Now, start updating sd_lb_stats */
+		sds->total_running += sgs->sum_nr_running;
 		sds->total_load += sgs->group_load;
 		sds->total_capacity += sgs->group_capacity;
 
@@ -7646,6 +7684,21 @@ next_group:
 			env->dst_rq->rd->overload = overload;
 	}
 
+	if (!shared)
+		return;
+
+	/*
+	 * Since these are sums over groups they can contain some CPUs
+	 * multiple times for the NUMA domains.
+	 *
+	 * Currently only wake_affine_llc() and find_busiest_group()
+	 * uses these numbers, only the last is affected by this problem.
+	 *
+	 * XXX fix that.
+	 */
+	WRITE_ONCE(shared->nr_running,	sds->total_running);
+	WRITE_ONCE(shared->load,	sds->total_load);
+	WRITE_ONCE(shared->capacity,	sds->total_capacity);
 }
 
 /**
@@ -7875,6 +7928,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	if (!sds.busiest || busiest->sum_nr_running == 0)
 		goto out_balanced;
 
+	/* XXX broken for overlapping NUMA groups */
 	sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
 						/ sds.total_capacity;
 
-- 
cgit v1.2.3