author     John Hawkes <hawkes@sgi.com>       2004-10-20 18:23:39 +0000
committer  Tony Luck <tony.luck@intel.com>    2004-10-20 18:23:39 +0000
commit     5c28bdb3bb9956daa32ec8d3ccaab304330032b9
tree       d8b372e614a0850d4b77f81081becfe3da8434b1
parent     230c76230d0ee3696385f53d226ec562ec6c8a6c
[IA64] top level scheduler domain for ia64
Some have noticed that the overlapping sched domains code doesn't quite work as intended (it results in disjoint domains on some machines), and that a top-level, machine-spanning domain is needed. This patch from John Hawkes adds it to the ia64 code. This allows processes to run on all CPUs in large systems, though balancing is limited. It should go to Linus soon, since otherwise large systems will only have ~16p (depending on topology) usable by the scheduler.

I sanity-checked it on a small system after rediffing John's original, and he's done some testing on very large systems.

Nick, can you buy off on the sched.c change? Alternatively, do you want to send that fix separately, John? Nick did indeed ACK this change, but it isn't dependent on this ia64-specific part ... so it's going to be submitted separately.

Signed-off-by: John Hawkes <hawkes@sgi.com>
Signed-off-by: Jesse Barnes <jbarnes@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
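[Editor's note] For readers new to scheduler domains, the sketch below models the hierarchy this patch builds: each CPU's node-level domain gains a parent spanning every online CPU, so the balancer can (slowly) migrate tasks across the whole machine. This is an illustrative stand-alone C program, not kernel code; struct sd_sketch and its fields are simplified stand-ins for the real struct sched_domain.

    /* Illustrative model of the per-CPU domain chain after this patch:
     * bottom-level domain -> node domain -> machine-spanning "allnodes"
     * domain.  In the patch, the allnodes level is only created when the
     * machine has more CPUs than one node-level domain can span. */
    #include <stdio.h>

    struct sd_sketch {
            const char *name;               /* "cpu", "node", or "allnodes" */
            struct sd_sketch *parent;       /* NULL at the top of the chain */
    };

    int main(void)
    {
            struct sd_sketch allnodes = { "allnodes (all online CPUs)", NULL };
            struct sd_sketch node     = { "node (CPUs of nearby nodes)", &allnodes };
            struct sd_sketch cpu      = { "cpu (a single CPU)", &node };

            /* The load balancer walks upward through the parent links,
             * exactly as sd->parent = p wires up below. */
            for (struct sd_sketch *sd = &cpu; sd; sd = sd->parent)
                    printf("%s\n", sd->name);
            return 0;
    }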
-rw-r--r--  arch/ia64/kernel/domain.c    | 32
-rw-r--r--  include/asm-ia64/topology.h  | 21
2 files changed, 52 insertions(+), 1 deletion(-)
diff --git a/arch/ia64/kernel/domain.c b/arch/ia64/kernel/domain.c
index 744f31c355bf..98e7bedfae5f 100644
--- a/arch/ia64/kernel/domain.c
+++ b/arch/ia64/kernel/domain.c
@@ -119,6 +119,14 @@ static int __devinit cpu_to_phys_group(int cpu)
  */
 static DEFINE_PER_CPU(struct sched_domain, node_domains);
 static struct sched_group *sched_group_nodes[MAX_NUMNODES];
+
+static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+static struct sched_group sched_group_allnodes[MAX_NUMNODES];
+
+static int __devinit cpu_to_allnodes_group(int cpu)
+{
+	return cpu_to_node(cpu);
+}
 #endif
 
 /*
@@ -149,9 +157,21 @@ void __devinit arch_init_sched_domains(void)
 		cpus_and(nodemask, nodemask, cpu_default_map);
 
 #ifdef CONFIG_NUMA
+		if (num_online_cpus()
+				> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+			sd = &per_cpu(allnodes_domains, i);
+			*sd = SD_ALLNODES_INIT;
+			sd->span = cpu_default_map;
+			group = cpu_to_allnodes_group(i);
+			sd->groups = &sched_group_allnodes[group];
+			p = sd;
+		} else
+			p = NULL;
+
 		sd = &per_cpu(node_domains, i);
 		*sd = SD_NODE_INIT;
 		sd->span = sched_domain_node_span(node);
+		sd->parent = p;
 		cpus_and(sd->span, sd->span, cpu_default_map);
 #endif
 
@@ -201,6 +221,9 @@ void __devinit arch_init_sched_domains(void)
 	}
 
 #ifdef CONFIG_NUMA
+	init_sched_build_groups(sched_group_allnodes, cpu_default_map,
+				&cpu_to_allnodes_group);
+
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		/* Set up node groups */
 		struct sched_group *sg, *prev;
@@ -282,6 +305,15 @@ void __devinit arch_init_sched_domains(void)
 		power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
 			(cpus_weight(sd->groups->cpumask)-1) / 10;
 		sd->groups->cpu_power = power;
+
+#ifdef CONFIG_NUMA
+		sd = &per_cpu(allnodes_domains, i);
+		if (sd->groups) {
+			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+				(cpus_weight(sd->groups->cpumask)-1) / 10;
+			sd->groups->cpu_power = power;
+		}
+#endif
 	}
 
 #ifdef CONFIG_NUMA
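[Editor's note] A word on the cpu_power arithmetic in the last hunk: each group's power is the base SCHED_LOAD_SCALE plus a 10% bonus per additional CPU in the group's span, so larger groups attract proportionally more load. A minimal sketch of the computation (assuming SCHED_LOAD_SCALE is 128, its value in kernels of this era):

    #include <stdio.h>

    /* Assumed value: SCHED_LOAD_SCALE is 128 in 2.6-era kernels. */
    #define SCHED_LOAD_SCALE 128UL

    int main(void)
    {
            /* power = base + 10% of base per extra CPU in the group's span,
             * mirroring the expression applied to sd->groups above. */
            for (unsigned long cpus = 1; cpus <= 16; cpus *= 2) {
                    unsigned long power = SCHED_LOAD_SCALE +
                            SCHED_LOAD_SCALE * (cpus - 1) / 10;
                    printf("%2lu cpus -> cpu_power %lu\n", cpus, power);
            }
            return 0;
    }

A single-CPU group keeps the base power of 128, while a 16-CPU group gets 128 + 128*15/10 = 320.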
diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h
index 1062f02b088e..5b0dd8cc6bd4 100644
--- a/include/asm-ia64/topology.h
+++ b/include/asm-ia64/topology.h
@@ -58,7 +58,26 @@ void build_cpu_to_node_map(void);
 				| SD_BALANCE_EXEC	\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
-	.balance_interval	= 10,			\
+	.balance_interval	= 1,			\
 	.nr_balance_failed	= 0,			\
 }
+
+/* sched_domains SD_ALLNODES_INIT for IA64 NUMA machines */
+#define SD_ALLNODES_INIT (struct sched_domain) {	\
+	.span			= CPU_MASK_NONE,	\
+	.parent			= NULL,			\
+	.groups			= NULL,			\
+	.min_interval		= 80,			\
+	.max_interval		= 320,			\
+	.busy_factor		= 320,			\
+	.imbalance_pct		= 125,			\
+	.cache_hot_time		= (10*1000000),		\
+	.cache_nice_tries	= 1,			\
+	.per_cpu_gain		= 100,			\
+	.flags			= SD_LOAD_BALANCE	\
+				| SD_BALANCE_EXEC,	\
+	.last_balance		= jiffies,		\
+	.balance_interval	= 100*(63+num_online_cpus())/64, \
+	.nr_balance_failed	= 0,			\
+}
 
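[Editor's note] The balance_interval expression in SD_ALLNODES_INIT scales the top-level balancing period with machine size: roughly 100 ms on small systems, growing by about 1.6 ms per additional online CPU, so the machine-spanning domain rebalances only rarely on very large systems. A small sketch of the formula (balance_interval is in milliseconds; online_cpus stands in for num_online_cpus()):

    #include <stdio.h>

    /* SD_ALLNODES_INIT's balance interval (in ms) grows with the number
     * of online CPUs, keeping whole-machine balancing cheap at scale. */
    static unsigned int allnodes_balance_interval(unsigned int online_cpus)
    {
            return 100 * (63 + online_cpus) / 64;
    }

    int main(void)
    {
            unsigned int sizes[] = { 4, 64, 512 };

            for (int i = 0; i < 3; i++)
                    printf("%4u cpus -> balance_interval %u ms\n",
                           sizes[i], allnodes_balance_interval(sizes[i]));
            return 0;
    }

This prints 104, 198, and 898 ms for 4, 64, and 512 CPUs respectively.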