author     Andrew Morton <akpm@osdl.org>               2004-05-09 23:29:19 -0700
committer  Linus Torvalds <torvalds@ppc970.osdl.org>   2004-05-09 23:29:19 -0700
commit     8c8cfc36d9ec9e9cd6a440fd7bf8b5404bd11635
tree       4a0083c3db59626be538d56645069eaa6e01726d
parent     a690c9b7ac0ddc28785c38526a69a7fe2e692500
[PATCH] sched: balance-on-clone
From: Ingo Molnar <mingo@elte.hu>

Implement balancing during clone(). It does the following things:

- introduces SD_BALANCE_CLONE, which can serve as a tool for an architecture to limit the search-idlest-CPU scope on clone(). E.g. 512-CPU systems should probably not enable this.

- uses the highest sd for the imbalance_pct, not this_rq (which didn't make sense).

- unifies balance-on-exec and balance-on-clone via the find_idlest_cpu() function. Gets rid of sched_best_cpu(), which was still a bit inconsistent IMO: it used 'min_load < load' as the condition for balancing, while a more correct approach is to use half of the imbalance_pct, as passive balancing does (see the worked example below).

- the patch also reintroduces the possibility of doing SD_BALANCE_EXEC on SMP systems, and activates it - to get testing.

- NOTE: there's one thing in this patch that is slightly unclean: I introduced wake_up_forked_thread. I did this to make it easier to get rid of this patch later (wake_up_forked_process() has lots of dependencies in various architectures). If this capability remains in the kernel then I'll clean it up and introduce a single function for wake_up_forked_process/thread.

- NOTE2: I added the SD_BALANCE_CLONE flag to the NUMA CPU template too. Some NUMA architectures probably want to disable this.
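The key behavioural change is the balancing condition itself: instead of sched_best_cpu()'s plain 'min_load < load' test, find_idlest_cpu() only migrates the fresh context when the local CPU's load (plus one task's worth for the new context) exceeds the idlest CPU's load by more than half of the domain's imbalance_pct margin. Below is a minimal standalone sketch of that arithmetic - not kernel code; the imbalance_pct and load figures are illustrative assumptions, and SCHED_LOAD_SCALE is taken to be 128 as in 2.6-era kernels:

/*
 * Sketch of the clone/exec balancing decision made in find_idlest_cpu().
 * All numbers below are illustrative assumptions, not values from the patch.
 */
#include <stdio.h>

int main(void)
{
	unsigned long sched_load_scale = 128;	/* one task's worth of load (assumed 2.6 value) */
	unsigned long imbalance_pct = 125;	/* assumed domain setting: 25% margin */
	unsigned long min_load = 256;		/* idlest CPU: roughly two runnable tasks */
	unsigned long this_load;
	unsigned long threshold_pct;

	/* add one task's worth of load to account for the new context */
	this_load = 384 + sched_load_scale;	/* this CPU: roughly three tasks + the new one */

	/* half of the usual imbalance margin: 100 + (125-100)/2 = 112 */
	threshold_pct = 100 + (imbalance_pct - 100) / 2;

	if (min_load * threshold_pct < this_load * 100)
		printf("migrate the fresh context to the idlest CPU\n");
	else
		printf("keep the fresh context on the current CPU\n");

	return 0;
}

With these numbers, 256 * 112 = 28672 is well below 512 * 100 = 51200, so the new context would be placed on the idlest CPU; the removed sched_best_cpu() would instead have migrated as soon as some other CPU's load plus a single-task handicap dropped below the local load.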
-rw-r--r--   include/linux/sched.h    23
-rw-r--r--   kernel/fork.c            20
-rw-r--r--   kernel/sched.c          169
3 files changed, 167 insertions(+), 45 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1ff71bf1f5ea..66faf991b373 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -547,10 +547,11 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0)
#define SD_BALANCE_NEWIDLE 1 /* Balance when about to become idle */
#define SD_BALANCE_EXEC 2 /* Balance on exec */
-#define SD_WAKE_IDLE 4 /* Wake to idle CPU on task wakeup */
-#define SD_WAKE_AFFINE 8 /* Wake task to waking CPU */
-#define SD_WAKE_BALANCE 16 /* Perform balancing at task wakeup */
-#define SD_SHARE_CPUPOWER 32 /* Domain members share cpu power */
+#define SD_BALANCE_CLONE 4 /* Balance on clone */
+#define SD_WAKE_IDLE 8 /* Wake to idle CPU on task wakeup */
+#define SD_WAKE_AFFINE 16 /* Wake task to waking CPU */
+#define SD_WAKE_BALANCE 32 /* Perform balancing at task wakeup */
+#define SD_SHARE_CPUPOWER 64 /* Domain members share cpu power */
struct sched_group {
struct sched_group *next; /* Must be a circular list */
@@ -598,6 +599,8 @@ struct sched_domain {
.cache_nice_tries = 0, \
.per_cpu_gain = 15, \
.flags = SD_BALANCE_NEWIDLE \
+ | SD_BALANCE_EXEC \
+ | SD_BALANCE_CLONE \
| SD_WAKE_AFFINE \
| SD_WAKE_IDLE \
| SD_SHARE_CPUPOWER, \
@@ -619,6 +622,8 @@ struct sched_domain {
.cache_nice_tries = 1, \
.per_cpu_gain = 100, \
.flags = SD_BALANCE_NEWIDLE \
+ | SD_BALANCE_EXEC \
+ | SD_BALANCE_CLONE \
| SD_WAKE_AFFINE \
| SD_WAKE_BALANCE, \
.last_balance = jiffies, \
@@ -640,6 +645,7 @@ struct sched_domain {
.cache_nice_tries = 1, \
.per_cpu_gain = 100, \
.flags = SD_BALANCE_EXEC \
+ | SD_BALANCE_CLONE \
| SD_WAKE_BALANCE, \
.last_balance = jiffies, \
.balance_interval = 1, \
@@ -659,7 +665,7 @@ static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask)
extern unsigned long long sched_clock(void);
-#ifdef CONFIG_NUMA
+#ifdef CONFIG_SMP
extern void sched_balance_exec(void);
#else
#define sched_balance_exec() {}
@@ -717,12 +723,17 @@ extern void do_timer(struct pt_regs *);
extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state));
extern int FASTCALL(wake_up_process(struct task_struct * tsk));
+extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk));
#ifdef CONFIG_SMP
extern void kick_process(struct task_struct *tsk);
+ extern void FASTCALL(wake_up_forked_thread(struct task_struct * tsk));
#else
static inline void kick_process(struct task_struct *tsk) { }
+ static inline void wake_up_forked_thread(struct task_struct * tsk)
+ {
+ return wake_up_forked_process(tsk);
+ }
#endif
-extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk));
extern void FASTCALL(sched_fork(task_t * p));
extern void FASTCALL(sched_exit(task_t * p));
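Because SD_BALANCE_CLONE is just another bit in sched_domain flags, an architecture that does not want clone-time balancing (the changelog's 512-CPU case, or a NUMA machine per NOTE2) only has to leave it out of its domain template. A minimal sketch, using a purely illustrative macro name rather than any real per-arch template:

/*
 * Hypothetical per-arch node-level domain flags (illustrative only):
 * keep exec- and wakeup-balancing, but opt out of clone-time balancing
 * by simply not setting SD_BALANCE_CLONE.
 */
#define MY_ARCH_SD_NODE_FLAGS	(SD_BALANCE_EXEC | SD_WAKE_BALANCE)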
diff --git a/kernel/fork.c b/kernel/fork.c
index 68597bc347f2..c3af0b74708a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1180,9 +1180,23 @@ long do_fork(unsigned long clone_flags,
set_tsk_thread_flag(p, TIF_SIGPENDING);
}
- if (!(clone_flags & CLONE_STOPPED))
- wake_up_forked_process(p); /* do this last */
- else
+ if (!(clone_flags & CLONE_STOPPED)) {
+ /*
+ * Do the wakeup last. On SMP we treat fork() and
+ * CLONE_VM separately, because fork() has already
+ * created cache footprint on this CPU (due to
+ * copying the pagetables), hence migration would
+ * probably be costy. Threads on the other hand
+ * have less traction to the current CPU, and if
+ * there's an imbalance then the scheduler can
+ * migrate this fresh thread now, before it
+ * accumulates a larger cache footprint:
+ */
+ if (clone_flags & CLONE_VM)
+ wake_up_forked_thread(p);
+ else
+ wake_up_forked_process(p);
+ } else
p->state = TASK_STOPPED;
++total_forks;
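The hunk above makes the wakeup choice depend only on CLONE_VM: thread-style clone() calls (such as those issued by pthread_create()) go through wake_up_forked_thread() and may be placed on an idle CPU immediately, while a plain fork(), which has just copied pagetables on the current CPU, keeps the cache-affine wake_up_forked_process() path. A hedged userspace illustration of the two cases (not part of the patch):

/*
 * Illustrative only: which in-kernel wakeup path a new task takes is
 * decided purely by whether CLONE_VM is set in the clone flags.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>

static int worker(void *arg)
{
	return 0;		/* child thread does nothing and exits */
}

int main(void)
{
	char *stack = malloc(64 * 1024);

	/* shares the mm: do_fork() takes the wake_up_forked_thread() path */
	clone(worker, stack + 64 * 1024, CLONE_VM | SIGCHLD, NULL);

	/* private mm, pagetables copied on this CPU: wake_up_forked_process() path */
	if (fork() == 0)
		_exit(0);

	while (wait(NULL) > 0)
		;		/* reap both children */
	free(stack);
	return 0;
}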
diff --git a/kernel/sched.c b/kernel/sched.c
index 90ee4fb0b60f..e1d1eebf840f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1156,7 +1156,133 @@ enum idle_type
};
#ifdef CONFIG_SMP
-#ifdef CONFIG_NUMA
+
+/*
+ * find_idlest_cpu - find the least busy runqueue.
+ */
+static int find_idlest_cpu(struct task_struct *p, int this_cpu,
+ struct sched_domain *sd)
+{
+ unsigned long load, min_load, this_load;
+ int i, min_cpu;
+ cpumask_t mask;
+
+ min_cpu = UINT_MAX;
+ min_load = ULONG_MAX;
+
+ cpus_and(mask, sd->span, cpu_online_map);
+ cpus_and(mask, mask, p->cpus_allowed);
+
+ for_each_cpu_mask(i, mask) {
+ load = target_load(i);
+
+ if (load < min_load) {
+ min_cpu = i;
+ min_load = load;
+
+ /* break out early on an idle CPU: */
+ if (!min_load)
+ break;
+ }
+ }
+
+ /* add +1 to account for the new task */
+ this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
+
+ /*
+ * Would with the addition of the new task to the
+ * current CPU there be an imbalance between this
+ * CPU and the idlest CPU?
+ *
+ * Use half of the balancing threshold - new-context is
+ * a good opportunity to balance.
+ */
+ if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
+ return min_cpu;
+
+ return this_cpu;
+}
+
+/*
+ * wake_up_forked_thread - wake up a freshly forked thread.
+ *
+ * This function will do some initial scheduler statistics housekeeping
+ * that must be done for every newly created context, and it also does
+ * runqueue balancing.
+ */
+void fastcall wake_up_forked_thread(task_t * p)
+{
+ unsigned long flags;
+ int this_cpu = get_cpu(), cpu;
+ struct sched_domain *tmp, *sd = NULL;
+ runqueue_t *this_rq = cpu_rq(this_cpu), *rq;
+
+ /*
+ * Find the largest domain that this CPU is part of that
+ * is willing to balance on clone:
+ */
+ for_each_domain(this_cpu, tmp)
+ if (tmp->flags & SD_BALANCE_CLONE)
+ sd = tmp;
+ if (sd)
+ cpu = find_idlest_cpu(p, this_cpu, sd);
+ else
+ cpu = this_cpu;
+
+ local_irq_save(flags);
+lock_again:
+ rq = cpu_rq(cpu);
+ double_rq_lock(this_rq, rq);
+
+ BUG_ON(p->state != TASK_RUNNING);
+
+ /*
+ * We did find_idlest_cpu() unlocked, so in theory
+ * the mask could have changed - just dont migrate
+ * in this case:
+ */
+ if (unlikely(!cpu_isset(cpu, p->cpus_allowed))) {
+ cpu = this_cpu;
+ double_rq_unlock(this_rq, rq);
+ goto lock_again;
+ }
+ /*
+ * We decrease the sleep average of forking parents
+ * and children as well, to keep max-interactive tasks
+ * from forking tasks that are max-interactive.
+ */
+ current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
+ PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
+
+ p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
+ CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
+
+ p->interactive_credit = 0;
+
+ p->prio = effective_prio(p);
+ set_task_cpu(p, cpu);
+
+ if (cpu == this_cpu) {
+ if (unlikely(!current->array))
+ __activate_task(p, rq);
+ else {
+ p->prio = current->prio;
+ list_add_tail(&p->run_list, &current->run_list);
+ p->array = current->array;
+ p->array->nr_active++;
+ rq->nr_running++;
+ }
+ } else {
+ __activate_task(p, rq);
+ if (TASK_PREEMPTS_CURR(p, rq))
+ resched_task(rq->curr);
+ }
+
+ double_rq_unlock(this_rq, rq);
+ local_irq_restore(flags);
+ put_cpu();
+}
+
/*
* If dest_cpu is allowed for this process, migrate the task to it.
* This is accomplished by forcing the cpu_allowed mask to only
@@ -1198,34 +1324,6 @@ out:
}
/*
- * Find the least loaded CPU. Slightly favor the current CPU by
- * setting its load as the minimum to start.
- */
-static int sched_best_cpu(struct task_struct *p, struct sched_domain *sd)
-{
- cpumask_t tmp;
- int i, min_load, this_cpu, best_cpu;
-
- best_cpu = this_cpu = task_cpu(p);
- min_load = INT_MAX;
-
- cpus_and(tmp, sd->span, cpu_online_map);
- for_each_cpu_mask(i, tmp) {
- unsigned long load;
- if (i == this_cpu)
- load = source_load(i);
- else
- load = target_load(i) + SCHED_LOAD_SCALE;
-
- if (min_load > load) {
- best_cpu = i;
- min_load = load;
- }
- }
- return best_cpu;
-}
-
-/*
* sched_balance_exec(): find the highest-level, exec-balance-capable
* domain and try to migrate the task to the least loaded CPU.
*
@@ -1234,19 +1332,19 @@ static int sched_best_cpu(struct task_struct *p, struct sched_domain *sd)
*/
void sched_balance_exec(void)
{
- struct sched_domain *sd, *best_sd = NULL;
+ struct sched_domain *tmp, *sd = NULL;
int new_cpu, this_cpu = get_cpu();
/* Prefer the current CPU if there's only this task running */
if (this_rq()->nr_running <= 1)
goto out;
- for_each_domain(this_cpu, sd)
- if (sd->flags & SD_BALANCE_EXEC)
- best_sd = sd;
+ for_each_domain(this_cpu, tmp)
+ if (tmp->flags & SD_BALANCE_EXEC)
+ sd = tmp;
- if (best_sd) {
- new_cpu = sched_best_cpu(current, best_sd);
+ if (sd) {
+ new_cpu = find_idlest_cpu(current, this_cpu, sd);
if (new_cpu != this_cpu) {
put_cpu();
sched_migrate_task(current, new_cpu);
@@ -1256,7 +1354,6 @@ void sched_balance_exec(void)
out:
put_cpu();
}
-#endif /* CONFIG_NUMA */
/*
* double_lock_balance - lock the busiest runqueue, this_rq is locked already.