Diffstat (limited to 'kernel')
-rw-r--r--  kernel/fork.c   |   2
-rw-r--r--  kernel/sched.c  | 222
2 files changed, 167 insertions(+), 57 deletions(-)
diff --git a/kernel/fork.c b/kernel/fork.c
index 50535a16c71e..38badc50bebc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -925,7 +925,7 @@ struct task_struct *copy_process(unsigned long clone_flags,
*/
p->first_time_slice = 1;
current->time_slice >>= 1;
- p->last_run = jiffies;
+ p->timestamp = sched_clock();
if (!current->time_slice) {
/*
* This case is rare, it happens when the parent has only
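This fork.c hunk sets the pattern for the whole patch: the per-task reference clock moves from jiffies (tick granularity, unsigned long) to sched_clock() (nanosecond granularity, unsigned long long). For a sense of the unit change, a minimal sketch with a hypothetical helper, not from the patch:

/* Hypothetical helper: one jiffy is 1/HZ of a second, so a tick
 * count maps into the new nanosecond timebase as follows. */
static inline unsigned long long ticks_to_ns(unsigned long ticks)
{
	return (unsigned long long)ticks * (1000000000ULL / HZ);
}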
diff --git a/kernel/sched.c b/kernel/sched.c
index 1c5802ceedae..966dfe516ec5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -68,13 +68,15 @@
*/
#define MIN_TIMESLICE ( 10 * HZ / 1000)
#define MAX_TIMESLICE (200 * HZ / 1000)
-#define CHILD_PENALTY 50
+#define TIMESLICE_GRANULARITY (HZ/40 ?: 1)
+#define ON_RUNQUEUE_WEIGHT 30
+#define CHILD_PENALTY 95
#define PARENT_PENALTY 100
#define EXIT_WEIGHT 3
#define PRIO_BONUS_RATIO 25
#define INTERACTIVE_DELTA 2
-#define MAX_SLEEP_AVG (10*HZ)
-#define STARVATION_LIMIT (10*HZ)
+#define MAX_SLEEP_AVG (1*1000000000)
+#define STARVATION_LIMIT HZ
#define NODE_THRESHOLD 125
/*
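Two of the new constants deserve a note. MAX_SLEEP_AVG is now one second expressed in nanoseconds, matching the sched_clock() timebase used throughout this patch, while STARVATION_LIMIT stays in jiffies (HZ ticks, also one second). TIMESLICE_GRANULARITY relies on the GNU "x ?: y" extension, which evaluates to x if x is nonzero and to y otherwise; expanded as a sketch (hypothetical helper, not patch code):

/* HZ/40 yields 25 ticks (25ms) at HZ=1000 and 2 ticks (20ms) at
 * HZ=100; below HZ=40 the integer division truncates to zero, and
 * the "?: 1" fallback keeps the granularity at one tick minimum. */
static inline int timeslice_granularity(void)
{
	return (HZ / 40) ? (HZ / 40) : 1;	/* == (HZ/40 ?: 1) */
}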
@@ -115,6 +117,11 @@
#define TASK_INTERACTIVE(p) \
((p)->prio <= (p)->static_prio - DELTA(p))
+#define TASK_PREEMPTS_CURR(p, rq) \
+ ((p)->prio < (rq)->curr->prio || \
+ ((p)->prio == (rq)->curr->prio && \
+ (p)->time_slice > (rq)->curr->time_slice * 2))
+
/*
* BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
* to time slice values.
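For illustration, the new preemption test is equivalent to this open-coded predicate (a sketch; the macro form avoids a function call in the wakeup and load-balancing hot paths):

static inline int task_preempts_curr(task_t *p, runqueue_t *rq)
{
	if (p->prio < rq->curr->prio)
		return 1;	/* numerically lower prio always wins */
	/* equal priority: preempt only if the woken task has more
	 * than twice the current task's remaining timeslice */
	return p->prio == rq->curr->prio &&
		p->time_slice > rq->curr->time_slice * 2;
}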
@@ -318,8 +325,8 @@ static int effective_prio(task_t *p)
if (rt_task(p))
return p->prio;
- bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
- MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
+ bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*(p->sleep_avg/1024)/(MAX_SLEEP_AVG/1024)/100;
+ bonus -= MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
prio = p->static_prio - bonus;
if (prio < MAX_RT_PRIO)
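The bonus expression is split because sleep_avg is now in nanoseconds: with MAX_USER_PRIO=40, PRIO_BONUS_RATIO=25 and sleep_avg up to 1e9, the old single-expression form would need about 2^40 of intermediate range and overflow a 32-bit unsigned long. Pre-dividing both sleep_avg and MAX_SLEEP_AVG by 1024 keeps the product just under 2^30 while preserving the ratio. The arithmetic in isolation (hypothetical helper):

/* Hypothetical helper: the interactivity bonus by itself. */
static int sleep_avg_bonus(unsigned long sleep_avg)
{
	int bonus;

	/* scaled: 40 * 25 * (1e9/1024) ~= 9.8e8, fits in 32 bits */
	bonus = MAX_USER_PRIO * PRIO_BONUS_RATIO * (sleep_avg / 1024) /
			(MAX_SLEEP_AVG / 1024) / 100;
	/* recenter: a half-full sleep_avg maps to a bonus of 0,
	 * giving a symmetric range of -5..+5 priority levels */
	return bonus - MAX_USER_PRIO * PRIO_BONUS_RATIO / 100 / 2;
}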
@@ -338,24 +345,24 @@ static inline void __activate_task(task_t *p, runqueue_t *rq)
nr_running_inc(rq);
}
-/*
- * activate_task - move a task to the runqueue and do priority recalculation
- *
- * Update all the scheduling statistics stuff. (sleep average
- * calculation, priority modifiers, etc.)
- */
-static inline void activate_task(task_t *p, runqueue_t *rq)
+static void recalc_task_prio(task_t *p, unsigned long long now)
{
- long sleep_time = jiffies - p->last_run - 1;
+ unsigned long long __sleep_time = now - p->timestamp;
+ unsigned long sleep_time;
+
+ if (__sleep_time > MAX_SLEEP_AVG)
+ sleep_time = MAX_SLEEP_AVG;
+ else
+ sleep_time = (unsigned long)__sleep_time;
if (sleep_time > 0) {
- int sleep_avg;
+ unsigned long long sleep_avg;
/*
* This code gives a bonus to interactive tasks.
*
* The boost works by updating the 'average sleep time'
- * value here, based on ->last_run. The more time a task
+ * value here, based on ->timestamp. The more time a task
* spends sleeping, the higher the average gets - and the
* higher the priority boost gets as well.
*/
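recalc_task_prio() measures the sleep period in full 64-bit precision and clamps it before narrowing to unsigned long; truncating first could wrap a long sleep (say 5 seconds, 5e9 ns) into a small bogus value. The clamp is safe because MAX_SLEEP_AVG (1e9) itself fits in 32 bits. The pattern in isolation (hypothetical helper):

static inline unsigned long clamp_sleep_time(unsigned long long delta_ns)
{
	if (delta_ns > MAX_SLEEP_AVG)
		return MAX_SLEEP_AVG;
	return (unsigned long)delta_ns;	/* now known to fit */
}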
@@ -374,6 +381,37 @@ static inline void activate_task(task_t *p, runqueue_t *rq)
p->prio = effective_prio(p);
}
}
+}
+
+/*
+ * activate_task - move a task to the runqueue and do priority recalculation
+ *
+ * Update all the scheduling statistics stuff. (sleep average
+ * calculation, priority modifiers, etc.)
+ */
+static inline void activate_task(task_t *p, runqueue_t *rq)
+{
+ unsigned long long now = sched_clock();
+
+ recalc_task_prio(p, now);
+
+ /*
+ * Tasks which were woken up by interrupts (i.e. hw events)
+ * are most likely of interactive nature. So we give them
+ * the credit of extending their sleep time to the period
+ * of time they spend on the runqueue, waiting for execution
+ * on a CPU, first time around:
+ */
+ if (in_interrupt())
+ p->activated = 2;
+ else
+ /*
+ * Normal first-time wakeups also get credit for on-runqueue
+ * time, but it is weighted down:
+ */
+ p->activated = 1;
+ p->timestamp = now;
+
__activate_task(p, rq);
}
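The new ->activated field is a three-state flag that schedule() consumes when the task is eventually picked to run; spelled out as a sketch (the enum names are hypothetical, the patch stores bare 0/1/2):

enum {
	ACTIVATED_NONE = 0,	/* not a fresh wakeup                 */
	ACTIVATED_TASK = 1,	/* process-context wakeup: runqueue   */
				/* wait credited at reduced weight    */
	ACTIVATED_INTR = 2,	/* interrupt wakeup: runqueue wait    */
				/* credited as full sleep time        */
};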
@@ -500,7 +538,7 @@ repeat_lock_task:
__activate_task(p, rq);
else {
activate_task(p, rq);
- if (p->prio < rq->curr->prio)
+ if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
}
success = 1;
@@ -549,8 +587,8 @@ void wake_up_forked_process(task_t * p)
* and children as well, to keep max-interactive tasks
* from forking tasks that are max-interactive.
*/
- current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
- p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
+ current->sleep_avg = current->sleep_avg / 100 * PARENT_PENALTY;
+ p->sleep_avg = p->sleep_avg / 100 * CHILD_PENALTY;
p->prio = effective_prio(p);
set_task_cpu(p, smp_processor_id());
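The reordered divisions in the penalties above are another overflow guard: sleep_avg can now reach 1e9, so multiplying by the percentage first, as the old code did, would produce up to ~1e11 and overflow a 32-bit unsigned long. Dividing first costs at most 99 ns of precision. The same reasoning applies to the sched_exit() blend below. As a sketch:

/* Sketch: scale a nanosecond average (up to 1e9) by a percentage
 * without overflowing 32-bit unsigned long arithmetic. */
static inline unsigned long scale_sleep_avg(unsigned long avg, int pct)
{
	return avg / 100 * pct;		/* NOT avg * pct / 100 */
}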
@@ -591,8 +629,7 @@ void sched_exit(task_t * p)
* the sleep_avg of the parent as well.
*/
if (p->sleep_avg < p->parent->sleep_avg)
- p->parent->sleep_avg = (p->parent->sleep_avg * EXIT_WEIGHT +
- p->sleep_avg) / (EXIT_WEIGHT + 1);
+ p->parent->sleep_avg = p->parent->sleep_avg / (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / (EXIT_WEIGHT + 1);
}
/**
@@ -994,13 +1031,8 @@ static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t
* Note that idle threads have a prio of MAX_PRIO, for this test
* to be always true for them.
*/
- if (p->prio < this_rq->curr->prio)
+ if (TASK_PREEMPTS_CURR(p, this_rq))
set_need_resched();
- else {
- if (p->prio == this_rq->curr->prio &&
- p->time_slice > this_rq->curr->time_slice)
- set_need_resched();
- }
}
/*
@@ -1017,12 +1049,14 @@ static void load_balance(runqueue_t *this_rq, int idle, cpumask_t cpumask)
runqueue_t *busiest;
prio_array_t *array;
struct list_head *head, *curr;
+ unsigned long long now;
task_t *tmp;
busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask);
if (!busiest)
goto out;
+ now = sched_clock();
/*
* We first consider expired tasks. Those will likely not be
* executed in the near future, and they are most likely to
@@ -1063,7 +1097,7 @@ skip_queue:
*/
#define CAN_MIGRATE_TASK(p,rq,this_cpu) \
- ((idle || (jiffies - (p)->last_run > cache_decay_ticks)) && \
+ ((idle || (((now - (p)->timestamp)>>10) > cache_decay_ticks)) &&\
!task_running(rq, p) && \
cpu_isset(this_cpu, (p)->cpus_allowed))
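In CAN_MIGRATE_TASK the nanosecond delta is shifted right by 10, i.e. divided by 1024, a cheap approximation of a nanosecond-to-microsecond conversion (about 2.3% low) that avoids a 64-bit division in this hot path. As a sketch (hypothetical helper):

static inline unsigned long long ns_to_approx_us(unsigned long long ns)
{
	return ns >> 10;	/* /1024 in place of /1000 */
}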
@@ -1180,8 +1214,7 @@ EXPORT_PER_CPU_SYMBOL(kstat);
*/
#define EXPIRED_STARVING(rq) \
(STARVATION_LIMIT && ((rq)->expired_timestamp && \
- (jiffies - (rq)->expired_timestamp >= \
- STARVATION_LIMIT * ((rq)->nr_running) + 1)))
+ (jiffies - (rq)->expired_timestamp >= STARVATION_LIMIT)))
/*
* This function gets called by the timer code, with HZ frequency.
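EXPIRED_STARVING previously let the starvation window grow with the runqueue length; the new form bounds it to a flat STARVATION_LIMIT, now one second. Open-coded as a sketch:

/* Sketch: the expired array counts as starving one second (HZ
 * jiffies) after its first task arrived, regardless of how many
 * tasks are runnable. */
static inline int expired_starving(runqueue_t *rq)
{
	return STARVATION_LIMIT && rq->expired_timestamp &&
		jiffies - rq->expired_timestamp >= STARVATION_LIMIT;
}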
@@ -1231,14 +1264,11 @@ void scheduler_tick(int user_ticks, int sys_ticks)
spin_lock(&rq->lock);
/*
* The task was running during this tick - update the
- * time slice counter and the sleep average. Note: we
- * do not update a thread's priority until it either
- * goes to sleep or uses up its timeslice. This makes
- * it possible for interactive tasks to use up their
- * timeslices at their highest priority levels.
+ * time slice counter. Note: we do not update a thread's
+ * priority until it either goes to sleep or uses up its
+ * timeslice. This makes it possible for interactive tasks
+ * to use up their timeslices at their highest priority levels.
*/
- if (p->sleep_avg)
- p->sleep_avg--;
if (unlikely(rt_task(p))) {
/*
* RR tasks need a special form of timeslice management.
@@ -1262,12 +1292,33 @@ void scheduler_tick(int user_ticks, int sys_ticks)
p->time_slice = task_timeslice(p);
p->first_time_slice = 0;
+ if (!rq->expired_timestamp)
+ rq->expired_timestamp = jiffies;
if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
- if (!rq->expired_timestamp)
- rq->expired_timestamp = jiffies;
enqueue_task(p, rq->expired);
} else
enqueue_task(p, rq->active);
+ } else {
+ /*
+ * Prevent a too long timeslice allowing a task to monopolize
+ * the CPU. We do this by splitting up the timeslice into
+ * smaller pieces.
+ *
+ * Note: this does not mean the task's timeslices expire or
+ * get lost in any way, they just might be preempted by
+ * another task of equal priority. (one with higher
+ * priority would have preempted this task already.) We
+ * requeue this task to the end of the list on this priority
+ * level, which is in essence a round-robin of tasks with
+ * equal priority.
+ */
+ if (!((task_timeslice(p) - p->time_slice) % TIMESLICE_GRANULARITY) &&
+ (p->array == rq->active)) {
+ dequeue_task(p, rq->active);
+ set_tsk_need_resched(p);
+ p->prio = effective_prio(p);
+ enqueue_task(p, rq->active);
+ }
}
out_unlock:
spin_unlock(&rq->lock);
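A worked example of the requeue trigger: at HZ=1000 the granularity is 25 ticks, so a task holding a 100-tick slice is rotated to the tail of its priority list after consuming 25, 50 and 75 ticks, letting equal-priority peers run in between without the slice being lost. The condition, as a sketch:

/* Sketch: fire at every TIMESLICE_GRANULARITY boundary of the
 * consumed part of the slice, but only while the task still sits
 * on the active array. */
static inline int should_round_robin(task_t *p, runqueue_t *rq)
{
	unsigned int used = task_timeslice(p) - p->time_slice;

	return !(used % TIMESLICE_GRANULARITY) && p->array == rq->active;
}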
@@ -1286,6 +1337,8 @@ asmlinkage void schedule(void)
runqueue_t *rq;
prio_array_t *array;
struct list_head *queue;
+ unsigned long long now;
+ unsigned long run_time;
int idx;
/*
@@ -1306,7 +1359,11 @@ need_resched:
rq = this_rq();
release_kernel_lock(prev);
- prev->last_run = jiffies;
+ now = sched_clock();
+ if (likely(now - prev->timestamp < MAX_SLEEP_AVG))
+ run_time = now - prev->timestamp;
+ else
+ run_time = MAX_SLEEP_AVG;
spin_lock_irq(&rq->lock);
/*
@@ -1356,12 +1413,30 @@ pick_next_task:
queue = array->queue + idx;
next = list_entry(queue->next, task_t, run_list);
+ if (next->activated) {
+ unsigned long long delta = now - next->timestamp;
+
+ if (next->activated == 1)
+ delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
+
+ next->activated = 0;
+ array = next->array;
+ dequeue_task(next, array);
+ recalc_task_prio(next, next->timestamp + delta);
+ enqueue_task(next, array);
+ }
switch_tasks:
prefetch(next);
clear_tsk_need_resched(prev);
RCU_qsctr(task_cpu(prev))++;
+ prev->sleep_avg -= run_time;
+ if ((long)prev->sleep_avg < 0)
+ prev->sleep_avg = 0;
+ prev->timestamp = now;
+
if (likely(prev != next)) {
+ next->timestamp = now;
rq->nr_switches++;
rq->curr = next;
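This is where ->activated pays off: the time a freshly woken task spent waiting on the runqueue is fed back into recalc_task_prio() as extra sleep, at full value for interrupt wakeups and weighted down for normal ones. Note also that prev->sleep_avg -= run_time drains the sleep average by measured runtime, replacing the per-tick decrement removed from scheduler_tick(). The weighting itself is fixed-point percentage arithmetic: ON_RUNQUEUE_WEIGHT * 128 / 100 folds to the constant 38 at compile time, so the wait is credited at 38/128 ~= 29.7% of its real value. As a sketch:

/* Sketch: weight a runqueue-wait delta down to roughly 30% using
 * one multiply and one divide by a power of two. */
static inline unsigned long long weight_runqueue_wait(unsigned long long delta)
{
	return delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
}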
@@ -1601,6 +1676,7 @@ void set_user_nice(task_t *p, long nice)
unsigned long flags;
prio_array_t *array;
runqueue_t *rq;
+ int old_prio, new_prio, delta;
if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
return;
@@ -1609,6 +1685,12 @@ void set_user_nice(task_t *p, long nice)
* the task might be in the middle of scheduling on another CPU.
*/
rq = task_rq_lock(p, &flags);
+ /*
+ * The RT priorities are set via setscheduler(), but we still
+ * allow the 'normal' nice value to be set - but as expected
+ * it won't have any effect on scheduling as long as the task
+ * is not SCHED_NORMAL:
+ */
if (rt_task(p)) {
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
@@ -1616,16 +1698,20 @@ void set_user_nice(task_t *p, long nice)
array = p->array;
if (array)
dequeue_task(p, array);
+
+ old_prio = p->prio;
+ new_prio = NICE_TO_PRIO(nice);
+ delta = new_prio - old_prio;
p->static_prio = NICE_TO_PRIO(nice);
- p->prio = NICE_TO_PRIO(nice);
+ p->prio += delta;
+
if (array) {
enqueue_task(p, array);
/*
- * If the task is running and lowered its priority,
- * or increased its priority then reschedule its CPU:
+ * If the task increased its priority or is running and
+ * lowered its priority, then reschedule its CPU:
*/
- if ((NICE_TO_PRIO(nice) < p->static_prio) ||
- task_running(rq, p))
+ if (delta < 0 || (delta > 0 && task_running(rq, p)))
resched_task(rq->curr);
}
out_unlock:
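The resched rule at the end keys off the sign of delta: a negative delta means the priority value dropped (the task became stronger) and it may now deserve a CPU, while a positive delta only matters if the reniced task itself is running, since a weakened task sitting on the queue cannot force a reschedule. Open-coded as a sketch (hypothetical helper):

static inline int renice_needs_resched(int delta, int was_running)
{
	return delta < 0 || (delta > 0 && was_running);
}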
@@ -2382,6 +2468,12 @@ static void move_task_away(struct task_struct *p, int dest_cpu)
local_irq_restore(flags);
}
+typedef struct {
+ int cpu;
+ struct completion startup_done;
+ task_t *task;
+} migration_startup_t;
+
/*
* migration_thread - this is a highprio system thread that performs
* thread migration by bumping thread off CPU then 'pushing' onto
@@ -2391,20 +2483,21 @@ static int migration_thread(void * data)
{
/* Marking "param" __user is ok, since we do a set_fs(KERNEL_DS); */
struct sched_param __user param = { .sched_priority = MAX_RT_PRIO-1 };
- int cpu = (long) data;
+ migration_startup_t *startup = data;
+ int cpu = startup->cpu;
runqueue_t *rq;
int ret;
+ startup->task = current;
+ complete(&startup->startup_done);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule();
+
+ BUG_ON(smp_processor_id() != cpu);
+
daemonize("migration/%d", cpu);
set_fs(KERNEL_DS);
- /*
- * Either we are running on the right CPU, or there's a a
- * migration thread on this CPU, guaranteed (we're started
- * serially).
- */
- set_cpus_allowed(current, cpumask_of_cpu(cpu));
-
ret = setscheduler(0, SCHED_FIFO, &param);
rq = this_rq();
@@ -2440,13 +2533,30 @@ static int migration_call(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
+ long cpu = (long) hcpu;
+ migration_startup_t startup;
+
switch (action) {
case CPU_ONLINE:
- printk("Starting migration thread for cpu %li\n",
- (long)hcpu);
- kernel_thread(migration_thread, hcpu, CLONE_KERNEL);
- while (!cpu_rq((long)hcpu)->migration_thread)
+
+ printk("Starting migration thread for cpu %li\n", cpu);
+
+ startup.cpu = cpu;
+ startup.task = NULL;
+ init_completion(&startup.startup_done);
+
+ kernel_thread(migration_thread, &startup, CLONE_KERNEL);
+ wait_for_completion(&startup.startup_done);
+ wait_task_inactive(startup.task);
+
+ startup.task->thread_info->cpu = cpu;
+ startup.task->cpus_allowed = cpumask_of_cpu(cpu);
+
+ wake_up_process(startup.task);
+
+ while (!cpu_rq(cpu)->migration_thread)
yield();
+
break;
}
return NOTIFY_OK;
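The startup handshake replaces the old set_cpus_allowed() self-binding: the new thread publishes its task_struct and goes to sleep, and the notifier binds it to the target CPU from the outside before it ever runs for real. The ordering, sketched as a sequence (comment only, not patch code):

/*
 * parent (migration_call)            child (migration_thread)
 *
 * 1. kernel_thread(..., &startup)
 *                                    2. startup->task = current
 *                                    3. complete(&startup_done)
 * 4. wait_for_completion()           5. set TASK_UNINTERRUPTIBLE
 *                                    6. schedule() - sleeps
 * 7. wait_task_inactive(task)
 * 8. task->thread_info->cpu = cpu
 * 9. task->cpus_allowed = cpumask_of_cpu(cpu)
 * 10. wake_up_process(task)          11. resumes on target CPU,
 *                                        BUG_ON checks the binding
 */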