| field | value | date |
|---|---|---|
| author | Ingo Molnar <mingo@elte.hu> | 2002-07-23 22:17:36 -0700 |
| committer | Linus Torvalds <torvalds@penguin.transmeta.com> | 2002-07-23 22:17:36 -0700 |
| commit | 97db62cc99a0d92650fbf23b7fbc034107eb01f3 | |
| tree | a708e1efa64cb8020b108b949857358e0a1cf19b | |
| parent | 9e7cec8858f964d3be10e2f703c841cf05871e09 | |
[PATCH] scheduler fixes
- introduce a new type of context-switch locking; this is a must-have for
  ia64 and sparc64 (a user-space sketch of the scheme follows this list).
- load_balance() bug noticed by Scott Rhine and myself: scan the whole
  list to find the full 'imbalance' count of tasks to pull, not just the
  tail of the list (a small stand-alone model follows the diffstat).
- sched_yield() fix: use current->array, not rq->active (a toy model of
  this change appears after the diff).
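
For readers skimming the diff below, here is a minimal user-space sketch of the context-switch locking convention this patch introduces, using pthread spinlocks as stand-ins and omitting interrupt disabling. The generic fallback (the `#ifndef prepare_arch_switch` block added to kernel/sched.c) keeps rq->lock held across the low-level switch; the sparc64-style variant pins the incoming task with the new per-task switch_lock and drops rq->lock before the expensive register-window flush. The struct and function names below are illustrative, not kernel code.

```c
#include <pthread.h>
#include <stdio.h>

struct task {
	pthread_spinlock_t switch_lock;	/* per-task lock added by the patch */
	const char *name;
};

struct runqueue {
	pthread_spinlock_t lock;
	struct task *curr;
};

/*
 * Generic scheme: rq->lock is held across the low-level switch and only
 * dropped in finish_arch_switch().
 */
static void switch_generic(struct runqueue *rq, struct task *prev, struct task *next)
{
	pthread_spin_lock(&rq->lock);		/* schedule() takes rq->lock */
	rq->curr = next;
	/* prepare_arch_switch(rq, next) is a no-op here */
	/* ... low-level register/stack switch would happen here ... */
	pthread_spin_unlock(&rq->lock);		/* finish_arch_switch(rq, prev) */
	(void)prev;
}

/*
 * sparc64/ia64-style scheme: pin 'next' via its switch_lock, drop rq->lock
 * before the slow work, release prev's switch_lock once the switch is done.
 */
static void switch_unlocked(struct runqueue *rq, struct task *prev, struct task *next)
{
	pthread_spin_lock(&rq->lock);
	rq->curr = next;
	pthread_spin_lock(&next->switch_lock);	 /* prepare_arch_switch(rq, next) */
	pthread_spin_unlock(&rq->lock);
	/* ... flushw_all() + low-level switch would happen here ... */
	pthread_spin_unlock(&prev->switch_lock); /* finish_arch_switch(rq, prev) */
}

int main(void)
{
	struct task a = { .name = "a" }, b = { .name = "b" };
	struct runqueue rq = { .curr = &a };

	pthread_spin_init(&a.switch_lock, PTHREAD_PROCESS_PRIVATE);
	pthread_spin_init(&b.switch_lock, PTHREAD_PROCESS_PRIVATE);
	pthread_spin_init(&rq.lock, PTHREAD_PROCESS_PRIVATE);

	pthread_spin_lock(&a.switch_lock);	/* 'a' is currently running */
	switch_unlocked(&rq, &a, &b);		/* 'b' now runs, b.switch_lock held */
	switch_generic(&rq, &b, &a);		/* generic variant ignores switch_lock */
	printf("curr: %s\n", rq.curr->name);
	return 0;
}
```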
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | include/asm-i386/system.h | 5 |
| -rw-r--r-- | include/asm-ppc/system.h | 5 |
| -rw-r--r-- | include/asm-s390/system.h | 5 |
| -rw-r--r-- | include/asm-s390x/system.h | 5 |
| -rw-r--r-- | include/asm-sparc64/system.h | 16 |
| -rw-r--r-- | include/asm-x86_64/system.h | 5 |
| -rw-r--r-- | include/linux/init_task.h | 3 |
| -rw-r--r-- | include/linux/sched.h | 8 |
| -rw-r--r-- | kernel/exit.c | 2 |
| -rw-r--r-- | kernel/fork.c | 16 |
| -rw-r--r-- | kernel/kmod.c | 18 |
| -rw-r--r-- | kernel/sched.c | 421 |
| -rw-r--r-- | kernel/signal.c | 4 |
| -rw-r--r-- | kernel/timer.c | 2 |
14 files changed, 267 insertions, 248 deletions
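
Before the full diff, a stand-alone model of the load_balance() change may help. Per the commit message, the scan now walks every entry of the busiest queue's priority lists until it has found the whole 'imbalance' count of pullable tasks, instead of only considering the tail. The types and helper names below are simplified stand-ins (the real code walks list_head queues under both runqueue locks), not the kernel's.

```c
#include <stdio.h>

#define MAX_PRIO	8
#define MAX_TASKS	4

struct task {
	const char *name;
	int can_migrate;	/* stands in for CAN_MIGRATE_TASK() */
};

struct queue {
	struct task *t[MAX_TASKS];
	int nr;
};

/* Pull up to 'imbalance' migratable tasks, scanning every entry of every
 * priority list from the tail backwards -- not just the tail entry. */
static int pull_tasks(struct queue busiest[MAX_PRIO], int imbalance)
{
	int pulled = 0;

	for (int idx = 0; idx < MAX_PRIO && imbalance; idx++) {
		/* walk the whole list, tail first, like the fixed skip_queue loop */
		for (int i = busiest[idx].nr - 1; i >= 0 && imbalance; i--) {
			struct task *p = busiest[idx].t[i];

			if (!p->can_migrate)
				continue;	/* skip this entry but keep scanning */
			printf("pulling %s (prio %d)\n", p->name, idx);
			pulled++;
			imbalance--;
		}
	}
	return pulled;
}

int main(void)
{
	static struct queue busiest[MAX_PRIO];
	struct task a = { "a", 0 }, b = { "b", 1 }, c = { "c", 1 };

	busiest[3].t[0] = &b;
	busiest[3].t[1] = &a;	/* tail entry is not migratable */
	busiest[3].nr = 2;
	busiest[5].t[0] = &c;
	busiest[5].nr = 1;

	printf("pulled %d task(s)\n", pull_tasks(busiest, 2));
	return 0;
}
```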
diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h index b5bfa022bde7..d6b04c0fed09 100644 --- a/include/asm-i386/system.h +++ b/include/asm-i386/system.h @@ -11,11 +11,6 @@ struct task_struct; /* one of the stranger aspects of C forward declarations.. */ extern void FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next)); -#define prepare_arch_schedule(prev) do { } while(0) -#define finish_arch_schedule(prev) do { } while(0) -#define prepare_arch_switch(rq) do { } while(0) -#define finish_arch_switch(rq) spin_unlock_irq(&(rq)->lock) - #define switch_to(prev,next,last) do { \ asm volatile("pushl %%esi\n\t" \ "pushl %%edi\n\t" \ diff --git a/include/asm-ppc/system.h b/include/asm-ppc/system.h index a9a6b1ac6aab..00fafc7226c1 100644 --- a/include/asm-ppc/system.h +++ b/include/asm-ppc/system.h @@ -83,11 +83,6 @@ extern void cacheable_memzero(void *p, unsigned int nb); struct device_node; extern void note_scsi_host(struct device_node *, void *); -#define prepare_arch_schedule(prev) do { } while(0) -#define finish_arch_schedule(prev) do { } while(0) -#define prepare_arch_switch(rq) do { } while(0) -#define finish_arch_switch(rq) spin_unlock_irq(&(rq)->lock) - struct task_struct; extern void __switch_to(struct task_struct *, struct task_struct *); #define switch_to(prev, next, last) __switch_to((prev), (next)) diff --git a/include/asm-s390/system.h b/include/asm-s390/system.h index ab7e6a28196f..15e1af97a358 100644 --- a/include/asm-s390/system.h +++ b/include/asm-s390/system.h @@ -18,11 +18,6 @@ #endif #include <linux/kernel.h> -#define prepare_arch_schedule(prev) do { } while (0) -#define finish_arch_schedule(prev) do { } while (0) -#define prepare_arch_switch(rq) do { } while (0) -#define finish_arch_switch(rq) spin_unlock_irq(&(rq)->lock) - #define switch_to(prev,next,last) do { \ if (prev == next) \ break; \ diff --git a/include/asm-s390x/system.h b/include/asm-s390x/system.h index 2454658db855..81810a4819b9 100644 --- a/include/asm-s390x/system.h +++ b/include/asm-s390x/system.h @@ -18,11 +18,6 @@ #endif #include <linux/kernel.h> -#define prepare_arch_schedule(prev) do { } while (0) -#define finish_arch_schedule(prev) do { } while (0) -#define prepare_arch_switch(rq) do { } while (0) -#define finish_arch_switch(rq) spin_unlock_irq(&(rq)->lock) - #define switch_to(prev,next),last do { \ if (prev == next) \ break; \ diff --git a/include/asm-sparc64/system.h b/include/asm-sparc64/system.h index f98e5915e612..4f8af64d6572 100644 --- a/include/asm-sparc64/system.h +++ b/include/asm-sparc64/system.h @@ -140,13 +140,17 @@ extern void __flushw_user(void); #define flush_user_windows flushw_user #define flush_register_windows flushw_all -#define prepare_arch_schedule(prev) task_lock(prev) -#define finish_arch_schedule(prev) task_unlock(prev) -#define prepare_arch_switch(rq) \ -do { spin_unlock(&(rq)->lock); \ - flushw_all(); \ +#define prepare_arch_switch(rq, next) \ +do { spin_lock(&(next)->switch_lock); \ + spin_unlock(&(rq)->lock); \ + flushw_all(); \ } while (0) -#define finish_arch_switch(rq) local_irq_enable() + +#define finish_arch_switch(rq, prev) \ +do { spin_unlock_irq(&(prev)->switch_lock); \ +} while (0) + + #ifndef CONFIG_DEBUG_SPINLOCK #define CHECK_LOCKS(PREV) do { } while(0) diff --git a/include/asm-x86_64/system.h b/include/asm-x86_64/system.h index c9604630f910..c0c88ba72c97 100644 --- a/include/asm-x86_64/system.h +++ b/include/asm-x86_64/system.h @@ -13,11 +13,6 @@ #define LOCK_PREFIX "" #endif -#define prepare_arch_schedule(prev) do { } 
while(0) -#define finish_arch_schedule(prev) do { } while(0) -#define prepare_arch_switch(rq) do { } while(0) -#define finish_arch_switch(rq) spin_unlock_irq(&(rq)->lock) - #define __STR(x) #x #define STR(x) __STR(x) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 4fcbad0862b8..56c6b7181796 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -47,7 +47,7 @@ lock_depth: -1, \ prio: MAX_PRIO-20, \ static_prio: MAX_PRIO-20, \ - policy: SCHED_OTHER, \ + policy: SCHED_NORMAL, \ cpus_allowed: -1, \ mm: NULL, \ active_mm: &init_mm, \ @@ -78,6 +78,7 @@ pending: { NULL, &tsk.pending.head, {{0}}}, \ blocked: {{0}}, \ alloc_lock: SPIN_LOCK_UNLOCKED, \ + switch_lock: SPIN_LOCK_UNLOCKED, \ journal_info: NULL, \ } diff --git a/include/linux/sched.h b/include/linux/sched.h index 004d2a129ca4..6dd4746e58cd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -116,7 +116,7 @@ extern unsigned long nr_uninterruptible(void); /* * Scheduling policies */ -#define SCHED_OTHER 0 +#define SCHED_NORMAL 0 #define SCHED_FIFO 1 #define SCHED_RR 2 @@ -207,7 +207,7 @@ struct signal_struct { /* * Priority of a process goes from 0..MAX_PRIO-1, valid RT - * priority is 0..MAX_RT_PRIO-1, and SCHED_OTHER tasks are + * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL tasks are * in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values * are inverted: lower p->prio value means higher priority. * @@ -264,7 +264,7 @@ struct task_struct { unsigned long policy; unsigned long cpus_allowed; - unsigned int time_slice; + unsigned int time_slice, first_time_slice; struct list_head tasks; @@ -361,6 +361,8 @@ struct task_struct { u32 self_exec_id; /* Protection of (de-)allocation: mm, files, fs, tty */ spinlock_t alloc_lock; +/* context-switch lock */ + spinlock_t switch_lock; /* journalling filesystem info */ void *journal_info; diff --git a/kernel/exit.c b/kernel/exit.c index 71d017f2fd8f..9d35d8ece02f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -184,7 +184,7 @@ void reparent_to_init(void) current->exit_signal = SIGCHLD; current->ptrace = 0; - if ((current->policy == SCHED_OTHER) && (task_nice(current) < 0)) + if ((current->policy == SCHED_NORMAL) && (task_nice(current) < 0)) set_user_nice(current, 0); /* cpus_allowed? */ /* rt_priority? */ diff --git a/kernel/fork.c b/kernel/fork.c index 8fbe794a5ece..020db0f0e5b9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -611,7 +611,6 @@ struct task_struct *do_fork(unsigned long clone_flags, unsigned long stack_size) { int retval; - unsigned long flags; struct task_struct *p = NULL; struct completion vfork; @@ -675,6 +674,7 @@ struct task_struct *do_fork(unsigned long clone_flags, init_completion(&vfork); } spin_lock_init(&p->alloc_lock); + spin_lock_init(&p->switch_lock); clear_tsk_thread_flag(p,TIF_SIGPENDING); init_sigpending(&p->pending); @@ -740,8 +740,13 @@ struct task_struct *do_fork(unsigned long clone_flags, * total amount of pending timeslices in the system doesnt change, * resulting in more scheduling fairness. */ - local_irq_save(flags); - p->time_slice = (current->time_slice + 1) >> 1; + local_irq_disable(); + p->time_slice = (current->time_slice + 1) >> 1; + /* + * The remainder of the first timeslice might be recovered by + * the parent if the child exits early enough. 
+ */ + p->first_time_slice = 1; current->time_slice >>= 1; p->sleep_timestamp = jiffies; if (!current->time_slice) { @@ -753,11 +758,10 @@ struct task_struct *do_fork(unsigned long clone_flags, current->time_slice = 1; preempt_disable(); scheduler_tick(0, 0); - local_irq_restore(flags); + local_irq_enable(); preempt_enable(); } else - local_irq_restore(flags); - + local_irq_enable(); /* * Ok, add it to the run-queues and make it * visible to the rest of the system. diff --git a/kernel/kmod.c b/kernel/kmod.c index c3b7e90aecdc..d063d20001a4 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -190,16 +190,19 @@ int request_module(const char * module_name) pid_t pid; int waitpid_result; sigset_t tmpsig; - int i; + int i, ret; static atomic_t kmod_concurrent = ATOMIC_INIT(0); #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; + unsigned long saved_policy = current->policy; + current->policy = SCHED_NORMAL; /* Don't allow request_module() before the root fs is mounted! */ if ( ! current->fs->root ) { printk(KERN_ERR "request_module[%s]: Root fs not mounted\n", module_name); - return -EPERM; + ret = -EPERM; + goto out; } /* If modprobe needs a service that is in a module, we get a recursive @@ -220,14 +223,16 @@ int request_module(const char * module_name) printk(KERN_ERR "kmod: runaway modprobe loop assumed and stopped\n"); atomic_dec(&kmod_concurrent); - return -ENOMEM; + ret = -ENOMEM; + goto out; } pid = kernel_thread(exec_modprobe, (void*) module_name, 0); if (pid < 0) { printk(KERN_ERR "request_module[%s]: fork failed, errno %d\n", module_name, -pid); atomic_dec(&kmod_concurrent); - return pid; + ret = pid; + goto out; } /* Block everything but SIGKILL/SIGSTOP */ @@ -250,7 +255,10 @@ int request_module(const char * module_name) printk(KERN_ERR "request_module[%s]: waitpid(%d,...) failed, errno %d\n", module_name, pid, -waitpid_result); } - return 0; + ret = 0; +out: + current->policy = saved_policy; + return ret; } #endif /* CONFIG_KMOD */ diff --git a/kernel/sched.c b/kernel/sched.c index ccceab9745db..47584a3badeb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -12,8 +12,8 @@ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: * hybrid priority-list and round-robin design with * an array-switch method of distributing timeslices - * and per-CPU runqueues. Additional code by Davide - * Libenzi, Robert Love, and Rusty Russell. + * and per-CPU runqueues. Cleanups and useful suggestions + * by Davide Libenzi, preemptible kernel bits by Robert Love. */ #include <linux/mm.h> @@ -102,16 +102,23 @@ ((p)->prio <= (p)->static_prio - DELTA(p)) /* - * TASK_TIMESLICE scales user-nice values [ -20 ... 19 ] + * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ] * to time slice values. * * The higher a process's priority, the bigger timeslices * it gets during one round of execution. But even the lowest * priority process gets MIN_TIMESLICE worth of execution time. + * + * task_timeslice() is the interface that is used by the scheduler. 
*/ -#define TASK_TIMESLICE(p) (MIN_TIMESLICE + \ - ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/39)) +#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \ + ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/(MAX_USER_PRIO - 1))) + +static inline unsigned int task_timeslice(task_t *p) +{ + return BASE_TIMESLICE(p); +} /* * These are the runqueue data structures: @@ -136,13 +143,15 @@ struct prio_array { */ struct runqueue { spinlock_t lock; - unsigned long nr_running, nr_switches, expired_timestamp; - signed long nr_uninterruptible; + unsigned long nr_running, nr_switches, expired_timestamp, + nr_uninterruptible; task_t *curr, *idle; prio_array_t *active, *expired, arrays[2]; int prev_nr_running[NR_CPUS]; + task_t *migration_thread; list_t migration_queue; + } ____cacheline_aligned; static struct runqueue runqueues[NR_CPUS] __cacheline_aligned; @@ -154,6 +163,15 @@ static struct runqueue runqueues[NR_CPUS] __cacheline_aligned; #define rt_task(p) ((p)->prio < MAX_RT_PRIO) /* + * Default context-switch locking: + */ +#ifndef prepare_arch_switch +# define prepare_arch_switch(rq, next) do { } while(0) +# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) +# define task_running(rq, p) ((rq)->curr == (p)) +#endif + +/* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. Note the ordering: we can safely lookup the task_rq without * explicitly disabling preemption. @@ -307,7 +325,7 @@ void wait_task_inactive(task_t * p) repeat: preempt_disable(); rq = task_rq(p); - if (unlikely(rq->curr == p)) { + if (unlikely(task_running(rq, p))) { cpu_relax(); /* * enable/disable preemption just to make this @@ -318,7 +336,7 @@ repeat: goto repeat; } rq = task_rq_lock(p, &flags); - if (unlikely(rq->curr == p)) { + if (unlikely(task_running(rq, p))) { task_rq_unlock(rq, &flags); preempt_enable(); goto repeat; @@ -326,6 +344,7 @@ repeat: task_rq_unlock(rq, &flags); preempt_enable(); } +#endif /* * Kick the remote CPU if the task is running currently, @@ -338,10 +357,9 @@ repeat: */ void kick_if_running(task_t * p) { - if (p == task_rq(p)->curr) + if ((task_running(task_rq(p), p)) && (p->thread_info->cpu != smp_processor_id())) resched_task(p); } -#endif /* * Wake up a process. Put it on the run-queue if it's not @@ -350,6 +368,8 @@ void kick_if_running(task_t * p) * progress), and as such you're allowed to do the simpler * "current->state = TASK_RUNNING" to mark yourself runnable * without the overhead of this. + * + * returns failure only if the task is already active. */ static int try_to_wake_up(task_t * p, int sync) { @@ -366,7 +386,7 @@ repeat_lock_task: * Fast-migrate the task if it's not running or runnable * currently. Do not violate hard affinity. 
*/ - if (unlikely(sync && (rq->curr != p) && + if (unlikely(sync && !task_running(rq, p) && (task_cpu(p) != smp_processor_id()) && (p->cpus_allowed & (1UL << smp_processor_id())))) { @@ -377,9 +397,7 @@ repeat_lock_task: if (old_state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; activate_task(p, rq); - /* - * If sync is set, a resched_task() is a NOOP - */ + if (p->prio < rq->curr->prio) resched_task(rq->curr); success = 1; @@ -428,9 +446,11 @@ void wake_up_forked_process(task_t * p) void sched_exit(task_t * p) { local_irq_disable(); - current->time_slice += p->time_slice; - if (unlikely(current->time_slice > MAX_TIMESLICE)) - current->time_slice = MAX_TIMESLICE; + if (p->first_time_slice) { + current->time_slice += p->time_slice; + if (unlikely(current->time_slice > MAX_TIMESLICE)) + current->time_slice = MAX_TIMESLICE; + } local_irq_enable(); /* * If the child was a (relative-) CPU hog then decrease @@ -444,8 +464,7 @@ void sched_exit(task_t * p) #if CONFIG_SMP || CONFIG_PREEMPT asmlinkage void schedule_tail(task_t *prev) { - finish_arch_switch(this_rq()); - finish_arch_schedule(prev); + finish_arch_switch(this_rq(), prev); } #endif @@ -502,7 +521,42 @@ unsigned long nr_context_switches(void) return sum; } +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) +{ + if (rq1 == rq2) + spin_lock(&rq1->lock); + else { + if (rq1 < rq2) { + spin_lock(&rq1->lock); + spin_lock(&rq2->lock); + } else { + spin_lock(&rq2->lock); + spin_lock(&rq1->lock); + } + } +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) +{ + spin_unlock(&rq1->lock); + if (rq1 != rq2) + spin_unlock(&rq2->lock); +} + #if CONFIG_SMP + /* * Lock the busiest runqueue as well, this_rq is locked already. * Recalculate nr_running if we have to drop the runqueue lock. @@ -526,22 +580,10 @@ static inline unsigned int double_lock_balance(runqueue_t *this_rq, return nr_running; } -/* - * Current runqueue is empty, or rebalance tick: if there is an - * inbalance (current runqueue is too short) then pull from - * busiest runqueue(s). - * - * We call this with the current runqueue locked, - * irqs disabled. - */ -static void load_balance(runqueue_t *this_rq, int idle) +static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, int *imbalance) { - int imbalance, nr_running, load, max_load, - idx, i, this_cpu = smp_processor_id(); - task_t *next = this_rq->idle, *tmp; + int nr_running, load, max_load, i; runqueue_t *busiest, *rq_src; - prio_array_t *array; - list_t *head, *curr; /* * We search all runqueues to find the most busy one. @@ -590,21 +632,67 @@ static void load_balance(runqueue_t *this_rq, int idle) } if (likely(!busiest)) - return; + goto out; - imbalance = (max_load - nr_running) / 2; + *imbalance = (max_load - nr_running) / 2; /* It needs an at least ~25% imbalance to trigger balancing. */ - if (!idle && (imbalance < (max_load + 3)/4)) - return; + if (!idle && (*imbalance < (max_load + 3)/4)) { + busiest = NULL; + goto out; + } nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running); /* * Make sure nothing changed since we checked the * runqueue length. 
*/ - if (busiest->nr_running <= nr_running + 1) - goto out_unlock; + if (busiest->nr_running <= nr_running + 1) { + spin_unlock(&busiest->lock); + busiest = NULL; + } +out: + return busiest; +} + +/* + * Move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. + */ +static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu) +{ + dequeue_task(p, src_array); + src_rq->nr_running--; + set_task_cpu(p, this_cpu); + this_rq->nr_running++; + enqueue_task(p, this_rq->active); + /* + * Note that idle threads have a prio of MAX_PRIO, for this test + * to be always true for them. + */ + if (p->prio < this_rq->curr->prio) + set_need_resched(); +} + +/* + * Current runqueue is empty, or rebalance tick: if there is an + * inbalance (current runqueue is too short) then pull from + * busiest runqueue(s). + * + * We call this with the current runqueue locked, + * irqs disabled. + */ +static void load_balance(runqueue_t *this_rq, int idle) +{ + int imbalance, idx, this_cpu = smp_processor_id(); + runqueue_t *busiest; + prio_array_t *array; + list_t *head, *curr; + task_t *tmp; + + busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance); + if (!busiest) + goto out; /* * We first consider expired tasks. Those will likely not be @@ -647,36 +735,28 @@ skip_queue: #define CAN_MIGRATE_TASK(p,rq,this_cpu) \ ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \ - ((p) != (rq)->curr) && \ + !task_running(rq, p) && \ ((p)->cpus_allowed & (1UL << (this_cpu)))) + curr = curr->prev; + if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) { - curr = curr->next; if (curr != head) goto skip_queue; idx++; goto skip_bitmap; } - next = tmp; - /* - * take the task out of the other runqueue and - * put it into this one: - */ - dequeue_task(next, array); - busiest->nr_running--; - set_task_cpu(next, this_cpu); - this_rq->nr_running++; - enqueue_task(next, this_rq->active); - if (next->prio < current->prio) - set_need_resched(); + pull_task(busiest, array, tmp, this_rq, this_cpu); if (!idle && --imbalance) { - if (array == busiest->expired) { - array = busiest->active; - goto new_array; - } + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; } out_unlock: spin_unlock(&busiest->lock); +out: + ; } /* @@ -691,13 +771,13 @@ out_unlock: #define BUSY_REBALANCE_TICK (HZ/4 ?: 1) #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) -static inline void idle_tick(void) +static inline void idle_tick(runqueue_t *rq) { if (jiffies % IDLE_REBALANCE_TICK) return; - spin_lock(&this_rq()->lock); - load_balance(this_rq(), 1); - spin_unlock(&this_rq()->lock); + spin_lock(&rq->lock); + load_balance(rq, 1); + spin_unlock(&rq->lock); } #endif @@ -720,7 +800,7 @@ static inline void idle_tick(void) * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. 
*/ -void scheduler_tick(int user_tick, int system) +void scheduler_tick(int user_ticks, int sys_ticks) { int cpu = smp_processor_id(); runqueue_t *rq = this_rq(); @@ -728,18 +808,18 @@ void scheduler_tick(int user_tick, int system) if (p == rq->idle) { /* note: this timer irq context must be accounted for as well */ - if (irq_count() >= 2*HARDIRQ_OFFSET) - kstat.per_cpu_system[cpu] += system; + if (irq_count() - HARDIRQ_OFFSET >= SOFTIRQ_OFFSET) + kstat.per_cpu_system[cpu] += sys_ticks; #if CONFIG_SMP - idle_tick(); + idle_tick(rq); #endif return; } if (TASK_NICE(p) > 0) - kstat.per_cpu_nice[cpu] += user_tick; + kstat.per_cpu_nice[cpu] += user_ticks; else - kstat.per_cpu_user[cpu] += user_tick; - kstat.per_cpu_system[cpu] += system; + kstat.per_cpu_user[cpu] += user_ticks; + kstat.per_cpu_system[cpu] += sys_ticks; /* Task might have expired already, but not scheduled off yet */ if (p->array != rq->active) { @@ -753,7 +833,8 @@ void scheduler_tick(int user_tick, int system) * FIFO tasks have no timeslices. */ if ((p->policy == SCHED_RR) && !--p->time_slice) { - p->time_slice = TASK_TIMESLICE(p); + p->time_slice = task_timeslice(p); + p->first_time_slice = 0; set_tsk_need_resched(p); /* put it at the end of the queue: */ @@ -776,7 +857,8 @@ void scheduler_tick(int user_tick, int system) dequeue_task(p, rq->active); set_tsk_need_resched(p); p->prio = effective_prio(p); - p->time_slice = TASK_TIMESLICE(p); + p->time_slice = task_timeslice(p); + p->first_time_slice = 0; if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { if (!rq->expired_timestamp) @@ -818,7 +900,6 @@ need_resched: rq = this_rq(); release_kernel_lock(prev); - prepare_arch_schedule(prev); prev->sleep_timestamp = jiffies; spin_lock_irq(&rq->lock); @@ -875,14 +956,13 @@ switch_tasks: rq->nr_switches++; rq->curr = next; - prepare_arch_switch(rq); + prepare_arch_switch(rq, next); prev = context_switch(prev, next); barrier(); rq = this_rq(); - finish_arch_switch(rq); + finish_arch_switch(rq, prev); } else spin_unlock_irq(&rq->lock); - finish_arch_schedule(prev); reacquire_kernel_lock(current); preempt_enable_no_resched(); @@ -1114,7 +1194,8 @@ void set_user_nice(task_t *p, long nice) * If the task is running and lowered its priority, * or increased its priority then reschedule its CPU: */ - if ((NICE_TO_PRIO(nice) < p->static_prio) || (p == rq->curr)) + if ((NICE_TO_PRIO(nice) < p->static_prio) || + task_running(rq, p)) resched_task(rq->curr); } out_unlock: @@ -1228,18 +1309,18 @@ static int setscheduler(pid_t pid, int policy, struct sched_param *param) else { retval = -EINVAL; if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_OTHER) + policy != SCHED_NORMAL) goto out_unlock; } /* * Valid priorities for SCHED_FIFO and SCHED_RR are - * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_OTHER is 0. + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. 
*/ retval = -EINVAL; if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1) goto out_unlock; - if ((policy == SCHED_OTHER) != (lp.sched_priority == 0)) + if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0)) goto out_unlock; retval = -EPERM; @@ -1260,7 +1341,7 @@ static int setscheduler(pid_t pid, int policy, struct sched_param *param) retval = 0; p->policy = policy; p->rt_priority = lp.sched_priority; - if (policy != SCHED_OTHER) + if (policy != SCHED_NORMAL) p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority; else p->prio = p->static_prio; @@ -1439,39 +1520,18 @@ asmlinkage long sys_sched_yield(void) prio_array_t *array = current->array; /* - * There are three levels of how a yielding task will give up - * the current CPU: + * We implement yielding by moving the task into the expired + * queue. * - * #1 - it decreases its priority by one. This priority loss is - * temporary, it's recovered once the current timeslice - * expires. - * - * #2 - once it has reached the lowest priority level, - * it will give up timeslices one by one. (We do not - * want to give them up all at once, it's gradual, - * to protect the casual yield()er.) - * - * #3 - once all timeslices are gone we put the process into - * the expired array. - * - * (special rule: RT tasks do not lose any priority, they just - * roundrobin on their current priority level.) + * (special rule: RT tasks will just roundrobin in the active + * array.) */ - if (likely(current->prio == MAX_PRIO-1)) { - if (current->time_slice <= 1) { - dequeue_task(current, rq->active); - enqueue_task(current, rq->expired); - } else - current->time_slice--; - } else if (unlikely(rt_task(current))) { - list_move_tail(¤t->run_list, array->queue + current->prio); + if (likely(!rt_task(current))) { + dequeue_task(current, array); + enqueue_task(current, rq->expired); } else { list_del(¤t->run_list); - if (list_empty(array->queue + current->prio)) - __clear_bit(current->prio, array->bitmap); - current->prio++; list_add_tail(¤t->run_list, array->queue + current->prio); - __set_bit(current->prio, array->bitmap); } /* * Since we are going to call schedule() anyway, there's @@ -1506,7 +1566,7 @@ asmlinkage long sys_sched_get_priority_max(int policy) case SCHED_RR: ret = MAX_USER_RT_PRIO-1; break; - case SCHED_OTHER: + case SCHED_NORMAL: ret = 0; break; } @@ -1522,7 +1582,7 @@ asmlinkage long sys_sched_get_priority_min(int policy) case SCHED_RR: ret = 1; break; - case SCHED_OTHER: + case SCHED_NORMAL: ret = 0; } return ret; @@ -1548,7 +1608,7 @@ asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval) goto out_unlock; jiffies_to_timespec(p->policy & SCHED_FIFO ? - 0 : TASK_TIMESLICE(p), &t); + 0 : task_timeslice(p), &t); read_unlock(&tasklist_lock); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; out_nounlock: @@ -1652,40 +1712,6 @@ void show_state(void) read_unlock(&tasklist_lock); } -/* - * double_rq_lock - safely lock two runqueues - * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. - */ -static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) -{ - if (rq1 == rq2) - spin_lock(&rq1->lock); - else { - if (rq1 < rq2) { - spin_lock(&rq1->lock); - spin_lock(&rq2->lock); - } else { - spin_lock(&rq2->lock); - spin_lock(&rq1->lock); - } - } -} - -/* - * double_rq_unlock - safely unlock two runqueues - * - * Note this does not restore interrupts like task_rq_unlock, - * you need to do so manually after calling. 
- */ -static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) -{ - spin_unlock(&rq1->lock); - if (rq1 != rq2) - spin_unlock(&rq2->lock); -} - void __init init_idle(task_t *idle, int cpu) { runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle)); @@ -1712,57 +1738,6 @@ void __init init_idle(task_t *idle, int cpu) #endif } -extern void init_timervecs(void); -extern void timer_bh(void); -extern void tqueue_bh(void); -extern void immediate_bh(void); - -void __init sched_init(void) -{ - runqueue_t *rq; - int i, j, k; - - for (i = 0; i < NR_CPUS; i++) { - prio_array_t *array; - - rq = cpu_rq(i); - rq->active = rq->arrays; - rq->expired = rq->arrays + 1; - spin_lock_init(&rq->lock); - INIT_LIST_HEAD(&rq->migration_queue); - - for (j = 0; j < 2; j++) { - array = rq->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); - } - } - /* - * We have to do a little magic to get the first - * process right in SMP mode. - */ - rq = this_rq(); - rq->curr = current; - rq->idle = current; - set_task_cpu(current, smp_processor_id()); - wake_up_process(current); - - init_timervecs(); - init_bh(TIMER_BH, timer_bh); - init_bh(TQUEUE_BH, tqueue_bh); - init_bh(IMMEDIATE_BH, immediate_bh); - - /* - * The boot idle thread does lazy MMU switching as well: - */ - atomic_inc(&init_mm.mm_count); - enter_lazy_tlb(&init_mm, current, smp_processor_id()); -} - #if CONFIG_SMP /* @@ -1821,7 +1796,7 @@ void set_cpus_allowed(task_t *p, unsigned long new_mask) * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. */ - if (!p->array && (p != rq->curr)) { + if (!p->array && !task_running(rq, p)) { set_task_cpu(p, __ffs(p->cpus_allowed)); task_rq_unlock(rq, &flags); goto out; @@ -1939,3 +1914,55 @@ void __init migration_init(void) } } #endif + +extern void init_timervecs(void); +extern void timer_bh(void); +extern void tqueue_bh(void); +extern void immediate_bh(void); + +void __init sched_init(void) +{ + runqueue_t *rq; + int i, j, k; + + for (i = 0; i < NR_CPUS; i++) { + prio_array_t *array; + + rq = cpu_rq(i); + rq->active = rq->arrays; + rq->expired = rq->arrays + 1; + spin_lock_init(&rq->lock); + INIT_LIST_HEAD(&rq->migration_queue); + + for (j = 0; j < 2; j++) { + array = rq->arrays + j; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); + } + } + /* + * We have to do a little magic to get the first + * process right in SMP mode. 
+ */ + rq = this_rq(); + rq->curr = current; + rq->idle = current; + set_task_cpu(current, smp_processor_id()); + wake_up_process(current); + + init_timervecs(); + init_bh(TIMER_BH, timer_bh); + init_bh(TQUEUE_BH, tqueue_bh); + init_bh(IMMEDIATE_BH, immediate_bh); + + /* + * The boot idle thread does lazy MMU switching as well: + */ + atomic_inc(&init_mm.mm_count); + enter_lazy_tlb(&init_mm, current, smp_processor_id()); +} + diff --git a/kernel/signal.c b/kernel/signal.c index 90ae5af97bf7..dcb9beb032d4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -500,7 +500,6 @@ inline void signal_wake_up(struct task_struct *t) { set_tsk_thread_flag(t,TIF_SIGPENDING); -#ifdef CONFIG_SMP /* * If the task is running on a different CPU * force a reschedule on the other CPU to make @@ -511,9 +510,8 @@ inline void signal_wake_up(struct task_struct *t) * process of changing - but no harm is done by that * other than doing an extra (lightweight) IPI interrupt. */ - if ((t->state == TASK_RUNNING) && (t->thread_info->cpu != smp_processor_id())) + if (t->state == TASK_RUNNING) kick_if_running(t); -#endif if (t->state & TASK_INTERRUPTIBLE) { wake_up_process(t); return; diff --git a/kernel/timer.c b/kernel/timer.c index 997476fd4333..64d154ba1255 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -888,7 +888,7 @@ asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp) if (t.tv_sec == 0 && t.tv_nsec <= 2000000L && - current->policy != SCHED_OTHER) + current->policy != SCHED_NORMAL) { /* * Short delay requests up to 2 ms will be handled with |
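
Finally, a toy model of the sys_sched_yield() fix, assuming simplified types and helpers (the real code manipulates prio_array_t bitmaps and list_heads): the yielding task is dequeued from whatever array it is actually queued in (current->array, which may already be the expired array) and requeued on rq->expired, while RT tasks merely round-robin within their current queue.

```c
#include <stdio.h>

struct prio_array {
	const char *name;
	int nr_queued;
};

struct task {
	const char *name;
	int rt;				/* rt_task(p) stand-in */
	struct prio_array *array;	/* the array the task is queued in */
};

struct runqueue {
	struct prio_array *active, *expired;
};

static void yield_task(struct runqueue *rq, struct task *p)
{
	if (!p->rt) {
		/*
		 * Fix: dequeue from p->array -- which may already be
		 * rq->expired if the timeslice ran out -- rather than
		 * blindly from rq->active.
		 */
		p->array->nr_queued--;
		rq->expired->nr_queued++;
		p->array = rq->expired;
	} else {
		/* RT tasks just round-robin within their current queue */
	}
}

int main(void)
{
	struct prio_array active = { "active", 1 }, expired = { "expired", 0 };
	struct runqueue rq = { &active, &expired };
	struct task p = { "p", 0, &active };

	yield_task(&rq, &p);
	printf("%s is now on the %s array (active=%d, expired=%d)\n",
	       p.name, p.array->name, active.nr_queued, expired.nr_queued);
	return 0;
}
```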
