| field | value | date |
|---|---|---|
| author | Linus Torvalds <torvalds@athlon.transmeta.com> | 2002-02-05 00:12:58 -0800 |
| committer | Linus Torvalds <torvalds@athlon.transmeta.com> | 2002-02-05 00:12:58 -0800 |
| commit | 908920b1d370e7a5c301d14cfce10c310be19be3 | |
| tree | 25606bf2c5215cc0e25c9647612390b88622b279 /kernel | |
| parent | d01b7e92c0020f89b4bb33fe61c0dffab7078b42 | |
v2.5.1.9 -> v2.5.1.10
- Kai Germaschewski: ISDN updates
- Al Viro: start moving buffer cache indexing to "struct block_device *"
- Greg KH: USB update
- Russell King: fix up some ARM merge issues
- Ingo Molnar: scalable scheduler
Diffstat (limited to 'kernel')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | kernel/capability.c | 2 |
| -rw-r--r-- | kernel/exit.c | 146 |
| -rw-r--r-- | kernel/fork.c | 52 |
| -rw-r--r-- | kernel/ksyms.c | 7 |
| -rw-r--r-- | kernel/printk.c | 1 |
| -rw-r--r-- | kernel/ptrace.c | 15 |
| -rw-r--r-- | kernel/sched.c | 1458 |
| -rw-r--r-- | kernel/signal.c | 9 |
| -rw-r--r-- | kernel/softirq.c | 28 |
| -rw-r--r-- | kernel/sys.c | 6 |
| -rw-r--r-- | kernel/timer.c | 94 |
11 files changed, 982 insertions, 836 deletions
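The bulk of the patch is the rewritten kernel/sched.c: every CPU gets its own runqueue with two priority arrays ("active" and "expired") plus a per-priority bitmap, so picking the next task no longer scans the whole run list. The sketch below is a minimal user-space model of that idea, not code from the patch itself; the singly-linked queues, the `array_init()`/`pick_next()` helpers, the `comm` field, the linear bitmap scan and the example priorities are illustrative stand-ins for the kernel's `list_t` machinery, `sched_find_first_zero_bit()` and per-runqueue spinlocks. As in the patch, a *cleared* bit marks a non-empty priority queue, and when the active array drains, the two arrays are swapped.

```c
/*
 * Illustrative sketch only: a user-space model of the per-CPU runqueue
 * introduced by this patch.  Two priority arrays and a bitmap make the
 * pick-next step O(1): find the first non-empty priority queue, take its
 * head.  Locking, SMP balancing and timeslice handling are omitted.
 */
#include <stdio.h>
#include <string.h>

#define MAX_PRIO 128	/* as in the patch: 0..99 RT, the rest SCHED_OTHER */

struct task {
	const char *comm;
	int prio;
	struct task *next;
};

struct prio_array {
	int nr_active;
	unsigned char bitmap[(MAX_PRIO + 7) / 8];
	struct task *queue[MAX_PRIO];	/* simplified: one singly linked list per prio */
};

struct runqueue {
	struct prio_array *active, *expired, arrays[2];
};

static void array_init(struct prio_array *a)
{
	memset(a, 0, sizeof(*a));
	memset(a->bitmap, 0xff, sizeof(a->bitmap));	/* all bits set: every queue empty */
}

static void enqueue_task(struct task *p, struct prio_array *a)
{
	p->next = a->queue[p->prio];
	a->queue[p->prio] = p;
	a->bitmap[p->prio / 8] &= (unsigned char)~(1u << (p->prio % 8));	/* clear = non-empty */
	a->nr_active++;
}

static struct task *pick_next(struct runqueue *rq)
{
	struct prio_array *a = rq->active;
	int idx;

	if (!a->nr_active) {		/* active array drained: swap the arrays */
		rq->active = rq->expired;
		rq->expired = a;
		a = rq->active;
	}
	for (idx = 0; idx < MAX_PRIO; idx++)	/* the kernel uses a bit-search primitive here */
		if (!(a->bitmap[idx / 8] & (1u << (idx % 8))))
			return a->queue[idx];
	return NULL;			/* nothing runnable: the kernel would pick rq->idle */
}

int main(void)
{
	struct runqueue rq;
	struct task a = { "editor", 110 };	/* higher priority (lower number) */
	struct task b = { "compile", 120 };

	rq.active = &rq.arrays[0];
	rq.expired = &rq.arrays[1];
	array_init(rq.active);
	array_init(rq.expired);

	enqueue_task(&a, rq.active);
	enqueue_task(&b, rq.active);
	printf("next: %s\n", pick_next(&rq)->comm);	/* prints "editor" */
	return 0;
}
```

The real code keeps the same invariant but stores tasks on kernel `list_t` queues, uses `sched_find_first_zero_bit()` for the lookup, and protects each runqueue with its own spinlock instead of the old global runqueue_lock.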
diff --git a/kernel/capability.c b/kernel/capability.c index 7aaf1a423011..8fd0d451069d 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -8,6 +8,8 @@ #include <linux/mm.h> #include <asm/uaccess.h> +unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ + kernel_cap_t cap_bset = CAP_INIT_EFF_SET; /* Note: never hold tasklist_lock while spinning for this one */ diff --git a/kernel/exit.c b/kernel/exit.c index a3490404ff33..04774ebbc2e4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -29,49 +29,39 @@ int getrusage(struct task_struct *, int, struct rusage *); static void release_task(struct task_struct * p) { - if (p != current) { + unsigned long flags; + + if (p == current) + BUG(); #ifdef CONFIG_SMP - /* - * Wait to make sure the process isn't on the - * runqueue (active on some other CPU still) - */ - for (;;) { - task_lock(p); - if (!task_has_cpu(p)) - break; - task_unlock(p); - do { - cpu_relax(); - barrier(); - } while (task_has_cpu(p)); - } - task_unlock(p); + wait_task_inactive(p); #endif - atomic_dec(&p->user->processes); - free_uid(p->user); - unhash_process(p); - - release_thread(p); - current->cmin_flt += p->min_flt + p->cmin_flt; - current->cmaj_flt += p->maj_flt + p->cmaj_flt; - current->cnswap += p->nswap + p->cnswap; - /* - * Potentially available timeslices are retrieved - * here - this way the parent does not get penalized - * for creating too many processes. - * - * (this cannot be used to artificially 'generate' - * timeslices, because any timeslice recovered here - * was given away by the parent in the first place.) - */ - current->time_slice += p->time_slice; - if (current->time_slice > MAX_TSLICE) - current->time_slice = MAX_TSLICE; - p->pid = 0; - free_task_struct(p); - } else { - printk("task releasing itself\n"); - } + atomic_dec(&p->user->processes); + free_uid(p->user); + unhash_process(p); + + release_thread(p); + current->cmin_flt += p->min_flt + p->cmin_flt; + current->cmaj_flt += p->maj_flt + p->cmaj_flt; + current->cnswap += p->nswap + p->cnswap; + /* + * Potentially available timeslices are retrieved + * here - this way the parent does not get penalized + * for creating too many processes. + * + * (this cannot be used to artificially 'generate' + * timeslices, because any timeslice recovered here + * was given away by the parent in the first place.) + */ + __save_flags(flags); + __cli(); + current->time_slice += p->time_slice; + if (current->time_slice > MAX_TIMESLICE) + current->time_slice = MAX_TIMESLICE; + __restore_flags(flags); + + p->pid = 0; + free_task_struct(p); } /* @@ -151,6 +141,80 @@ static inline int has_stopped_jobs(int pgrp) return retval; } +/** + * reparent_to_init() - Reparent the calling kernel thread to the init task. + * + * If a kernel thread is launched as a result of a system call, or if + * it ever exits, it should generally reparent itself to init so that + * it is correctly cleaned up on exit. + * + * The various task state such as scheduling policy and priority may have + * been inherited from a user process, so we reset them to sane values here. + * + * NOTE that reparent_to_init() gives the caller full capabilities. 
+ */ +void reparent_to_init(void) +{ + write_lock_irq(&tasklist_lock); + + /* Reparent to init */ + REMOVE_LINKS(current); + current->p_pptr = child_reaper; + current->p_opptr = child_reaper; + SET_LINKS(current); + + /* Set the exit signal to SIGCHLD so we signal init on exit */ + current->exit_signal = SIGCHLD; + + current->ptrace = 0; + if ((current->policy == SCHED_OTHER) && + (current->__nice < DEF_USER_NICE)) + set_user_nice(current, DEF_USER_NICE); + /* cpus_allowed? */ + /* rt_priority? */ + /* signals? */ + current->cap_effective = CAP_INIT_EFF_SET; + current->cap_inheritable = CAP_INIT_INH_SET; + current->cap_permitted = CAP_FULL_SET; + current->keep_capabilities = 0; + memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim))); + current->user = INIT_USER; + + write_unlock_irq(&tasklist_lock); +} + +/* + * Put all the gunge required to become a kernel thread without + * attached user resources in one place where it belongs. + */ + +void daemonize(void) +{ + struct fs_struct *fs; + + + /* + * If we were started as result of loading a module, close all of the + * user space pages. We don't need them, and if we didn't close them + * they would be locked into memory. + */ + exit_mm(current); + + current->session = 1; + current->pgrp = 1; + current->tty = NULL; + + /* Become as one with the init task */ + + exit_fs(current); /* current->fs->count--; */ + fs = init_task.fs; + current->fs = fs; + atomic_inc(&fs->count); + exit_files(current); + current->files = init_task.files; + atomic_inc(¤t->files->count); +} + /* * When we die, we re-parent all our children. * Try to give them to another thread in our process diff --git a/kernel/fork.c b/kernel/fork.c index 4c3114eef4c4..3fcaeafeb535 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -29,7 +29,6 @@ /* The idle threads do not count.. */ int nr_threads; -int nr_running; int max_threads; unsigned long total_forks; /* Handle normal Linux uptimes. */ @@ -37,6 +36,8 @@ int last_pid; struct task_struct *pidhash[PIDHASH_SZ]; +rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ + void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait) { unsigned long flags; @@ -564,6 +565,7 @@ int do_fork(unsigned long clone_flags, unsigned long stack_start, struct pt_regs *regs, unsigned long stack_size) { int retval; + unsigned long flags; struct task_struct *p; struct completion vfork; @@ -617,8 +619,7 @@ int do_fork(unsigned long clone_flags, unsigned long stack_start, copy_flags(clone_flags, p); p->pid = get_pid(clone_flags); - p->run_list.next = NULL; - p->run_list.prev = NULL; + INIT_LIST_HEAD(&p->run_list); p->p_cptr = NULL; init_waitqueue_head(&p->wait_chldexit); @@ -644,14 +645,16 @@ int do_fork(unsigned long clone_flags, unsigned long stack_start, #ifdef CONFIG_SMP { int i; - p->cpus_runnable = ~0UL; - p->processor = current->processor; + + p->cpu = smp_processor_id(); + /* ?? should we just memset this ?? */ for(i = 0; i < smp_num_cpus; i++) p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0; spin_lock_init(&p->sigmask_lock); } #endif + p->array = NULL; p->lock_depth = -1; /* -1 = no lock */ p->start_time = jiffies; @@ -685,15 +688,27 @@ int do_fork(unsigned long clone_flags, unsigned long stack_start, p->pdeath_signal = 0; /* - * "share" dynamic priority between parent and child, thus the - * total amount of dynamic priorities in the system doesnt change, - * more scheduling fairness. This is only important in the first - * timeslice, on the long run the scheduling behaviour is unchanged. 
+ * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesnt change, + * resulting in more scheduling fairness. */ + __save_flags(flags); + __cli(); p->time_slice = (current->time_slice + 1) >> 1; current->time_slice >>= 1; - if (!current->time_slice) - current->need_resched = 1; + if (!current->time_slice) { + /* + * This case is rare, it happens when the parent has only + * a single jiffy left from its timeslice. Taking the + * runqueue lock is not a problem. + */ + current->time_slice = 1; + expire_task(current); + } + p->sleep_timestamp = p->run_timestamp = jiffies; + memset(p->sleep_hist, 0, sizeof(p->sleep_hist[0])*SLEEP_HIST_SIZE); + p->sleep_idx = 0; + __restore_flags(flags); /* * Ok, add it to the run-queues and make it @@ -730,10 +745,23 @@ int do_fork(unsigned long clone_flags, unsigned long stack_start, if (p->ptrace & PT_PTRACED) send_sig(SIGSTOP, p, 1); - wake_up_process(p); /* do this last */ +#define RUN_CHILD_FIRST 1 +#if RUN_CHILD_FIRST + wake_up_forked_process(p); /* do this last */ +#else + wake_up_process(p); /* do this last */ +#endif ++total_forks; if (clone_flags & CLONE_VFORK) wait_for_completion(&vfork); +#if RUN_CHILD_FIRST + else + /* + * Let the child process run first, to avoid most of the + * COW overhead when the child exec()s afterwards. + */ + current->need_resched = 1; +#endif fork_out: return retval; diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 56dd5a66a34b..eded2d7fb339 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -191,12 +191,12 @@ EXPORT_SYMBOL(notify_change); EXPORT_SYMBOL(set_blocksize); EXPORT_SYMBOL(sb_set_blocksize); EXPORT_SYMBOL(sb_min_blocksize); -EXPORT_SYMBOL(getblk); +EXPORT_SYMBOL(__getblk); EXPORT_SYMBOL(cdget); EXPORT_SYMBOL(cdput); EXPORT_SYMBOL(bdget); EXPORT_SYMBOL(bdput); -EXPORT_SYMBOL(bread); +EXPORT_SYMBOL(__bread); EXPORT_SYMBOL(__brelse); EXPORT_SYMBOL(__bforget); EXPORT_SYMBOL(ll_rw_block); @@ -441,6 +441,8 @@ EXPORT_SYMBOL(interruptible_sleep_on); EXPORT_SYMBOL(interruptible_sleep_on_timeout); EXPORT_SYMBOL(schedule); EXPORT_SYMBOL(schedule_timeout); +EXPORT_SYMBOL(sys_sched_yield); +EXPORT_SYMBOL(set_user_nice); EXPORT_SYMBOL(jiffies); EXPORT_SYMBOL(xtime); EXPORT_SYMBOL(do_gettimeofday); @@ -452,6 +454,7 @@ EXPORT_SYMBOL(loops_per_jiffy); EXPORT_SYMBOL(kstat); EXPORT_SYMBOL(nr_running); +EXPORT_SYMBOL(nr_context_switches); /* misc */ EXPORT_SYMBOL(panic); diff --git a/kernel/printk.c b/kernel/printk.c index 4505465f2dd7..365aed1d39ae 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -25,6 +25,7 @@ #include <linux/module.h> #include <linux/interrupt.h> /* For in_interrupt() */ #include <linux/config.h> +#include <linux/delay.h> #include <asm/uaccess.h> diff --git a/kernel/ptrace.c b/kernel/ptrace.c index b729b6f30c93..79af05d4f353 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -31,20 +31,7 @@ int ptrace_check_attach(struct task_struct *child, int kill) if (child->state != TASK_STOPPED) return -ESRCH; #ifdef CONFIG_SMP - /* Make sure the child gets off its CPU.. 
*/ - for (;;) { - task_lock(child); - if (!task_has_cpu(child)) - break; - task_unlock(child); - do { - if (child->state != TASK_STOPPED) - return -ESRCH; - barrier(); - cpu_relax(); - } while (task_has_cpu(child)); - } - task_unlock(child); + wait_task_inactive(child); #endif } diff --git a/kernel/sched.c b/kernel/sched.c index dc5eea2af910..22c4fc8114e2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -12,331 +12,257 @@ * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar */ -/* - * 'sched.c' is the main kernel file. It contains scheduling primitives - * (sleep_on, wakeup, schedule etc) as well as a number of simple system - * call functions (type getpid()), which just extract a field from - * current-task - */ - -#include <linux/config.h> #include <linux/mm.h> +#include <linux/nmi.h> #include <linux/init.h> +#include <asm/uaccess.h> #include <linux/smp_lock.h> -#include <linux/nmi.h> #include <linux/interrupt.h> -#include <linux/kernel_stat.h> -#include <linux/completion.h> -#include <linux/prefetch.h> -#include <linux/compiler.h> - -#include <asm/uaccess.h> #include <asm/mmu_context.h> -extern void timer_bh(void); -extern void tqueue_bh(void); -extern void immediate_bh(void); - -/* - * scheduler variables - */ +#define BITMAP_SIZE ((MAX_PRIO+7)/8) -unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ +typedef struct runqueue runqueue_t; -extern void mem_use(void); +struct prio_array { + int nr_active; + spinlock_t *lock; + runqueue_t *rq; + char bitmap[BITMAP_SIZE]; + list_t queue[MAX_PRIO]; +}; /* - * Scheduling quanta. + * This is the main, per-CPU runqueue data structure. * - * NOTE! The unix "nice" value influences how long a process - * gets. The nice value ranges from -20 to +19, where a -20 - * is a "high-priority" task, and a "+10" is a low-priority - * task. The default time slice for zero-nice tasks will be 37ms. - */ -#define NICE_RANGE 40 -#define MIN_NICE_TSLICE 10000 -#define MAX_NICE_TSLICE 90000 -#define TASK_TIMESLICE(p) ((int) ts_table[19 - (p)->nice]) - -static unsigned char ts_table[NICE_RANGE]; - -#define MM_AFFINITY_BONUS 1 - -/* - * Init task must be ok at boot for the ix86 as we will check its signals - * via the SMP irq return path. - */ - -struct task_struct * init_tasks[NR_CPUS] = {&init_task, }; - -/* - * The tasklist_lock protects the linked list of processes. + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the process migration code), lock + * acquire operations must be ordered by rq->cpu. * - * The runqueue_lock locks the parts that actually access - * and change the run-queues, and have to be interrupt-safe. - * - * If both locks are to be concurrently held, the runqueue_lock - * nests inside the tasklist_lock. - * - * task->alloc_lock nests inside tasklist_lock. + * The RT event id is used to avoid calling into the the RT scheduler + * if there is a RT task active in an SMP system but there is no + * RT scheduling activity otherwise. 
*/ -spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */ -rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ - -static LIST_HEAD(runqueue_head); - -static unsigned long rcl_curr; +static struct runqueue { + int cpu; + spinlock_t lock; + unsigned long nr_running, nr_switches, last_rt_event; + task_t *curr, *idle; + prio_array_t *active, *expired, arrays[2]; + char __pad [SMP_CACHE_BYTES]; +} runqueues [NR_CPUS] __cacheline_aligned; + +#define this_rq() (runqueues + smp_processor_id()) +#define task_rq(p) (runqueues + (p)->cpu) +#define cpu_rq(cpu) (runqueues + (cpu)) +#define cpu_curr(cpu) (runqueues[(cpu)].curr) +#define rt_task(p) ((p)->policy != SCHED_OTHER) + +#define lock_task_rq(rq,p,flags) \ +do { \ +repeat_lock_task: \ + rq = task_rq(p); \ + spin_lock_irqsave(&rq->lock, flags); \ + if (unlikely((rq)->cpu != (p)->cpu)) { \ + spin_unlock_irqrestore(&rq->lock, flags); \ + goto repeat_lock_task; \ + } \ +} while (0) + +#define unlock_task_rq(rq,p,flags) \ + spin_unlock_irqrestore(&rq->lock, flags) /* - * We align per-CPU scheduling data on cacheline boundaries, - * to prevent cacheline ping-pong. + * Adding/removing a task to/from a priority array: */ -static union { - struct schedule_data { - struct task_struct * curr; - cycles_t last_schedule; - } schedule_data; - char __pad [SMP_CACHE_BYTES]; -} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}}; - -#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr -#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule - -struct kernel_stat kstat; -extern struct task_struct *child_reaper; - -#ifdef CONFIG_SMP - -#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)]) -#define can_schedule(p,cpu) \ - ((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu)) - -#else - -#define idle_task(cpu) (&init_task) -#define can_schedule(p,cpu) (1) - -#endif +static inline void dequeue_task(struct task_struct *p, prio_array_t *array) +{ + array->nr_active--; + list_del_init(&p->run_list); + if (list_empty(array->queue + p->prio)) + __set_bit(p->prio, array->bitmap); +} -void scheduling_functions_start_here(void) { } +static inline void enqueue_task(struct task_struct *p, prio_array_t *array) +{ + list_add_tail(&p->run_list, array->queue + p->prio); + __clear_bit(p->prio, array->bitmap); + array->nr_active++; + p->array = array; +} /* - * This is the function that decides how desirable a process is.. - * You can weigh different processes against each other depending - * on what CPU they've run on lately etc to try to handle cache - * and TLB miss penalties. + * This is the per-process load estimator. Processes that generate + * more load than the system can handle get a priority penalty. * - * Return values: - * -1000: never select this - * 0: out of time, recalculate counters (but it might still be - * selected) - * +ve: "goodness" value (the larger, the better) - * +1000: realtime process, select this. + * The estimator uses a 4-entry load-history ringbuffer which is + * updated whenever a task is moved to/from the runqueue. The load + * estimate is also updated from the timer tick to get an accurate + * estimation of currently executing tasks as well. 
*/ +#define NEXT_IDX(idx) (((idx) + 1) % SLEEP_HIST_SIZE) -static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm) +static inline void update_sleep_avg_deactivate(task_t *p) { - int weight; - - /* - * select the current process after every other - * runnable process, but before the idle thread. - * Also, dont trigger a counter recalculation. - */ - weight = -1; - if (p->policy & SCHED_YIELD) - goto out; - - /* - * Non-RT process - normal case first. - */ - if (p->policy == SCHED_OTHER) { - /* - * Give the process a first-approximation goodness value - * according to the number of clock-ticks it has left. - * - * Don't do any other calculations if the time slice is - * over.. - */ - if (!p->time_slice) - return 0; - - weight = p->dyn_prio + 1; + unsigned int idx; + unsigned long j = jiffies, last_sample = p->run_timestamp / HZ, + curr_sample = j / HZ, delta = curr_sample - last_sample; + + if (unlikely(delta)) { + if (delta < SLEEP_HIST_SIZE) { + for (idx = 0; idx < delta; idx++) { + p->sleep_idx++; + p->sleep_idx %= SLEEP_HIST_SIZE; + p->sleep_hist[p->sleep_idx] = 0; + } + } else { + for (idx = 0; idx < SLEEP_HIST_SIZE; idx++) + p->sleep_hist[idx] = 0; + p->sleep_idx = 0; + } + } + p->sleep_timestamp = j; +} -#ifdef CONFIG_SMP - /* Give a largish advantage to the same processor... */ - /* (this is equivalent to penalizing other processors) */ - if (p->processor == this_cpu) - weight += PROC_CHANGE_PENALTY; +#if SLEEP_HIST_SIZE != 4 +# error update this code. #endif - /* .. and a slight advantage to the current MM */ - if (p->mm == this_mm || !p->mm) - weight += MM_AFFINITY_BONUS; - weight += 20 - p->nice; - goto out; - } +static inline unsigned int get_sleep_avg(task_t *p, unsigned long j) +{ + unsigned int sum; - /* - * Realtime process, select the first one on the - * runqueue (taking priorities within processes - * into account). - */ - weight = 1000 + p->rt_priority; -out: - return weight; + sum = p->sleep_hist[0]; + sum += p->sleep_hist[1]; + sum += p->sleep_hist[2]; + sum += p->sleep_hist[3]; + + return sum * HZ / ((SLEEP_HIST_SIZE-1)*HZ + (j % HZ)); } -/* - * the 'goodness value' of replacing a process on a given CPU. - * positive value means 'replace', zero or negative means 'dont'. - */ -static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu) +static inline void update_sleep_avg_activate(task_t *p, unsigned long j) { - return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm); + unsigned int idx; + unsigned long delta_ticks, last_sample = p->sleep_timestamp / HZ, + curr_sample = j / HZ, delta = curr_sample - last_sample; + + if (unlikely(delta)) { + if (delta < SLEEP_HIST_SIZE) { + p->sleep_hist[p->sleep_idx] += HZ - (p->sleep_timestamp % HZ); + p->sleep_idx++; + p->sleep_idx %= SLEEP_HIST_SIZE; + + for (idx = 1; idx < delta; idx++) { + p->sleep_idx++; + p->sleep_idx %= SLEEP_HIST_SIZE; + p->sleep_hist[p->sleep_idx] = HZ; + } + } else { + for (idx = 0; idx < SLEEP_HIST_SIZE; idx++) + p->sleep_hist[idx] = HZ; + p->sleep_idx = 0; + } + p->sleep_hist[p->sleep_idx] = 0; + delta_ticks = j % HZ; + } else + delta_ticks = j - p->sleep_timestamp; + p->sleep_hist[p->sleep_idx] += delta_ticks; + p->run_timestamp = j; } -/* - * This is ugly, but reschedule_idle() is very timing-critical. - * We are called with the runqueue spinlock held and we must - * not claim the tasklist_lock. 
- */ -static FASTCALL(void reschedule_idle(struct task_struct * p)); - -static void reschedule_idle(struct task_struct * p) +static inline void activate_task(task_t *p, runqueue_t *rq) { -#ifdef CONFIG_SMP - int this_cpu = smp_processor_id(); - struct task_struct *tsk, *target_tsk; - int cpu, best_cpu, i, max_prio; - cycles_t oldest_idle; + prio_array_t *array = rq->active; + unsigned long j = jiffies; + unsigned int sleep, load; + int penalty; + if (likely(p->run_timestamp == j)) + goto enqueue; /* - * shortcut if the woken up task's last CPU is - * idle now. + * Give the process a priority penalty if it has not slept often + * enough in the past. We scale the priority penalty according + * to the current load of the runqueue, and the 'load history' + * this process has. Eg. if the CPU has 3 processes running + * right now then a process that has slept more than two-thirds + * of the time is considered to be 'interactive'. The higher + * the load of the CPUs is, the easier it is for a process to + * get an non-interactivity penalty. */ - best_cpu = p->processor; - if (can_schedule(p, best_cpu)) { - tsk = idle_task(best_cpu); - if (cpu_curr(best_cpu) == tsk) { - int need_resched; -send_now_idle: - /* - * If need_resched == -1 then we can skip sending - * the IPI altogether, tsk->need_resched is - * actively watched by the idle thread. - */ - need_resched = tsk->need_resched; - tsk->need_resched = 1; - if ((best_cpu != this_cpu) && !need_resched) - smp_send_reschedule(best_cpu); - return; - } +#define MAX_PENALTY (MAX_USER_PRIO/3) + update_sleep_avg_activate(p, j); + sleep = get_sleep_avg(p, j); + load = HZ - sleep; + penalty = (MAX_PENALTY * load)/HZ; + if (!rt_task(p)) { + p->prio = NICE_TO_PRIO(p->__nice) + penalty; + if (p->prio > MAX_PRIO-1) + p->prio = MAX_PRIO-1; } +enqueue: + enqueue_task(p, array); + rq->nr_running++; +} - /* - * We know that the preferred CPU has a cache-affine current - * process, lets try to find a new idle CPU for the woken-up - * process. Select the least recently active idle CPU. (that - * one will have the least active cache context.) Also find - * the executing process which has the least priority. - */ - oldest_idle = (cycles_t) -1; - target_tsk = NULL; - max_prio = 0; - - for (i = 0; i < smp_num_cpus; i++) { - cpu = cpu_logical_map(i); - if (!can_schedule(p, cpu)) - continue; - tsk = cpu_curr(cpu); - /* - * We use the first available idle CPU. This creates - * a priority list between idle CPUs, but this is not - * a problem. - */ - if (tsk == idle_task(cpu)) { -#if defined(__i386__) && defined(CONFIG_SMP) - /* - * Check if two siblings are idle in the same - * physical package. Use them if found. 
- */ - if (smp_num_siblings == 2) { - if (cpu_curr(cpu_sibling_map[cpu]) == - idle_task(cpu_sibling_map[cpu])) { - oldest_idle = last_schedule(cpu); - target_tsk = tsk; - break; - } - - } -#endif - if (last_schedule(cpu) < oldest_idle) { - oldest_idle = last_schedule(cpu); - target_tsk = tsk; - } - } else { - if (oldest_idle == -1ULL) { - int prio = preemption_goodness(tsk, p, cpu); - - if (prio > max_prio) { - max_prio = prio; - target_tsk = tsk; - } - } - } - } - tsk = target_tsk; - if (tsk) { - if (oldest_idle != -1ULL) { - best_cpu = tsk->processor; - goto send_now_idle; - } - tsk->need_resched = 1; - if (tsk->processor != this_cpu) - smp_send_reschedule(tsk->processor); - } - return; - +static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) +{ + rq->nr_running--; + dequeue_task(p, p->array); + p->array = NULL; + update_sleep_avg_deactivate(p); +} -#else /* UP */ - int this_cpu = smp_processor_id(); - struct task_struct *tsk; +static inline void resched_task(task_t *p) +{ + int need_resched; - tsk = cpu_curr(this_cpu); - if (preemption_goodness(tsk, p, this_cpu) > 0) - tsk->need_resched = 1; -#endif + need_resched = p->need_resched; + wmb(); + p->need_resched = 1; + if (!need_resched) + smp_send_reschedule(p->cpu); } +#ifdef CONFIG_SMP + /* - * Careful! - * - * This has to add the process to the _beginning_ of the - * run-queue, not the end. See the comment about "This is - * subtle" in the scheduler proper.. + * Wait for a process to unschedule. This is used by the exit() and + * ptrace() code. */ -static inline void add_to_runqueue(struct task_struct * p) +void wait_task_inactive(task_t * p) { - p->dyn_prio += rcl_curr - p->rcl_last; - p->rcl_last = rcl_curr; - if (p->dyn_prio > MAX_DYNPRIO) - p->dyn_prio = MAX_DYNPRIO; - list_add(&p->run_list, &runqueue_head); - nr_running++; -} + unsigned long flags; + runqueue_t *rq; -static inline void move_last_runqueue(struct task_struct * p) -{ - list_del(&p->run_list); - list_add_tail(&p->run_list, &runqueue_head); +repeat: + rq = task_rq(p); + while (unlikely(rq->curr == p)) { + cpu_relax(); + barrier(); + } + lock_task_rq(rq, p, flags); + if (unlikely(rq->curr == p)) { + unlock_task_rq(rq, p, flags); + goto repeat; + } + unlock_task_rq(rq, p, flags); } -static inline void move_first_runqueue(struct task_struct * p) +/* + * Kick the remote CPU if the task is running currently, + * this code is used by the signal code to signal tasks + * which are in user-mode as quickly as possible. + * + * (Note that we do this lockless - if the task does anything + * while the message is in flight then it will notice the + * sigpending condition anyway.) + */ +void kick_if_running(task_t * p) { - list_del(&p->run_list); - list_add(&p->run_list, &runqueue_head); + if (p == task_rq(p)->curr) + resched_task(p); } +#endif /* * Wake up a process. Put it on the run-queue if it's not @@ -346,385 +272,394 @@ static inline void move_first_runqueue(struct task_struct * p) * "current->state = TASK_RUNNING" to mark yourself runnable * without the overhead of this. */ -static inline int try_to_wake_up(struct task_struct * p, int synchronous) +static int try_to_wake_up(task_t * p, int synchronous) { unsigned long flags; int success = 0; + runqueue_t *rq; - /* - * We want the common case fall through straight, thus the goto. 
- */ - spin_lock_irqsave(&runqueue_lock, flags); + lock_task_rq(rq, p, flags); p->state = TASK_RUNNING; - if (task_on_runqueue(p)) - goto out; - add_to_runqueue(p); - if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id()))) - reschedule_idle(p); - success = 1; -out: - spin_unlock_irqrestore(&runqueue_lock, flags); + if (!p->array) { + if (!rt_task(p) && synchronous && (smp_processor_id() < p->cpu)) { + spin_lock(&this_rq()->lock); + p->cpu = smp_processor_id(); + activate_task(p, this_rq()); + spin_unlock(&this_rq()->lock); + } else { + activate_task(p, rq); + if ((rq->curr == rq->idle) || + (p->prio < rq->curr->prio)) + resched_task(rq->curr); + } + success = 1; + } + unlock_task_rq(rq, p, flags); return success; } -inline int wake_up_process(struct task_struct * p) +inline int wake_up_process(task_t * p) { return try_to_wake_up(p, 0); } -static void process_timeout(unsigned long __data) +void wake_up_forked_process(task_t * p) { - wake_up_process((struct task_struct *)__data); -} + runqueue_t *rq = this_rq(); -/** - * schedule_timeout - sleep until timeout - * @timeout: timeout value in jiffies - * - * Make the current task sleep until @timeout jiffies have - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to - * pass before the routine returns. The routine will return 0 - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. In this case the remaining time - * in jiffies will be returned, or 0 if the timer expired in time - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - * - * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule - * the CPU away without a bound on the timeout. In this case the return - * value will be %MAX_SCHEDULE_TIMEOUT. - * - * In all cases the return value is guaranteed to be non-negative. - */ -signed long schedule_timeout(signed long timeout) -{ - struct timer_list timer; - unsigned long expire; - - switch (timeout) - { - case MAX_SCHEDULE_TIMEOUT: - /* - * These two special cases are useful to be comfortable - * in the caller. Nothing more. We could take - * MAX_SCHEDULE_TIMEOUT from one of the negative value - * but I' d like to return a valid offset (>=0) to allow - * the caller to do everything it want with the retval. - */ - schedule(); - goto out; - default: - /* - * Another bit of PARANOID. Note that the retval will be - * 0 since no piece of kernel is supposed to do a check - * for a negative retval of schedule_timeout() (since it - * should never happens anyway). You just have the printk() - * that will tell you if something is gone wrong and where. - */ - if (timeout < 0) - { - printk(KERN_ERR "schedule_timeout: wrong timeout " - "value %lx from %p\n", timeout, - __builtin_return_address(0)); - current->state = TASK_RUNNING; - goto out; - } + spin_lock_irq(&rq->lock); + p->state = TASK_RUNNING; + if (!rt_task(p)) { + p->prio += MAX_USER_PRIO/10; + if (p->prio > MAX_PRIO-1) + p->prio = MAX_PRIO-1; } + activate_task(p, rq); + spin_unlock_irq(&rq->lock); +} - expire = timeout + jiffies; - - init_timer(&timer); - timer.expires = expire; - timer.data = (unsigned long) current; - timer.function = process_timeout; - - add_timer(&timer); - schedule(); - del_timer_sync(&timer); - - timeout = expire - jiffies; - - out: - return timeout < 0 ? 
0 : timeout; +asmlinkage void schedule_tail(task_t *prev) +{ + spin_unlock_irq(&this_rq()->lock); } -/* - * schedule_tail() is getting called from the fork return path. This - * cleans up all remaining scheduler things, without impacting the - * common case. - */ -static inline void __schedule_tail(struct task_struct *prev) +static inline void context_switch(task_t *prev, task_t *next) { -#ifdef CONFIG_SMP - int policy; + struct mm_struct *mm = next->mm; + struct mm_struct *oldmm = prev->active_mm; - /* - * prev->policy can be written from here only before `prev' - * can be scheduled (before setting prev->cpus_runnable to ~0UL). - * Of course it must also be read before allowing prev - * to be rescheduled, but since the write depends on the read - * to complete, wmb() is enough. (the spin_lock() acquired - * before setting cpus_runnable is not enough because the spin_lock() - * common code semantics allows code outside the critical section - * to enter inside the critical section) - */ - policy = prev->policy; - prev->policy = policy & ~SCHED_YIELD; - wmb(); + prepare_to_switch(); - /* - * fast path falls through. We have to clear cpus_runnable before - * checking prev->state to avoid a wakeup race. Protect against - * the task exiting early. - */ - task_lock(prev); - task_release_cpu(prev); - mb(); - if (prev->state == TASK_RUNNING) - goto needs_resched; + if (!mm) { + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next, smp_processor_id()); + } else + switch_mm(oldmm, mm, next, smp_processor_id()); -out_unlock: - task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */ - return; + if (!prev->mm) { + prev->active_mm = NULL; + mmdrop(oldmm); + } /* - * Slow path - we 'push' the previous process and - * reschedule_idle() will attempt to find a new - * processor for it. (but it might preempt the - * current process as well.) We must take the runqueue - * lock and re-check prev->state to be correct. It might - * still happen that this process has a preemption - * 'in progress' already - but this is not a problem and - * might happen in other circumstances as well. + * Here we just switch the register state and the stack. There are + * 3 processes affected by a context switch: + * + * prev ==> .... ==> (last => next) + * + * It's the 'much more previous' 'prev' that is on next's stack, + * but prev is set to (the just run) 'last' process by switch_to(). + * This might sound slightly confusing but makes tons of sense. 
*/ -needs_resched: - { - unsigned long flags; + switch_to(prev, next, prev); +} - /* - * Avoid taking the runqueue lock in cases where - * no preemption-check is necessery: - */ - if ((prev == idle_task(smp_processor_id())) || - (policy & SCHED_YIELD)) - goto out_unlock; +unsigned long nr_running(void) +{ + unsigned long i, sum = 0; - spin_lock_irqsave(&runqueue_lock, flags); - if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev)) - reschedule_idle(prev); - spin_unlock_irqrestore(&runqueue_lock, flags); - goto out_unlock; - } -#else - prev->policy &= ~SCHED_YIELD; -#endif /* CONFIG_SMP */ + for (i = 0; i < smp_num_cpus; i++) + sum += cpu_rq(i)->nr_running; + + return sum; } -asmlinkage void schedule_tail(struct task_struct *prev) +unsigned long nr_context_switches(void) { - __schedule_tail(prev); + unsigned long i, sum = 0; + + for (i = 0; i < smp_num_cpus; i++) + sum += cpu_rq(i)->nr_switches; + + return sum; } -void expire_task(struct task_struct *p) +static inline unsigned long max_rq_len(void) { - if (unlikely(!p->time_slice)) - goto need_resched; + unsigned long i, curr, max = 0; - if (!--p->time_slice) { - if (p->dyn_prio) - p->dyn_prio--; -need_resched: - p->need_resched = 1; + for (i = 0; i < smp_num_cpus; i++) { + curr = cpu_rq(i)->nr_running; + if (curr > max) + max = curr; } + return max; } /* - * 'schedule()' is the scheduler function. It's a very simple and nice - * scheduler: it's not perfect, but certainly works for most things. - * - * The goto is "interesting". + * Current runqueue is empty, try to find work on + * other runqueues. * - * NOTE!! Task 0 is the 'idle' task, which gets called when no other - * tasks can run. It can not be killed, and it cannot sleep. The 'state' - * information in task[0] is never used. + * We call this with the current runqueue locked, + * irqs disabled. */ -asmlinkage void schedule(void) +static void load_balance(runqueue_t *this_rq) { - struct schedule_data * sched_data; - struct task_struct *prev, *next, *p; - struct list_head *tmp; - int this_cpu, c; - - - spin_lock_prefetch(&runqueue_lock); - - if (!current->active_mm) BUG(); -need_resched_back: - prev = current; - this_cpu = prev->processor; - - if (unlikely(in_interrupt())) { - printk("Scheduling in interrupt\n"); - BUG(); - } - - release_kernel_lock(prev, this_cpu); - + int nr_tasks, load, prev_max_load, max_load, idx, i; + task_t *next = this_rq->idle, *tmp; + runqueue_t *busiest, *rq_tmp; + prio_array_t *array; + list_t *head, *curr; + + prev_max_load = max_rq_len(); + nr_tasks = prev_max_load - this_rq->nr_running; /* - * 'sched_data' is protected by the fact that we can run - * only one process per CPU. + * It needs an at least ~10% imbalance to trigger balancing: */ - sched_data = & aligned_data[this_cpu].schedule_data; - - spin_lock_irq(&runqueue_lock); + if (nr_tasks <= 1 + prev_max_load/8) + return; + prev_max_load++; - /* move an exhausted RR process to be last.. */ - if (unlikely(prev->policy == SCHED_RR)) - if (!prev->time_slice) { - prev->time_slice = TASK_TIMESLICE(prev); - move_last_runqueue(prev); - } - - switch (prev->state) { - case TASK_INTERRUPTIBLE: - if (signal_pending(prev)) { - prev->state = TASK_RUNNING; - break; +repeat_search: + /* + * We search all runqueues to find the most busy one. + * We do this lockless to reduce cache-bouncing overhead, + * we re-check the source CPU with the lock held. 
+ */ + busiest = NULL; + max_load = 0; + for (i = 0; i < smp_num_cpus; i++) { + rq_tmp = cpu_rq(i); + load = rq_tmp->nr_running; + if ((load > max_load) && (load < prev_max_load) && + (rq_tmp != this_rq)) { + busiest = rq_tmp; + max_load = load; } - default: - del_from_runqueue(prev); - case TASK_RUNNING:; } - prev->need_resched = 0; + + if (likely(!busiest)) + return; + if (max_load <= this_rq->nr_running) + return; + prev_max_load = max_load; + if (busiest->cpu < this_rq->cpu) { + spin_unlock(&this_rq->lock); + spin_lock(&busiest->lock); + spin_lock(&this_rq->lock); + } else + spin_lock(&busiest->lock); + if (busiest->nr_running <= this_rq->nr_running + 1) + goto out_unlock; /* - * this is the scheduler proper: + * We first consider expired tasks. Those will likely not run + * in the near future, thus switching CPUs has the least effect + * on them. */ + if (busiest->expired->nr_active) + array = busiest->expired; + else + array = busiest->active; -repeat_schedule: +new_array: /* - * Default process to select.. + * Load-balancing does not affect RT tasks, so we start the + * searching at priority 128. */ - next = idle_task(this_cpu); - c = -1000; - list_for_each(tmp, &runqueue_head) { - p = list_entry(tmp, struct task_struct, run_list); - if (can_schedule(p, this_cpu)) { - int weight = goodness(p, this_cpu, prev->active_mm); - if (weight > c) - c = weight, next = p; + idx = MAX_RT_PRIO; +skip_bitmap: + idx = find_next_zero_bit(array->bitmap, MAX_PRIO, idx); + if (idx == MAX_PRIO) { + if (array == busiest->expired) { + array = busiest->active; + goto new_array; } + spin_unlock(&busiest->lock); + goto repeat_search; } - /* Do we need to re-calculate counters? */ - if (unlikely(!c)) { - rcl_curr++; - list_for_each(tmp, &runqueue_head) { - p = list_entry(tmp, struct task_struct, run_list); - p->time_slice = TASK_TIMESLICE(p); - p->rcl_last = rcl_curr; - } - goto repeat_schedule; + head = array->queue + idx; + curr = head->next; +skip_queue: + tmp = list_entry(curr, task_t, run_list); + if ((tmp == busiest->curr) || !(tmp->cpus_allowed & (1 << smp_processor_id()))) { + curr = curr->next; + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; } - + next = tmp; /* - * from this point on nothing can prevent us from - * switching to the next task, save this fact in - * sched_data. + * take the task out of the other runqueue and + * put it into this one: */ - sched_data->curr = next; - task_set_cpu(next, this_cpu); - spin_unlock_irq(&runqueue_lock); - - if (unlikely(prev == next)) { - /* We won't go through the normal tail, so do this by hand */ - prev->policy &= ~SCHED_YIELD; - goto same_process; + dequeue_task(next, array); + busiest->nr_running--; + next->cpu = smp_processor_id(); + this_rq->nr_running++; + enqueue_task(next, this_rq->active); + if (next->prio < current->prio) + current->need_resched = 1; + if (--nr_tasks) { + if (array == busiest->expired) { + array = busiest->active; + goto new_array; + } + spin_unlock(&busiest->lock); + goto repeat_search; } +out_unlock: + spin_unlock(&busiest->lock); +} -#ifdef CONFIG_SMP - /* - * maintain the per-process 'last schedule' value. - * (this has to be recalculated even if we reschedule to - * the same process) Currently this is only used on SMP, - * and it's approximate, so we do not have to maintain - * it while holding the runqueue spinlock. 
- */ - sched_data->last_schedule = get_cycles(); +#define REBALANCE_TICK (HZ/100) - /* - * We drop the scheduler lock early (it's a global spinlock), - * thus we have to lock the previous process from getting - * rescheduled during switch_to(). - */ +void idle_tick(void) +{ + unsigned long flags; -#endif /* CONFIG_SMP */ + if (!(jiffies % REBALANCE_TICK) && likely(this_rq()->curr != NULL)) { + spin_lock_irqsave(&this_rq()->lock, flags); + load_balance(this_rq()); + spin_unlock_irqrestore(&this_rq()->lock, flags); + } +} + +void expire_task(task_t *p) +{ + runqueue_t *rq = this_rq(); + unsigned long flags; - kstat.context_swtch++; + if (p->array != rq->active) { + p->need_resched = 1; + return; + } /* - * there are 3 processes which are affected by a context switch: - * - * prev == .... ==> (last => next) - * - * It's the 'much more previous' 'prev' that is on next's stack, - * but prev is set to (the just run) 'last' process by switch_to(). - * This might sound slightly confusing but makes tons of sense. + * The task cannot change CPUs because it's the current task. */ - prepare_to_switch(); - { - struct mm_struct *mm = next->mm; - struct mm_struct *oldmm = prev->active_mm; - if (!mm) { - if (next->active_mm) BUG(); - next->active_mm = oldmm; - atomic_inc(&oldmm->mm_count); - enter_lazy_tlb(oldmm, next, this_cpu); - } else { - if (next->active_mm != mm) BUG(); - switch_mm(oldmm, mm, next, this_cpu); - } + spin_lock_irqsave(&rq->lock, flags); + if ((p->policy != SCHED_FIFO) && !--p->time_slice) { + p->need_resched = 1; + if (rt_task(p)) + p->time_slice = RT_PRIO_TO_TIMESLICE(p->prio); + else + p->time_slice = PRIO_TO_TIMESLICE(p->prio); - if (!prev->mm) { - prev->active_mm = NULL; - mmdrop(oldmm); + /* + * Timeslice used up - discard any possible + * priority penalty: + */ + dequeue_task(p, rq->active); + /* + * Tasks that have nice values of -20 ... -15 are put + * back into the active array. If they use up too much + * CPU time then they'll get a priority penalty anyway + * so this can not starve other processes accidentally. + * Otherwise this is pretty handy for sysadmins ... + */ + if (p->prio <= MAX_RT_PRIO + MAX_PENALTY/2) + enqueue_task(p, rq->active); + else + enqueue_task(p, rq->expired); + } else { + /* + * Deactivate + activate the task so that the + * load estimator gets updated properly: + */ + if (!rt_task(p)) { + deactivate_task(p, rq); + activate_task(p, rq); } } + load_balance(rq); + spin_unlock_irqrestore(&rq->lock, flags); +} - /* - * This just switches the register state and the - * stack. - */ - switch_to(prev, next, prev); - __schedule_tail(prev); +void scheduling_functions_start_here(void) { } + +/* + * 'schedule()' is the main scheduler function. + */ +asmlinkage void schedule(void) +{ + task_t *prev, *next; + prio_array_t *array; + runqueue_t *rq; + list_t *queue; + int idx; + + if (unlikely(in_interrupt())) + BUG(); +need_resched_back: + prev = current; + release_kernel_lock(prev, smp_processor_id()); + rq = this_rq(); + spin_lock_irq(&rq->lock); + + switch (prev->state) { + case TASK_INTERRUPTIBLE: + if (unlikely(signal_pending(prev))) { + prev->state = TASK_RUNNING; + break; + } + default: + deactivate_task(prev, rq); + case TASK_RUNNING: + } +pick_next_task: + if (unlikely(!rq->nr_running)) { + load_balance(rq); + if (rq->nr_running) + goto pick_next_task; + next = rq->idle; + goto switch_tasks; + } + + array = rq->active; + if (unlikely(!array->nr_active)) { + /* + * Switch the active and expired arrays. 
+ */ + rq->active = rq->expired; + rq->expired = array; + array = rq->active; + } + + idx = sched_find_first_zero_bit(array->bitmap); + queue = array->queue + idx; + next = list_entry(queue->next, task_t, run_list); + +switch_tasks: + prev->need_resched = 0; + + if (likely(prev != next)) { + rq->nr_switches++; + rq->curr = next; + next->cpu = prev->cpu; + context_switch(prev, next); + /* + * The runqueue pointer might be from another CPU + * if the new task was last running on a different + * CPU - thus re-load it. + */ + barrier(); + rq = this_rq(); + } + spin_unlock_irq(&rq->lock); -same_process: reacquire_kernel_lock(current); - if (current->need_resched) + if (unlikely(current->need_resched)) goto need_resched_back; return; } /* - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything - * up. If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the - * non-exclusive tasks and one exclusive task. + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. * * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero - * in this (rare) case, and we handle it by contonuing to scan the queue. + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue. */ static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode, int nr_exclusive, const int sync) { struct list_head *tmp; - struct task_struct *p; + task_t *p; list_for_each(tmp,&q->task_list) { unsigned int state; @@ -853,8 +788,93 @@ long sleep_on_timeout(wait_queue_head_t *q, long timeout) return timeout; } +/* + * Change the current task's CPU affinity. Migrate the process to a + * proper CPU and schedule away if the current CPU is removed from + * the allowed bitmask. + */ +void set_cpus_allowed(task_t *p, unsigned long new_mask) +{ + runqueue_t *this_rq = this_rq(), *target_rq; + unsigned long this_mask = 1UL << smp_processor_id(); + int target_cpu; + + new_mask &= cpu_online_map; + p->cpus_allowed = new_mask; + /* + * Can the task run on the current CPU? If not then + * migrate the process off to a proper CPU. 
+ */ + if (new_mask & this_mask) + return; + target_cpu = ffz(~new_mask); + target_rq = cpu_rq(target_cpu); + if (target_cpu < smp_processor_id()) { + spin_lock_irq(&target_rq->lock); + spin_lock(&this_rq->lock); + } else { + spin_lock_irq(&target_rq->lock); + spin_lock(&this_rq->lock); + } + dequeue_task(p, p->array); + this_rq->nr_running--; + target_rq->nr_running++; + enqueue_task(p, target_rq->active); + target_rq->curr->need_resched = 1; + spin_unlock(&target_rq->lock); + + /* + * The easiest solution is to context switch into + * the idle thread - which will pick the best task + * afterwards: + */ + this_rq->nr_switches++; + this_rq->curr = this_rq->idle; + this_rq->idle->need_resched = 1; + context_switch(current, this_rq->idle); + barrier(); + spin_unlock_irq(&this_rq()->lock); +} + void scheduling_functions_end_here(void) { } +void set_user_nice(task_t *p, long nice) +{ + unsigned long flags; + prio_array_t *array; + runqueue_t *rq; + + if (p->__nice == nice) + return; + /* + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. + */ + lock_task_rq(rq, p, flags); + if (rt_task(p)) { + p->__nice = nice; + goto out_unlock; + } + array = p->array; + if (array) { + dequeue_task(p, array); + } + p->__nice = nice; + p->prio = NICE_TO_PRIO(nice); + if (array) { + enqueue_task(p, array); + /* + * If the task is runnable and lowered its priority, + * or increased its priority then reschedule its CPU: + */ + if ((nice < p->__nice) || + ((p->__nice < nice) && (p == rq->curr))) + resched_task(rq->curr); + } +out_unlock: + unlock_task_rq(rq, p, flags); +} + #ifndef __alpha__ /* @@ -865,7 +885,7 @@ void scheduling_functions_end_here(void) { } asmlinkage long sys_nice(int increment) { - long newprio; + long nice; /* * Setpriority might change our priority at the same moment. @@ -881,28 +901,30 @@ asmlinkage long sys_nice(int increment) if (increment > 40) increment = 40; - newprio = current->nice + increment; - if (newprio < -20) - newprio = -20; - if (newprio > 19) - newprio = 19; - current->nice = newprio; + nice = current->__nice + increment; + if (nice < -20) + nice = -20; + if (nice > 19) + nice = 19; + set_user_nice(current, nice); return 0; } #endif -static inline struct task_struct *find_process_by_pid(pid_t pid) +static inline task_t *find_process_by_pid(pid_t pid) { return pid ? find_task_by_pid(pid) : current; } -static int setscheduler(pid_t pid, int policy, - struct sched_param *param) +static int setscheduler(pid_t pid, int policy, struct sched_param *param) { struct sched_param lp; - struct task_struct *p; + prio_array_t *array; + unsigned long flags; + runqueue_t *rq; int retval; + task_t *p; retval = -EINVAL; if (!param || pid < 0) @@ -916,14 +938,19 @@ static int setscheduler(pid_t pid, int policy, * We play safe to avoid deadlocks. */ read_lock_irq(&tasklist_lock); - spin_lock(&runqueue_lock); p = find_process_by_pid(pid); retval = -ESRCH; if (!p) - goto out_unlock; - + goto out_unlock_tasklist; + + /* + * To be able to change p->policy safely, the apropriate + * runqueue lock must be held. 
+ */ + lock_task_rq(rq,p,flags); + if (policy < 0) policy = p->policy; else { @@ -951,16 +978,22 @@ static int setscheduler(pid_t pid, int policy, !capable(CAP_SYS_NICE)) goto out_unlock; + array = p->array; + if (array) + deactivate_task(p, task_rq(p)); retval = 0; p->policy = policy; p->rt_priority = lp.sched_priority; - if (task_on_runqueue(p)) - move_first_runqueue(p); - - current->need_resched = 1; + if (rt_task(p)) + p->prio = 99-p->rt_priority; + else + p->prio = NICE_TO_PRIO(p->__nice); + if (array) + activate_task(p, task_rq(p)); out_unlock: - spin_unlock(&runqueue_lock); + unlock_task_rq(rq,p,flags); +out_unlock_tasklist: read_unlock_irq(&tasklist_lock); out_nounlock: @@ -980,7 +1013,7 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param *param) asmlinkage long sys_sched_getscheduler(pid_t pid) { - struct task_struct *p; + task_t *p; int retval; retval = -EINVAL; @@ -991,7 +1024,7 @@ asmlinkage long sys_sched_getscheduler(pid_t pid) read_lock(&tasklist_lock); p = find_process_by_pid(pid); if (p) - retval = p->policy & ~SCHED_YIELD; + retval = p->policy; read_unlock(&tasklist_lock); out_nounlock: @@ -1000,7 +1033,7 @@ out_nounlock: asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param) { - struct task_struct *p; + task_t *p; struct sched_param lp; int retval; @@ -1031,42 +1064,28 @@ out_unlock: asmlinkage long sys_sched_yield(void) { + runqueue_t *rq = this_rq(); + prio_array_t *array; + /* - * Trick. sched_yield() first counts the number of truly - * 'pending' runnable processes, then returns if it's - * only the current processes. (This test does not have - * to be atomic.) In threaded applications this optimization - * gets triggered quite often. + * Decrease the yielding task's priority by one, to avoid + * livelocks. This priority loss is temporary, it's recovered + * once the current timeslice expires. + * + * If priority is already MAX_PRIO-1 then we still + * roundrobin the task within the runlist. */ + spin_lock_irq(&rq->lock); + array = current->array; + dequeue_task(current, array); + if (likely(!rt_task(current))) + if (current->prio < MAX_PRIO-1) + current->prio++; + enqueue_task(current, array); + spin_unlock_irq(&rq->lock); - int nr_pending = nr_running; - -#if CONFIG_SMP - int i; - - // Subtract non-idle processes running on other CPUs. - for (i = 0; i < smp_num_cpus; i++) { - int cpu = cpu_logical_map(i); - if (aligned_data[cpu].schedule_data.curr != idle_task(cpu)) - nr_pending--; - } -#else - // on UP this process is on the runqueue as well - nr_pending--; -#endif - if (nr_pending) { - /* - * This process can only be rescheduled by us, - * so this is safe without any locking. - */ - if (current->policy == SCHED_OTHER) - current->policy |= SCHED_YIELD; - current->need_resched = 1; + schedule(); - current->time_slice = 0; - if (++current->dyn_prio > MAX_DYNPRIO) - current->dyn_prio = MAX_DYNPRIO; - } return 0; } @@ -1104,7 +1123,7 @@ asmlinkage long sys_sched_get_priority_min(int policy) asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval) { struct timespec t; - struct task_struct *p; + task_t *p; int retval = -EINVAL; if (pid < 0) @@ -1114,8 +1133,8 @@ asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval) read_lock(&tasklist_lock); p = find_process_by_pid(pid); if (p) - jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : TASK_TIMESLICE(p), - &t); + jiffies_to_timespec(p->policy & SCHED_FIFO ? 
+ 0 : RT_PRIO_TO_TIMESLICE(p->prio), &t); read_unlock(&tasklist_lock); if (p) retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; @@ -1123,7 +1142,7 @@ out_nounlock: return retval; } -static void show_task(struct task_struct * p) +static void show_task(task_t * p) { unsigned long free = 0; int state; @@ -1171,7 +1190,7 @@ static void show_task(struct task_struct * p) printk(" (NOTLB)\n"); { - extern void show_trace_task(struct task_struct *tsk); + extern void show_trace_task(task_t *tsk); show_trace_task(p); } } @@ -1193,7 +1212,7 @@ char * render_sigset_t(sigset_t *set, char *buffer) void show_state(void) { - struct task_struct *p; + task_t *p; #if (BITS_PER_LONG == 32) printk("\n" @@ -1216,134 +1235,97 @@ void show_state(void) read_unlock(&tasklist_lock); } -/** - * reparent_to_init() - Reparent the calling kernel thread to the init task. - * - * If a kernel thread is launched as a result of a system call, or if - * it ever exits, it should generally reparent itself to init so that - * it is correctly cleaned up on exit. - * - * The various task state such as scheduling policy and priority may have - * been inherited fro a user process, so we reset them to sane values here. - * - * NOTE that reparent_to_init() gives the caller full capabilities. - */ -void reparent_to_init(void) +extern unsigned long wait_init_idle; + +static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) { - write_lock_irq(&tasklist_lock); - - /* Reparent to init */ - REMOVE_LINKS(current); - current->p_pptr = child_reaper; - current->p_opptr = child_reaper; - SET_LINKS(current); - - /* Set the exit signal to SIGCHLD so we signal init on exit */ - current->exit_signal = SIGCHLD; - - /* We also take the runqueue_lock while altering task fields - * which affect scheduling decisions */ - spin_lock(&runqueue_lock); - - current->ptrace = 0; - current->nice = DEF_NICE; - current->policy = SCHED_OTHER; - /* cpus_allowed? */ - /* rt_priority? */ - /* signals? */ - current->cap_effective = CAP_INIT_EFF_SET; - current->cap_inheritable = CAP_INIT_INH_SET; - current->cap_permitted = CAP_FULL_SET; - current->keep_capabilities = 0; - memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim))); - current->user = INIT_USER; - - spin_unlock(&runqueue_lock); - write_unlock_irq(&tasklist_lock); + if (rq1 == rq2) + spin_lock(&rq1->lock); + else { + if (rq1->cpu < rq2->cpu) { + spin_lock(&rq1->lock); + spin_lock(&rq2->lock); + } else { + spin_lock(&rq2->lock); + spin_lock(&rq1->lock); + } + } } -/* - * Put all the gunge required to become a kernel thread without - * attached user resources in one place where it belongs. - */ - -void daemonize(void) +static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) { - struct fs_struct *fs; - - - /* - * If we were started as result of loading a module, close all of the - * user space pages. We don't need them, and if we didn't close them - * they would be locked into memory. 
- */ - exit_mm(current); - - current->session = 1; - current->pgrp = 1; - current->tty = NULL; - - /* Become as one with the init task */ - - exit_fs(current); /* current->fs->count--; */ - fs = init_task.fs; - current->fs = fs; - atomic_inc(&fs->count); - exit_files(current); - current->files = init_task.files; - atomic_inc(¤t->files->count); + spin_unlock(&rq1->lock); + if (rq1 != rq2) + spin_unlock(&rq2->lock); } -extern unsigned long wait_init_idle; - void __init init_idle(void) { - struct schedule_data * sched_data; - sched_data = &aligned_data[smp_processor_id()].schedule_data; + runqueue_t *this_rq = this_rq(), *rq = current->array->rq; + unsigned long flags; - if (current != &init_task && task_on_runqueue(current)) { - printk("UGH! (%d:%d) was on the runqueue, removing.\n", - smp_processor_id(), current->pid); - del_from_runqueue(current); + __save_flags(flags); + __cli(); + double_rq_lock(this_rq, rq); + + this_rq->curr = this_rq->idle = current; + deactivate_task(current, rq); + current->array = NULL; + current->prio = MAX_PRIO; + current->state = TASK_RUNNING; + clear_bit(smp_processor_id(), &wait_init_idle); + double_rq_unlock(this_rq, rq); + while (wait_init_idle) { + cpu_relax(); + barrier(); } - current->dyn_prio = 0; - sched_data->curr = current; - sched_data->last_schedule = get_cycles(); - clear_bit(current->processor, &wait_init_idle); + current->need_resched = 1; + __sti(); } -extern void init_timervecs (void); - -static void fill_tslice_map(void) -{ - int i; - - for (i = 0; i < NICE_RANGE; i++) { - ts_table[i] = ((MIN_NICE_TSLICE + - ((MAX_NICE_TSLICE - - MIN_NICE_TSLICE) / (NICE_RANGE - 1)) * i) * HZ) / 1000000; - if (!ts_table[i]) ts_table[i] = 1; - } -} +extern void init_timervecs(void); +extern void timer_bh(void); +extern void tqueue_bh(void); +extern void immediate_bh(void); void __init sched_init(void) { + runqueue_t *rq; + int i, j, k; + + for (i = 0; i < NR_CPUS; i++) { + runqueue_t *rq = cpu_rq(i); + prio_array_t *array; + + rq->active = rq->arrays + 0; + rq->expired = rq->arrays + 1; + spin_lock_init(&rq->lock); + rq->cpu = i; + + for (j = 0; j < 2; j++) { + array = rq->arrays + j; + array->rq = rq; + array->lock = &rq->lock; + for (k = 0; k < MAX_PRIO; k++) + INIT_LIST_HEAD(array->queue + k); + memset(array->bitmap, 0xff, BITMAP_SIZE); + // zero delimiter for bitsearch + clear_bit(MAX_PRIO, array->bitmap); + } + } /* * We have to do a little magic to get the first * process right in SMP mode. */ - int cpu = smp_processor_id(); - int nr; - - init_task.processor = cpu; + rq = this_rq(); + rq->curr = current; + rq->idle = NULL; + wake_up_process(current); - for(nr = 0; nr < PIDHASH_SZ; nr++) - pidhash[nr] = NULL; - - fill_tslice_map(); + for (i = 0; i < PIDHASH_SZ; i++) + pidhash[i] = NULL; init_timervecs(); - init_bh(TIMER_BH, timer_bh); init_bh(TQUEUE_BH, tqueue_bh); init_bh(IMMEDIATE_BH, immediate_bh); @@ -1352,5 +1334,5 @@ void __init sched_init(void) * The boot idle thread does lazy MMU switching as well: */ atomic_inc(&init_mm.mm_count); - enter_lazy_tlb(&init_mm, current, cpu); + enter_lazy_tlb(&init_mm, current, smp_processor_id()); } diff --git a/kernel/signal.c b/kernel/signal.c index 44acecd851c7..b5b866aa3527 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -478,12 +478,9 @@ static inline void signal_wake_up(struct task_struct *t) * process of changing - but no harm is done by that * other than doing an extra (lightweight) IPI interrupt. 
diff --git a/kernel/sys.c b/kernel/sys.c
index 765761a4e71b..21c21ca8bbe6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -220,10 +220,10 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
}
if (error == -ESRCH)
error = 0;
- if (niceval < p->nice && !capable(CAP_SYS_NICE))
+ if (niceval < p->__nice && !capable(CAP_SYS_NICE))
error = -EACCES;
else
- p->nice = niceval;
+ set_user_nice(p, niceval);
}
read_unlock(&tasklist_lock);
@@ -249,7 +249,7 @@ asmlinkage long sys_getpriority(int which, int who)
long niceval;
if (!proc_sel(p, which, who))
continue;
- niceval = 20 - p->nice;
+ niceval = 20 - p->__nice;
if (niceval > retval)
retval = niceval;
}
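The sys.c hunks only rename the field (p->nice becomes p->__nice) and route updates through the new set_user_nice() helper; the user-visible encoding is unchanged. sys_getpriority() still returns 20 - nice, a value in 1..40, so negative return values stay free to signal errors. As I understand the usual libc behaviour, the getpriority() wrapper converts that raw value back to the familiar -20..19 nice number, which a raw syscall makes visible; a small user-space check of that encoding (an illustration, not part of the patch):

```c
#include <sys/resource.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>

/* Shows the "20 - nice" encoding visible in sys_getpriority() above:
 * the raw syscall returns 20 - nice, while the libc wrapper is expected
 * to translate it back to the nice value. */
int main(void)
{
	if (setpriority(PRIO_PROCESS, 0, 5) != 0) {	/* ask for nice 5 */
		perror("setpriority");
		return 1;
	}

	errno = 0;
	int cooked = getpriority(PRIO_PROCESS, 0);		/* libc view */
	long raw = syscall(SYS_getpriority, PRIO_PROCESS, 0);	/* kernel view */

	printf("nice via getpriority(): %d\n", cooked);	/* expect 5 */
	printf("raw syscall value:      %ld\n", raw);	/* expect 20 - 5 = 15 */
	return 0;
}
```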
diff --git a/kernel/timer.c b/kernel/timer.c
index c56ec37ea817..ce3945cda799 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -25,6 +25,8 @@
#include <asm/uaccess.h>
+struct kernel_stat kstat;
+
/*
* Timekeeping variables
*/
@@ -584,13 +586,16 @@ void update_process_times(int user_tick)
update_one_process(p, user_tick, system, cpu);
if (p->pid) {
expire_task(p);
- if (p->nice > 0)
+ if (p->__nice > 0)
kstat.per_cpu_nice[cpu] += user_tick;
else
kstat.per_cpu_user[cpu] += user_tick;
kstat.per_cpu_system[cpu] += system;
- } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
- kstat.per_cpu_system[cpu] += system;
+ } else {
+ idle_tick();
+ if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
+ kstat.per_cpu_system[cpu] += system;
+ }
}
/*
@@ -791,6 +796,89 @@ asmlinkage long sys_getegid(void)
#endif
+static void process_timeout(unsigned long __data)
+{
+ wake_up_process((task_t *)__data);
+}
+
+/**
+ * schedule_timeout - sleep until timeout
+ * @timeout: timeout value in jiffies
+ *
+ * Make the current task sleep until @timeout jiffies have
+ * elapsed. The routine will return immediately unless
+ * the current task state has been set (see set_current_state()).
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
+ * pass before the routine returns. The routine will return 0
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task. In this case the remaining time
+ * in jiffies will be returned, or 0 if the timer expired in time
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ *
+ * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
+ * the CPU away without a bound on the timeout. In this case the return
+ * value will be %MAX_SCHEDULE_TIMEOUT.
+ *
+ * In all cases the return value is guaranteed to be non-negative.
+ */
+signed long schedule_timeout(signed long timeout)
+{
+ struct timer_list timer;
+ unsigned long expire;
+
+ switch (timeout)
+ {
+ case MAX_SCHEDULE_TIMEOUT:
+ /*
+ * These two special cases are useful to be comfortable
+ * in the caller. Nothing more. We could take
+ * MAX_SCHEDULE_TIMEOUT from one of the negative value
+ * but I' d like to return a valid offset (>=0) to allow
+ * the caller to do everything it want with the retval.
+ */
+ schedule();
+ goto out;
+ default:
+ /*
+ * Another bit of PARANOID. Note that the retval will be
+ * 0 since no piece of kernel is supposed to do a check
+ * for a negative retval of schedule_timeout() (since it
+ * should never happens anyway). You just have the printk()
+ * that will tell you if something is gone wrong and where.
+ */
+ if (timeout < 0)
+ {
+ printk(KERN_ERR "schedule_timeout: wrong timeout "
+ "value %lx from %p\n", timeout,
+ __builtin_return_address(0));
+ current->state = TASK_RUNNING;
+ goto out;
+ }
+ }
+
+ expire = timeout + jiffies;
+
+ init_timer(&timer);
+ timer.expires = expire;
+ timer.data = (unsigned long) current;
+ timer.function = process_timeout;
+
+ add_timer(&timer);
+ schedule();
+ del_timer_sync(&timer);
+
+ timeout = expire - jiffies;
+
+ out:
+ return timeout < 0 ? 0 : timeout;
+}
+
/* Thread ID - the internal kernel "pid" */
asmlinkage long sys_gettid(void)
{
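For reference, a typical caller follows the kerneldoc just added above: set the task state first, then call schedule_timeout() and interpret the return value. The snippet below is only an illustrative sketch (the function name is invented, and it assumes the usual <linux/sched.h> declarations); it is not part of the patch:

```c
#include <linux/sched.h>

/* Sleep for up to one second, waking early if a signal arrives. */
static signed long example_interruptible_sleep(void)
{
	signed long remaining;

	/* The state must be set first; with the default TASK_RUNNING,
	 * schedule_timeout() returns immediately. */
	set_current_state(TASK_INTERRUPTIBLE);
	remaining = schedule_timeout(HZ);	/* HZ jiffies == one second */

	/* remaining == 0: the full timeout elapsed.
	 * remaining  > 0: a signal woke us early; that many jiffies of the
	 *                 timeout were left.
	 * Either way the task is back in TASK_RUNNING here. */
	return remaining;
}
```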
