| field | value | date |
|---|---|---|
| author | Linus Torvalds <torvalds@athlon.transmeta.com> | 2002-02-05 00:12:58 -0800 |
| committer | Linus Torvalds <torvalds@athlon.transmeta.com> | 2002-02-05 00:12:58 -0800 |
| commit | 908920b1d370e7a5c301d14cfce10c310be19be3 | |
| tree | 25606bf2c5215cc0e25c9647612390b88622b279 /kernel | |
| parent | d01b7e92c0020f89b4bb33fe61c0dffab7078b42 | |
v2.5.1.9 -> v2.5.1.10
- Kai Germaschewski: ISDN updates
- Al Viro: start moving buffer cache indexing to "struct block_device *"
- Greg KH: USB update
- Russell King: fix up some ARM merge issues
- Ingo Molnar: scalable scheduler
Diffstat (limited to 'kernel')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | kernel/capability.c | 2 |
| -rw-r--r-- | kernel/exit.c | 146 |
| -rw-r--r-- | kernel/fork.c | 52 |
| -rw-r--r-- | kernel/ksyms.c | 7 |
| -rw-r--r-- | kernel/printk.c | 1 |
| -rw-r--r-- | kernel/ptrace.c | 15 |
| -rw-r--r-- | kernel/sched.c | 1458 |
| -rw-r--r-- | kernel/signal.c | 9 |
| -rw-r--r-- | kernel/softirq.c | 28 |
| -rw-r--r-- | kernel/sys.c | 6 |
| -rw-r--r-- | kernel/timer.c | 94 |
11 files changed, 982 insertions, 836 deletions
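The bulk of the patch is the rewritten kernel/sched.c: every CPU gets its own runqueue with two priority arrays ("active" and "expired") plus a per-priority bitmap, so picking the next task no longer scans the whole run list. The sketch below is a minimal user-space model of that idea, not code from the patch itself; the singly-linked queues, the `array_init()`/`pick_next()` helpers, the `comm` field, the linear bitmap scan and the example priorities are illustrative stand-ins for the kernel's `list_t` machinery, `sched_find_first_zero_bit()` and per-runqueue spinlocks. As in the patch, a *cleared* bit marks a non-empty priority queue, and when the active array drains, the two arrays are swapped.

```c
/*
 * Illustrative sketch only: a user-space model of the per-CPU runqueue
 * introduced by this patch.  Two priority arrays and a bitmap make the
 * pick-next step O(1): find the first non-empty priority queue, take its
 * head.  Locking, SMP balancing and timeslice handling are omitted.
 */
#include <stdio.h>
#include <string.h>

#define MAX_PRIO 128	/* as in the patch: 0..99 RT, the rest SCHED_OTHER */

struct task {
	const char *comm;
	int prio;
	struct task *next;
};

struct prio_array {
	int nr_active;
	unsigned char bitmap[(MAX_PRIO + 7) / 8];
	struct task *queue[MAX_PRIO];	/* simplified: one singly linked list per prio */
};

struct runqueue {
	struct prio_array *active, *expired, arrays[2];
};

static void array_init(struct prio_array *a)
{
	memset(a, 0, sizeof(*a));
	memset(a->bitmap, 0xff, sizeof(a->bitmap));	/* all bits set: every queue empty */
}

static void enqueue_task(struct task *p, struct prio_array *a)
{
	p->next = a->queue[p->prio];
	a->queue[p->prio] = p;
	a->bitmap[p->prio / 8] &= (unsigned char)~(1u << (p->prio % 8));	/* clear = non-empty */
	a->nr_active++;
}

static struct task *pick_next(struct runqueue *rq)
{
	struct prio_array *a = rq->active;
	int idx;

	if (!a->nr_active) {		/* active array drained: swap the arrays */
		rq->active = rq->expired;
		rq->expired = a;
		a = rq->active;
	}
	for (idx = 0; idx < MAX_PRIO; idx++)	/* the kernel uses a bit-search primitive here */
		if (!(a->bitmap[idx / 8] & (1u << (idx % 8))))
			return a->queue[idx];
	return NULL;			/* nothing runnable: the kernel would pick rq->idle */
}

int main(void)
{
	struct runqueue rq;
	struct task a = { "editor", 110 };	/* higher priority (lower number) */
	struct task b = { "compile", 120 };

	rq.active = &rq.arrays[0];
	rq.expired = &rq.arrays[1];
	array_init(rq.active);
	array_init(rq.expired);

	enqueue_task(&a, rq.active);
	enqueue_task(&b, rq.active);
	printf("next: %s\n", pick_next(&rq)->comm);	/* prints "editor" */
	return 0;
}
```

The real code keeps the same invariant but stores tasks on kernel `list_t` queues, uses `sched_find_first_zero_bit()` for the lookup, and protects each runqueue with its own spinlock instead of the old global runqueue_lock.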
diff --git a/kernel/capability.c b/kernel/capability.c index 7aaf1a423011..8fd0d451069d 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -8,6 +8,8 @@ #include <linux/mm.h> #include <asm/uaccess.h> +unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ + kernel_cap_t cap_bset = CAP_INIT_EFF_SET; /* Note: never hold tasklist_lock while spinning for this one */ diff --git a/kernel/exit.c b/kernel/exit.c index a3490404ff33..04774ebbc2e4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -29,49 +29,39 @@ int getrusage(struct task_struct *, int, struct rusage *); static void release_task(struct task_struct * p) { - if (p != current) { + unsigned long flags; + + if (p == current) + BUG(); #ifdef CONFIG_SMP - /* - * Wait to make sure the process isn't on the - * runqueue (active on some other CPU still) - */ - for (;;) { - task_lock(p); - if (!task_has_cpu(p)) - break; - task_unlock(p); - do { - cpu_relax(); - barrier(); - } while (task_has_cpu(p)); - } - task_unlock(p); + wait_task_inactive(p); #endif - atomic_dec(&p->user->processes); - free_uid(p->user); - unhash_process(p); - - release_thread(p); - current->cmin_flt += p->min_flt + p->cmin_flt; - current->cmaj_flt += p->maj_flt + p->cmaj_flt; - current->cnswap += p->nswap + p->cnswap; - /* - * Potentially available timeslices are retrieved - * here - this way the parent does not get penalized - * for creating too many processes. - * - * (this cannot be used to artificially 'generate' - * timeslices, because any timeslice recovered here - * was given away by the parent in the first place.) - */ - current->time_slice += p->time_slice; - if (current->time_slice > MAX_TSLICE) - current->time_slice = MAX_TSLICE; - p->pid = 0; - free_task_struct(p); - } else { - printk("task releasing itself\n"); - } + atomic_dec(&p->user->processes); + free_uid(p->user); + unhash_process(p); + + release_thread(p); + current->cmin_flt += p->min_flt + p->cmin_flt; + current->cmaj_flt += p->maj_flt + p->cmaj_flt; + current->cnswap += p->nswap + p->cnswap; + /* + * Potentially available timeslices are retrieved + * here - this way the parent does not get penalized + * for creating too many processes. + * + * (this cannot be used to artificially 'generate' + * timeslices, because any timeslice recovered here + * was given away by the parent in the first place.) + */ + __save_flags(flags); + __cli(); + current->time_slice += p->time_slice; + if (current->time_slice > MAX_TIMESLICE) + current->time_slice = MAX_TIMESLICE; + __restore_flags(flags); + + p->pid = 0; + free_task_struct(p); } /* @@ -151,6 +141,80 @@ static inline int has_stopped_jobs(int pgrp) return retval; } +/** + * reparent_to_init() - Reparent the calling kernel thread to the init task. + * + * If a kernel thread is launched as a result of a system call, or if + * it ever exits, it should generally reparent itself to init so that + * it is correctly cleaned up on exit. + * + * The various task state such as scheduling policy and priority may have + * been inherited from a user process, so we reset them to sane values here. + * + * NOTE that reparent_to_init() gives the caller full capabilities. 
+ */ +void reparent_to_init(void) +{ + write_lock_irq(&tasklist_lock); + + /* Reparent to init */ + REMOVE_LINKS(current); + current->p_pptr = child_reaper; + current->p_opptr = child_reaper; + SET_LINKS(current); + + /* Set the exit signal to SIGCHLD so we signal init on exit */ + current->exit_signal = SIGCHLD; + + current->ptrace = 0; + if ((current->policy == SCHED_OTHER) && + (current->__nice < DEF_USER_NICE)) + set_user_nice(current, DEF_USER_NICE); + /* cpus_allowed? */ + /* rt_priority? */ + /* signals? */ + current->cap_effective = CAP_INIT_EFF_SET; + current->cap_inheritable = CAP_INIT_INH_SET; + current->cap_permitted = CAP_FULL_SET; + current->keep_capabilities = 0; + memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim))); + current->user = INIT_USER; + + write_unlock_irq(&tasklist_lock); +} + +/* + * Put all the gunge required to become a kernel thread without + * attached user resources in one place where it belongs. + */ + +void daemonize(void) +{ + struct fs_struct *fs; + + + /* + * If we were started as result of loading a module, close all of the + * user space pages. We don't need them, and if we didn't close them + * they would be locked into memory. + */ + exit_mm(current); + + current->session = 1; + current->pgrp = 1; + current->tty = NULL; + + /* Become as one with the init task */ + + exit_fs(current); /* current->fs->count--; */ + fs = init_task.fs; + current->fs = fs; + atomic_inc(&fs->count); + exit_files(current); + current->files = init_task.files; + atomic_inc(¤t->files->count); +} + /* * When we die, we re-parent all our children. * Try to give them to another thread in our process diff --git a/kernel/fork.c b/kernel/fork.c index 4c3114eef4c4..3fcaeafeb535 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -29,7 +29,6 @@ /* The idle threads do not count.. */ int nr_threads; -int nr_running; int max_threads; unsigned long total_forks; /* Handle normal Linux uptimes. */ @@ -37,6 +36,8 @@ int last_pid; struct task_struct *pidhash[PIDHASH_SZ]; +rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ + void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait) { unsigned long flags; @@ -564,6 +565,7 @@ int do_fork(unsigned long clone_flags, unsigned long stack_start, struct pt_regs *regs, unsigned long stack_size) { int retval; + unsigned long flags; struct task_struct *p; struct completion vfork; @@ -617,8 +619,7 @@ int do_fork(unsigned long clone_flags, unsigned long stack_start, copy_flags(clone_flags, p); p->pid = get_pid(clone_flags); - p->run_list.next = NULL; - p->run_list.prev = NULL; + INIT_LIST_HEAD(&p->run_list); p->p_cptr = NULL; init_waitqueue_head(&p->wait_chldexit); @@ -644,14 +645,16 @@ int do_fork(unsigned long clone_flags, unsigned long stack_start, #ifdef CONFIG_SMP { int i; - p->cpus_runnable = ~0UL; - p->processor = current->processor; + + p->cpu = smp_processor_id(); + /* ?? should we just memset this ?? */ for(i = 0; i < smp_num_cpus; i++) p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0; spin_lock_init(&p->sigmask_lock); } #endif + p->array = NULL; p->lock_depth = -1; /* -1 = no lock */ p->start_time = jiffies; @@ -685,15 +688,27 @@ int do_fork(unsigned long clone_flags, unsigned long stack_start, p->pdeath_signal = 0; /* - * "share" dynamic priority between parent and child, thus the - * total amount of dynamic priorities in the system doesnt change, - * more scheduling fairness. This is only important in the first - * timeslice, on the long run the scheduling behaviour is unchanged. 
+ * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesnt change, + * resulting in more scheduling fairness. */ + __save_flags(flags); + __cli(); p->time_slice = (current->time_slice + 1) >> 1; current->time_slice >>= 1; - if (!current->time_slice) - current->need_resched = 1; + if (!current->time_slice) { + /* + * This case is rare, it happens when the parent has only + * a single jiffy left from its timeslice. Taking the + * runqueue lock is not a problem. + */ + current->time_slice = 1; + expire_task(current); + } + p->sleep_timestamp = p->run_timestamp = jiffies; + memset(p->sleep_hist, 0, sizeof(p->sleep_hist[0])*SLEEP_HIST_SIZE); + p->sleep_idx = 0; + __restore_flags(flags); /* * Ok, add it to the run-queues and make it @@ -730,10 +745,23 @@ int do_fork(unsigned long clone_flags, unsigned long stack_start, if (p->ptrace & PT_PTRACED) send_sig(SIGSTOP, p, 1); - wake_up_process(p); /* do this last */ +#define RUN_CHILD_FIRST 1 +#if RUN_CHILD_FIRST + wake_up_forked_process(p); /* do this last */ +#else + wake_up_process(p); /* do this last */ +#endif ++total_forks; if (clone_flags & CLONE_VFORK) wait_for_completion(&vfork); +#if RUN_CHILD_FIRST + else + /* + * Let the child process run first, to avoid most of the + * COW overhead when the child exec()s afterwards. + */ + current->need_resched = 1; +#endif fork_out: return retval; diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 56dd5a66a34b..eded2d7fb339 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -191,12 +191,12 @@ EXPORT_SYMBOL(notify_change); EXPORT_SYMBOL(set_blocksize); EXPORT_SYMBOL(sb_set_blocksize); EXPORT_SYMBOL(sb_min_blocksize); -EXPORT_SYMBOL(getblk); +EXPORT_SYMBOL(__getblk); EXPORT_SYMBOL(cdget); EXPORT_SYMBOL(cdput); EXPORT_SYMBOL(bdget); EXPORT_SYMBOL(bdput); -EXPORT_SYMBOL(bread); +EXPORT_SYMBOL(__bread); EXPORT_SYMBOL(__brelse); EXPORT_SYMBOL(__bforget); EXPORT_SYMBOL(ll_rw_block); @@ -441,6 +441,8 @@ EXPORT_SYMBOL(interruptible_sleep_on); EXPORT_SYMBOL(interruptible_sleep_on_timeout); EXPORT_SYMBOL(schedule); EXPORT_SYMBOL(schedule_timeout); +EXPORT_SYMBOL(sys_sched_yield); +EXPORT_SYMBOL(set_user_nice); EXPORT_SYMBOL(jiffies); EXPORT_SYMBOL(xtime); EXPORT_SYMBOL(do_gettimeofday); @@ -452,6 +454,7 @@ EXPORT_SYMBOL(loops_per_jiffy); EXPORT_SYMBOL(kstat); EXPORT_SYMBOL(nr_running); +EXPORT_SYMBOL(nr_context_switches); /* misc */ EXPORT_SYMBOL(panic); diff --git a/kernel/printk.c b/kernel/printk.c index 4505465f2dd7..365aed1d39ae 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -25,6 +25,7 @@ #include <linux/module.h> #include <linux/interrupt.h> /* For in_interrupt() */ #include <linux/config.h> +#include <linux/delay.h> #include <asm/uaccess.h> diff --git a/kernel/ptrace.c b/kernel/ptrace.c index b729b6f30c93..79af05d4f353 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -31,20 +31,7 @@ int ptrace_check_attach(struct task_struct *child, int kill) if (child->state != TASK_STOPPED) return -ESRCH; #ifdef CONFIG_SMP - /* Make sure the child gets off its CPU.. 
*/ - for (;;) { - task_lock(child); - if (!task_has_cpu(child)) - break; - task_unlock(child); - do { - if (child->state != TASK_STOPPED) - return -ESRCH; - barrier(); - cpu_relax(); - } while (task_has_cpu(child)); - } - task_unlock(child); + wait_task_inactive(child); #endif } diff --git a/kernel/sched.c b/kernel/sched.c index dc5eea2af910..22c4fc8114e2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -12,331 +12,257 @@ * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar */ -/* - * 'sched.c' is the main kernel file. It contains scheduling primitives - * (sleep_on, wakeup, schedule etc) as well as a number of simple system - * call functions (type getpid()), which just extract a field from - * current-task - */ - -#include <linux/config.h> #include <linux/mm.h> +#include <linux/nmi.h> #include <linux/init.h> +#include <asm/uaccess.h> #include <linux/smp_lock.h> -#include <linux/nmi.h> #include <linux/interrupt.h> -#include <linux/kernel_stat.h> -#include <linux/completion.h> -#include <linux/prefetch.h> -#include <linux/compiler.h> - -#include <asm/uaccess.h> #include <asm/mmu_context.h> -extern void timer_bh(void); -extern void tqueue_bh(void); -extern void immediate_bh(void); - -/* - * scheduler variables - */ +#define BITMAP_SIZE ((MAX_PRIO+7)/8) -unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ +typedef struct runqueue runqueue_t; -extern void mem_use(void); +struct prio_array { + int nr_active; + spinlock_t *lock; + runqueue_t *rq; + char bitmap[BITMAP_SIZE]; + list_t queue[MAX_PRIO]; +}; /* - * Scheduling quanta. + * This is the main, per-CPU runqueue data structure. * - * NOTE! The unix "nice" value influences how long a process - * gets. The nice value ranges from -20 to +19, where a -20 - * is a "high-priority" task, and a "+10" is a low-priority - * task. The default time slice for zero-nice tasks will be 37ms. - */ -#define NICE_RANGE 40 -#define MIN_NICE_TSLICE 10000 -#define MAX_NICE_TSLICE 90000 -#define TASK_TIMESLICE(p) ((int) ts_table[19 - (p)->nice]) - -static unsigned char ts_table[NICE_RANGE]; - -#define MM_AFFINITY_BONUS 1 - -/* - * Init task must be ok at boot for the ix86 as we will check its signals - * via the SMP irq return path. - */ - -struct task_struct * init_tasks[NR_CPUS] = {&init_task, }; - -/* - * The tasklist_lock protects the linked list of processes. + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the process migration code), lock + * acquire operations must be ordered by rq->cpu. * - * The runqueue_lock locks the parts that actually access - * and change the run-queues, and have to be interrupt-safe. - * - * If both locks are to be concurrently held, the runqueue_lock - * nests inside the tasklist_lock. - * - * task->alloc_lock nests inside tasklist_lock. + * The RT event id is used to avoid calling into the the RT scheduler + * if there is a RT task active in an SMP system but there is no + * RT scheduling activity otherwise. 
*/ -spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */ -rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ - -static LIST_HEAD(runqueue_head); - -static unsigned long rcl_curr; +static struct runqueue { + int cpu; + spinlock_t lock; + unsigned long nr_running, nr_switches, last_rt_event; + task_t *curr, *idle; + prio_array_t *active, *expired, arrays[2]; + char __pad [SMP_CACHE_BYTES]; +} runqueues [NR_CPUS] __cacheline_aligned; + +#define this_rq() (runqueues + smp_processor_id()) +#define task_rq(p) (runqueues + (p)->cpu) +#define cpu_rq(cpu) (runqueues + (cpu)) +#define cpu_curr(cpu) (runqueues[(cpu)].curr) +#define rt_task(p) ((p)->policy != SCHED_OTHER) + +#define lock_task_rq(rq,p,flags) \ +do { \ +repeat_lock_task: \ + rq = task_rq(p); \ + spin_lock_irqsave(&rq->lock, flags); \ + if (unlikely((rq)->cpu != (p)->cpu)) { \ + spin_unlock_irqrestore(&rq->lock, flags); \ + goto repeat_lock_task; \ + } \ +} while (0) + +#define unlock_task_rq(rq,p,flags) \ + spin_unlock_irqrestore(&rq->lock, flags) /* - * We align per-CPU scheduling data on cacheline boundaries, - * to prevent cacheline ping-pong. + * Adding/removing a task to/from a priority array: */ -static union { - struct schedule_data { - struct task_struct * curr; - cycles_t last_schedule; - } schedule_data; - char __pad [SMP_CACHE_BYTES]; -} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}}; - -#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr -#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule - -struct kernel_stat kstat; -extern struct task_struct *child_reaper; - -#ifdef CONFIG_SMP - -#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)]) -#define can_schedule(p,cpu) \ - ((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu)) - -#else - -#define idle_task(cpu) (&init_task) -#define can_schedule(p,cpu) (1) - -#endif +static inline void dequeue_task(struct task_struct *p, prio_array_t *array) +{ + array->nr_active--; + list_del_init(&p->run_list); + if (list_empty(array->queue + p->prio)) + __set_bit(p->prio, array->bitmap); +} -void scheduling_functions_start_here(void) { } +static inline void enqueue_task(struct task_struct *p, prio_array_t *array) +{ + list_add_tail(&p->run_list, array->queue + p->prio); + __clear_bit(p->prio, array->bitmap); + array->nr_active++; + p->array = array; +} /* - * This is the function that decides how desirable a process is.. - * You can weigh different processes against each other depending - * on what CPU they've run on lately etc to try to handle cache - * and TLB miss penalties. + * This is the per-process load estimator. Processes that generate + * more load than the system can handle get a priority penalty. * - * Return values: - * -1000: never select this - * 0: out of time, recalculate counters (but it might still be - * selected) - * +ve: "goodness" value (the larger, the better) - * +1000: realtime process, select this. + * The estimator uses a 4-entry load-history ringbuffer which is + * updated whenever a task is moved to/from the runqueue. The load + * estimate is also updated from the timer tick to get an accurate + * estimation of currently executing tasks as well. 
*/ +#define NEXT_IDX(idx) (((idx) + 1) % SLEEP_HIST_SIZE) -static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm) +static inline void update_sleep_avg_deactivate(task_t *p) { - int weight; - - /* - * select the current process after every other - * runnable process, but before the idle thread. - * Also, dont trigger a counter recalculation. - */ - weight = -1; - if (p->policy & SCHED_YIELD) - goto out; - - /* - * Non-RT process - normal case first. - */ - if (p->policy == SCHED_OTHER) { - /* - * Give the process a first-approximation goodness value - * according to the number of clock-ticks it has left. - * - * Don't do any other calculations if the time slice is - * over.. - */ - if (!p->time_slice) - return 0; - - weight = p->dyn_prio + 1; + unsigned int idx; + unsigned long j = jiffies, last_sample = p->run_timestamp / HZ, + curr_sample = j / HZ, delta = curr_sample - last_sample; + + if (unlikely(delta)) { + if (delta < SLEEP_HIST_SIZE) { + for (idx = 0; idx < delta; idx++) { + p->sleep_idx++; + p->sleep_idx %= SLEEP_HIST_SIZE; + p->sleep_hist[p->sleep_idx] = 0; + } + } else { + for (idx = 0; idx < SLEEP_HIST_SIZE; idx++) + p->sleep_hist[idx] = 0; + p->sleep_idx = 0; + } + } + p->sleep_timestamp = j; +} -#ifdef CONFIG_SMP - /* Give a largish advantage to the same processor... */ - /* (this is equivalent to penalizing other processors) */ - if (p->processor == this_cpu) - weight += PROC_CHANGE_PENALTY; +#if SLEEP_HIST_SIZE != 4 +# error update this code. #endif - /* .. and a slight advantage to the current MM */ - if (p->mm == this_mm || !p->mm) - weight += MM_AFFINITY_BONUS; - weight += 20 - p->nice; - goto out; - } +static inline unsigned int get_sleep_avg(task_t *p, unsigned long j) +{ + unsigned int sum; - /* - * Realtime process, select the first one on the - * runqueue (taking priorities within processes - * into account). - */ - weight = 1000 + p->rt_priority; -out: - return weight; + sum = p->sleep_hist[0]; + sum += p->sleep_hist[1]; + sum += p->sleep_hist[2]; + sum += p->sleep_hist[3]; + + return sum * HZ / ((SLEEP_HIST_SIZE-1)*HZ + (j % HZ)); } -/* - * the 'goodness value' of replacing a process on a given CPU. - * positive value means 'replace', zero or negative means 'dont'. - */ -static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu) +static inline void update_sleep_avg_activate(task_t *p, unsigned long j) { - return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm); + unsigned int idx; + unsigned long delta_ticks, last_sample = p->sleep_timestamp / HZ, + curr_sample = j / HZ, delta = curr_sample - last_sample; + + if (unlikely(delta)) { + if (delta < SLEEP_HIST_SIZE) { + p->sleep_hist[p->sleep_idx] += HZ - (p->sleep_timestamp % HZ); + p->sleep_idx++; + p->sleep_idx %= SLEEP_HIST_SIZE; + + for (idx = 1; idx < delta; idx++) { + p->sleep_idx++; + p->sleep_idx %= SLEEP_HIST_SIZE; + p->sleep_hist[p->sleep_idx] = HZ; + } + } else { + for (idx = 0; idx < SLEEP_HIST_SIZE; idx++) + p->sleep_hist[idx] = HZ; + p->sleep_idx = 0; + } + p->sleep_hist[p->sleep_idx] = 0; + delta_ticks = j % HZ; + } else + delta_ticks = j - p->sleep_timestamp; + p->sleep_hist[p->sleep_idx] += delta_ticks; + p->run_timestamp = j; } -/* - * This is ugly, but reschedule_idle() is very timing-critical. - * We are called with the runqueue spinlock held and we must - * not claim the tasklist_lock. 
- */ -static FASTCALL(void reschedule_idle(struct task_struct * p)); - -static void reschedule_idle(struct task_struct * p) +static inline void activate_task(task_t *p, runqueue_t *rq) { -#ifdef CONFIG_SMP - int this_cpu = smp_processor_id(); - struct task_struct *tsk, *target_tsk; - int cpu, best_cpu, i, max_prio; - cycles_t oldest_idle; + prio_array_t *array = rq->active; + unsigned long j = jiffies; + unsigned int sleep, load; + int penalty; + if (likely(p->run_timestamp == j)) + goto enqueue; /* - * shortcut if the woken up task's last CPU is - * idle now. + * Give the process a priority penalty if it has not slept often + * enough in the past. We scale the priority penalty according + * to the current load of the runqueue, and the 'load history' + * this process has. Eg. if the CPU has 3 processes running + * right now then a process that has slept more than two-thirds + * of the time is considered to be 'interactive'. The higher + * the load of the CPUs is, the easier it is for a process to + * get an non-interactivity penalty. */ - best_cpu = p->processor; - if (can_schedule(p, best_cpu)) { - tsk = idle_task(best_cpu); - if (cpu_curr(best_cpu) == tsk) { - int need_resched; -send_now_idle: - /* - * If need_resched == -1 then we can skip sending - * the IPI altogether, tsk->need_resched is - * actively watched by the idle thread. - */ - need_resched = tsk->need_resched; - tsk->need_resched = 1; - if ((best_cpu != this_cpu) && !need_resched) - smp_send_reschedule(best_cpu); - return; - } +#define MAX_PENALTY (MAX_USER_PRIO/3) + update_sleep_avg_activate(p, j); + sleep = get_sleep_avg(p, j); + load = HZ - sleep; + penalty = (MAX_PENALTY * load)/HZ; + if (!rt_task(p)) { + p->prio = NICE_TO_PRIO(p->__nice) + penalty; + if (p->prio > MAX_PRIO-1) + p->prio = MAX_PRIO-1; } +enqueue: + enqueue_task(p, array); + rq->nr_running++; +} - /* - * We know that the preferred CPU has a cache-affine current - * process, lets try to find a new idle CPU for the woken-up - * process. Select the least recently active idle CPU. (that - * one will have the least active cache context.) Also find - * the executing process which has the least priority. - */ - oldest_idle = (cycles_t) -1; - target_tsk = NULL; - max_prio = 0; - - for (i = 0; i < smp_num_cpus; i++) { - cpu = cpu_logical_map(i); - if (!can_schedule(p, cpu)) - continue; - tsk = cpu_curr(cpu); - /* - * We use the first available idle CPU. This creates - * a priority list between idle CPUs, but this is not - * a problem. - */ - if (tsk == idle_task(cpu)) { -#if defined(__i386__) && defined(CONFIG_SMP) - /* - * Check if two siblings are idle in the same - * physical package. Use them if found. 
- */ - if (smp_num_siblings == 2) { - if (cpu_curr(cpu_sibling_map[cpu]) == - idle_task(cpu_sibling_map[cpu])) { - oldest_idle = last_schedule(cpu); - target_tsk = tsk; - break; - } - - } -#endif - if (last_schedule(cpu) < oldest_idle) { - oldest_idle = last_schedule(cpu); - target_tsk = tsk; - } - } else { - if (oldest_idle == -1ULL) { - int prio = preemption_goodness(tsk, p, cpu); - - if (prio > max_prio) { - max_prio = prio; - target_tsk = tsk; - } - } - } - } - tsk = target_tsk; - if (tsk) { - if (oldest_idle != -1ULL) { - best_cpu = tsk->processor; - goto send_now_idle; - } - tsk->need_resched = 1; - if (tsk->processor != this_cpu) - smp_send_reschedule(tsk->processor); - } - return; - +static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) +{ + rq->nr_running--; + dequeue_task(p, p->array); + p->array = NULL; + update_sleep_avg_deactivate(p); +} -#else /* UP */ - int this_cpu = smp_processor_id(); - struct task_struct *tsk; +static inline void resched_task(task_t *p) +{ + int need_resched; - tsk = cpu_curr(this_cpu); - if (preemption_goodness(tsk, p, this_cpu) > 0) - tsk->need_resched = 1; -#endif + need_resched = p->need_resched; + wmb(); + p->need_resched = 1; + if (!need_resched) + smp_send_reschedule(p->cpu); } +#ifdef CONFIG_SMP + /* - * Careful! - * - * This has to add the process to the _beginning_ of the - * run-queue, not the end. See the comment about "This is - * subtle" in the scheduler proper.. + * Wait for a process to unschedule. This is used by the exit() and + * ptrace() code. */ -static inline void add_to_runqueue(struct task_struct * p) +void wait_task_inactive(task_t * p) { - p->dyn_prio += rcl_curr - p->rcl_last; - p->rcl_last = rcl_curr; - if (p->dyn_prio > MAX_DYNPRIO) - p->dyn_prio = MAX_DYNPRIO; - list_add(&p->run_list, &runqueue_head); - nr_running++; -} + unsigned long flags; + runqueue_t *rq; -static inline void move_last_runqueue(struct task_struct * p) -{ - list_del(&p->run_list); - list_add_tail(&p->run_list, &runqueue_head); +repeat: + rq = task_rq(p); + while (unlikely(rq->curr == p)) { + cpu_relax(); + barrier(); + } + lock_task_rq(rq, p, flags); + if (unlikely(rq->curr == p)) { + unlock_task_rq(rq, p, flags); + goto repeat; + } + unlock_task_rq(rq, p, flags); } -static inline void move_first_runqueue(struct task_struct * p) +/* + * Kick the remote CPU if the task is running currently, + * this code is used by the signal code to signal tasks + * which are in user-mode as quickly as possible. + * + * (Note that we do this lockless - if the task does anything + * while the message is in flight then it will notice the + * sigpending condition anyway.) + */ +void kick_if_running(task_t * p) { - list_del(&p->run_list); - list_add(&p->run_list, &runqueue_head); + if (p == task_rq(p)->curr) + resched_task(p); } +#endif /* * Wake up a process. Put it on the run-queue if it's not @@ -346,385 +272,394 @@ static inline void move_first_runqueue(struct task_struct * p) * "current->state = TASK_RUNNING" to mark yourself runnable * without the overhead of this. */ -static inline int try_to_wake_up(struct task_struct * p, int synchronous) +static int try_to_wake_up(task_t * p, int synchronous) { unsigned long flags; int success = 0; + runqueue_t *rq; - /* - * We want the common case fall through straight, thus the goto. 
- */ - spin_lock_irqsave(&runqueue_lock, flags); + lock_task_rq(rq, p, flags); p->state = TASK_RUNNING; - if (task_on_runqueue(p)) - goto out; - add_to_runqueue(p); - if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id()))) - reschedule_idle(p); - success = 1; -out: - spin_unlock_irqrestore(&runqueue_lock, flags); + if (!p->array) { + if (!rt_task(p) && synchronous && (smp_processor_id() < p->cpu)) { + spin_lock(&this_rq()->lock); + p->cpu = smp_processor_id(); + activate_task(p, this_rq()); + spin_unlock(&this_rq()->lock); + } else { + activate_task(p, rq); + if ((rq->curr == rq->idle) || + (p->prio < rq->curr->prio)) + resched_task(rq->curr); + } + success = 1; + } + unlock_task_rq(rq, p, flags); return success; } -inline int wake_up_process(struct task_struct * p) +inline int wake_up_process(task_t * p) { return try_to_wake_up(p, 0); } -static void process_timeout(unsigned long __data) +void wake_up_forked_process(task_t * p) { - wake_up_process((struct task_struct *)__data); -} + runqueue_t *rq = this_rq(); -/** - * schedule_timeout - sleep until timeout - * @timeout: timeout value in jiffies - * - * Make the current task sleep until @timeout jiffies have - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to - * pass before the routine returns. The routine will return 0 - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. In this case the remaining time - * in jiffies will be returned, or 0 if the timer expired in time - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - * - * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule - * the CPU away without a bound on the timeout. In this case the return - * value will be %MAX_SCHEDULE_TIMEOUT. - * - * In all cases the return value is guaranteed to be non-negative. - */ -signed long schedule_timeout(signed long timeout) -{ - struct timer_list timer; - unsigned long expire; - - switch (timeout) - { - case MAX_SCHEDULE_TIMEOUT: - /* - * These two special cases are useful to be comfortable - * in the caller. Nothing more. We could take - * MAX_SCHEDULE_TIMEOUT from one of the negative value - * but I' d like to return a valid offset (>=0) to allow - * the caller to do everything it want with the retval. - */ - schedule(); - goto out; - default: - /* - * Another bit of PARANOID. Note that the retval will be - * 0 since no piece of kernel is supposed to do a check - * for a negative retval of schedule_timeout() (since it - * should never happens anyway). You just have the printk() - * that will tell you if something is gone wrong and where. - */ - if (timeout < 0) - { - printk(KERN_ERR "schedule_timeout: wrong timeout " - "value %lx from %p\n", timeout, - __builtin_return_address(0)); - current->state = TASK_RUNNING; - goto out; - } + spin_lock_irq(&rq->lock); + p->state = TASK_RUNNING; + if (!rt_task(p)) { + p->prio += MAX_USER_PRIO/10; + if (p->prio > MAX_PRIO-1) + p->prio = MAX_PRIO-1; } + activate_task(p, rq); + spin_unlock_irq(&rq->lock); +} - expire = timeout + jiffies; - - init_timer(&timer); - timer.expires = expire; - timer.data = (unsigned long) current; - timer.function = process_timeout; - - add_timer(&timer); - schedule(); - del_timer_sync(&timer); - - timeout = expire - jiffies; - - out: - return timeout < 0 ? 
0 : timeout; +asmlinkage void schedule_tail(task_t *prev) +{ + spin_unlock_irq(&this_rq()->lock); } -/* - * schedule_tail() is getting called from the fork return path. This - * cleans up all remaining scheduler things, without impacting the - * common case. - */ -static inline void __schedule_tail(struct task_struct *prev) +static inline void context_switch(task_t *prev, task_t *next) { -#ifdef CONFIG_SMP - int policy; + struct mm_struct *mm = next->mm; + struct mm_struct *oldmm = prev->active_mm; - /* - * prev->policy can be written from here only before `prev' - * can be scheduled (before setting prev->cpus_runnable to ~0UL). - * Of course it must also be read before allowing prev - * to be rescheduled, but since the write depends on the read - * to complete, wmb() is enough. (the spin_lock() acquired - * before setting cpus_runnable is not enough because the spin_lock() - * common code semantics allows code outside the critical section - * to enter inside the critical section) - */ - policy = prev->policy; - prev->policy = policy & ~SCHED_YIELD; - wmb(); + prepare_to_switch(); - /* - * fast path falls through. We have to clear cpus_runnable before - * checking prev->state to avoid a wakeup race. Protect against - * the task exiting early. - */ - task_lock(prev); - task_release_cpu(prev); - mb(); - if (prev->state == TASK_RUNNING) - goto needs_resched; + if (!mm) { + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next, smp_processor_id()); + } else + switch_mm(oldmm, mm, next, smp_processor_id()); -out_unlock: - task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */ - return; + if (!prev->mm) { + prev->active_mm = NULL; + mmdrop(oldmm); + } /* - * Slow path - we 'push' the previous process and - * reschedule_idle() will attempt to find a new - * processor for it. (but it might preempt the - * current process as well.) We must take the runqueue - * lock and re-check prev->state to be correct. It might - * still happen that this process has a preemption - * 'in progress' already - but this is not a problem and - * might happen in other circumstances as well. + * Here we just switch the register state and the stack. There are + * 3 processes affected by a context switch: + * + * prev ==> .... ==> (last => next) + * + * It's the 'much more previous' 'prev' that is on next's stack, + * but prev is set to (the just run) 'last' process by switch_to(). + * This might sound slightly confusing but makes tons of sense. 
*/ -needs_resched: - { - unsigned long flags; + switch_to(prev, next, prev); +} - /* - * Avoid taking the runqueue lock in cases where - * no preemption-check is necessery: - */ - if ((prev == idle_task(smp_processor_id())) || - (policy & SCHED_YIELD)) - goto out_unlock; +unsigned long nr_running(void) +{ + unsigned long i, sum = 0; - spin_lock_irqsave(&runqueue_lock, flags); - if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev)) - reschedule_idle(prev); - spin_unlock_irqrestore(&runqueue_lock, flags); - goto out_unlock; - } -#else - prev->policy &= ~SCHED_YIELD; -#endif /* CONFIG_SMP */ + for (i = 0; i < smp_num_cpus; i++) + sum += cpu_rq(i)->nr_running; + + return sum; } -asmlinkage void schedule_tail(struct task_struct *prev) +unsigned long nr_context_switches(void) { - __schedule_tail(prev); + unsigned long i, sum = 0; + + for (i = 0; i < smp_num_cpus; i++) + sum += cpu_rq(i)->nr_switches; + + return sum; } -void expire_task(struct task_struct *p) +static inline unsigned long max_rq_len(void) { - if (unlikely(!p->time_slice)) - goto need_resched; + unsigned long i, curr, max = 0; - if (!--p->time_slice) { - if (p->dyn_prio) - p->dyn_prio--; -need_resched: - p->need_resched = 1; + for (i = 0; i < smp_num_cpus; i++) { + curr = cpu_rq(i)->nr_running; + if (curr > max) + max = curr; } + return max; } /* - * 'schedule()' is the scheduler function. It's a very simple and nice - * scheduler: it's not perfect, but certainly works for most things. - * - * The goto is "interesting". + * Current runqueue is empty, try to find work on + * other runqueues. * - * NOTE!! Task 0 is the 'idle' task, which gets called when no other - * tasks can run. It can not be killed, and it cannot sleep. The 'state' - * information in task[0] is never used. + * We call this with the current runqueue locked, + * irqs disabled. */ -asmlinkage void schedule(void) +static void load_balance(runqueue_t *this_rq) { - struct schedule_data * sched_data; - struct task_struct *prev, *next, *p; - struct list_head *tmp; - int this_cpu, c; - - - spin_lock_prefetch(&runqueue_lock); - - if (!current->active_mm) BUG(); -need_resched_back: - prev = current; - this_cpu = prev->processor; - - if (unlikely(in_interrupt())) { - printk("Scheduling in interrupt\n"); - BUG(); - } - - release_kernel_lock(prev, this_cpu); - + int nr_tasks, load, prev_max_load, max_load, idx, i; + task_t *next = this_rq->idle, *tmp; + runqueue_t *busiest, *rq_tmp; + prio_array_t *array; + list_t *head, *curr; + + prev_max_load = max_rq_len(); + nr_tasks = prev_max_load - this_rq->nr_running; /* - * 'sched_data' is protected by the fact that we can run - * only one process per CPU. + * It needs an at least ~10% imbalance to trigger balancing: */ - sched_data = & aligned_data[this_cpu].schedule_data; - - spin_lock_irq(&runqueue_lock); + if (nr_tasks <= 1 + prev_max_load/8) + return; + prev_max_load++; - /* move an exhausted RR process to be last.. */ - if (unlikely(prev->policy == SCHED_RR)) - if (!prev->time_slice) { - prev->time_slice = TASK_TIMESLICE(prev); - move_last_runqueue(prev); - } - - switch (prev->state) { - case TASK_INTERRUPTIBLE: - if (signal_pending(prev)) { - prev->state = TASK_RUNNING; - break; +repeat_search: + /* + * We search all runqueues to find the most busy one. + * We do this lockless to reduce cache-bouncing overhead, + * we re-check the source CPU with the lock held. 
+ */ + busiest = NULL; + max_load = 0; + for (i = 0; i < smp_num_cpus; i++) { + rq_tmp = cpu_rq(i); + load = rq_tmp->nr_running; + if ((load > max_load) && (load < prev_max_load) && + (rq_tmp != this_rq)) { + busiest = rq_tmp; + max_load = load; } - default: - del_from_runqueue(prev); - case TASK_RUNNING:; } - prev->need_resched = 0; + + if (likely(!busiest)) + return; + if (max_load <= this_rq->nr_running) + return; + prev_max_load = max_load; + if (busiest->cpu < this_rq->cpu) { + spin_unlock(&this_rq->lock); + spin_lock(&busiest->lock); + spin_lock(&this_rq->lock); + } else + spin_lock(&busiest->lock); + if (busiest->nr_running <= this_rq->nr_running + 1) + goto out_unlock; /* - * this is the scheduler proper: + * We first consider expired tasks. Those will likely not run + * in the near future, thus switching CPUs has the least effect + * on them. */ + if (busiest->expired->nr_active) + array = busiest->expired; + else + array = busiest->active; -repeat_schedule: +new_array: /* - * Default process to select.. + * Load-balancing does not affect RT tasks, so we start the + * searching at priority 128. */ - next = idle_task(this_cpu); - c = -1000; - list_for_each(tmp, &runqueue_head) { - p = list_entry(tmp, struct task_struct, run_list); - if (can_schedule(p, this_cpu)) { - int weight = goodness(p, this_cpu, prev->active_mm); - if (weight > c) - c = weight, next = p; + idx = MAX_RT_PRIO; +skip_bitmap: + idx = find_next_zero_bit(array->bitmap, MAX_PRIO, idx); + if (idx == MAX_PRIO) { + if (array == busiest->expired) { + array = busiest->active; + goto new_array; } + spin_unlock(&busiest->lock); + goto repeat_search; } - /* Do we need to re-calculate counters? */ - if (unlikely(!c)) { - rcl_curr++; - list_for_each(tmp, &runqueue_head) { - p = list_entry(tmp, struct task_struct, run_list); - p->time_slice = TASK_TIMESLICE(p); - p->rcl_last = rcl_curr; - } - goto repeat_schedule; + head = array->queue + idx; + curr = head->next; +skip_queue: + tmp = list_entry(curr, task_t, run_list); + if ((tmp == busiest->curr) || !(tmp->cpus_allowed & (1 << smp_processor_id()))) { + curr = curr->next; + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; } - + next = tmp; /* - * from this point on nothing can prevent us from - * switching to the next task, save this fact in - * sched_data. + * take the task out of the other runqueue and + * put it into this one: */ - sched_data->curr = next; - task_set_cpu(next, this_cpu); - spin_unlock_irq(&runqueue_lock); - - if (unlikely(prev == next)) { - /* We won't go through the normal tail, so do this by hand */ - prev->policy &= ~SCHED_YIELD; - goto same_process; + dequeue_task(next, array); + busiest->nr_running--; + next->cpu = smp_processor_id(); + this_rq->nr_running++; + enqueue_task(next, this_rq->active); + if (next->prio < current->prio) + current->need_resched = 1; + if (--nr_tasks) { + if (array == busiest->expired) { + array = busiest->active; + goto new_array; + } + spin_unlock(&busiest->lock); + goto repeat_search; } +out_unlock: + spin_unlock(&busiest->lock); +} -#ifdef CONFIG_SMP - /* - * maintain the per-process 'last schedule' value. - * (this has to be recalculated even if we reschedule to - * the same process) Currently this is only used on SMP, - * and it's approximate, so we do not have to maintain - * it while holding the runqueue spinlock. 
- */ - sched_data->last_schedule = get_cycles(); +#define REBALANCE_TICK (HZ/100) - /* - * We drop the scheduler lock early (it's a global spinlock), - * thus we have to lock the previous process from getting - * rescheduled during switch_to(). - */ +void idle_tick(void) +{ + unsigned long flags; -#endif /* CONFIG_SMP */ + if (!(jiffies % REBALANCE_TICK) && likely(this_rq()->curr != NULL)) { + spin_lock_irqsave(&this_rq()->lock, flags); + load_balance(this_rq()); + spin_unlock_irqrestore(&this_rq()->lock, flags); + } +} + +void expire_task(task_t *p) +{ + runqueue_t *rq = this_rq(); + unsigned long flags; - kstat.context_swtch++; + if (p->array != rq->active) { + p->need_resched = 1; + return; + } /* - * there are 3 processes which are affected by a context switch: - * - * prev == .... ==> (last => next) - * - * It's the 'much more previous' 'prev' that is on next's stack, - * but prev is set to (the just run) 'last' process by switch_to(). - * This might sound slightly confusing but makes tons of sense. + * The task cannot change CPUs because it's the current task. */ - prepare_to_switch(); - { - struct mm_struct *mm = next->mm; - struct mm_struct *oldmm = prev->active_mm; - if (!mm) { - if (next->active_mm) BUG(); - next->active_mm = oldmm; - atomic_inc(&oldmm->mm_count); - enter_lazy_tlb(oldmm, next, this_cpu); - } else { - if (next->active_mm != mm) BUG(); - switch_mm(oldmm, mm, next, this_cpu); - } + spin_lock_irqsave(&rq->lock, flags); + if ((p->policy != SCHED_FIFO) && !--p->time_slice) { + p->need_resched = 1; + if (rt_task(p)) + p->time_slice = RT_PRIO_TO_TIMESLICE(p->prio); + else + p->time_slice = PRIO_TO_TIMESLICE(p->prio); - if (!prev->mm) { - prev->active_mm = NULL; - mmdrop(oldmm); + /* + * Timeslice used up - discard any possible + * priority penalty: + */ + dequeue_task(p, rq->active); + /* + * Tasks that have nice values of -20 ... -15 are put + * back into the active array. If they use up too much + * CPU time then they'll get a priority penalty anyway + * so this can not starve other processes accidentally. + * Otherwise this is pretty handy for sysadmins ... + */ + if (p->prio <= MAX_RT_PRIO + MAX_PENALTY/2) + enqueue_task(p, rq->active); + else + enqueue_task(p, rq->expired); + } else { + /* + * Deactivate + activate the task so that the + * load estimator gets updated properly: + */ + if (!rt_task(p)) { + deactivate_task(p, rq); + activate_task(p, rq); } } + load_balance(rq); + spin_unlock_irqrestore(&rq->lock, flags); +} - /* - * This just switches the register state and the - * stack. - */ - switch_to(prev, next, prev); - __schedule_tail(prev); +void scheduling_functions_start_here(void) { } + +/* + * 'schedule()' is the main scheduler function. + */ +asmlinkage void schedule(void) +{ + task_t *prev, *next; + prio_array_t *array; + runqueue_t *rq; + list_t *queue; + int idx; + + if (unlikely(in_interrupt())) + BUG(); +need_resched_back: + prev = current; + release_kernel_lock(prev, smp_processor_id()); + rq = this_rq(); + spin_lock_irq(&rq->lock); + + switch (prev->state) { + case TASK_INTERRUPTIBLE: + if (unlikely(signal_pending(prev))) { + prev->state = TASK_RUNNING; + break; + } + default: + deactivate_task(prev, rq); + case TASK_RUNNING: + } +pick_next_task: + if (unlikely(!rq->nr_running)) { + load_balance(rq); + if (rq->nr_running) + goto pick_next_task; + next = rq->idle; + goto switch_tasks; + } + + array = rq->active; + if (unlikely(!array->nr_active)) { + /* + * Switch the active and expired arrays. 
+ */ + rq->active = rq->expired; + rq->expired = array; + array = rq->active; + } + + idx = sched_find_first_zero_bit(array->bitmap); + queue = array->queue + idx; + next = list_entry(queue->next, task_t, run_list); + +switch_tasks: + prev->need_resched = 0; + + if (likely(prev != next)) { + rq->nr_switches++; + rq->curr = next; + next->cpu = prev->cpu; + context_switch(prev, next); + /* + * The runqueue pointer might be from another CPU + * if the new task was last running on a different + * CPU - thus re-load it. + */ + barrier(); + rq = this_rq(); + } + spin_unlock_irq(&rq->lock); -same_process: reacquire_kernel_lock(current); - if (current->need_resched) + if (unlikely(current->need_resched)) goto need_resched_back; return; } /* - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything - * up. If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the - * non-exclusive tasks and one exclusive task. + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. * * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero - * in this (rare) case, and we handle it by contonuing to scan the queue. + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue. */ static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode, int nr_exclusive, const int sync) { struct list_head *tmp; - struct task_struct *p; + task_t *p; list_for_each(tmp,&q->task_list) { unsigned int state; @@ -853,8 +788,93 @@ long sleep_on_timeout(wait_queue_head_t *q, long timeout) return timeout; } +/* + * Change the current task's CPU affinity. Migrate the process to a + * proper CPU and schedule away if the current CPU is removed from + * the allowed bitmask. + */ +void set_cpus_allowed(task_t *p, unsigned long new_mask) +{ + runqueue_t *this_rq = this_rq(), *target_rq; + unsigned long this_mask = 1UL << smp_processor_id(); + int target_cpu; + + new_mask &= cpu_online_map; + p->cpus_allowed = new_mask; + /* + * Can the task run on the current CPU? If not then + * migrate the process off to a proper CPU. 
+ */ + if (new_mask & this_mask) + return; + target_cpu = ffz(~new_mask); + target_rq = cpu_rq(target_cpu); + if (target_cpu < smp_processor_id()) { + spin_lock_irq(&target_rq->lock); + spin_lock(&this_rq->lock); + } else { + spin_lock_irq(&target_rq->lock); + spin_lock(&this_rq->lock); + } + dequeue_task(p, p->array); + this_rq->nr_running--; + target_rq->nr_running++; + enqueue_task(p, target_rq->active); + target_rq->curr->need_resched = 1; + spin_unlock(&target_rq->lock); + + /* + * The easiest solution is to context switch into + * the idle thread - which will pick the best task + * afterwards: + */ + this_rq->nr_switches++; + this_rq->curr = this_rq->idle; + this_rq->idle->need_resched = 1; + context_switch(current, this_rq->idle); + barrier(); + spin_unlock_irq(&this_rq()->lock); +} + void scheduling_functions_end_here(void) { } +void set_user_nice(task_t *p, long nice) +{ + unsigned long flags; + prio_array_t *array; + runqueue_t *rq; + + if (p->__nice == nice) + return; + /* + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. + */ + lock_task_rq(rq, p, flags); + if (rt_task(p)) { + p->__nice = nice; + goto out_unlock; + } + array = p->array; + if (array) { + dequeue_task(p, array); + } + p->__nice = nice; + p->prio = NICE_TO_PRIO(nice); + if (array) { + enqueue_task(p, array); + /* + * If the task is runnable and lowered its priority, + * or increased its priority then reschedule its CPU: + */ + if ((nice < p->__nice) || + ((p->__nice < nice) && (p == rq->curr))) + resched_task(rq->curr); + } +out_unlock: + unlock_task_rq(rq, p, flags); +} + #ifndef __alpha__ /* @@ -865,7 +885,7 @@ void scheduling_functions_end_here(void) { } asmlinkage long sys_nice(int increment) { - long newprio; + long nice; /* * Setpriority might change our priority at the same moment. @@ -881,28 +901,30 @@ asmlinkage long sys_nice(int increment) if (increment > 40) increment = 40; - newprio = current->nice + increment; - if (newprio < -20) - newprio = -20; - if (newprio > 19) - newprio = 19; - current->nice = newprio; + nice = current->__nice + increment; + if (nice < -20) + nice = -20; + if (nice > 19) + nice = 19; + set_user_nice(current, nice); return 0; } #endif -static inline struct task_struct *find_process_by_pid(pid_t pid) +static inline task_t *find_process_by_pid(pid_t pid) { return pid ? find_task_by_pid(pid) : current; } -static int setscheduler(pid_t pid, int policy, - struct sched_param *param) +static int setscheduler(pid_t pid, int policy, struct sched_param *param) { struct sched_param lp; - struct task_struct *p; + prio_array_t *array; + unsigned long flags; + runqueue_t *rq; int retval; + task_t *p; retval = -EINVAL; if (!param || pid < 0) @@ -916,14 +938,19 @@ static int setscheduler(pid_t pid, int policy, * We play safe to avoid deadlocks. */ read_lock_irq(&tasklist_lock); - spin_lock(&runqueue_lock); p = find_process_by_pid(pid); retval = -ESRCH; if (!p) - goto out_unlock; - + goto out_unlock_tasklist; + + /* + * To be able to change p->policy safely, the apropriate + * runqueue lock must be held. 
+ */ + lock_task_rq(rq,p,flags); + if (policy < 0) policy = p->policy; else { @@ -951,16 +978,22 @@ static int setscheduler(pid_t pid, int policy, !capable(CAP_SYS_NICE)) goto out_unlock; + array = p->array; + if (array) + deactivate_task(p, task_rq(p)); retval = 0; p->policy = policy; p->rt_priority = lp.sched_priority; - if (task_on_runqueue(p)) - move_first_runqueue(p); - - current->need_resched = 1; + if (rt_task(p)) + p->prio = 99-p->rt_priority; + else + p->prio = NICE_TO_PRIO(p->__nice); + if (array) + activate_task(p, task_rq(p)); out_unlock: - spin_unlock(&runqueue_lock); + unlock_task_rq(rq,p,flags); +out_unlock_tasklist: read_unlock_irq(&tasklist_lock); out_nounlock: @@ -980,7 +1013,7 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param *param) asmlinkage long sys_sched_getscheduler(pid_t pid) { - struct task_struct *p; + task_t *p; int retval; retval = -EINVAL; @@ -991,7 +1024,7 @@ asmlinkage long sys_sched_getscheduler(pid_t pid) read_lock(&tasklist_lock); p = find_process_by_pid(pid); if (p) - retval = p->policy & ~SCHED_YIELD; + retval = p->policy; read_unlock(&tasklist_lock); out_nounlock: @@ -1000,7 +1033,7 @@ out_nounlock: asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param) { - struct task_struct *p; + task_t *p; struct sched_param lp; int retval; @@ -1031,42 +1064,28 @@ out_unlock: asmlinkage long sys_sched_yield(void) { + runqueue_t *rq = this_rq(); + prio_array_t *array; + /* - * Trick. sched_yield() first counts the number of truly - * 'pending' runnable processes, then returns if it's - * only the current processes. (This test does not have - * to be atomic.) In threaded applications this optimization - * gets triggered quite often. + * Decrease the yielding task's priority by one, to avoid + * livelocks. This priority loss is temporary, it's recovered + * once the current timeslice expires. + * + * If priority is already MAX_PRIO-1 then we still + * roundrobin the task within the runlist. */ + spin_lock_irq(&rq->lock); + array = current->array; + dequeue_task(current, array); + if (likely(!rt_task(current))) + if (current->prio < MAX_PRIO-1) + current->prio++; + enqueue_task(current, array); + spin_unlock_irq(&rq->lock); - int nr_pending = nr_running; - -#if CONFIG_SMP - int i; - - // Subtract non-idle processes running on other CPUs. - for (i = 0; i < smp_num_cpus; i++) { - int cpu = cpu_logical_map(i); - if (aligned_data[cpu].schedule_data.curr != idle_task(cpu)) - nr_pending--; - } -#else - // on UP this process is on the runqueue as well - nr_pending--; -#endif - if (nr_pending) { - /* - * This process can only be rescheduled by us, - * so this is safe without any locking. - */ - if (current->policy == SCHED_OTHER) - current->policy |= SCHED_YIELD; - current->need_resched = 1; + schedule(); - current->time_slice = 0; - if (++current->dyn_prio > MAX_DYNPRIO) - current->dyn_prio = MAX_DYNPRIO; - } return 0; } @@ -1104,7 +1123,7 @@ asmlinkage long sys_sched_get_priority_min(int policy) asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval) { struct timespec t; - struct task_struct *p; + task_t *p; int retval = -EINVAL; if (pid < 0) @@ -1114,8 +1133,8 @@ asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval) read_lock(&tasklist_lock); p = find_process_by_pid(pid); if (p) - jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : TASK_TIMESLICE(p), - &t); + jiffies_to_timespec(p->policy & SCHED_FIFO ? 
+ 0 : RT_PRIO_TO_TIMESLICE(p->prio), &t); read_unlock(&tasklist_lock); if (p) retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; @@ -1123,7 +1142,7 @@ out_nounlock: return retval; } -static void show_task(struct task_struct * p) +static void show_task(task_t * p) { unsigned long free = 0; int state; @@ -1171,7 +1190,7 @@ static void show_task(struct task_struct * p) printk(" (NOTLB)\n"); { - extern void show_trace_task(struct task_struct *tsk); + extern void show_trace_task(task_t *tsk); show_trace_task(p); } } @@ -1193,7 +1212,7 @@ char * render_sigset_t(sigset_t *set, char *buffer) void show_state(void) { - struct task_struct *p; + task_t *p; #if (BITS_PER_LONG == 32) printk("\n" @@ -1216,134 +1235,97 @@ void show_state(void) read_unlock(&tasklist_lock); } -/** - * reparent_to_init() - Reparent the calling kernel thread to the init task. - * - * If a kernel thread is launched as a result of a system call, or if - * it ever exits, it should generally reparent itself to init so that - * it is correctly cleaned up on exit. - * - * The various task state such as scheduling policy and priority may have - * been inherited fro a user process, so we reset them to sane values here. - * - * NOTE that reparent_to_init() gives the caller full capabilities. - */ -void reparent_to_init(void) +extern unsigned long wait_init_idle; + +static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) { - write_lock_irq(&tasklist_lock); - - /* Reparent to init */ - REMOVE_LINKS(current); - current->p_pptr = child_reaper; - current->p_opptr = child_reaper; - SET_LINKS(current); - - /* Set the exit signal to SIGCHLD so we signal init on exit */ - current->exit_signal = SIGCHLD; - - /* We also take the runqueue_lock while altering task fields - * which affect scheduling decisions */ - spin_lock(&runqueue_lock); - - current->ptrace = 0; - current->nice = DEF_NICE; - current->policy = SCHED_OTHER; - /* cpus_allowed? */ - /* rt_priority? */ - /* signals? */ - current->cap_effective = CAP_INIT_EFF_SET; - current->cap_inheritable = CAP_INIT_INH_SET; - current->cap_permitted = CAP_FULL_SET; - current->keep_capabilities = 0; - memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim))); - current->user = INIT_USER; - - spin_unlock(&runqueue_lock); - write_unlock_irq(&tasklist_lock); + if (rq1 == rq2) + spin_lock(&rq1->lock); + else { + if (rq1->cpu < rq2->cpu) { + spin_lock(&rq1->lock); + spin_lock(&rq2->lock); + } else { + spin_lock(&rq2->lock); + spin_lock(&rq1->lock); + } + } } -/* - * Put all the gunge required to become a kernel thread without - * attached user resources in one place where it belongs. - */ - -void daemonize(void) +static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) { - struct fs_struct *fs; - - - /* - * If we were started as result of loading a module, close all of the - * user space pages. We don't need them, and if we didn't close them - * they would be locked into memory. 
- */ - exit_mm(current); - - current->session = 1; - current->pgrp = 1; - current->tty = NULL; - - /* Become as one with the init task */ - - exit_fs(current); /* current->fs->count--; */ - fs = init_task.fs; - current->fs = fs; - atomic_inc(&fs->count); - exit_files(current); - current->files = init_task.files; - atomic_inc(¤t->files->count); + spin_unlock(&rq1->lock); + if (rq1 != rq2) + spin_unlock(&rq2->lock); } -extern unsigned long wait_init_idle; - void __init init_idle(void) { - struct schedule_data * sched_data; - sched_data = &aligned_data[smp_processor_id()].schedule_data; + runqueue_t *this_rq = this_rq(), *rq = current->array->rq; + unsigned long flags; - if (current != &init_task && task_on_runqueue(current)) { - printk("UGH! (%d:%d) was on the runqueue, removing.\n", - smp_processor_id(), current->pid); - del_from_runqueue(current); + __save_flags(flags); + __cli(); + double_rq_lock(this_rq, rq); + + this_rq->curr = this_rq->idle = current; + deactivate_task(current, rq); + current->array = NULL; + current->prio = MAX_PRIO; + current->state = TASK_RUNNING; + clear_bit(smp_processor_id(), &wait_init_idle); + double_rq_unlock(this_rq, rq); + while (wait_init_idle) { + cpu_relax(); + barrier(); } - current->dyn_prio = 0; - sched_data->curr = current; - sched_data->last_schedule = get_cycles(); - clear_bit(current->processor, &wait_init_idle); + current->need_resched = 1; + __sti(); } -extern void init_timervecs (void); - -static void fill_tslice_map(void) -{ - int i; - - for (i = 0; i < NICE_RANGE; i++) { - ts_table[i] = ((MIN_NICE_TSLICE + - ((MAX_NICE_TSLICE - - MIN_NICE_TSLICE) / (NICE_RANGE - 1)) * i) * HZ) / 1000000; - if (!ts_table[i]) ts_table[i] = 1; - } -} +extern void init_timervecs(void); +extern void timer_bh(void); +extern void tqueue_bh(void); +extern void immediate_bh(void); void __init sched_init(void) { + runqueue_t *rq; + int i, j, k; + + for (i = 0; i < NR_CPUS; i++) { + runqueue_t *rq = cpu_rq(i); + prio_array_t *array; + + rq->active = rq->arrays + 0; + rq->expired = rq->arrays + 1; + spin_lock_init(&rq->lock); + rq->cpu = i; + + for (j = 0; j < 2; j++) { + array = rq->arrays + j; + array->rq = rq; + array->lock = &rq->lock; + for (k = 0; k < MAX_PRIO; k++) + INIT_LIST_HEAD(array->queue + k); + memset(array->bitmap, 0xff, BITMAP_SIZE); + // zero delimiter for bitsearch + clear_bit(MAX_PRIO, array->bitmap); + } + } /* * We have to do a little magic to get the first * process right in SMP mode. */ - int cpu = smp_processor_id(); - int nr; - - init_task.processor = cpu; + rq = this_rq(); + rq->curr = current; + rq->idle = NULL; + wake_up_process(current); - for(nr = 0; nr < PIDHASH_SZ; nr++) - pidhash[nr] = NULL; - - fill_tslice_map(); + for (i = 0; i < PIDHASH_SZ; i++) + pidhash[i] = NULL; init_timervecs(); - init_bh(TIMER_BH, timer_bh); init_bh(TQUEUE_BH, tqueue_bh); init_bh(IMMEDIATE_BH, immediate_bh); @@ -1352,5 +1334,5 @@ void __init sched_init(void) * The boot idle thread does lazy MMU switching as well: */ atomic_inc(&init_mm.mm_count); - enter_lazy_tlb(&init_mm, current, cpu); + enter_lazy_tlb(&init_mm, current, smp_processor_id()); } diff --git a/kernel/signal.c b/kernel/signal.c index 44acecd851c7..b5b866aa3527 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -478,12 +478,9 @@ static inline void signal_wake_up(struct task_struct *t) * process of changing - but no harm is done by that * other than doing an extra (lightweight) IPI interrupt. 
diff --git a/kernel/sys.c b/kernel/sys.c
index 765761a4e71b..21c21ca8bbe6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -220,10 +220,10 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
}
if (error == -ESRCH)
error = 0;
- if (niceval < p->nice && !capable(CAP_SYS_NICE))
+ if (niceval < p->__nice && !capable(CAP_SYS_NICE))
error = -EACCES;
else
- p->nice = niceval;
+ set_user_nice(p, niceval);
}
read_unlock(&tasklist_lock);
@@ -249,7 +249,7 @@ asmlinkage long sys_getpriority(int which, int who)
long niceval;
if (!proc_sel(p, which, who))
continue;
- niceval = 20 - p->nice;
+ niceval = 20 - p->__nice;
if (niceval > retval)
retval = niceval;
}
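The sys.c hunks only rename the field (p->nice becomes p->__nice) and route updates through the new set_user_nice() helper; the user-visible encoding is unchanged. sys_getpriority() still returns 20 - nice, a value in 1..40, so negative return values stay free to signal errors. As I understand the usual libc behaviour, the getpriority() wrapper converts that raw value back to the familiar -20..19 nice number, which a raw syscall makes visible; a small user-space check of that encoding (an illustration, not part of the patch):

```c
#include <sys/resource.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>

/* Shows the "20 - nice" encoding visible in sys_getpriority() above:
 * the raw syscall returns 20 - nice, while the libc wrapper is expected
 * to translate it back to the nice value. */
int main(void)
{
	if (setpriority(PRIO_PROCESS, 0, 5) != 0) {	/* ask for nice 5 */
		perror("setpriority");
		return 1;
	}

	errno = 0;
	int cooked = getpriority(PRIO_PROCESS, 0);		/* libc view */
	long raw = syscall(SYS_getpriority, PRIO_PROCESS, 0);	/* kernel view */

	printf("nice via getpriority(): %d\n", cooked);	/* expect 5 */
	printf("raw syscall value:      %ld\n", raw);	/* expect 20 - 5 = 15 */
	return 0;
}
```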
diff --git a/kernel/timer.c b/kernel/timer.c
index c56ec37ea817..ce3945cda799 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -25,6 +25,8 @@
#include <asm/uaccess.h>
+struct kernel_stat kstat;
+
/*
* Timekeeping variables
*/
@@ -584,13 +586,16 @@ void update_process_times(int user_tick)
update_one_process(p, user_tick, system, cpu);
if (p->pid) {
expire_task(p);
- if (p->nice > 0)
+ if (p->__nice > 0)
kstat.per_cpu_nice[cpu] += user_tick;
else
kstat.per_cpu_user[cpu] += user_tick;
kstat.per_cpu_system[cpu] += system;
- } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
- kstat.per_cpu_system[cpu] += system;
+ } else {
+ idle_tick();
+ if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
+ kstat.per_cpu_system[cpu] += system;
+ }
}
/*
@@ -791,6 +796,89 @@ asmlinkage long sys_getegid(void)
#endif
+static void process_timeout(unsigned long __data)
+{
+ wake_up_process((task_t *)__data);
+}
+
+/**
+ * schedule_timeout - sleep until timeout
+ * @timeout: timeout value in jiffies
+ *
+ * Make the current task sleep until @timeout jiffies have
+ * elapsed. The routine will return immediately unless
+ * the current task state has been set (see set_current_state()).
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
+ * pass before the routine returns. The routine will return 0
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task. In this case the remaining time
+ * in jiffies will be returned, or 0 if the timer expired in time
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ *
+ * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
+ * the CPU away without a bound on the timeout. In this case the return
+ * value will be %MAX_SCHEDULE_TIMEOUT.
+ *
+ * In all cases the return value is guaranteed to be non-negative.
+ */
+signed long schedule_timeout(signed long timeout)
+{
+ struct timer_list timer;
+ unsigned long expire;
+
+ switch (timeout)
+ {
+ case MAX_SCHEDULE_TIMEOUT:
+ /*
+ * These two special cases are useful to be comfortable
+ * in the caller. Nothing more. We could take
+ * MAX_SCHEDULE_TIMEOUT from one of the negative value
+ * but I' d like to return a valid offset (>=0) to allow
+ * the caller to do everything it want with the retval.
+ */
+ schedule();
+ goto out;
+ default:
+ /*
+ * Another bit of PARANOID. Note that the retval will be
+ * 0 since no piece of kernel is supposed to do a check
+ * for a negative retval of schedule_timeout() (since it
+ * should never happens anyway). You just have the printk()
+ * that will tell you if something is gone wrong and where.
+ */
+ if (timeout < 0)
+ {
+ printk(KERN_ERR "schedule_timeout: wrong timeout "
+ "value %lx from %p\n", timeout,
+ __builtin_return_address(0));
+ current->state = TASK_RUNNING;
+ goto out;
+ }
+ }
+
+ expire = timeout + jiffies;
+
+ init_timer(&timer);
+ timer.expires = expire;
+ timer.data = (unsigned long) current;
+ timer.function = process_timeout;
+
+ add_timer(&timer);
+ schedule();
+ del_timer_sync(&timer);
+
+ timeout = expire - jiffies;
+
+ out:
+ return timeout < 0 ? 0 : timeout;
+}
+
/* Thread ID - the internal kernel "pid" */
asmlinkage long sys_gettid(void)
{
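For reference, a typical caller follows the kerneldoc just added above: set the task state first, then call schedule_timeout() and interpret the return value. The snippet below is only an illustrative sketch (the function name is invented, and it assumes the usual <linux/sched.h> declarations); it is not part of the patch:

```c
#include <linux/sched.h>

/* Sleep for up to one second, waking early if a signal arrives. */
static signed long example_interruptible_sleep(void)
{
	signed long remaining;

	/* The state must be set first; with the default TASK_RUNNING,
	 * schedule_timeout() returns immediately. */
	set_current_state(TASK_INTERRUPTIBLE);
	remaining = schedule_timeout(HZ);	/* HZ jiffies == one second */

	/* remaining == 0: the full timeout elapsed.
	 * remaining  > 0: a signal woke us early; that many jiffies of the
	 *                 timeout were left.
	 * Either way the task is back in TASK_RUNNING here. */
	return remaining;
}
```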
