Diffstat (limited to 'kernel')
 -rw-r--r--  kernel/configs.c       |  46
 -rw-r--r--  kernel/exit.c          |  24
 -rw-r--r--  kernel/fork.c          |  17
 -rw-r--r--  kernel/ksyms.c         |   2
 -rw-r--r--  kernel/pid.c           |   4
 -rw-r--r--  kernel/posix-timers.c  |   1
 -rw-r--r--  kernel/sched.c         | 391
 -rw-r--r--  kernel/signal.c        |   4
 -rw-r--r--  kernel/sys.c           |  17
 9 files changed, 357 insertions(+), 149 deletions(-)
diff --git a/kernel/configs.c b/kernel/configs.c
index 6a5c0c9d9176..57f54451edbe 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -47,7 +47,7 @@
 /**************************************************/
 /* globals and useful constants                   */
 
-static const char IKCONFIG_VERSION[] = "0.6";
+static const char IKCONFIG_VERSION[] __initdata = "0.7";
 
 static ssize_t
 ikconfig_read_current(struct file *file, char __user *buf,
@@ -72,32 +72,6 @@ static struct file_operations ikconfig_file_ops = {
        .read = ikconfig_read_current,
 };
 
-
-/***************************************************/
-/* build_info_show: let people read the info       */
-/* we have on the tools used to build this kernel  */
-
-static int build_info_show(struct seq_file *seq, void *v)
-{
-       seq_printf(seq,
-                  "Kernel: %s\nCompiler: %s\nVersion_in_Makefile: %s\n",
-                  ikconfig_build_info, LINUX_COMPILER, UTS_RELEASE);
-       return 0;
-}
-
-static int build_info_open(struct inode *inode, struct file *file)
-{
-       return single_open(file, build_info_show, PDE(inode)->data);
-}
-
-static struct file_operations build_info_file_ops = {
-       .owner = THIS_MODULE,
-       .open = build_info_open,
-       .read = seq_read,
-       .llseek = seq_lseek,
-       .release = single_release,
-};
-
 /***************************************************/
 /* ikconfig_init: start up everything we need to */
@@ -112,26 +86,12 @@ static int __init ikconfig_init(void)
        entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO,
                                  &proc_root);
        if (!entry)
-               goto leave;
+               return -ENOMEM;
 
        entry->proc_fops = &ikconfig_file_ops;
        entry->size = kernel_config_data_size;
 
-       /* create the "build_info" file */
-       entry = create_proc_entry("config_build_info",
-                                 S_IFREG | S_IRUGO, &proc_root);
-       if (!entry)
-               goto leave_gz;
-       entry->proc_fops = &build_info_file_ops;
-
        return 0;
-
-leave_gz:
-       /* remove the file from proc */
-       remove_proc_entry("config.gz", &proc_root);
-
-leave:
-       return -ENOMEM;
 }
 
 /***************************************************/
@@ -139,9 +99,7 @@ leave:
 
 static void __exit ikconfig_cleanup(void)
 {
-       /* remove the files */
        remove_proc_entry("config.gz", &proc_root);
-       remove_proc_entry("config_build_info", &proc_root);
 }
 
 module_init(ikconfig_init);
diff --git a/kernel/exit.c b/kernel/exit.c
index b6174f82adf9..c565fd69d559 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -152,7 +152,7 @@ static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task)
                    || p->state >= TASK_ZOMBIE
                    || p->real_parent->pid == 1)
                        continue;
-               if (p->real_parent->pgrp != pgrp
+               if (process_group(p->real_parent) != pgrp
                    && p->real_parent->session == p->session) {
                        ret = 0;
                        break;
@@ -247,9 +247,9 @@ void __set_special_pids(pid_t session, pid_t pgrp)
                curr->session = session;
                attach_pid(curr, PIDTYPE_SID, session);
        }
-       if (curr->pgrp != pgrp) {
+       if (process_group(curr) != pgrp) {
                detach_pid(curr, PIDTYPE_PGID);
-               curr->pgrp = pgrp;
+               curr->group_leader->__pgrp = pgrp;
                attach_pid(curr, PIDTYPE_PGID, pgrp);
        }
 }
@@ -508,9 +508,9 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced)
         * than we are, and it was the only connection
         * outside, so the child pgrp is now orphaned.
         */
-       if ((p->pgrp != father->pgrp) &&
+       if ((process_group(p) != process_group(father)) &&
            (p->session == father->session)) {
-               int pgrp = p->pgrp;
+               int pgrp = process_group(p);
 
                if (will_become_orphaned_pgrp(pgrp, NULL) &&
                    has_stopped_jobs(pgrp)) {
                        __kill_pg_info(SIGHUP, (void *)1, pgrp);
@@ -618,12 +618,12 @@ static void exit_notify(struct task_struct *tsk)
 
        t = tsk->real_parent;
 
-       if ((t->pgrp != tsk->pgrp) &&
+       if ((process_group(t) != process_group(tsk)) &&
            (t->session == tsk->session) &&
-           will_become_orphaned_pgrp(tsk->pgrp, tsk) &&
-           has_stopped_jobs(tsk->pgrp)) {
-               __kill_pg_info(SIGHUP, (void *)1, tsk->pgrp);
-               __kill_pg_info(SIGCONT, (void *)1, tsk->pgrp);
+           will_become_orphaned_pgrp(process_group(tsk), tsk) &&
+           has_stopped_jobs(process_group(tsk))) {
+               __kill_pg_info(SIGHUP, (void *)1, process_group(tsk));
+               __kill_pg_info(SIGCONT, (void *)1, process_group(tsk));
        }
 
        /* Let father know we died
@@ -813,10 +813,10 @@ static int eligible_child(pid_t pid, int options, task_t *p)
                if (p->pid != pid)
                        return 0;
        } else if (!pid) {
-               if (p->pgrp != current->pgrp)
+               if (process_group(p) != process_group(current))
                        return 0;
        } else if (pid != -1) {
-               if (p->pgrp != -pid)
+               if (process_group(p) != -pid)
                        return 0;
        }
diff --git a/kernel/fork.c b/kernel/fork.c
index 37d79b4e16e6..f2d3115483da 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -305,7 +305,7 @@ out:
        return retval;
 fail_nomem:
        retval = -ENOMEM;
- fail:
+fail:
        vm_unacct_memory(charge);
        goto out;
 }
@@ -499,7 +499,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
                goto fail_nomem;
 
        if (init_new_context(tsk,mm))
-               goto free_pt;
+               goto fail_nocontext;
 
        retval = dup_mmap(mm, oldmm);
        if (retval)
@@ -514,6 +514,15 @@ free_pt:
        mmput(mm);
 fail_nomem:
        return retval;
+
+fail_nocontext:
+       /*
+        * If init_new_context() failed, we cannot use mmput() to free the mm
+        * because it calls destroy_context()
+        */
+       mm_free_pgd(mm);
+       free_mm(mm);
+       return retval;
 }
 
 static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
@@ -925,7 +934,7 @@ struct task_struct *copy_process(unsigned long clone_flags,
         */
        p->first_time_slice = 1;
        current->time_slice >>= 1;
-       p->last_run = jiffies;
+       p->timestamp = sched_clock();
        if (!current->time_slice) {
                /*
                 * This case is rare, it happens when the parent has only
@@ -1004,7 +1013,7 @@ struct task_struct *copy_process(unsigned long clone_flags,
                attach_pid(p, PIDTYPE_PID, p->pid);
                if (thread_group_leader(p)) {
                        attach_pid(p, PIDTYPE_TGID, p->tgid);
-                       attach_pid(p, PIDTYPE_PGID, p->pgrp);
+                       attach_pid(p, PIDTYPE_PGID, process_group(p));
                        attach_pid(p, PIDTYPE_SID, p->session);
                        if (p->pid)
                                __get_cpu_var(process_counts)++;
diff --git a/kernel/ksyms.c b/kernel/ksyms.c
index 9f61a0496c2a..9da2940ac0e6 100644
--- a/kernel/ksyms.c
+++ b/kernel/ksyms.c
@@ -348,8 +348,6 @@ EXPORT_SYMBOL(lock_page);
 EXPORT_SYMBOL(unlock_page);
 
 /* device registration */
-EXPORT_SYMBOL(register_chrdev);
-EXPORT_SYMBOL(unregister_chrdev);
 EXPORT_SYMBOL(register_blkdev);
 EXPORT_SYMBOL(unregister_blkdev);
 EXPORT_SYMBOL(tty_register_driver);
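Throughout this commit — exit.c and fork.c above, pid.c, signal.c and sys.c below — direct reads of p->pgrp give way to a process_group() accessor, while writes now go through the thread-group leader's __pgrp field. A minimal sketch of what such an accessor looks like, inferred from the write sites in this patch (curr->group_leader->__pgrp = pgrp) rather than copied verbatim from include/linux/sched.h:

static inline pid_t process_group(struct task_struct *tsk)
{
        /* The process group now lives only on the thread-group leader,
         * so every reader must indirect through it.  Hedged sketch: the
         * real definition in the header may differ in form. */
        return tsk->group_leader->__pgrp;
}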
diff --git a/kernel/pid.c b/kernel/pid.c
index 00413e3967b9..713f54eaeda9 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -250,13 +250,13 @@ void switch_exec_pids(task_t *leader, task_t *thread)
 
        attach_pid(thread, PIDTYPE_PID, thread->pid);
        attach_pid(thread, PIDTYPE_TGID, thread->tgid);
-       attach_pid(thread, PIDTYPE_PGID, thread->pgrp);
+       attach_pid(thread, PIDTYPE_PGID, leader->__pgrp);
        attach_pid(thread, PIDTYPE_SID, thread->session);
        list_add_tail(&thread->tasks, &init_task.tasks);
 
        attach_pid(leader, PIDTYPE_PID, leader->pid);
        attach_pid(leader, PIDTYPE_TGID, leader->tgid);
-       attach_pid(leader, PIDTYPE_PGID, leader->pgrp);
+       attach_pid(leader, PIDTYPE_PGID, leader->__pgrp);
        attach_pid(leader, PIDTYPE_SID, leader->session);
 }
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index d9be410a9e62..64940545cb84 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -344,6 +344,7 @@ static inline struct task_struct * good_sigevent(sigevent_t * event)
                return NULL;
 
        if ((event->sigev_notify & ~SIGEV_NONE & MIPS_SIGEV) &&
+           event->sigev_signo &&
            ((unsigned) (event->sigev_signo > SIGRTMAX)))
                return NULL;
diff --git a/kernel/sched.c b/kernel/sched.c
index 9dc251a8d8a5..b35f717d1b58 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -14,6 +14,7 @@
  *             an array-switch method of distributing timeslices
  *             and per-CPU runqueues.  Cleanups and useful suggestions
  *             by Davide Libenzi, preemptible kernel bits by Robert Love.
+ *  2003-09-03 Interactivity tuning by Con Kolivas.
  */
 
 #include <linux/mm.h>
@@ -59,6 +60,14 @@
 #define USER_PRIO(p)           ((p)-MAX_RT_PRIO)
 #define TASK_USER_PRIO(p)      USER_PRIO((p)->static_prio)
 #define MAX_USER_PRIO          (USER_PRIO(MAX_PRIO))
+#define AVG_TIMESLICE  (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\
+                       (MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1)))
+
+/*
+ * Some helpers for converting nanosecond timing to jiffy resolution
+ */
+#define NS_TO_JIFFIES(TIME)    ((TIME) / (1000000000 / HZ))
+#define JIFFIES_TO_NS(TIME)    ((TIME) * (1000000000 / HZ))
 
 /*
  * These are the 'tuning knobs' of the scheduler:
@@ -69,14 +78,18 @@
  */
 #define MIN_TIMESLICE          ( 10 * HZ / 1000)
 #define MAX_TIMESLICE          (200 * HZ / 1000)
-#define CHILD_PENALTY          50
+#define ON_RUNQUEUE_WEIGHT     30
+#define CHILD_PENALTY          95
 #define PARENT_PENALTY         100
 #define EXIT_WEIGHT            3
 #define PRIO_BONUS_RATIO       25
+#define MAX_BONUS              (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
 #define INTERACTIVE_DELTA      2
-#define MAX_SLEEP_AVG          (10*HZ)
-#define STARVATION_LIMIT       (10*HZ)
+#define MAX_SLEEP_AVG          (AVG_TIMESLICE * MAX_BONUS)
+#define STARVATION_LIMIT       (MAX_SLEEP_AVG)
+#define NS_MAX_SLEEP_AVG       (JIFFIES_TO_NS(MAX_SLEEP_AVG))
 #define NODE_THRESHOLD         125
+#define CREDIT_LIMIT           100
 
 /*
  * If a task is 'interactive' then we reinsert it in the active
@@ -106,6 +119,19 @@
  * too hard.
 */
 
+#define CURRENT_BONUS(p) \
+       (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
+               MAX_SLEEP_AVG)
+
+#ifdef CONFIG_SMP
+#define TIMESLICE_GRANULARITY(p)       (MIN_TIMESLICE * \
+               (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
+                       num_online_cpus())
+#else
+#define TIMESLICE_GRANULARITY(p)       (MIN_TIMESLICE * \
+               (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
+#endif
+
 #define SCALE(v1,v1_max,v2_max) \
        (v1) * (v2_max) / (v1_max)
 
@@ -116,6 +142,19 @@
 #define TASK_INTERACTIVE(p) \
        ((p)->prio <= (p)->static_prio - DELTA(p))
 
+#define JUST_INTERACTIVE_SLEEP(p) \
+       (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
+               (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
+
+#define HIGH_CREDIT(p) \
+       ((p)->interactive_credit > CREDIT_LIMIT)
+
+#define LOW_CREDIT(p) \
+       ((p)->interactive_credit < -CREDIT_LIMIT)
+
+#define TASK_PREEMPTS_CURR(p, rq) \
+       ((p)->prio < (rq)->curr->prio)
+
 /*
  * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
  * to time slice values.
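A quick sanity check of the new tuning constants. The small standalone program below evaluates the macros added above; HZ = 1000 and the stock 2.6 priority layout (MAX_RT_PRIO = 100, MAX_PRIO = 140, hence MAX_USER_PRIO = 40) are this example's assumptions, not values taken from the patch:

/* Worked example of the interactivity tuning knobs from this commit.
 * Assumes HZ=1000 and the stock priority layout; both are illustrative. */
#include <stdio.h>

#define HZ              1000                    /* assumed, arch-dependent */
#define MAX_RT_PRIO     100                     /* assumed stock value */
#define MAX_PRIO        (MAX_RT_PRIO + 40)
#define MAX_USER_PRIO   (MAX_PRIO - MAX_RT_PRIO)
#define NICE_TO_PRIO(n) (MAX_RT_PRIO + (n) + 20)

#define MIN_TIMESLICE   ( 10 * HZ / 1000)
#define MAX_TIMESLICE   (200 * HZ / 1000)
#define PRIO_BONUS_RATIO 25
#define MAX_BONUS       (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
#define AVG_TIMESLICE   (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) * \
                        (MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1)))
#define MAX_SLEEP_AVG   (AVG_TIMESLICE * MAX_BONUS)
#define JIFFIES_TO_NS(t) ((t) * (1000000000 / HZ))

int main(void)
{
        printf("MAX_BONUS        = %d\n", MAX_BONUS);            /* 10 */
        printf("AVG_TIMESLICE    = %d jiffies\n", AVG_TIMESLICE); /* 102 */
        printf("MAX_SLEEP_AVG    = %d jiffies\n", MAX_SLEEP_AVG); /* 1020 */
        printf("NS_MAX_SLEEP_AVG = %lld ns\n",
               (long long)JIFFIES_TO_NS(MAX_SLEEP_AVG));  /* ~1.02 seconds */
        return 0;
}

So sleep_avg, now tracked in nanoseconds, saturates at roughly one second under these assumptions, and the bonus scale runs from 0 to 10.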
@@ -180,7 +219,6 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
 #define this_rq()              (&__get_cpu_var(runqueues))
 #define task_rq(p)             cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
-#define rt_task(p)             ((p)->prio < MAX_RT_PRIO)
 
 /*
  * Default context-switch locking:
@@ -320,8 +358,7 @@ static int effective_prio(task_t *p)
        if (rt_task(p))
                return p->prio;
 
-       bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
-                       MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
+       bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
 
        prio = p->static_prio - bonus;
        if (prio < MAX_RT_PRIO)
@@ -340,6 +377,82 @@ static inline void __activate_task(task_t *p, runqueue_t *rq)
        nr_running_inc(rq);
 }
 
+static void recalc_task_prio(task_t *p, unsigned long long now)
+{
+       unsigned long long __sleep_time = now - p->timestamp;
+       unsigned long sleep_time;
+
+       if (__sleep_time > NS_MAX_SLEEP_AVG)
+               sleep_time = NS_MAX_SLEEP_AVG;
+       else
+               sleep_time = (unsigned long)__sleep_time;
+
+       if (likely(sleep_time > 0)) {
+               /*
+                * User tasks that sleep a long time are categorised as
+                * idle and will get just interactive status to stay active &
+                * prevent them suddenly becoming cpu hogs and starving
+                * other processes.
+                */
+               if (p->mm && p->activated != -1 &&
+                       sleep_time > JUST_INTERACTIVE_SLEEP(p)){
+                               p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
+                                               AVG_TIMESLICE);
+                               if (!HIGH_CREDIT(p))
+                                       p->interactive_credit++;
+               } else {
+                       /*
+                        * The lower the sleep avg a task has the more
+                        * rapidly it will rise with sleep time.
+                        */
+                       sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
+
+                       /*
+                        * Tasks with low interactive_credit are limited to
+                        * one timeslice worth of sleep avg bonus.
+                        */
+                       if (LOW_CREDIT(p) &&
+                               sleep_time > JIFFIES_TO_NS(task_timeslice(p)))
+                                       sleep_time =
+                                               JIFFIES_TO_NS(task_timeslice(p));
+
+                       /*
+                        * Non high_credit tasks waking from uninterruptible
+                        * sleep are limited in their sleep_avg rise as they
+                        * are likely to be cpu hogs waiting on I/O
+                        */
+                       if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm){
+                               if (p->sleep_avg >= JUST_INTERACTIVE_SLEEP(p))
+                                       sleep_time = 0;
+                               else if (p->sleep_avg + sleep_time >=
+                                       JUST_INTERACTIVE_SLEEP(p)){
+                                               p->sleep_avg =
+                                                       JUST_INTERACTIVE_SLEEP(p);
+                                               sleep_time = 0;
+                                       }
+                       }
+
+                       /*
+                        * This code gives a bonus to interactive tasks.
+                        *
+                        * The boost works by updating the 'average sleep time'
+                        * value here, based on ->timestamp. The more time a task
+                        * spends sleeping, the higher the average gets - and the
+                        * higher the priority boost gets as well.
+                        */
+                       p->sleep_avg += sleep_time;
+
+                       if (p->sleep_avg > NS_MAX_SLEEP_AVG){
+                               p->sleep_avg = NS_MAX_SLEEP_AVG;
+                               if (!HIGH_CREDIT(p))
+                                       p->interactive_credit++;
+                       }
+               }
+       }
+
+       p->prio = effective_prio(p);
+}
+
 /*
  * activate_task - move a task to the runqueue and do priority recalculation
  *
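The scaling step in recalc_task_prio() means the emptier a task's sleep_avg, the faster it refills. A worked example under the assumed constants from the program above (HZ = 1000, MAX_BONUS = 10): a task with sleep_avg of 0 has CURRENT_BONUS of 0, so a 10 ms sleep is multiplied by (10 - 0) and credits 100 ms worth of sleep_avg, while a task already at bonus 9 has the same 10 ms counted only once. Feeding the result into effective_prio(), a nice-0 task (static_prio 120) with a saturated sleep_avg gets bonus = 10 - 5 = 5 and runs at dynamic priority 115; one with an empty sleep_avg gets bonus = 0 - 5 and runs at 125.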
@@ -348,34 +461,33 @@ static inline void __activate_task(task_t *p, runqueue_t *rq)
  */
 static inline void activate_task(task_t *p, runqueue_t *rq)
 {
-       long sleep_time = jiffies - p->last_run - 1;
+       unsigned long long now = sched_clock();
 
-       if (sleep_time > 0) {
-               int sleep_avg;
+       recalc_task_prio(p, now);
 
+       /*
+        * This checks to make sure it's not an uninterruptible task
+        * that is now waking up.
+        */
+       if (!p->activated){
                /*
-                * This code gives a bonus to interactive tasks.
-                *
-                * The boost works by updating the 'average sleep time'
-                * value here, based on ->last_run. The more time a task
-                * spends sleeping, the higher the average gets - and the
-                * higher the priority boost gets as well.
+                * Tasks which were woken up by interrupts (ie. hw events)
+                * are most likely of interactive nature. So we give them
+                * the credit of extending their sleep time to the period
+                * of time they spend on the runqueue, waiting for execution
+                * on a CPU, first time around:
                 */
-               sleep_avg = p->sleep_avg + sleep_time;
-
+               if (in_interrupt())
+                       p->activated = 2;
+               else
                /*
-                * 'Overflow' bonus ticks go to the waker as well, so the
-                * ticks are not lost. This has the effect of further
-                * boosting tasks that are related to maximum-interactive
-                * tasks.
+                * Normal first-time wakeups get a credit too for on-runqueue
+                * time, but it will be weighted down:
                 */
-               if (sleep_avg > MAX_SLEEP_AVG)
-                       sleep_avg = MAX_SLEEP_AVG;
-               if (p->sleep_avg != sleep_avg) {
-                       p->sleep_avg = sleep_avg;
-                       p->prio = effective_prio(p);
+                       p->activated = 1;
                }
-       }
+       p->timestamp = now;
+
        __activate_task(p, rq);
 }
@@ -496,13 +608,19 @@ repeat_lock_task:
                        task_rq_unlock(rq, &flags);
                        goto repeat_lock_task;
                }
-               if (old_state == TASK_UNINTERRUPTIBLE)
+               if (old_state == TASK_UNINTERRUPTIBLE){
                        rq->nr_uninterruptible--;
+                       /*
+                        * Tasks on involuntary sleep don't earn
+                        * sleep_avg beyond just interactive state.
+                        */
+                       p->activated = -1;
+               }
                if (sync)
                        __activate_task(p, rq);
                else {
                        activate_task(p, rq);
-                       if (p->prio < rq->curr->prio)
+                       if (TASK_PREEMPTS_CURR(p, rq))
                                resched_task(rq->curr);
                }
                success = 1;
@@ -551,8 +669,14 @@ void wake_up_forked_process(task_t * p)
         * and children as well, to keep max-interactive tasks
         * from forking tasks that are max-interactive.
         */
-       current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
-       p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
+       current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
+               PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
+
+       p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
+               CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
+
+       p->interactive_credit = 0;
+
        p->prio = effective_prio(p);
        set_task_cpu(p, smp_processor_id());
 
@@ -593,8 +717,9 @@ void sched_exit(task_t * p)
         * the sleep_avg of the parent as well.
         */
        if (p->sleep_avg < p->parent->sleep_avg)
-               p->parent->sleep_avg = (p->parent->sleep_avg * EXIT_WEIGHT +
-                       p->sleep_avg) / (EXIT_WEIGHT + 1);
+               p->parent->sleep_avg = p->parent->sleep_avg /
+               (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
+               (EXIT_WEIGHT + 1);
 }
 
 /**
@@ -960,10 +1085,10 @@ static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu,
        if (likely(!busiest))
                goto out;
 
-       *imbalance = (max_load - nr_running) / 2;
+       *imbalance = max_load - nr_running;
 
        /* It needs an at least ~25% imbalance to trigger balancing. */
-       if (!idle && (*imbalance < (max_load + 3)/4)) {
+       if (!idle && ((*imbalance)*4 < max_load)) {
                busiest = NULL;
                goto out;
        }
@@ -973,7 +1098,7 @@ static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu,
         * Make sure nothing changed since we checked the
         * runqueue length.
         */
-       if (busiest->nr_running <= nr_running + 1) {
+       if (busiest->nr_running <= nr_running) {
                spin_unlock(&busiest->lock);
                busiest = NULL;
        }
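The imbalance change is easier to see with numbers. Suppose the busiest queue holds 9 tasks and the local one 5. Before this patch the imbalance was (9 - 5) / 2 = 2, which failed the trigger (9 + 3) / 4 = 3 and no balancing happened. Now the full difference of 4 is kept, the trigger 4 * 4 >= 9 passes, and load_balance() halves the figure afterwards (see the imbalance /= 2 hunk below), so 2 tasks are actually pulled.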
@@ -996,13 +1121,31 @@ static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t
         * Note that idle threads have a prio of MAX_PRIO, for this test
         * to be always true for them.
         */
-       if (p->prio < this_rq->curr->prio)
+       if (TASK_PREEMPTS_CURR(p, this_rq))
                set_need_resched();
-       else {
-               if (p->prio == this_rq->curr->prio &&
-                               p->time_slice > this_rq->curr->time_slice)
-                       set_need_resched();
-       }
+}
+
+/*
+ * Previously:
+ *
+ * #define CAN_MIGRATE_TASK(p,rq,this_cpu)     \
+ *     ((!idle || (NS_TO_JIFFIES(now - (p)->timestamp) > \
+ *             cache_decay_ticks)) && !task_running(rq, p) && \
+ *                     cpu_isset(this_cpu, (p)->cpus_allowed))
+ */
+
+static inline int
+can_migrate_task(task_t *tsk, runqueue_t *rq, int this_cpu, int idle)
+{
+       unsigned long delta = sched_clock() - tsk->timestamp;
+
+       if (!idle && (delta <= JIFFIES_TO_NS(cache_decay_ticks)))
+               return 0;
+       if (task_running(rq, tsk))
+               return 0;
+       if (!cpu_isset(this_cpu, tsk->cpus_allowed))
+               return 0;
+       return 1;
 }
 
 /*
@@ -1026,6 +1169,12 @@ static void load_balance(runqueue_t *this_rq, int idle, cpumask_t cpumask)
                goto out;
 
        /*
+        * We only want to steal a number of tasks equal to 1/2 the imbalance,
+        * otherwise we'll just shift the imbalance to the new queue:
+        */
+       imbalance /= 2;
+
+       /*
         * We first consider expired tasks. Those will likely not be
         * executed in the near future, and they are most likely to
         * be cache-cold, thus switching CPUs has the least effect
@@ -1064,14 +1213,9 @@ skip_queue:
         * 3) are cache-hot on their current CPU.
         */
 
-#define CAN_MIGRATE_TASK(p,rq,this_cpu)                                        \
-       ((idle || (jiffies - (p)->last_run > cache_decay_ticks)) &&    \
-               !task_running(rq, p) &&                                 \
-                       cpu_isset(this_cpu, (p)->cpus_allowed))
-
        curr = curr->prev;
 
-       if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
+       if (!can_migrate_task(tmp, busiest, this_cpu, idle)) {
                if (curr != head)
                        goto skip_queue;
                idx++;
@@ -1233,14 +1377,11 @@ void scheduler_tick(int user_ticks, int sys_ticks)
        spin_lock(&rq->lock);
        /*
         * The task was running during this tick - update the
-        * time slice counter and the sleep average. Note: we
-        * do not update a thread's priority until it either
-        * goes to sleep or uses up its timeslice. This makes
-        * it possible for interactive tasks to use up their
-        * timeslices at their highest priority levels.
+        * time slice counter. Note: we do not update a thread's
+        * priority until it either goes to sleep or uses up its
+        * timeslice. This makes it possible for interactive tasks
+        * to use up their timeslices at their highest priority levels.
         */
-       if (p->sleep_avg)
-               p->sleep_avg--;
        if (unlikely(rt_task(p))) {
                /*
                 * RR tasks need a special form of timeslice management.
@@ -1264,12 +1405,39 @@ void scheduler_tick(int user_ticks, int sys_ticks)
                p->time_slice = task_timeslice(p);
                p->first_time_slice = 0;
+               if (!rq->expired_timestamp)
+                       rq->expired_timestamp = jiffies;
                if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
-                       if (!rq->expired_timestamp)
-                               rq->expired_timestamp = jiffies;
                        enqueue_task(p, rq->expired);
                } else
                        enqueue_task(p, rq->active);
+       } else {
+               /*
+                * Prevent a too long timeslice allowing a task to monopolize
+                * the CPU. We do this by splitting up the timeslice into
+                * smaller pieces.
+                *
+                * Note: this does not mean the task's timeslices expire or
+                * get lost in any way, they just might be preempted by
+                * another task of equal priority. (one with higher
+                * priority would have preempted this task already.) We
+                * requeue this task to the end of the list on this priority
+                * level, which is in essence a round-robin of tasks with
+                * equal priority.
+                *
+                * This only applies to tasks in the interactive
+                * delta range with at least TIMESLICE_GRANULARITY to requeue.
+                */
+               if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
+                       p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
+                       (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
+                       (p->array == rq->active)) {
+
+                       dequeue_task(p, rq->active);
+                       set_tsk_need_resched(p);
+                       p->prio = effective_prio(p);
+                       enqueue_task(p, rq->active);
+               }
        }
 out_unlock:
        spin_unlock(&rq->lock);
@@ -1288,6 +1456,8 @@ asmlinkage void schedule(void)
        runqueue_t *rq;
        prio_array_t *array;
        struct list_head *queue;
+       unsigned long long now;
+       unsigned long run_time;
        int idx;
 
        /*
@@ -1308,7 +1478,20 @@ need_resched:
        rq = this_rq();
 
        release_kernel_lock(prev);
-       prev->last_run = jiffies;
+       now = sched_clock();
+       if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
+               run_time = now - prev->timestamp;
+       else
+               run_time = NS_MAX_SLEEP_AVG;
+
+       /*
+        * Tasks with interactive credits get charged less run_time
+        * at high sleep_avg to delay them losing their interactive
+        * status
+        */
+       if (HIGH_CREDIT(prev))
+               run_time /= (CURRENT_BONUS(prev) ? : 1);
+
        spin_lock_irq(&rq->lock);
 
        /*
@@ -1358,12 +1541,33 @@ pick_next_task:
        queue = array->queue + idx;
        next = list_entry(queue->next, task_t, run_list);
 
+       if (next->activated > 0) {
+               unsigned long long delta = now - next->timestamp;
+
+               if (next->activated == 1)
+                       delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
+
+               array = next->array;
+               dequeue_task(next, array);
+               recalc_task_prio(next, next->timestamp + delta);
+               enqueue_task(next, array);
+       }
+       next->activated = 0;
 switch_tasks:
        prefetch(next);
        clear_tsk_need_resched(prev);
        RCU_qsctr(task_cpu(prev))++;
 
+       prev->sleep_avg -= run_time;
+       if ((long)prev->sleep_avg <= 0){
+               prev->sleep_avg = 0;
+               if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
+                       prev->interactive_credit--;
+       }
+       prev->timestamp = now;
+
        if (likely(prev != next)) {
+               next->timestamp = now;
                rq->nr_switches++;
                rq->curr = next;
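Plugging the assumed constants from the earlier example into TIMESLICE_GRANULARITY (HZ = 1000, MIN_TIMESLICE = 10 jiffies, MAX_BONUS = 10) on a uniprocessor build: a task at maximum bonus (CURRENT_BONUS = 10) gets MIN_TIMESLICE << 0 = 10 ms and is requeued among its equal-priority peers every 10 ms, while a task at bonus 6 gets MIN_TIMESLICE << 3 = 80 ms; on SMP the figure is further multiplied by num_online_cpus(). The more interactive the task, the finer the round-robin slicing.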
@@ -1603,6 +1807,7 @@ void set_user_nice(task_t *p, long nice)
        unsigned long flags;
        prio_array_t *array;
        runqueue_t *rq;
+       int old_prio, new_prio, delta;
 
        if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
                return;
@@ -1611,6 +1816,12 @@ void set_user_nice(task_t *p, long nice)
         * the task might be in the middle of scheduling on another CPU.
         */
        rq = task_rq_lock(p, &flags);
+       /*
+        * The RT priorities are set via setscheduler(), but we still
+        * allow the 'normal' nice value to be set - but as expected
+        * it wont have any effect on scheduling until the task is
+        * not SCHED_NORMAL:
+        */
        if (rt_task(p)) {
                p->static_prio = NICE_TO_PRIO(nice);
                goto out_unlock;
@@ -1618,16 +1829,20 @@ void set_user_nice(task_t *p, long nice)
        array = p->array;
        if (array)
                dequeue_task(p, array);
+
+       old_prio = p->prio;
+       new_prio = NICE_TO_PRIO(nice);
+       delta = new_prio - old_prio;
        p->static_prio = NICE_TO_PRIO(nice);
-       p->prio = NICE_TO_PRIO(nice);
+       p->prio += delta;
+
        if (array) {
                enqueue_task(p, array);
                /*
-                * If the task is running and lowered its priority,
-                * or increased its priority then reschedule its CPU:
+                * If the task increased its priority or is running and
+                * lowered its priority, then reschedule its CPU:
                 */
-               if ((NICE_TO_PRIO(nice) < p->static_prio) ||
-                                               task_running(rq, p))
+               if (delta < 0 || (delta > 0 && task_running(rq, p)))
                        resched_task(rq->curr);
        }
 out_unlock:
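A worked pass through the new set_user_nice() arithmetic: for a SCHED_NORMAL task at nice 0 whose dynamic priority currently sits at 115 (static 120 minus a bonus of 5), renicing to +10 computes new_prio = 130 and delta = 15, so p->prio becomes 130; because delta > 0, the CPU is rescheduled only if the task is currently running. Renicing the same task to -10 instead gives new_prio = 110 and delta = -5, and the negative delta forces a resched_task() unconditionally.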
@@ -2384,6 +2599,12 @@ static void move_task_away(struct task_struct *p, int dest_cpu)
        local_irq_restore(flags);
 }
 
+typedef struct {
+       int cpu;
+       struct completion startup_done;
+       task_t *task;
+} migration_startup_t;
+
 /*
  * migration_thread - this is a highprio system thread that performs
  * thread migration by bumping thread off CPU then 'pushing' onto
@@ -2393,20 +2614,21 @@ static int migration_thread(void * data)
 {
        /* Marking "param" __user is ok, since we do a set_fs(KERNEL_DS); */
        struct sched_param __user param = { .sched_priority = MAX_RT_PRIO-1 };
-       int cpu = (long) data;
+       migration_startup_t *startup = data;
+       int cpu = startup->cpu;
        runqueue_t *rq;
        int ret;
 
+       startup->task = current;
+       complete(&startup->startup_done);
+       set_current_state(TASK_UNINTERRUPTIBLE);
+       schedule();
+
+       BUG_ON(smp_processor_id() != cpu);
+
        daemonize("migration/%d", cpu);
        set_fs(KERNEL_DS);
 
-       /*
-        * Either we are running on the right CPU, or there's a a
-        * migration thread on this CPU, guaranteed (we're started
-        * serially).
-        */
-       set_cpus_allowed(current, cpumask_of_cpu(cpu));
-
        ret = setscheduler(0, SCHED_FIFO, &param);
 
        rq = this_rq();
@@ -2445,13 +2667,30 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
                          void *hcpu)
 {
+       long cpu = (long) hcpu;
+       migration_startup_t startup;
+
        switch (action) {
        case CPU_ONLINE:
-               printk("Starting migration thread for cpu %li\n",
-                      (long)hcpu);
-               kernel_thread(migration_thread, hcpu, CLONE_KERNEL);
-               while (!cpu_rq((long)hcpu)->migration_thread)
+
+               printk("Starting migration thread for cpu %li\n", cpu);
+
+               startup.cpu = cpu;
+               startup.task = NULL;
+               init_completion(&startup.startup_done);
+
+               kernel_thread(migration_thread, &startup, CLONE_KERNEL);
+               wait_for_completion(&startup.startup_done);
+               wait_task_inactive(startup.task);
+
+               startup.task->thread_info->cpu = cpu;
+               startup.task->cpus_allowed = cpumask_of_cpu(cpu);
+
+               wake_up_process(startup.task);
+
+               while (!cpu_rq(cpu)->migration_thread)
                        yield();
+
                break;
        }
        return NOTIFY_OK;
@@ -2574,6 +2813,8 @@ void __might_sleep(char *file, int line)
                prev_jiffy = jiffies;
                printk(KERN_ERR "Debug: sleeping function called from invalid"
                                " context at %s:%d\n", file, line);
+               printk("in_atomic():%d, irqs_disabled():%d\n",
+                       in_atomic(), irqs_disabled());
                dump_stack();
        }
 #endif
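The new migration-thread startup is a handshake: the child announces itself through a completion, sleeps, the parent pins it to the target CPU, then wakes it. A rough userspace analogue of that pattern, with POSIX semaphores standing in for the kernel's completion and wake_up_process() — the names and the semaphore substitution are this sketch's assumptions, not kernel API:

/* Userspace sketch of the migration-thread startup handshake.
 * Build with: cc -pthread handshake.c */
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

typedef struct {
        int cpu;            /* target CPU, filled in by the parent */
        sem_t announced;    /* ~ struct completion startup_done */
        sem_t go;           /* ~ the wake_up_process() after setup */
} startup_t;

static void *worker(void *data)
{
        startup_t *s = data;

        sem_post(&s->announced);  /* ~ complete(&startup->startup_done) */
        sem_wait(&s->go);         /* ~ TASK_UNINTERRUPTIBLE + schedule() */
        printf("worker: parent bound me to cpu %d\n", s->cpu);
        return NULL;
}

int main(void)
{
        startup_t s = { .cpu = -1 };
        pthread_t t;

        sem_init(&s.announced, 0, 0);
        sem_init(&s.go, 0, 0);

        pthread_create(&t, NULL, worker, &s);
        sem_wait(&s.announced);   /* ~ wait_for_completion() */
        s.cpu = 1;                /* ~ setting thread_info->cpu/cpus_allowed */
        sem_post(&s.go);          /* ~ wake_up_process(startup.task) */

        pthread_join(t, NULL);
        return 0;
}

The point of sleeping before daemonize() is that the parent can safely rewrite the child's CPU binding while it is guaranteed inactive, which the old serialized set_cpus_allowed() approach could not promise.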
diff --git a/kernel/signal.c b/kernel/signal.c
index 72333be1fd42..852da1a009da 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1139,7 +1139,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
 static int kill_something_info(int sig, struct siginfo *info, int pid)
 {
        if (!pid) {
-               return kill_pg_info(sig, info, current->pgrp);
+               return kill_pg_info(sig, info, process_group(current));
        } else if (pid == -1) {
                int retval = 0, count = 0;
                struct task_struct * p;
@@ -1798,7 +1798,7 @@ relock:
                        /* signals can be posted during this window */
 
-                       if (is_orphaned_pgrp(current->pgrp))
+                       if (is_orphaned_pgrp(process_group(current)))
                                goto relock;
 
                        spin_lock_irq(&current->sighand->siglock);
diff --git a/kernel/sys.c b/kernel/sys.c
index b172afa53be1..9eda26d6745c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -290,7 +290,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
                        break;
                case PRIO_PGRP:
                        if (!who)
-                               who = current->pgrp;
+                               who = process_group(current);
                        for_each_task_pid(who, PIDTYPE_PGID, p, l, pid)
                                error = set_one_prio(p, niceval, error);
                        break;
@@ -346,7 +346,7 @@ asmlinkage long sys_getpriority(int which, int who)
                        break;
                case PRIO_PGRP:
                        if (!who)
-                               who = current->pgrp;
+                               who = process_group(current);
                        for_each_task_pid(who, PIDTYPE_PGID, p, l, pid) {
                                niceval = 20 - task_nice(p);
                                if (niceval > retval)
@@ -982,11 +982,12 @@ ok_pgid:
        if (err)
                goto out;
 
-       if (p->pgrp != pgid) {
+       if (process_group(p) != pgid) {
                detach_pid(p, PIDTYPE_PGID);
-               p->pgrp = pgid;
+               p->group_leader->__pgrp = pgid;
                attach_pid(p, PIDTYPE_PGID, pgid);
        }
+
        err = 0;
 out:
        /* All paths lead to here, thus we are safe. -DaveM */
@@ -997,7 +998,7 @@ out:
 asmlinkage long sys_getpgid(pid_t pid)
 {
        if (!pid) {
-               return current->pgrp;
+               return process_group(current);
        } else {
                int retval;
                struct task_struct *p;
@@ -1009,7 +1010,7 @@ asmlinkage long sys_getpgid(pid_t pid)
                if (p) {
                        retval = security_task_getpgid(p);
                        if (!retval)
-                               retval = p->pgrp;
+                               retval = process_group(p);
                }
                read_unlock(&tasklist_lock);
                return retval;
@@ -1019,7 +1020,7 @@ asmlinkage long sys_getpgid(pid_t pid)
 asmlinkage long sys_getpgrp(void)
 {
        /* SMP - assuming writes are word atomic this is fine */
-       return current->pgrp;
+       return process_group(current);
 }
 
 asmlinkage long sys_getsid(pid_t pid)
@@ -1062,7 +1063,7 @@ asmlinkage long sys_setsid(void)
        __set_special_pids(current->pid, current->pid);
        current->tty = NULL;
        current->tty_old_pgrp = 0;
-       err = current->pgrp;
+       err = process_group(current);
 out:
        write_unlock_irq(&tasklist_lock);
        return err;
