author    Greg Kroah-Hartman <greg@kroah.com>    2004-10-18 18:03:51 -0700
committer Greg Kroah-Hartman <greg@kroah.com>    2004-10-18 18:03:51 -0700
commit    bffe01870598b7a0a77073e25ee94e026bc98e6b (patch)
tree      f5f5b65fec1239b18a4c2634807ff5d7e5729de9 /kernel
parent    23aebb6f8755121394ef088d84a7fa483b444aa9 (diff)
parent    a4946826c30c56a5830326552a395c5b6afc13ef (diff)
merge
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile         |    1
-rw-r--r--  kernel/acct.c           |   17
-rw-r--r--  kernel/capability.c     |    1
-rw-r--r--  kernel/cpu.c            |   21
-rw-r--r--  kernel/exec_domain.c    |    1
-rw-r--r--  kernel/exit.c           |  205
-rw-r--r--  kernel/fork.c           |   29
-rw-r--r--  kernel/irq/Makefile     |    4
-rw-r--r--  kernel/irq/autoprobe.c  |  188
-rw-r--r--  kernel/irq/handle.c     |  204
-rw-r--r--  kernel/irq/internals.h  |   18
-rw-r--r--  kernel/irq/manage.c     |  347
-rw-r--r--  kernel/irq/proc.c       |  156
-rw-r--r--  kernel/irq/spurious.c   |   96
-rw-r--r--  kernel/itimer.c         |    1
-rw-r--r--  kernel/kprobes.c        |    3
-rw-r--r--  kernel/panic.c          |    1
-rw-r--r--  kernel/posix-timers.c   |    6
-rw-r--r--  kernel/power/console.c  |    3
-rw-r--r--  kernel/power/disk.c     |   18
-rw-r--r--  kernel/power/process.c  |    4
-rw-r--r--  kernel/power/swsusp.c   |   29
-rw-r--r--  kernel/printk.c         |    1
-rw-r--r--  kernel/profile.c        |    4
-rw-r--r--  kernel/ptrace.c         |    7
-rw-r--r--  kernel/sched.c          |  578
-rw-r--r--  kernel/signal.c         |    9
-rw-r--r--  kernel/softirq.c        |   12
-rw-r--r--  kernel/sys.c            |   32
-rw-r--r--  kernel/sysctl.c         |    1
-rw-r--r--  kernel/time.c           |    1
-rw-r--r--  kernel/timer.c          |   30
32 files changed, 1512 insertions, 516 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 93155e52bb54..c4337751cee7 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_AUDIT) += audit.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_SYSFS) += ksysfs.o
+obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
ifneq ($(CONFIG_IA64),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index daf23c4efab4..15ab0324139d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -53,6 +53,7 @@
#include <linux/vfs.h>
#include <linux/jiffies.h>
#include <linux/times.h>
+#include <linux/syscalls.h>
#include <asm/uaccess.h>
#include <asm/div64.h>
#include <linux/blkdev.h> /* sector_div */
@@ -384,6 +385,8 @@ static void do_acct_process(long exitcode, struct file *file)
unsigned long vsize;
unsigned long flim;
u64 elapsed;
+ u64 run_time;
+ struct timespec uptime;
/*
* First check to see if there is enough free_space to continue
@@ -401,7 +404,13 @@ static void do_acct_process(long exitcode, struct file *file)
ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
- elapsed = jiffies_64_to_AHZ(get_jiffies_64() - current->start_time);
+ /* calculate run_time in nsec*/
+ do_posix_clock_monotonic_gettime(&uptime);
+ run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
+ run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC
+ + current->start_time.tv_nsec;
+ /* convert nsec -> AHZ */
+ elapsed = nsec_to_AHZ(run_time);
#if ACCT_VERSION==3
ac.ac_etime = encode_float(elapsed);
#else
@@ -480,11 +489,11 @@ static void do_acct_process(long exitcode, struct file *file)
/*
* Accounting records are not subject to resource limits.
*/
- flim = current->rlim[RLIMIT_FSIZE].rlim_cur;
- current->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+ flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
file->f_op->write(file, (char *)&ac,
sizeof(acct_t), &file->f_pos);
- current->rlim[RLIMIT_FSIZE].rlim_cur = flim;
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
set_fs(fs);
}
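The acct.c hunk above replaces the jiffies-based elapsed-time calculation with a monotonic-clock delta in nanoseconds, converted to accounting ticks. A minimal sketch of the same arithmetic (not from the patch), assuming AHZ is 100 and using plain struct timespec values instead of the kernel helpers:

#include <stdint.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000ULL
#define AHZ 100			/* assumed accounting clock rate */

/* Elapsed run time in AHZ ticks, given two monotonic-clock timestamps. */
static uint64_t elapsed_ahz(struct timespec start, struct timespec now)
{
	uint64_t run_time = (uint64_t)now.tv_sec * NSEC_PER_SEC + now.tv_nsec;

	run_time -= (uint64_t)start.tv_sec * NSEC_PER_SEC + start.tv_nsec;
	return run_time / (NSEC_PER_SEC / AHZ);	/* nsec -> AHZ ticks, roughly what nsec_to_AHZ() does */
}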
diff --git a/kernel/capability.c b/kernel/capability.c
index 7e864e2ccf6a..7800a5066c0f 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -10,6 +10,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/security.h>
+#include <linux/syscalls.h>
#include <asm/uaccess.h>
unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 083521327e64..719ba42ff5ee 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -61,13 +61,13 @@ static inline void check_for_tasks(int cpu)
* cpu' with certain environment variables set. */
static int cpu_run_sbin_hotplug(unsigned int cpu, const char *action)
{
- char *argv[3], *envp[5], cpu_str[12], action_str[32];
+ char *argv[3], *envp[6], cpu_str[12], action_str[32], devpath_str[40];
int i;
sprintf(cpu_str, "CPU=%d", cpu);
sprintf(action_str, "ACTION=%s", action);
- /* FIXME: Add DEVPATH. --RR */
-
+ sprintf(devpath_str, "DEVPATH=devices/system/cpu/cpu%d", cpu);
+
i = 0;
argv[i++] = hotplug_path;
argv[i++] = "cpu";
@@ -79,6 +79,7 @@ static int cpu_run_sbin_hotplug(unsigned int cpu, const char *action)
envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
envp[i++] = cpu_str;
envp[i++] = action_str;
+ envp[i++] = devpath_str;
envp[i] = NULL;
return call_usermodehelper(argv[0], argv, envp, 0);
@@ -123,6 +124,15 @@ int cpu_down(unsigned int cpu)
goto out;
}
+ err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
+ (void *)(long)cpu);
+ if (err == NOTIFY_BAD) {
+ printk("%s: attempt to take down CPU %u failed\n",
+ __FUNCTION__, cpu);
+ err = -EINVAL;
+ goto out;
+ }
+
/* Ensure that we are not runnable on dying cpu */
old_allowed = current->cpus_allowed;
tmp = CPU_MASK_ALL;
@@ -131,6 +141,11 @@ int cpu_down(unsigned int cpu)
p = __stop_machine_run(take_cpu_down, NULL, cpu);
if (IS_ERR(p)) {
+ /* CPU didn't die: tell everyone. Can't complain. */
+ if (notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
+ (void *)(long)cpu) == NOTIFY_BAD)
+ BUG();
+
err = PTR_ERR(p);
goto out_allowed;
}
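The cpu.c change wires CPU_DOWN_PREPARE and CPU_DOWN_FAILED notifications around the actual takedown, so subscribers can veto or roll back a CPU unplug. A minimal sketch of such a subscriber (not from the patch), assuming the register_cpu_notifier() interface of that era; my_cpu_callback() and my_driver_can_lose_cpu() are hypothetical names:

#include <linux/cpu.h>
#include <linux/notifier.h>

static int my_cpu_callback(struct notifier_block *nb,
			   unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_DOWN_PREPARE:
		/* Returning NOTIFY_BAD here makes cpu_down() fail with -EINVAL. */
		if (!my_driver_can_lose_cpu(cpu))	/* hypothetical check */
			return NOTIFY_BAD;
		break;
	case CPU_DOWN_FAILED:
		/* The CPU survived after all; undo any CPU_DOWN_PREPARE work. */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_cpu_nb = {
	.notifier_call = my_cpu_callback,
};

/* register_cpu_notifier(&my_cpu_nb); typically done from module init */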
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 5fba35304459..ad3e5d54e119 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/personality.h>
#include <linux/sched.h>
+#include <linux/syscalls.h>
#include <linux/sysctl.h>
#include <linux/types.h>
diff --git a/kernel/exit.c b/kernel/exit.c
index 6860b509dd11..55d853392524 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -24,6 +24,7 @@
#include <linux/mount.h>
#include <linux/proc_fs.h>
#include <linux/mempolicy.h>
+#include <linux/syscalls.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -75,7 +76,7 @@ repeat:
*/
zap_leader = 0;
leader = p->group_leader;
- if (leader != p && thread_group_empty(leader) && leader->state == TASK_ZOMBIE) {
+ if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
BUG_ON(leader->exit_signal == -1);
do_notify_parent(leader, leader->exit_signal);
/*
@@ -157,7 +158,7 @@ static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task)
do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
if (p == ignored_task
- || p->state >= TASK_ZOMBIE
+ || p->exit_state >= EXIT_ZOMBIE
|| p->real_parent->pid == 1)
continue;
if (process_group(p->real_parent) != pgrp
@@ -237,7 +238,8 @@ void reparent_to_init(void)
/* rt_priority? */
/* signals? */
security_task_reparent_to_init(current);
- memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim)));
+ memcpy(current->signal->rlim, init_task.signal->rlim,
+ sizeof(current->signal->rlim));
atomic_inc(&(INIT_USER->__count));
switch_uid(INIT_USER);
@@ -517,7 +519,7 @@ static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_re
* Make sure we're not reparenting to ourselves and that
* the parent is not a zombie.
*/
- BUG_ON(p == reaper || reaper->state >= TASK_ZOMBIE);
+ BUG_ON(p == reaper || reaper->state >= EXIT_ZOMBIE || reaper->exit_state >= EXIT_ZOMBIE);
p->real_parent = reaper;
if (p->parent == p->real_parent)
BUG();
@@ -552,7 +554,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced)
/* If we'd notified the old parent about this child's death,
* also notify the new parent.
*/
- if (p->state == TASK_ZOMBIE && p->exit_signal != -1 &&
+ if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 &&
thread_group_empty(p))
do_notify_parent(p, p->exit_signal);
else if (p->state == TASK_TRACED) {
@@ -600,7 +602,7 @@ static inline void forget_original_parent(struct task_struct * father,
reaper = child_reaper;
break;
}
- } while (reaper->state >= TASK_ZOMBIE);
+ } while (reaper->exit_state >= EXIT_ZOMBIE);
/*
* There are only two places where our children can be:
@@ -626,7 +628,7 @@ static inline void forget_original_parent(struct task_struct * father,
} else {
/* reparent ptraced task to its real parent */
__ptrace_unlink (p);
- if (p->state == TASK_ZOMBIE && p->exit_signal != -1 &&
+ if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 &&
thread_group_empty(p))
do_notify_parent(p, p->exit_signal);
}
@@ -637,7 +639,7 @@ static inline void forget_original_parent(struct task_struct * father,
* zombie forever since we prevented it from self-reap itself
* while it was being traced by us, to be able to see it in wait4.
*/
- if (unlikely(ptrace && p->state == TASK_ZOMBIE && p->exit_signal == -1))
+ if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1))
list_add(&p->ptrace_list, to_release);
}
list_for_each_safe(_p, _n, &father->ptrace_children) {
@@ -750,10 +752,10 @@ static void exit_notify(struct task_struct *tsk)
do_notify_parent(tsk, SIGCHLD);
}
- state = TASK_ZOMBIE;
+ state = EXIT_ZOMBIE;
if (tsk->exit_signal == -1 && tsk->ptrace == 0)
- state = TASK_DEAD;
- tsk->state = state;
+ state = EXIT_DEAD;
+ tsk->exit_state = state;
/*
* Clear these here so that update_process_times() won't try to deliver
@@ -761,7 +763,6 @@ static void exit_notify(struct task_struct *tsk)
*/
tsk->it_virt_value = 0;
tsk->it_prof_value = 0;
- tsk->rlim[RLIMIT_CPU].rlim_cur = RLIM_INFINITY;
write_unlock_irq(&tasklist_lock);
@@ -772,7 +773,7 @@ static void exit_notify(struct task_struct *tsk)
}
/* If the process is dead, release it - nobody will wait for it */
- if (state == TASK_DEAD)
+ if (state == EXIT_DEAD)
release_task(tsk);
/* PF_DEAD causes final put_task_struct after we schedule. */
@@ -829,6 +830,8 @@ asmlinkage NORET_TYPE void do_exit(long code)
mpol_free(tsk->mempolicy);
tsk->mempolicy = NULL;
#endif
+
+ BUG_ON(!(current->flags & PF_DEAD));
schedule();
BUG();
/* Avoid "noreturn function does return". */
@@ -972,7 +975,7 @@ static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid,
}
/*
- * Handle sys_wait4 work for one task in state TASK_ZOMBIE. We hold
+ * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
@@ -991,7 +994,7 @@ static int wait_task_zombie(task_t *p, int noreap,
int exit_code = p->exit_code;
int why, status;
- if (unlikely(p->state != TASK_ZOMBIE))
+ if (unlikely(p->exit_state != EXIT_ZOMBIE))
return 0;
if (unlikely(p->exit_signal == -1 && p->ptrace == 0))
return 0;
@@ -1012,9 +1015,9 @@ static int wait_task_zombie(task_t *p, int noreap,
* Try to move the task's state to DEAD
* only one thread is allowed to do this:
*/
- state = xchg(&p->state, TASK_DEAD);
- if (state != TASK_ZOMBIE) {
- BUG_ON(state != TASK_DEAD);
+ state = xchg(&p->exit_state, EXIT_DEAD);
+ if (state != EXIT_ZOMBIE) {
+ BUG_ON(state != EXIT_DEAD);
return 0;
}
if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) {
@@ -1059,7 +1062,7 @@ static int wait_task_zombie(task_t *p, int noreap,
/*
* Now we are sure this task is interesting, and no other
- * thread can reap it because we set its state to TASK_DEAD.
+ * thread can reap it because we set its state to EXIT_DEAD.
*/
read_unlock(&tasklist_lock);
@@ -1091,7 +1094,8 @@ static int wait_task_zombie(task_t *p, int noreap,
if (!retval && infop)
retval = put_user(p->uid, &infop->si_uid);
if (retval) {
- p->state = TASK_ZOMBIE;
+ // TODO: is this safe?
+ p->exit_state = EXIT_ZOMBIE;
return retval;
}
retval = p->pid;
@@ -1100,7 +1104,8 @@ static int wait_task_zombie(task_t *p, int noreap,
/* Double-check with lock held. */
if (p->real_parent != p->parent) {
__ptrace_unlink(p);
- p->state = TASK_ZOMBIE;
+ // TODO: is this safe?
+ p->exit_state = EXIT_ZOMBIE;
/*
* If this is not a detached task, notify the parent.
* If it's still not detached after that, don't release
@@ -1171,13 +1176,13 @@ static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap,
/*
* This uses xchg to be atomic with the thread resuming and setting
* it. It must also be done with the write lock held to prevent a
- * race with the TASK_ZOMBIE case.
+ * race with the EXIT_ZOMBIE case.
*/
exit_code = xchg(&p->exit_code, 0);
- if (unlikely(p->state >= TASK_ZOMBIE)) {
+ if (unlikely(p->exit_state >= EXIT_ZOMBIE)) {
/*
* The task resumed and then died. Let the next iteration
- * catch it in TASK_ZOMBIE. Note that exit_code might
+ * catch it in EXIT_ZOMBIE. Note that exit_code might
* already be zero here if it resumed and did _exit(0).
* The task itself is dead and won't touch exit_code again;
* other processors in this function are locked out.
@@ -1228,6 +1233,74 @@ bail_ref:
return retval;
}
+/*
+ * Handle do_wait work for one task in a live, non-stopped state.
+ * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
+ * the lock and this task is uninteresting. If we return nonzero, we have
+ * released the lock and the system call should return.
+ */
+static int wait_task_continued(task_t *p, int noreap,
+ struct siginfo __user *infop,
+ int __user *stat_addr, struct rusage __user *ru)
+{
+ int retval;
+ pid_t pid;
+ uid_t uid;
+
+ if (unlikely(!p->signal))
+ return 0;
+
+ if (p->signal->stop_state >= 0)
+ return 0;
+
+ spin_lock_irq(&p->sighand->siglock);
+ if (p->signal->stop_state >= 0) { /* Re-check with the lock held. */
+ spin_unlock_irq(&p->sighand->siglock);
+ return 0;
+ }
+ if (!noreap)
+ p->signal->stop_state = 0;
+ spin_unlock_irq(&p->sighand->siglock);
+
+ pid = p->pid;
+ uid = p->uid;
+ get_task_struct(p);
+ read_unlock(&tasklist_lock);
+
+ if (!infop) {
+ retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+ put_task_struct(p);
+ if (!retval && stat_addr)
+ retval = put_user(0xffff, stat_addr);
+ if (!retval)
+ retval = p->pid;
+ } else {
+ retval = wait_noreap_copyout(p, pid, uid,
+ CLD_CONTINUED, SIGCONT,
+ infop, ru);
+ BUG_ON(retval == 0);
+ }
+
+ return retval;
+}
+
+
+static inline int my_ptrace_child(struct task_struct *p)
+{
+ if (!(p->ptrace & PT_PTRACED))
+ return 0;
+ if (!(p->ptrace & PT_ATTACHED))
+ return 1;
+ /*
+ * This child was PTRACE_ATTACH'd. We should be seeing it only if
+ * we are the attacher. If we are the real parent, this is a race
+ * inside ptrace_attach. It is waiting for the tasklist_lock,
+ * which we have to switch the parent links, but has already set
+ * the flags in p->ptrace.
+ */
+ return (p->parent != p->real_parent);
+}
+
static long do_wait(pid_t pid, int options, struct siginfo __user *infop,
int __user *stat_addr, struct rusage __user *ru)
{
@@ -1256,12 +1329,12 @@ repeat:
switch (p->state) {
case TASK_TRACED:
- if (!(p->ptrace & PT_PTRACED))
+ if (!my_ptrace_child(p))
continue;
/*FALLTHROUGH*/
case TASK_STOPPED:
if (!(options & WUNTRACED) &&
- !(p->ptrace & PT_PTRACED))
+ !my_ptrace_child(p))
continue;
retval = wait_task_stopped(p, ret == 2,
(options & WNOWAIT),
@@ -1270,47 +1343,36 @@ repeat:
if (retval != 0) /* He released the lock. */
goto end;
break;
- case TASK_ZOMBIE:
- /*
- * Eligible but we cannot release it yet:
- */
- if (ret == 2)
- goto check_continued;
- if (!likely(options & WEXITED))
- continue;
- retval = wait_task_zombie(
- p, (options & WNOWAIT),
- infop, stat_addr, ru);
- if (retval != 0) /* He released the lock. */
- goto end;
- break;
- case TASK_DEAD:
- continue;
default:
+ // case EXIT_DEAD:
+ if (p->exit_state == EXIT_DEAD)
+ continue;
+ // case EXIT_ZOMBIE:
+ if (p->exit_state == EXIT_ZOMBIE) {
+ /*
+ * Eligible but we cannot release
+ * it yet:
+ */
+ if (ret == 2)
+ goto check_continued;
+ if (!likely(options & WEXITED))
+ continue;
+ retval = wait_task_zombie(
+ p, (options & WNOWAIT),
+ infop, stat_addr, ru);
+ /* He released the lock. */
+ if (retval != 0)
+ goto end;
+ break;
+ }
check_continued:
if (!unlikely(options & WCONTINUED))
continue;
- if (unlikely(!p->signal))
- continue;
- spin_lock_irq(&p->sighand->siglock);
- if (p->signal->stop_state < 0) {
- pid_t pid;
- uid_t uid;
-
- if (!(options & WNOWAIT))
- p->signal->stop_state = 0;
- spin_unlock_irq(&p->sighand->siglock);
- pid = p->pid;
- uid = p->uid;
- get_task_struct(p);
- read_unlock(&tasklist_lock);
- retval = wait_noreap_copyout(p, pid,
- uid, CLD_CONTINUED,
- SIGCONT, infop, ru);
- BUG_ON(retval == 0);
+ retval = wait_task_continued(
+ p, (options & WNOWAIT),
+ infop, stat_addr, ru);
+ if (retval != 0) /* He released the lock. */
goto end;
- }
- spin_unlock_irq(&p->sighand->siglock);
break;
}
}
@@ -1376,6 +1438,8 @@ asmlinkage long sys_waitid(int which, pid_t pid,
struct siginfo __user *infop, int options,
struct rusage __user *ru)
{
+ long ret;
+
if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
return -EINVAL;
if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
@@ -1398,15 +1462,26 @@ asmlinkage long sys_waitid(int which, pid_t pid,
return -EINVAL;
}
- return do_wait(pid, options, infop, NULL, ru);
+ ret = do_wait(pid, options, infop, NULL, ru);
+
+ /* avoid REGPARM breakage on x86: */
+ prevent_tail_call(ret);
+ return ret;
}
asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr,
int options, struct rusage __user *ru)
{
- if (options & ~(WNOHANG|WUNTRACED|__WNOTHREAD|__WCLONE|__WALL))
+ long ret;
+
+ if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
+ __WNOTHREAD|__WCLONE|__WALL))
return -EINVAL;
- return do_wait(pid, options | WEXITED, NULL, stat_addr, ru);
+ ret = do_wait(pid, options | WEXITED, NULL, stat_addr, ru);
+
+ /* avoid REGPARM breakage on x86: */
+ prevent_tail_call(ret);
+ return ret;
}
#ifdef __ARCH_WANT_SYS_WAITPID
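Among other things, the exit.c changes above route WCONTINUED reporting through the new wait_task_continued() helper and accept the flag in sys_wait4. A small user-space sketch (not from the patch) of a parent observing a stop and a SIGCONT resume through the standard waitpid() wrapper and POSIX status macros:

#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/wait.h>

int main(void)
{
	int status;
	pid_t pid = fork();

	if (pid == 0) {			/* child: stop itself, then idle */
		raise(SIGSTOP);
		pause();
		return 0;
	}

	waitpid(pid, &status, WUNTRACED);	/* reports the stop */
	if (WIFSTOPPED(status))
		printf("child %d stopped\n", pid);

	kill(pid, SIGCONT);
	waitpid(pid, &status, WCONTINUED);	/* reports the resume */
	if (WIFCONTINUED(status))
		printf("child %d continued\n", pid);

	kill(pid, SIGKILL);
	waitpid(pid, &status, 0);		/* reap the child */
	return 0;
}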
diff --git a/kernel/fork.c b/kernel/fork.c
index 8c7ba481c9a5..3020dccc548f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -86,7 +86,7 @@ EXPORT_SYMBOL(free_task);
void __put_task_struct(struct task_struct *tsk)
{
- WARN_ON(!(tsk->state & (TASK_DEAD | TASK_ZOMBIE)));
+ WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
WARN_ON(atomic_read(&tsk->usage));
WARN_ON(tsk == current);
@@ -249,8 +249,8 @@ void __init fork_init(unsigned long mempages)
if(max_threads < 20)
max_threads = 20;
- init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
- init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
+ init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
+ init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
}
static struct task_struct *dup_task_struct(struct task_struct *orig)
@@ -762,8 +762,17 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
for (i = open_files; i != 0; i--) {
struct file *f = *old_fds++;
- if (f)
+ if (f) {
get_file(f);
+ } else {
+ /*
+ * The fd may be claimed in the fd bitmap but not yet
+ * instantiated in the files array if a sibling thread
+ * is partway through open(). So make sure that this
+ * fd is available to the new process.
+ */
+ FD_CLR(open_files - i, newf->open_fds);
+ }
*new_fds++ = f;
}
spin_unlock(&oldf->file_lock);
@@ -872,6 +881,10 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
+ task_lock(current->group_leader);
+ memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
+ task_unlock(current->group_leader);
+
return 0;
}
@@ -941,7 +954,7 @@ static task_t *copy_process(unsigned long clone_flags,
retval = -EAGAIN;
if (atomic_read(&p->user->processes) >=
- p->rlim[RLIMIT_NPROC].rlim_cur) {
+ p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
p->user != &root_user)
goto bad_fork_free;
@@ -992,7 +1005,7 @@ static task_t *copy_process(unsigned long clone_flags,
p->utime = p->stime = 0;
p->lock_depth = -1; /* -1 = no lock */
- p->start_time = get_jiffies_64();
+ do_posix_clock_monotonic_gettime(&p->start_time);
p->security = NULL;
p->io_context = NULL;
p->io_wait = NULL;
@@ -1049,6 +1062,7 @@ static task_t *copy_process(unsigned long clone_flags,
/* ok, now we should be set up.. */
p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
p->pdeath_signal = 0;
+ p->exit_state = 0;
/* Perform scheduler related setup */
sched_fork(p);
@@ -1146,7 +1160,8 @@ fork_out:
bad_fork_cleanup_namespace:
exit_namespace(p);
bad_fork_cleanup_mm:
- mmput(p->mm);
+ if (p->mm)
+ mmput(p->mm);
bad_fork_cleanup_signal:
exit_signal(p);
bad_fork_cleanup_sighand:
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
new file mode 100644
index 000000000000..66d3be6e4da8
--- /dev/null
+++ b/kernel/irq/Makefile
@@ -0,0 +1,4 @@
+
+obj-y := autoprobe.o handle.o manage.o spurious.o
+obj-$(CONFIG_PROC_FS) += proc.o
+
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
new file mode 100644
index 000000000000..16818726cd21
--- /dev/null
+++ b/kernel/irq/autoprobe.c
@@ -0,0 +1,188 @@
+/*
+ * linux/kernel/irq/autoprobe.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the interrupt probing code and driver APIs.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+
+/*
+ * Autodetection depends on the fact that any interrupt that
+ * comes in on to an unassigned handler will get stuck with
+ * "IRQ_WAITING" cleared and the interrupt disabled.
+ */
+static DECLARE_MUTEX(probe_sem);
+
+/**
+ * probe_irq_on - begin an interrupt autodetect
+ *
+ * Commence probing for an interrupt. The interrupts are scanned
+ * and a mask of potential interrupt lines is returned.
+ *
+ */
+unsigned long probe_irq_on(void)
+{
+ unsigned long val, delay;
+ irq_desc_t *desc;
+ unsigned int i;
+
+ down(&probe_sem);
+ /*
+ * something may have generated an irq long ago and we want to
+ * flush such a longstanding irq before considering it as spurious.
+ */
+ for (i = NR_IRQS-1; i > 0; i--) {
+ desc = irq_desc + i;
+
+ spin_lock_irq(&desc->lock);
+ if (!irq_desc[i].action)
+ irq_desc[i].handler->startup(i);
+ spin_unlock_irq(&desc->lock);
+ }
+
+ /* Wait for longstanding interrupts to trigger. */
+ for (delay = jiffies + HZ/50; time_after(delay, jiffies); )
+ /* about 20ms delay */ barrier();
+
+ /*
+ * enable any unassigned irqs
+ * (we must startup again here because if a longstanding irq
+ * happened in the previous stage, it may have masked itself)
+ */
+ for (i = NR_IRQS-1; i > 0; i--) {
+ desc = irq_desc + i;
+
+ spin_lock_irq(&desc->lock);
+ if (!desc->action) {
+ desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
+ if (desc->handler->startup(i))
+ desc->status |= IRQ_PENDING;
+ }
+ spin_unlock_irq(&desc->lock);
+ }
+
+ /*
+ * Wait for spurious interrupts to trigger
+ */
+ for (delay = jiffies + HZ/10; time_after(delay, jiffies); )
+ /* about 100ms delay */ barrier();
+
+ /*
+ * Now filter out any obviously spurious interrupts
+ */
+ val = 0;
+ for (i = 0; i < NR_IRQS; i++) {
+ irq_desc_t *desc = irq_desc + i;
+ unsigned int status;
+
+ spin_lock_irq(&desc->lock);
+ status = desc->status;
+
+ if (status & IRQ_AUTODETECT) {
+ /* It triggered already - consider it spurious. */
+ if (!(status & IRQ_WAITING)) {
+ desc->status = status & ~IRQ_AUTODETECT;
+ desc->handler->shutdown(i);
+ } else
+ if (i < 32)
+ val |= 1 << i;
+ }
+ spin_unlock_irq(&desc->lock);
+ }
+
+ return val;
+}
+
+EXPORT_SYMBOL(probe_irq_on);
+
+/**
+ * probe_irq_mask - scan a bitmap of interrupt lines
+ * @val: mask of interrupts to consider
+ *
+ * Scan the interrupt lines and return a bitmap of active
+ * autodetect interrupts. The interrupt probe logic state
+ * is then returned to its previous value.
+ *
+ * Note: we need to scan all the irq's even though we will
+ * only return autodetect irq numbers - just so that we reset
+ * them all to a known state.
+ */
+unsigned int probe_irq_mask(unsigned long val)
+{
+ unsigned int mask;
+ int i;
+
+ mask = 0;
+ for (i = 0; i < NR_IRQS; i++) {
+ irq_desc_t *desc = irq_desc + i;
+ unsigned int status;
+
+ spin_lock_irq(&desc->lock);
+ status = desc->status;
+
+ if (status & IRQ_AUTODETECT) {
+ if (i < 16 && !(status & IRQ_WAITING))
+ mask |= 1 << i;
+
+ desc->status = status & ~IRQ_AUTODETECT;
+ desc->handler->shutdown(i);
+ }
+ spin_unlock_irq(&desc->lock);
+ }
+ up(&probe_sem);
+
+ return mask & val;
+}
+
+/**
+ * probe_irq_off - end an interrupt autodetect
+ * @val: mask of potential interrupts (unused)
+ *
+ * Scans the unused interrupt lines and returns the line which
+ * appears to have triggered the interrupt. If no interrupt was
+ * found then zero is returned. If more than one interrupt is
+ * found then minus the first candidate is returned to indicate
+ * there is doubt.
+ *
+ * The interrupt probe logic state is returned to its previous
+ * value.
+ *
+ * BUGS: When used in a module (which arguably shouldn't happen)
+ * nothing prevents two IRQ probe callers from overlapping. The
+ * results of this are non-optimal.
+ */
+int probe_irq_off(unsigned long val)
+{
+ int i, irq_found = 0, nr_irqs = 0;
+
+ for (i = 0; i < NR_IRQS; i++) {
+ irq_desc_t *desc = irq_desc + i;
+ unsigned int status;
+
+ spin_lock_irq(&desc->lock);
+ status = desc->status;
+
+ if (status & IRQ_AUTODETECT) {
+ if (!(status & IRQ_WAITING)) {
+ if (!nr_irqs)
+ irq_found = i;
+ nr_irqs++;
+ }
+ desc->status = status & ~IRQ_AUTODETECT;
+ desc->handler->shutdown(i);
+ }
+ spin_unlock_irq(&desc->lock);
+ }
+ up(&probe_sem);
+
+ if (nr_irqs > 1)
+ irq_found = -irq_found;
+ return irq_found;
+}
+
+EXPORT_SYMBOL(probe_irq_off);
+
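The probe_irq_on()/probe_irq_off() pair above implements the classic IRQ autodetection sequence for drivers that cannot otherwise discover their line. A minimal usage sketch (not from the patch); mydrv_probe_irq() and my_board_trigger_interrupt() are hypothetical, the latter standing in for whatever device-specific poke makes the card raise its interrupt:

#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/kernel.h>

static int mydrv_probe_irq(void)
{
	unsigned long mask;
	int irq;

	mask = probe_irq_on();		/* arm autodetection on unused lines */
	my_board_trigger_interrupt();	/* hypothetical: make the card fire */
	mdelay(10);			/* give the interrupt time to arrive */
	irq = probe_irq_off(mask);	/* >0 = the line, 0 = none, <0 = ambiguous */

	if (irq <= 0)
		printk(KERN_WARNING "mydrv: IRQ autodetect failed (%d)\n", irq);
	return irq;
}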
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
new file mode 100644
index 000000000000..6f0ebc254e3e
--- /dev/null
+++ b/kernel/irq/handle.c
@@ -0,0 +1,204 @@
+/*
+ * linux/kernel/irq/handle.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the core interrupt handling code.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+
+#include "internals.h"
+
+/*
+ * Linux has a controller-independent interrupt architecture.
+ * Every controller has a 'controller-template', that is used
+ * by the main code to do the right thing. Each driver-visible
+ * interrupt source is transparently wired to the appropriate
+ * controller. Thus drivers need not be aware of the
+ * interrupt-controller.
+ *
+ * The code is designed to be easily extended with new/different
+ * interrupt controllers, without having to do assembly magic or
+ * having to touch the generic code.
+ *
+ * Controller mappings for all interrupt sources:
+ */
+irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
+ [0 ... NR_IRQS-1] = {
+ .handler = &no_irq_type,
+ .lock = SPIN_LOCK_UNLOCKED
+ }
+};
+
+/*
+ * Generic 'no controller' code
+ */
+static void end_none(unsigned int irq) { }
+static void enable_none(unsigned int irq) { }
+static void disable_none(unsigned int irq) { }
+static void shutdown_none(unsigned int irq) { }
+static unsigned int startup_none(unsigned int irq) { return 0; }
+
+static void ack_none(unsigned int irq)
+{
+ /*
+ * 'what should we do if we get a hw irq event on an illegal vector'.
+ * each architecture has to answer this themself.
+ */
+ ack_bad_irq(irq);
+}
+
+struct hw_interrupt_type no_irq_type = {
+ .typename = "none",
+ .startup = startup_none,
+ .shutdown = shutdown_none,
+ .enable = enable_none,
+ .disable = disable_none,
+ .ack = ack_none,
+ .end = end_none,
+ .set_affinity = NULL
+};
+
+/*
+ * Special, empty irq handler:
+ */
+irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs)
+{
+ return IRQ_NONE;
+}
+
+/*
+ * Exit an interrupt context. Process softirqs if needed and possible:
+ */
+void irq_exit(void)
+{
+ preempt_count() -= IRQ_EXIT_OFFSET;
+ if (!in_interrupt() && local_softirq_pending())
+ do_softirq();
+ preempt_enable_no_resched();
+}
+
+/*
+ * Have got an event to handle:
+ */
+asmlinkage int handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
+ struct irqaction *action)
+{
+ int ret, retval = 0, status = 0;
+
+ if (!(action->flags & SA_INTERRUPT))
+ local_irq_enable();
+
+ do {
+ ret = action->handler(irq, action->dev_id, regs);
+ if (ret == IRQ_HANDLED)
+ status |= action->flags;
+ retval |= ret;
+ action = action->next;
+ } while (action);
+
+ if (status & SA_SAMPLE_RANDOM)
+ add_interrupt_randomness(irq);
+ local_irq_disable();
+
+ return retval;
+}
+
+/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ */
+asmlinkage unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
+{
+ irq_desc_t *desc = irq_desc + irq;
+ struct irqaction * action;
+ unsigned int status;
+
+ kstat_this_cpu.irqs[irq]++;
+ if (desc->status & IRQ_PER_CPU) {
+ irqreturn_t action_ret;
+
+ /*
+ * No locking required for CPU-local interrupts:
+ */
+ desc->handler->ack(irq);
+ action_ret = handle_IRQ_event(irq, regs, desc->action);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret);
+ desc->handler->end(irq);
+ return 1;
+ }
+
+ spin_lock(&desc->lock);
+ desc->handler->ack(irq);
+ /*
+ * REPLAY is when Linux resends an IRQ that was dropped earlier
+ * WAITING is used by probe to mark irqs that are being tested
+ */
+ status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
+ status |= IRQ_PENDING; /* we _want_ to handle it */
+
+ /*
+ * If the IRQ is disabled for whatever reason, we cannot
+ * use the action we have.
+ */
+ action = NULL;
+ if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) {
+ action = desc->action;
+ status &= ~IRQ_PENDING; /* we commit to handling */
+ status |= IRQ_INPROGRESS; /* we are handling it */
+ }
+ desc->status = status;
+
+ /*
+ * If there is no IRQ handler or it was disabled, exit early.
+ * Since we set PENDING, if another processor is handling
+ * a different instance of this same irq, the other processor
+ * will take care of it.
+ */
+ if (unlikely(!action))
+ goto out;
+
+ /*
+ * Edge triggered interrupts need to remember
+ * pending events.
+ * This applies to any hw interrupts that allow a second
+ * instance of the same irq to arrive while we are in do_IRQ
+ * or in the handler. But the code here only handles the _second_
+ * instance of the irq, not the third or fourth. So it is mostly
+ * useful for irq hardware that does not mask cleanly in an
+ * SMP environment.
+ */
+ for (;;) {
+ irqreturn_t action_ret;
+
+ spin_unlock(&desc->lock);
+
+ action_ret = handle_IRQ_event(irq, regs, action);
+
+ spin_lock(&desc->lock);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret);
+ if (likely(!(desc->status & IRQ_PENDING)))
+ break;
+ desc->status &= ~IRQ_PENDING;
+ }
+ desc->status &= ~IRQ_INPROGRESS;
+
+out:
+ /*
+ * The ->end() handler has to deal with interrupts which got
+ * disabled while the handler was running.
+ */
+ desc->handler->end(irq);
+ spin_unlock(&desc->lock);
+
+ return 1;
+}
+
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
new file mode 100644
index 000000000000..46feba630266
--- /dev/null
+++ b/kernel/irq/internals.h
@@ -0,0 +1,18 @@
+/*
+ * IRQ subsystem internal functions and variables:
+ */
+
+extern int noirqdebug;
+
+#ifdef CONFIG_PROC_FS
+extern void register_irq_proc(unsigned int irq);
+extern void register_handler_proc(unsigned int irq, struct irqaction *action);
+extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
+#else
+static inline void register_irq_proc(unsigned int irq) { }
+static inline void register_handler_proc(unsigned int irq,
+ struct irqaction *action) { }
+static inline void unregister_handler_proc(unsigned int irq,
+ struct irqaction *action) { }
+#endif
+
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
new file mode 100644
index 000000000000..a164fe243338
--- /dev/null
+++ b/kernel/irq/manage.c
@@ -0,0 +1,347 @@
+/*
+ * linux/kernel/irq/manage.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains driver APIs to the irq subsystem.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/interrupt.h>
+
+#include "internals.h"
+
+#ifdef CONFIG_SMP
+
+/**
+ * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
+ *
+ * This function waits for any pending IRQ handlers for this interrupt
+ * to complete before returning. If you use this function while
+ * holding a resource the IRQ handler may need you will deadlock.
+ *
+ * This function may be called - with care - from IRQ context.
+ */
+void synchronize_irq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_desc + irq;
+
+ while (desc->status & IRQ_INPROGRESS)
+ cpu_relax();
+}
+
+EXPORT_SYMBOL(synchronize_irq);
+
+#endif
+
+/**
+ * disable_irq_nosync - disable an irq without waiting
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Disables and Enables are
+ * nested.
+ * Unlike disable_irq(), this function does not ensure existing
+ * instances of the IRQ handler have completed before returning.
+ *
+ * This function may be called from IRQ context.
+ */
+void disable_irq_nosync(unsigned int irq)
+{
+ irq_desc_t *desc = irq_desc + irq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ if (!desc->depth++) {
+ desc->status |= IRQ_DISABLED;
+ desc->handler->disable(irq);
+ }
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+EXPORT_SYMBOL(disable_irq_nosync);
+
+/**
+ * disable_irq - disable an irq and wait for completion
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Enables and Disables are
+ * nested.
+ * This function waits for any pending IRQ handlers for this interrupt
+ * to complete before returning. If you use this function while
+ * holding a resource the IRQ handler may need you will deadlock.
+ *
+ * This function may be called - with care - from IRQ context.
+ */
+void disable_irq(unsigned int irq)
+{
+ irq_desc_t *desc = irq_desc + irq;
+
+ disable_irq_nosync(irq);
+ if (desc->action)
+ synchronize_irq(irq);
+}
+
+EXPORT_SYMBOL(disable_irq);
+
+/**
+ * enable_irq - enable handling of an irq
+ * @irq: Interrupt to enable
+ *
+ * Undoes the effect of one call to disable_irq(). If this
+ * matches the last disable, processing of interrupts on this
+ * IRQ line is re-enabled.
+ *
+ * This function may be called from IRQ context.
+ */
+void enable_irq(unsigned int irq)
+{
+ irq_desc_t *desc = irq_desc + irq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ switch (desc->depth) {
+ case 0:
+ WARN_ON(1);
+ break;
+ case 1: {
+ unsigned int status = desc->status & ~IRQ_DISABLED;
+
+ desc->status = status;
+ if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
+ desc->status = status | IRQ_REPLAY;
+ hw_resend_irq(desc->handler,irq);
+ }
+ desc->handler->enable(irq);
+ /* fall-through */
+ }
+ default:
+ desc->depth--;
+ }
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+EXPORT_SYMBOL(enable_irq);
+
+/*
+ * Internal function that tells the architecture code whether a
+ * particular irq has been exclusively allocated or is available
+ * for driver use.
+ */
+int can_request_irq(unsigned int irq, unsigned long irqflags)
+{
+ struct irqaction *action;
+
+ if (irq >= NR_IRQS)
+ return 0;
+
+ action = irq_desc[irq].action;
+ if (action)
+ if (irqflags & action->flags & SA_SHIRQ)
+ action = NULL;
+
+ return !action;
+}
+
+/*
+ * Internal function to register an irqaction - typically used to
+ * allocate special interrupts that are part of the architecture.
+ */
+int setup_irq(unsigned int irq, struct irqaction * new)
+{
+ struct irq_desc *desc = irq_desc + irq;
+ struct irqaction *old, **p;
+ unsigned long flags;
+ int shared = 0;
+
+ if (desc->handler == &no_irq_type)
+ return -ENOSYS;
+ /*
+ * Some drivers like serial.c use request_irq() heavily,
+ * so we have to be careful not to interfere with a
+ * running system.
+ */
+ if (new->flags & SA_SAMPLE_RANDOM) {
+ /*
+ * This function might sleep, we want to call it first,
+ * outside of the atomic block.
+ * Yes, this might clear the entropy pool if the wrong
+ * driver is attempted to be loaded, without actually
+ * installing a new handler, but is this really a problem,
+ * only the sysadmin is able to do this.
+ */
+ rand_initialize_irq(irq);
+ }
+
+ /*
+ * The following block of code has to be executed atomically
+ */
+ spin_lock_irqsave(&desc->lock,flags);
+ p = &desc->action;
+ if ((old = *p) != NULL) {
+ /* Can't share interrupts unless both agree to */
+ if (!(old->flags & new->flags & SA_SHIRQ)) {
+ spin_unlock_irqrestore(&desc->lock,flags);
+ return -EBUSY;
+ }
+
+ /* add new interrupt at end of irq queue */
+ do {
+ p = &old->next;
+ old = *p;
+ } while (old);
+ shared = 1;
+ }
+
+ *p = new;
+
+ if (!shared) {
+ desc->depth = 0;
+ desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT |
+ IRQ_WAITING | IRQ_INPROGRESS);
+ if (desc->handler->startup)
+ desc->handler->startup(irq);
+ else
+ desc->handler->enable(irq);
+ }
+ spin_unlock_irqrestore(&desc->lock,flags);
+
+ new->irq = irq;
+ register_irq_proc(irq);
+ new->dir = NULL;
+ register_handler_proc(irq, new);
+
+ return 0;
+}
+
+/**
+ * free_irq - free an interrupt
+ * @irq: Interrupt line to free
+ * @dev_id: Device identity to free
+ *
+ * Remove an interrupt handler. The handler is removed and if the
+ * interrupt line is no longer in use by any driver it is disabled.
+ * On a shared IRQ the caller must ensure the interrupt is disabled
+ * on the card it drives before calling this function. The function
+ * does not return until any executing interrupts for this IRQ
+ * have completed.
+ *
+ * This function must not be called from interrupt context.
+ */
+void free_irq(unsigned int irq, void *dev_id)
+{
+ struct irq_desc *desc;
+ struct irqaction **p;
+ unsigned long flags;
+
+ if (irq >= NR_IRQS)
+ return;
+
+ desc = irq_desc + irq;
+ spin_lock_irqsave(&desc->lock,flags);
+ p = &desc->action;
+ for (;;) {
+ struct irqaction * action = *p;
+
+ if (action) {
+ struct irqaction **pp = p;
+
+ p = &action->next;
+ if (action->dev_id != dev_id)
+ continue;
+
+ /* Found it - now remove it from the list of entries */
+ *pp = action->next;
+ if (!desc->action) {
+ desc->status |= IRQ_DISABLED;
+ if (desc->handler->shutdown)
+ desc->handler->shutdown(irq);
+ else
+ desc->handler->disable(irq);
+ }
+ spin_unlock_irqrestore(&desc->lock,flags);
+ unregister_handler_proc(irq, action);
+
+ /* Make sure it's not being used on another CPU */
+ synchronize_irq(irq);
+ kfree(action);
+ return;
+ }
+ printk(KERN_ERR "Trying to free free IRQ%d\n",irq);
+ spin_unlock_irqrestore(&desc->lock,flags);
+ return;
+ }
+}
+
+EXPORT_SYMBOL(free_irq);
+
+/**
+ * request_irq - allocate an interrupt line
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs
+ * @irqflags: Interrupt type flags
+ * @devname: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources and enables the
+ * interrupt line and IRQ handling. From the point this
+ * call is made your handler function may be invoked. Since
+ * your handler function must clear any interrupt the board
+ * raises, you must take care both to initialise your hardware
+ * and to set up the interrupt handler in the right order.
+ *
+ * Dev_id must be globally unique. Normally the address of the
+ * device data structure is used as the cookie. Since the handler
+ * receives this value it makes sense to use it.
+ *
+ * If your interrupt is shared you must pass a non NULL dev_id
+ * as this is required when freeing the interrupt.
+ *
+ * Flags:
+ *
+ * SA_SHIRQ Interrupt is shared
+ * SA_INTERRUPT Disable local interrupts while processing
+ * SA_SAMPLE_RANDOM The interrupt can be used for entropy
+ *
+ */
+int request_irq(unsigned int irq,
+ irqreturn_t (*handler)(int, void *, struct pt_regs *),
+ unsigned long irqflags, const char * devname, void *dev_id)
+{
+ struct irqaction * action;
+ int retval;
+
+ /*
+ * Sanity-check: shared interrupts must pass in a real dev-ID,
+ * otherwise we'll have trouble later trying to figure out
+ * which interrupt is which (messes up the interrupt freeing
+ * logic etc).
+ */
+ if ((irqflags & SA_SHIRQ) && !dev_id)
+ return -EINVAL;
+ if (irq >= NR_IRQS)
+ return -EINVAL;
+ if (!handler)
+ return -EINVAL;
+
+ action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC);
+ if (!action)
+ return -ENOMEM;
+
+ action->handler = handler;
+ action->flags = irqflags;
+ cpus_clear(action->mask);
+ action->name = devname;
+ action->next = NULL;
+ action->dev_id = dev_id;
+
+ retval = setup_irq(irq, action);
+ if (retval)
+ kfree(action);
+
+ return retval;
+}
+
+EXPORT_SYMBOL(request_irq);
+
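The manage.c file above carries the driver-facing request_irq()/free_irq() API. A minimal sketch of a driver using it as documented (not from the patch), with the era's handler signature and SA_SHIRQ flag; mydrv_interrupt(), mydrv_irq_pending() and mydrv_irq_cookie are hypothetical names:

#include <linux/interrupt.h>

static int mydrv_irq_cookie;	/* any unique non-NULL pointer works as dev_id */

static irqreturn_t mydrv_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
	/*
	 * On a shared line every registered handler runs, so check the
	 * device's own status and bail out if the interrupt wasn't ours.
	 * mydrv_irq_pending() is hypothetical.
	 */
	if (!mydrv_irq_pending())
		return IRQ_NONE;
	/* acknowledge and handle the hardware event here */
	return IRQ_HANDLED;
}

static int mydrv_setup_irq(unsigned int irq)
{
	/* SA_SHIRQ: share the line; dev_id must be non-NULL and unique */
	return request_irq(irq, mydrv_interrupt, SA_SHIRQ, "mydrv",
			   &mydrv_irq_cookie);
}

/* teardown: free_irq(irq, &mydrv_irq_cookie); waits for running handlers */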
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
new file mode 100644
index 000000000000..2ddbe8404c9c
--- /dev/null
+++ b/kernel/irq/proc.c
@@ -0,0 +1,156 @@
+/*
+ * linux/kernel/irq/proc.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the /proc/irq/ handling code.
+ */
+
+#include <linux/irq.h>
+#include <linux/proc_fs.h>
+#include <linux/interrupt.h>
+
+static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
+
+#ifdef CONFIG_SMP
+
+/*
+ * The /proc/irq/<irq>/smp_affinity values:
+ */
+static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
+
+cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
+
+static int irq_affinity_read_proc(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]);
+
+ if (count - len < 2)
+ return -EINVAL;
+ len += sprintf(page + len, "\n");
+ return len;
+}
+
+int no_irq_affinity;
+static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
+ unsigned long count, void *data)
+{
+ unsigned int irq = (int)(long)data, full_count = count, err;
+ cpumask_t new_value, tmp;
+
+ if (!irq_desc[irq].handler->set_affinity || no_irq_affinity)
+ return -EIO;
+
+ err = cpumask_parse(buffer, count, new_value);
+ if (err)
+ return err;
+
+ /*
+ * Do not allow disabling IRQs completely - it's a too easy
+ * way to make the system unusable accidentally :-) At least
+ * one online CPU still has to be targeted.
+ */
+ cpus_and(tmp, new_value, cpu_online_map);
+ if (cpus_empty(tmp))
+ return -EINVAL;
+
+ irq_affinity[irq] = new_value;
+ irq_desc[irq].handler->set_affinity(irq,
+ cpumask_of_cpu(first_cpu(new_value)));
+
+ return full_count;
+}
+
+#endif
+
+#define MAX_NAMELEN 128
+
+static int name_unique(unsigned int irq, struct irqaction *new_action)
+{
+ struct irq_desc *desc = irq_desc + irq;
+ struct irqaction *action;
+
+ for (action = desc->action ; action; action = action->next)
+ if ((action != new_action) && action->name &&
+ !strcmp(new_action->name, action->name))
+ return 0;
+ return 1;
+}
+
+void register_handler_proc(unsigned int irq, struct irqaction *action)
+{
+ char name [MAX_NAMELEN];
+
+ if (!irq_dir[irq] || action->dir || !action->name ||
+ !name_unique(irq, action))
+ return;
+
+ memset(name, 0, MAX_NAMELEN);
+ snprintf(name, MAX_NAMELEN, "%s", action->name);
+
+ /* create /proc/irq/1234/handler/ */
+ action->dir = proc_mkdir(name, irq_dir[irq]);
+}
+
+#undef MAX_NAMELEN
+
+#define MAX_NAMELEN 10
+
+void register_irq_proc(unsigned int irq)
+{
+ char name [MAX_NAMELEN];
+
+ if (!root_irq_dir ||
+ (irq_desc[irq].handler == &no_irq_type) ||
+ irq_dir[irq])
+ return;
+
+ memset(name, 0, MAX_NAMELEN);
+ sprintf(name, "%d", irq);
+
+ /* create /proc/irq/1234 */
+ irq_dir[irq] = proc_mkdir(name, root_irq_dir);
+
+#ifdef CONFIG_SMP
+ {
+ struct proc_dir_entry *entry;
+
+ /* create /proc/irq/<irq>/smp_affinity */
+ entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]);
+
+ if (entry) {
+ entry->nlink = 1;
+ entry->data = (void *)(long)irq;
+ entry->read_proc = irq_affinity_read_proc;
+ entry->write_proc = irq_affinity_write_proc;
+ }
+ smp_affinity_entry[irq] = entry;
+ }
+#endif
+}
+
+#undef MAX_NAMELEN
+
+void unregister_handler_proc(unsigned int irq, struct irqaction *action)
+{
+ if (action->dir)
+ remove_proc_entry(action->dir->name, irq_dir[irq]);
+}
+
+void init_irq_proc(void)
+{
+ int i;
+
+ /* create /proc/irq */
+ root_irq_dir = proc_mkdir("irq", NULL);
+ if (!root_irq_dir)
+ return;
+
+ /*
+ * Create entries for all existing IRQs.
+ */
+ for (i = 0; i < NR_IRQS; i++)
+ register_irq_proc(i);
+}
+
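The proc.c file above exposes /proc/irq/<n>/smp_affinity, and its write handler rejects any mask with no online CPU. A small user-space sketch of pinning an interrupt (not from the patch); IRQ 19 and CPU 0 are arbitrary illustration values:

#include <stdio.h>

int main(void)
{
	/* Hex bitmask of allowed CPUs: 1 == CPU 0 only. */
	FILE *f = fopen("/proc/irq/19/smp_affinity", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "1\n");		/* cpumask_parse() expects a hex mask */
	if (fclose(f) != 0) {		/* an empty/offline mask is rejected */
		perror("fclose");
		return 1;
	}
	return 0;
}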
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
new file mode 100644
index 000000000000..f6297c306905
--- /dev/null
+++ b/kernel/irq/spurious.c
@@ -0,0 +1,96 @@
+/*
+ * linux/kernel/irq/spurious.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains spurious interrupt handling.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/interrupt.h>
+
+/*
+ * If 99,900 of the previous 100,000 interrupts have not been handled
+ * then assume that the IRQ is stuck in some manner. Drop a diagnostic
+ * and try to turn the IRQ off.
+ *
+ * (The other 100-of-100,000 interrupts may have been a correctly
+ * functioning device sharing an IRQ with the failing one)
+ *
+ * Called under desc->lock
+ */
+
+static void
+__report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
+{
+ struct irqaction *action;
+
+ if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
+ printk(KERN_ERR "irq event %d: bogus return value %x\n",
+ irq, action_ret);
+ } else {
+ printk(KERN_ERR "irq %d: nobody cared!\n", irq);
+ }
+ dump_stack();
+ printk(KERN_ERR "handlers:\n");
+ action = desc->action;
+ while (action) {
+ printk(KERN_ERR "[<%p>]", action->handler);
+ print_symbol(" (%s)",
+ (unsigned long)action->handler);
+ printk("\n");
+ action = action->next;
+ }
+}
+
+void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
+{
+ static int count = 100;
+
+ if (count > 0) {
+ count--;
+ __report_bad_irq(irq, desc, action_ret);
+ }
+}
+
+void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
+{
+ if (action_ret != IRQ_HANDLED) {
+ desc->irqs_unhandled++;
+ if (action_ret != IRQ_NONE)
+ report_bad_irq(irq, desc, action_ret);
+ }
+
+ desc->irq_count++;
+ if (desc->irq_count < 100000)
+ return;
+
+ desc->irq_count = 0;
+ if (desc->irqs_unhandled > 99900) {
+ /*
+ * The interrupt is stuck
+ */
+ __report_bad_irq(irq, desc, action_ret);
+ /*
+ * Now kill the IRQ
+ */
+ printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
+ desc->status |= IRQ_DISABLED;
+ desc->handler->disable(irq);
+ }
+ desc->irqs_unhandled = 0;
+}
+
+int noirqdebug;
+
+int __init noirqdebug_setup(char *str)
+{
+ noirqdebug = 1;
+ printk(KERN_INFO "IRQ lockup detection disabled\n");
+ return 1;
+}
+
+__setup("noirqdebug", noirqdebug_setup);
+
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 6918cb7460a8..95fbf1c6becf 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -9,6 +9,7 @@
#include <linux/mm.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
+#include <linux/syscalls.h>
#include <linux/time.h>
#include <asm/uaccess.h>
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 01436a31c690..ca4e28b4c6a7 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -25,6 +25,8 @@
* hlists and exceptions notifier as suggested by Andi Kleen.
* 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
* interface to access function arguments.
+ * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes
+ * exceptions notifier to be first on the priority list.
*/
#include <linux/kprobes.h>
#include <linux/spinlock.h>
@@ -108,6 +110,7 @@ void unregister_kprobe(struct kprobe *p)
static struct notifier_block kprobe_exceptions_nb = {
.notifier_call = kprobe_exceptions_notify,
+ .priority = 0x7fffffff /* we need to be notified first */
};
int register_jprobe(struct jprobe *jp)
diff --git a/kernel/panic.c b/kernel/panic.c
index b3abe97f88a6..fce7f4030d0a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -16,7 +16,6 @@
#include <linux/notifier.h>
#include <linux/init.h>
#include <linux/sysrq.h>
-#include <linux/syscalls.h>
#include <linux/interrupt.h>
#include <linux/nmi.h>
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index ef5c42101748..c2dc4f89da41 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -43,6 +43,7 @@
#include <linux/compiler.h>
#include <linux/idr.h>
#include <linux/posix-timers.h>
+#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
@@ -219,11 +220,6 @@ static __init int init_posix_timers(void)
.clock_set = do_posix_clock_monotonic_settime
};
-#ifdef CONFIG_TIME_INTERPOLATION
- /* Clocks are more accurate with time interpolators */
- clock_realtime.res = clock_monotonic.res = time_interpolator_resolution();
-#endif
-
register_posix_clock(CLOCK_REALTIME, &clock_realtime);
register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 00b390d7a5ad..7ff375e7c95f 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -11,7 +11,9 @@
static int new_loglevel = 10;
static int orig_loglevel;
+#ifdef SUSPEND_CONSOLE
static int orig_fgconsole, orig_kmsg;
+#endif
int pm_prepare_console(void)
{
@@ -50,6 +52,7 @@ void pm_restore_console(void)
acquire_console_sem();
set_console(orig_fgconsole);
release_console_sem();
+ kmsg_redirect = orig_kmsg;
#endif
return;
}
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 312aa169c566..4cc81e03085c 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -85,10 +85,20 @@ static int in_suspend __nosavedata = 0;
static void free_some_memory(void)
{
- printk("Freeing memory: ");
- while (shrink_all_memory(10000))
- printk(".");
- printk("|\n");
+ unsigned int i = 0;
+ unsigned int tmp;
+ unsigned long pages = 0;
+ char *p = "-\\|/";
+
+ printk("Freeing memory... ");
+ while ((tmp = shrink_all_memory(10000))) {
+ pages += tmp;
+ printk("\b%c", p[i]);
+ i++;
+ if (i > 3)
+ i = 0;
+ }
+ printk("\bdone (%li pages freed)\n", pages);
}
diff --git a/kernel/power/process.c b/kernel/power/process.c
index bda013de59a5..78d92dc6a1ed 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -23,8 +23,8 @@ static inline int freezeable(struct task_struct * p)
{
if ((p == current) ||
(p->flags & PF_NOFREEZE) ||
- (p->state == TASK_ZOMBIE) ||
- (p->state == TASK_DEAD) ||
+ (p->exit_state == EXIT_ZOMBIE) ||
+ (p->exit_state == EXIT_DEAD) ||
(p->state == TASK_STOPPED) ||
(p->state == TASK_TRACED))
return 0;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index f65d295478c4..375299c86c45 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -294,15 +294,19 @@ static int data_write(void)
{
int error = 0;
int i;
+ unsigned int mod = nr_copy_pages / 100;
- printk( "Writing data to swap (%d pages): ", nr_copy_pages );
+ if (!mod)
+ mod = 1;
+
+ printk( "Writing data to swap (%d pages)... ", nr_copy_pages );
for (i = 0; i < nr_copy_pages && !error; i++) {
- if (!(i%100))
- printk( "." );
+ if (!(i%mod))
+ printk( "\b\b\b\b%3d%%", i / mod );
error = write_page((pagedir_nosave+i)->address,
&((pagedir_nosave+i)->swap_address));
}
- printk(" %d Pages done.\n",i);
+ printk("\b\b\b\bdone\n");
return error;
}
@@ -856,7 +860,9 @@ int swsusp_suspend(void)
local_irq_disable();
save_processor_state();
error = swsusp_arch_suspend();
+ /* Restore control flow magically appears here */
restore_processor_state();
+ restore_highmem();
local_irq_enable();
return error;
}
@@ -876,8 +882,13 @@ int swsusp_resume(void)
{
int error;
local_irq_disable();
+ /* We'll ignore saved state, but this gets preempt count (etc) right */
save_processor_state();
error = swsusp_arch_resume();
+ /* Code below is only ever reached in case of failure. Otherwise
+ * execution continues at place where swsusp_arch_suspend was called
+ */
+ BUG_ON(!error);
restore_processor_state();
restore_highmem();
local_irq_enable();
@@ -1134,14 +1145,18 @@ static int __init data_read(void)
struct pbe * p;
int error;
int i;
+ int mod = nr_copy_pages / 100;
+
+ if (!mod)
+ mod = 1;
if ((error = swsusp_pagedir_relocate()))
return error;
- printk( "Reading image data (%d pages): ", nr_copy_pages );
+ printk( "Reading image data (%d pages): ", nr_copy_pages );
for(i = 0, p = pagedir_nosave; i < nr_copy_pages && !error; i++, p++) {
- if (!(i%100))
- printk( "." );
+ if (!(i%mod))
+ printk( "\b\b\b\b%3d%%", i / mod );
error = bio_read_page(swp_offset(p->swap_address),
(void *)p->address);
}
diff --git a/kernel/printk.c b/kernel/printk.c
index 18dba205c171..c02ec626f384 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -30,6 +30,7 @@
#include <linux/smp.h>
#include <linux/security.h>
#include <linux/bootmem.h>
+#include <linux/syscalls.h>
#include <asm/uaccess.h>
diff --git a/kernel/profile.c b/kernel/profile.c
index cab14764cea0..1c4375fad923 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -22,14 +22,14 @@ static int __init profile_setup(char * str)
int par;
if (!strncmp(str, "schedule", 8)) {
- prof_on = 2;
+ prof_on = SCHED_PROFILING;
printk(KERN_INFO "kernel schedule profiling enabled\n");
if (str[7] == ',')
str += 8;
}
if (get_option(&str,&par)) {
prof_shift = par;
- prof_on = 1;
+ prof_on = CPU_PROFILING;
printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n",
prof_shift);
}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index b14b4a467729..c3ac348e2368 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -82,7 +82,8 @@ int ptrace_check_attach(struct task_struct *child, int kill)
*/
read_lock(&tasklist_lock);
if ((child->ptrace & PT_PTRACED) && child->parent == current &&
- child->signal != NULL) {
+ (!(child->ptrace & PT_ATTACHED) || child->real_parent != current)
+ && child->signal != NULL) {
ret = 0;
spin_lock_irq(&child->sighand->siglock);
if (child->state == TASK_STOPPED) {
@@ -131,7 +132,7 @@ int ptrace_attach(struct task_struct *task)
goto bad;
/* Go */
- task->ptrace |= PT_PTRACED;
+ task->ptrace |= PT_PTRACED | PT_ATTACHED;
if (capable(CAP_SYS_PTRACE))
task->ptrace |= PT_PTRACE_CAP;
task_unlock(task);
@@ -162,7 +163,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
write_lock_irq(&tasklist_lock);
__ptrace_unlink(child);
/* .. and wake it up. */
- if (child->state != TASK_ZOMBIE)
+ if (child->exit_state != EXIT_ZOMBIE)
wake_up_process(child);
write_unlock_irq(&tasklist_lock);
diff --git a/kernel/sched.c b/kernel/sched.c
index 9b395e8d616d..8e5e2af64509 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -42,6 +42,7 @@
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/seq_file.h>
+#include <linux/syscalls.h>
#include <linux/times.h>
#include <asm/tlb.h>
@@ -180,17 +181,8 @@ static unsigned int task_timeslice(task_t *p)
else
return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
}
-#define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time)
-
-enum idle_type
-{
- IDLE,
- NOT_IDLE,
- NEWLY_IDLE,
- MAX_IDLE_TYPES
-};
-
-struct sched_domain;
+#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
+ < (long long) (sd)->cache_hot_time)
/*
* These are the runqueue data structures:
@@ -289,140 +281,6 @@ struct runqueue {
static DEFINE_PER_CPU(struct runqueue, runqueues);
-/*
- * sched-domains (multiprocessor balancing) declarations:
- */
-#ifdef CONFIG_SMP
-#define SCHED_LOAD_SCALE 128UL /* increase resolution of load */
-
-#define SD_BALANCE_NEWIDLE 1 /* Balance when about to become idle */
-#define SD_BALANCE_EXEC 2 /* Balance on exec */
-#define SD_WAKE_IDLE 4 /* Wake to idle CPU on task wakeup */
-#define SD_WAKE_AFFINE 8 /* Wake task to waking CPU */
-#define SD_WAKE_BALANCE 16 /* Perform balancing at task wakeup */
-#define SD_SHARE_CPUPOWER 32 /* Domain members share cpu power */
-
-struct sched_group {
- struct sched_group *next; /* Must be a circular list */
- cpumask_t cpumask;
-
- /*
- * CPU power of this group, SCHED_LOAD_SCALE being max power for a
- * single CPU. This should be read only (except for setup). Although
- * it will need to be written to at cpu hot(un)plug time, perhaps the
- * cpucontrol semaphore will provide enough exclusion?
- */
- unsigned long cpu_power;
-};
-
-struct sched_domain {
- /* These fields must be setup */
- struct sched_domain *parent; /* top domain must be null terminated */
- struct sched_group *groups; /* the balancing groups of the domain */
- cpumask_t span; /* span of all CPUs in this domain */
- unsigned long min_interval; /* Minimum balance interval ms */
- unsigned long max_interval; /* Maximum balance interval ms */
- unsigned int busy_factor; /* less balancing by factor if busy */
- unsigned int imbalance_pct; /* No balance until over watermark */
- unsigned long long cache_hot_time; /* Task considered cache hot (ns) */
- unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
- unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */
- int flags; /* See SD_* */
-
- /* Runtime fields. */
- unsigned long last_balance; /* init to jiffies. units in jiffies */
- unsigned int balance_interval; /* initialise to 1. units in ms. */
- unsigned int nr_balance_failed; /* initialise to 0 */
-
-#ifdef CONFIG_SCHEDSTATS
- /* load_balance() stats */
- unsigned long lb_cnt[MAX_IDLE_TYPES];
- unsigned long lb_failed[MAX_IDLE_TYPES];
- unsigned long lb_imbalance[MAX_IDLE_TYPES];
- unsigned long lb_nobusyg[MAX_IDLE_TYPES];
- unsigned long lb_nobusyq[MAX_IDLE_TYPES];
-
- /* sched_balance_exec() stats */
- unsigned long sbe_attempts;
- unsigned long sbe_pushed;
-
- /* try_to_wake_up() stats */
- unsigned long ttwu_wake_affine;
- unsigned long ttwu_wake_balance;
-#endif
-};
-
-#ifndef ARCH_HAS_SCHED_TUNE
-#ifdef CONFIG_SCHED_SMT
-#define ARCH_HAS_SCHED_WAKE_IDLE
-/* Common values for SMT siblings */
-#define SD_SIBLING_INIT (struct sched_domain) { \
- .span = CPU_MASK_NONE, \
- .parent = NULL, \
- .groups = NULL, \
- .min_interval = 1, \
- .max_interval = 2, \
- .busy_factor = 8, \
- .imbalance_pct = 110, \
- .cache_hot_time = 0, \
- .cache_nice_tries = 0, \
- .per_cpu_gain = 25, \
- .flags = SD_BALANCE_NEWIDLE \
- | SD_BALANCE_EXEC \
- | SD_WAKE_AFFINE \
- | SD_WAKE_IDLE \
- | SD_SHARE_CPUPOWER, \
- .last_balance = jiffies, \
- .balance_interval = 1, \
- .nr_balance_failed = 0, \
-}
-#endif
-
-/* Common values for CPUs */
-#define SD_CPU_INIT (struct sched_domain) { \
- .span = CPU_MASK_NONE, \
- .parent = NULL, \
- .groups = NULL, \
- .min_interval = 1, \
- .max_interval = 4, \
- .busy_factor = 64, \
- .imbalance_pct = 125, \
- .cache_hot_time = (5*1000000/2), \
- .cache_nice_tries = 1, \
- .per_cpu_gain = 100, \
- .flags = SD_BALANCE_NEWIDLE \
- | SD_BALANCE_EXEC \
- | SD_WAKE_AFFINE \
- | SD_WAKE_BALANCE, \
- .last_balance = jiffies, \
- .balance_interval = 1, \
- .nr_balance_failed = 0, \
-}
-
-/* Arch can override this macro in processor.h */
-#if defined(CONFIG_NUMA) && !defined(SD_NODE_INIT)
-#define SD_NODE_INIT (struct sched_domain) { \
- .span = CPU_MASK_NONE, \
- .parent = NULL, \
- .groups = NULL, \
- .min_interval = 8, \
- .max_interval = 32, \
- .busy_factor = 32, \
- .imbalance_pct = 125, \
- .cache_hot_time = (10*1000000), \
- .cache_nice_tries = 1, \
- .per_cpu_gain = 100, \
- .flags = SD_BALANCE_EXEC \
- | SD_WAKE_BALANCE, \
- .last_balance = jiffies, \
- .balance_interval = 1, \
- .nr_balance_failed = 0, \
-}
-#endif
-#endif /* ARCH_HAS_SCHED_TUNE */
-#endif
-
-
#define for_each_domain(cpu, domain) \
for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
@@ -501,7 +359,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
rq->smt_cnt, rq->sbe_cnt, rq->rq_sched_info.cpu_time,
rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
- for (itype = IDLE; itype < MAX_IDLE_TYPES; itype++)
+ for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++)
seq_printf(seq, " %lu %lu", rq->pt_gained[itype],
rq->pt_lost[itype]);
seq_printf(seq, "\n");
@@ -513,7 +371,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
seq_printf(seq, "domain%d %s", dcnt++, mask_str);
- for (itype = IDLE; itype < MAX_IDLE_TYPES; itype++) {
+ for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
+ itype++) {
seq_printf(seq, " %lu %lu %lu %lu %lu",
sd->lb_cnt[itype],
sd->lb_failed[itype],
@@ -1087,8 +946,7 @@ static int wake_idle(int cpu, task_t *p)
if (!(sd->flags & SD_WAKE_IDLE))
return cpu;
- cpus_and(tmp, sd->span, cpu_online_map);
- cpus_and(tmp, tmp, p->cpus_allowed);
+ cpus_and(tmp, sd->span, p->cpus_allowed);
for_each_cpu_mask(i, tmp) {
if (idle_cpu(i))
@@ -1123,14 +981,14 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
int cpu, this_cpu, success = 0;
unsigned long flags;
long old_state;
- runqueue_t *rq;
+ runqueue_t *rq, *old_rq;
#ifdef CONFIG_SMP
unsigned long load, this_load;
struct sched_domain *sd;
int new_cpu;
#endif
- rq = task_rq_lock(p, &flags);
+ old_rq = rq = task_rq_lock(p, &flags);
schedstat_inc(rq, ttwu_cnt);
old_state = p->state;
if (!(old_state & state))
@@ -1225,7 +1083,7 @@ out_set_cpu:
out_activate:
#endif /* CONFIG_SMP */
if (old_state == TASK_UNINTERRUPTIBLE) {
- rq->nr_uninterruptible--;
+ old_rq->nr_uninterruptible--;
/*
* Tasks on involuntary sleep don't earn
* sleep_avg beyond just interactive state.
@@ -1471,10 +1329,10 @@ static void finish_task_switch(task_t *prev)
/*
* A task struct has one reference for the use as "current".
- * If a task dies, then it sets TASK_ZOMBIE in tsk->state and calls
- * schedule one last time. The schedule call will never return,
+ * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
+ * calls schedule one last time. The schedule call will never return,
* and the scheduled task must drop that reference.
- * The test for TASK_ZOMBIE must occur while the runqueue locks are
+ * The test for EXIT_ZOMBIE must occur while the runqueue locks are
* still held, otherwise prev could be scheduled on another cpu, die
* there before we look at prev->state, and then the reference would
* be dropped twice.
@@ -1640,8 +1498,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu,
min_cpu = UINT_MAX;
min_load = ULONG_MAX;
- cpus_and(mask, sd->span, cpu_online_map);
- cpus_and(mask, mask, p->cpus_allowed);
+ cpus_and(mask, sd->span, p->cpus_allowed);
for_each_cpu_mask(i, mask) {
load = target_load(i);
@@ -1893,7 +1750,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
max_load = this_load = total_load = total_pwr = 0;
do {
- cpumask_t tmp;
unsigned long load;
int local_group;
int i, nr_cpus = 0;
@@ -1902,11 +1758,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
/* Tally up the load of all CPUs in the group */
avg_load = 0;
- cpus_and(tmp, group->cpumask, cpu_online_map);
- if (unlikely(cpus_empty(tmp)))
- goto nextgroup;
- for_each_cpu_mask(i, tmp) {
+ for_each_cpu_mask(i, group->cpumask) {
/* Bias balancing toward cpus of our domain */
if (local_group)
load = target_load(i);
@@ -2011,7 +1864,7 @@ nextgroup:
out_balanced:
if (busiest && (idle == NEWLY_IDLE ||
- (idle == IDLE && max_load > SCHED_LOAD_SCALE)) ) {
+ (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
*imbalance = 1;
return busiest;
}
@@ -2025,13 +1878,11 @@ out_balanced:
*/
static runqueue_t *find_busiest_queue(struct sched_group *group)
{
- cpumask_t tmp;
unsigned long load, max_load = 0;
runqueue_t *busiest = NULL;
int i;
- cpus_and(tmp, group->cpumask, cpu_online_map);
- for_each_cpu_mask(i, tmp) {
+ for_each_cpu_mask(i, group->cpumask) {
load = source_load(i);
if (load > max_load) {
@@ -2232,18 +2083,13 @@ static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
group = sd->groups;
do {
- cpumask_t tmp;
runqueue_t *rq;
int push_cpu = 0;
if (group == busy_group)
goto next_group;
- cpus_and(tmp, group->cpumask, cpu_online_map);
- if (!cpus_weight(tmp))
- goto next_group;
-
- for_each_cpu_mask(i, tmp) {
+ for_each_cpu_mask(i, group->cpumask) {
if (!idle_cpu(i))
goto next_group;
push_cpu = i;
@@ -2260,7 +2106,7 @@ static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
if (unlikely(busiest == rq))
goto next_group;
double_lock_balance(busiest, rq);
- if (move_tasks(rq, push_cpu, busiest, 1, sd, IDLE)) {
+ if (move_tasks(rq, push_cpu, busiest, 1, sd, SCHED_IDLE)) {
schedstat_inc(busiest, alb_lost);
schedstat_inc(rq, alb_gained);
} else {
@@ -2304,9 +2150,13 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
this_rq->cpu_load = (old_load + this_load) / 2;
for_each_domain(this_cpu, sd) {
- unsigned long interval = sd->balance_interval;
+ unsigned long interval;
- if (idle != IDLE)
+ if (!(sd->flags & SD_LOAD_BALANCE))
+ continue;
+
+ interval = sd->balance_interval;
+ if (idle != SCHED_IDLE)
interval *= sd->busy_factor;
/* scale ms to jiffies */
@@ -2408,7 +2258,7 @@ void scheduler_tick(int user_ticks, int sys_ticks)
cpustat->idle += sys_ticks;
if (wake_priority_sleeper(rq))
goto out;
- rebalance_tick(cpu, rq, IDLE);
+ rebalance_tick(cpu, rq, SCHED_IDLE);
return;
}
if (TASK_NICE(p) > 0)
@@ -2512,7 +2362,7 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
*/
spin_unlock(&this_rq->lock);
- cpus_and(sibling_map, sd->span, cpu_online_map);
+ sibling_map = sd->span;
for_each_cpu_mask(i, sibling_map)
spin_lock(&cpu_rq(i)->lock);
@@ -2557,7 +2407,7 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
* wake_sleeping_dependent():
*/
spin_unlock(&this_rq->lock);
- cpus_and(sibling_map, sd->span, cpu_online_map);
+ sibling_map = sd->span;
for_each_cpu_mask(i, sibling_map)
spin_lock(&cpu_rq(i)->lock);
cpu_clear(this_cpu, sibling_map);
@@ -2639,12 +2489,15 @@ asmlinkage void __sched schedule(void)
* schedule() atomically, we ignore that path for now.
* Otherwise, whine if we are scheduling when we should not be.
*/
- if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
+ if (likely(!(current->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)))) {
if (unlikely(in_atomic())) {
- printk(KERN_ERR "bad: scheduling while atomic!\n");
+ printk(KERN_ERR "scheduling while atomic: "
+ "%s/0x%08x/%d\n",
+ current->comm, preempt_count(), current->pid);
dump_stack();
}
}
+ profile_hit(SCHED_PROFILING, __builtin_return_address(0));
need_resched:
preempt_disable();
@@ -2678,6 +2531,8 @@ need_resched:
spin_lock_irq(&rq->lock);
+ if (unlikely(current->flags & PF_DEAD))
+ current->state = EXIT_DEAD;
/*
* if entering off of a kernel preemption go straight
* to picking the next task.
@@ -2764,7 +2619,7 @@ switch_tasks:
if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
prev->interactive_credit--;
}
- prev->timestamp = now;
+ prev->timestamp = prev->last_ran = now;
sched_info_switch(prev, next);
if (likely(prev != next)) {
@@ -3221,7 +3076,6 @@ static int setscheduler(pid_t pid, int policy, struct sched_param __user *param)
policy != SCHED_NORMAL)
goto out_unlock;
}
- profile_hit(SCHED_PROFILING, __builtin_return_address(0));
/*
* Valid priorities for SCHED_FIFO and SCHED_RR are
@@ -4068,7 +3922,7 @@ static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
struct runqueue *rq = cpu_rq(dead_cpu);
/* Must be exiting, otherwise would be on tasklist. */
- BUG_ON(tsk->state != TASK_ZOMBIE && tsk->state != TASK_DEAD);
+ BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD);
/* Cannot have done final schedule yet: would have vanished. */
BUG_ON(tsk->flags & PF_DEAD);
@@ -4209,16 +4063,17 @@ spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
EXPORT_SYMBOL(kernel_flag);
#ifdef CONFIG_SMP
-/* Attach the domain 'sd' to 'cpu' as its base domain */
-static void cpu_attach_domain(struct sched_domain *sd, int cpu)
+/*
+ * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
+ * hold the hotplug lock.
+ */
+void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
{
migration_req_t req;
unsigned long flags;
runqueue_t *rq = cpu_rq(cpu);
int local = 1;
- lock_cpu_hotplug();
-
spin_lock_irqsave(&rq->lock, flags);
if (cpu == smp_processor_id() || !cpu_online(cpu)) {
@@ -4237,128 +4092,10 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
wake_up_process(rq->migration_thread);
wait_for_completion(&req.done);
}
-
- unlock_cpu_hotplug();
}
-/*
- * To enable disjoint top-level NUMA domains, define SD_NODES_PER_DOMAIN
- * in arch code. That defines the number of nearby nodes in a node's top
- * level scheduling domain.
- */
-#if defined(CONFIG_NUMA) && defined(SD_NODES_PER_DOMAIN)
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain. Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int __init find_next_best_node(int node, unsigned long *used_nodes)
-{
- int i, n, val, min_val, best_node = 0;
-
- min_val = INT_MAX;
-
- for (i = 0; i < numnodes; i++) {
- /* Start at @node */
- n = (node + i) % numnodes;
-
- /* Skip already used nodes */
- if (test_bit(n, used_nodes))
- continue;
-
- /* Simple min distance search */
- val = node_distance(node, i);
-
- if (val < min_val) {
- min_val = val;
- best_node = n;
- }
- }
-
- set_bit(best_node, used_nodes);
- return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @size: number of nodes to include in this span
- *
- * Given a node, construct a good cpumask for its sched_domain to span. It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-cpumask_t __init sched_domain_node_span(int node)
-{
- int i;
- cpumask_t span;
- DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
-
- cpus_clear(span);
- bitmap_zero(used_nodes, MAX_NUMNODES);
-
- for (i = 0; i < SD_NODES_PER_DOMAIN; i++) {
- int next_node = find_next_best_node(node, used_nodes);
- cpumask_t nodemask;
-
- nodemask = node_to_cpumask(next_node);
- cpus_or(span, span, nodemask);
- }
-
- return span;
-}
-#else /* CONFIG_NUMA && SD_NODES_PER_DOMAIN */
-cpumask_t __init sched_domain_node_span(int node)
-{
- return cpu_possible_map;
-}
-#endif /* CONFIG_NUMA && SD_NODES_PER_DOMAIN */
-
-#ifdef CONFIG_SCHED_SMT
-static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-static struct sched_group sched_group_cpus[NR_CPUS];
-__init static int cpu_to_cpu_group(int cpu)
-{
- return cpu;
-}
-#endif
-
-static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static struct sched_group sched_group_phys[NR_CPUS];
-__init static int cpu_to_phys_group(int cpu)
-{
-#ifdef CONFIG_SCHED_SMT
- return first_cpu(cpu_sibling_map[cpu]);
-#else
- return cpu;
-#endif
-}
-
-#ifdef CONFIG_NUMA
-
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group sched_group_nodes[MAX_NUMNODES];
-__init static int cpu_to_node_group(int cpu)
-{
- return cpu_to_node(cpu);
-}
-#endif
-
-/* Groups for isolated scheduling domains */
-static struct sched_group sched_group_isolated[NR_CPUS];
-
/* cpus with isolated domains */
-cpumask_t __initdata cpu_isolated_map = CPU_MASK_NONE;
-
-__init static int cpu_to_isolated_group(int cpu)
-{
- return cpu;
-}
+cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
/* Setup the mask of cpus configured for isolated domains */
static int __init isolated_cpu_setup(char *str)
@@ -4385,7 +4122,7 @@ __setup ("isolcpus=", isolated_cpu_setup);
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_power to 0.
*/
-__init static void init_sched_build_groups(struct sched_group groups[],
+void __devinit init_sched_build_groups(struct sched_group groups[],
cpumask_t span, int (*group_fn)(int cpu))
{
struct sched_group *first = NULL, *last = NULL;
@@ -4419,56 +4156,99 @@ __init static void init_sched_build_groups(struct sched_group groups[],
last->next = first;
}
-__init static void arch_init_sched_domains(void)
+
+#ifdef ARCH_HAS_SCHED_DOMAIN
+extern void __devinit arch_init_sched_domains(void);
+extern void __devinit arch_destroy_sched_domains(void);
+#else
+#ifdef CONFIG_SCHED_SMT
+static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
+static struct sched_group sched_group_cpus[NR_CPUS];
+static int __devinit cpu_to_cpu_group(int cpu)
+{
+ return cpu;
+}
+#endif
+
+static DEFINE_PER_CPU(struct sched_domain, phys_domains);
+static struct sched_group sched_group_phys[NR_CPUS];
+static int __devinit cpu_to_phys_group(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+ return first_cpu(cpu_sibling_map[cpu]);
+#else
+ return cpu;
+#endif
+}
+
+#ifdef CONFIG_NUMA
+
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+static struct sched_group sched_group_nodes[MAX_NUMNODES];
+static int __devinit cpu_to_node_group(int cpu)
+{
+ return cpu_to_node(cpu);
+}
+#endif
+
+#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
+/*
+ * The domains setup code relies on siblings not spanning
+ * multiple nodes. Make sure the architecture has a proper
+ * siblings map:
+ */
+static void check_sibling_maps(void)
+{
+ int i, j;
+
+ for_each_online_cpu(i) {
+ for_each_cpu_mask(j, cpu_sibling_map[i]) {
+ if (cpu_to_node(i) != cpu_to_node(j)) {
+ printk(KERN_INFO "warning: CPU %d siblings map "
+ "to different node - isolating "
+ "them.\n", i);
+ cpu_sibling_map[i] = cpumask_of_cpu(i);
+ break;
+ }
+ }
+ }
+}
+#endif
+
+/*
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ */
+static void __devinit arch_init_sched_domains(void)
{
int i;
cpumask_t cpu_default_map;
+#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
+ check_sibling_maps();
+#endif
/*
* Setup mask for cpus without special case scheduling requirements.
* For now this just excludes isolated cpus, but could be used to
* exclude other special cases in the future.
*/
cpus_complement(cpu_default_map, cpu_isolated_map);
- cpus_and(cpu_default_map, cpu_default_map, cpu_possible_map);
+ cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
- /* Set up domains */
- for_each_cpu(i) {
+ /*
+ * Set up domains. Isolated domains just stay on the dummy domain.
+ */
+ for_each_cpu_mask(i, cpu_default_map) {
int group;
struct sched_domain *sd = NULL, *p;
cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
cpus_and(nodemask, nodemask, cpu_default_map);
- /*
- * Set up isolated domains.
- * Unlike those of other cpus, the domains and groups are
- * single level, and span a single cpu.
- */
- if (cpu_isset(i, cpu_isolated_map)) {
-#ifdef CONFIG_SCHED_SMT
- sd = &per_cpu(cpu_domains, i);
-#else
- sd = &per_cpu(phys_domains, i);
-#endif
- group = cpu_to_isolated_group(i);
- *sd = SD_CPU_INIT;
- cpu_set(i, sd->span);
- sd->balance_interval = INT_MAX; /* Don't balance */
- sd->flags = 0; /* Avoid WAKE_ */
- sd->groups = &sched_group_isolated[group];
- printk(KERN_INFO "Setting up cpu %d isolated.\n", i);
- /* Single level, so continue with next cpu */
- continue;
- }
-
#ifdef CONFIG_NUMA
sd = &per_cpu(node_domains, i);
group = cpu_to_node_group(i);
*sd = SD_NODE_INIT;
- /* FIXME: should be multilevel, in arch code */
- sd->span = sched_domain_node_span(i);
- cpus_and(sd->span, sd->span, cpu_default_map);
+ sd->span = cpu_default_map;
sd->groups = &sched_group_nodes[group];
#endif
@@ -4476,11 +4256,7 @@ __init static void arch_init_sched_domains(void)
sd = &per_cpu(phys_domains, i);
group = cpu_to_phys_group(i);
*sd = SD_CPU_INIT;
-#ifdef CONFIG_NUMA
sd->span = nodemask;
-#else
- sd->span = cpu_possible_map;
-#endif
sd->parent = p;
sd->groups = &sched_group_phys[group];
@@ -4498,7 +4274,7 @@ __init static void arch_init_sched_domains(void)
#ifdef CONFIG_SCHED_SMT
/* Set up CPU (sibling) groups */
- for_each_cpu(i) {
+ for_each_online_cpu(i) {
cpumask_t this_sibling_map = cpu_sibling_map[i];
cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
if (i != first_cpu(this_sibling_map))
@@ -4509,16 +4285,6 @@ __init static void arch_init_sched_domains(void)
}
#endif
- /* Set up isolated groups */
- for_each_cpu_mask(i, cpu_isolated_map) {
- cpumask_t mask;
- cpus_clear(mask);
- cpu_set(i, mask);
- init_sched_build_groups(sched_group_isolated, mask,
- &cpu_to_isolated_group);
- }
-
-#ifdef CONFIG_NUMA
/* Set up physical groups */
for (i = 0; i < MAX_NUMNODES; i++) {
cpumask_t nodemask = node_to_cpumask(i);
@@ -4530,10 +4296,6 @@ __init static void arch_init_sched_domains(void)
init_sched_build_groups(sched_group_phys, nodemask,
&cpu_to_phys_group);
}
-#else
- init_sched_build_groups(sched_group_phys, cpu_possible_map,
- &cpu_to_phys_group);
-#endif
#ifdef CONFIG_NUMA
/* Set up node groups */
@@ -4566,7 +4328,7 @@ __init static void arch_init_sched_domains(void)
}
/* Attach the domains */
- for_each_cpu(i) {
+ for_each_online_cpu(i) {
struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
sd = &per_cpu(cpu_domains, i);
@@ -4577,21 +4339,29 @@ __init static void arch_init_sched_domains(void)
}
}
-#undef SCHED_DOMAIN_DEBUG
+#ifdef CONFIG_HOTPLUG_CPU
+static void __devinit arch_destroy_sched_domains(void)
+{
+ /* Do nothing: everything is statically allocated. */
+}
+#endif
+
+#endif /* ARCH_HAS_SCHED_DOMAIN */
+
+#define SCHED_DOMAIN_DEBUG
#ifdef SCHED_DOMAIN_DEBUG
-void sched_domain_debug(void)
+static void sched_domain_debug(void)
{
int i;
- for_each_cpu(i) {
+ for_each_online_cpu(i) {
runqueue_t *rq = cpu_rq(i);
struct sched_domain *sd;
int level = 0;
sd = rq->sd;
- printk(KERN_DEBUG "CPU%d: %s\n",
- i, (cpu_online(i) ? " online" : "offline"));
+ printk(KERN_DEBUG "CPU%d:\n", i);
do {
int j;
@@ -4605,7 +4375,17 @@ void sched_domain_debug(void)
printk(KERN_DEBUG);
for (j = 0; j < level + 1; j++)
printk(" ");
- printk("domain %d: span %s\n", level, str);
+ printk("domain %d: ", level);
+
+ if (!(sd->flags & SD_LOAD_BALANCE)) {
+ printk("does not balance");
+ if (sd->parent)
+ printk(" ERROR !SD_LOAD_BALANCE domain has parent");
+ printk("\n");
+ break;
+ }
+
+ printk("span %s\n", str);
if (!cpu_isset(i, sd->span))
printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i);
@@ -4657,10 +4437,64 @@ void sched_domain_debug(void)
#define sched_domain_debug() {}
#endif
+#ifdef CONFIG_SMP
+/*
+ * Initial dummy domain for early boot and for hotplug cpu. Being static,
+ * it is initialized to zero, so all balancing flags are cleared which is
+ * what we want.
+ */
+static struct sched_domain sched_domain_dummy;
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Force a reinitialization of the sched domains hierarchy. The domains
+ * and groups cannot be updated in place without racing with the balancing
+ * code, so we temporarily attach all running cpus to a "dummy" domain
+ * which will prevent rebalancing while the sched domains are recalculated.
+ */
+static int update_sched_domains(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ int i;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_DOWN_PREPARE:
+ for_each_online_cpu(i)
+ cpu_attach_domain(&sched_domain_dummy, i);
+ arch_destroy_sched_domains();
+ return NOTIFY_OK;
+
+ case CPU_UP_CANCELED:
+ case CPU_DOWN_FAILED:
+ case CPU_ONLINE:
+ case CPU_DEAD:
+ /*
+ * Fall through and re-initialise the domains.
+ */
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+
+ /* The hotplug lock is already held by cpu_up/cpu_down */
+ arch_init_sched_domains();
+
+ sched_domain_debug();
+
+ return NOTIFY_OK;
+}
+#endif
+
void __init sched_init_smp(void)
{
+ lock_cpu_hotplug();
arch_init_sched_domains();
sched_domain_debug();
+ unlock_cpu_hotplug();
+ /* XXX: Theoretical race here - CPU may be hotplugged now */
+ hotcpu_notifier(update_sched_domains, 0);
}
#else
void __init sched_init_smp(void)
@@ -4682,24 +4516,6 @@ void __init sched_init(void)
runqueue_t *rq;
int i, j, k;
-#ifdef CONFIG_SMP
- /* Set up an initial dummy domain for early boot */
- static struct sched_domain sched_domain_init;
- static struct sched_group sched_group_init;
-
- memset(&sched_domain_init, 0, sizeof(struct sched_domain));
- sched_domain_init.span = CPU_MASK_ALL;
- sched_domain_init.groups = &sched_group_init;
- sched_domain_init.last_balance = jiffies;
- sched_domain_init.balance_interval = INT_MAX; /* Don't balance */
- sched_domain_init.busy_factor = 1;
-
- memset(&sched_group_init, 0, sizeof(struct sched_group));
- sched_group_init.cpumask = CPU_MASK_ALL;
- sched_group_init.next = &sched_group_init;
- sched_group_init.cpu_power = SCHED_LOAD_SCALE;
-#endif
-
for (i = 0; i < NR_CPUS; i++) {
prio_array_t *array;
@@ -4710,7 +4526,7 @@ void __init sched_init(void)
rq->best_expired_prio = MAX_PRIO;
#ifdef CONFIG_SMP
- rq->sd = &sched_domain_init;
+ rq->sd = &sched_domain_dummy;
rq->cpu_load = 0;
rq->active_balance = 0;
rq->push_cpu = 0;
diff --git a/kernel/signal.c b/kernel/signal.c
index df2cd4216838..f67390806d73 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -20,6 +20,7 @@
#include <linux/tty.h>
#include <linux/binfmts.h>
#include <linux/security.h>
+#include <linux/syscalls.h>
#include <linux/ptrace.h>
#include <asm/param.h>
#include <asm/uaccess.h>
@@ -269,7 +270,7 @@ static struct sigqueue *__sigqueue_alloc(void)
struct sigqueue *q = NULL;
if (atomic_read(&current->user->sigpending) <
- current->rlim[RLIMIT_SIGPENDING].rlim_cur)
+ current->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
q = kmem_cache_alloc(sigqueue_cachep, GFP_ATOMIC);
if (q) {
INIT_LIST_HEAD(&q->list);
@@ -764,7 +765,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
pass on the info struct. */
if (atomic_read(&t->user->sigpending) <
- t->rlim[RLIMIT_SIGPENDING].rlim_cur)
+ t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
q = kmem_cache_alloc(sigqueue_cachep, GFP_ATOMIC);
if (q) {
@@ -913,7 +914,7 @@ __group_complete_signal(int sig, struct task_struct *p)
* Don't bother zombies and stopped tasks (but
* SIGKILL will punch through stopped state)
*/
- mask = TASK_DEAD | TASK_ZOMBIE | TASK_TRACED;
+ mask = EXIT_DEAD | EXIT_ZOMBIE | TASK_TRACED;
if (sig != SIGKILL)
mask |= TASK_STOPPED;
@@ -1069,7 +1070,7 @@ void zap_other_threads(struct task_struct *p)
/*
* Don't bother with already dead threads
*/
- if (t->state & (TASK_ZOMBIE|TASK_DEAD))
+ if (t->exit_state & (EXIT_ZOMBIE|EXIT_DEAD))
continue;
/*
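The signal.c hunks above move the RLIMIT_SIGPENDING check (and the zombie test in __group_complete_signal) over to the per-process signal structure and the new exit_state field. Seen from userspace, the per-user pending limit being checked there bounds queued signals such as those sent with sigqueue(3); a small self-targeted example follows, plain POSIX and not part of the patch:

    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        union sigval val = { .sival_int = 42 };
        sigset_t set;

        /* Block SIGRTMIN so the queued signal stays pending. */
        sigemptyset(&set);
        sigaddset(&set, SIGRTMIN);
        sigprocmask(SIG_BLOCK, &set, NULL);

        /* Each successful sigqueue() consumes one pending-signal slot;
         * EAGAIN is what the caller sees once the limit is exhausted. */
        if (sigqueue(getpid(), SIGRTMIN, val) != 0) {
            perror("sigqueue");
            return 1;
        }
        printf("queued one realtime signal to self\n");
        return 0;
    }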
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 4a3da9be9f26..0e1e1356e35a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -137,11 +137,17 @@ EXPORT_SYMBOL(do_softirq);
void local_bh_enable(void)
{
- __local_bh_enable();
WARN_ON(irqs_disabled());
- if (unlikely(!in_interrupt() &&
- local_softirq_pending()))
+ /*
+ * Keep preemption disabled until we are done with
+ * softirq processing:
+ */
+ preempt_count() -= SOFTIRQ_OFFSET - 1;
+
+ if (unlikely(!in_interrupt() && local_softirq_pending()))
invoke_softirq();
+
+ dec_preempt_count();
preempt_check_resched();
}
EXPORT_SYMBOL(local_bh_enable);
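To make the arithmetic in the new local_bh_enable() explicit: subtracting SOFTIRQ_OFFSET - 1 drops the softirq-disable count but deliberately leaves one unit in the preempt count, so pending softirqs run with preemption still disabled, and the final dec_preempt_count() releases that last unit. A toy walk-through with an assumed SOFTIRQ_OFFSET of 0x100 (the value here is illustrative, not quoted from the headers):

    #include <stdio.h>

    #define SOFTIRQ_OFFSET 0x100u   /* assumed layout, for illustration only */

    int main(void)
    {
        unsigned int count = 0;

        count += SOFTIRQ_OFFSET;            /* local_bh_disable()            */
        count -= SOFTIRQ_OFFSET - 1;        /* start of local_bh_enable()    */
        printf("while softirqs run: count = %#x (preemption off)\n", count);

        count -= 1;                         /* dec_preempt_count()           */
        printf("after local_bh_enable: count = %#x\n", count);
        return 0;
    }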
diff --git a/kernel/sys.c b/kernel/sys.c
index 571bba8c8989..a95e3900dc1e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -24,6 +24,11 @@
#include <linux/dcookies.h>
#include <linux/suspend.h>
+/* Don't include this - it breaks ia64's cond_syscall() implementation */
+#if 0
+#include <linux/syscalls.h>
+#endif
+
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/unistd.h>
@@ -649,7 +654,7 @@ static int set_user(uid_t new_ruid, int dumpclear)
return -EAGAIN;
if (atomic_read(&new_user->processes) >=
- current->rlim[RLIMIT_NPROC].rlim_cur &&
+ current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
new_user != &root_user) {
free_uid(new_user);
return -EAGAIN;
@@ -1496,9 +1501,13 @@ asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim)
{
if (resource >= RLIM_NLIMITS)
return -EINVAL;
- else
- return copy_to_user(rlim, current->rlim + resource, sizeof(*rlim))
- ? -EFAULT : 0;
+ else {
+ struct rlimit value;
+ task_lock(current->group_leader);
+ value = current->signal->rlim[resource];
+ task_unlock(current->group_leader);
+ return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0;
+ }
}
#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
@@ -1513,7 +1522,9 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r
if (resource >= RLIM_NLIMITS)
return -EINVAL;
- memcpy(&x, current->rlim + resource, sizeof(*rlim));
+ task_lock(current->group_leader);
+ x = current->signal->rlim[resource];
+ task_unlock(current->group_leader);
if(x.rlim_cur > 0x7FFFFFFF)
x.rlim_cur = 0x7FFFFFFF;
if(x.rlim_max > 0x7FFFFFFF)
@@ -1534,21 +1545,20 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
return -EFAULT;
if (new_rlim.rlim_cur > new_rlim.rlim_max)
return -EINVAL;
- old_rlim = current->rlim + resource;
- if (((new_rlim.rlim_cur > old_rlim->rlim_max) ||
- (new_rlim.rlim_max > old_rlim->rlim_max)) &&
+ old_rlim = current->signal->rlim + resource;
+ if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (resource == RLIMIT_NOFILE) {
- if (new_rlim.rlim_cur > NR_OPEN || new_rlim.rlim_max > NR_OPEN)
+ if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN)
return -EPERM;
- }
retval = security_task_setrlimit(resource, &new_rlim);
if (retval)
return retval;
+ task_lock(current->group_leader);
*old_rlim = new_rlim;
+ task_unlock(current->group_leader);
return 0;
}
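The sys.c hunks above move the rlimit array behind current->signal and serialize readers and writers with task_lock() on the group leader; the user-visible contract stays as before: a process may raise its soft limit up to its hard limit without privilege, while raising the hard limit still requires CAP_SYS_RESOURCE. A plain POSIX illustration of that contract (not part of the patch):

    #include <stdio.h>
    #include <sys/resource.h>

    int main(void)
    {
        struct rlimit rl;

        if (getrlimit(RLIMIT_NOFILE, &rl) != 0) {
            perror("getrlimit");
            return 1;
        }
        printf("soft=%llu hard=%llu\n",
               (unsigned long long)rl.rlim_cur,
               (unsigned long long)rl.rlim_max);

        /* Raising the soft limit up to the hard limit needs no privilege. */
        rl.rlim_cur = rl.rlim_max;
        if (setrlimit(RLIMIT_NOFILE, &rl) != 0) {
            perror("setrlimit");
            return 1;
        }
        return 0;
    }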
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 99a0af0ed9a8..469cf0c2f26e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -40,6 +40,7 @@
#include <linux/times.h>
#include <linux/limits.h>
#include <linux/dcache.h>
+#include <linux/syscalls.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
diff --git a/kernel/time.c b/kernel/time.c
index 167a72a078ea..0b4797fa3d30 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -31,6 +31,7 @@
#include <linux/timex.h>
#include <linux/errno.h>
#include <linux/smp_lock.h>
+#include <linux/syscalls.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
diff --git a/kernel/timer.c b/kernel/timer.c
index 36e40d8b182d..e3c9b5fcd52f 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -31,11 +31,13 @@
#include <linux/time.h>
#include <linux/jiffies.h>
#include <linux/cpu.h>
+#include <linux/syscalls.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/div64.h>
#include <asm/timex.h>
+#include <asm/io.h>
#ifdef CONFIG_TIME_INTERPOLATION
static void time_interpolator_update(long delta_nsec);
@@ -788,13 +790,12 @@ static void update_wall_time(unsigned long ticks)
do {
ticks--;
update_wall_time_one_tick();
+ if (xtime.tv_nsec >= 1000000000) {
+ xtime.tv_nsec -= 1000000000;
+ xtime.tv_sec++;
+ second_overflow();
+ }
} while (ticks);
-
- if (xtime.tv_nsec >= 1000000000) {
- xtime.tv_nsec -= 1000000000;
- xtime.tv_sec++;
- second_overflow();
- }
}
static inline void do_process_times(struct task_struct *p,
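The update_wall_time() change above moves the nanosecond-overflow test inside the per-tick loop, so that when several lost ticks are replayed in a single call each crossed second boundary runs second_overflow() rather than only the last one. A small simulation of the same loop with illustrative constants (a 100 Hz tick is assumed here, not taken from the patch):

    #include <stdio.h>

    #define NSEC_PER_SEC 1000000000L
    #define TICK_NSEC      10000000L   /* assumed 100 Hz tick */

    int main(void)
    {
        long sec = 0, nsec = NSEC_PER_SEC - TICK_NSEC;  /* just below a boundary */
        int ticks = 150;                                /* ticks replayed at once */

        while (ticks--) {
            nsec += TICK_NSEC;              /* update_wall_time_one_tick()   */
            if (nsec >= NSEC_PER_SEC) {     /* per-tick check, as in the fix */
                nsec -= NSEC_PER_SEC;
                sec++;                      /* second_overflow() runs here   */
            }
        }
        printf("advanced %ld s, %ld ns carried over\n", sec, nsec);
        return 0;
    }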
@@ -804,12 +805,13 @@ static inline void do_process_times(struct task_struct *p,
psecs = (p->utime += user);
psecs += (p->stime += system);
- if (psecs / HZ >= p->rlim[RLIMIT_CPU].rlim_cur) {
+ if (p->signal && !unlikely(p->state & (EXIT_DEAD|EXIT_ZOMBIE)) &&
+ psecs / HZ >= p->signal->rlim[RLIMIT_CPU].rlim_cur) {
/* Send SIGXCPU every second.. */
if (!(psecs % HZ))
send_sig(SIGXCPU, p, 1);
/* and SIGKILL when we go over max.. */
- if (psecs / HZ >= p->rlim[RLIMIT_CPU].rlim_max)
+ if (psecs / HZ >= p->signal->rlim[RLIMIT_CPU].rlim_max)
send_sig(SIGKILL, p, 1);
}
}
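The do_process_times() hunk above now reads the CPU-time limits from signal->rlim and skips tasks already in EXIT_DEAD/EXIT_ZOMBIE, but the enforced behaviour stays as the comments describe: SIGXCPU once per second past the soft RLIMIT_CPU limit, SIGKILL past the hard limit. A small userspace probe of that behaviour (plain POSIX, not part of the patch):

    #include <signal.h>
    #include <stdio.h>
    #include <sys/resource.h>

    static volatile sig_atomic_t got_xcpu;

    static void on_xcpu(int sig)
    {
        (void)sig;
        got_xcpu = 1;
    }

    int main(void)
    {
        struct rlimit rl = { .rlim_cur = 1, .rlim_max = 3 };  /* CPU seconds */

        signal(SIGXCPU, on_xcpu);
        if (setrlimit(RLIMIT_CPU, &rl) != 0) {
            perror("setrlimit");
            return 1;
        }
        while (!got_xcpu)
            ;                    /* burn CPU until the soft limit triggers */
        printf("got SIGXCPU after ~1s of CPU time; SIGKILL would follow at 3s\n");
        return 0;
    }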
@@ -1624,13 +1626,13 @@ EXPORT_SYMBOL(msleep);
*/
unsigned long msleep_interruptible(unsigned int msecs)
{
- unsigned long timeout = msecs_to_jiffies(msecs);
+ unsigned long timeout = msecs_to_jiffies(msecs);
- while (timeout && !signal_pending(current)) {
- set_current_state(TASK_INTERRUPTIBLE);
- timeout = schedule_timeout(timeout);
- }
- return jiffies_to_msecs(timeout);
+ while (timeout && !signal_pending(current)) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ timeout = schedule_timeout(timeout);
+ }
+ return jiffies_to_msecs(timeout);
}
EXPORT_SYMBOL(msleep_interruptible);