Merge ia64 test tree back into release tree.

author: Tony Luck <tony.luck@intel.com> 2005-01-17 20:28:07 -0800
committer: Tony Luck <tony.luck@intel.com> 2005-01-17 20:28:07 -0800
commit: 9790ee6f6a30db6b9fd4f27e437b87fa8e511cdd (patch)
tree: 5283a41fba9fe6a85c0ed186ad0469dd0cc04323 /kernel
parent: bcb39c06eefc908121323c23ead2fdd246585ad3 (diff)
parent: fdefff6242ce95e350570a7f65e1ff49bc3a66b6 (diff)
42 files changed, 1529 insertions, 1032 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 4a3bd224a836..32e39accbb86 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -528,3 +528,36 @@ void acct_process(long exitcode)
 	do_acct_process(exitcode, file);
 	fput(file);
 }
+
+
+/*
+ * acct_update_integrals
+ *    -  update mm integral fields in task_struct
+ */
+void acct_update_integrals(void)
+{
+	struct task_struct *tsk = current;
+
+	if (likely(tsk->mm)) {
+		long delta = tsk->stime - tsk->acct_stimexpd;
+
+		if (delta == 0)
+			return;
+		tsk->acct_stimexpd = tsk->stime;
+		tsk->acct_rss_mem1 += delta * tsk->mm->rss;
+		tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
+	}
+}
+
+/*
+ * acct_clear_integrals
+ *    - clear the mm integral fields in task_struct
+ */
+void acct_clear_integrals(struct task_struct *tsk)
+{
+	if (tsk) {
+		tsk->acct_stimexpd = 0;
+		tsk->acct_rss_mem1 = 0;
+		tsk->acct_vm_mem1 = 0;
+	}
+}
diff --git a/kernel/audit.c b/kernel/audit.c
index d813b7aa4b4c..e21f947bacf2 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -98,8 +98,8 @@ static struct sock *audit_sock;
  * The second list is a list of pre-allocated audit buffers (if more
  * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
  * being placed on the freelist). */
-static spinlock_t  audit_txlist_lock = SPIN_LOCK_UNLOCKED;
-static spinlock_t  audit_freelist_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(audit_txlist_lock);
+static DEFINE_SPINLOCK(audit_freelist_lock);
 static int	   audit_freelist_count = 0;
 static LIST_HEAD(audit_txlist);
 static LIST_HEAD(audit_freelist);
@@ -169,7 +169,7 @@ static inline int audit_rate_check(void)
 {
 	static unsigned long	last_check = 0;
 	static int		messages   = 0;
-	static spinlock_t	lock	   = SPIN_LOCK_UNLOCKED;
+	static DEFINE_SPINLOCK(lock);
 	unsigned long		flags;
 	unsigned long		now;
 	unsigned long		elapsed;
@@ -199,7 +199,7 @@ static inline int audit_rate_check(void)
 void audit_log_lost(const char *message)
 {
 	static unsigned long	last_msg = 0;
-	static spinlock_t	lock     = SPIN_LOCK_UNLOCKED;
+	static DEFINE_SPINLOCK(lock);
 	unsigned long		flags;
 	unsigned long		now;
 	int			print;
@@ -419,7 +419,7 @@ static int audit_receive_skb(struct sk_buff *skb)
 		if (rlen > skb->len)
 			rlen = skb->len;
 		if ((err = audit_receive_msg(skb, nlh))) {
-			netlink_ack(skb, nlh, -err);
+			netlink_ack(skb, nlh, err);
 		} else if (nlh->nlmsg_flags & NLM_F_ACK)
 			netlink_ack(skb, nlh, 0);
 		skb_pull(skb, rlen);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 0962944e8357..aa617bbe49b0 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -591,7 +591,7 @@ static void audit_log_exit(struct audit_context *context)
 	if (context->personality != PER_LINUX)
 		audit_log_format(ab, " per=%lx", context->personality);
 	if (context->return_valid)
-		audit_log_format(ab, " exit=%u", context->return_code);
+		audit_log_format(ab, " exit=%d", context->return_code);
 	audit_log_format(ab,
 		  " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
 		  " pid=%d loginuid=%d uid=%d gid=%d"
diff --git a/kernel/capability.c b/kernel/capability.c
index 7800a5066c0f..b828d545a97b 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -23,7 +23,7 @@ EXPORT_SYMBOL(cap_bset);
  * This global lock protects task->cap_* for all tasks including current.
  * Locking rule: acquire this prior to tasklist_lock.
  */
-spinlock_t task_capability_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(task_capability_lock);
 
 /*
  * For sys_getproccap() and sys_setproccap(), any of the three
@@ -85,34 +85,60 @@ out:
  * cap_set_pg - set capabilities for all processes in a given process
  * group.  We call this holding task_capability_lock and tasklist_lock.
  */
-static inline void cap_set_pg(int pgrp, kernel_cap_t *effective,
+static inline int cap_set_pg(int pgrp, kernel_cap_t *effective,
 			      kernel_cap_t *inheritable,
 			      kernel_cap_t *permitted)
 {
 	task_t *g, *target;
+	int ret = -EPERM;
+	int found = 0;
 
 	do_each_task_pid(pgrp, PIDTYPE_PGID, g) {
 		target = g;
-		while_each_thread(g, target)
-			security_capset_set(target, effective, inheritable, permitted);
+		while_each_thread(g, target) {
+			if (!security_capset_check(target, effective,
+							inheritable,
+							permitted)) {
+				security_capset_set(target, effective,
+							inheritable,
+							permitted);
+				ret = 0;
+			}
+			found = 1;
+		}
 	} while_each_task_pid(pgrp, PIDTYPE_PGID, g);
+
+	if (!found)
+	     ret = 0;
+	return ret;
 }
 
 /*
  * cap_set_all - set capabilities for all processes other than init
  * and self.  We call this holding task_capability_lock and tasklist_lock.
  */
-static inline void cap_set_all(kernel_cap_t *effective,
+static inline int cap_set_all(kernel_cap_t *effective,
 			       kernel_cap_t *inheritable,
 			       kernel_cap_t *permitted)
 {
      task_t *g, *target;
+     int ret = -EPERM;
+     int found = 0;
 
      do_each_thread(g, target) {
              if (target == current || target->pid == 1)
                      continue;
+             found = 1;
+	     if (security_capset_check(target, effective, inheritable,
+						permitted))
+		     continue;
+	     ret = 0;
 	     security_capset_set(target, effective, inheritable, permitted);
      } while_each_thread(g, target);
+
+     if (!found)
+	     ret = 0;
+     return ret;
 }
 
 /*
@@ -147,7 +173,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
      if (get_user(pid, &header->pid))
 	     return -EFAULT; 
 
-     if (pid && !capable(CAP_SETPCAP))
+     if (pid && pid != current->pid && !capable(CAP_SETPCAP))
              return -EPERM;
 
      if (copy_from_user(&effective, &data->effective, sizeof(effective)) ||
@@ -167,36 +193,23 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
      } else
                target = current;
 
-     ret = -EPERM;
-
-     if (security_capset_check(target, &effective, &inheritable, &permitted))
-	     goto out;
-
-     if (!cap_issubset(inheritable, cap_combine(target->cap_inheritable,
-                       current->cap_permitted)))
-             goto out;
-
-     /* verify restrictions on target's new Permitted set */
-     if (!cap_issubset(permitted, cap_combine(target->cap_permitted,
-                       current->cap_permitted)))
-             goto out;
-
-     /* verify the _new_Effective_ is a subset of the _new_Permitted_ */
-     if (!cap_issubset(effective, permitted))
-             goto out;
-
      ret = 0;
 
      /* having verified that the proposed changes are legal,
            we now put them into effect. */
      if (pid < 0) {
              if (pid == -1)  /* all procs other than current and init */
-                     cap_set_all(&effective, &inheritable, &permitted);
+                     ret = cap_set_all(&effective, &inheritable, &permitted);
 
              else            /* all procs in process group */
-                     cap_set_pg(-pid, &effective, &inheritable, &permitted);
+                     ret = cap_set_pg(-pid, &effective, &inheritable,
+		     					&permitted);
      } else {
-	     security_capset_set(target, &effective, &inheritable, &permitted);
+	     ret = security_capset_check(target, &effective, &inheritable,
+	     						&permitted);
+	     if (!ret)
+		     security_capset_set(target, &effective, &inheritable,
+		     					&permitted);
      }
 
 out:
diff --git a/kernel/compat.c b/kernel/compat.c
index 672310635347..f14fbde52bb3 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -20,6 +20,7 @@
 #include <linux/futex.h>	/* for FUTEX_WAIT */
 #include <linux/syscalls.h>
 #include <linux/unistd.h>
+#include <linux/security.h>
 
 #include <asm/uaccess.h>
 
@@ -162,15 +163,15 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
 		struct compat_tms tmp;
 		struct task_struct *tsk = current;
 		struct task_struct *t;
-		unsigned long utime, stime, cutime, cstime;
+		cputime_t utime, stime, cutime, cstime;
 
 		read_lock(&tasklist_lock);
 		utime = tsk->signal->utime;
 		stime = tsk->signal->stime;
 		t = tsk;
 		do {
-			utime += t->utime;
-			stime += t->stime;
+			utime = cputime_add(utime, t->utime);
+			stime = cputime_add(stime, t->stime);
 			t = next_thread(t);
 		} while (t != tsk);
 
@@ -189,10 +190,10 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
 		spin_unlock_irq(&tsk->sighand->siglock);
 		read_unlock(&tasklist_lock);
 
-		tmp.tms_utime = compat_jiffies_to_clock_t(utime);
-		tmp.tms_stime = compat_jiffies_to_clock_t(stime);
-		tmp.tms_cutime = compat_jiffies_to_clock_t(cutime);
-		tmp.tms_cstime = compat_jiffies_to_clock_t(cstime);
+		tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime));
+		tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime));
+		tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime));
+		tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime));
 		if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
 			return -EFAULT;
 	}
@@ -680,3 +681,128 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
 
 	return 0;
 }
+
+void
+sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
+{
+	switch (_NSIG_WORDS) {
+#if defined (__COMPAT_ENDIAN_SWAP__)
+	case 4: set->sig[3] = compat->sig[7] | (((long)compat->sig[6]) << 32 );
+	case 3: set->sig[2] = compat->sig[5] | (((long)compat->sig[4]) << 32 );
+	case 2: set->sig[1] = compat->sig[3] | (((long)compat->sig[2]) << 32 );
+	case 1: set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 );
+#else
+	case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
+	case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 );
+	case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 );
+	case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
+#endif
+	}
+}
+
+asmlinkage long
+compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
+		struct compat_siginfo __user *uinfo,
+		struct compat_timespec __user *uts, compat_size_t sigsetsize)
+{
+	compat_sigset_t s32;
+	sigset_t s;
+	int sig;
+	struct timespec t;
+	siginfo_t info;
+	long ret, timeout = 0;
+
+	if (sigsetsize != sizeof(sigset_t))
+		return -EINVAL;
+
+	if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
+		return -EFAULT;
+	sigset_from_compat(&s, &s32);
+	sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP));
+	signotset(&s);
+
+	if (uts) {
+		if (get_compat_timespec (&t, uts))
+			return -EFAULT;
+		if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0
+				|| t.tv_sec < 0)
+			return -EINVAL;
+	}
+
+	spin_lock_irq(&current->sighand->siglock);
+	sig = dequeue_signal(current, &s, &info);
+	if (!sig) {
+		timeout = MAX_SCHEDULE_TIMEOUT;
+		if (uts)
+			timeout = timespec_to_jiffies(&t)
+				+(t.tv_sec || t.tv_nsec);
+		if (timeout) {
+			current->real_blocked = current->blocked;
+			sigandsets(&current->blocked, &current->blocked, &s);
+
+			recalc_sigpending();
+			spin_unlock_irq(&current->sighand->siglock);
+
+			current->state = TASK_INTERRUPTIBLE;
+			timeout = schedule_timeout(timeout);
+
+			spin_lock_irq(&current->sighand->siglock);
+			sig = dequeue_signal(current, &s, &info);
+			current->blocked = current->real_blocked;
+			siginitset(&current->real_blocked, 0);
+			recalc_sigpending();
+		}
+	}
+	spin_unlock_irq(&current->sighand->siglock);
+
+	if (sig) {
+		ret = sig;
+		if (uinfo) {
+			if (copy_siginfo_to_user32(uinfo, &info))
+				ret = -EFAULT;
+		}
+	}else {
+		ret = timeout?-EINTR:-EAGAIN;
+	}
+	return ret;
+
+}
+
+#ifdef __ARCH_WANT_COMPAT_SYS_TIME
+
+/* compat_time_t is a 32 bit "long" and needs to get converted. */
+
+asmlinkage long compat_sys_time(compat_time_t __user * tloc)
+{
+	compat_time_t i;
+	struct timeval tv;
+
+	do_gettimeofday(&tv);
+	i = tv.tv_sec;
+
+	if (tloc) {
+		if (put_user(i,tloc))
+			i = -EFAULT;
+	}
+	return i;
+}
+
+asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
+{
+	struct timespec tv;
+	int err;
+
+	if (get_user(tv.tv_sec, tptr))
+		return -EFAULT;
+
+	tv.tv_nsec = 0;
+
+	err = security_settime(&tv, NULL);
+	if (err)
+		return err;
+
+	do_settimeofday(&tv);
+	return 0;
+}
+
+#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index ebaba873ebad..628f4ccda127 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -48,7 +48,9 @@ static inline void check_for_tasks(int cpu)
 
 	write_lock_irq(&tasklist_lock);
 	for_each_process(p) {
-		if (task_cpu(p) == cpu && (p->utime != 0 || p->stime != 0))
+		if (task_cpu(p) == cpu &&
+		    (!cputime_eq(p->utime, cputime_zero) ||
+		     !cputime_eq(p->stime, cputime_zero)))
 			printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
 				(state = %ld, flags = %lx) \n",
 				 p->comm, p->pid, cpu, p->state, p->flags);
@@ -132,7 +134,8 @@ int cpu_down(unsigned int cpu)
 	__cpu_die(cpu);
 
 	/* Move it here so it can run. */
-	kthread_bind(p, smp_processor_id());
+	kthread_bind(p, get_cpu());
+	put_cpu();
 
 	/* CPU is completely dead: tell everyone.  Too late to complain. */
 	if (notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu)
diff --git a/kernel/dma.c b/kernel/dma.c
index 940d02c50879..aef0a45b7893 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -38,7 +38,7 @@
  */
 
 
-spinlock_t dma_spin_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(dma_spin_lock);
 
 /*
  *	If our port doesn't define this it has no PC like DMA
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index ad3e5d54e119..867d6dbeb574 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -22,7 +22,7 @@
 static void default_handler(int, struct pt_regs *);
 
 static struct exec_domain *exec_domains = &default_exec_domain;
-static rwlock_t exec_domains_lock = RW_LOCK_UNLOCKED;
+static DEFINE_RWLOCK(exec_domains_lock);
 
 
 static u_long ident_map[32] = {
diff --git a/kernel/exit.c b/kernel/exit.c
index 64bc9502cd1c..3171228f25c3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -159,7 +159,7 @@ static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task)
 
 	do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
 		if (p == ignored_task
-				|| p->exit_state >= EXIT_ZOMBIE
+				|| p->exit_state
 				|| p->real_parent->pid == 1)
 			continue;
 		if (process_group(p->real_parent) != pgrp
@@ -332,7 +332,9 @@ void daemonize(const char *name, ...)
 	exit_mm(current);
 
 	set_special_pids(1, 1);
+	down(&tty_sem);
 	current->signal->tty = NULL;
+	up(&tty_sem);
 
 	/* Block and flush all signals */
 	sigfillset(&blocked);
@@ -470,7 +472,7 @@ EXPORT_SYMBOL_GPL(exit_fs);
  * Turn us into a lazy TLB process if we
  * aren't already..
  */
-static inline void __exit_mm(struct task_struct * tsk)
+void exit_mm(struct task_struct * tsk)
 {
 	struct mm_struct *mm = tsk->mm;
 
@@ -506,18 +508,13 @@ static inline void __exit_mm(struct task_struct * tsk)
 	mmput(mm);
 }
 
-void exit_mm(struct task_struct *tsk)
-{
-	__exit_mm(tsk);
-}
-
 static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper)
 {
 	/*
 	 * Make sure we're not reparenting to ourselves and that
 	 * the parent is not a zombie.
 	 */
-	BUG_ON(p == reaper || reaper->state >= EXIT_ZOMBIE || reaper->exit_state >= EXIT_ZOMBIE);
+	BUG_ON(p == reaper || reaper->exit_state >= EXIT_ZOMBIE);
 	p->real_parent = reaper;
 	if (p->parent == p->real_parent)
 		BUG();
@@ -560,7 +557,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced)
 			 * a normal stop since it's no longer being
 			 * traced.
 			 */
-			p->state = TASK_STOPPED;
+			ptrace_untrace(p);
 		}
 	}
 
@@ -599,7 +596,7 @@ static inline void forget_original_parent(struct task_struct * father,
 			reaper = child_reaper;
 			break;
 		}
-	} while (reaper->exit_state >= EXIT_ZOMBIE);
+	} while (reaper->exit_state);
 
 	/*
 	 * There are only two places where our children can be:
@@ -656,7 +653,7 @@ static void exit_notify(struct task_struct *tsk)
 	struct task_struct *t;
 	struct list_head ptrace_dead, *_p, *_n;
 
-	if (signal_pending(tsk) && !tsk->signal->group_exit
+	if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT)
 	    && !thread_group_empty(tsk)) {
 		/*
 		 * This occurs when there was a race between our exit
@@ -750,7 +747,9 @@ static void exit_notify(struct task_struct *tsk)
 	}
 
 	state = EXIT_ZOMBIE;
-	if (tsk->exit_signal == -1 && tsk->ptrace == 0)
+	if (tsk->exit_signal == -1 &&
+	    (likely(tsk->ptrace == 0) ||
+	     unlikely(tsk->parent->signal->flags & SIGNAL_GROUP_EXIT)))
 		state = EXIT_DEAD;
 	tsk->exit_state = state;
 
@@ -758,8 +757,8 @@ static void exit_notify(struct task_struct *tsk)
 	 * Clear these here so that update_process_times() won't try to deliver
 	 * itimer, profile or rlimit signals to this task while it is in late exit.
 	 */
-	tsk->it_virt_value = 0;
-	tsk->it_prof_value = 0;
+	tsk->it_virt_value = cputime_zero;
+	tsk->it_prof_value = cputime_zero;
 
 	write_unlock_irq(&tasklist_lock);
 
@@ -793,6 +792,12 @@ fastcall NORET_TYPE void do_exit(long code)
 		panic("Attempted to kill init!");
 	if (tsk->io_context)
 		exit_io_context();
+
+	if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
+		current->ptrace_message = code;
+		ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
+	}
+
 	tsk->flags |= PF_EXITING;
 	del_timer_sync(&tsk->real_timer);
 
@@ -801,15 +806,12 @@ fastcall NORET_TYPE void do_exit(long code)
 				current->comm, current->pid,
 				preempt_count());
 
-	if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
-		current->ptrace_message = code;
-		ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
-	}
-
+	acct_update_integrals();
+	update_mem_hiwater();
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead)
 		acct_process(code);
-	__exit_mm(tsk);
+	exit_mm(tsk);
 
 	exit_sem(tsk);
 	__exit_files(tsk);
@@ -877,18 +879,18 @@ do_group_exit(int exit_code)
 {
 	BUG_ON(exit_code & 0x80); /* core dumps don't get here */
 
-	if (current->signal->group_exit)
+	if (current->signal->flags & SIGNAL_GROUP_EXIT)
 		exit_code = current->signal->group_exit_code;
 	else if (!thread_group_empty(current)) {
 		struct signal_struct *const sig = current->signal;
 		struct sighand_struct *const sighand = current->sighand;
 		read_lock(&tasklist_lock);
 		spin_lock_irq(&sighand->siglock);
-		if (sig->group_exit)
+		if (sig->flags & SIGNAL_GROUP_EXIT)
 			/* Another thread got here before we took the lock.  */
 			exit_code = sig->group_exit_code;
 		else {
-			sig->group_exit = 1;
+			sig->flags = SIGNAL_GROUP_EXIT;
 			sig->group_exit_code = exit_code;
 			zap_other_threads(current);
 		}
@@ -1046,10 +1048,16 @@ static int wait_task_zombie(task_t *p, int noreap,
 		 * here reaping other children at the same time.
 		 */
 		spin_lock_irq(&p->parent->sighand->siglock);
-		p->parent->signal->cutime +=
-			p->utime + p->signal->utime + p->signal->cutime;
-		p->parent->signal->cstime +=
-			p->stime + p->signal->stime + p->signal->cstime;
+		p->parent->signal->cutime =
+			cputime_add(p->parent->signal->cutime,
+			cputime_add(p->utime,
+			cputime_add(p->signal->utime,
+				    p->signal->cutime)));
+		p->parent->signal->cstime =
+			cputime_add(p->parent->signal->cstime,
+			cputime_add(p->stime,
+			cputime_add(p->signal->stime,
+				    p->signal->cstime)));
 		p->parent->signal->cmin_flt +=
 			p->min_flt + p->signal->min_flt + p->signal->cmin_flt;
 		p->parent->signal->cmaj_flt +=
@@ -1068,7 +1076,7 @@ static int wait_task_zombie(task_t *p, int noreap,
 	read_unlock(&tasklist_lock);
 
 	retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
-	status = p->signal->group_exit
+	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
 		? p->signal->group_exit_code : p->exit_code;
 	if (!retval && stat_addr)
 		retval = put_user(status, stat_addr);
@@ -1180,7 +1188,7 @@ static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap,
 	 * race with the EXIT_ZOMBIE case.
 	 */
 	exit_code = xchg(&p->exit_code, 0);
-	if (unlikely(p->exit_state >= EXIT_ZOMBIE)) {
+	if (unlikely(p->exit_state)) {
 		/*
 		 * The task resumed and then died.  Let the next iteration
 		 * catch it in EXIT_ZOMBIE.  Note that exit_code might
@@ -1258,16 +1266,17 @@ static int wait_task_continued(task_t *p, int noreap,
 	if (unlikely(!p->signal))
 		return 0;
 
-	if (p->signal->stop_state >= 0)
+	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
 		return 0;
 
 	spin_lock_irq(&p->sighand->siglock);
-	if (p->signal->stop_state >= 0) { /* Re-check with the lock held.  */
+	/* Re-check with the lock held.  */
+	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
 		spin_unlock_irq(&p->sighand->siglock);
 		return 0;
 	}
 	if (!noreap)
-		p->signal->stop_state = 0;
+		p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
 	spin_unlock_irq(&p->sighand->siglock);
 
 	pid = p->pid;
@@ -1316,7 +1325,7 @@ static long do_wait(pid_t pid, int options, struct siginfo __user *infop,
 	struct task_struct *tsk;
 	int flag, retval;
 
-	add_wait_queue(&current->wait_chldexit,&wait);
+	add_wait_queue(&current->signal->wait_chldexit,&wait);
 repeat:
 	/*
 	 * We will set this flag if we see any child that might later
@@ -1430,7 +1439,7 @@ check_continued:
 	retval = -ECHILD;
 end:
 	current->state = TASK_RUNNING;
-	remove_wait_queue(&current->wait_chldexit,&wait);
+	remove_wait_queue(&current->signal->wait_chldexit,&wait);
 	if (infop) {
 		if (retval > 0)
 		retval = 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index 84252e055db4..be1ff8ddbb9c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -39,6 +39,7 @@
 #include <linux/audit.h>
 #include <linux/profile.h>
 #include <linux/rmap.h>
+#include <linux/acct.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -47,17 +48,17 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
-/* The idle threads do not count..
- * Protected by write_lock_irq(&tasklist_lock)
+/*
+ * Protected counters by write_lock_irq(&tasklist_lock)
  */
-int nr_threads;
-
-int max_threads;
 unsigned long total_forks;	/* Handle normal Linux uptimes. */
+int nr_threads; 		/* The idle threads do not count.. */
+
+int max_threads;		/* tunable limit on nr_threads */
 
 DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 
-rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;  /* outer */
+ __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
 
 EXPORT_SYMBOL(tasklist_lock);
 
@@ -218,6 +219,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
       
 			/* insert tmp into the share list, just after mpnt */
 			spin_lock(&file->f_mapping->i_mmap_lock);
+			tmp->vm_truncate_count = mpnt->vm_truncate_count;
 			flush_dcache_mmap_lock(file->f_mapping);
 			vma_prio_tree_add(tmp, mpnt);
 			flush_dcache_mmap_unlock(file->f_mapping);
@@ -279,7 +281,7 @@ static inline void mm_free_pgd(struct mm_struct * mm)
 #define mm_free_pgd(mm)
 #endif /* CONFIG_MMU */
 
-spinlock_t mmlist_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
+ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
 
 #define allocate_mm()	(kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
 #define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
@@ -469,6 +471,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 	if (retval)
 		goto free_pt;
 
+	mm->hiwater_rss = mm->rss;
+	mm->hiwater_vm = mm->total_vm;
+
 good_mm:
 	tsk->mm = mm;
 	tsk->active_mm = mm;
@@ -729,11 +734,11 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 		return -ENOMEM;
 	atomic_set(&sig->count, 1);
 	atomic_set(&sig->live, 1);
-	sig->group_exit = 0;
+	init_waitqueue_head(&sig->wait_chldexit);
+	sig->flags = 0;
 	sig->group_exit_code = 0;
 	sig->group_exit_task = NULL;
 	sig->group_stop_count = 0;
-	sig->stop_state = 0;
 	sig->curr_target = NULL;
 	init_sigpending(&sig->shared_pending);
 	INIT_LIST_HEAD(&sig->posix_timers);
@@ -744,7 +749,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	sig->leader = 0;	/* session leadership doesn't inherit */
 	sig->tty_old_pgrp = 0;
 
-	sig->utime = sig->stime = sig->cutime = sig->cstime = 0;
+	sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
 	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
 	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
 
@@ -857,7 +862,6 @@ static task_t *copy_process(unsigned long clone_flags,
 
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
-	init_waitqueue_head(&p->wait_chldexit);
 	p->vfork_done = NULL;
 	spin_lock_init(&p->alloc_lock);
 	spin_lock_init(&p->proc_lock);
@@ -865,12 +869,23 @@ static task_t *copy_process(unsigned long clone_flags,
 	clear_tsk_thread_flag(p, TIF_SIGPENDING);
 	init_sigpending(&p->pending);
 
-	p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
-	p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
+	p->it_real_value = 0;
+	p->it_real_incr = 0;
+	p->it_virt_value = cputime_zero;
+	p->it_virt_incr = cputime_zero;
+	p->it_prof_value = cputime_zero;
+	p->it_prof_incr = cputime_zero;
 	init_timer(&p->real_timer);
 	p->real_timer.data = (unsigned long) p;
 
-	p->utime = p->stime = 0;
+	p->utime = cputime_zero;
+	p->stime = cputime_zero;
+	p->rchar = 0;		/* I/O counter: bytes read */
+	p->wchar = 0;		/* I/O counter: bytes written */
+	p->syscr = 0;		/* I/O counter: read syscalls */
+	p->syscw = 0;		/* I/O counter: write syscalls */
+	acct_clear_integrals(p);
+
 	p->lock_depth = -1;		/* -1 = no lock */
 	do_posix_clock_monotonic_gettime(&p->start_time);
 	p->security = NULL;
@@ -985,7 +1000,7 @@ static task_t *copy_process(unsigned long clone_flags,
 		 * do not create this new thread - the whole thread
 		 * group is supposed to exit anyway.
 		 */
-		if (current->signal->group_exit) {
+		if (current->signal->flags & SIGNAL_GROUP_EXIT) {
 			spin_unlock(&current->sighand->siglock);
 			write_unlock_irq(&tasklist_lock);
 			retval = -EAGAIN;
@@ -1020,6 +1035,7 @@ static task_t *copy_process(unsigned long clone_flags,
 	}
 
 	nr_threads++;
+	total_forks++;
 	write_unlock_irq(&tasklist_lock);
 	retval = 0;
 
@@ -1152,7 +1168,6 @@ long do_fork(unsigned long clone_flags,
 			wake_up_new_task(p, clone_flags);
 		else
 			p->state = TASK_STOPPED;
-		++total_forks;
 
 		if (unlikely (trace)) {
 			current->ptrace_message = pid;
diff --git a/kernel/intermodule.c b/kernel/intermodule.c
index 09f556507f57..9078649382cf 100644
--- a/kernel/intermodule.c
+++ b/kernel/intermodule.c
@@ -14,7 +14,7 @@
  */
 
 static struct list_head ime_list = LIST_HEAD_INIT(ime_list);
-static spinlock_t ime_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(ime_lock);
 static int kmalloc_failed;
 
 struct inter_module_entry {
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 16818726cd21..98d62d8efeaf 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -137,6 +137,7 @@ unsigned int probe_irq_mask(unsigned long val)
 
 	return mask & val;
 }
+EXPORT_SYMBOL(probe_irq_mask);
 
 /**
  *	probe_irq_off	- end an interrupt autodetect
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index ebc25823b73d..2fb0e46e11f3 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -73,17 +73,6 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs)
 }
 
 /*
- * Exit an interrupt context. Process softirqs if needed and possible:
- */
-void irq_exit(void)
-{
-	preempt_count() -= IRQ_EXIT_OFFSET;
-	if (!in_interrupt() && local_softirq_pending())
-		do_softirq();
-	preempt_enable_no_resched();
-}
-
-/*
  * Have got an event to handle:
  */
 fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 95fbf1c6becf..e1743c563206 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -16,11 +16,10 @@
 
 int do_getitimer(int which, struct itimerval *value)
 {
-	register unsigned long val, interval;
+	register unsigned long val;
 
 	switch (which) {
 	case ITIMER_REAL:
-		interval = current->it_real_incr;
 		val = 0;
 		/* 
 		 * FIXME! This needs to be atomic, in case the kernel timer happens!
@@ -32,20 +31,20 @@ int do_getitimer(int which, struct itimerval *value)
 			if ((long) val <= 0)
 				val = 1;
 		}
+		jiffies_to_timeval(val, &value->it_value);
+		jiffies_to_timeval(current->it_real_incr, &value->it_interval);
 		break;
 	case ITIMER_VIRTUAL:
-		val = current->it_virt_value;
-		interval = current->it_virt_incr;
+		cputime_to_timeval(current->it_virt_value, &value->it_value);
+		cputime_to_timeval(current->it_virt_incr, &value->it_interval);
 		break;
 	case ITIMER_PROF:
-		val = current->it_prof_value;
-		interval = current->it_prof_incr;
+		cputime_to_timeval(current->it_prof_value, &value->it_value);
+		cputime_to_timeval(current->it_prof_incr, &value->it_interval);
 		break;
 	default:
 		return(-EINVAL);
 	}
-	jiffies_to_timeval(val, &value->it_value);
-	jiffies_to_timeval(interval, &value->it_interval);
 	return 0;
 }
 
@@ -81,37 +80,43 @@ void it_real_fn(unsigned long __data)
 
 int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
 {
-	register unsigned long i, j;
+	unsigned long expire;
+	cputime_t cputime;
 	int k;
 
-	i = timeval_to_jiffies(&value->it_interval);
-	j = timeval_to_jiffies(&value->it_value);
 	if (ovalue && (k = do_getitimer(which, ovalue)) < 0)
 		return k;
 	switch (which) {
 		case ITIMER_REAL:
 			del_timer_sync(&current->real_timer);
-			current->it_real_value = j;
-			current->it_real_incr = i;
-			if (!j)
+			expire = timeval_to_jiffies(&value->it_value);
+			current->it_real_value = expire;
+			current->it_real_incr =
+				timeval_to_jiffies(&value->it_interval);
+			if (!expire)
 				break;
-			if (j > (unsigned long) LONG_MAX)
-				j = LONG_MAX;
-			i = j + jiffies;
-			current->real_timer.expires = i;
+			if (expire > (unsigned long) LONG_MAX)
+				expire = LONG_MAX;
+			current->real_timer.expires = jiffies + expire;
 			add_timer(&current->real_timer);
 			break;
 		case ITIMER_VIRTUAL:
-			if (j)
-				j++;
-			current->it_virt_value = j;
-			current->it_virt_incr = i;
+			cputime = timeval_to_cputime(&value->it_value);
+			if (cputime_gt(cputime, cputime_zero))
+				cputime = cputime_add(cputime,
+						      jiffies_to_cputime(1));
+			current->it_virt_value = cputime;
+			cputime = timeval_to_cputime(&value->it_interval);
+			current->it_virt_incr = cputime;
 			break;
 		case ITIMER_PROF:
-			if (j)
-				j++;
-			current->it_prof_value = j;
-			current->it_prof_incr = i;
+			cputime = timeval_to_cputime(&value->it_value);
+			if (cputime_gt(cputime, cputime_zero))
+				cputime = cputime_add(cputime,
+						      jiffies_to_cputime(1));
+			current->it_prof_value = cputime;
+			cputime = timeval_to_cputime(&value->it_interval);
+			current->it_prof_incr = cputime;
 			break;
 		default:
 			return -EINVAL;
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 8db13f565ed9..315751c2b09b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -20,6 +20,8 @@
 #include <linux/proc_fs.h>
 #include <linux/mm.h>
 
+#include <asm/sections.h>
+
 #ifdef CONFIG_KALLSYMS_ALL
 #define all_var 1
 #else
@@ -28,7 +30,7 @@
 
 /* These will be re-linked against their real values during the second link stage */
 extern unsigned long kallsyms_addresses[] __attribute__((weak));
-extern unsigned long kallsyms_num_syms __attribute__((weak));
+extern unsigned long kallsyms_num_syms __attribute__((weak,section("data")));
 extern u8 kallsyms_names[] __attribute__((weak));
 
 extern u8 kallsyms_token_table[] __attribute__((weak));
@@ -36,9 +38,6 @@ extern u16 kallsyms_token_index[] __attribute__((weak));
 
 extern unsigned long kallsyms_markers[] __attribute__((weak));
 
-/* Defined by the linker script. */
-extern char _stext[], _etext[], _sinittext[], _einittext[], _end[];
-
 static inline int is_kernel_inittext(unsigned long addr)
 {
 	if (addr >= (unsigned long)_sinittext
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d3d1321b0e5c..cc6f72585f1e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -43,7 +43,7 @@
 static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
 
 unsigned int kprobe_cpu = NR_CPUS;
-static spinlock_t kprobe_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(kprobe_lock);
 
 /* Locks kprobe: irqs must be disabled */
 void lock_kprobes(void)
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 31f1a60df733..1f064a63f8cf 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -30,7 +30,8 @@ static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page)
 KERNEL_ATTR_RO(hotplug_seqnum);
 #endif
 
-static decl_subsys(kernel, NULL, NULL);
+decl_subsys(kernel, NULL, NULL);
+EXPORT_SYMBOL_GPL(kernel_subsys);
 
 static struct attribute * kernel_attrs[] = {
 #ifdef CONFIG_HOTPLUG
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 5689ebb1a250..e377e2244103 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -14,6 +14,12 @@
 #include <linux/module.h>
 #include <asm/semaphore.h>
 
+/*
+ * We dont want to execute off keventd since it might
+ * hold a semaphore our callers hold too:
+ */
+static struct workqueue_struct *helper_wq;
+
 struct kthread_create_info
 {
 	/* Information passed to kthread() from keventd. */
@@ -126,12 +132,13 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 	init_completion(&create.started);
 	init_completion(&create.done);
 
-	/* If we're being called to start the first workqueue, we
-	 * can't use keventd. */
-	if (!keventd_up())
+	/*
+	 * The workqueue needs to start up first:
+	 */
+	if (!helper_wq)
 		work.func(work.data);
 	else {
-		schedule_work(&work);
+		queue_work(helper_wq, &work);
 		wait_for_completion(&create.done);
 	}
 	if (!IS_ERR(create.result)) {
@@ -183,3 +190,13 @@ int kthread_stop(struct task_struct *k)
 	return ret;
 }
 EXPORT_SYMBOL(kthread_stop);
+
+static __init int helper_init(void)
+{
+	helper_wq = create_singlethread_workqueue("kthread");
+	BUG_ON(!helper_wq);
+
+	return 0;
+}
+core_initcall(helper_init);
+
diff --git a/kernel/module.c b/kernel/module.c
index 0798443ce002..ce427b675b98 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -53,7 +53,7 @@
 #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
 
 /* Protects module list */
-static spinlock_t modlist_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(modlist_lock);
 
 /* List of modules, protected by module_mutex AND modlist_lock */
 static DECLARE_MUTEX(module_mutex);
@@ -379,7 +379,7 @@ static void module_unload_init(struct module *mod)
 	for (i = 0; i < NR_CPUS; i++)
 		local_set(&mod->ref[i].count, 0);
 	/* Hold reference count during initialization. */
-	local_set(&mod->ref[smp_processor_id()].count, 1);
+	local_set(&mod->ref[_smp_processor_id()].count, 1);
 	/* Backwards compatibility macros put refcount during init. */
 	mod->waiter = current;
 }
@@ -651,7 +651,8 @@ void symbol_put_addr(void *addr)
 }
 EXPORT_SYMBOL_GPL(symbol_put_addr);
 
-static ssize_t show_refcnt(struct module *mod, char *buffer)
+static ssize_t show_refcnt(struct module_attribute *mattr,
+			   struct module *mod, char *buffer)
 {
 	/* sysfs holds a reference */
 	return sprintf(buffer, "%u\n", module_refcount(mod)-1);
@@ -681,13 +682,6 @@ static inline int use_module(struct module *a, struct module *b)
 static inline void module_unload_init(struct module *mod)
 {
 }
-
-asmlinkage long
-sys_delete_module(const char __user *name_user, unsigned int flags)
-{
-	return -ENOSYS;
-}
-
 #endif /* CONFIG_MODULE_UNLOAD */
 
 #ifdef CONFIG_OBSOLETE_MODPARM
@@ -936,79 +930,71 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
  * J. Corbet <corbet@lwn.net>
  */
 #ifdef CONFIG_KALLSYMS
-static void module_sect_attrs_release(struct kobject *kobj)
-{
-	kfree(container_of(kobj, struct module_sections, kobj));
-}
-
-static ssize_t module_sect_show(struct kobject *kobj, struct attribute *attr,
-		char *buf)
+static ssize_t module_sect_show(struct module_attribute *mattr,
+				struct module *mod, char *buf)
 {
 	struct module_sect_attr *sattr =
-		container_of(attr, struct module_sect_attr, attr);
+		container_of(mattr, struct module_sect_attr, mattr);
 	return sprintf(buf, "0x%lx\n", sattr->address);
 }
 
-static struct sysfs_ops module_sect_ops = {
-	.show = module_sect_show,
-};
-
-static struct kobj_type module_sect_ktype = {
-	.sysfs_ops = &module_sect_ops,
-	.release =   module_sect_attrs_release,
-};
-
 static void add_sect_attrs(struct module *mod, unsigned int nsect,
 		char *secstrings, Elf_Shdr *sechdrs)
 {
-	unsigned int nloaded = 0, i;
+	unsigned int nloaded = 0, i, size[2];
+	struct module_sect_attrs *sect_attrs;
 	struct module_sect_attr *sattr;
-	
-	if (!mod->mkobj)
-		return;
+	struct attribute **gattr;
 	
 	/* Count loaded sections and allocate structures */
 	for (i = 0; i < nsect; i++)
 		if (sechdrs[i].sh_flags & SHF_ALLOC)
 			nloaded++;
-	mod->sect_attrs = kmalloc(sizeof(struct module_sections) +
-			nloaded*sizeof(mod->sect_attrs->attrs[0]), GFP_KERNEL);
-	if (! mod->sect_attrs)
+	size[0] = ALIGN(sizeof(*sect_attrs)
+			+ nloaded * sizeof(sect_attrs->attrs[0]),
+			sizeof(sect_attrs->grp.attrs[0]));
+	size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]);
+	if (! (sect_attrs = kmalloc(size[0] + size[1], GFP_KERNEL)))
 		return;
 
-	/* sections entry setup */
-	memset(mod->sect_attrs, 0, sizeof(struct module_sections));
-	if (kobject_set_name(&mod->sect_attrs->kobj, "sections"))
-		goto out;
-	mod->sect_attrs->kobj.parent = &mod->mkobj->kobj;
-	mod->sect_attrs->kobj.ktype = &module_sect_ktype;
-	if (kobject_register(&mod->sect_attrs->kobj))
-		goto out;
+	/* Setup section attributes. */
+	sect_attrs->grp.name = "sections";
+	sect_attrs->grp.attrs = (void *)sect_attrs + size[0];
 
-	/* And the section attributes. */
-	sattr = &mod->sect_attrs->attrs[0];
+	sattr = &sect_attrs->attrs[0];
+	gattr = &sect_attrs->grp.attrs[0];
 	for (i = 0; i < nsect; i++) {
 		if (! (sechdrs[i].sh_flags & SHF_ALLOC))
 			continue;
 		sattr->address = sechdrs[i].sh_addr;
 		strlcpy(sattr->name, secstrings + sechdrs[i].sh_name,
-				MODULE_SECT_NAME_LEN);
-		sattr->attr.name = sattr->name;
-		sattr->attr.owner = mod;
-		sattr->attr.mode = S_IRUGO;
-		(void) sysfs_create_file(&mod->sect_attrs->kobj, &sattr->attr);
-		sattr++;
+			MODULE_SECT_NAME_LEN);
+		sattr->mattr.show = module_sect_show;
+		sattr->mattr.store = NULL;
+		sattr->mattr.attr.name = sattr->name;
+		sattr->mattr.attr.owner = mod;
+		sattr->mattr.attr.mode = S_IRUGO;
+		*(gattr++) = &(sattr++)->mattr.attr;
 	}
+	*gattr = NULL;
+
+	if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp))
+		goto out;
+
+	mod->sect_attrs = sect_attrs;
 	return;
   out:
-	kfree(mod->sect_attrs);
-	mod->sect_attrs = NULL;
+	kfree(sect_attrs);
 }
 
 static void remove_sect_attrs(struct module *mod)
 {
 	if (mod->sect_attrs) {
-		kobject_unregister(&mod->sect_attrs->kobj);
+		sysfs_remove_group(&mod->mkobj.kobj,
+				   &mod->sect_attrs->grp);
+		/* We are positive that no one is using any sect attrs
+		 * at this point.  Deallocate immediately. */
+		kfree(mod->sect_attrs);
 		mod->sect_attrs = NULL;
 	}
 }
@@ -1029,11 +1015,11 @@ static inline void remove_sect_attrs(struct module *mod)
 #ifdef CONFIG_MODULE_UNLOAD
 static inline int module_add_refcnt_attr(struct module *mod)
 {
-	return sysfs_create_file(&mod->mkobj->kobj, &refcnt.attr);
+	return sysfs_create_file(&mod->mkobj.kobj, &refcnt.attr);
 }
 static void module_remove_refcnt_attr(struct module *mod)
 {
-	return sysfs_remove_file(&mod->mkobj->kobj, &refcnt.attr);
+	return sysfs_remove_file(&mod->mkobj.kobj, &refcnt.attr);
 }
 #else
 static inline int module_add_refcnt_attr(struct module *mod)
@@ -1052,17 +1038,13 @@ static int mod_sysfs_setup(struct module *mod,
 {
 	int err;
 
-	mod->mkobj = kmalloc(sizeof(struct module_kobject), GFP_KERNEL);
-	if (!mod->mkobj)
-		return -ENOMEM;
-
-	memset(&mod->mkobj->kobj, 0, sizeof(mod->mkobj->kobj));
-	err = kobject_set_name(&mod->mkobj->kobj, "%s", mod->name);
+	memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
+	err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name);
 	if (err)
 		goto out;
-	kobj_set_kset_s(mod->mkobj, module_subsys);
-	mod->mkobj->mod = mod;
-	err = kobject_register(&mod->mkobj->kobj);
+	kobj_set_kset_s(&mod->mkobj, module_subsys);
+	mod->mkobj.mod = mod;
+	err = kobject_register(&mod->mkobj.kobj);
 	if (err)
 		goto out;
 
@@ -1077,11 +1059,8 @@ static int mod_sysfs_setup(struct module *mod,
 	return 0;
 
 out_unreg:
-	/* Calls module_kobj_release */
-	kobject_unregister(&mod->mkobj->kobj);
-	return err;
+	kobject_unregister(&mod->mkobj.kobj);
 out:
-	kfree(mod->mkobj);
 	return err;
 }
 
@@ -1090,8 +1069,7 @@ static void mod_kobject_remove(struct module *mod)
 	module_remove_refcnt_attr(mod);
 	module_param_sysfs_remove(mod);
 
-	/* Calls module_kobj_release */
-	kobject_unregister(&mod->mkobj->kobj);
+	kobject_unregister(&mod->mkobj.kobj);
 }
 
 /* Free a module, remove from lists, etc (must hold module mutex). */
@@ -1713,6 +1691,9 @@ static struct module *load_module(void __user *umod,
 				 / sizeof(struct kernel_param),
 				 NULL);
 	}
+	if (err < 0)
+		goto arch_cleanup;
+
 	err = mod_sysfs_setup(mod, 
 			      (struct kernel_param *)
 			      sechdrs[setupindex].sh_addr,
@@ -2089,11 +2070,9 @@ void module_add_driver(struct module *mod, struct device_driver *drv)
 {
 	if (!mod || !drv)
 		return;
-	if (!mod->mkobj)
-		return;
 
 	/* Don't check return code; this call is idempotent */
-	sysfs_create_link(&drv->kobj, &mod->mkobj->kobj, "module");
+	sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module");
 }
 EXPORT_SYMBOL(module_add_driver);
 
diff --git a/kernel/params.c b/kernel/params.c
index 45dd451e17c1..ec3dbf68e253 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -77,10 +77,16 @@ static int parse_one(char *param,
 static char *next_arg(char *args, char **param, char **val)
 {
 	unsigned int i, equals = 0;
-	int in_quote = 0;
+	int in_quote = 0, quoted = 0;
+	char *next;
 
 	/* Chew any extra spaces */
 	while (*args == ' ') args++;
+	if (*args == '"') {
+		args++;
+		in_quote = 1;
+		quoted = 1;
+	}
 
 	for (i = 0; args[i]; i++) {
 		if (args[i] == ' ' && !in_quote)
@@ -106,13 +112,16 @@ static char *next_arg(char *args, char **param, char **val)
 			if (args[i-1] == '"')
 				args[i-1] = '\0';
 		}
+		if (quoted && args[i-1] == '"')
+			args[i-1] = '\0';
 	}
 
 	if (args[i]) {
 		args[i] = '\0';
-		return args + i + 1;
+		next = args + i + 1;
 	} else
-		return args + i;
+		next = args + i;
+	return next;
 }
 
 /* Args looks like "foo=bar,bar2 baz=fuz wiz". */
@@ -357,26 +366,23 @@ extern struct kernel_param __start___param[], __stop___param[];
 
 struct param_attribute
 {
-	struct attribute attr;
+	struct module_attribute mattr;
 	struct kernel_param *param;
 };
 
-struct param_kobject
+struct module_param_attrs
 {
-	struct kobject kobj;
-
-	unsigned int num_attributes;
-	struct param_attribute attr[0];
+	struct attribute_group grp;
+	struct param_attribute attrs[0];
 };
 
-#define to_param_attr(n) container_of(n, struct param_attribute, attr);
+#define to_param_attr(n) container_of(n, struct param_attribute, mattr);
 
-static ssize_t param_attr_show(struct kobject *kobj,
-			       struct attribute *attr,
-			       char *buf)
+static ssize_t param_attr_show(struct module_attribute *mattr,
+			       struct module *mod, char *buf)
 {
 	int count;
-	struct param_attribute *attribute = to_param_attr(attr);
+	struct param_attribute *attribute = to_param_attr(mattr);
 
 	if (!attribute->param->get)
 		return -EPERM;
@@ -390,12 +396,12 @@ static ssize_t param_attr_show(struct kobject *kobj,
 }
 
 /* sysfs always hands a nul-terminated string in buf.  We rely on that. */
-static ssize_t param_attr_store(struct kobject *kobj,
-				struct attribute *attr,
+static ssize_t param_attr_store(struct module_attribute *mattr,
+				struct module *owner,
 				const char *buf, size_t len)
 {
  	int err;
-	struct param_attribute *attribute = to_param_attr(attr);
+	struct param_attribute *attribute = to_param_attr(mattr);
 
 	if (!attribute->param->set)
 		return -EPERM;
@@ -406,27 +412,6 @@ static ssize_t param_attr_store(struct kobject *kobj,
 	return err;
 }
 
-
-static struct sysfs_ops param_sysfs_ops = {
-	.show = param_attr_show,
-	.store = param_attr_store,
-};
-
-static void param_kobj_release(struct kobject *kobj)
-{
-	kfree(container_of(kobj, struct param_kobject, kobj));
-}
-
-static struct kobj_type param_ktype = {
-	.sysfs_ops =	&param_sysfs_ops,
-	.release =	&param_kobj_release,
-};
-
-static struct kset param_kset = {
-	.subsys =	&module_subsys,
-	.ktype =	&param_ktype,
-};
-
 #ifdef CONFIG_MODULES
 #define __modinit
 #else
@@ -434,54 +419,6 @@ static struct kset param_kset = {
 #endif
 
 /*
- * param_add_attribute - actually adds an parameter to sysfs
- * @mod: owner of parameter
- * @pk: param_kobject the attribute shall be assigned to.
- *      One per module, one per KBUILD_MODNAME.
- * @kp: kernel_param to be added
- * @skip: offset where the parameter name start in kp->name.
- * Needed for built-in modules
- *
- * Fill in data into appropriate &pk->attr[], and create sysfs file.
- */
-static __modinit int param_add_attribute(struct module *mod,
-					 struct param_kobject *pk,
-					 struct kernel_param *kp,
-					 unsigned int skip)
-{
-	struct param_attribute *a;
-	int err;
-
-	a = &pk->attr[pk->num_attributes];
-	a->attr.name = (char *) &kp->name[skip];
-	a->attr.owner = mod;
-	a->attr.mode = kp->perm;
-	a->param = kp;
-	err = sysfs_create_file(&pk->kobj, &a->attr);
-	if (!err)
-		pk->num_attributes++;
-	return err;
-}
-
-/*
- * param_sysfs_remove - remove sysfs support for one module or KBUILD_MODNAME
- * @pk: struct param_kobject which is to be removed
- *
- * Called when an error in registration occurs or a module is removed
- * from the system.
- */
-static __modinit void param_sysfs_remove(struct param_kobject *pk)
-{
-	unsigned int i;
-	for (i = 0; i < pk->num_attributes; i++)
-		sysfs_remove_file(&pk->kobj,&pk->attr[i].attr);
-
-	/* Calls param_kobj_release */
-	kobject_unregister(&pk->kobj);
-}
-
-
-/*
  * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME
  * @mk: struct module_kobject (contains parent kobject)
  * @kparam: array of struct kernel_param, the actual parameter definitions
@@ -492,15 +429,17 @@ static __modinit void param_sysfs_remove(struct param_kobject *pk)
  * in sysfs. A pointer to the param_kobject is returned on success,
  * NULL if there's no parameter to export, or other ERR_PTR(err).
  */
-static __modinit struct param_kobject *
+static __modinit struct module_param_attrs *
 param_sysfs_setup(struct module_kobject *mk,
 		  struct kernel_param *kparam,
 		  unsigned int num_params,
 		  unsigned int name_skip)
 {
-	struct param_kobject *pk;
+	struct module_param_attrs *mp;
 	unsigned int valid_attrs = 0;
-	unsigned int i;
+	unsigned int i, size[2];
+	struct param_attribute *pattr;
+	struct attribute **gattr;
 	int err;
 
 	for (i=0; i<num_params; i++) {
@@ -511,42 +450,39 @@ param_sysfs_setup(struct module_kobject *mk,
 	if (!valid_attrs)
 		return NULL;
 
-	pk = kmalloc(sizeof(struct param_kobject)
-		     + sizeof(struct param_attribute) * valid_attrs,
-		     GFP_KERNEL);
-	if (!pk)
-		return ERR_PTR(-ENOMEM);
-	memset(pk, 0, sizeof(struct param_kobject)
-	       + sizeof(struct param_attribute) * valid_attrs);
+	size[0] = ALIGN(sizeof(*mp) +
+			valid_attrs * sizeof(mp->attrs[0]),
+			sizeof(mp->grp.attrs[0]));
+	size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]);
 
-	err = kobject_set_name(&pk->kobj, "parameters");
-	if (err)
-		goto out;
+	mp = kmalloc(size[0] + size[1], GFP_KERNEL);
+	if (!mp)
+		return ERR_PTR(-ENOMEM);
 
-	pk->kobj.kset = &param_kset;
-	pk->kobj.parent = &mk->kobj;
-	err = kobject_register(&pk->kobj);
-	if (err)
-		goto out;
+	mp->grp.name = "parameters";
+	mp->grp.attrs = (void *)mp + size[0];
 
+	pattr = &mp->attrs[0];
+	gattr = &mp->grp.attrs[0];
 	for (i = 0; i < num_params; i++) {
-		if (kparam[i].perm) {
-			err = param_add_attribute(mk->mod, pk,
-						  &kparam[i], name_skip);
-			if (err)
-				goto out_unreg;
+		struct kernel_param *kp = &kparam[i];
+		if (kp->perm) {
+			pattr->param = kp;
+			pattr->mattr.show = param_attr_show;
+			pattr->mattr.store = param_attr_store;
+			pattr->mattr.attr.name = (char *)&kp->name[name_skip];
+			pattr->mattr.attr.owner = mk->mod;
+			pattr->mattr.attr.mode = kp->perm;
+			*(gattr++) = &(pattr++)->mattr.attr;
 		}
 	}
+	*gattr = NULL;
 
-	return pk;
-
-out_unreg:
-	param_sysfs_remove(pk);
-	return ERR_PTR(err);
-
-out:
-	kfree(pk);
-	return ERR_PTR(err);
+	if ((err = sysfs_create_group(&mk->kobj, &mp->grp))) {
+		kfree(mp);
+		return ERR_PTR(err);
+	}
+	return mp;
 }
 
 
@@ -565,13 +501,13 @@ int module_param_sysfs_setup(struct module *mod,
 			     struct kernel_param *kparam,
 			     unsigned int num_params)
 {
-	struct param_kobject *pk;
+	struct module_param_attrs *mp;
 
-	pk = param_sysfs_setup(mod->mkobj, kparam, num_params, 0);
-	if (IS_ERR(pk))
-		return PTR_ERR(pk);
+	mp = param_sysfs_setup(&mod->mkobj, kparam, num_params, 0);
+	if (IS_ERR(mp))
+		return PTR_ERR(mp);
 
-	mod->params_kobject = pk;
+	mod->param_attrs = mp;
 	return 0;
 }
 
@@ -584,9 +520,13 @@ int module_param_sysfs_setup(struct module *mod,
  */
 void module_param_sysfs_remove(struct module *mod)
 {
-	if (mod->params_kobject) {
-		param_sysfs_remove(mod->params_kobject);
-		mod->params_kobject = NULL;
+	if (mod->param_attrs) {
+		sysfs_remove_group(&mod->mkobj.kobj,
+				   &mod->param_attrs->grp);
+		/* We are positive that no one is using any param
+		 * attrs at this point.  Deallocate immediately. */
+		kfree(mod->param_attrs);
+		mod->param_attrs = NULL;
 	}
 }
 #endif
@@ -610,8 +550,10 @@ static void __init kernel_param_sysfs_setup(const char *name,
 	kobject_register(&mk->kobj);
 
 	/* no need to keep the kobject if no parameter is exported */
-	if (!param_sysfs_setup(mk, kparam, num_params, name_skip))
+	if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) {
 		kobject_unregister(&mk->kobj);
+		kfree(mk);
+	}
 }
 
 /*
@@ -691,7 +633,7 @@ static ssize_t module_attr_show(struct kobject *kobj,
 	if (!try_module_get(mk->mod))
 		return -ENODEV;
 
-	ret = attribute->show(mk->mod, buf);
+	ret = attribute->show(attribute, mk->mod, buf);
 
 	module_put(mk->mod);
 
@@ -710,14 +652,8 @@ static struct sysfs_ops module_sysfs_ops = {
 };
 #endif
 
-static void module_kobj_release(struct kobject *kobj)
-{
-	kfree(container_of(kobj, struct module_kobject, kobj));
-}
-
 static struct kobj_type module_ktype = {
 	.sysfs_ops =	&module_sysfs_ops,
-	.release =	&module_kobj_release,
 };
 
 decl_subsys(module, &module_ktype, NULL);
@@ -728,8 +664,6 @@ decl_subsys(module, &module_ktype, NULL);
 static int __init param_sysfs_init(void)
 {
 	subsystem_register(&module_subsys);
-	kobject_set_name(&param_kset.kobj, "parameters");
-	kset_init(&param_kset);
 
 	param_sysfs_builtin();
 
diff --git a/kernel/pid.c b/kernel/pid.c
index 185a8bee8168..edba31c681ac 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -60,7 +60,7 @@ typedef struct pidmap {
 static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
 	 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
 
-static spinlock_t pidmap_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
+static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
 
 fastcall void free_pidmap(int pid)
 {
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 33a67e7ad826..9e79eca513ca 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -85,7 +85,7 @@ static inline u64  mpy_l_X_l_ll(unsigned long mpy1,unsigned long mpy2)
  */
 static kmem_cache_t *posix_timers_cache;
 static struct idr posix_timers_id;
-static spinlock_t idr_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(idr_lock);
 
 /*
  * Just because the timer is not in the timer list does NOT mean it is
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ed49ffd12bc6..696387ffe49c 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -48,7 +48,7 @@ config SOFTWARE_SUSPEND
 	  involved in suspending. Also in this case there is a risk that buffers
 	  on disk won't match with saved ones.
 
-	  For more information take a look at Documentation/power/swsusp.txt.
+	  For more information take a look at <file:Documentation/power/swsusp.txt>.
 
 config PM_STD_PARTITION
 	string "Default resume partition"
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 0f5dc712ad70..b9b3f5881677 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -51,7 +51,7 @@ static void power_down(suspend_disk_method_t mode)
 	local_irq_save(flags);
 	switch(mode) {
 	case PM_DISK_PLATFORM:
-		device_power_down(PM_SUSPEND_DISK);
+ 		device_power_down(PMSG_SUSPEND);
 		error = pm_ops->enter(PM_SUSPEND_DISK);
 		break;
 	case PM_DISK_SHUTDOWN:
@@ -144,8 +144,10 @@ static int prepare(void)
 	free_some_memory();
 
 	disable_nonboot_cpus();
-	if ((error = device_suspend(PM_SUSPEND_DISK)))
+	if ((error = device_suspend(PMSG_FREEZE))) {
+		printk("Some devices failed to suspend\n");
 		goto Finish;
+	}
 
 	return 0;
  Finish:
@@ -163,7 +165,7 @@ static int prepare(void)
  *
  *	If we're going through the firmware, then get it over with quickly.
  *
- *	If not, then call swsusp to do it's thing, then figure out how
+ *	If not, then call swsusp to do its thing, then figure out how
  *	to power down the system.
  */
 
@@ -201,7 +203,7 @@ int pm_suspend_disk(void)
  *	software_resume - Resume from a saved image.
  *
  *	Called as a late_initcall (so all devices are discovered and
- *	initialized), we call pmdisk to see if we have a saved image or not.
+ *	initialized), we call swsusp to see if we have a saved image or not.
  *	If so, we quiesce devices, the restore the saved image. We will
  *	return above (in pm_suspend_disk() ) if everything goes well.
  *	Otherwise, we fail gracefully and return to the normally
@@ -221,7 +223,7 @@ static int software_resume(void)
 		return 0;
 	}
 
-	pr_debug("PM: Reading pmdisk image.\n");
+	pr_debug("PM: Reading swsusp image.\n");
 
 	if ((error = swsusp_read()))
 		goto Done;
@@ -284,7 +286,7 @@ static char * pm_disk_modes[] = {
 
 static ssize_t disk_show(struct subsystem * subsys, char * buf)
 {
-	return sprintf(buf,"%s\n",pm_disk_modes[pm_disk_mode]);
+	return sprintf(buf, "%s\n", pm_disk_modes[pm_disk_mode]);
 }
 
 
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0aefb03ede09..b7ef95c96230 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -65,7 +65,7 @@ static int suspend_prepare(suspend_state_t state)
 			goto Thaw;
 	}
 
-	if ((error = device_suspend(state)))
+	if ((error = device_suspend(PMSG_SUSPEND)))
 		goto Finish;
 	return 0;
  Finish:
@@ -78,13 +78,14 @@ static int suspend_prepare(suspend_state_t state)
 }
 
 
-static int suspend_enter(u32 state)
+static int suspend_enter(suspend_state_t state)
 {
 	int error = 0;
 	unsigned long flags;
 
 	local_irq_save(flags);
-	if ((error = device_power_down(state)))
+
+	if ((error = device_power_down(PMSG_SUSPEND)))
 		goto Done;
 	error = pm_ops->enter(state);
 	device_power_up();
@@ -99,7 +100,7 @@ static int suspend_enter(u32 state)
  *	@state:		State we're coming out of.
  *
  *	Call platform code to clean up, restart processes, and free the 
- *	console that we've allocated.
+ *	console that we've allocated. This is not called for suspend-to-disk.
  */
 
 static void suspend_finish(suspend_state_t state)
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 4bf7fe6d2feb..22cdaa42d922 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -67,12 +67,13 @@
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
 #include <asm/pgtable.h>
+#include <asm/tlbflush.h>
 #include <asm/io.h>
 
 #include "power.h"
 
 /* References to section boundaries */
-extern char __nosave_begin, __nosave_end;
+extern const void __nosave_begin, __nosave_end;
 
 /* Variables to be preserved over suspend */
 static int pagedir_order_check;
@@ -419,7 +420,7 @@ struct highmem_page {
 	struct highmem_page *next;
 };
 
-struct highmem_page *highmem_copy = NULL;
+static struct highmem_page *highmem_copy;
 
 static int save_highmem_zone(struct zone *zone)
 {
@@ -752,11 +753,11 @@ static int swsusp_alloc(void)
 		return -ENOSPC;
 
 	if ((error = alloc_pagedir())) {
-		pr_debug("suspend: Allocating pagedir failed.\n");
+		printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
 		return error;
 	}
 	if ((error = alloc_image_pages())) {
-		pr_debug("suspend: Allocating image pages failed.\n");
+		printk(KERN_ERR "suspend: Allocating image pages failed.\n");
 		swsusp_free();
 		return error;
 	}
@@ -766,7 +767,7 @@ static int swsusp_alloc(void)
 	return 0;
 }
 
-int suspend_prepare_image(void)
+static int suspend_prepare_image(void)
 {
 	int error;
 
@@ -842,11 +843,22 @@ int swsusp_suspend(void)
 	if ((error = arch_prepare_suspend()))
 		return error;
 	local_irq_disable();
+	/* At this point, device_suspend() has been called, but *not*
+	 * device_power_down(). We *must* device_power_down() now.
+	 * Otherwise, drivers for some devices (e.g. interrupt controllers)
+	 * become desynchronized with the actual state of the hardware
+	 * at resume time, and evil weirdness ensues.
+	 */
+	if ((error = device_power_down(PMSG_FREEZE))) {
+		local_irq_enable();
+		return error;
+	}
 	save_processor_state();
 	error = swsusp_arch_suspend();
 	/* Restore control flow magically appears here */
 	restore_processor_state();
 	restore_highmem();
+	device_power_up();
 	local_irq_enable();
 	return error;
 }
@@ -866,6 +878,7 @@ int swsusp_resume(void)
 {
 	int error;
 	local_irq_disable();
+	device_power_down(PMSG_FREEZE);
 	/* We'll ignore saved state, but this gets preempt count (etc) right */
 	save_processor_state();
 	error = swsusp_arch_resume();
@@ -875,6 +888,7 @@ int swsusp_resume(void)
 	BUG_ON(!error);
 	restore_processor_state();
 	restore_highmem();
+	device_power_up();
 	local_irq_enable();
 	return error;
 }
@@ -1036,12 +1050,12 @@ static int submit(int rw, pgoff_t page_off, void * page)
 	return error;
 }
 
-int bio_read_page(pgoff_t page_off, void * page)
+static int bio_read_page(pgoff_t page_off, void * page)
 {
 	return submit(READ, page_off, page);
 }
 
-int bio_write_page(pgoff_t page_off, void * page)
+static int bio_write_page(pgoff_t page_off, void * page)
 {
 	return submit(WRITE, page_off, page);
 }
@@ -1158,7 +1172,7 @@ static int __init read_pagedir(void)
 		return -ENOMEM;
 	pagedir_nosave = (struct pbe *)addr;
 
-	pr_debug("pmdisk: Reading pagedir (%d Pages)\n",n);
+	pr_debug("swsusp: Reading pagedir (%d Pages)\n",n);
 
 	for (i = 0; i < n && !error; i++, addr += PAGE_SIZE) {
 		unsigned long offset = swp_offset(swsusp_info.pagedir[i]);
diff --git a/kernel/printk.c b/kernel/printk.c
index 4e9fd492f30e..d914a90d6206 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -78,7 +78,7 @@ static int console_locked;
  * It is also used in interesting ways to provide interlocking in
  * release_console_sem().
  */
-static spinlock_t logbuf_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(logbuf_lock);
 
 static char __log_buf[__LOG_BUF_LEN];
 static char *log_buf = __log_buf;
@@ -284,6 +284,7 @@ int do_syslog(int type, char __user * buf, int len)
 			error = __put_user(c,buf);
 			buf++;
 			i++;
+			cond_resched();
 			spin_lock_irq(&logbuf_lock);
 		}
 		spin_unlock_irq(&logbuf_lock);
@@ -325,6 +326,7 @@ int do_syslog(int type, char __user * buf, int len)
 			c = LOG_BUF(j);
 			spin_unlock_irq(&logbuf_lock);
 			error = __put_user(c,&buf[count-1-i]);
+			cond_resched();
 			spin_lock_irq(&logbuf_lock);
 		}
 		spin_unlock_irq(&logbuf_lock);
@@ -340,6 +342,7 @@ int do_syslog(int type, char __user * buf, int len)
 					error = -EFAULT;
 					break;
 				}
+				cond_resched();
 			}
 		}
 		break;
@@ -642,8 +645,9 @@ void release_console_sem(void)
 		_con_start = con_start;
 		_log_end = log_end;
 		con_start = log_end;		/* Flush */
-		spin_unlock_irqrestore(&logbuf_lock, flags);
+		spin_unlock(&logbuf_lock);
 		call_console_drivers(_con_start, _log_end);
+		local_irq_restore(flags);
 	}
 	console_locked = 0;
 	console_may_schedule = 0;
@@ -871,7 +875,7 @@ void tty_write_message(struct tty_struct *tty, char *msg)
  */
 int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
 {
-	static spinlock_t ratelimit_lock = SPIN_LOCK_UNLOCKED;
+	static DEFINE_SPINLOCK(ratelimit_lock);
 	static unsigned long toks = 10*5*HZ;
 	static unsigned long last_msg;
 	static int missed;
diff --git a/kernel/profile.c b/kernel/profile.c
index ff62fa98328a..a38fa70075fe 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -83,7 +83,7 @@ void __init profile_init(void)
 #ifdef CONFIG_PROFILING
  
 static DECLARE_RWSEM(profile_rwsem);
-static rwlock_t handoff_lock = RW_LOCK_UNLOCKED;
+static DEFINE_RWLOCK(handoff_lock);
 static struct notifier_block * task_exit_notifier;
 static struct notifier_block * task_free_notifier;
 static struct notifier_block * munmap_notifier;
@@ -326,17 +326,15 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
 		node = cpu_to_node(cpu);
 		per_cpu(cpu_profile_flip, cpu) = 0;
 		if (!per_cpu(cpu_profile_hits, cpu)[1]) {
-			page = alloc_pages_node(node, GFP_KERNEL, 0);
+			page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
 			if (!page)
 				return NOTIFY_BAD;
-			clear_highpage(page);
 			per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
 		}
 		if (!per_cpu(cpu_profile_hits, cpu)[0]) {
-			page = alloc_pages_node(node, GFP_KERNEL, 0);
+			page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
 			if (!page)
 				goto out_free;
-			clear_highpage(page);
 			per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
 		}
 		break;
@@ -510,16 +508,14 @@ static int __init create_hash_tables(void)
 		int node = cpu_to_node(cpu);
 		struct page *page;
 
-		page = alloc_pages_node(node, GFP_KERNEL, 0);
+		page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
 		if (!page)
 			goto out_cleanup;
-		clear_highpage(page);
 		per_cpu(cpu_profile_hits, cpu)[1]
 				= (struct profile_hit *)page_address(page);
-		page = alloc_pages_node(node, GFP_KERNEL, 0);
+		page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
 		if (!page)
 			goto out_cleanup;
-		clear_highpage(page);
 		per_cpu(cpu_profile_hits, cpu)[0]
 				= (struct profile_hit *)page_address(page);
 	}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 60801c692810..136a8feba91e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -38,10 +38,24 @@ void __ptrace_link(task_t *child, task_t *new_parent)
 	SET_LINKS(child);
 }
  
-static inline int pending_resume_signal(struct sigpending *pending)
+/*
+ * Turn a tracing stop into a normal stop now, since with no tracer there
+ * would be no way to wake it up with SIGCONT or SIGKILL.  If there was a
+ * signal sent that would resume the child, but didn't because it was in
+ * TASK_TRACED, resume it now.
+ * Requires that irqs be disabled.
+ */
+void ptrace_untrace(task_t *child)
 {
-#define M(sig) (1UL << ((sig)-1))
-	return sigtestsetmask(&pending->signal, M(SIGCONT) | M(SIGKILL));
+	spin_lock(&child->sighand->siglock);
+	if (child->state == TASK_TRACED) {
+		if (child->signal->flags & SIGNAL_STOP_STOPPED) {
+			child->state = TASK_STOPPED;
+		} else {
+			signal_wake_up(child, 1);
+		}
+	}
+	spin_unlock(&child->sighand->siglock);
 }
 
 /*
@@ -55,29 +69,15 @@ void __ptrace_unlink(task_t *child)
 	if (!child->ptrace)
 		BUG();
 	child->ptrace = 0;
-	if (list_empty(&child->ptrace_list))
-		return;
-	list_del_init(&child->ptrace_list);
-	REMOVE_LINKS(child);
-	child->parent = child->real_parent;
-	SET_LINKS(child);
-
-	if (child->state == TASK_TRACED) {
-		/*
-		 * Turn a tracing stop into a normal stop now,
-		 * since with no tracer there would be no way
-		 * to wake it up with SIGCONT or SIGKILL.
-		 * If there was a signal sent that would resume the child,
-		 * but didn't because it was in TASK_TRACED, resume it now.
-		 */
-		spin_lock(&child->sighand->siglock);
-		child->state = TASK_STOPPED;
-		if (pending_resume_signal(&child->pending) ||
-		    pending_resume_signal(&child->signal->shared_pending)) {
-			signal_wake_up(child, 1);
-		}
-		spin_unlock(&child->sighand->siglock);
+	if (!list_empty(&child->ptrace_list)) {
+		list_del_init(&child->ptrace_list);
+		REMOVE_LINKS(child);
+		child->parent = child->real_parent;
+		SET_LINKS(child);
 	}
+
+	if (child->state == TASK_TRACED)
+		ptrace_untrace(child);
 }
 
 /*
@@ -319,18 +319,33 @@ static int ptrace_setoptions(struct task_struct *child, long data)
 
 static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data)
 {
-	if (child->last_siginfo == NULL)
-		return -EINVAL;
-	return copy_siginfo_to_user(data, child->last_siginfo);
+	siginfo_t lastinfo;
+
+	spin_lock_irq(&child->sighand->siglock);
+	if (likely(child->last_siginfo != NULL)) {
+		memcpy(&lastinfo, child->last_siginfo, sizeof (siginfo_t));
+		spin_unlock_irq(&child->sighand->siglock);
+		return copy_siginfo_to_user(data, &lastinfo);
+	}
+	spin_unlock_irq(&child->sighand->siglock);
+	return -EINVAL;
 }
 
 static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data)
 {
-	if (child->last_siginfo == NULL)
-		return -EINVAL;
-	if (copy_from_user(child->last_siginfo, data, sizeof (siginfo_t)) != 0)
+	siginfo_t newinfo;
+
+	if (copy_from_user(&newinfo, data, sizeof (siginfo_t)) != 0)
 		return -EFAULT;
-	return 0;
+
+	spin_lock_irq(&child->sighand->siglock);
+	if (likely(child->last_siginfo != NULL)) {
+		memcpy(child->last_siginfo, &newinfo, sizeof (siginfo_t));
+		spin_unlock_irq(&child->sighand->siglock);
+		return 0;
+	}
+	spin_unlock_irq(&child->sighand->siglock);
+	return -EINVAL;
 }
 
 int ptrace_request(struct task_struct *child, long request,
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 80cac1cd0859..f0ae3c3c013e 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -49,9 +49,9 @@
 
 /* Definition for rcupdate control block. */
 struct rcu_ctrlblk rcu_ctrlblk = 
-	{ .cur = -300, .completed = -300 , .lock = SEQCNT_ZERO };
+	{ .cur = -300, .completed = -300 };
 struct rcu_ctrlblk rcu_bh_ctrlblk =
-	{ .cur = -300, .completed = -300 , .lock = SEQCNT_ZERO };
+	{ .cur = -300, .completed = -300 };
 
 /* Bookkeeping of the progress of the grace period */
 struct rcu_state {
@@ -60,9 +60,9 @@ struct rcu_state {
 	                              /* for current batch to proceed.        */
 };
 
-struct rcu_state rcu_state ____cacheline_maxaligned_in_smp =
+static struct rcu_state rcu_state ____cacheline_maxaligned_in_smp =
 	  {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE };
-struct rcu_state rcu_bh_state ____cacheline_maxaligned_in_smp =
+static struct rcu_state rcu_bh_state ____cacheline_maxaligned_in_smp =
 	  {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE };
 
 DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
@@ -185,10 +185,13 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp,
 			rcp->completed == rcp->cur) {
 		/* Can't change, since spin lock held. */
 		cpus_andnot(rsp->cpumask, cpu_online_map, nohz_cpu_mask);
-		write_seqcount_begin(&rcp->lock);
+
 		rcp->next_pending = 0;
+		/* next_pending == 0 must be visible in __rcu_process_callbacks()
+		 * before it can see new value of cur.
+		 */
+		smp_wmb();
 		rcp->cur++;
-		write_seqcount_end(&rcp->lock);
 	}
 }
 
@@ -216,9 +219,9 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
 			struct rcu_state *rsp, struct rcu_data *rdp)
 {
 	if (rdp->quiescbatch != rcp->cur) {
-		/* new grace period: record qsctr value. */
+		/* start new grace period: */
 		rdp->qs_pending = 1;
-		rdp->last_qsctr = rdp->qsctr;
+		rdp->passed_quiesc = 0;
 		rdp->quiescbatch = rcp->cur;
 		return;
 	}
@@ -231,11 +234,10 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
 		return;
 
 	/* 
-	 * Races with local timer interrupt - in the worst case
-	 * we may miss one quiescent state of that CPU. That is
-	 * tolerable. So no need to disable interrupts.
+	 * Was there a quiescent state since the beginning of the grace
+	 * period? If no, then exit and wait for the next call.
 	 */
-	if (rdp->qsctr == rdp->last_qsctr)
+	if (!rdp->passed_quiesc)
 		return;
 	rdp->qs_pending = 0;
 
@@ -319,8 +321,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 
 	local_irq_disable();
 	if (rdp->nxtlist && !rdp->curlist) {
-		int next_pending, seq;
-
 		rdp->curlist = rdp->nxtlist;
 		rdp->curtail = rdp->nxttail;
 		rdp->nxtlist = NULL;
@@ -330,14 +330,15 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 		/*
 		 * start the next batch of callbacks
 		 */
-		do {
-			seq = read_seqcount_begin(&rcp->lock);
-			/* determine batch number */
-			rdp->batch = rcp->cur + 1;
-			next_pending = rcp->next_pending;
-		} while (read_seqcount_retry(&rcp->lock, seq));
-
-		if (!next_pending) {
+
+		/* determine batch number */
+		rdp->batch = rcp->cur + 1;
+		/* see the comment and corresponding wmb() in
+		 * the rcu_start_batch()
+		 */
+		smp_rmb();
+
+		if (!rcp->next_pending) {
 			/* and start it/schedule start if it's a new batch */
 			spin_lock(&rsp->lock);
 			rcu_start_batch(rcp, rsp, 1);
diff --git a/kernel/resource.c b/kernel/resource.c
index 5f013dc4f649..72596bc6fdaf 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -39,7 +39,7 @@ struct resource iomem_resource = {
 
 EXPORT_SYMBOL(iomem_resource);
 
-static rwlock_t resource_lock = RW_LOCK_UNLOCKED;
+static DEFINE_RWLOCK(resource_lock);
 
 #ifdef CONFIG_PROC_FS
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 43bfde70a34c..099f946ed40c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -36,6 +36,7 @@
 #include <linux/blkdev.h>
 #include <linux/delay.h>
 #include <linux/smp.h>
+#include <linux/threads.h>
 #include <linux/timer.h>
 #include <linux/rcupdate.h>
 #include <linux/cpu.h>
@@ -48,12 +49,6 @@
 
 #include <asm/unistd.h>
 
-#ifdef CONFIG_NUMA
-#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
-#else
-#define cpu_to_node_mask(cpu) (cpu_online_map)
-#endif
-
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -97,7 +92,6 @@
 #define MAX_SLEEP_AVG		(DEF_TIMESLICE * MAX_BONUS)
 #define STARVATION_LIMIT	(MAX_SLEEP_AVG)
 #define NS_MAX_SLEEP_AVG	(JIFFIES_TO_NS(MAX_SLEEP_AVG))
-#define CREDIT_LIMIT		100
 
 /*
  * If a task is 'interactive' then we reinsert it in the active
@@ -131,12 +125,14 @@
 	(NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
 		MAX_SLEEP_AVG)
 
+#define GRANULARITY	(10 * HZ / 1000 ? : 1)
+
 #ifdef CONFIG_SMP
-#define TIMESLICE_GRANULARITY(p)	(MIN_TIMESLICE * \
+#define TIMESLICE_GRANULARITY(p)	(GRANULARITY * \
 		(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
 			num_online_cpus())
 #else
-#define TIMESLICE_GRANULARITY(p)	(MIN_TIMESLICE * \
+#define TIMESLICE_GRANULARITY(p)	(GRANULARITY * \
 		(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
 #endif
 
@@ -153,12 +149,6 @@
 	(JIFFIES_TO_NS(MAX_SLEEP_AVG * \
 		(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
 
-#define HIGH_CREDIT(p) \
-	((p)->interactive_credit > CREDIT_LIMIT)
-
-#define LOW_CREDIT(p) \
-	((p)->interactive_credit < -CREDIT_LIMIT)
-
 #define TASK_PREEMPTS_CURR(p, rq) \
 	((p)->prio < (rq)->curr->prio)
 
@@ -448,11 +438,21 @@ static runqueue_t *this_rq_lock(void)
 	return rq;
 }
 
-static inline void rq_unlock(runqueue_t *rq)
-	__releases(rq->lock)
+#ifdef CONFIG_SCHED_SMT
+static int cpu_and_siblings_are_idle(int cpu)
 {
-	spin_unlock_irq(&rq->lock);
+	int sib;
+	for_each_cpu_mask(sib, cpu_sibling_map[cpu]) {
+		if (idle_cpu(sib))
+			continue;
+		return 0;
+	}
+
+	return 1;
 }
+#else
+#define cpu_and_siblings_are_idle(A) idle_cpu(A)
+#endif
 
 #ifdef CONFIG_SCHEDSTATS
 /*
@@ -581,10 +581,14 @@ static void enqueue_task(struct task_struct *p, prio_array_t *array)
 }
 
 /*
- * Used by the migration code - we pull tasks from the head of the
- * remote queue so we want these tasks to show up at the head of the
- * local queue:
+ * Put task to the end of the run list without the overhead of dequeue
+ * followed by enqueue.
  */
+static void requeue_task(struct task_struct *p, prio_array_t *array)
+{
+	list_move_tail(&p->run_list, array->queue + p->prio);
+}
+
 static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
 {
 	list_add(&p->run_list, array->queue + p->prio);
@@ -663,8 +667,6 @@ static void recalc_task_prio(task_t *p, unsigned long long now)
 			sleep_time > INTERACTIVE_SLEEP(p)) {
 				p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
 						DEF_TIMESLICE);
-				if (!HIGH_CREDIT(p))
-					p->interactive_credit++;
 		} else {
 			/*
 			 * The lower the sleep avg a task has the more
@@ -673,19 +675,11 @@ static void recalc_task_prio(task_t *p, unsigned long long now)
 			sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
 
 			/*
-			 * Tasks with low interactive_credit are limited to
-			 * one timeslice worth of sleep avg bonus.
+			 * Tasks waking from uninterruptible sleep are
+			 * limited in their sleep_avg rise as they
+			 * are likely to be waiting on I/O
 			 */
-			if (LOW_CREDIT(p) &&
-			    sleep_time > JIFFIES_TO_NS(task_timeslice(p)))
-				sleep_time = JIFFIES_TO_NS(task_timeslice(p));
-
-			/*
-			 * Non high_credit tasks waking from uninterruptible
-			 * sleep are limited in their sleep_avg rise as they
-			 * are likely to be cpu hogs waiting on I/O
-			 */
-			if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) {
+			if (p->activated == -1 && p->mm) {
 				if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
 					sleep_time = 0;
 				else if (p->sleep_avg + sleep_time >=
@@ -705,11 +699,8 @@ static void recalc_task_prio(task_t *p, unsigned long long now)
 			 */
 			p->sleep_avg += sleep_time;
 
-			if (p->sleep_avg > NS_MAX_SLEEP_AVG) {
+			if (p->sleep_avg > NS_MAX_SLEEP_AVG)
 				p->sleep_avg = NS_MAX_SLEEP_AVG;
-				if (!HIGH_CREDIT(p))
-					p->interactive_credit++;
-			}
 		}
 	}
 
@@ -934,9 +925,10 @@ static inline unsigned long target_load(int cpu)
 #endif
 
 /*
- * wake_idle() is useful especially on SMT architectures to wake a
- * task onto an idle sibling if we would otherwise wake it onto a
- * busy sibling.
+ * wake_idle() will wake a task on an idle cpu if task->cpu is
+ * not idle and an idle cpu is available.  The span of cpus to
+ * search starts with cpus closest then further out as needed,
+ * so we always favor a closer, idle cpu.
  *
  * Returns the CPU we should wake onto.
  */
@@ -944,24 +936,23 @@ static inline unsigned long target_load(int cpu)
 static int wake_idle(int cpu, task_t *p)
 {
 	cpumask_t tmp;
-	runqueue_t *rq = cpu_rq(cpu);
 	struct sched_domain *sd;
 	int i;
 
 	if (idle_cpu(cpu))
 		return cpu;
 
-	sd = rq->sd;
-	if (!(sd->flags & SD_WAKE_IDLE))
-		return cpu;
-
-	cpus_and(tmp, sd->span, p->cpus_allowed);
-
-	for_each_cpu_mask(i, tmp) {
-		if (idle_cpu(i))
-			return i;
+	for_each_domain(cpu, sd) {
+		if (sd->flags & SD_WAKE_IDLE) {
+			cpus_and(tmp, sd->span, cpu_online_map);
+			cpus_and(tmp, tmp, p->cpus_allowed);
+			for_each_cpu_mask(i, tmp) {
+				if (idle_cpu(i))
+					return i;
+			}
+		}
+		else break;
 	}
-
 	return cpu;
 }
 #else
@@ -1073,7 +1064,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
 out_set_cpu:
 	schedstat_inc(rq, ttwu_attempts);
 	new_cpu = wake_idle(new_cpu, p);
-	if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) {
+	if (new_cpu != cpu) {
 		schedstat_inc(rq, ttwu_moved);
 		set_task_cpu(p, new_cpu);
 		task_rq_unlock(rq, &flags);
@@ -1126,7 +1117,7 @@ out:
 int fastcall wake_up_process(task_t * p)
 {
 	return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
-		       		 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
+				 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
 }
 
 EXPORT_SYMBOL(wake_up_process);
@@ -1191,7 +1182,7 @@ void fastcall sched_fork(task_t *p)
 		 */
 		current->time_slice = 1;
 		preempt_disable();
-		scheduler_tick(0, 0);
+		scheduler_tick();
 		local_irq_enable();
 		preempt_enable();
 	} else
@@ -1227,8 +1218,6 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
 	p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
 		CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
 
-	p->interactive_credit = 0;
-
 	p->prio = effective_prio(p);
 
 	if (likely(cpu == this_cpu)) {
@@ -1667,13 +1656,18 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
 	if (!cpu_isset(this_cpu, p->cpus_allowed))
 		return 0;
 
-	/* Aggressive migration if we've failed balancing */
-	if (idle == NEWLY_IDLE ||
-			sd->nr_balance_failed < sd->cache_nice_tries) {
-		if (task_hot(p, rq->timestamp_last_tick, sd))
-			return 0;
-	}
+	/*
+	 * Aggressive migration if:
+	 * 1) the [whole] cpu is idle, or
+	 * 2) too many balance attempts have failed.
+	 */
+
+	if (cpu_and_siblings_are_idle(this_cpu) || \
+			sd->nr_balance_failed > sd->cache_nice_tries)
+		return 1;
 
+	if (task_hot(p, rq->timestamp_last_tick, sd))
+			return 0;
 	return 1;
 }
 
@@ -2009,7 +2003,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 		if (sd->balance_interval < sd->max_interval)
 			sd->balance_interval++;
 	} else {
-                sd->nr_balance_failed = 0;
+		sd->nr_balance_failed = 0;
 
 		/* We were unbalanced, so reset the balancing interval */
 		sd->balance_interval = sd->min_interval;
@@ -2088,23 +2082,6 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
 	}
 }
 
-#ifdef CONFIG_SCHED_SMT
-static int cpu_and_siblings_are_idle(int cpu)
-{
-	int sib;
-	for_each_cpu_mask(sib, cpu_sibling_map[cpu]) {
-		if (idle_cpu(sib))
-			continue;
-		return 0;
-	}
-
-	return 1;
-}
-#else
-#define cpu_and_siblings_are_idle(A) idle_cpu(A)
-#endif
-
-
 /*
  * active_load_balance is run by migration threads. It pushes running tasks
  * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
@@ -2117,7 +2094,9 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
 {
 	struct sched_domain *sd;
 	struct sched_group *cpu_group;
+	runqueue_t *target_rq;
 	cpumask_t visited_cpus;
+	int cpu;
 
 	schedstat_inc(busiest_rq, alb_cnt);
 	/*
@@ -2126,46 +2105,43 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
 	 */
 	visited_cpus = CPU_MASK_NONE;
 	for_each_domain(busiest_cpu, sd) {
-		if (!(sd->flags & SD_LOAD_BALANCE) || busiest_rq->nr_running <= 1)
-			break; /* no more domains to search or no more tasks to move */
+		if (!(sd->flags & SD_LOAD_BALANCE))
+			/* no more domains to search */
+			break;
 
 		cpu_group = sd->groups;
-		do { /* sched_groups should either use list_heads or be merged into the domains structure */
-			int cpu, target_cpu = -1;
-			runqueue_t *target_rq;
-
+		do {
 			for_each_cpu_mask(cpu, cpu_group->cpumask) {
-				if (cpu_isset(cpu, visited_cpus) || cpu == busiest_cpu ||
-				    !cpu_and_siblings_are_idle(cpu)) {
-					cpu_set(cpu, visited_cpus);
+				if (busiest_rq->nr_running <= 1)
+					/* no more tasks left to move */
+					return;
+				if (cpu_isset(cpu, visited_cpus))
+					continue;
+				cpu_set(cpu, visited_cpus);
+				if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu)
 					continue;
-				}
-				target_cpu = cpu;
-				break;
-			}
-			if (target_cpu == -1)
-				goto next_group; /* failed to find a suitable target cpu in this domain */
-
-			target_rq = cpu_rq(target_cpu);
 
-			/*
-			 * This condition is "impossible", if it occurs we need to fix it
-			 * Reported by Bjorn Helgaas on a 128-cpu setup.
-			 */
-			BUG_ON(busiest_rq == target_rq);
-
-			/* move a task from busiest_rq to target_rq */
-			double_lock_balance(busiest_rq, target_rq);
-			if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE)) {
-				schedstat_inc(busiest_rq, alb_lost);
-				schedstat_inc(target_rq, alb_gained);
-			} else {
-				schedstat_inc(busiest_rq, alb_failed);
+				target_rq = cpu_rq(cpu);
+				/*
+				 * This condition is "impossible", if it occurs
+				 * we need to fix it.  Originally reported by
+				 * Bjorn Helgaas on a 128-cpu setup.
+				 */
+				BUG_ON(busiest_rq == target_rq);
+
+				/* move a task from busiest_rq to target_rq */
+				double_lock_balance(busiest_rq, target_rq);
+				if (move_tasks(target_rq, cpu, busiest_rq,
+						1, sd, SCHED_IDLE)) {
+					schedstat_inc(busiest_rq, alb_lost);
+					schedstat_inc(target_rq, alb_gained);
+				} else {
+					schedstat_inc(busiest_rq, alb_failed);
+				}
+				spin_unlock(&target_rq->lock);
 			}
-			spin_unlock(&target_rq->lock);
-next_group:
 			cpu_group = cpu_group->next;
-		} while (cpu_group != sd->groups && busiest_rq->nr_running > 1);
+		} while (cpu_group != sd->groups);
 	}
 }
 
@@ -2275,48 +2251,172 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 			((rq)->curr->static_prio > (rq)->best_expired_prio))
 
 /*
+ * Do the virtual cpu time signal calculations.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in user space since the last update
+ */
+static inline void account_it_virt(struct task_struct * p, cputime_t cputime)
+{
+	cputime_t it_virt = p->it_virt_value;
+
+	if (cputime_gt(it_virt, cputime_zero) &&
+	    cputime_gt(cputime, cputime_zero)) {
+		if (cputime_ge(cputime, it_virt)) {
+			it_virt = cputime_add(it_virt, p->it_virt_incr);
+			send_sig(SIGVTALRM, p, 1);
+		}
+		it_virt = cputime_sub(it_virt, cputime);
+		p->it_virt_value = it_virt;
+	}
+}
+
+/*
+ * Do the virtual profiling signal calculations.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in user and kernel space since the last update
+ */
+static void account_it_prof(struct task_struct *p, cputime_t cputime)
+{
+	cputime_t it_prof = p->it_prof_value;
+
+	if (cputime_gt(it_prof, cputime_zero) &&
+	    cputime_gt(cputime, cputime_zero)) {
+		if (cputime_ge(cputime, it_prof)) {
+			it_prof = cputime_add(it_prof, p->it_prof_incr);
+			send_sig(SIGPROF, p, 1);
+		}
+		it_prof = cputime_sub(it_prof, cputime);
+		p->it_prof_value = it_prof;
+	}
+}
+
+/*
+ * Check if the process went over its cputime resource limit after
+ * some cpu time got added to utime/stime.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in user and kernel space since the last update
+ */
+static void check_rlimit(struct task_struct *p, cputime_t cputime)
+{
+	cputime_t total, tmp;
+
+	total = cputime_add(p->utime, p->stime);
+	tmp = jiffies_to_cputime(p->signal->rlim[RLIMIT_CPU].rlim_cur);
+	if (unlikely(cputime_gt(total, tmp))) {
+		/* Send SIGXCPU every second. */
+		tmp = cputime_sub(total, cputime);
+		if (cputime_to_secs(tmp) < cputime_to_secs(total))
+			send_sig(SIGXCPU, p, 1);
+		/* and SIGKILL when we go over max.. */
+		tmp = jiffies_to_cputime(p->signal->rlim[RLIMIT_CPU].rlim_max);
+		if (cputime_gt(total, tmp))
+			send_sig(SIGKILL, p, 1);
+	}
+}
+
+/*
+ * Account user cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @hardirq_offset: the offset to subtract from hardirq_count()
+ * @cputime: the cpu time spent in user space since the last update
+ */
+void account_user_time(struct task_struct *p, cputime_t cputime)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	cputime64_t tmp;
+
+	p->utime = cputime_add(p->utime, cputime);
+
+	/* Check for signals (SIGVTALRM, SIGPROF, SIGXCPU & SIGKILL). */
+	check_rlimit(p, cputime);
+	account_it_virt(p, cputime);
+	account_it_prof(p, cputime);
+
+	/* Add user time to cpustat. */
+	tmp = cputime_to_cputime64(cputime);
+	if (TASK_NICE(p) > 0)
+		cpustat->nice = cputime64_add(cpustat->nice, tmp);
+	else
+		cpustat->user = cputime64_add(cpustat->user, tmp);
+}
+
+/*
+ * Account system cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @hardirq_offset: the offset to subtract from hardirq_count()
+ * @cputime: the cpu time spent in kernel space since the last update
+ */
+void account_system_time(struct task_struct *p, int hardirq_offset,
+			 cputime_t cputime)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	runqueue_t *rq = this_rq();
+	cputime64_t tmp;
+
+	p->stime = cputime_add(p->stime, cputime);
+
+	/* Check for signals (SIGPROF, SIGXCPU & SIGKILL). */
+	if (likely(p->signal && p->exit_state < EXIT_ZOMBIE)) {
+		check_rlimit(p, cputime);
+		account_it_prof(p, cputime);
+	}
+
+	/* Add system time to cpustat. */
+	tmp = cputime_to_cputime64(cputime);
+	if (hardirq_count() - hardirq_offset)
+		cpustat->irq = cputime64_add(cpustat->irq, tmp);
+	else if (softirq_count())
+		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+	else if (p != rq->idle)
+		cpustat->system = cputime64_add(cpustat->system, tmp);
+	else if (atomic_read(&rq->nr_iowait) > 0)
+		cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
+	else
+		cpustat->idle = cputime64_add(cpustat->idle, tmp);
+}
+
+/*
+ * Account for involuntary wait time.
+ * @p: the process from which the cpu time has been stolen
+ * @steal: the cpu time spent in involuntary wait
+ */
+void account_steal_time(struct task_struct *p, cputime_t steal)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	cputime64_t tmp = cputime_to_cputime64(steal);
+	runqueue_t *rq = this_rq();
+
+	if (p == rq->idle) {
+		p->stime = cputime_add(p->stime, steal);
+		if (atomic_read(&rq->nr_iowait) > 0)
+			cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
+		else
+			cpustat->idle = cputime64_add(cpustat->idle, tmp);
+	} else
+		cpustat->steal = cputime64_add(cpustat->steal, tmp);
+}
+
+/*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
  *
  * It also gets called by the fork code, when changing the parent's
  * timeslices.
  */
-void scheduler_tick(int user_ticks, int sys_ticks)
+void scheduler_tick(void)
 {
 	int cpu = smp_processor_id();
-	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 	runqueue_t *rq = this_rq();
 	task_t *p = current;
 
 	rq->timestamp_last_tick = sched_clock();
 
-	if (rcu_pending(cpu))
-		rcu_check_callbacks(cpu, user_ticks);
-
-	/* note: this timer irq context must be accounted for as well */
-	if (hardirq_count() - HARDIRQ_OFFSET) {
-		cpustat->irq += sys_ticks;
-		sys_ticks = 0;
-	} else if (softirq_count()) {
-		cpustat->softirq += sys_ticks;
-		sys_ticks = 0;
-	}
-
 	if (p == rq->idle) {
-		if (atomic_read(&rq->nr_iowait) > 0)
-			cpustat->iowait += sys_ticks;
-		else
-			cpustat->idle += sys_ticks;
 		if (wake_priority_sleeper(rq))
 			goto out;
 		rebalance_tick(cpu, rq, SCHED_IDLE);
 		return;
 	}
-	if (TASK_NICE(p) > 0)
-		cpustat->nice += user_ticks;
-	else
-		cpustat->user += user_ticks;
-	cpustat->system += sys_ticks;
 
 	/* Task might have expired already, but not scheduled off yet */
 	if (p->array != rq->active) {
@@ -2342,8 +2442,7 @@ void scheduler_tick(int user_ticks, int sys_ticks)
 			set_tsk_need_resched(p);
 
 			/* put it at the end of the queue: */
-			dequeue_task(p, rq->active);
-			enqueue_task(p, rq->active);
+			requeue_task(p, rq->active);
 		}
 		goto out_unlock;
 	}
@@ -2384,10 +2483,8 @@ void scheduler_tick(int user_ticks, int sys_ticks)
 			(p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
 			(p->array == rq->active)) {
 
-			dequeue_task(p, rq->active);
+			requeue_task(p, rq->active);
 			set_tsk_need_resched(p);
-			p->prio = effective_prio(p);
-			enqueue_task(p, rq->active);
 		}
 	}
 out_unlock:
@@ -2521,6 +2618,38 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
 }
 #endif
 
+#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+
+void fastcall add_preempt_count(int val)
+{
+	/*
+	 * Underflow?
+	 */
+	BUG_ON(((int)preempt_count() < 0));
+	preempt_count() += val;
+	/*
+	 * Spinlock count overflowing soon?
+	 */
+	BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
+}
+EXPORT_SYMBOL(add_preempt_count);
+
+void fastcall sub_preempt_count(int val)
+{
+	/*
+	 * Underflow?
+	 */
+	BUG_ON(val > preempt_count());
+	/*
+	 * Is the spinlock portion underflowing?
+	 */
+	BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK));
+	preempt_count() -= val;
+}
+EXPORT_SYMBOL(sub_preempt_count);
+
+#endif
+
 /*
  * schedule() is the main scheduler function.
  */
@@ -2540,7 +2669,7 @@ asmlinkage void __sched schedule(void)
 	 * schedule() atomically, we ignore that path for now.
 	 * Otherwise, whine if we are scheduling when we should not be.
 	 */
-	if (likely(!(current->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)))) {
+	if (likely(!current->exit_state)) {
 		if (unlikely(in_atomic())) {
 			printk(KERN_ERR "scheduling while atomic: "
 				"%s/0x%08x/%d\n",
@@ -2561,7 +2690,7 @@ need_resched_nonpreemptible:
 	 * The idle thread is not allowed to schedule!
 	 * Remove this check after it has been exercised a bit.
 	 */
-	if (unlikely(current == rq->idle) && current->state != TASK_RUNNING) {
+	if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
 		printk(KERN_ERR "bad: scheduling from the idle thread!\n");
 		dump_stack();
 	}
@@ -2574,21 +2703,16 @@ need_resched_nonpreemptible:
 		run_time = NS_MAX_SLEEP_AVG;
 
 	/*
-	 * Tasks with interactive credits get charged less run_time
-	 * at high sleep_avg to delay them losing their interactive
-	 * status
+	 * Tasks charged proportionately less run_time at high sleep_avg to
+	 * delay them losing their interactive status
 	 */
-	if (HIGH_CREDIT(prev))
-		run_time /= (CURRENT_BONUS(prev) ? : 1);
+	run_time /= (CURRENT_BONUS(prev) ? : 1);
 
 	spin_lock_irq(&rq->lock);
 
-	if (unlikely(current->flags & PF_DEAD))
-		current->state = EXIT_DEAD;
-	/*
-	 * if entering off of a kernel preemption go straight
-	 * to picking the next task.
-	 */
+	if (unlikely(prev->flags & PF_DEAD))
+		prev->state = EXIT_DEAD;
+
 	switch_count = &prev->nivcsw;
 	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
 		switch_count = &prev->nvcsw;
@@ -2670,11 +2794,8 @@ switch_tasks:
 	rcu_qsctr_inc(task_cpu(prev));
 
 	prev->sleep_avg -= run_time;
-	if ((long)prev->sleep_avg <= 0) {
+	if ((long)prev->sleep_avg <= 0)
 		prev->sleep_avg = 0;
-		if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
-			prev->interactive_credit--;
-	}
 	prev->timestamp = prev->last_ran = now;
 
 	sched_info_switch(prev, next);
@@ -2711,7 +2832,10 @@ EXPORT_SYMBOL(schedule);
 asmlinkage void __sched preempt_schedule(void)
 {
 	struct thread_info *ti = current_thread_info();
-
+#ifdef CONFIG_PREEMPT_BKL
+	struct task_struct *task = current;
+	int saved_lock_depth;
+#endif
 	/*
 	 * If there is a non-zero preempt_count or interrupts are disabled,
 	 * we do not want to preempt the current task.  Just return..
@@ -2720,9 +2844,21 @@ asmlinkage void __sched preempt_schedule(void)
 		return;
 
 need_resched:
-	ti->preempt_count = PREEMPT_ACTIVE;
+	add_preempt_count(PREEMPT_ACTIVE);
+	/*
+	 * We keep the big kernel semaphore locked, but we
+	 * clear ->lock_depth so that schedule() doesnt
+	 * auto-release the semaphore:
+	 */
+#ifdef CONFIG_PREEMPT_BKL
+	saved_lock_depth = task->lock_depth;
+	task->lock_depth = -1;
+#endif
 	schedule();
-	ti->preempt_count = 0;
+#ifdef CONFIG_PREEMPT_BKL
+	task->lock_depth = saved_lock_depth;
+#endif
+	sub_preempt_count(PREEMPT_ACTIVE);
 
 	/* we could miss a preemption opportunity between schedule and now */
 	barrier();
@@ -2955,7 +3091,7 @@ void set_user_nice(task_t *p, long nice)
 	 */
 	rq = task_rq_lock(p, &flags);
 	/*
-	 * The RT priorities are set via setscheduler(), but we still
+	 * The RT priorities are set via sched_setscheduler(), but we still
 	 * allow the 'normal' nice value to be set - but as expected
 	 * it wont have any effect on scheduling until the task is
 	 * not SCHED_NORMAL:
@@ -3055,6 +3191,15 @@ int task_nice(const task_t *p)
 	return TASK_NICE(p);
 }
 
+/*
+ * The only users of task_nice are binfmt_elf and binfmt_elf32.
+ * binfmt_elf is no longer modular, but binfmt_elf32 still is.
+ * Therefore, task_nice is needed if there is a compat_mode.
+ */
+#ifdef CONFIG_COMPAT
+EXPORT_SYMBOL_GPL(task_nice);
+#endif
+
 /**
  * idle_cpu - is a given cpu idle currently?
  * @cpu: the processor in question.
@@ -3096,67 +3241,48 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
 		p->prio = p->static_prio;
 }
 
-/*
- * setscheduler - change the scheduling policy and/or RT priority of a thread.
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of
+ * a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
  */
-static int setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param)
 {
-	struct sched_param lp;
-	int retval = -EINVAL;
+	int retval;
 	int oldprio, oldpolicy = -1;
 	prio_array_t *array;
 	unsigned long flags;
 	runqueue_t *rq;
-	task_t *p;
-
-	if (!param || pid < 0)
-		goto out_nounlock;
 
-	retval = -EFAULT;
-	if (copy_from_user(&lp, param, sizeof(struct sched_param)))
-		goto out_nounlock;
-
-	/*
-	 * We play safe to avoid deadlocks.
-	 */
-	read_lock_irq(&tasklist_lock);
-
-	p = find_process_by_pid(pid);
-
-	retval = -ESRCH;
-	if (!p)
-		goto out_unlock;
 recheck:
 	/* double check policy once rq lock held */
 	if (policy < 0)
 		policy = oldpolicy = p->policy;
-	else {
-		retval = -EINVAL;
-		if (policy != SCHED_FIFO && policy != SCHED_RR &&
+	else if (policy != SCHED_FIFO && policy != SCHED_RR &&
 				policy != SCHED_NORMAL)
-			goto out_unlock;
-	}
+			return -EINVAL;
 	/*
 	 * Valid priorities for SCHED_FIFO and SCHED_RR are
 	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
 	 */
-	retval = -EINVAL;
-	if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
-		goto out_unlock;
-	if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0))
-		goto out_unlock;
+	if (param->sched_priority < 0 ||
+	    param->sched_priority > MAX_USER_RT_PRIO-1)
+		return -EINVAL;
+	if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
+		return -EINVAL;
 
-	retval = -EPERM;
 	if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
 	    !capable(CAP_SYS_NICE))
-		goto out_unlock;
+		return -EPERM;
 	if ((current->euid != p->euid) && (current->euid != p->uid) &&
 	    !capable(CAP_SYS_NICE))
-		goto out_unlock;
+		return -EPERM;
 
-	retval = security_task_setscheduler(p, policy, &lp);
+	retval = security_task_setscheduler(p, policy, param);
 	if (retval)
-		goto out_unlock;
+		return retval;
 	/*
 	 * To be able to change p->policy safely, the apropriate
 	 * runqueue lock must be held.
@@ -3170,12 +3296,11 @@ recheck:
 	}
 	array = p->array;
 	if (array)
-		deactivate_task(p, task_rq(p));
-	retval = 0;
+		deactivate_task(p, rq);
 	oldprio = p->prio;
-	__setscheduler(p, policy, lp.sched_priority);
+	__setscheduler(p, policy, param->sched_priority);
 	if (array) {
-		__activate_task(p, task_rq(p));
+		__activate_task(p, rq);
 		/*
 		 * Reschedule if we are currently running on this runqueue and
 		 * our priority decreased, or if we are not currently running on
@@ -3188,22 +3313,41 @@ recheck:
 			resched_task(rq->curr);
 	}
 	task_rq_unlock(rq, &flags);
-out_unlock:
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sched_setscheduler);
+
+static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+{
+	int retval;
+	struct sched_param lparam;
+	struct task_struct *p;
+
+	if (!param || pid < 0)
+		return -EINVAL;
+	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
+		return -EFAULT;
+	read_lock_irq(&tasklist_lock);
+	p = find_process_by_pid(pid);
+	if (!p) {
+		read_unlock_irq(&tasklist_lock);
+		return -ESRCH;
+	}
+	retval = sched_setscheduler(p, policy, &lparam);
 	read_unlock_irq(&tasklist_lock);
-out_nounlock:
 	return retval;
 }
 
 /**
  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
  * @pid: the pid in question.
- * @policy: new policy
+ * @policy: new policy.
  * @param: structure containing the new RT priority.
  */
 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
 				       struct sched_param __user *param)
 {
-	return setscheduler(pid, policy, param);
+	return do_sched_setscheduler(pid, policy, param);
 }
 
 /**
@@ -3213,7 +3357,7 @@ asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
  */
 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
 {
-	return setscheduler(pid, -1, param);
+	return do_sched_setscheduler(pid, -1, param);
 }
 
 /**
@@ -3444,8 +3588,14 @@ asmlinkage long sys_sched_yield(void)
 	} else if (!rq->expired->nr_active)
 		schedstat_inc(rq, yld_exp_empty);
 
-	dequeue_task(current, array);
-	enqueue_task(current, target);
+	if (array != target) {
+		dequeue_task(current, array);
+		enqueue_task(current, target);
+	} else
+		/*
+		 * requeue_task is cheaper so perform that if possible.
+		 */
+		requeue_task(current, array);
 
 	/*
 	 * Since we are going to call schedule() anyway, there's
@@ -3460,13 +3610,71 @@ asmlinkage long sys_sched_yield(void)
 	return 0;
 }
 
-void __sched __cond_resched(void)
+static inline void __cond_resched(void)
 {
-	set_current_state(TASK_RUNNING);
-	schedule();
+	do {
+		add_preempt_count(PREEMPT_ACTIVE);
+		schedule();
+		sub_preempt_count(PREEMPT_ACTIVE);
+	} while (need_resched());
+}
+
+int __sched cond_resched(void)
+{
+	if (need_resched()) {
+		__cond_resched();
+		return 1;
+	}
+	return 0;
+}
+
+EXPORT_SYMBOL(cond_resched);
+
+/*
+ * cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
+ *
+ * This works OK both with and without CONFIG_PREEMPT.  We do strange low-level
+ * operations here to prevent schedule() from being called twice (once via
+ * spin_unlock(), once by hand).
+ */
+int cond_resched_lock(spinlock_t * lock)
+{
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
+	if (lock->break_lock) {
+		lock->break_lock = 0;
+		spin_unlock(lock);
+		cpu_relax();
+		spin_lock(lock);
+	}
+#endif
+	if (need_resched()) {
+		_raw_spin_unlock(lock);
+		preempt_enable_no_resched();
+		__cond_resched();
+		spin_lock(lock);
+		return 1;
+	}
+	return 0;
+}
+
+EXPORT_SYMBOL(cond_resched_lock);
+
+int __sched cond_resched_softirq(void)
+{
+	BUG_ON(!in_softirq());
+
+	if (need_resched()) {
+		__local_bh_enable();
+		__cond_resched();
+		local_bh_disable();
+		return 1;
+	}
+	return 0;
 }
 
-EXPORT_SYMBOL(__cond_resched);
+EXPORT_SYMBOL(cond_resched_softirq);
+
 
 /**
  * yield - yield the current processor to other threads.
@@ -3491,7 +3699,7 @@ EXPORT_SYMBOL(yield);
  */
 void __sched io_schedule(void)
 {
-	struct runqueue *rq = this_rq();
+	struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
 
 	atomic_inc(&rq->nr_iowait);
 	schedule();
@@ -3502,7 +3710,7 @@ EXPORT_SYMBOL(io_schedule);
 
 long __sched io_schedule_timeout(long timeout)
 {
-	struct runqueue *rq = this_rq();
+	struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
 	long ret;
 
 	atomic_inc(&rq->nr_iowait);
@@ -3699,7 +3907,6 @@ void __devinit init_idle(task_t *idle, int cpu)
 	unsigned long flags;
 
 	idle->sleep_avg = 0;
-	idle->interactive_credit = 0;
 	idle->array = NULL;
 	idle->prio = MAX_PRIO;
 	idle->state = TASK_RUNNING;
@@ -3711,7 +3918,7 @@ void __devinit init_idle(task_t *idle, int cpu)
 	spin_unlock_irqrestore(&rq->lock, flags);
 
 	/* Set the preempt count _outside_ the spinlocks! */
-#ifdef CONFIG_PREEMPT
+#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
 	idle->thread_info->preempt_count = (idle->lock_depth >= 0);
 #else
 	idle->thread_info->preempt_count = 0;
@@ -3879,8 +4086,7 @@ static int migration_thread(void * data)
 
 		if (req->type == REQ_MOVE_TASK) {
 			spin_unlock(&rq->lock);
-			__migrate_task(req->task, smp_processor_id(),
-					req->dest_cpu);
+			__migrate_task(req->task, cpu, req->dest_cpu);
 			local_irq_enable();
 		} else if (req->type == REQ_SET_DOMAIN) {
 			rq->sd = req->sd;
@@ -4004,6 +4210,20 @@ void sched_idle_next(void)
 	spin_unlock_irqrestore(&rq->lock, flags);
 }
 
+/* Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
+ */
+void idle_task_exit(void)
+{
+	struct mm_struct *mm = current->active_mm;
+
+	BUG_ON(cpu_online(smp_processor_id()));
+
+	if (mm != &init_mm)
+		switch_mm(mm, &init_mm, current);
+	mmdrop(mm);
+}
+
 static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
 {
 	struct runqueue *rq = cpu_rq(dead_cpu);
@@ -4136,6 +4356,94 @@ int __init migration_init(void)
 #endif
 
 #ifdef CONFIG_SMP
+#define SCHED_DOMAIN_DEBUG
+#ifdef SCHED_DOMAIN_DEBUG
+static void sched_domain_debug(struct sched_domain *sd, int cpu)
+{
+	int level = 0;
+
+	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
+
+	do {
+		int i;
+		char str[NR_CPUS];
+		struct sched_group *group = sd->groups;
+		cpumask_t groupmask;
+
+		cpumask_scnprintf(str, NR_CPUS, sd->span);
+		cpus_clear(groupmask);
+
+		printk(KERN_DEBUG);
+		for (i = 0; i < level + 1; i++)
+			printk(" ");
+		printk("domain %d: ", level);
+
+		if (!(sd->flags & SD_LOAD_BALANCE)) {
+			printk("does not load-balance\n");
+			if (sd->parent)
+				printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
+			break;
+		}
+
+		printk("span %s\n", str);
+
+		if (!cpu_isset(cpu, sd->span))
+			printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
+		if (!cpu_isset(cpu, group->cpumask))
+			printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
+
+		printk(KERN_DEBUG);
+		for (i = 0; i < level + 2; i++)
+			printk(" ");
+		printk("groups:");
+		do {
+			if (!group) {
+				printk("\n");
+				printk(KERN_ERR "ERROR: group is NULL\n");
+				break;
+			}
+
+			if (!group->cpu_power) {
+				printk("\n");
+				printk(KERN_ERR "ERROR: domain->cpu_power not set\n");
+			}
+
+			if (!cpus_weight(group->cpumask)) {
+				printk("\n");
+				printk(KERN_ERR "ERROR: empty group\n");
+			}
+
+			if (cpus_intersects(groupmask, group->cpumask)) {
+				printk("\n");
+				printk(KERN_ERR "ERROR: repeated CPUs\n");
+			}
+
+			cpus_or(groupmask, groupmask, group->cpumask);
+
+			cpumask_scnprintf(str, NR_CPUS, group->cpumask);
+			printk(" %s", str);
+
+			group = group->next;
+		} while (group != sd->groups);
+		printk("\n");
+
+		if (!cpus_equal(sd->span, groupmask))
+			printk(KERN_ERR "ERROR: groups don't span domain->span\n");
+
+		level++;
+		sd = sd->parent;
+
+		if (sd) {
+			if (!cpus_subset(groupmask, sd->span))
+				printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
+		}
+
+	} while (sd);
+}
+#else
+#define sched_domain_debug(sd, cpu) {}
+#endif
+
 /*
  * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
  * hold the hotplug lock.
@@ -4147,6 +4455,8 @@ void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
 	runqueue_t *rq = cpu_rq(cpu);
 	int local = 1;
 
+	sched_domain_debug(sd, cpu);
+
 	spin_lock_irqsave(&rq->lock, flags);
 
 	if (cpu == smp_processor_id() || !cpu_online(cpu)) {
@@ -4178,7 +4488,8 @@ static int __init isolated_cpu_setup(char *str)
 	str = get_options(str, ARRAY_SIZE(ints), ints);
 	cpus_clear(cpu_isolated_map);
 	for (i = 1; i <= ints[0]; i++)
-		cpu_set(ints[i], cpu_isolated_map);
+		if (ints[i] < NR_CPUS)
+			cpu_set(ints[i], cpu_isolated_map);
 	return 1;
 }
 
@@ -4421,96 +4732,6 @@ static void __devinit arch_destroy_sched_domains(void)
 
 #endif /* ARCH_HAS_SCHED_DOMAIN */
 
-#define SCHED_DOMAIN_DEBUG
-#ifdef SCHED_DOMAIN_DEBUG
-static void sched_domain_debug(void)
-{
-	int i;
-
-	for_each_online_cpu(i) {
-		runqueue_t *rq = cpu_rq(i);
-		struct sched_domain *sd;
-		int level = 0;
-
-		sd = rq->sd;
-
-		printk(KERN_DEBUG "CPU%d:\n", i);
-
-		do {
-			int j;
-			char str[NR_CPUS];
-			struct sched_group *group = sd->groups;
-			cpumask_t groupmask;
-
-			cpumask_scnprintf(str, NR_CPUS, sd->span);
-			cpus_clear(groupmask);
-
-			printk(KERN_DEBUG);
-			for (j = 0; j < level + 1; j++)
-				printk(" ");
-			printk("domain %d: ", level);
-
-			if (!(sd->flags & SD_LOAD_BALANCE)) {
-				printk("does not load-balance");
-				if (sd->parent)
-					printk(" ERROR !SD_LOAD_BALANCE domain has parent");
-				printk("\n");
-				break;
-			}
-
-			printk("span %s\n", str);
-
-			if (!cpu_isset(i, sd->span))
-				printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i);
-			if (!cpu_isset(i, group->cpumask))
-				printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i);
-
-			printk(KERN_DEBUG);
-			for (j = 0; j < level + 2; j++)
-				printk(" ");
-			printk("groups:");
-			do {
-				if (!group) {
-					printk(" ERROR: NULL");
-					break;
-				}
-				
-				if (!group->cpu_power)
-					printk(KERN_DEBUG "ERROR group->cpu_power not set\n");
-
-				if (!cpus_weight(group->cpumask))
-					printk(" ERROR empty group:");
-
-				if (cpus_intersects(groupmask, group->cpumask))
-					printk(" ERROR repeated CPUs:");
-
-				cpus_or(groupmask, groupmask, group->cpumask);
-
-				cpumask_scnprintf(str, NR_CPUS, group->cpumask);
-				printk(" %s", str);
-
-				group = group->next;
-			} while (group != sd->groups);
-			printk("\n");
-
-			if (!cpus_equal(sd->span, groupmask))
-				printk(KERN_DEBUG "ERROR groups don't span domain->span\n");
-
-			level++;
-			sd = sd->parent;
-
-			if (sd) {
-				if (!cpus_subset(groupmask, sd->span))
-					printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n");
-			}
-
-		} while (sd);
-	}
-}
-#else
-#define sched_domain_debug() {}
-#endif
-
 /*
  * Initial dummy domain for early boot and for hotplug cpu. Being static,
  * it is initialized to zero, so all balancing flags are cleared which is
@@ -4553,8 +4774,6 @@ static int update_sched_domains(struct notifier_block *nfb,
 	/* The hotplug lock is already held by cpu_up/cpu_down */
 	arch_init_sched_domains();
 
-	sched_domain_debug();
-
 	return NOTIFY_OK;
 }
 #endif
@@ -4563,7 +4782,6 @@ void __init sched_init_smp(void)
 {
 	lock_cpu_hotplug();
 	arch_init_sched_domains();
-	sched_domain_debug();
 	unlock_cpu_hotplug();
 	/* XXX: Theoretical race here - CPU may be hotplugged now */
 	hotcpu_notifier(update_sched_domains, 0);
@@ -4640,7 +4858,7 @@ void __might_sleep(char *file, int line)
 	static unsigned long prev_jiffy;	/* ratelimiting */
 
 	if ((in_atomic() || irqs_disabled()) &&
-	    system_state == SYSTEM_RUNNING) {
+	    system_state == SYSTEM_RUNNING && !oops_in_progress) {
 		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
 			return;
 		prev_jiffy = jiffies;
diff --git a/kernel/signal.c b/kernel/signal.c
index cc50f2504365..d98e9624ea30 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -153,11 +153,6 @@ static kmem_cache_t *sigqueue_cachep;
 	(!T(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
 	 (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)
 
-#define sig_avoid_stop_race() \
-	(sigtestsetmask(&current->pending.signal, M(SIGCONT) | M(SIGKILL)) || \
-	 sigtestsetmask(&current->signal->shared_pending.signal, \
-						  M(SIGCONT) | M(SIGKILL)))
-
 static int sig_ignored(struct task_struct *t, int sig)
 {
 	void __user * handler;
@@ -380,8 +375,8 @@ void __exit_signal(struct task_struct *tsk)
 		 * We won't ever get here for the group leader, since it
 		 * will have been the last reference on the signal_struct.
 		 */
-		sig->utime += tsk->utime;
-		sig->stime += tsk->stime;
+		sig->utime = cputime_add(sig->utime, tsk->utime);
+		sig->stime = cputime_add(sig->stime, tsk->stime);
 		sig->min_flt += tsk->min_flt;
 		sig->maj_flt += tsk->maj_flt;
 		sig->nvcsw += tsk->nvcsw;
@@ -551,6 +546,21 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 	if (!signr)
 		signr = __dequeue_signal(&tsk->signal->shared_pending,
 					 mask, info);
+ 	if (signr && unlikely(sig_kernel_stop(signr))) {
+ 		/*
+ 		 * Set a marker that we have dequeued a stop signal.  Our
+ 		 * caller might release the siglock and then the pending
+ 		 * stop signal it is about to process is no longer in the
+ 		 * pending bitmasks, but must still be cleared by a SIGCONT
+ 		 * (and overruled by a SIGKILL).  So those cases clear this
+ 		 * shared flag after we've set it.  Note that this flag may
+ 		 * remain set after the signal we return is ignored or
+ 		 * handled.  That doesn't matter because its only purpose
+ 		 * is to alert stop-signal processing code when another
+ 		 * processor has come along and cleared the flag.
+ 		 */
+ 		tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
+ 	}
 	if ( signr &&
 	     ((info->si_code & __SI_MASK) == __SI_TIMER) &&
 	     info->si_sys_private){
@@ -577,15 +587,15 @@ void signal_wake_up(struct task_struct *t, int resume)
 	set_tsk_thread_flag(t, TIF_SIGPENDING);
 
 	/*
-	 * If resume is set, we want to wake it up in the TASK_STOPPED case.
-	 * We don't check for TASK_STOPPED because there is a race with it
+	 * For SIGKILL, we want to wake it up in the stopped/traced case.
+	 * We don't check t->state here because there is a race with it
 	 * executing another processor and just now entering stopped state.
-	 * By calling wake_up_process any time resume is set, we ensure
-	 * the process will wake up and handle its stop or death signal.
+	 * By using wake_up_state, we ensure the process will wake up and
+	 * handle its death signal.
 	 */
 	mask = TASK_INTERRUPTIBLE;
 	if (resume)
-		mask |= TASK_STOPPED;
+		mask |= TASK_STOPPED | TASK_TRACED;
 	if (!wake_up_state(t, mask))
 		kick_process(t);
 }
@@ -651,6 +661,12 @@ static void handle_stop_signal(int sig, struct task_struct *p)
 {
 	struct task_struct *t;
 
+	if (p->flags & SIGNAL_GROUP_EXIT)
+		/*
+		 * The process is in the middle of dying already.
+		 */
+		return;
+
 	if (sig_kernel_stop(sig)) {
 		/*
 		 * This is a stop signal.  Remove SIGCONT from all queues.
@@ -680,7 +696,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
 			 * the SIGCHLD was pending on entry to this kill.
 			 */
 			p->signal->group_stop_count = 0;
-			p->signal->stop_state = 1;
+			p->signal->flags = SIGNAL_STOP_CONTINUED;
 			spin_unlock(&p->sighand->siglock);
 			if (p->ptrace & PT_PTRACED)
 				do_notify_parent_cldstop(p, p->parent,
@@ -722,12 +738,12 @@ static void handle_stop_signal(int sig, struct task_struct *p)
 			t = next_thread(t);
 		} while (t != p);
 
-		if (p->signal->stop_state > 0) {
+		if (p->signal->flags & SIGNAL_STOP_STOPPED) {
 			/*
 			 * We were in fact stopped, and are now continued.
 			 * Notify the parent with CLD_CONTINUED.
 			 */
-			p->signal->stop_state = -1;
+			p->signal->flags = SIGNAL_STOP_CONTINUED;
 			p->signal->group_exit_code = 0;
 			spin_unlock(&p->sighand->siglock);
 			if (p->ptrace & PT_PTRACED)
@@ -739,7 +755,20 @@ static void handle_stop_signal(int sig, struct task_struct *p)
 					p->group_leader->real_parent,
 							 CLD_CONTINUED);
 			spin_lock(&p->sighand->siglock);
+		} else {
+			/*
+			 * We are not stopped, but there could be a stop
+			 * signal in the middle of being processed after
+			 * being removed from the queue.  Clear that too.
+			 */
+			p->signal->flags = 0;
 		}
+	} else if (sig == SIGKILL) {
+		/*
+		 * Make sure that any pending stop signal already dequeued
+		 * is undone by the wakeup for SIGKILL.
+		 */
+		p->signal->flags = 0;
 	}
 }
 
@@ -905,12 +934,12 @@ __group_complete_signal(int sig, struct task_struct *p)
 	struct task_struct *t;
 
 	/*
-	 * Don't bother zombies and stopped tasks (but
-	 * SIGKILL will punch through stopped state)
+	 * Don't bother traced and stopped tasks (but
+	 * SIGKILL will punch through that).
 	 */
-	mask = EXIT_DEAD | EXIT_ZOMBIE | TASK_TRACED;
-	if (sig != SIGKILL)
-		mask |= TASK_STOPPED;
+	mask = TASK_STOPPED | TASK_TRACED;
+	if (sig == SIGKILL)
+		mask = 0;
 
 	/*
 	 * Now find a thread we can wake up to take the signal off the queue.
@@ -953,7 +982,7 @@ __group_complete_signal(int sig, struct task_struct *p)
 	 * Found a killable thread.  If the signal will be fatal,
 	 * then start taking the whole group down immediately.
 	 */
-	if (sig_fatal(p, sig) && !p->signal->group_exit &&
+	if (sig_fatal(p, sig) && !(p->signal->flags & SIGNAL_GROUP_EXIT) &&
 	    !sigismember(&t->real_blocked, sig) &&
 	    (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) {
 		/*
@@ -966,7 +995,7 @@ __group_complete_signal(int sig, struct task_struct *p)
 			 * running and doing things after a slower
 			 * thread has the fatal signal pending.
 			 */
-			p->signal->group_exit = 1;
+			p->signal->flags = SIGNAL_GROUP_EXIT;
 			p->signal->group_exit_code = sig;
 			p->signal->group_stop_count = 0;
 			t = p;
@@ -1055,6 +1084,7 @@ void zap_other_threads(struct task_struct *p)
 {
 	struct task_struct *t;
 
+	p->signal->flags = SIGNAL_GROUP_EXIT;
 	p->signal->group_stop_count = 0;
 
 	if (thread_group_empty(p))
@@ -1064,7 +1094,7 @@ void zap_other_threads(struct task_struct *p)
 		/*
 		 * Don't bother with already dead threads
 		 */
-		if (t->exit_state & (EXIT_ZOMBIE|EXIT_DEAD))
+		if (t->exit_state)
 			continue;
 
 		/*
@@ -1407,28 +1437,12 @@ out:
 }
 
 /*
- * Joy. Or not. Pthread wants us to wake up every thread
- * in our parent group.
+ * Wake up any threads in the parent blocked in wait* syscalls.
  */
-static void __wake_up_parent(struct task_struct *p,
+static inline void __wake_up_parent(struct task_struct *p,
 				    struct task_struct *parent)
 {
-	struct task_struct *tsk = parent;
-
-	/*
-	 * Fortunately this is not necessary for thread groups:
-	 */
-	if (p->tgid == tsk->tgid) {
-		wake_up_interruptible_sync(&tsk->wait_chldexit);
-		return;
-	}
-
-	do {
-		wake_up_interruptible_sync(&tsk->wait_chldexit);
-		tsk = next_thread(tsk);
-		if (tsk->signal != parent->signal)
-			BUG();
-	} while (tsk != parent);
+	wake_up_interruptible_sync(&parent->signal->wait_chldexit);
 }
 
 /*
@@ -1442,8 +1456,7 @@ void do_notify_parent(struct task_struct *tsk, int sig)
 	unsigned long flags;
 	struct sighand_struct *psig;
 
-	if (sig == -1)
-		BUG();
+	BUG_ON(sig == -1);
 
  	/* do_notify_parent_cldstop should have been called instead.  */
  	BUG_ON(tsk->state & (TASK_STOPPED|TASK_TRACED));
@@ -1457,8 +1470,10 @@ void do_notify_parent(struct task_struct *tsk, int sig)
 	info.si_uid = tsk->uid;
 
 	/* FIXME: find out whether or not this is supposed to be c*time. */
-	info.si_utime = tsk->utime + tsk->signal->utime;
-	info.si_stime = tsk->stime + tsk->signal->stime;
+	info.si_utime = cputime_to_jiffies(cputime_add(tsk->utime,
+						       tsk->signal->utime));
+	info.si_stime = cputime_to_jiffies(cputime_add(tsk->stime,
+						       tsk->signal->stime));
 
 	info.si_status = tsk->exit_code & 0x7f;
 	if (tsk->exit_code & 0x80)
@@ -1514,8 +1529,8 @@ do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent,
 	info.si_uid = tsk->uid;
 
 	/* FIXME: find out whether or not this is supposed to be c*time. */
-	info.si_utime = tsk->utime;
-	info.si_stime = tsk->stime;
+	info.si_utime = cputime_to_jiffies(tsk->utime);
+	info.si_stime = cputime_to_jiffies(tsk->stime);
 
  	info.si_code = why;
  	switch (why) {
@@ -1551,11 +1566,12 @@ do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent,
  * We always set current->last_siginfo while stopped here.
  * That makes it a way to test a stopped process for
  * being ptrace-stopped vs being job-control-stopped.
+ *
+ * If we actually decide not to stop at all because the tracer is gone,
+ * we leave nostop_code in current->exit_code.
  */
-static void ptrace_stop(int exit_code, siginfo_t *info)
+static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
 {
-	BUG_ON(!(current->ptrace & PT_PTRACED));
-
 	/*
 	 * If there is a group stop in progress,
 	 * we must participate in the bookkeeping.
@@ -1570,9 +1586,24 @@ static void ptrace_stop(int exit_code, siginfo_t *info)
 	set_current_state(TASK_TRACED);
 	spin_unlock_irq(&current->sighand->siglock);
 	read_lock(&tasklist_lock);
-	do_notify_parent_cldstop(current, current->parent, CLD_TRAPPED);
-	read_unlock(&tasklist_lock);
-	schedule();
+	if (likely(current->ptrace & PT_PTRACED) &&
+	    likely(current->parent != current->real_parent ||
+		   !(current->ptrace & PT_ATTACHED)) &&
+	    (likely(current->parent->signal != current->signal) ||
+	     !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
+		do_notify_parent_cldstop(current, current->parent,
+					 CLD_TRAPPED);
+		read_unlock(&tasklist_lock);
+		schedule();
+	} else {
+		/*
+		 * By the time we got the lock, our tracer went away.
+		 * Don't stop here.
+		 */
+		read_unlock(&tasklist_lock);
+		set_current_state(TASK_RUNNING);
+		current->exit_code = nostop_code;
+	}
 
 	/*
 	 * We are back.  Now reacquire the siglock before touching
@@ -1603,7 +1634,7 @@ void ptrace_notify(int exit_code)
 
 	/* Let the debugger run.  */
 	spin_lock_irq(&current->sighand->siglock);
-	ptrace_stop(exit_code, &info);
+	ptrace_stop(exit_code, 0, &info);
 	spin_unlock_irq(&current->sighand->siglock);
 }
 
@@ -1641,15 +1672,18 @@ finish_stop(int stop_count)
 /*
  * This performs the stopping for SIGSTOP and other stop signals.
  * We have to stop all threads in the thread group.
+ * Returns nonzero if we've actually stopped and released the siglock.
+ * Returns zero if we didn't stop and still hold the siglock.
  */
-static void
+static int
 do_signal_stop(int signr)
 {
 	struct signal_struct *sig = current->signal;
 	struct sighand_struct *sighand = current->sighand;
 	int stop_count = -1;
 
-	/* spin_lock_irq(&sighand->siglock) is now done in caller */
+	if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED))
+		return 0;
 
 	if (sig->group_stop_count > 0) {
 		/*
@@ -1661,7 +1695,7 @@ do_signal_stop(int signr)
 		current->exit_code = signr;
 		set_current_state(TASK_STOPPED);
 		if (stop_count == 0)
-			sig->stop_state = 1;
+			sig->flags = SIGNAL_STOP_STOPPED;
 		spin_unlock_irq(&sighand->siglock);
 	}
 	else if (thread_group_empty(current)) {
@@ -1670,7 +1704,7 @@ do_signal_stop(int signr)
 		 */
 		current->exit_code = current->signal->group_exit_code = signr;
 		set_current_state(TASK_STOPPED);
-		sig->stop_state = 1;
+		sig->flags = SIGNAL_STOP_STOPPED;
 		spin_unlock_irq(&sighand->siglock);
 	}
 	else {
@@ -1691,25 +1725,16 @@ do_signal_stop(int signr)
 		read_lock(&tasklist_lock);
 		spin_lock_irq(&sighand->siglock);
 
-		if (unlikely(sig->group_exit)) {
+		if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) {
 			/*
-			 * There is a group exit in progress now.
-			 * We'll just ignore the stop and process the
-			 * associated fatal signal.
+			 * Another stop or continue happened while we
+			 * didn't have the lock.  We can just swallow this
+			 * signal now.  If we raced with a SIGCONT, that
+			 * should have just cleared it now.  If we raced
+			 * with another processor delivering a stop signal,
+			 * then the SIGCONT that wakes us up should clear it.
 			 */
-			spin_unlock_irq(&sighand->siglock);
-			read_unlock(&tasklist_lock);
-			return;
-		}
-
-		if (unlikely(sig_avoid_stop_race())) {
-			/*
-			 * Either a SIGCONT or a SIGKILL signal was
-			 * posted in the siglock-not-held window.
-			 */
-			spin_unlock_irq(&sighand->siglock);
-			read_unlock(&tasklist_lock);
-			return;
+			return 0;
 		}
 
 		if (sig->group_stop_count == 0) {
@@ -1737,13 +1762,14 @@ do_signal_stop(int signr)
 		current->exit_code = signr;
 		set_current_state(TASK_STOPPED);
 		if (stop_count == 0)
-			sig->stop_state = 1;
+			sig->flags = SIGNAL_STOP_STOPPED;
 
 		spin_unlock_irq(&sighand->siglock);
 		read_unlock(&tasklist_lock);
 	}
 
 	finish_stop(stop_count);
+	return 1;
 }
 
 /*
@@ -1765,7 +1791,7 @@ static inline int handle_group_stop(void)
 		return 0;
 	}
 
-	if (current->signal->group_exit)
+	if (current->signal->flags & SIGNAL_GROUP_EXIT)
 		/*
 		 * Group stop is so another thread can do a core dump,
 		 * or else we are racing against a death signal.
@@ -1779,7 +1805,7 @@ static inline int handle_group_stop(void)
 	 */
 	stop_count = --current->signal->group_stop_count;
 	if (stop_count == 0)
-		current->signal->stop_state = 1;
+		current->signal->flags = SIGNAL_STOP_STOPPED;
 	current->exit_code = current->signal->group_exit_code;
 	set_current_state(TASK_STOPPED);
 	spin_unlock_irq(&current->sighand->siglock);
@@ -1811,7 +1837,7 @@ relock:
 			ptrace_signal_deliver(regs, cookie);
 
 			/* Let the debugger run.  */
-			ptrace_stop(signr, info);
+			ptrace_stop(signr, signr, info);
 
 			/* We're back.  Did the debugger cancel the sig?  */
 			signr = current->exit_code;
@@ -1873,28 +1899,27 @@ relock:
 			 * This allows an intervening SIGCONT to be posted.
 			 * We need to check for that and bail out if necessary.
 			 */
-			if (signr == SIGSTOP) {
-				do_signal_stop(signr); /* releases siglock */
-				goto relock;
-			}
-			spin_unlock_irq(&current->sighand->siglock);
+			if (signr != SIGSTOP) {
+				spin_unlock_irq(&current->sighand->siglock);
 
-			/* signals can be posted during this window */
+				/* signals can be posted during this window */
 
-			if (is_orphaned_pgrp(process_group(current)))
-				goto relock;
+				if (is_orphaned_pgrp(process_group(current)))
+					goto relock;
 
-			spin_lock_irq(&current->sighand->siglock);
-			if (unlikely(sig_avoid_stop_race())) {
-				/*
-				 * Either a SIGCONT or a SIGKILL signal was
-				 * posted in the siglock-not-held window.
-				 */
-				continue;
+				spin_lock_irq(&current->sighand->siglock);
 			}
 
-			do_signal_stop(signr); /* releases siglock */
-			goto relock;
+			if (likely(do_signal_stop(signr))) {
+				/* It released the siglock.  */
+				goto relock;
+			}
+
+			/*
+			 * We didn't actually stop, due to a race
+			 * with SIGCONT or something like that.
+			 */
+			continue;
 		}
 
 		spin_unlock_irq(&current->sighand->siglock);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7572ca9ece74..582a1e8091bc 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -142,7 +142,7 @@ void local_bh_enable(void)
 	 * Keep preemption disabled until we are done with
 	 * softirq processing:
  	 */
-	preempt_count() -= SOFTIRQ_OFFSET - 1;
+ 	sub_preempt_count(SOFTIRQ_OFFSET - 1);
 
 	if (unlikely(!in_interrupt() && local_softirq_pending()))
 		do_softirq();
@@ -152,6 +152,24 @@ void local_bh_enable(void)
 }
 EXPORT_SYMBOL(local_bh_enable);
 
+#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
+# define invoke_softirq()	__do_softirq()
+#else
+# define invoke_softirq()	do_softirq()
+#endif
+
+/*
+ * Exit an interrupt context. Process softirqs if needed and possible:
+ */
+void irq_exit(void)
+{
+	account_system_vtime(current);
+	sub_preempt_count(IRQ_EXIT_OFFSET);
+	if (!in_interrupt() && local_softirq_pending())
+		invoke_softirq();
+	preempt_enable_no_resched();
+}
+
 /*
  * This function must run with irqs disabled!
  */
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 476da1fd86f4..beacf8b7cee7 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -2,6 +2,8 @@
  * Copyright (2004) Linus Torvalds
  *
  * Author: Zwane Mwaikambo <zwane@fsmlabs.com>
+ *
+ * Copyright (2004) Ingo Molnar
  */
 
 #include <linux/config.h>
@@ -11,6 +13,17 @@
 #include <linux/interrupt.h>
 #include <linux/module.h>
 
+/*
+ * Generic declaration of the raw read_trylock() function,
+ * architectures are supposed to optimize this:
+ */
+int __lockfunc generic_raw_read_trylock(rwlock_t *lock)
+{
+	_raw_read_lock(lock);
+	return 1;
+}
+EXPORT_SYMBOL(generic_raw_read_trylock);
+
 int __lockfunc _spin_trylock(spinlock_t *lock)
 {
 	preempt_disable();
@@ -22,86 +35,29 @@ int __lockfunc _spin_trylock(spinlock_t *lock)
 }
 EXPORT_SYMBOL(_spin_trylock);
 
-int __lockfunc _write_trylock(rwlock_t *lock)
+int __lockfunc _read_trylock(rwlock_t *lock)
 {
 	preempt_disable();
-	if (_raw_write_trylock(lock))
+	if (_raw_read_trylock(lock))
 		return 1;
 
 	preempt_enable();
 	return 0;
 }
-EXPORT_SYMBOL(_write_trylock);
-
-#ifdef CONFIG_PREEMPT
-/*
- * This could be a long-held lock.  If another CPU holds it for a long time,
- * and that CPU is not asked to reschedule then *this* CPU will spin on the
- * lock for a long time, even if *this* CPU is asked to reschedule.
- *
- * So what we do here, in the slow (contended) path is to spin on the lock by
- * hand while permitting preemption.
- *
- * Called inside preempt_disable().
- */
-static inline void __preempt_spin_lock(spinlock_t *lock)
-{
-	if (preempt_count() > 1) {
-		_raw_spin_lock(lock);
-		return;
-	}
-
-	do {
-		preempt_enable();
-		while (spin_is_locked(lock))
-			cpu_relax();
-		preempt_disable();
-	} while (!_raw_spin_trylock(lock));
-}
+EXPORT_SYMBOL(_read_trylock);
 
-void __lockfunc _spin_lock(spinlock_t *lock)
+int __lockfunc _write_trylock(rwlock_t *lock)
 {
 	preempt_disable();
-	if (unlikely(!_raw_spin_trylock(lock)))
-		__preempt_spin_lock(lock);
-}
-
-static inline void __preempt_write_lock(rwlock_t *lock)
-{
-	if (preempt_count() > 1) {
-		_raw_write_lock(lock);
-		return;
-	}
-
-	do {
-		preempt_enable();
-		while (rwlock_is_locked(lock))
-			cpu_relax();
-		preempt_disable();
-	} while (!_raw_write_trylock(lock));
-}
+	if (_raw_write_trylock(lock))
+		return 1;
 
-void __lockfunc _write_lock(rwlock_t *lock)
-{
-	preempt_disable();
-	if (unlikely(!_raw_write_trylock(lock)))
-		__preempt_write_lock(lock);
-}
-#else
-void __lockfunc _spin_lock(spinlock_t *lock)
-{
-	preempt_disable();
-	_raw_spin_lock(lock);
+	preempt_enable();
+	return 0;
 }
+EXPORT_SYMBOL(_write_trylock);
 
-void __lockfunc _write_lock(rwlock_t *lock)
-{
-	preempt_disable();
-	_raw_write_lock(lock);
-}
-#endif
-EXPORT_SYMBOL(_spin_lock);
-EXPORT_SYMBOL(_write_lock);
+#ifndef CONFIG_PREEMPT
 
 void __lockfunc _read_lock(rwlock_t *lock)
 {
@@ -110,27 +66,6 @@ void __lockfunc _read_lock(rwlock_t *lock)
 }
 EXPORT_SYMBOL(_read_lock);
 
-void __lockfunc _spin_unlock(spinlock_t *lock)
-{
-	_raw_spin_unlock(lock);
-	preempt_enable();
-}
-EXPORT_SYMBOL(_spin_unlock);
-
-void __lockfunc _write_unlock(rwlock_t *lock)
-{
-	_raw_write_unlock(lock);
-	preempt_enable();
-}
-EXPORT_SYMBOL(_write_unlock);
-
-void __lockfunc _read_unlock(rwlock_t *lock)
-{
-	_raw_read_unlock(lock);
-	preempt_enable();
-}
-EXPORT_SYMBOL(_read_unlock);
-
 unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
 {
 	unsigned long flags;
@@ -212,6 +147,132 @@ void __lockfunc _write_lock_bh(rwlock_t *lock)
 }
 EXPORT_SYMBOL(_write_lock_bh);
 
+void __lockfunc _spin_lock(spinlock_t *lock)
+{
+	preempt_disable();
+	_raw_spin_lock(lock);
+}
+
+EXPORT_SYMBOL(_spin_lock);
+
+void __lockfunc _write_lock(rwlock_t *lock)
+{
+	preempt_disable();
+	_raw_write_lock(lock);
+}
+
+EXPORT_SYMBOL(_write_lock);
+
+#else /* CONFIG_PREEMPT: */
+
+/*
+ * This could be a long-held lock. We both prepare to spin for a long
+ * time (making _this_ CPU preemptable if possible), and we also signal
+ * towards that other CPU that it should break the lock ASAP.
+ *
+ * (We do this in a function because inlining it would be excessive.)
+ */
+
+#define BUILD_LOCK_OPS(op, locktype, is_locked_fn)			\
+void __lockfunc _##op##_lock(locktype *lock)				\
+{									\
+	preempt_disable();						\
+	for (;;) {							\
+		if (likely(_raw_##op##_trylock(lock)))			\
+			break;						\
+		preempt_enable();					\
+		if (!(lock)->break_lock)				\
+			(lock)->break_lock = 1;				\
+		while (is_locked_fn(lock) && (lock)->break_lock)	\
+			cpu_relax();					\
+		preempt_disable();					\
+	}								\
+}									\
+									\
+EXPORT_SYMBOL(_##op##_lock);						\
+									\
+unsigned long __lockfunc _##op##_lock_irqsave(locktype *lock)		\
+{									\
+	unsigned long flags;						\
+									\
+	preempt_disable();						\
+	for (;;) {							\
+		local_irq_save(flags);					\
+		if (likely(_raw_##op##_trylock(lock)))			\
+			break;						\
+		local_irq_restore(flags);				\
+									\
+		preempt_enable();					\
+		if (!(lock)->break_lock)				\
+			(lock)->break_lock = 1;				\
+		while (is_locked_fn(lock) && (lock)->break_lock)	\
+			cpu_relax();					\
+		preempt_disable();					\
+	}								\
+	return flags;							\
+}									\
+									\
+EXPORT_SYMBOL(_##op##_lock_irqsave);					\
+									\
+void __lockfunc _##op##_lock_irq(locktype *lock)			\
+{									\
+	_##op##_lock_irqsave(lock);					\
+}									\
+									\
+EXPORT_SYMBOL(_##op##_lock_irq);					\
+									\
+void __lockfunc _##op##_lock_bh(locktype *lock)				\
+{									\
+	unsigned long flags;						\
+									\
+	/*							*/	\
+	/* Careful: we must exclude softirqs too, hence the	*/	\
+	/* irq-disabling. We use the generic preemption-aware	*/	\
+	/* function:						*/	\
+	/**/								\
+	flags = _##op##_lock_irqsave(lock);				\
+	local_bh_disable();						\
+	local_irq_restore(flags);					\
+}									\
+									\
+EXPORT_SYMBOL(_##op##_lock_bh)
+
+/*
+ * Build preemption-friendly versions of the following
+ * lock-spinning functions:
+ *
+ *         _[spin|read|write]_lock()
+ *         _[spin|read|write]_lock_irq()
+ *         _[spin|read|write]_lock_irqsave()
+ *         _[spin|read|write]_lock_bh()
+ */
+BUILD_LOCK_OPS(spin, spinlock_t, spin_is_locked);
+BUILD_LOCK_OPS(read, rwlock_t, rwlock_is_locked);
+BUILD_LOCK_OPS(write, rwlock_t, spin_is_locked);
+
+#endif /* CONFIG_PREEMPT */
+
+void __lockfunc _spin_unlock(spinlock_t *lock)
+{
+	_raw_spin_unlock(lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(_spin_unlock);
+
+void __lockfunc _write_unlock(rwlock_t *lock)
+{
+	_raw_write_unlock(lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(_write_unlock);
+
+void __lockfunc _read_unlock(rwlock_t *lock)
+{
+	_raw_read_unlock(lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL(_read_unlock);
+
 void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
 {
 	_raw_spin_unlock(lock);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2ceea25f67f6..e31b1cb8e503 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -95,7 +95,7 @@ static int stop_machine(void)
 	stopmachine_state = STOPMACHINE_WAIT;
 
 	for_each_online_cpu(i) {
-		if (i == smp_processor_id())
+		if (i == _smp_processor_id())
 			continue;
 		ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
 		if (ret < 0)
@@ -177,7 +177,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
 
 	/* If they don't care which CPU fn runs on, bind to any online one. */
 	if (cpu == NR_CPUS)
-		cpu = smp_processor_id();
+		cpu = _smp_processor_id();
 
 	p = kthread_create(do_stop, &smdata, "kstopmachine");
 	if (!IS_ERR(p)) {
diff --git a/kernel/sys.c b/kernel/sys.c
index fdc29f17ac93..6e354fd380e7 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -23,6 +23,7 @@
 #include <linux/security.h>
 #include <linux/dcookies.h>
 #include <linux/suspend.h>
+#include <linux/tty.h>
 
 #include <linux/compat.h>
 #include <linux/syscalls.h>
@@ -88,7 +89,7 @@ int cad_pid = 1;
  */
 
 static struct notifier_block *reboot_notifier_list;
-rwlock_t notifier_lock = RW_LOCK_UNLOCKED;
+DEFINE_RWLOCK(notifier_lock);
 
 /**
  *	notifier_chain_register	- Add notifier to a notifier chain
@@ -892,15 +893,15 @@ asmlinkage long sys_times(struct tms __user * tbuf)
 		struct tms tmp;
 		struct task_struct *tsk = current;
 		struct task_struct *t;
-		unsigned long utime, stime, cutime, cstime;
+		cputime_t utime, stime, cutime, cstime;
 
 		read_lock(&tasklist_lock);
 		utime = tsk->signal->utime;
 		stime = tsk->signal->stime;
 		t = tsk;
 		do {
-			utime += t->utime;
-			stime += t->stime;
+			utime = cputime_add(utime, t->utime);
+			stime = cputime_add(stime, t->stime);
 			t = next_thread(t);
 		} while (t != tsk);
 
@@ -919,10 +920,10 @@ asmlinkage long sys_times(struct tms __user * tbuf)
 		spin_unlock_irq(&tsk->sighand->siglock);
 		read_unlock(&tasklist_lock);
 
-		tmp.tms_utime = jiffies_to_clock_t(utime);
-		tmp.tms_stime = jiffies_to_clock_t(stime);
-		tmp.tms_cutime = jiffies_to_clock_t(cutime);
-		tmp.tms_cstime = jiffies_to_clock_t(cstime);
+		tmp.tms_utime = cputime_to_clock_t(utime);
+		tmp.tms_stime = cputime_to_clock_t(stime);
+		tmp.tms_cutime = cputime_to_clock_t(cutime);
+		tmp.tms_cstime = cputime_to_clock_t(cstime);
 		if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
 			return -EFAULT;
 	}
@@ -1075,6 +1076,7 @@ asmlinkage long sys_setsid(void)
 	if (!thread_group_leader(current))
 		return -EINVAL;
 
+	down(&tty_sem);
 	write_lock_irq(&tasklist_lock);
 
 	pid = find_pid(PIDTYPE_PGID, current->pid);
@@ -1088,6 +1090,7 @@ asmlinkage long sys_setsid(void)
 	err = process_group(current);
 out:
 	write_unlock_irq(&tasklist_lock);
+	up(&tty_sem);
 	return err;
 }
 
@@ -1525,7 +1528,7 @@ void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 {
 	struct task_struct *t;
 	unsigned long flags;
-	unsigned long utime, stime;
+	cputime_t utime, stime;
 
 	memset((char *) r, 0, sizeof *r);
 
@@ -1542,12 +1545,12 @@ void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 			r->ru_minflt = p->signal->cmin_flt;
 			r->ru_majflt = p->signal->cmaj_flt;
 			spin_unlock_irqrestore(&p->sighand->siglock, flags);
-			jiffies_to_timeval(utime, &r->ru_utime);
-			jiffies_to_timeval(stime, &r->ru_stime);
+			cputime_to_timeval(utime, &r->ru_utime);
+			cputime_to_timeval(stime, &r->ru_stime);
 			break;
 		case RUSAGE_SELF:
 			spin_lock_irqsave(&p->sighand->siglock, flags);
-			utime = stime = 0;
+			utime = stime = cputime_zero;
 			goto sum_group;
 		case RUSAGE_BOTH:
 			spin_lock_irqsave(&p->sighand->siglock, flags);
@@ -1558,16 +1561,16 @@ void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 			r->ru_minflt = p->signal->cmin_flt;
 			r->ru_majflt = p->signal->cmaj_flt;
 		sum_group:
-			utime += p->signal->utime;
-			stime += p->signal->stime;
+			utime = cputime_add(utime, p->signal->utime);
+			stime = cputime_add(stime, p->signal->stime);
 			r->ru_nvcsw += p->signal->nvcsw;
 			r->ru_nivcsw += p->signal->nivcsw;
 			r->ru_minflt += p->signal->min_flt;
 			r->ru_majflt += p->signal->maj_flt;
 			t = p;
 			do {
-				utime += t->utime;
-				stime += t->stime;
+				utime = cputime_add(utime, t->utime);
+				stime = cputime_add(stime, t->stime);
 				r->ru_nvcsw += t->nvcsw;
 				r->ru_nivcsw += t->nivcsw;
 				r->ru_minflt += t->min_flt;
@@ -1575,8 +1578,8 @@ void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 				t = next_thread(t);
 			} while (t != p);
 			spin_unlock_irqrestore(&p->sighand->siglock, flags);
-			jiffies_to_timeval(utime, &r->ru_utime);
-			jiffies_to_timeval(stime, &r->ru_stime);
+			cputime_to_timeval(utime, &r->ru_utime);
+			cputime_to_timeval(stime, &r->ru_stime);
 			break;
 		default:
 			BUG();
@@ -1689,6 +1692,15 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 			set_task_comm(me, ncomm);
 			return 0;
 		}
+		case PR_GET_NAME: {
+			struct task_struct *me = current;
+			unsigned char tcomm[sizeof(me->comm)];
+
+			get_task_comm(tcomm, me);
+			if (copy_to_user((char __user *)arg2, tcomm, sizeof(tcomm)))
+				return -EFAULT;
+			return 0;
+		}
 		default:
 			error = -EINVAL;
 			break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 568b4579ef24..85503726d60b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -52,7 +52,6 @@
 #if defined(CONFIG_SYSCTL)
 
 /* External variables not in a header file. */
-extern int panic_timeout;
 extern int C_A_D;
 extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
@@ -765,6 +764,7 @@ static ctl_table vm_table[] = {
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
+#ifdef CONFIG_MMU
 	{
 		.ctl_name	= VM_MAX_MAP_COUNT,
 		.procname	= "max_map_count",
@@ -773,6 +773,7 @@ static ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+#endif
 	{
 		.ctl_name	= VM_LAPTOP_MODE,
 		.procname	= "laptop_mode",
@@ -914,6 +915,7 @@ static ctl_table fs_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
+#ifdef CONFIG_MMU
 	{
 		.ctl_name	= FS_LEASE_TIME,
 		.procname	= "lease-break-time",
@@ -938,6 +940,7 @@ static ctl_table fs_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+#endif
 	{ .ctl_name = 0 }
 };
 
diff --git a/kernel/time.c b/kernel/time.c
index b6d01cf709c4..d5400f6af052 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -33,6 +33,7 @@
 #include <linux/smp_lock.h>
 #include <linux/syscalls.h>
 #include <linux/security.h>
+#include <linux/fs.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -52,12 +53,10 @@ EXPORT_SYMBOL(sys_tz);
  * sys_gettimeofday().  Is this for backwards compatibility?  If so,
  * why not move it into the appropriate arch directory (for those
  * architectures that need it).
- *
- * XXX This function is NOT 64-bit clean!
  */
-asmlinkage long sys_time(int __user * tloc)
+asmlinkage long sys_time(time_t __user * tloc)
 {
-	int i;
+	time_t i;
 	struct timeval tv;
 
 	do_gettimeofday(&tv);
@@ -417,7 +416,7 @@ asmlinkage long sys_adjtimex(struct timex __user *txc_p)
 	return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret;
 }
 
-struct timespec current_kernel_time(void)
+inline struct timespec current_kernel_time(void)
 {
         struct timespec now;
         unsigned long seq;
@@ -433,6 +432,50 @@ struct timespec current_kernel_time(void)
 
 EXPORT_SYMBOL(current_kernel_time);
 
+/**
+ * current_fs_time - Return FS time
+ * @sb: Superblock.
+ *
+ * Return the current time truncated to the time granuality supported by
+ * the fs.
+ */
+struct timespec current_fs_time(struct super_block *sb)
+{
+	struct timespec now = current_kernel_time();
+	return timespec_trunc(now, sb->s_time_gran);
+}
+EXPORT_SYMBOL(current_fs_time);
+
+/**
+ * timespec_trunc - Truncate timespec to a granuality
+ * @t: Timespec
+ * @gran: Granuality in ns.
+ *
+ * Truncate a timespec to a granuality. gran must be smaller than a second.
+ * Always rounds down.
+ *
+ * This function should be only used for timestamps returned by
+ * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because
+ * it doesn't handle the better resolution of the later.
+ */
+struct timespec timespec_trunc(struct timespec t, unsigned gran)
+{
+	/*
+	 * Division is pretty slow so avoid it for common cases.
+	 * Currently current_kernel_time() never returns better than
+	 * jiffies resolution. Exploit that.
+	 */
+	if (gran <= jiffies_to_usecs(1) * 1000) {
+		/* nothing */
+	} else if (gran == 1000000000) {
+		t.tv_nsec = 0;
+	} else {
+		t.tv_nsec -= t.tv_nsec % gran;
+	}
+	return t;
+}
+EXPORT_SYMBOL(timespec_trunc);
+
 #ifdef CONFIG_TIME_INTERPOLATION
 void getnstimeofday (struct timespec *tv)
 {
diff --git a/kernel/timer.c b/kernel/timer.c
index d0eed9b563c4..6bb47b0e4983 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -465,7 +465,14 @@ repeat:
 			smp_wmb();
 			timer->base = NULL;
 			spin_unlock_irq(&base->lock);
-			fn(data);
+			{
+				u32 preempt_count = preempt_count();
+				fn(data);
+				if (preempt_count != preempt_count()) {
+					printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count());
+					BUG();
+				}
+			}
 			spin_lock_irq(&base->lock);
 			goto repeat;
 		}
@@ -554,7 +561,7 @@ unsigned long tick_nsec = TICK_NSEC;		/* ACTHZ period (nsec) */
 /* 
  * The current time 
  * wall_to_monotonic is what we need to add to xtime (or xtime corrected 
- * for sub jiffie times) to get to monotonic time.  Monotonic is pegged at zero
+ * for sub jiffie times) to get to monotonic time.  Monotonic is pegged
  * at zero at system boot time, so wall_to_monotonic will be negative,
  * however, we will ALWAYS keep the tv_nsec part positive so we can use
  * the usual normalization.
@@ -799,59 +806,6 @@ static void update_wall_time(unsigned long ticks)
 	} while (ticks);
 }
 
-static inline void do_process_times(struct task_struct *p,
-	unsigned long user, unsigned long system)
-{
-	unsigned long psecs;
-
-	psecs = (p->utime += user);
-	psecs += (p->stime += system);
-	if (p->signal && !unlikely(p->state & (EXIT_DEAD|EXIT_ZOMBIE)) &&
-	    psecs / HZ >= p->signal->rlim[RLIMIT_CPU].rlim_cur) {
-		/* Send SIGXCPU every second.. */
-		if (!(psecs % HZ))
-			send_sig(SIGXCPU, p, 1);
-		/* and SIGKILL when we go over max.. */
-		if (psecs / HZ >= p->signal->rlim[RLIMIT_CPU].rlim_max)
-			send_sig(SIGKILL, p, 1);
-	}
-}
-
-static inline void do_it_virt(struct task_struct * p, unsigned long ticks)
-{
-	unsigned long it_virt = p->it_virt_value;
-
-	if (it_virt) {
-		it_virt -= ticks;
-		if (!it_virt) {
-			it_virt = p->it_virt_incr;
-			send_sig(SIGVTALRM, p, 1);
-		}
-		p->it_virt_value = it_virt;
-	}
-}
-
-static inline void do_it_prof(struct task_struct *p)
-{
-	unsigned long it_prof = p->it_prof_value;
-
-	if (it_prof) {
-		if (--it_prof == 0) {
-			it_prof = p->it_prof_incr;
-			send_sig(SIGPROF, p, 1);
-		}
-		p->it_prof_value = it_prof;
-	}
-}
-
-static void update_one_process(struct task_struct *p, unsigned long user,
-			unsigned long system, int cpu)
-{
-	do_process_times(p, user, system);
-	do_it_virt(p, user);
-	do_it_prof(p);
-}	
-
 /*
  * Called from the timer interrupt handler to charge one tick to the current 
  * process.  user_tick is 1 if the tick is user time, 0 for system.
@@ -859,11 +813,17 @@ static void update_one_process(struct task_struct *p, unsigned long user,
 void update_process_times(int user_tick)
 {
 	struct task_struct *p = current;
-	int cpu = smp_processor_id(), system = user_tick ^ 1;
+	int cpu = smp_processor_id();
 
-	update_one_process(p, user_tick, system, cpu);
+	/* Note: this timer irq context must be accounted for as well. */
+	if (user_tick)
+		account_user_time(p, jiffies_to_cputime(1));
+	else
+		account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
 	run_local_timers();
-	scheduler_tick(user_tick, system);
+	if (rcu_pending(cpu))
+		rcu_check_callbacks(cpu, user_tick);
+	scheduler_tick();
 }
 
 /*
@@ -1438,7 +1398,7 @@ void __init init_timers(void)
 
 struct time_interpolator *time_interpolator;
 static struct time_interpolator *time_interpolator_list;
-static spinlock_t time_interpolator_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(time_interpolator_lock);
 
 static inline u64 time_interpolator_get_cycles(unsigned int src)
 {
diff --git a/kernel/user.c b/kernel/user.c
index 693487dc940e..18f63146602a 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -26,7 +26,7 @@
 
 static kmem_cache_t *uid_cachep;
 static struct list_head uidhash_table[UIDHASH_SZ];
-static spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(uidhash_lock);
 
 struct user_struct root_user = {
 	.__count	= ATOMIC_INIT(1),
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ee77ccd01d04..3993f7bdf5c2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -8,7 +8,7 @@
  *
  * Derived from the taskqueue/keventd code by:
  *
- *   David Woodhouse <dwmw2@redhat.com>
+ *   David Woodhouse <dwmw2@infradead.org>
  *   Andrew Morton <andrewm@uow.edu.au>
  *   Kai Petzke <wpp@marie.physik.tu-berlin.de>
  *   Theodore Ts'o <tytso@mit.edu>
@@ -64,7 +64,7 @@ struct workqueue_struct {
 
 /* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
    threads to each one as cpus come/go. */
-static spinlock_t workqueue_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(workqueue_lock);
 static LIST_HEAD(workqueues);
 
 /* If it's single threaded, it isn't in the list of workqueues. */
@@ -188,7 +188,7 @@ static int worker_thread(void *__cwq)
 
 	current->flags |= PF_NOFREEZE;
 
-	set_user_nice(current, -10);
+	set_user_nice(current, -5);
 
 	/* Block and flush all signals */
 	sigfillset(&blocked);
author	Tony Luck <tony.luck@intel.com>	2005-01-17 20:28:07 -0800
committer	Tony Luck <tony.luck@intel.com>	2005-01-17 20:28:07 -0800
commit	9790ee6f6a30db6b9fd4f27e437b87fa8e511cdd (patch)
tree	5283a41fba9fe6a85c0ed186ad0469dd0cc04323 /kernel
parent	bcb39c06eefc908121323c23ead2fdd246585ad3 (diff)
parent	fdefff6242ce95e350570a7f65e1ff49bc3a66b6 (diff)