Merge branch 'master' of /home/tglx/work/mtd/git/linux-2.6.git/

author: Thomas Gleixner <tglx@mtd.linutronix.de> 2005-11-06 15:36:37 +0100
committer: Thomas Gleixner <tglx@mtd.linutronix.de> 2005-11-06 15:36:37 +0100
commit: 2fc2991175bf77395e6b15fe6b2304d3bf72da40 (patch)
tree: b0ff38c09240e7c00e1577d447ebe89143d752dc /kernel
parent: 8b491d750885ebe8e7d385ce4186c85957d67123 (diff)
parent: 7015faa7df829876a0f931cd18aa6d7c24a1b581 (diff)
48 files changed, 3709 insertions, 1791 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index cb05cd05d237..4f5a1453093a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 obj-$(CONFIG_FUTEX) += futex.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
+obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
@@ -21,15 +22,16 @@ obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_IKCONFIG) += configs.o
-obj-$(CONFIG_IKCONFIG_PROC) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
 obj-$(CONFIG_AUDIT) += audit.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_SYSFS) += ksysfs.o
+obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_SECCOMP) += seccomp.o
+obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index 4168f631868e..2e3f4a47e7d0 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -165,7 +165,7 @@ out:
 }
 
 /*
- * Close the old accouting file (if currently open) and then replace
+ * Close the old accounting file (if currently open) and then replace
  * it with file (if non-NULL).
  *
  * NOTE: acct_globals.lock MUST be held on entry and exit.
@@ -199,11 +199,16 @@ static void acct_file_reopen(struct file *file)
 	}
 }
 
-/*
- *  sys_acct() is the only system call needed to implement process
- *  accounting. It takes the name of the file where accounting records
- *  should be written. If the filename is NULL, accounting will be
- *  shutdown.
+/**
+ * sys_acct - enable/disable process accounting
+ * @name: file name for accounting records or NULL to shutdown accounting
+ *
+ * Returns 0 for success or negative errno values for failure.
+ *
+ * sys_acct() is the only system call needed to implement process
+ * accounting. It takes the name of the file where accounting records
+ * should be written. If the filename is NULL, accounting will be
+ * shutdown.
  */
 asmlinkage long sys_acct(const char __user *name)
 {
@@ -220,7 +225,7 @@ asmlinkage long sys_acct(const char __user *name)
 			return (PTR_ERR(tmp));
 		}
 		/* Difference from BSD - they don't do O_APPEND */
-		file = filp_open(tmp, O_WRONLY|O_APPEND, 0);
+		file = filp_open(tmp, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
 		putname(tmp);
 		if (IS_ERR(file)) {
 			return (PTR_ERR(file));
@@ -250,9 +255,12 @@ asmlinkage long sys_acct(const char __user *name)
 	return (0);
 }
 
-/*
- * If the accouting is turned on for a file in the filesystem pointed
- * to by sb, turn accouting off.
+/**
+ * acct_auto_close - turn off a filesystem's accounting if it is on
+ * @sb: super block for the filesystem
+ *
+ * If the accounting is turned on for a file in the filesystem pointed
+ * to by sb, turn accounting off.
  */
 void acct_auto_close(struct super_block *sb)
 {
@@ -503,8 +511,11 @@ static void do_acct_process(long exitcode, struct file *file)
 	set_fs(fs);
 }
 
-/*
+/**
  * acct_process - now just a wrapper around do_acct_process
+ * @exitcode: task exit code
+ *
+ * handles process accounting for an exiting task
  */
 void acct_process(long exitcode)
 {
@@ -530,9 +541,9 @@ void acct_process(long exitcode)
 }
 
 
-/*
- * acct_update_integrals
- *    -  update mm integral fields in task_struct
+/**
+ * acct_update_integrals - update mm integral fields in task_struct
+ * @tsk: task_struct for accounting
  */
 void acct_update_integrals(struct task_struct *tsk)
 {
@@ -542,14 +553,14 @@ void acct_update_integrals(struct task_struct *tsk)
 		if (delta == 0)
 			return;
 		tsk->acct_stimexpd = tsk->stime;
-		tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss);
+		tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
 		tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
 	}
 }
 
-/*
- * acct_clear_integrals
- *    - clear the mm integral fields in task_struct
+/**
+ * acct_clear_integrals - clear the mm integral fields in task_struct
+ * @tsk: task_struct whose accounting fields are cleared
  */
 void acct_clear_integrals(struct task_struct *tsk)
 {
diff --git a/kernel/audit.c b/kernel/audit.c
index ef35166fdc29..0c56320d38dc 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -79,6 +79,8 @@ static int	audit_rate_limit;
 
 /* Number of outstanding audit_buffers allowed. */
 static int	audit_backlog_limit = 64;
+static int	audit_backlog_wait_time = 60 * HZ;
+static int	audit_backlog_wait_overflow = 0;
 
 /* The identity of the user shutting down the audit system. */
 uid_t		audit_sig_uid = -1;
@@ -106,18 +108,12 @@ static LIST_HEAD(audit_freelist);
 static struct sk_buff_head audit_skb_queue;
 static struct task_struct *kauditd_task;
 static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
-
-/* There are three lists of rules -- one to search at task creation
- * time, one to search at syscall entry time, and another to search at
- * syscall exit time. */
-static LIST_HEAD(audit_tsklist);
-static LIST_HEAD(audit_entlist);
-static LIST_HEAD(audit_extlist);
+static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
 
 /* The netlink socket is only to be read by 1 CPU, which lets us assume
  * that list additions and deletions never happen simultaneously in
  * auditsc.c */
-static DECLARE_MUTEX(audit_netlink_sem);
+DECLARE_MUTEX(audit_netlink_sem);
 
 /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
  * audit records.  Since printk uses a 1024 byte buffer, this buffer
@@ -137,6 +133,7 @@ struct audit_buffer {
 	struct list_head     list;
 	struct sk_buff       *skb;	/* formatted skb ready to send */
 	struct audit_context *ctx;	/* NULL or associated context */
+	gfp_t		     gfp_mask;
 };
 
 static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
@@ -145,11 +142,6 @@ static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
 	nlh->nlmsg_pid = pid;
 }
 
-struct audit_entry {
-	struct list_head  list;
-	struct audit_rule rule;
-};
-
 static void audit_panic(const char *message)
 {
 	switch (audit_failure)
@@ -233,7 +225,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid)
 {
 	int old		 = audit_rate_limit;
 	audit_rate_limit = limit;
-	audit_log(NULL, AUDIT_CONFIG_CHANGE, 
+	audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 
 			"audit_rate_limit=%d old=%d by auid=%u",
 			audit_rate_limit, old, loginuid);
 	return old;
@@ -243,7 +235,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid)
 {
 	int old		 = audit_backlog_limit;
 	audit_backlog_limit = limit;
-	audit_log(NULL, AUDIT_CONFIG_CHANGE,
+	audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
 			"audit_backlog_limit=%d old=%d by auid=%u",
 			audit_backlog_limit, old, loginuid);
 	return old;
@@ -255,7 +247,7 @@ static int audit_set_enabled(int state, uid_t loginuid)
 	if (state != 0 && state != 1)
 		return -EINVAL;
 	audit_enabled = state;
-	audit_log(NULL, AUDIT_CONFIG_CHANGE,
+	audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
 			"audit_enabled=%d old=%d by auid=%u",
 			audit_enabled, old, loginuid);
 	return old;
@@ -269,7 +261,7 @@ static int audit_set_failure(int state, uid_t loginuid)
 	    && state != AUDIT_FAIL_PANIC)
 		return -EINVAL;
 	audit_failure = state;
-	audit_log(NULL, AUDIT_CONFIG_CHANGE,
+	audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
 			"audit_failure=%d old=%d by auid=%u",
 			audit_failure, old, loginuid);
 	return old;
@@ -281,6 +273,7 @@ int kauditd_thread(void *dummy)
 
 	while (1) {
 		skb = skb_dequeue(&audit_skb_queue);
+		wake_up(&audit_backlog_wait);
 		if (skb) {
 			if (audit_pid) {
 				int err = netlink_unicast(audit_sock, skb, audit_pid, 0);
@@ -290,7 +283,7 @@ int kauditd_thread(void *dummy)
 					audit_pid = 0;
 				}
 			} else {
-				printk(KERN_ERR "%s\n", skb->data + NLMSG_SPACE(0));
+				printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
 				kfree_skb(skb);
 			}
 		} else {
@@ -423,7 +416,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (status_get->mask & AUDIT_STATUS_PID) {
 			int old   = audit_pid;
 			audit_pid = status_get->pid;
-			audit_log(NULL, AUDIT_CONFIG_CHANGE,
+			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
 				"audit_pid=%d old=%d by auid=%u",
 				  audit_pid, old, loginuid);
 		}
@@ -435,15 +428,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		break;
 	case AUDIT_USER:
 	case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
-		ab = audit_log_start(NULL, msg_type);
-		if (!ab)
-			break;	/* audit_panic has been called */
-		audit_log_format(ab,
-				 "user pid=%d uid=%u auid=%u"
-				 " msg='%.1024s'",
-				 pid, uid, loginuid, (char *)data);
-		audit_set_pid(ab, pid);
-		audit_log_end(ab);
+		if (!audit_enabled && msg_type != AUDIT_USER_AVC)
+			return 0;
+
+		err = audit_filter_user(&NETLINK_CB(skb), msg_type);
+		if (err == 1) {
+			err = 0;
+			ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
+			if (ab) {
+				audit_log_format(ab,
+						 "user pid=%d uid=%u auid=%u msg='%.1024s'",
+						 pid, uid, loginuid, (char *)data);
+				audit_set_pid(ab, pid);
+				audit_log_end(ab);
+			}
+		}
 		break;
 	case AUDIT_ADD:
 	case AUDIT_DEL:
@@ -514,7 +513,8 @@ static int __init audit_init(void)
 {
 	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
 	       audit_default ? "enabled" : "disabled");
-	audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive);
+	audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive,
+					   THIS_MODULE);
 	if (!audit_sock)
 		audit_panic("cannot initialize netlink socket");
 
@@ -522,7 +522,7 @@ static int __init audit_init(void)
 	skb_queue_head_init(&audit_skb_queue);
 	audit_initialized = 1;
 	audit_enabled = audit_default;
-	audit_log(NULL, AUDIT_KERNEL, "initialized");
+	audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
 	return 0;
 }
 __initcall(audit_init);
@@ -560,7 +560,7 @@ static void audit_buffer_free(struct audit_buffer *ab)
 }
 
 static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
-						int gfp_mask, int type)
+						gfp_t gfp_mask, int type)
 {
 	unsigned long flags;
 	struct audit_buffer *ab = NULL;
@@ -586,6 +586,7 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
 		goto err;
 
 	ab->ctx = ctx;
+	ab->gfp_mask = gfp_mask;
 	nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0));
 	nlh->nlmsg_type = type;
 	nlh->nlmsg_flags = 0;
@@ -605,26 +606,27 @@ err:
  * (timestamp,serial) tuple is unique for each syscall and is live from
  * syscall entry to syscall exit.
  *
- * Atomic values are only guaranteed to be 24-bit, so we count down.
- *
  * NOTE: Another possibility is to store the formatted records off the
  * audit context (for those records that have a context), and emit them
  * all at syscall exit.  However, this could delay the reporting of
  * significant errors until syscall exit (or never, if the system
  * halts). */
+
 unsigned int audit_serial(void)
 {
-	static atomic_t serial = ATOMIC_INIT(0xffffff);
-	unsigned int a, b;
+	static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED;
+	static unsigned int serial = 0;
+
+	unsigned long flags;
+	unsigned int ret;
 
+	spin_lock_irqsave(&serial_lock, flags);
 	do {
-		a = atomic_read(&serial);
-		if (atomic_dec_and_test(&serial))
-			atomic_set(&serial, 0xffffff);
-		b = atomic_read(&serial);
-	} while (b != a - 1);
+		ret = ++serial;
+	} while (unlikely(!ret));
+	spin_unlock_irqrestore(&serial_lock, flags);
 
-	return 0xffffff - b;
+	return ret;
 }
 
 static inline void audit_get_stamp(struct audit_context *ctx, 
@@ -644,17 +646,43 @@ static inline void audit_get_stamp(struct audit_context *ctx,
  * syscall, then the syscall is marked as auditable and an audit record
  * will be written at syscall exit.  If there is no associated task, tsk
  * should be NULL. */
-struct audit_buffer *audit_log_start(struct audit_context *ctx, int type)
+
+struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
+				     int type)
 {
 	struct audit_buffer	*ab	= NULL;
 	struct timespec		t;
 	unsigned int		serial;
+	int reserve;
+	unsigned long timeout_start = jiffies;
 
 	if (!audit_initialized)
 		return NULL;
 
-	if (audit_backlog_limit
-	    && skb_queue_len(&audit_skb_queue) > audit_backlog_limit) {
+	if (gfp_mask & __GFP_WAIT)
+		reserve = 0;
+	else
+		reserve = 5; /* Allow atomic callers to go up to five 
+				entries over the normal backlog limit */
+
+	while (audit_backlog_limit
+	       && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
+		if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time
+		    && time_before(jiffies, timeout_start + audit_backlog_wait_time)) {
+
+			/* Wait for auditd to drain the queue a little */
+			DECLARE_WAITQUEUE(wait, current);
+			set_current_state(TASK_INTERRUPTIBLE);
+			add_wait_queue(&audit_backlog_wait, &wait);
+
+			if (audit_backlog_limit &&
+			    skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
+				schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies);
+
+			__set_current_state(TASK_RUNNING);
+			remove_wait_queue(&audit_backlog_wait, &wait);
+			continue;
+		}
 		if (audit_rate_check())
 			printk(KERN_WARNING
 			       "audit: audit_backlog=%d > "
@@ -662,10 +690,12 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, int type)
 			       skb_queue_len(&audit_skb_queue),
 			       audit_backlog_limit);
 		audit_log_lost("backlog limit exceeded");
+		audit_backlog_wait_time = audit_backlog_wait_overflow;
+		wake_up(&audit_backlog_wait);
 		return NULL;
 	}
 
-	ab = audit_buffer_alloc(ctx, GFP_ATOMIC, type);
+	ab = audit_buffer_alloc(ctx, gfp_mask, type);
 	if (!ab) {
 		audit_log_lost("out of memory in audit_log_start");
 		return NULL;
@@ -689,7 +719,7 @@ static inline int audit_expand(struct audit_buffer *ab, int extra)
 {
 	struct sk_buff *skb = ab->skb;
 	int ret = pskb_expand_head(skb, skb_headroom(skb), extra,
-				   GFP_ATOMIC);
+				   ab->gfp_mask);
 	if (ret < 0) {
 		audit_log_lost("out of memory in audit_expand");
 		return 0;
@@ -808,7 +838,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
 		audit_log_format(ab, " %s", prefix);
 
 	/* We will allow 11 spaces for ' (deleted)' to be appended */
-	path = kmalloc(PATH_MAX+11, GFP_KERNEL);
+	path = kmalloc(PATH_MAX+11, ab->gfp_mask);
 	if (!path) {
 		audit_log_format(ab, "<no memory>");
 		return;
@@ -840,7 +870,7 @@ void audit_log_end(struct audit_buffer *ab)
 			ab->skb = NULL;
 			wake_up_interruptible(&kauditd_wait);
 		} else {
-			printk("%s\n", ab->skb->data + NLMSG_SPACE(0));
+			printk(KERN_NOTICE "%s\n", ab->skb->data + NLMSG_SPACE(0));
 		}
 	}
 	audit_buffer_free(ab);
@@ -849,12 +879,13 @@ void audit_log_end(struct audit_buffer *ab)
 /* Log an audit record.  This is a convenience function that calls
  * audit_log_start, audit_log_vformat, and audit_log_end.  It may be
  * called in any context. */
-void audit_log(struct audit_context *ctx, int type, const char *fmt, ...)
+void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, 
+	       const char *fmt, ...)
 {
 	struct audit_buffer *ab;
 	va_list args;
 
-	ab = audit_log_start(ctx, type);
+	ab = audit_log_start(ctx, gfp_mask, type);
 	if (ab) {
 		va_start(args, fmt);
 		audit_log_vformat(ab, fmt, args);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e75f84e1a1a0..d8a68509e729 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -39,6 +39,9 @@
 #include <linux/audit.h>
 #include <linux/personality.h>
 #include <linux/time.h>
+#include <linux/kthread.h>
+#include <linux/netlink.h>
+#include <linux/compiler.h>
 #include <asm/unistd.h>
 
 /* 0 = no checking
@@ -95,6 +98,7 @@ struct audit_names {
 	uid_t		uid;
 	gid_t		gid;
 	dev_t		rdev;
+	unsigned	flags;
 };
 
 struct audit_aux_data {
@@ -167,9 +171,16 @@ struct audit_context {
 /* There are three lists of rules -- one to search at task creation
  * time, one to search at syscall entry time, and another to search at
  * syscall exit time. */
-static LIST_HEAD(audit_tsklist);
-static LIST_HEAD(audit_entlist);
-static LIST_HEAD(audit_extlist);
+static struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
+	LIST_HEAD_INIT(audit_filter_list[0]),
+	LIST_HEAD_INIT(audit_filter_list[1]),
+	LIST_HEAD_INIT(audit_filter_list[2]),
+	LIST_HEAD_INIT(audit_filter_list[3]),
+	LIST_HEAD_INIT(audit_filter_list[4]),
+#if AUDIT_NR_FILTERS != 5
+#error Fix audit_filter_list initialiser
+#endif
+};
 
 struct audit_entry {
 	struct list_head  list;
@@ -179,9 +190,36 @@ struct audit_entry {
 
 extern int audit_pid;
 
+/* Copy rule from user-space to kernel-space.  Called from 
+ * audit_add_rule during AUDIT_ADD. */
+static inline int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
+{
+	int i;
+
+	if (s->action != AUDIT_NEVER
+	    && s->action != AUDIT_POSSIBLE
+	    && s->action != AUDIT_ALWAYS)
+		return -1;
+	if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS)
+		return -1;
+	if ((s->flags & ~AUDIT_FILTER_PREPEND) >= AUDIT_NR_FILTERS)
+		return -1;
+
+	d->flags	= s->flags;
+	d->action	= s->action;
+	d->field_count	= s->field_count;
+	for (i = 0; i < d->field_count; i++) {
+		d->fields[i] = s->fields[i];
+		d->values[i] = s->values[i];
+	}
+	for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i];
+	return 0;
+}
+
 /* Check to see if two rules are identical.  It is called from
+ * audit_add_rule during AUDIT_ADD and 
  * audit_del_rule during AUDIT_DEL. */
-static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
+static inline int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
 {
 	int i;
 
@@ -210,19 +248,37 @@ static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
 /* Note that audit_add_rule and audit_del_rule are called via
  * audit_receive() in audit.c, and are protected by
  * audit_netlink_sem. */
-static inline int audit_add_rule(struct audit_entry *entry,
-				 struct list_head *list)
+static inline int audit_add_rule(struct audit_rule *rule,
+				  struct list_head *list)
 {
-	if (entry->rule.flags & AUDIT_PREPEND) {
-		entry->rule.flags &= ~AUDIT_PREPEND;
+	struct audit_entry  *entry;
+
+	/* Do not use the _rcu iterator here, since this is the only
+	 * addition routine. */
+	list_for_each_entry(entry, list, list) {
+		if (!audit_compare_rule(rule, &entry->rule)) {
+			return -EEXIST;
+		}
+	}
+
+	if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
+		return -ENOMEM;
+	if (audit_copy_rule(&entry->rule, rule)) {
+		kfree(entry);
+		return -EINVAL;
+	}
+
+	if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
+		entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
 		list_add_rcu(&entry->list, list);
 	} else {
 		list_add_tail_rcu(&entry->list, list);
 	}
+
 	return 0;
 }
 
-static void audit_free_rule(struct rcu_head *head)
+static inline void audit_free_rule(struct rcu_head *head)
 {
 	struct audit_entry *e = container_of(head, struct audit_entry, rcu);
 	kfree(e);
@@ -245,82 +301,82 @@ static inline int audit_del_rule(struct audit_rule *rule,
 			return 0;
 		}
 	}
-	return -EFAULT;		/* No matching rule */
+	return -ENOENT;		/* No matching rule */
 }
 
-/* Copy rule from user-space to kernel-space.  Called during
- * AUDIT_ADD. */
-static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
+static int audit_list_rules(void *_dest)
 {
+	int pid, seq;
+	int *dest = _dest;
+	struct audit_entry *entry;
 	int i;
 
-	if (s->action != AUDIT_NEVER
-	    && s->action != AUDIT_POSSIBLE
-	    && s->action != AUDIT_ALWAYS)
-		return -1;
-	if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS)
-		return -1;
+	pid = dest[0];
+	seq = dest[1];
+	kfree(dest);
 
-	d->flags	= s->flags;
-	d->action	= s->action;
-	d->field_count	= s->field_count;
-	for (i = 0; i < d->field_count; i++) {
-		d->fields[i] = s->fields[i];
-		d->values[i] = s->values[i];
+	down(&audit_netlink_sem);
+
+	/* The *_rcu iterators not needed here because we are
+	   always called with audit_netlink_sem held. */
+	for (i=0; i<AUDIT_NR_FILTERS; i++) {
+		list_for_each_entry(entry, &audit_filter_list[i], list)
+			audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+					 &entry->rule, sizeof(entry->rule));
 	}
-	for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i];
+	audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
+	
+	up(&audit_netlink_sem);
 	return 0;
 }
 
 int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 							uid_t loginuid)
 {
-	u32		   flags;
-	struct audit_entry *entry;
+	struct task_struct *tsk;
+	int *dest;
 	int		   err = 0;
+	unsigned listnr;
 
 	switch (type) {
 	case AUDIT_LIST:
-		/* The *_rcu iterators not needed here because we are
-		   always called with audit_netlink_sem held. */
-		list_for_each_entry(entry, &audit_tsklist, list)
-			audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
-					 &entry->rule, sizeof(entry->rule));
-		list_for_each_entry(entry, &audit_entlist, list)
-			audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
-					 &entry->rule, sizeof(entry->rule));
-		list_for_each_entry(entry, &audit_extlist, list)
-			audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
-					 &entry->rule, sizeof(entry->rule));
-		audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
+		/* We can't just spew out the rules here because we might fill
+		 * the available socket buffer space and deadlock waiting for
+		 * auditctl to read from it... which isn't ever going to
+		 * happen if we're actually running in the context of auditctl
+		 * trying to _send_ the stuff */
+		 
+		dest = kmalloc(2 * sizeof(int), GFP_KERNEL);
+		if (!dest)
+			return -ENOMEM;
+		dest[0] = pid;
+		dest[1] = seq;
+
+		tsk = kthread_run(audit_list_rules, dest, "audit_list_rules");
+		if (IS_ERR(tsk)) {
+			kfree(dest);
+			err = PTR_ERR(tsk);
+		}
 		break;
 	case AUDIT_ADD:
-		if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
-			return -ENOMEM;
-		if (audit_copy_rule(&entry->rule, data)) {
-			kfree(entry);
+		listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
+		if (listnr >= AUDIT_NR_FILTERS)
 			return -EINVAL;
-		}
-		flags = entry->rule.flags;
-		if (!err && (flags & AUDIT_PER_TASK))
-			err = audit_add_rule(entry, &audit_tsklist);
-		if (!err && (flags & AUDIT_AT_ENTRY))
-			err = audit_add_rule(entry, &audit_entlist);
-		if (!err && (flags & AUDIT_AT_EXIT))
-			err = audit_add_rule(entry, &audit_extlist);
-		audit_log(NULL, AUDIT_CONFIG_CHANGE, 
-				"auid=%u added an audit rule\n", loginuid);
+
+		err = audit_add_rule(data, &audit_filter_list[listnr]);
+		if (!err)
+			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+				  "auid=%u added an audit rule\n", loginuid);
 		break;
 	case AUDIT_DEL:
-		flags =((struct audit_rule *)data)->flags;
-		if (!err && (flags & AUDIT_PER_TASK))
-			err = audit_del_rule(data, &audit_tsklist);
-		if (!err && (flags & AUDIT_AT_ENTRY))
-			err = audit_del_rule(data, &audit_entlist);
-		if (!err && (flags & AUDIT_AT_EXIT))
-			err = audit_del_rule(data, &audit_extlist);
-		audit_log(NULL, AUDIT_CONFIG_CHANGE,
-				"auid=%u removed an audit rule\n", loginuid);
+		listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
+		if (listnr >= AUDIT_NR_FILTERS)
+			return -EINVAL;
+
+		err = audit_del_rule(data, &audit_filter_list[listnr]);
+		if (!err)
+			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+				  "auid=%u removed an audit rule\n", loginuid);
 		break;
 	default:
 		return -EINVAL;
@@ -384,8 +440,12 @@ static int audit_filter_rules(struct task_struct *tsk,
 				result = (ctx->return_code == value);
 			break;
 		case AUDIT_SUCCESS:
-			if (ctx && ctx->return_valid)
-				result = (ctx->return_valid == AUDITSC_SUCCESS);
+			if (ctx && ctx->return_valid) {
+				if (value)
+					result = (ctx->return_valid == AUDITSC_SUCCESS);
+				else
+					result = (ctx->return_valid == AUDITSC_FAILURE);
+			}
 			break;
 		case AUDIT_DEVMAJOR:
 			if (ctx) {
@@ -454,7 +514,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
 	enum audit_state   state;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(e, &audit_tsklist, list) {
+	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
 		if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
 			rcu_read_unlock();
 			return state;
@@ -474,20 +534,84 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
 					     struct list_head *list)
 {
 	struct audit_entry *e;
+	enum audit_state state;
+
+	if (audit_pid && tsk->tgid == audit_pid)
+		return AUDIT_DISABLED;
+
+	rcu_read_lock();
+	if (!list_empty(list)) {
+		    int word = AUDIT_WORD(ctx->major);
+		    int bit  = AUDIT_BIT(ctx->major);
+
+		    list_for_each_entry_rcu(e, list, list) {
+			    if ((e->rule.mask[word] & bit) == bit
+				&& audit_filter_rules(tsk, &e->rule, ctx, &state)) {
+				    rcu_read_unlock();
+				    return state;
+			    }
+		    }
+	}
+	rcu_read_unlock();
+	return AUDIT_BUILD_CONTEXT;
+}
+
+static int audit_filter_user_rules(struct netlink_skb_parms *cb,
+			      struct audit_rule *rule,
+			      enum audit_state *state)
+{
+	int i;
+
+	for (i = 0; i < rule->field_count; i++) {
+		u32 field  = rule->fields[i] & ~AUDIT_NEGATE;
+		u32 value  = rule->values[i];
+		int result = 0;
+
+		switch (field) {
+		case AUDIT_PID:
+			result = (cb->creds.pid == value);
+			break;
+		case AUDIT_UID:
+			result = (cb->creds.uid == value);
+			break;
+		case AUDIT_GID:
+			result = (cb->creds.gid == value);
+			break;
+		case AUDIT_LOGINUID:
+			result = (cb->loginuid == value);
+			break;
+		}
+
+		if (rule->fields[i] & AUDIT_NEGATE)
+			result = !result;
+		if (!result)
+			return 0;
+	}
+	switch (rule->action) {
+	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break;
+	case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT;  break;
+	case AUDIT_ALWAYS:   *state = AUDIT_RECORD_CONTEXT; break;
+	}
+	return 1;
+}
+
+int audit_filter_user(struct netlink_skb_parms *cb, int type)
+{
+	struct audit_entry *e;
 	enum audit_state   state;
-	int		   word = AUDIT_WORD(ctx->major);
-	int		   bit  = AUDIT_BIT(ctx->major);
+	int ret = 1;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(e, list, list) {
-		if ((e->rule.mask[word] & bit) == bit
- 		    && audit_filter_rules(tsk, &e->rule, ctx, &state)) {
-			rcu_read_unlock();
-			return state;
+	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
+		if (audit_filter_user_rules(cb, &e->rule, &state)) {
+			if (state == AUDIT_DISABLED)
+				ret = 0;
+			break;
 		}
 	}
 	rcu_read_unlock();
-	return AUDIT_BUILD_CONTEXT;
+
+	return ret; /* Audit by default */
 }
 
 /* This should be called with task_lock() held. */
@@ -504,7 +628,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
 
 	if (context->in_syscall && !context->auditable) {
 		enum audit_state state;
-		state = audit_filter_syscall(tsk, context, &audit_extlist);
+		state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
 		if (state == AUDIT_RECORD_CONTEXT)
 			context->auditable = 1;
 	}
@@ -679,13 +803,13 @@ static void audit_log_task_info(struct audit_buffer *ab)
 	up_read(&mm->mmap_sem);
 }
 
-static void audit_log_exit(struct audit_context *context)
+static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
 {
 	int i;
 	struct audit_buffer *ab;
 	struct audit_aux_data *aux;
 
-	ab = audit_log_start(context, AUDIT_SYSCALL);
+	ab = audit_log_start(context, gfp_mask, AUDIT_SYSCALL);
 	if (!ab)
 		return;		/* audit_panic has been called */
 	audit_log_format(ab, "arch=%x syscall=%d",
@@ -717,7 +841,7 @@ static void audit_log_exit(struct audit_context *context)
 
 	for (aux = context->aux; aux; aux = aux->next) {
 
-		ab = audit_log_start(context, aux->type);
+		ab = audit_log_start(context, GFP_KERNEL, aux->type);
 		if (!ab)
 			continue; /* audit_panic has been called */
 
@@ -754,14 +878,14 @@ static void audit_log_exit(struct audit_context *context)
 	}
 
 	if (context->pwd && context->pwdmnt) {
-		ab = audit_log_start(context, AUDIT_CWD);
+		ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
 		if (ab) {
 			audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt);
 			audit_log_end(ab);
 		}
 	}
 	for (i = 0; i < context->name_count; i++) {
-		ab = audit_log_start(context, AUDIT_PATH);
+		ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
 		if (!ab)
 			continue; /* audit_panic has been called */
 
@@ -770,6 +894,8 @@ static void audit_log_exit(struct audit_context *context)
 			audit_log_format(ab, " name=");
 			audit_log_untrustedstring(ab, context->names[i].name);
 		}
+		audit_log_format(ab, " flags=%x\n", context->names[i].flags);
+			 
 		if (context->names[i].ino != (unsigned long)-1)
 			audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o"
 					     " ouid=%u ogid=%u rdev=%02x:%02x",
@@ -799,9 +925,11 @@ void audit_free(struct task_struct *tsk)
 		return;
 
 	/* Check for system calls that do not go through the exit
-	 * function (e.g., exit_group), then free context block. */
-	if (context->in_syscall && context->auditable && context->pid != audit_pid)
-		audit_log_exit(context);
+	 * function (e.g., exit_group), then free context block. 
+	 * We use GFP_ATOMIC here because we might be doing this 
+	 * in the context of the idle thread */
+	if (context->in_syscall && context->auditable)
+		audit_log_exit(context, GFP_ATOMIC);
 
 	audit_free_context(context);
 }
@@ -876,11 +1004,11 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
 
 	state = context->state;
 	if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)
-		state = audit_filter_syscall(tsk, context, &audit_entlist);
+		state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
 	if (likely(state == AUDIT_DISABLED))
 		return;
 
-	context->serial     = audit_serial();
+	context->serial     = 0;
 	context->ctime      = CURRENT_TIME;
 	context->in_syscall = 1;
 	context->auditable  = !!(state == AUDIT_RECORD_CONTEXT);
@@ -903,10 +1031,10 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code)
 	/* Not having a context here is ok, since the parent may have
 	 * called __put_task_struct. */
 	if (likely(!context))
-		return;
+		goto out;
 
-	if (context->in_syscall && context->auditable && context->pid != audit_pid)
-		audit_log_exit(context);
+	if (context->in_syscall && context->auditable)
+		audit_log_exit(context, GFP_KERNEL);
 
 	context->in_syscall = 0;
 	context->auditable  = 0;
@@ -919,9 +1047,9 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code)
 	} else {
 		audit_free_names(context);
 		audit_free_aux(context);
-		audit_zero_context(context, context->state);
 		tsk->audit_context = context;
 	}
+ out:
 	put_task_struct(tsk);
 }
 
@@ -996,7 +1124,7 @@ void audit_putname(const char *name)
 
 /* Store the inode and device from a lookup.  Called from
  * fs/namei.c:path_lookup(). */
-void audit_inode(const char *name, const struct inode *inode)
+void audit_inode(const char *name, const struct inode *inode, unsigned flags)
 {
 	int idx;
 	struct audit_context *context = current->audit_context;
@@ -1022,17 +1150,20 @@ void audit_inode(const char *name, const struct inode *inode)
 		++context->ino_count;
 #endif
 	}
-	context->names[idx].ino  = inode->i_ino;
-	context->names[idx].dev	 = inode->i_sb->s_dev;
-	context->names[idx].mode = inode->i_mode;
-	context->names[idx].uid  = inode->i_uid;
-	context->names[idx].gid  = inode->i_gid;
-	context->names[idx].rdev = inode->i_rdev;
+	context->names[idx].flags = flags;
+	context->names[idx].ino   = inode->i_ino;
+	context->names[idx].dev	  = inode->i_sb->s_dev;
+	context->names[idx].mode  = inode->i_mode;
+	context->names[idx].uid   = inode->i_uid;
+	context->names[idx].gid   = inode->i_gid;
+	context->names[idx].rdev  = inode->i_rdev;
 }
 
 void auditsc_get_stamp(struct audit_context *ctx,
 		       struct timespec *t, unsigned int *serial)
 {
+	if (!ctx->serial)
+		ctx->serial = audit_serial();
 	t->tv_sec  = ctx->ctime.tv_sec;
 	t->tv_nsec = ctx->ctime.tv_nsec;
 	*serial    = ctx->serial;
@@ -1044,7 +1175,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
 	if (task->audit_context) {
 		struct audit_buffer *ab;
 
-		ab = audit_log_start(NULL, AUDIT_LOGIN);
+		ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
 		if (ab) {
 			audit_log_format(ab, "login pid=%d uid=%u "
 				"old auid=%u new auid=%u",
@@ -1153,7 +1284,7 @@ void audit_signal_info(int sig, struct task_struct *t)
 	extern pid_t audit_sig_pid;
 	extern uid_t audit_sig_uid;
 
-	if (unlikely(audit_pid && t->pid == audit_pid)) {
+	if (unlikely(audit_pid && t->tgid == audit_pid)) {
 		if (sig == SIGTERM || sig == SIGHUP) {
 			struct audit_context *ctx = current->audit_context;
 			audit_sig_pid = current->pid;
diff --git a/kernel/compat.c b/kernel/compat.c
index ddfcaaa86623..102296e21ea8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -48,8 +48,7 @@ static long compat_nanosleep_restart(struct restart_block *restart)
 	if (!time_after(expire, now))
 		return 0;
 
-	current->state = TASK_INTERRUPTIBLE;
-	expire = schedule_timeout(expire - now);
+	expire = schedule_timeout_interruptible(expire - now);
 	if (expire == 0)
 		return 0;
 
@@ -82,8 +81,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
 		return -EINVAL;
 
 	expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
-	current->state = TASK_INTERRUPTIBLE;
-	expire = schedule_timeout(expire);
+	expire = schedule_timeout_interruptible(expire);
 	if (expire == 0)
 		return 0;
 
@@ -795,8 +793,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
 			recalc_sigpending();
 			spin_unlock_irq(&current->sighand->siglock);
 
-			current->state = TASK_INTERRUPTIBLE;
-			timeout = schedule_timeout(timeout);
+			timeout = schedule_timeout_interruptible(timeout);
 
 			spin_lock_irq(&current->sighand->siglock);
 			sig = dequeue_signal(current, &s, &info);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 53d8263ae12e..3619e939182e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -17,6 +17,7 @@
 
 /* This protects CPUs going up and down... */
 DECLARE_MUTEX(cpucontrol);
+EXPORT_SYMBOL_GPL(cpucontrol);
 
 static struct notifier_block *cpu_chain;
 
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 805fb9097318..5a737ed9dac7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -32,6 +32,7 @@
 #include <linux/kernel.h>
 #include <linux/kmod.h>
 #include <linux/list.h>
+#include <linux/mempolicy.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/mount.h>
@@ -60,6 +61,9 @@ struct cpuset {
 	cpumask_t cpus_allowed;		/* CPUs allowed to tasks in cpuset */
 	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */
 
+	/*
+	 * Count is atomic so can incr (fork) or decr (exit) without a lock.
+	 */
 	atomic_t count;			/* count tasks using this cpuset */
 
 	/*
@@ -142,44 +146,91 @@ static struct vfsmount *cpuset_mount;
 static struct super_block *cpuset_sb = NULL;
 
 /*
- * cpuset_sem should be held by anyone who is depending on the children
- * or sibling lists of any cpuset, or performing non-atomic operations
- * on the flags or *_allowed values of a cpuset, such as raising the
- * CS_REMOVED flag bit iff it is not already raised, or reading and
- * conditionally modifying the *_allowed values.  One kernel global
- * cpuset semaphore should be sufficient - these things don't change
- * that much.
- *
- * The code that modifies cpusets holds cpuset_sem across the entire
- * operation, from cpuset_common_file_write() down, single threading
- * all cpuset modifications (except for counter manipulations from
- * fork and exit) across the system.  This presumes that cpuset
- * modifications are rare - better kept simple and safe, even if slow.
- *
- * The code that reads cpusets, such as in cpuset_common_file_read()
- * and below, only holds cpuset_sem across small pieces of code, such
- * as when reading out possibly multi-word cpumasks and nodemasks, as
- * the risks are less, and the desire for performance a little greater.
- * The proc_cpuset_show() routine needs to hold cpuset_sem to insure
- * that no cs->dentry is NULL, as it walks up the cpuset tree to root.
- *
- * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't
- * (usually) grab cpuset_sem.  These are the two most performance
- * critical pieces of code here.  The exception occurs on exit(),
- * when a task in a notify_on_release cpuset exits.  Then cpuset_sem
+ * We have two global cpuset semaphores below.  They can nest.
+ * It is ok to first take manage_sem, then nest callback_sem.  We also
+ * require taking task_lock() when dereferencing a tasks cpuset pointer.
+ * See "The task_lock() exception", at the end of this comment.
+ *
+ * A task must hold both semaphores to modify cpusets.  If a task
+ * holds manage_sem, then it blocks others wanting that semaphore,
+ * ensuring that it is the only task able to also acquire callback_sem
+ * and be able to modify cpusets.  It can perform various checks on
+ * the cpuset structure first, knowing nothing will change.  It can
+ * also allocate memory while just holding manage_sem.  While it is
+ * performing these checks, various callback routines can briefly
+ * acquire callback_sem to query cpusets.  Once it is ready to make
+ * the changes, it takes callback_sem, blocking everyone else.
+ *
+ * Calls to the kernel memory allocator can not be made while holding
+ * callback_sem, as that would risk double tripping on callback_sem
+ * from one of the callbacks into the cpuset code from within
+ * __alloc_pages().
+ *
+ * If a task is only holding callback_sem, then it has read-only
+ * access to cpusets.
+ *
+ * The task_struct fields mems_allowed and mems_generation may only
+ * be accessed in the context of that task, so require no locks.
+ *
+ * Any task can increment and decrement the count field without lock.
+ * So in general, code holding manage_sem or callback_sem can't rely
+ * on the count field not changing.  However, if the count goes to
+ * zero, then only attach_task(), which holds both semaphores, can
+ * increment it again.  Because a count of zero means that no tasks
+ * are currently attached, therefore there is no way a task attached
+ * to that cpuset can fork (the other way to increment the count).
+ * So code holding manage_sem or callback_sem can safely assume that
+ * if the count is zero, it will stay zero.  Similarly, if a task
+ * holds manage_sem or callback_sem on a cpuset with zero count, it
+ * knows that the cpuset won't be removed, as cpuset_rmdir() needs
+ * both of those semaphores.
+ *
+ * A possible optimization to improve parallelism would be to make
+ * callback_sem a R/W semaphore (rwsem), allowing the callback routines
+ * to proceed in parallel, with read access, until the holder of
+ * manage_sem needed to take this rwsem for exclusive write access
+ * and modify some cpusets.
+ *
+ * The cpuset_common_file_write handler for operations that modify
+ * the cpuset hierarchy holds manage_sem across the entire operation,
+ * single threading all such cpuset modifications across the system.
+ *
+ * The cpuset_common_file_read() handlers only hold callback_sem across
+ * small pieces of code, such as when reading out possibly multi-word
+ * cpumasks and nodemasks.
+ *
+ * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't
+ * (usually) take either semaphore.  These are the two most performance
+ * critical pieces of code here.  The exception occurs on cpuset_exit(),
+ * when a task in a notify_on_release cpuset exits.  Then manage_sem
  * is taken, and if the cpuset count is zero, a usermode call made
  * to /sbin/cpuset_release_agent with the name of the cpuset (path
  * relative to the root of cpuset file system) as the argument.
  *
- * A cpuset can only be deleted if both its 'count' of using tasks is
- * zero, and its list of 'children' cpusets is empty.  Since all tasks
- * in the system use _some_ cpuset, and since there is always at least
- * one task in the system (init, pid == 1), therefore, top_cpuset
- * always has either children cpusets and/or using tasks.  So no need
- * for any special hack to ensure that top_cpuset cannot be deleted.
+ * A cpuset can only be deleted if both its 'count' of using tasks
+ * is zero, and its list of 'children' cpusets is empty.  Since all
+ * tasks in the system use _some_ cpuset, and since there is always at
+ * least one task in the system (init, pid == 1), therefore, top_cpuset
+ * always has either children cpusets and/or using tasks.  So we don't
+ * need a special hack to ensure that top_cpuset cannot be deleted.
+ *
+ * The above "Tale of Two Semaphores" would be complete, but for:
+ *
+ *	The task_lock() exception
+ *
+ * The need for this exception arises from the action of attach_task(),
+ * which overwrites one tasks cpuset pointer with another.  It does
+ * so using both semaphores, however there are several performance
+ * critical places that need to reference task->cpuset without the
+ * expense of grabbing a system global semaphore.  Therefore except as
+ * noted below, when dereferencing or, as in attach_task(), modifying
+ * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
+ * (task->alloc_lock) already in the task_struct routinely used for
+ * such matters.
  */
 
-static DECLARE_MUTEX(cpuset_sem);
+static DECLARE_MUTEX(manage_sem);
+static DECLARE_MUTEX(callback_sem);
 
 /*
  * A couple of forward declarations required, due to cyclic reference loop:
@@ -354,7 +405,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
 }
 
 /*
- * Call with cpuset_sem held.  Writes path of cpuset into buf.
+ * Call with manage_sem held.  Writes path of cpuset into buf.
  * Returns 0 on success, -errno on error.
  */
 
@@ -398,21 +449,32 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
  * to continue to serve a useful existence.  Next time it's released,
  * we will get notified again, if it still has 'notify_on_release' set.
  *
- * Note final arg to call_usermodehelper() is 0 - that means
- * don't wait.  Since we are holding the global cpuset_sem here,
- * and we are asking another thread (started from keventd) to rmdir a
- * cpuset, we can't wait - or we'd deadlock with the removing thread
- * on cpuset_sem.
+ * The final arg to call_usermodehelper() is 0, which means don't
+ * wait.  The separate /sbin/cpuset_release_agent task is forked by
+ * call_usermodehelper(), then control in this thread returns here,
+ * without waiting for the release agent task.  We don't bother to
+ * wait because the caller of this routine has no use for the exit
+ * status of the /sbin/cpuset_release_agent task, so no sense holding
+ * our caller up for that.
+ *
+ * When we had only one cpuset semaphore, we had to call this
+ * without holding it, to avoid deadlock when call_usermodehelper()
+ * allocated memory.  With two locks, we could now call this while
+ * holding manage_sem, but we still don't, so as to minimize
+ * the time manage_sem is held.
  */
 
-static int cpuset_release_agent(char *cpuset_str)
+static void cpuset_release_agent(const char *pathbuf)
 {
 	char *argv[3], *envp[3];
 	int i;
 
+	if (!pathbuf)
+		return;
+
 	i = 0;
 	argv[i++] = "/sbin/cpuset_release_agent";
-	argv[i++] = cpuset_str;
+	argv[i++] = (char *)pathbuf;
 	argv[i] = NULL;
 
 	i = 0;
@@ -421,17 +483,29 @@ static int cpuset_release_agent(char *cpuset_str)
 	envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
 	envp[i] = NULL;
 
-	return call_usermodehelper(argv[0], argv, envp, 0);
+	call_usermodehelper(argv[0], argv, envp, 0);
+	kfree(pathbuf);
 }
 
 /*
  * Either cs->count of using tasks transitioned to zero, or the
  * cs->children list of child cpusets just became empty.  If this
  * cs is notify_on_release() and now both the user count is zero and
- * the list of children is empty, send notice to user land.
+ * the list of children is empty, prepare cpuset path in a kmalloc'd
+ * buffer, to be returned via ppathbuf, so that the caller can invoke
+ * cpuset_release_agent() with it later on, once manage_sem is dropped.
+ * Call here with manage_sem held.
+ *
+ * This check_for_release() routine is responsible for kmalloc'ing
+ * pathbuf.  The above cpuset_release_agent() is responsible for
+ * kfree'ing pathbuf.  The caller of these routines is responsible
+ * for providing a pathbuf pointer, initialized to NULL, then
+ * calling check_for_release() with manage_sem held and the address
+ * of the pathbuf pointer, then dropping manage_sem, then calling
+ * cpuset_release_agent() with pathbuf, as set by check_for_release().
  */
 
-static void check_for_release(struct cpuset *cs)
+static void check_for_release(struct cpuset *cs, char **ppathbuf)
 {
 	if (notify_on_release(cs) && atomic_read(&cs->count) == 0 &&
 	    list_empty(&cs->children)) {
@@ -441,10 +515,9 @@ static void check_for_release(struct cpuset *cs)
 		if (!buf)
 			return;
 		if (cpuset_path(cs, buf, PAGE_SIZE) < 0)
-			goto out;
-		cpuset_release_agent(buf);
-out:
-		kfree(buf);
+			kfree(buf);
+		else
+			*ppathbuf = buf;
 	}
 }
 
@@ -459,7 +532,7 @@ out:
  * One way or another, we guarantee to return some non-empty subset
  * of cpu_online_map.
  *
- * Call with cpuset_sem held.
+ * Call with callback_sem held.
  */
 
 static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
@@ -483,7 +556,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
  * One way or another, we guarantee to return some non-empty subset
  * of node_online_map.
  *
- * Call with cpuset_sem held.
+ * Call with callback_sem held.
  */
 
 static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
@@ -498,31 +571,47 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
 }
 
 /*
- * Refresh current tasks mems_allowed and mems_generation from
- * current tasks cpuset.  Call with cpuset_sem held.
- *
- * Be sure to call refresh_mems() on any cpuset operation which
- * (1) holds cpuset_sem, and (2) might possibly alloc memory.
- * Call after obtaining cpuset_sem lock, before any possible
- * allocation.  Otherwise one risks trying to allocate memory
- * while the task cpuset_mems_generation is not the same as
- * the mems_generation in its cpuset, which would deadlock on
- * cpuset_sem in cpuset_update_current_mems_allowed().
- *
- * Since we hold cpuset_sem, once refresh_mems() is called, the
- * test (current->cpuset_mems_generation != cs->mems_generation)
- * in cpuset_update_current_mems_allowed() will remain false,
- * until we drop cpuset_sem.  Anyone else who would change our
- * cpusets mems_generation needs to lock cpuset_sem first.
+ * Refresh current tasks mems_allowed and mems_generation from current
+ * tasks cpuset.
+ *
+ * Call without callback_sem or task_lock() held.  May be called with
+ * or without manage_sem held.  Will acquire task_lock() and might
+ * acquire callback_sem during call.
+ *
+ * The task_lock() is required to dereference current->cpuset safely.
+ * Without it, we could pick up the pointer value of current->cpuset
+ * in one instruction, and then attach_task could give us a different
+ * cpuset, and then the cpuset we had could be removed and freed,
+ * and then on our next instruction, we could dereference a no longer
+ * valid cpuset pointer to get its mems_generation field.
+ *
+ * This routine is needed to update the per-task mems_allowed data,
+ * within the tasks context, when it is trying to allocate memory
+ * (in various mm/mempolicy.c routines) and notices that some other
+ * task has been modifying its cpuset.
  */
 
 static void refresh_mems(void)
 {
-	struct cpuset *cs = current->cpuset;
+	int my_cpusets_mem_gen;
+
+	task_lock(current);
+	my_cpusets_mem_gen = current->cpuset->mems_generation;
+	task_unlock(current);
+
+	if (current->cpuset_mems_generation != my_cpusets_mem_gen) {
+		struct cpuset *cs;
+		nodemask_t oldmem = current->mems_allowed;
 
-	if (current->cpuset_mems_generation != cs->mems_generation) {
+		down(&callback_sem);
+		task_lock(current);
+		cs = current->cpuset;
 		guarantee_online_mems(cs, &current->mems_allowed);
 		current->cpuset_mems_generation = cs->mems_generation;
+		task_unlock(current);
+		up(&callback_sem);
+		if (!nodes_equal(oldmem, current->mems_allowed))
+			numa_policy_rebind(&oldmem, &current->mems_allowed);
 	}
 }
 
@@ -531,7 +620,7 @@ static void refresh_mems(void)
  *
  * One cpuset is a subset of another if all its allowed CPUs and
  * Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set.
+ * are only set if the other's are set.  Call holding manage_sem.
  */
 
 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -549,7 +638,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
  * If we replaced the flag and mask values of the current cpuset
  * (cur) with those values in the trial cpuset (trial), would
  * our various subset and exclusive rules still be valid?  Presumes
- * cpuset_sem held.
+ * manage_sem held.
  *
  * 'cur' is the address of an actual, in-use cpuset.  Operations
  * such as list traversal that depend on the actual address of the
@@ -603,9 +692,10 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
  *    exclusive child cpusets
  * Build these two partitions by calling partition_sched_domains
  *
- * Call with cpuset_sem held.  May nest a call to the
+ * Call with manage_sem held.  May nest a call to the
  * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
  */
+
 static void update_cpu_domains(struct cpuset *cur)
 {
 	struct cpuset *c, *par = cur->parent;
@@ -647,6 +737,10 @@ static void update_cpu_domains(struct cpuset *cur)
 	unlock_cpu_hotplug();
 }
 
+/*
+ * Call with manage_sem held.  May take callback_sem during call.
+ */
+
 static int update_cpumask(struct cpuset *cs, char *buf)
 {
 	struct cpuset trialcs;
@@ -663,12 +757,18 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	if (retval < 0)
 		return retval;
 	cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
+	down(&callback_sem);
 	cs->cpus_allowed = trialcs.cpus_allowed;
+	up(&callback_sem);
 	if (is_cpu_exclusive(cs) && !cpus_unchanged)
 		update_cpu_domains(cs);
 	return 0;
 }
 
+/*
+ * Call with manage_sem held.  May take callback_sem during call.
+ */
+
 static int update_nodemask(struct cpuset *cs, char *buf)
 {
 	struct cpuset trialcs;
@@ -683,9 +783,11 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 		return -ENOSPC;
 	retval = validate_change(cs, &trialcs);
 	if (retval == 0) {
+		down(&callback_sem);
 		cs->mems_allowed = trialcs.mems_allowed;
 		atomic_inc(&cpuset_mems_generation);
 		cs->mems_generation = atomic_read(&cpuset_mems_generation);
+		up(&callback_sem);
 	}
 	return retval;
 }
@@ -696,6 +798,8 @@ static int update_nodemask(struct cpuset *cs, char *buf)
  *						CS_NOTIFY_ON_RELEASE)
  * cs:	the cpuset to update
  * buf:	the buffer where we read the 0 or 1
+ *
+ * Call with manage_sem held.
  */
 
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
@@ -717,24 +821,35 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 		return err;
 	cpu_exclusive_changed =
 		(is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
+	down(&callback_sem);
 	if (turning_on)
 		set_bit(bit, &cs->flags);
 	else
 		clear_bit(bit, &cs->flags);
+	up(&callback_sem);
 
 	if (cpu_exclusive_changed)
                 update_cpu_domains(cs);
 	return 0;
 }
 
-static int attach_task(struct cpuset *cs, char *buf)
+/*
+ * Attack task specified by pid in 'pidbuf' to cpuset 'cs', possibly
+ * writing the path of the old cpuset in 'ppathbuf' if it needs to be
+ * notified on release.
+ *
+ * Call holding manage_sem.  May take callback_sem and task_lock of
+ * the task 'pid' during call.
+ */
+
+static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 {
 	pid_t pid;
 	struct task_struct *tsk;
 	struct cpuset *oldcs;
 	cpumask_t cpus;
 
-	if (sscanf(buf, "%d", &pid) != 1)
+	if (sscanf(pidbuf, "%d", &pid) != 1)
 		return -EIO;
 	if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
 		return -ENOSPC;
@@ -743,7 +858,7 @@ static int attach_task(struct cpuset *cs, char *buf)
 		read_lock(&tasklist_lock);
 
 		tsk = find_task_by_pid(pid);
-		if (!tsk) {
+		if (!tsk || tsk->flags & PF_EXITING) {
 			read_unlock(&tasklist_lock);
 			return -ESRCH;
 		}
@@ -761,10 +876,13 @@ static int attach_task(struct cpuset *cs, char *buf)
 		get_task_struct(tsk);
 	}
 
+	down(&callback_sem);
+
 	task_lock(tsk);
 	oldcs = tsk->cpuset;
 	if (!oldcs) {
 		task_unlock(tsk);
+		up(&callback_sem);
 		put_task_struct(tsk);
 		return -ESRCH;
 	}
@@ -775,9 +893,10 @@ static int attach_task(struct cpuset *cs, char *buf)
 	guarantee_online_cpus(cs, &cpus);
 	set_cpus_allowed(tsk, cpus);
 
+	up(&callback_sem);
 	put_task_struct(tsk);
 	if (atomic_dec_and_test(&oldcs->count))
-		check_for_release(oldcs);
+		check_for_release(oldcs, ppathbuf);
 	return 0;
 }
 
@@ -801,6 +920,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	struct cftype *cft = __d_cft(file->f_dentry);
 	cpuset_filetype_t type = cft->private;
 	char *buffer;
+	char *pathbuf = NULL;
 	int retval = 0;
 
 	/* Crude upper limit on largest legitimate cpulist user might write. */
@@ -817,7 +937,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	}
 	buffer[nbytes] = 0;	/* nul-terminate */
 
-	down(&cpuset_sem);
+	down(&manage_sem);
 
 	if (is_removed(cs)) {
 		retval = -ENODEV;
@@ -841,7 +961,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 		retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
 		break;
 	case FILE_TASKLIST:
-		retval = attach_task(cs, buffer);
+		retval = attach_task(cs, buffer, &pathbuf);
 		break;
 	default:
 		retval = -EINVAL;
@@ -851,7 +971,8 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	if (retval == 0)
 		retval = nbytes;
 out2:
-	up(&cpuset_sem);
+	up(&manage_sem);
+	cpuset_release_agent(pathbuf);
 out1:
 	kfree(buffer);
 	return retval;
@@ -890,9 +1011,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
 {
 	cpumask_t mask;
 
-	down(&cpuset_sem);
+	down(&callback_sem);
 	mask = cs->cpus_allowed;
-	up(&cpuset_sem);
+	up(&callback_sem);
 
 	return cpulist_scnprintf(page, PAGE_SIZE, mask);
 }
@@ -901,9 +1022,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
 {
 	nodemask_t mask;
 
-	down(&cpuset_sem);
+	down(&callback_sem);
 	mask = cs->mems_allowed;
-	up(&cpuset_sem);
+	up(&callback_sem);
 
 	return nodelist_scnprintf(page, PAGE_SIZE, mask);
 }
@@ -917,8 +1038,6 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 	char *page;
 	ssize_t retval = 0;
 	char *s;
-	char *start;
-	size_t n;
 
 	if (!(page = (char *)__get_free_page(GFP_KERNEL)))
 		return -ENOMEM;
@@ -946,12 +1065,8 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 		goto out;
 	}
 	*s++ = '\n';
-	*s = '\0';
 
-	start = page + *ppos;
-	n = s - start;
-	retval = n - copy_to_user(buf, start, min(n, nbytes));
-	*ppos += retval;
+	retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
 out:
 	free_page((unsigned long)page);
 	return retval;
@@ -1002,6 +1117,21 @@ static int cpuset_file_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+/*
+ * cpuset_rename - Only allow simple rename of directories in place.
+ */
+static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry,
+                  struct inode *new_dir, struct dentry *new_dentry)
+{
+	if (!S_ISDIR(old_dentry->d_inode->i_mode))
+		return -ENOTDIR;
+	if (new_dentry->d_inode)
+		return -EEXIST;
+	if (old_dir != new_dir)
+		return -EIO;
+	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
 static struct file_operations cpuset_file_operations = {
 	.read = cpuset_file_read,
 	.write = cpuset_file_write,
@@ -1014,6 +1144,7 @@ static struct inode_operations cpuset_dir_inode_operations = {
 	.lookup = simple_lookup,
 	.mkdir = cpuset_mkdir,
 	.rmdir = cpuset_rmdir,
+	.rename = cpuset_rename,
 };
 
 static int cpuset_create_file(struct dentry *dentry, int mode)
@@ -1117,7 +1248,9 @@ struct ctr_struct {
 
 /*
  * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'.
- * Return actual number of pids loaded.
+ * Return actual number of pids loaded.  No need to task_lock(p)
+ * when reading out p->cpuset, as we don't really care if it changes
+ * on the next cycle, and we are not going to try to dereference it.
  */
 static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs)
 {
@@ -1159,6 +1292,12 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
 	return cnt;
 }
 
+/*
+ * Handle an open on 'tasks' file.  Prepare a buffer listing the
+ * process id's of tasks currently attached to the cpuset being opened.
+ *
+ * Does not require any specific cpuset semaphores, and does not take any.
+ */
 static int cpuset_tasks_open(struct inode *unused, struct file *file)
 {
 	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
@@ -1306,7 +1445,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	if (!cs)
 		return -ENOMEM;
 
-	down(&cpuset_sem);
+	down(&manage_sem);
 	refresh_mems();
 	cs->flags = 0;
 	if (notify_on_release(parent))
@@ -1321,25 +1460,27 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 
 	cs->parent = parent;
 
+	down(&callback_sem);
 	list_add(&cs->sibling, &cs->parent->children);
+	up(&callback_sem);
 
 	err = cpuset_create_dir(cs, name, mode);
 	if (err < 0)
 		goto err;
 
 	/*
-	 * Release cpuset_sem before cpuset_populate_dir() because it
+	 * Release manage_sem before cpuset_populate_dir() because it
 	 * will down() this new directory's i_sem and if we race with
 	 * another mkdir, we might deadlock.
 	 */
-	up(&cpuset_sem);
+	up(&manage_sem);
 
 	err = cpuset_populate_dir(cs->dentry);
 	/* If err < 0, we have a half-filled directory - oh well ;) */
 	return 0;
 err:
 	list_del(&cs->sibling);
-	up(&cpuset_sem);
+	up(&manage_sem);
 	kfree(cs);
 	return err;
 }
@@ -1357,33 +1498,37 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	struct cpuset *cs = dentry->d_fsdata;
 	struct dentry *d;
 	struct cpuset *parent;
+	char *pathbuf = NULL;
 
 	/* the vfs holds both inode->i_sem already */
 
-	down(&cpuset_sem);
+	down(&manage_sem);
 	refresh_mems();
 	if (atomic_read(&cs->count) > 0) {
-		up(&cpuset_sem);
+		up(&manage_sem);
 		return -EBUSY;
 	}
 	if (!list_empty(&cs->children)) {
-		up(&cpuset_sem);
+		up(&manage_sem);
 		return -EBUSY;
 	}
 	parent = cs->parent;
+	down(&callback_sem);
 	set_bit(CS_REMOVED, &cs->flags);
 	if (is_cpu_exclusive(cs))
 		update_cpu_domains(cs);
 	list_del(&cs->sibling);	/* delete my sibling from parent->children */
-	if (list_empty(&parent->children))
-		check_for_release(parent);
 	spin_lock(&cs->dentry->d_lock);
 	d = dget(cs->dentry);
 	cs->dentry = NULL;
 	spin_unlock(&d->d_lock);
 	cpuset_d_remove_dir(d);
 	dput(d);
-	up(&cpuset_sem);
+	up(&callback_sem);
+	if (list_empty(&parent->children))
+		check_for_release(parent, &pathbuf);
+	up(&manage_sem);
+	cpuset_release_agent(pathbuf);
 	return 0;
 }
 
@@ -1442,16 +1587,26 @@ void __init cpuset_init_smp(void)
  * cpuset_fork - attach newly forked task to its parents cpuset.
  * @tsk: pointer to task_struct of forking parent process.
  *
- * Description: By default, on fork, a task inherits its
- * parent's cpuset.  The pointer to the shared cpuset is
- * automatically copied in fork.c by dup_task_struct().
- * This cpuset_fork() routine need only increment the usage
- * counter in that cpuset.
+ * Description: A task inherits its parent's cpuset at fork().
+ *
+ * A pointer to the shared cpuset was automatically copied in fork.c
+ * by dup_task_struct().  However, we ignore that copy, since it was
+ * not made under the protection of task_lock(), so might no longer be
+ * a valid cpuset pointer.  attach_task() might have already changed
+ * current->cpuset, allowing the previously referenced cpuset to
+ * be removed and freed.  Instead, we task_lock(current) and copy
+ * its present value of current->cpuset for our freshly forked child.
+ *
+ * At the point that cpuset_fork() is called, 'current' is the parent
+ * task, and the passed argument 'child' points to the child task.
  **/
 
-void cpuset_fork(struct task_struct *tsk)
+void cpuset_fork(struct task_struct *child)
 {
-	atomic_inc(&tsk->cpuset->count);
+	task_lock(current);
+	child->cpuset = current->cpuset;
+	atomic_inc(&child->cpuset->count);
+	task_unlock(current);
 }
 
 /**
@@ -1460,33 +1615,43 @@ void cpuset_fork(struct task_struct *tsk)
  *
  * Description: Detach cpuset from @tsk and release it.
  *
- * Note that cpusets marked notify_on_release force every task
- * in them to take the global cpuset_sem semaphore when exiting.
- * This could impact scaling on very large systems.  Be reluctant
- * to use notify_on_release cpusets where very high task exit
- * scaling is required on large systems.
- *
- * Don't even think about derefencing 'cs' after the cpuset use
- * count goes to zero, except inside a critical section guarded
- * by the cpuset_sem semaphore.  If you don't hold cpuset_sem,
- * then a zero cpuset use count is a license to any other task to
- * nuke the cpuset immediately.
+ * Note that cpusets marked notify_on_release force every task in
+ * them to take the global manage_sem semaphore when exiting.
+ * This could impact scaling on very large systems.  Be reluctant to
+ * use notify_on_release cpusets where very high task exit scaling
+ * is required on large systems.
+ *
+ * Don't even think about derefencing 'cs' after the cpuset use count
+ * goes to zero, except inside a critical section guarded by manage_sem
+ * or callback_sem.   Otherwise a zero cpuset use count is a license to
+ * any other task to nuke the cpuset immediately, via cpuset_rmdir().
+ *
+ * This routine has to take manage_sem, not callback_sem, because
+ * it is holding that semaphore while calling check_for_release(),
+ * which calls kmalloc(), so can't be called holding callback__sem().
+ *
+ * We don't need to task_lock() this reference to tsk->cpuset,
+ * because tsk is already marked PF_EXITING, so attach_task() won't
+ * mess with it.
  **/
 
 void cpuset_exit(struct task_struct *tsk)
 {
 	struct cpuset *cs;
 
-	task_lock(tsk);
+	BUG_ON(!(tsk->flags & PF_EXITING));
+
 	cs = tsk->cpuset;
 	tsk->cpuset = NULL;
-	task_unlock(tsk);
 
 	if (notify_on_release(cs)) {
-		down(&cpuset_sem);
+		char *pathbuf = NULL;
+
+		down(&manage_sem);
 		if (atomic_dec_and_test(&cs->count))
-			check_for_release(cs);
-		up(&cpuset_sem);
+			check_for_release(cs, &pathbuf);
+		up(&manage_sem);
+		cpuset_release_agent(pathbuf);
 	} else {
 		atomic_dec(&cs->count);
 	}
@@ -1506,11 +1671,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
 {
 	cpumask_t mask;
 
-	down(&cpuset_sem);
+	down(&callback_sem);
 	task_lock((struct task_struct *)tsk);
 	guarantee_online_cpus(tsk->cpuset, &mask);
 	task_unlock((struct task_struct *)tsk);
-	up(&cpuset_sem);
+	up(&callback_sem);
 
 	return mask;
 }
@@ -1526,19 +1691,28 @@ void cpuset_init_current_mems_allowed(void)
  * If the current tasks cpusets mems_allowed changed behind our backs,
  * update current->mems_allowed and mems_generation to the new value.
  * Do not call this routine if in_interrupt().
+ *
+ * Call without callback_sem or task_lock() held.  May be called
+ * with or without manage_sem held.  Unless exiting, it will acquire
+ * task_lock().  Also might acquire callback_sem during call to
+ * refresh_mems().
  */
 
 void cpuset_update_current_mems_allowed(void)
 {
-	struct cpuset *cs = current->cpuset;
+	struct cpuset *cs;
+	int need_to_refresh = 0;
 
+	task_lock(current);
+	cs = current->cpuset;
 	if (!cs)
-		return;		/* task is exiting */
-	if (current->cpuset_mems_generation != cs->mems_generation) {
-		down(&cpuset_sem);
+		goto done;
+	if (current->cpuset_mems_generation != cs->mems_generation)
+		need_to_refresh = 1;
+done:
+	task_unlock(current);
+	if (need_to_refresh)
 		refresh_mems();
-		up(&cpuset_sem);
-	}
 }
 
 /**
@@ -1570,23 +1744,135 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 	return 0;
 }
 
+/*
+ * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
+ * ancestor to the specified cpuset.  Call holding callback_sem.
+ * If no ancestor is mem_exclusive (an unusual configuration), then
+ * returns the root cpuset.
+ */
+static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
+{
+	while (!is_mem_exclusive(cs) && cs->parent)
+		cs = cs->parent;
+	return cs;
+}
+
 /**
- * cpuset_zone_allowed - is zone z allowed in current->mems_allowed
- * @z: zone in question
+ * cpuset_zone_allowed - Can we allocate memory on zone z's memory node?
+ * @z: is this zone on an allowed node?
+ * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL)
  *
- * Is zone z allowed in current->mems_allowed, or is
- * the CPU in interrupt context? (zone is always allowed in this case)
- */
-int cpuset_zone_allowed(struct zone *z)
+ * If we're in interrupt, yes, we can always allocate.  If zone
+ * z's node is in our tasks mems_allowed, yes.  If it's not a
+ * __GFP_HARDWALL request and this zone's nodes is in the nearest
+ * mem_exclusive cpuset ancestor to this tasks cpuset, yes.
+ * Otherwise, no.
+ *
+ * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
+ * and do not allow allocations outside the current tasks cpuset.
+ * GFP_KERNEL allocations are not so marked, so can escape to the
+ * nearest mem_exclusive ancestor cpuset.
+ *
+ * Scanning up parent cpusets requires callback_sem.  The __alloc_pages()
+ * routine only calls here with __GFP_HARDWALL bit _not_ set if
+ * it's a GFP_KERNEL allocation, and all nodes in the current tasks
+ * mems_allowed came up empty on the first pass over the zonelist.
+ * So only GFP_KERNEL allocations, if all nodes in the cpuset are
+ * short of memory, might require taking the callback_sem semaphore.
+ *
+ * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
+ * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
+ * hardwall cpusets - no allocation on a node outside the cpuset is
+ * allowed (unless in interrupt, of course).
+ *
+ * The second loop doesn't even call here for GFP_ATOMIC requests
+ * (if the __alloc_pages() local variable 'wait' is set).  That check
+ * and the checks below have the combined affect in the second loop of
+ * the __alloc_pages() routine that:
+ *	in_interrupt - any node ok (current task context irrelevant)
+ *	GFP_ATOMIC   - any node ok
+ *	GFP_KERNEL   - any node in enclosing mem_exclusive cpuset ok
+ *	GFP_USER     - only nodes in current tasks mems allowed ok.
+ **/
+
+int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+{
+	int node;			/* node that zone z is on */
+	const struct cpuset *cs;	/* current cpuset ancestors */
+	int allowed = 1;		/* is allocation in zone z allowed? */
+
+	if (in_interrupt())
+		return 1;
+	node = z->zone_pgdat->node_id;
+	if (node_isset(node, current->mems_allowed))
+		return 1;
+	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
+		return 0;
+
+	/* Not hardwall and node outside mems_allowed: scan up cpusets */
+	down(&callback_sem);
+
+	if (current->flags & PF_EXITING) /* Let dying task have memory */
+		return 1;
+	task_lock(current);
+	cs = nearest_exclusive_ancestor(current->cpuset);
+	task_unlock(current);
+
+	allowed = node_isset(node, cs->mems_allowed);
+	up(&callback_sem);
+	return allowed;
+}
+
+/**
+ * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
+ * @p: pointer to task_struct of some other task.
+ *
+ * Description: Return true if the nearest mem_exclusive ancestor
+ * cpusets of tasks @p and current overlap.  Used by oom killer to
+ * determine if task @p's memory usage might impact the memory
+ * available to the current task.
+ *
+ * Acquires callback_sem - not suitable for calling from a fast path.
+ **/
+
+int cpuset_excl_nodes_overlap(const struct task_struct *p)
 {
-	return in_interrupt() ||
-		node_isset(z->zone_pgdat->node_id, current->mems_allowed);
+	const struct cpuset *cs1, *cs2;	/* my and p's cpuset ancestors */
+	int overlap = 0;		/* do cpusets overlap? */
+
+	down(&callback_sem);
+
+	task_lock(current);
+	if (current->flags & PF_EXITING) {
+		task_unlock(current);
+		goto done;
+	}
+	cs1 = nearest_exclusive_ancestor(current->cpuset);
+	task_unlock(current);
+
+	task_lock((struct task_struct *)p);
+	if (p->flags & PF_EXITING) {
+		task_unlock((struct task_struct *)p);
+		goto done;
+	}
+	cs2 = nearest_exclusive_ancestor(p->cpuset);
+	task_unlock((struct task_struct *)p);
+
+	overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
+done:
+	up(&callback_sem);
+
+	return overlap;
 }
 
 /*
  * proc_cpuset_show()
  *  - Print tasks cpuset path into seq_file.
  *  - Used for /proc/<pid>/cpuset.
+ *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
+ *    doesn't really matter if tsk->cpuset changes after we read it,
+ *    and we take manage_sem, keeping attach_task() from changing it
+ *    anyway.
  */
 
 static int proc_cpuset_show(struct seq_file *m, void *v)
@@ -1601,10 +1887,8 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
 		return -ENOMEM;
 
 	tsk = m->private;
-	down(&cpuset_sem);
-	task_lock(tsk);
+	down(&manage_sem);
 	cs = tsk->cpuset;
-	task_unlock(tsk);
 	if (!cs) {
 		retval = -EINVAL;
 		goto out;
@@ -1616,7 +1900,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
 	seq_puts(m, buf);
 	seq_putc(m, '\n');
 out:
-	up(&cpuset_sem);
+	up(&manage_sem);
 	kfree(buf);
 	return retval;
 }
diff --git a/kernel/exit.c b/kernel/exit.c
index 9d1b10ed0135..537394b25e8d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -368,17 +368,25 @@ EXPORT_SYMBOL(daemonize);
 static inline void close_files(struct files_struct * files)
 {
 	int i, j;
+	struct fdtable *fdt;
 
 	j = 0;
+
+	/*
+	 * It is safe to dereference the fd table without RCU or
+	 * ->file_lock because this is the last reference to the
+	 * files structure.
+	 */
+	fdt = files_fdtable(files);
 	for (;;) {
 		unsigned long set;
 		i = j * __NFDBITS;
-		if (i >= files->max_fdset || i >= files->max_fds)
+		if (i >= fdt->max_fdset || i >= fdt->max_fds)
 			break;
-		set = files->open_fds->fds_bits[j++];
+		set = fdt->open_fds->fds_bits[j++];
 		while (set) {
 			if (set & 1) {
-				struct file * file = xchg(&files->fd[i], NULL);
+				struct file * file = xchg(&fdt->fd[i], NULL);
 				if (file)
 					filp_close(file, files);
 			}
@@ -403,18 +411,22 @@ struct files_struct *get_files_struct(struct task_struct *task)
 
 void fastcall put_files_struct(struct files_struct *files)
 {
+	struct fdtable *fdt;
+
 	if (atomic_dec_and_test(&files->count)) {
 		close_files(files);
 		/*
 		 * Free the fd and fdset arrays if we expanded them.
+		 * If the fdtable was embedded, pass files for freeing
+		 * at the end of the RCU grace period. Otherwise,
+		 * you can free files immediately.
 		 */
-		if (files->fd != &files->fd_array[0])
-			free_fd_array(files->fd, files->max_fds);
-		if (files->max_fdset > __FD_SETSIZE) {
-			free_fdset(files->open_fds, files->max_fdset);
-			free_fdset(files->close_on_exec, files->max_fdset);
-		}
-		kmem_cache_free(files_cachep, files);
+		fdt = files_fdtable(files);
+		if (fdt == &files->fdtab)
+			fdt->free_files = files;
+		else
+			kmem_cache_free(files_cachep, files);
+		free_fdtable(fdt);
 	}
 }
 
@@ -535,7 +547,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced)
 
 	if (p->pdeath_signal)
 		/* We already hold the tasklist_lock here.  */
-		group_send_sig_info(p->pdeath_signal, (void *) 0, p);
+		group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
 
 	/* Move the child from its dying parent to the new one.  */
 	if (unlikely(traced)) {
@@ -579,8 +591,8 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced)
 		int pgrp = process_group(p);
 
 		if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) {
-			__kill_pg_info(SIGHUP, (void *)1, pgrp);
-			__kill_pg_info(SIGCONT, (void *)1, pgrp);
+			__kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp);
+			__kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp);
 		}
 	}
 }
@@ -715,8 +727,8 @@ static void exit_notify(struct task_struct *tsk)
 	    (t->signal->session == tsk->signal->session) &&
 	    will_become_orphaned_pgrp(process_group(tsk), tsk) &&
 	    has_stopped_jobs(process_group(tsk))) {
-		__kill_pg_info(SIGHUP, (void *)1, process_group(tsk));
-		__kill_pg_info(SIGCONT, (void *)1, process_group(tsk));
+		__kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk));
+		__kill_pg_info(SIGCONT, SEND_SIG_PRIV, process_group(tsk));
 	}
 
 	/* Let father know we died 
@@ -771,10 +783,6 @@ static void exit_notify(struct task_struct *tsk)
 	/* If the process is dead, release it - nobody will wait for it */
 	if (state == EXIT_DEAD)
 		release_task(tsk);
-
-	/* PF_DEAD causes final put_task_struct after we schedule. */
-	preempt_disable();
-	tsk->flags |= PF_DEAD;
 }
 
 fastcall NORET_TYPE void do_exit(long code)
@@ -827,10 +835,16 @@ fastcall NORET_TYPE void do_exit(long code)
 				preempt_count());
 
 	acct_update_integrals(tsk);
-	update_mem_hiwater(tsk);
+	if (tsk->mm) {
+		update_hiwater_rss(tsk->mm);
+		update_hiwater_vm(tsk->mm);
+	}
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
-	if (group_dead)
+	if (group_dead) {
+ 		del_timer_sync(&tsk->signal->real_timer);
+		exit_itimers(tsk->signal);
 		acct_process(code);
+	}
 	exit_mm(tsk);
 
 	exit_sem(tsk);
@@ -855,7 +869,11 @@ fastcall NORET_TYPE void do_exit(long code)
 	tsk->mempolicy = NULL;
 #endif
 
-	BUG_ON(!(current->flags & PF_DEAD));
+	/* PF_DEAD causes final put_task_struct after we schedule. */
+	preempt_disable();
+	BUG_ON(tsk->flags & PF_DEAD);
+	tsk->flags |= PF_DEAD;
+
 	schedule();
 	BUG();
 	/* Avoid "noreturn function does return".  */
@@ -1189,7 +1207,7 @@ static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap,
 
 		exit_code = p->exit_code;
 		if (unlikely(!exit_code) ||
-		    unlikely(p->state > TASK_STOPPED))
+		    unlikely(p->state & TASK_TRACED))
 			goto bail_ref;
 		return wait_noreap_copyout(p, pid, uid,
 					   why, (exit_code << 8) | 0x7f,
@@ -1365,6 +1383,15 @@ repeat:
 
 			switch (p->state) {
 			case TASK_TRACED:
+				/*
+				 * When we hit the race with PTRACE_ATTACH,
+				 * we will not report this child.  But the
+				 * race means it has not yet been moved to
+				 * our ptrace_children list, so we need to
+				 * set the flag here to avoid a spurious ECHILD
+				 * when the race happens with the only child.
+				 */
+				flag = 1;
 				if (!my_ptrace_child(p))
 					continue;
 				/*FALLTHROUGH*/
diff --git a/kernel/fork.c b/kernel/fork.c
index b65187f0c74e..8a069612eac3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -35,6 +35,7 @@
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
 #include <linux/futex.h>
+#include <linux/rcupdate.h>
 #include <linux/ptrace.h>
 #include <linux/mount.h>
 #include <linux/audit.h>
@@ -176,41 +177,42 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 
 	/* One for us, one for whoever does the "release_task()" (usually parent) */
 	atomic_set(&tsk->usage,2);
+	atomic_set(&tsk->fs_excl, 0);
 	return tsk;
 }
 
 #ifdef CONFIG_MMU
-static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
+static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 {
-	struct vm_area_struct * mpnt, *tmp, **pprev;
+	struct vm_area_struct *mpnt, *tmp, **pprev;
 	struct rb_node **rb_link, *rb_parent;
 	int retval;
 	unsigned long charge;
 	struct mempolicy *pol;
 
 	down_write(&oldmm->mmap_sem);
-	flush_cache_mm(current->mm);
+	flush_cache_mm(oldmm);
+	down_write(&mm->mmap_sem);
+
 	mm->locked_vm = 0;
 	mm->mmap = NULL;
 	mm->mmap_cache = NULL;
 	mm->free_area_cache = oldmm->mmap_base;
 	mm->cached_hole_size = ~0UL;
 	mm->map_count = 0;
-	set_mm_counter(mm, rss, 0);
-	set_mm_counter(mm, anon_rss, 0);
 	cpus_clear(mm->cpu_vm_mask);
 	mm->mm_rb = RB_ROOT;
 	rb_link = &mm->mm_rb.rb_node;
 	rb_parent = NULL;
 	pprev = &mm->mmap;
 
-	for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
+	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
 		struct file *file;
 
 		if (mpnt->vm_flags & VM_DONTCOPY) {
 			long pages = vma_pages(mpnt);
 			mm->total_vm -= pages;
-			__vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
+			vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
 								-pages);
 			continue;
 		}
@@ -251,12 +253,8 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
 		}
 
 		/*
-		 * Link in the new vma and copy the page table entries:
-		 * link in first so that swapoff can see swap entries.
-		 * Note that, exceptionally, here the vma is inserted
-		 * without holding mm->mmap_sem.
+		 * Link in the new vma and copy the page table entries.
 		 */
-		spin_lock(&mm->page_table_lock);
 		*pprev = tmp;
 		pprev = &tmp->vm_next;
 
@@ -265,8 +263,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
 		rb_parent = &tmp->vm_rb;
 
 		mm->map_count++;
-		retval = copy_page_range(mm, current->mm, tmp);
-		spin_unlock(&mm->page_table_lock);
+		retval = copy_page_range(mm, oldmm, tmp);
 
 		if (tmp->vm_ops && tmp->vm_ops->open)
 			tmp->vm_ops->open(tmp);
@@ -275,9 +272,9 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
 			goto out;
 	}
 	retval = 0;
-
 out:
-	flush_tlb_mm(current->mm);
+	up_write(&mm->mmap_sem);
+	flush_tlb_mm(oldmm);
 	up_write(&oldmm->mmap_sem);
 	return retval;
 fail_nomem_policy:
@@ -321,6 +318,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
 	INIT_LIST_HEAD(&mm->mmlist);
 	mm->core_waiters = 0;
 	mm->nr_ptes = 0;
+	set_mm_counter(mm, file_rss, 0);
+	set_mm_counter(mm, anon_rss, 0);
 	spin_lock_init(&mm->page_table_lock);
 	rwlock_init(&mm->ioctx_list_lock);
 	mm->ioctx_list = NULL;
@@ -497,7 +496,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 	if (retval)
 		goto free_pt;
 
-	mm->hiwater_rss = get_mm_counter(mm,rss);
+	mm->hiwater_rss = get_mm_rss(mm);
 	mm->hiwater_vm = mm->total_vm;
 
 good_mm:
@@ -564,24 +563,53 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
 	return 0;
 }
 
-static int count_open_files(struct files_struct *files, int size)
+static int count_open_files(struct fdtable *fdt)
 {
+	int size = fdt->max_fdset;
 	int i;
 
 	/* Find the last open fd */
 	for (i = size/(8*sizeof(long)); i > 0; ) {
-		if (files->open_fds->fds_bits[--i])
+		if (fdt->open_fds->fds_bits[--i])
 			break;
 	}
 	i = (i+1) * 8 * sizeof(long);
 	return i;
 }
 
+static struct files_struct *alloc_files(void)
+{
+	struct files_struct *newf;
+	struct fdtable *fdt;
+
+	newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
+	if (!newf)
+		goto out;
+
+	atomic_set(&newf->count, 1);
+
+	spin_lock_init(&newf->file_lock);
+	fdt = &newf->fdtab;
+	fdt->next_fd = 0;
+	fdt->max_fds = NR_OPEN_DEFAULT;
+	fdt->max_fdset = __FD_SETSIZE;
+	fdt->close_on_exec = &newf->close_on_exec_init;
+	fdt->open_fds = &newf->open_fds_init;
+	fdt->fd = &newf->fd_array[0];
+	INIT_RCU_HEAD(&fdt->rcu);
+	fdt->free_files = NULL;
+	fdt->next = NULL;
+	rcu_assign_pointer(newf->fdt, fdt);
+out:
+	return newf;
+}
+
 static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 {
 	struct files_struct *oldf, *newf;
 	struct file **old_fds, **new_fds;
 	int open_files, size, i, error = 0, expand;
+	struct fdtable *old_fdt, *new_fdt;
 
 	/*
 	 * A background process may not have any files ...
@@ -602,35 +630,27 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 	 */
 	tsk->files = NULL;
 	error = -ENOMEM;
-	newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
-	if (!newf) 
+	newf = alloc_files();
+	if (!newf)
 		goto out;
 
-	atomic_set(&newf->count, 1);
-
-	spin_lock_init(&newf->file_lock);
-	newf->next_fd	    = 0;
-	newf->max_fds	    = NR_OPEN_DEFAULT;
-	newf->max_fdset	    = __FD_SETSIZE;
-	newf->close_on_exec = &newf->close_on_exec_init;
-	newf->open_fds	    = &newf->open_fds_init;
-	newf->fd	    = &newf->fd_array[0];
-
 	spin_lock(&oldf->file_lock);
-
-	open_files = count_open_files(oldf, oldf->max_fdset);
+	old_fdt = files_fdtable(oldf);
+	new_fdt = files_fdtable(newf);
+	size = old_fdt->max_fdset;
+	open_files = count_open_files(old_fdt);
 	expand = 0;
 
 	/*
 	 * Check whether we need to allocate a larger fd array or fd set.
 	 * Note: we're not a clone task, so the open count won't  change.
 	 */
-	if (open_files > newf->max_fdset) {
-		newf->max_fdset = 0;
+	if (open_files > new_fdt->max_fdset) {
+		new_fdt->max_fdset = 0;
 		expand = 1;
 	}
-	if (open_files > newf->max_fds) {
-		newf->max_fds = 0;
+	if (open_files > new_fdt->max_fds) {
+		new_fdt->max_fds = 0;
 		expand = 1;
 	}
 
@@ -642,14 +662,21 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 		spin_unlock(&newf->file_lock);
 		if (error < 0)
 			goto out_release;
+		new_fdt = files_fdtable(newf);
+		/*
+		 * Reacquire the oldf lock and a pointer to its fd table
+		 * who knows it may have a new bigger fd table. We need
+		 * the latest pointer.
+		 */
 		spin_lock(&oldf->file_lock);
+		old_fdt = files_fdtable(oldf);
 	}
 
-	old_fds = oldf->fd;
-	new_fds = newf->fd;
+	old_fds = old_fdt->fd;
+	new_fds = new_fdt->fd;
 
-	memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
-	memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);
+	memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8);
+	memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8);
 
 	for (i = open_files; i != 0; i--) {
 		struct file *f = *old_fds++;
@@ -662,24 +689,24 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 			 * is partway through open().  So make sure that this
 			 * fd is available to the new process.
 			 */
-			FD_CLR(open_files - i, newf->open_fds);
+			FD_CLR(open_files - i, new_fdt->open_fds);
 		}
-		*new_fds++ = f;
+		rcu_assign_pointer(*new_fds++, f);
 	}
 	spin_unlock(&oldf->file_lock);
 
 	/* compute the remainder to be cleared */
-	size = (newf->max_fds - open_files) * sizeof(struct file *);
+	size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
 
 	/* This is long word aligned thus could use a optimized version */ 
 	memset(new_fds, 0, size); 
 
-	if (newf->max_fdset > open_files) {
-		int left = (newf->max_fdset-open_files)/8;
+	if (new_fdt->max_fdset > open_files) {
+		int left = (new_fdt->max_fdset-open_files)/8;
 		int start = open_files / (8 * sizeof(unsigned long));
 
-		memset(&newf->open_fds->fds_bits[start], 0, left);
-		memset(&newf->close_on_exec->fds_bits[start], 0, left);
+		memset(&new_fdt->open_fds->fds_bits[start], 0, left);
+		memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
 	}
 
 	tsk->files = newf;
@@ -688,9 +715,9 @@ out:
 	return error;
 
 out_release:
-	free_fdset (newf->close_on_exec, newf->max_fdset);
-	free_fdset (newf->open_fds, newf->max_fdset);
-	free_fd_array(newf->fd, newf->max_fds);
+	free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset);
+	free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
+	free_fd_array(new_fdt->fd, new_fdt->max_fds);
 	kmem_cache_free(files_cachep, newf);
 	goto out;
 }
@@ -818,7 +845,7 @@ static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
 {
 	unsigned long new_flags = p->flags;
 
-	new_flags &= ~PF_SUPERPRIV;
+	new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE);
 	new_flags |= PF_FORKNOEXEC;
 	if (!(clone_flags & CLONE_PTRACE))
 		p->ptrace = 0;
@@ -994,6 +1021,9 @@ static task_t *copy_process(unsigned long clone_flags,
 	 * of CLONE_PTRACE.
 	 */
 	clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
+#ifdef TIF_SYSCALL_EMU
+	clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
+#endif
 
 	/* Our parent execution domain becomes current domain
 	   These must match for thread signalling to apply */
@@ -1029,7 +1059,8 @@ static task_t *copy_process(unsigned long clone_flags,
 	 * parent's CPU). This avoids alot of nasty races.
 	 */
 	p->cpus_allowed = current->cpus_allowed;
-	if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed)))
+	if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
+			!cpu_online(task_cpu(p))))
 		set_task_cpu(p, smp_processor_id());
 
 	/*
@@ -1112,6 +1143,9 @@ static task_t *copy_process(unsigned long clone_flags,
 			__get_cpu_var(process_counts)++;
 	}
 
+	if (!current->signal->tty && p->signal->tty)
+		p->signal->tty = NULL;
+
 	nr_threads++;
 	total_forks++;
 	write_unlock_irq(&tasklist_lock);
diff --git a/kernel/futex.c b/kernel/futex.c
index c7130f86106c..3b4d5ad44cc6 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -40,6 +40,7 @@
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
+#include <asm/futex.h>
 
 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
 
@@ -204,15 +205,13 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
 	/*
 	 * Do a quick atomic lookup first - this is the fastpath.
 	 */
-	spin_lock(&current->mm->page_table_lock);
-	page = follow_page(mm, uaddr, 0);
+	page = follow_page(mm, uaddr, FOLL_TOUCH|FOLL_GET);
 	if (likely(page != NULL)) {
 		key->shared.pgoff =
 			page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-		spin_unlock(&current->mm->page_table_lock);
+		put_page(page);
 		return 0;
 	}
-	spin_unlock(&current->mm->page_table_lock);
 
 	/*
 	 * Do it the general way.
@@ -327,6 +326,118 @@ out:
 }
 
 /*
+ * Wake up all waiters hashed on the physical page that is mapped
+ * to this virtual address:
+ */
+static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op)
+{
+	union futex_key key1, key2;
+	struct futex_hash_bucket *bh1, *bh2;
+	struct list_head *head;
+	struct futex_q *this, *next;
+	int ret, op_ret, attempt = 0;
+
+retryfull:
+	down_read(&current->mm->mmap_sem);
+
+	ret = get_futex_key(uaddr1, &key1);
+	if (unlikely(ret != 0))
+		goto out;
+	ret = get_futex_key(uaddr2, &key2);
+	if (unlikely(ret != 0))
+		goto out;
+
+	bh1 = hash_futex(&key1);
+	bh2 = hash_futex(&key2);
+
+retry:
+	if (bh1 < bh2)
+		spin_lock(&bh1->lock);
+	spin_lock(&bh2->lock);
+	if (bh1 > bh2)
+		spin_lock(&bh1->lock);
+
+	op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2);
+	if (unlikely(op_ret < 0)) {
+		int dummy;
+
+		spin_unlock(&bh1->lock);
+		if (bh1 != bh2)
+			spin_unlock(&bh2->lock);
+
+		/* futex_atomic_op_inuser needs to both read and write
+		 * *(int __user *)uaddr2, but we can't modify it
+		 * non-atomically.  Therefore, if get_user below is not
+		 * enough, we need to handle the fault ourselves, while
+		 * still holding the mmap_sem.  */
+		if (attempt++) {
+			struct vm_area_struct * vma;
+			struct mm_struct *mm = current->mm;
+
+			ret = -EFAULT;
+			if (attempt >= 2 ||
+			    !(vma = find_vma(mm, uaddr2)) ||
+			    vma->vm_start > uaddr2 ||
+			    !(vma->vm_flags & VM_WRITE))
+				goto out;
+
+			switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
+			case VM_FAULT_MINOR:
+				current->min_flt++;
+				break;
+			case VM_FAULT_MAJOR:
+				current->maj_flt++;
+				break;
+			default:
+				goto out;
+			}
+			goto retry;
+		}
+
+		/* If we would have faulted, release mmap_sem,
+		 * fault it in and start all over again.  */
+		up_read(&current->mm->mmap_sem);
+
+		ret = get_user(dummy, (int __user *)uaddr2);
+		if (ret)
+			return ret;
+
+		goto retryfull;
+	}
+
+	head = &bh1->chain;
+
+	list_for_each_entry_safe(this, next, head, list) {
+		if (match_futex (&this->key, &key1)) {
+			wake_futex(this);
+			if (++ret >= nr_wake)
+				break;
+		}
+	}
+
+	if (op_ret > 0) {
+		head = &bh2->chain;
+
+		op_ret = 0;
+		list_for_each_entry_safe(this, next, head, list) {
+			if (match_futex (&this->key, &key2)) {
+				wake_futex(this);
+				if (++op_ret >= nr_wake2)
+					break;
+			}
+		}
+		ret += op_ret;
+	}
+
+	spin_unlock(&bh1->lock);
+	if (bh1 != bh2)
+		spin_unlock(&bh2->lock);
+out:
+	up_read(&current->mm->mmap_sem);
+	return ret;
+}
+
+/*
  * Requeue all waiters hashed on one physical page to another
  * physical page.
  */
@@ -673,23 +784,17 @@ static int futex_fd(unsigned long uaddr, int signal)
 	filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
 
 	if (signal) {
-		int err;
 		err = f_setown(filp, current->pid, 1);
 		if (err < 0) {
-			put_unused_fd(ret);
-			put_filp(filp);
-			ret = err;
-			goto out;
+			goto error;
 		}
 		filp->f_owner.signum = signal;
 	}
 
 	q = kmalloc(sizeof(*q), GFP_KERNEL);
 	if (!q) {
-		put_unused_fd(ret);
-		put_filp(filp);
-		ret = -ENOMEM;
-		goto out;
+		err = -ENOMEM;
+		goto error;
 	}
 
 	down_read(&current->mm->mmap_sem);
@@ -697,10 +802,8 @@ static int futex_fd(unsigned long uaddr, int signal)
 
 	if (unlikely(err != 0)) {
 		up_read(&current->mm->mmap_sem);
-		put_unused_fd(ret);
-		put_filp(filp);
 		kfree(q);
-		return err;
+		goto error;
 	}
 
 	/*
@@ -716,6 +819,11 @@ static int futex_fd(unsigned long uaddr, int signal)
 	fd_install(ret, filp);
 out:
 	return ret;
+error:
+	put_unused_fd(ret);
+	put_filp(filp);
+	ret = err;
+	goto out;
 }
 
 long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
@@ -740,6 +848,9 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
 	case FUTEX_CMP_REQUEUE:
 		ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
 		break;
+	case FUTEX_WAKE_OP:
+		ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
+		break;
 	default:
 		ret = -ENOSYS;
 	}
diff --git a/kernel/intermodule.c b/kernel/intermodule.c
index 388977f3e9b7..0cbe633420fb 100644
--- a/kernel/intermodule.c
+++ b/kernel/intermodule.c
@@ -39,7 +39,7 @@ void inter_module_register(const char *im_name, struct module *owner, const void
 	struct list_head *tmp;
 	struct inter_module_entry *ime, *ime_new;
 
-	if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) {
+	if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) {
 		/* Overloaded kernel, not fatal */
 		printk(KERN_ERR
 			"Aiee, inter_module_register: cannot kmalloc entry for '%s'\n",
@@ -47,7 +47,6 @@ void inter_module_register(const char *im_name, struct module *owner, const void
 		kmalloc_failed = 1;
 		return;
 	}
-	memset(ime_new, 0, sizeof(*ime_new));
 	ime_new->im_name = im_name;
 	ime_new->owner = owner;
 	ime_new->userdata = userdata;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c29f83c16497..51df337b37db 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -111,20 +111,22 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
 	unsigned int status;
 
 	kstat_this_cpu.irqs[irq]++;
-	if (desc->status & IRQ_PER_CPU) {
+	if (CHECK_IRQ_PER_CPU(desc->status)) {
 		irqreturn_t action_ret;
 
 		/*
 		 * No locking required for CPU-local interrupts:
 		 */
-		desc->handler->ack(irq);
+		if (desc->handler->ack)
+			desc->handler->ack(irq);
 		action_ret = handle_IRQ_event(irq, regs, desc->action);
 		desc->handler->end(irq);
 		return 1;
 	}
 
 	spin_lock(&desc->lock);
-	desc->handler->ack(irq);
+	if (desc->handler->ack)
+		desc->handler->ack(irq);
 	/*
 	 * REPLAY is when Linux resends an IRQ that was dropped earlier
 	 * WAITING is used by probe to mark irqs that are being tested
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ac6700985705..1cfdb08ddf20 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -18,6 +18,10 @@
 
 cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
 
+#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
+cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
+#endif
+
 /**
  *	synchronize_irq - wait for pending IRQ handlers (on other CPUs)
  *
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 85d08daa6600..f26e534c6585 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,12 +19,22 @@ static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
  */
 static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
 
-void __attribute__((weak))
-proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
+{
+	/*
+	 * Save these away for later use. Re-progam when the
+	 * interrupt is pending
+	 */
+	set_pending_irq(irq, mask_val);
+}
+#else
+void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
 {
 	irq_affinity[irq] = mask_val;
 	irq_desc[irq].handler->set_affinity(irq, mask_val);
 }
+#endif
 
 static int irq_affinity_read_proc(char *page, char **start, off_t off,
 				  int count, int *eof, void *data)
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 13bcec151b57..39277dd6bf90 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -18,6 +18,7 @@
 #include <linux/fs.h>
 #include <linux/err.h>
 #include <linux/proc_fs.h>
+#include <linux/sched.h>	/* for cond_resched */
 #include <linux/mm.h>
 
 #include <asm/sections.h>
diff --git a/kernel/kexec.c b/kernel/kexec.c
index cdd4dcd8fb63..2c95848fbce8 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -90,7 +90,7 @@ int kexec_should_crash(struct task_struct *p)
 static int kimage_is_destination_range(struct kimage *image,
 				       unsigned long start, unsigned long end);
 static struct page *kimage_alloc_page(struct kimage *image,
-				       unsigned int gfp_mask,
+				       gfp_t gfp_mask,
 				       unsigned long dest);
 
 static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
@@ -326,8 +326,7 @@ static int kimage_is_destination_range(struct kimage *image,
 	return 0;
 }
 
-static struct page *kimage_alloc_pages(unsigned int gfp_mask,
-					unsigned int order)
+static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
 {
 	struct page *pages;
 
@@ -335,7 +334,7 @@ static struct page *kimage_alloc_pages(unsigned int gfp_mask,
 	if (pages) {
 		unsigned int count, i;
 		pages->mapping = NULL;
-		pages->private = order;
+		set_page_private(pages, order);
 		count = 1 << order;
 		for (i = 0; i < count; i++)
 			SetPageReserved(pages + i);
@@ -348,7 +347,7 @@ static void kimage_free_pages(struct page *page)
 {
 	unsigned int order, count, i;
 
-	order = page->private;
+	order = page_private(page);
 	count = 1 << order;
 	for (i = 0; i < count; i++)
 		ClearPageReserved(page + i);
@@ -654,7 +653,7 @@ static kimage_entry_t *kimage_dst_used(struct kimage *image,
 }
 
 static struct page *kimage_alloc_page(struct kimage *image,
-					unsigned int gfp_mask,
+					gfp_t gfp_mask,
 					unsigned long destination)
 {
 	/*
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 179baafcdd96..64ab045c3d9d 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -36,7 +36,7 @@
  * struct kfifo with kfree().
  */
 struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size,
-			 unsigned int __nocast gfp_mask, spinlock_t *lock)
+			 gfp_t gfp_mask, spinlock_t *lock)
 {
 	struct kfifo *fifo;
 
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(kfifo_init);
  *
  * The size will be rounded-up to a power of 2.
  */
-struct kfifo *kfifo_alloc(unsigned int size, unsigned int __nocast gfp_mask, spinlock_t *lock)
+struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
 {
 	unsigned char *buffer;
 	struct kfifo *ret;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 44166e3bb8af..51a892063aaa 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -131,14 +131,14 @@ struct subprocess_info {
 static int ____call_usermodehelper(void *data)
 {
 	struct subprocess_info *sub_info = data;
-	struct key *old_session;
+	struct key *new_session, *old_session;
 	int retval;
 
 	/* Unblock all signals and set the session keyring. */
-	key_get(sub_info->ring);
+	new_session = key_get(sub_info->ring);
 	flush_signals(current);
 	spin_lock_irq(&current->sighand->siglock);
-	old_session = __install_session_keyring(current, sub_info->ring);
+	old_session = __install_session_keyring(current, new_session);
 	flush_signal_handlers(current, 1);
 	sigemptyset(&current->blocked);
 	recalc_sigpending();
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b0237122b24e..ce4915dd683a 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -35,8 +35,10 @@
 #include <linux/spinlock.h>
 #include <linux/hash.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/moduleloader.h>
+#include <asm-generic/sections.h>
 #include <asm/cacheflush.h>
 #include <asm/errno.h>
 #include <asm/kdebug.h>
@@ -72,7 +74,7 @@ static struct hlist_head kprobe_insn_pages;
  * get_insn_slot() - Find a slot on an executable page for an instruction.
  * We allocate an executable page if there's no room on existing ones.
  */
-kprobe_opcode_t *get_insn_slot(void)
+kprobe_opcode_t __kprobes *get_insn_slot(void)
 {
 	struct kprobe_insn_page *kip;
 	struct hlist_node *pos;
@@ -117,7 +119,7 @@ kprobe_opcode_t *get_insn_slot(void)
 	return kip->insns;
 }
 
-void free_insn_slot(kprobe_opcode_t *slot)
+void __kprobes free_insn_slot(kprobe_opcode_t *slot)
 {
 	struct kprobe_insn_page *kip;
 	struct hlist_node *pos;
@@ -152,20 +154,42 @@ void free_insn_slot(kprobe_opcode_t *slot)
 }
 
 /* Locks kprobe: irqs must be disabled */
-void lock_kprobes(void)
+void __kprobes lock_kprobes(void)
 {
+	unsigned long flags = 0;
+
+	/* Avoiding local interrupts to happen right after we take the kprobe_lock
+	 * and before we get a chance to update kprobe_cpu, this to prevent
+	 * deadlock when we have a kprobe on ISR routine and a kprobe on task
+	 * routine
+	 */
+	local_irq_save(flags);
+
 	spin_lock(&kprobe_lock);
 	kprobe_cpu = smp_processor_id();
+
+ 	local_irq_restore(flags);
 }
 
-void unlock_kprobes(void)
+void __kprobes unlock_kprobes(void)
 {
+	unsigned long flags = 0;
+
+	/* Avoiding local interrupts to happen right after we update
+	 * kprobe_cpu and before we get a a chance to release kprobe_lock,
+	 * this to prevent deadlock when we have a kprobe on ISR routine and
+	 * a kprobe on task routine
+	 */
+	local_irq_save(flags);
+
 	kprobe_cpu = NR_CPUS;
 	spin_unlock(&kprobe_lock);
+
+ 	local_irq_restore(flags);
 }
 
 /* You have to be holding the kprobe_lock */
-struct kprobe *get_kprobe(void *addr)
+struct kprobe __kprobes *get_kprobe(void *addr)
 {
 	struct hlist_head *head;
 	struct hlist_node *node;
@@ -183,7 +207,7 @@ struct kprobe *get_kprobe(void *addr)
  * Aggregate handlers for multiple kprobes support - these handlers
  * take care of invoking the individual kprobe handlers on p->list
  */
-static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
+static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct kprobe *kp;
 
@@ -198,8 +222,8 @@ static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
 	return 0;
 }
 
-static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
-			      unsigned long flags)
+static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
+					unsigned long flags)
 {
 	struct kprobe *kp;
 
@@ -213,8 +237,8 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
 	return;
 }
 
-static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
-			      int trapnr)
+static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
+					int trapnr)
 {
 	/*
 	 * if we faulted "during" the execution of a user specified
@@ -227,7 +251,7 @@ static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
 	return 0;
 }
 
-static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
+static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct kprobe *kp = curr_kprobe;
 	if (curr_kprobe && kp->break_handler) {
@@ -240,7 +264,7 @@ static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
 	return 0;
 }
 
-struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp)
+struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
 {
 	struct hlist_node *node;
 	struct kretprobe_instance *ri;
@@ -249,7 +273,8 @@ struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp)
 	return NULL;
 }
 
-static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp)
+static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
+							      *rp)
 {
 	struct hlist_node *node;
 	struct kretprobe_instance *ri;
@@ -258,7 +283,7 @@ static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp)
 	return NULL;
 }
 
-void add_rp_inst(struct kretprobe_instance *ri)
+void __kprobes add_rp_inst(struct kretprobe_instance *ri)
 {
 	/*
 	 * Remove rp inst off the free list -
@@ -276,7 +301,7 @@ void add_rp_inst(struct kretprobe_instance *ri)
 	hlist_add_head(&ri->uflist, &ri->rp->used_instances);
 }
 
-void recycle_rp_inst(struct kretprobe_instance *ri)
+void __kprobes recycle_rp_inst(struct kretprobe_instance *ri)
 {
 	/* remove rp inst off the rprobe_inst_table */
 	hlist_del(&ri->hlist);
@@ -291,7 +316,7 @@ void recycle_rp_inst(struct kretprobe_instance *ri)
 		kfree(ri);
 }
 
-struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk)
+struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
 {
 	return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
 }
@@ -302,7 +327,7 @@ struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk)
  * instances associated with this task. These left over instances represent
  * probed functions that have been called but will never return.
  */
-void kprobe_flush_task(struct task_struct *tk)
+void __kprobes kprobe_flush_task(struct task_struct *tk)
 {
         struct kretprobe_instance *ri;
         struct hlist_head *head;
@@ -322,7 +347,8 @@ void kprobe_flush_task(struct task_struct *tk)
  * This kprobe pre_handler is registered with every kretprobe. When probe
  * hits it will set up the return probe.
  */
-static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
+static int __kprobes pre_handler_kretprobe(struct kprobe *p,
+					   struct pt_regs *regs)
 {
 	struct kretprobe *rp = container_of(p, struct kretprobe, kp);
 
@@ -353,7 +379,7 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
 * Add the new probe to old_p->list. Fail if this is the
 * second jprobe at the address - two jprobes can't coexist
 */
-static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
+static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
 {
         struct kprobe *kp;
 
@@ -395,7 +421,8 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
  * the intricacies
  * TODO: Move kcalloc outside the spinlock
  */
-static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p)
+static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
+					  struct kprobe *p)
 {
 	int ret = 0;
 	struct kprobe *ap;
@@ -434,15 +461,25 @@ static inline void cleanup_aggr_kprobe(struct kprobe *old_p,
 		spin_unlock_irqrestore(&kprobe_lock, flags);
 }
 
-int register_kprobe(struct kprobe *p)
+static int __kprobes in_kprobes_functions(unsigned long addr)
+{
+	if (addr >= (unsigned long)__kprobes_text_start
+		&& addr < (unsigned long)__kprobes_text_end)
+		return -EINVAL;
+	return 0;
+}
+
+int __kprobes register_kprobe(struct kprobe *p)
 {
 	int ret = 0;
 	unsigned long flags = 0;
 	struct kprobe *old_p;
 
-	if ((ret = arch_prepare_kprobe(p)) != 0) {
+	if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0)
+		return ret;
+	if ((ret = arch_prepare_kprobe(p)) != 0)
 		goto rm_kprobe;
-	}
+
 	spin_lock_irqsave(&kprobe_lock, flags);
 	old_p = get_kprobe(p->addr);
 	p->nmissed = 0;
@@ -466,7 +503,7 @@ rm_kprobe:
 	return ret;
 }
 
-void unregister_kprobe(struct kprobe *p)
+void __kprobes unregister_kprobe(struct kprobe *p)
 {
 	unsigned long flags;
 	struct kprobe *old_p;
@@ -487,7 +524,7 @@ static struct notifier_block kprobe_exceptions_nb = {
 	.priority = 0x7fffffff /* we need to notified first */
 };
 
-int register_jprobe(struct jprobe *jp)
+int __kprobes register_jprobe(struct jprobe *jp)
 {
 	/* Todo: Verify probepoint is a function entry point */
 	jp->kp.pre_handler = setjmp_pre_handler;
@@ -496,14 +533,14 @@ int register_jprobe(struct jprobe *jp)
 	return register_kprobe(&jp->kp);
 }
 
-void unregister_jprobe(struct jprobe *jp)
+void __kprobes unregister_jprobe(struct jprobe *jp)
 {
 	unregister_kprobe(&jp->kp);
 }
 
 #ifdef ARCH_SUPPORTS_KRETPROBES
 
-int register_kretprobe(struct kretprobe *rp)
+int __kprobes register_kretprobe(struct kretprobe *rp)
 {
 	int ret = 0;
 	struct kretprobe_instance *inst;
@@ -540,14 +577,14 @@ int register_kretprobe(struct kretprobe *rp)
 
 #else /* ARCH_SUPPORTS_KRETPROBES */
 
-int register_kretprobe(struct kretprobe *rp)
+int __kprobes register_kretprobe(struct kretprobe *rp)
 {
 	return -ENOSYS;
 }
 
 #endif /* ARCH_SUPPORTS_KRETPROBES */
 
-void unregister_kretprobe(struct kretprobe *rp)
+void __kprobes unregister_kretprobe(struct kretprobe *rp)
 {
 	unsigned long flags;
 	struct kretprobe_instance *ri;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index f50f174e92da..e75950a1092c 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -165,6 +165,12 @@ EXPORT_SYMBOL(kthread_bind);
 
 int kthread_stop(struct task_struct *k)
 {
+	return kthread_stop_sem(k, NULL);
+}
+EXPORT_SYMBOL(kthread_stop);
+
+int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
+{
 	int ret;
 
 	down(&kthread_stop_lock);
@@ -178,7 +184,10 @@ int kthread_stop(struct task_struct *k)
 
 	/* Now set kthread_should_stop() to true, and wake it up. */
 	kthread_stop_info.k = k;
-	wake_up_process(k);
+	if (s)
+		up(s);
+	else
+		wake_up_process(k);
 	put_task_struct(k);
 
 	/* Once it dies, reset stop ptr, gather result and we're done. */
@@ -189,7 +198,7 @@ int kthread_stop(struct task_struct *k)
 
 	return ret;
 }
-EXPORT_SYMBOL(kthread_stop);
+EXPORT_SYMBOL(kthread_stop_sem);
 
 static __init int helper_init(void)
 {
diff --git a/kernel/module.c b/kernel/module.c
index c32995fbd8fd..ff5c500ab625 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -20,6 +20,7 @@
 #include <linux/module.h>
 #include <linux/moduleloader.h>
 #include <linux/init.h>
+#include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/elf.h>
@@ -498,7 +499,7 @@ static inline int try_force(unsigned int flags)
 {
 	int ret = (flags & O_TRUNC);
 	if (ret)
-		tainted |= TAINT_FORCED_MODULE;
+		add_taint(TAINT_FORCED_MODULE);
 	return ret;
 }
 #else
@@ -897,7 +898,7 @@ static int check_version(Elf_Shdr *sechdrs,
 	if (!(tainted & TAINT_FORCED_MODULE)) {
 		printk("%s: no version for \"%s\" found: kernel tainted.\n",
 		       mod->name, symname);
-		tainted |= TAINT_FORCED_MODULE;
+		add_taint(TAINT_FORCED_MODULE);
 	}
 	return 1;
 }
@@ -1352,7 +1353,7 @@ static void set_license(struct module *mod, const char *license)
 	if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) {
 		printk(KERN_WARNING "%s: module license '%s' taints kernel.\n",
 		       mod->name, license);
-		tainted |= TAINT_PROPRIETARY_MODULE;
+		add_taint(TAINT_PROPRIETARY_MODULE);
 	}
 }
 
@@ -1509,6 +1510,7 @@ static struct module *load_module(void __user *umod,
 	long err = 0;
 	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
 	struct exception_table_entry *extable;
+	mm_segment_t old_fs;
 
 	DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
 	       umod, len, uargs);
@@ -1609,7 +1611,7 @@ static struct module *load_module(void __user *umod,
 	modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
 	/* This is allowed: modprobe --force will invalidate it. */
 	if (!modmagic) {
-		tainted |= TAINT_FORCED_MODULE;
+		add_taint(TAINT_FORCED_MODULE);
 		printk(KERN_WARNING "%s: no version magic, tainting kernel.\n",
 		       mod->name);
 	} else if (!same_magic(modmagic, vermagic)) {
@@ -1738,7 +1740,7 @@ static struct module *load_module(void __user *umod,
 	    (mod->num_gpl_syms && !gplcrcindex)) {
 		printk(KERN_WARNING "%s: No versions for exported symbols."
 		       " Tainting kernel.\n", mod->name);
-		tainted |= TAINT_FORCED_MODULE;
+		add_taint(TAINT_FORCED_MODULE);
 	}
 #endif
 
@@ -1779,6 +1781,24 @@ static struct module *load_module(void __user *umod,
 	if (err < 0)
 		goto cleanup;
 
+	/* flush the icache in correct context */
+	old_fs = get_fs();
+	set_fs(KERNEL_DS);
+
+	/*
+	 * Flush the instruction cache, since we've played with text.
+	 * Do it before processing of module parameters, so the module
+	 * can provide parameter accessor functions of its own.
+	 */
+	if (mod->module_init)
+		flush_icache_range((unsigned long)mod->module_init,
+				   (unsigned long)mod->module_init
+				   + mod->init_size);
+	flush_icache_range((unsigned long)mod->module_core,
+			   (unsigned long)mod->module_core + mod->core_size);
+
+	set_fs(old_fs);
+
 	mod->args = args;
 	if (obsparmindex) {
 		err = obsolete_params(mod->name, mod->args,
@@ -1860,7 +1880,6 @@ sys_init_module(void __user *umod,
 		const char __user *uargs)
 {
 	struct module *mod;
-	mm_segment_t old_fs = get_fs();
 	int ret = 0;
 
 	/* Must have permission */
@@ -1878,19 +1897,6 @@ sys_init_module(void __user *umod,
 		return PTR_ERR(mod);
 	}
 
-	/* flush the icache in correct context */
-	set_fs(KERNEL_DS);
-
-	/* Flush the instruction cache, since we've played with text */
-	if (mod->module_init)
-		flush_icache_range((unsigned long)mod->module_init,
-				   (unsigned long)mod->module_init
-				   + mod->init_size);
-	flush_icache_range((unsigned long)mod->module_core,
-			   (unsigned long)mod->module_core + mod->core_size);
-
-	set_fs(old_fs);
-
 	/* Now sew it into the lists.  They won't access us, since
            strong_try_module_get() will fail. */
 	stop_machine_run(__link_module, mod, NR_CPUS);
diff --git a/kernel/params.c b/kernel/params.c
index d586c35ef8fc..47ba69547945 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -23,6 +23,7 @@
 #include <linux/module.h>
 #include <linux/device.h>
 #include <linux/err.h>
+#include <linux/slab.h>
 
 #if 0
 #define DEBUGP printk
@@ -80,8 +81,6 @@ static char *next_arg(char *args, char **param, char **val)
 	int in_quote = 0, quoted = 0;
 	char *next;
 
-	/* Chew any extra spaces */
-	while (*args == ' ') args++;
 	if (*args == '"') {
 		args++;
 		in_quote = 1;
@@ -121,6 +120,10 @@ static char *next_arg(char *args, char **param, char **val)
 		next = args + i + 1;
 	} else
 		next = args + i;
+
+	/* Chew up trailing spaces. */
+	while (*next == ' ')
+		next++;
 	return next;
 }
 
@@ -135,6 +138,10 @@ int parse_args(const char *name,
 
 	DEBUGP("Parsing ARGS: %s\n", args);
 
+	/* Chew leading spaces */
+	while (*args == ' ')
+		args++;
+
 	while (*args) {
 		int ret;
 
@@ -542,8 +549,8 @@ static void __init kernel_param_sysfs_setup(const char *name,
 {
 	struct module_kobject *mk;
 
-	mk = kmalloc(sizeof(struct module_kobject), GFP_KERNEL);
-	memset(mk, 0, sizeof(struct module_kobject));
+	mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
+	BUG_ON(!mk);
 
 	mk->mod = THIS_MODULE;
 	kobj_set_kset_s(mk, module_subsys);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index ad85d3f0dcc4..91a894264941 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -91,7 +91,7 @@ static inline union cpu_time_count cpu_time_sub(clockid_t which_clock,
  * Update expiry time from increment, and increase overrun count,
  * given the current clock sample.
  */
-static inline void bump_cpu_timer(struct k_itimer *timer,
+static void bump_cpu_timer(struct k_itimer *timer,
 				  union cpu_time_count now)
 {
 	int i;
@@ -110,7 +110,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer,
 		for (i = 0; incr < delta - incr; i++)
 			incr = incr << 1;
 		for (; i >= 0; incr >>= 1, i--) {
-			if (delta <= incr)
+			if (delta < incr)
 				continue;
 			timer->it.cpu.expires.sched += incr;
 			timer->it_overrun += 1 << i;
@@ -128,7 +128,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer,
 		for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++)
 			     incr = cputime_add(incr, incr);
 		for (; i >= 0; incr = cputime_halve(incr), i--) {
-			if (cputime_le(delta, incr))
+			if (cputime_lt(delta, incr))
 				continue;
 			timer->it.cpu.expires.cpu =
 				cputime_add(timer->it.cpu.expires.cpu, incr);
@@ -380,14 +380,9 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
 int posix_cpu_timer_del(struct k_itimer *timer)
 {
 	struct task_struct *p = timer->it.cpu.task;
+	int ret = 0;
 
-	if (timer->it.cpu.firing)
-		return TIMER_RETRY;
-
-	if (unlikely(p == NULL))
-		return 0;
-
-	if (!list_empty(&timer->it.cpu.entry)) {
+	if (likely(p != NULL)) {
 		read_lock(&tasklist_lock);
 		if (unlikely(p->signal == NULL)) {
 			/*
@@ -396,18 +391,20 @@ int posix_cpu_timer_del(struct k_itimer *timer)
 			 */
 			BUG_ON(!list_empty(&timer->it.cpu.entry));
 		} else {
-			/*
-			 * Take us off the task's timer list.
-			 */
 			spin_lock(&p->sighand->siglock);
-			list_del(&timer->it.cpu.entry);
+			if (timer->it.cpu.firing)
+				ret = TIMER_RETRY;
+			else
+				list_del(&timer->it.cpu.entry);
 			spin_unlock(&p->sighand->siglock);
 		}
 		read_unlock(&tasklist_lock);
+
+		if (!ret)
+			put_task_struct(p);
 	}
-	put_task_struct(p);
 
-	return 0;
+	return ret;
 }
 
 /*
@@ -424,7 +421,6 @@ static void cleanup_timers(struct list_head *head,
 	cputime_t ptime = cputime_add(utime, stime);
 
 	list_for_each_entry_safe(timer, next, head, entry) {
-		timer->task = NULL;
 		list_del_init(&timer->entry);
 		if (cputime_lt(timer->expires.cpu, ptime)) {
 			timer->expires.cpu = cputime_zero;
@@ -436,7 +432,6 @@ static void cleanup_timers(struct list_head *head,
 
 	++head;
 	list_for_each_entry_safe(timer, next, head, entry) {
-		timer->task = NULL;
 		list_del_init(&timer->entry);
 		if (cputime_lt(timer->expires.cpu, utime)) {
 			timer->expires.cpu = cputime_zero;
@@ -448,7 +443,6 @@ static void cleanup_timers(struct list_head *head,
 
 	++head;
 	list_for_each_entry_safe(timer, next, head, entry) {
-		timer->task = NULL;
 		list_del_init(&timer->entry);
 		if (timer->expires.sched < sched_time) {
 			timer->expires.sched = 0;
@@ -492,6 +486,9 @@ static void process_timer_rebalance(struct task_struct *p,
  	struct task_struct *t = p;
 	unsigned int nthreads = atomic_read(&p->signal->live);
 
+	if (!nthreads)
+		return;
+
 	switch (clock_idx) {
 	default:
 		BUG();
@@ -500,7 +497,7 @@ static void process_timer_rebalance(struct task_struct *p,
 		left = cputime_div(cputime_sub(expires.cpu, val.cpu),
 				   nthreads);
 		do {
-			if (!unlikely(t->exit_state)) {
+			if (!unlikely(t->flags & PF_EXITING)) {
 				ticks = cputime_add(prof_ticks(t), left);
 				if (cputime_eq(t->it_prof_expires,
 					       cputime_zero) ||
@@ -515,7 +512,7 @@ static void process_timer_rebalance(struct task_struct *p,
 		left = cputime_div(cputime_sub(expires.cpu, val.cpu),
 				   nthreads);
 		do {
-			if (!unlikely(t->exit_state)) {
+			if (!unlikely(t->flags & PF_EXITING)) {
 				ticks = cputime_add(virt_ticks(t), left);
 				if (cputime_eq(t->it_virt_expires,
 					       cputime_zero) ||
@@ -530,7 +527,7 @@ static void process_timer_rebalance(struct task_struct *p,
 		nsleft = expires.sched - val.sched;
 		do_div(nsleft, nthreads);
 		do {
-			if (!unlikely(t->exit_state)) {
+			if (!unlikely(t->flags & PF_EXITING)) {
 				ns = t->sched_time + nsleft;
 				if (t->it_sched_expires == 0 ||
 				    t->it_sched_expires > ns) {
@@ -569,6 +566,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
 	struct cpu_timer_list *next;
 	unsigned long i;
 
+	if (CPUCLOCK_PERTHREAD(timer->it_clock)	&& (p->flags & PF_EXITING))
+		return;
+
 	head = (CPUCLOCK_PERTHREAD(timer->it_clock) ?
 		p->cpu_timers : p->signal->cpu_timers);
 	head += CPUCLOCK_WHICH(timer->it_clock);
@@ -579,17 +579,15 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
 	listpos = head;
 	if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
 		list_for_each_entry(next, head, entry) {
-			if (next->expires.sched > nt->expires.sched) {
-				listpos = &next->entry;
+			if (next->expires.sched > nt->expires.sched)
 				break;
-			}
+			listpos = &next->entry;
 		}
 	} else {
 		list_for_each_entry(next, head, entry) {
-			if (cputime_gt(next->expires.cpu, nt->expires.cpu)) {
-				listpos = &next->entry;
+			if (cputime_gt(next->expires.cpu, nt->expires.cpu))
 				break;
-			}
+			listpos = &next->entry;
 		}
 	}
 	list_add(&nt->entry, listpos);
@@ -733,9 +731,15 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
 	 * Disarm any old timer after extracting its expiry time.
 	 */
 	BUG_ON(!irqs_disabled());
+
+	ret = 0;
 	spin_lock(&p->sighand->siglock);
 	old_expires = timer->it.cpu.expires;
-	list_del_init(&timer->it.cpu.entry);
+	if (unlikely(timer->it.cpu.firing)) {
+		timer->it.cpu.firing = -1;
+		ret = TIMER_RETRY;
+	} else
+		list_del_init(&timer->it.cpu.entry);
 	spin_unlock(&p->sighand->siglock);
 
 	/*
@@ -783,7 +787,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
 		}
 	}
 
-	if (unlikely(timer->it.cpu.firing)) {
+	if (unlikely(ret)) {
 		/*
 		 * We are colliding with the timer actually firing.
 		 * Punt after filling in the timer's old value, and
@@ -791,8 +795,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
 		 * it as an overrun (thanks to bump_cpu_timer above).
 		 */
 		read_unlock(&tasklist_lock);
-		timer->it.cpu.firing = -1;
-		ret = TIMER_RETRY;
 		goto out;
 	}
 
@@ -958,14 +960,16 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 static void check_thread_timers(struct task_struct *tsk,
 				struct list_head *firing)
 {
+	int maxfire;
 	struct list_head *timers = tsk->cpu_timers;
 
+	maxfire = 20;
 	tsk->it_prof_expires = cputime_zero;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_entry(timers->next,
 						      struct cpu_timer_list,
 						      entry);
-		if (cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
+		if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
 			tsk->it_prof_expires = t->expires.cpu;
 			break;
 		}
@@ -974,12 +978,13 @@ static void check_thread_timers(struct task_struct *tsk,
 	}
 
 	++timers;
+	maxfire = 20;
 	tsk->it_virt_expires = cputime_zero;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_entry(timers->next,
 						      struct cpu_timer_list,
 						      entry);
-		if (cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
+		if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
 			tsk->it_virt_expires = t->expires.cpu;
 			break;
 		}
@@ -988,12 +993,13 @@ static void check_thread_timers(struct task_struct *tsk,
 	}
 
 	++timers;
+	maxfire = 20;
 	tsk->it_sched_expires = 0;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_entry(timers->next,
 						      struct cpu_timer_list,
 						      entry);
-		if (tsk->sched_time < t->expires.sched) {
+		if (!--maxfire || tsk->sched_time < t->expires.sched) {
 			tsk->it_sched_expires = t->expires.sched;
 			break;
 		}
@@ -1010,6 +1016,7 @@ static void check_thread_timers(struct task_struct *tsk,
 static void check_process_timers(struct task_struct *tsk,
 				 struct list_head *firing)
 {
+	int maxfire;
 	struct signal_struct *const sig = tsk->signal;
 	cputime_t utime, stime, ptime, virt_expires, prof_expires;
 	unsigned long long sched_time, sched_expires;
@@ -1042,12 +1049,13 @@ static void check_process_timers(struct task_struct *tsk,
 	} while (t != tsk);
 	ptime = cputime_add(utime, stime);
 
+	maxfire = 20;
 	prof_expires = cputime_zero;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_entry(timers->next,
 						      struct cpu_timer_list,
 						      entry);
-		if (cputime_lt(ptime, t->expires.cpu)) {
+		if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) {
 			prof_expires = t->expires.cpu;
 			break;
 		}
@@ -1056,12 +1064,13 @@ static void check_process_timers(struct task_struct *tsk,
 	}
 
 	++timers;
+	maxfire = 20;
 	virt_expires = cputime_zero;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_entry(timers->next,
 						      struct cpu_timer_list,
 						      entry);
-		if (cputime_lt(utime, t->expires.cpu)) {
+		if (!--maxfire || cputime_lt(utime, t->expires.cpu)) {
 			virt_expires = t->expires.cpu;
 			break;
 		}
@@ -1070,12 +1079,13 @@ static void check_process_timers(struct task_struct *tsk,
 	}
 
 	++timers;
+	maxfire = 20;
 	sched_expires = 0;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_entry(timers->next,
 						      struct cpu_timer_list,
 						      entry);
-		if (sched_time < t->expires.sched) {
+		if (!--maxfire || sched_time < t->expires.sched) {
 			sched_expires = t->expires.sched;
 			break;
 		}
@@ -1158,6 +1168,9 @@ static void check_process_timers(struct task_struct *tsk,
 		unsigned long long sched_left, sched;
 		const unsigned int nthreads = atomic_read(&sig->live);
 
+		if (!nthreads)
+			return;
+
 		prof_left = cputime_sub(prof_expires, utime);
 		prof_left = cputime_sub(prof_left, stime);
 		prof_left = cputime_div(prof_left, nthreads);
@@ -1194,7 +1207,7 @@ static void check_process_timers(struct task_struct *tsk,
 
 			do {
 				t = next_thread(t);
-			} while (unlikely(t->exit_state));
+			} while (unlikely(t->flags & PF_EXITING));
 		} while (t != tsk);
 	}
 }
@@ -1212,7 +1225,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
 		/*
 		 * The task was cleaned up already, no future firings.
 		 */
-		return;
+		goto out;
 
 	/*
 	 * Fetch the current sample and update the timer's expiry time.
@@ -1222,7 +1235,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
 		bump_cpu_timer(timer, now);
 		if (unlikely(p->exit_state)) {
 			clear_dead_task(timer, now);
-			return;
+			goto out;
 		}
 		read_lock(&tasklist_lock); /* arm_timer needs it.  */
 	} else {
@@ -1235,8 +1248,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
 			put_task_struct(p);
 			timer->it.cpu.task = p = NULL;
 			timer->it.cpu.expires.sched = 0;
-			read_unlock(&tasklist_lock);
-			return;
+			goto out_unlock;
 		} else if (unlikely(p->exit_state) && thread_group_empty(p)) {
 			/*
 			 * We've noticed that the thread is dead, but
@@ -1244,8 +1256,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
 			 * drop our task ref.
 			 */
 			clear_dead_task(timer, now);
-			read_unlock(&tasklist_lock);
-			return;
+			goto out_unlock;
 		}
 		cpu_clock_sample_group(timer->it_clock, p, &now);
 		bump_cpu_timer(timer, now);
@@ -1257,7 +1268,13 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
 	 */
 	arm_timer(timer, now);
 
+out_unlock:
 	read_unlock(&tasklist_lock);
+
+out:
+	timer->it_overrun_last = timer->it_overrun;
+	timer->it_overrun = -1;
+	++timer->it_requeue_pending;
 }
 
 /*
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 10b2ad749d14..ea55c7a1cd75 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -427,21 +427,23 @@ int posix_timer_event(struct k_itimer *timr,int si_private)
 	timr->sigq->info.si_code = SI_TIMER;
 	timr->sigq->info.si_tid = timr->it_id;
 	timr->sigq->info.si_value = timr->it_sigev_value;
+
 	if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
-		if (unlikely(timr->it_process->flags & PF_EXITING)) {
-			timr->it_sigev_notify = SIGEV_SIGNAL;
-			put_task_struct(timr->it_process);
-			timr->it_process = timr->it_process->group_leader;
-			goto group;
-		}
-		return send_sigqueue(timr->it_sigev_signo, timr->sigq,
-			timr->it_process);
-	}
-	else {
-	group:
-		return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
-			timr->it_process);
+		struct task_struct *leader;
+		int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq,
+					timr->it_process);
+
+		if (likely(ret >= 0))
+			return ret;
+
+		timr->it_sigev_notify = SIGEV_SIGNAL;
+		leader = timr->it_process->group_leader;
+		put_task_struct(timr->it_process);
+		timr->it_process = leader;
 	}
+
+	return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
+				   timr->it_process);
 }
 EXPORT_SYMBOL_GPL(posix_timer_event);
 
@@ -1155,7 +1157,7 @@ retry_delete:
 }
 
 /*
- * This is called by __exit_signal, only when there are no more
+ * This is called by do_exit or de_thread, only when there are no more
  * references to the shared signal_struct.
  */
 void exit_itimers(struct signal_struct *sig)
@@ -1166,7 +1168,6 @@ void exit_itimers(struct signal_struct *sig)
 		tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
 		itimer_delete(tmr);
 	}
-	del_timer_sync(&sig->real_timer);
 }
 
 /*
@@ -1294,13 +1295,6 @@ sys_clock_getres(clockid_t which_clock, struct timespec __user *tp)
 	return error;
 }
 
-static void nanosleep_wake_up(unsigned long __data)
-{
-	struct task_struct *p = (struct task_struct *) __data;
-
-	wake_up_process(p);
-}
-
 /*
  * The standard says that an absolute nanosleep call MUST wake up at
  * the requested time in spite of clock settings.  Here is what we do:
@@ -1441,7 +1435,6 @@ static int common_nsleep(clockid_t which_clock,
 			 int flags, struct timespec *tsave)
 {
 	struct timespec t, dum;
-	struct timer_list new_timer;
 	DECLARE_WAITQUEUE(abs_wqueue, current);
 	u64 rq_time = (u64)0;
 	s64 left;
@@ -1450,10 +1443,6 @@ static int common_nsleep(clockid_t which_clock,
 	    &current_thread_info()->restart_block;
 
 	abs_wqueue.flags = 0;
-	init_timer(&new_timer);
-	new_timer.expires = 0;
-	new_timer.data = (unsigned long) current;
-	new_timer.function = nanosleep_wake_up;
 	abs = flags & TIMER_ABSTIME;
 
 	if (restart_block->fn == clock_nanosleep_restart) {
@@ -1489,13 +1478,8 @@ static int common_nsleep(clockid_t which_clock,
 		if (left < (s64)0)
 			break;
 
-		new_timer.expires = jiffies + left;
-		__set_current_state(TASK_INTERRUPTIBLE);
-		add_timer(&new_timer);
-
-		schedule();
+		schedule_timeout_interruptible(left);
 
-		del_timer_sync(&new_timer);
 		left = rq_time - get_jiffies_64();
 	} while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING));
 
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 2c7121d9bff1..46a5e5acff97 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -1,5 +1,6 @@
 config PM
 	bool "Power Management support"
+	depends on !IA64_HP_SIM
 	---help---
 	  "Power Management" means that parts of your computer are shut
 	  off or put into a power conserving "sleep" mode if they are not
@@ -28,7 +29,7 @@ config PM_DEBUG
 
 config SOFTWARE_SUSPEND
 	bool "Software Suspend"
-	depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP))
+	depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FVR || PPC32) && !SMP)
 	---help---
 	  Enable the possibility of suspending the machine.
 	  It doesn't need APM.
@@ -72,6 +73,18 @@ config PM_STD_PARTITION
 	  suspended image to. It will simply pick the first available swap 
 	  device.
 
+config SWSUSP_ENCRYPT
+	bool "Encrypt suspend image"
+	depends on SOFTWARE_SUSPEND && CRYPTO=y && (CRYPTO_AES=y || CRYPTO_AES_586=y || CRYPTO_AES_X86_64=y)
+	default ""
+	---help---
+	  To prevent data gathering from swap after resume you can encrypt
+	  the suspend image with a temporary key that is deleted on
+	  resume.
+
+	  Note that the temporary key is stored unencrypted on disk while the
+	  system is suspended.
+
 config SUSPEND_SMP
 	bool
 	depends on HOTPLUG_CPU && X86 && PM
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 2f438d0eaa13..c71eb4579c07 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -4,7 +4,7 @@ EXTRA_CFLAGS	+=	-DDEBUG
 endif
 
 obj-y				:= main.o process.o console.o pm.o
-obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o disk.o
+obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o disk.o snapshot.o
 
 obj-$(CONFIG_SUSPEND_SMP)	+= smp.o
 
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 664eb0469b6e..027322a564f4 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -17,12 +17,12 @@
 #include <linux/delay.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
+#include <linux/pm.h>
 
 #include "power.h"
 
 
 extern suspend_disk_method_t pm_disk_mode;
-extern struct pm_ops * pm_ops;
 
 extern int swsusp_suspend(void);
 extern int swsusp_write(void);
@@ -30,7 +30,6 @@ extern int swsusp_check(void);
 extern int swsusp_read(void);
 extern void swsusp_close(void);
 extern int swsusp_resume(void);
-extern int swsusp_free(void);
 
 
 static int noresume = 0;
@@ -49,13 +48,11 @@ dev_t swsusp_resume_device;
 
 static void power_down(suspend_disk_method_t mode)
 {
-	unsigned long flags;
 	int error = 0;
 
-	local_irq_save(flags);
 	switch(mode) {
 	case PM_DISK_PLATFORM:
- 		device_shutdown();
+		kernel_power_off_prepare();
 		error = pm_ops->enter(PM_SUSPEND_DISK);
 		break;
 	case PM_DISK_SHUTDOWN:
@@ -95,10 +92,7 @@ static void free_some_memory(void)
 	printk("Freeing memory...  ");
 	while ((tmp = shrink_all_memory(10000))) {
 		pages += tmp;
-		printk("\b%c", p[i]);
-		i++;
-		if (i > 3)
-			i = 0;
+		printk("\b%c", p[i++ % 4]);
 	}
 	printk("\bdone (%li pages freed)\n", pages);
 }
@@ -112,24 +106,12 @@ static inline void platform_finish(void)
 	}
 }
 
-static void finish(void)
-{
-	device_resume();
-	platform_finish();
-	thaw_processes();
-	enable_nonboot_cpus();
-	pm_restore_console();
-}
-
-
 static int prepare_processes(void)
 {
 	int error;
 
 	pm_prepare_console();
-
 	sys_sync();
-
 	disable_nonboot_cpus();
 
 	if (freeze_processes()) {
@@ -162,15 +144,6 @@ static void unprepare_processes(void)
 	pm_restore_console();
 }
 
-static int prepare_devices(void)
-{
-	int error;
-
-	if ((error = device_suspend(PMSG_FREEZE)))
-		printk("Some devices failed to suspend\n");
-	return error;
-}
-
 /**
  *	pm_suspend_disk - The granpappy of power management.
  *
@@ -187,32 +160,37 @@ int pm_suspend_disk(void)
 	error = prepare_processes();
 	if (error)
 		return error;
-	error = prepare_devices();
 
+	error = device_suspend(PMSG_FREEZE);
 	if (error) {
+		printk("Some devices failed to suspend\n");
 		unprepare_processes();
 		return error;
 	}
 
-	pr_debug("PM: Attempting to suspend to disk.\n");
-	if (pm_disk_mode == PM_DISK_FIRMWARE)
-		return pm_ops->enter(PM_SUSPEND_DISK);
-
 	pr_debug("PM: snapshotting memory.\n");
 	in_suspend = 1;
 	if ((error = swsusp_suspend()))
 		goto Done;
 
 	if (in_suspend) {
+		device_resume();
 		pr_debug("PM: writing image.\n");
 		error = swsusp_write();
 		if (!error)
 			power_down(pm_disk_mode);
+		else {
+			swsusp_free();
+			unprepare_processes();
+			return error;
+		}
 	} else
 		pr_debug("PM: Image restored successfully.\n");
+
 	swsusp_free();
  Done:
-	finish();
+	device_resume();
+	unprepare_processes();
 	return error;
 }
 
@@ -233,9 +211,12 @@ static int software_resume(void)
 {
 	int error;
 
+	down(&pm_sem);
 	if (!swsusp_resume_device) {
-		if (!strlen(resume_file))
+		if (!strlen(resume_file)) {
+			up(&pm_sem);
 			return -ENOENT;
+		}
 		swsusp_resume_device = name_to_dev_t(resume_file);
 		pr_debug("swsusp: Resume From Partition %s\n", resume_file);
 	} else {
@@ -248,6 +229,7 @@ static int software_resume(void)
 		 * FIXME: If noresume is specified, we need to find the partition
 		 * and reset it back to normal swap space.
 		 */
+		up(&pm_sem);
 		return 0;
 	}
 
@@ -265,25 +247,30 @@ static int software_resume(void)
 
 	pr_debug("PM: Reading swsusp image.\n");
 
-	if ((error = swsusp_read()))
-		goto Cleanup;
+	if ((error = swsusp_read())) {
+		swsusp_free();
+		goto Thaw;
+	}
 
 	pr_debug("PM: Preparing devices for restore.\n");
 
-	if ((error = prepare_devices()))
-		goto Free;
+	if ((error = device_suspend(PMSG_FREEZE))) {
+		printk("Some devices failed to suspend\n");
+		swsusp_free();
+		goto Thaw;
+	}
 
 	mb();
 
 	pr_debug("PM: Restoring saved image.\n");
 	swsusp_resume();
 	pr_debug("PM: Restore failed, recovering.n");
-	finish();
- Free:
-	swsusp_free();
- Cleanup:
+	device_resume();
+ Thaw:
 	unprepare_processes();
  Done:
+	/* For success case, the suspend path will release the lock */
+	up(&pm_sem);
 	pr_debug("PM: Resume from disk failed.\n");
 	return 0;
 }
@@ -390,7 +377,9 @@ static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t
 	if (sscanf(buf, "%u:%u", &maj, &min) == 2) {
 		res = MKDEV(maj,min);
 		if (maj == MAJOR(res) && min == MINOR(res)) {
+			down(&pm_sem);
 			swsusp_resume_device = res;
+			up(&pm_sem);
 			printk("Attempting manual resume\n");
 			noresume = 0;
 			software_resume();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 71aa0fd22007..18d7d693fbba 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -143,11 +143,12 @@ static void suspend_finish(suspend_state_t state)
 
 
 
-static char * pm_states[] = {
+static char *pm_states[PM_SUSPEND_MAX] = {
 	[PM_SUSPEND_STANDBY]	= "standby",
 	[PM_SUSPEND_MEM]	= "mem",
+#ifdef CONFIG_SOFTWARE_SUSPEND
 	[PM_SUSPEND_DISK]	= "disk",
-	NULL,
+#endif
 };
 
 
@@ -166,6 +167,8 @@ static int enter_state(suspend_state_t state)
 {
 	int error;
 
+	if (pm_ops->valid && !pm_ops->valid(state))
+		return -ENODEV;
 	if (down_trylock(&pm_sem))
 		return -EBUSY;
 
@@ -235,7 +238,8 @@ static ssize_t state_show(struct subsystem * subsys, char * buf)
 	char * s = buf;
 
 	for (i = 0; i < PM_SUSPEND_MAX; i++) {
-		if (pm_states[i])
+		if (pm_states[i] && pm_ops && (!pm_ops->valid
+			||(pm_ops->valid && pm_ops->valid(i))))
 			s += sprintf(s,"%s ",pm_states[i]);
 	}
 	s += sprintf(s,"\n");
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 61deda04e39e..159149321b3c 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -60,9 +60,8 @@ struct pm_dev *pm_register(pm_dev_t type,
 			   unsigned long id,
 			   pm_callback callback)
 {
-	struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL);
+	struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL);
 	if (dev) {
-		memset(dev, 0, sizeof(*dev));
 		dev->type = type;
 		dev->id = id;
 		dev->callback = callback;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index cd6a3493cc0d..d4fd96a135ab 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -1,7 +1,7 @@
 #include <linux/suspend.h>
 #include <linux/utsname.h>
 
-/* With SUSPEND_CONSOLE defined, it suspend looks *really* cool, but
+/* With SUSPEND_CONSOLE defined suspend looks *really* cool, but
    we probably do not take enough locks for switching consoles, etc,
    so bad things might happen.
 */
@@ -9,6 +9,9 @@
 #define SUSPEND_CONSOLE	(MAX_NR_CONSOLES-1)
 #endif
 
+#define MAX_PBES	((PAGE_SIZE - sizeof(struct new_utsname) \
+			- 4 - 3*sizeof(unsigned long) - sizeof(int) \
+			- sizeof(void *)) / sizeof(swp_entry_t))
 
 struct swsusp_info {
 	struct new_utsname	uts;
@@ -18,7 +21,7 @@ struct swsusp_info {
 	unsigned long		image_pages;
 	unsigned long		pagedir_pages;
 	suspend_pagedir_t	* suspend_pagedir;
-	swp_entry_t		pagedir[768];
+	swp_entry_t		pagedir[MAX_PBES];
 } __attribute__((aligned(PAGE_SIZE)));
 
 
@@ -50,3 +53,20 @@ extern void thaw_processes(void);
 
 extern int pm_prepare_console(void);
 extern void pm_restore_console(void);
+
+
+/* References to section boundaries */
+extern const void __nosave_begin, __nosave_end;
+
+extern unsigned int nr_copy_pages;
+extern suspend_pagedir_t *pagedir_nosave;
+extern suspend_pagedir_t *pagedir_save;
+
+extern asmlinkage int swsusp_arch_suspend(void);
+extern asmlinkage int swsusp_arch_resume(void);
+
+extern int restore_highmem(void);
+extern struct pbe * alloc_pagedir(unsigned nr_pages);
+extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages);
+extern void swsusp_free(void);
+extern int enough_swap(unsigned nr_pages);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 3bd0d261818f..28de118f7a0b 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -38,7 +38,6 @@ void refrigerator(void)
 	   processes around? */
 	long save;
 	save = current->state;
-	current->state = TASK_UNINTERRUPTIBLE;
 	pr_debug("%s entered refrigerator\n", current->comm);
 	printk("=");
 
@@ -47,8 +46,10 @@ void refrigerator(void)
 	recalc_sigpending(); /* We sent fake signal, clean it up */
 	spin_unlock_irq(&current->sighand->siglock);
 
-	while (frozen(current))
+	while (frozen(current)) {
+		current->state = TASK_UNINTERRUPTIBLE;
 		schedule();
+	}
 	pr_debug("%s left refrigerator\n", current->comm);
 	current->state = save;
 }
@@ -80,13 +81,33 @@ int freeze_processes(void)
 		} while_each_thread(g, p);
 		read_unlock(&tasklist_lock);
 		yield();			/* Yield is okay here */
-		if (time_after(jiffies, start_time + TIMEOUT)) {
+		if (todo && time_after(jiffies, start_time + TIMEOUT)) {
 			printk( "\n" );
 			printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo );
-			return todo;
+			break;
 		}
 	} while(todo);
 
+	/* This does not unfreeze processes that are already frozen
+	 * (we have slightly ugly calling convention in that respect,
+	 * and caller must call thaw_processes() if something fails),
+	 * but it cleans up leftover PF_FREEZE requests.
+	 */
+	if (todo) {
+		read_lock(&tasklist_lock);
+		do_each_thread(g, p)
+			if (freezing(p)) {
+				pr_debug("  clean up: %s\n", p->comm);
+				p->flags &= ~PF_FREEZE;
+				spin_lock_irqsave(&p->sighand->siglock, flags);
+				recalc_sigpending_tsk(p);
+				spin_unlock_irqrestore(&p->sighand->siglock, flags);
+			}
+		while_each_thread(g, p);
+		read_unlock(&tasklist_lock);
+		return todo;
+	}
+
 	printk( "|\n" );
 	BUG_ON(in_atomic());
 	return 0;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
new file mode 100644
index 000000000000..42a628704398
--- /dev/null
+++ b/kernel/power/snapshot.c
@@ -0,0 +1,435 @@
+/*
+ * linux/kernel/power/snapshot.c
+ *
+ * This file provide system snapshot/restore functionality.
+ *
+ * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz>
+ *
+ * This file is released under the GPLv2, and is based on swsusp.c.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/suspend.h>
+#include <linux/smp_lock.h>
+#include <linux/delay.h>
+#include <linux/bitops.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/pm.h>
+#include <linux/device.h>
+#include <linux/bootmem.h>
+#include <linux/syscalls.h>
+#include <linux/console.h>
+#include <linux/highmem.h>
+
+#include <asm/uaccess.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+
+#include "power.h"
+
+#ifdef CONFIG_HIGHMEM
+struct highmem_page {
+	char *data;
+	struct page *page;
+	struct highmem_page *next;
+};
+
+static struct highmem_page *highmem_copy;
+
+static int save_highmem_zone(struct zone *zone)
+{
+	unsigned long zone_pfn;
+	mark_free_pages(zone);
+	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+		struct page *page;
+		struct highmem_page *save;
+		void *kaddr;
+		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+
+		if (!(pfn%1000))
+			printk(".");
+		if (!pfn_valid(pfn))
+			continue;
+		page = pfn_to_page(pfn);
+		/*
+		 * This condition results from rvmalloc() sans vmalloc_32()
+		 * and architectural memory reservations. This should be
+		 * corrected eventually when the cases giving rise to this
+		 * are better understood.
+		 */
+		if (PageReserved(page)) {
+			printk("highmem reserved page?!\n");
+			continue;
+		}
+		BUG_ON(PageNosave(page));
+		if (PageNosaveFree(page))
+			continue;
+		save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
+		if (!save)
+			return -ENOMEM;
+		save->next = highmem_copy;
+		save->page = page;
+		save->data = (void *) get_zeroed_page(GFP_ATOMIC);
+		if (!save->data) {
+			kfree(save);
+			return -ENOMEM;
+		}
+		kaddr = kmap_atomic(page, KM_USER0);
+		memcpy(save->data, kaddr, PAGE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		highmem_copy = save;
+	}
+	return 0;
+}
+
+
+static int save_highmem(void)
+{
+	struct zone *zone;
+	int res = 0;
+
+	pr_debug("swsusp: Saving Highmem\n");
+	for_each_zone (zone) {
+		if (is_highmem(zone))
+			res = save_highmem_zone(zone);
+		if (res)
+			return res;
+	}
+	return 0;
+}
+
+int restore_highmem(void)
+{
+	printk("swsusp: Restoring Highmem\n");
+	while (highmem_copy) {
+		struct highmem_page *save = highmem_copy;
+		void *kaddr;
+		highmem_copy = save->next;
+
+		kaddr = kmap_atomic(save->page, KM_USER0);
+		memcpy(kaddr, save->data, PAGE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		free_page((long) save->data);
+		kfree(save);
+	}
+	return 0;
+}
+#else
+static int save_highmem(void) { return 0; }
+int restore_highmem(void) { return 0; }
+#endif /* CONFIG_HIGHMEM */
+
+
+static int pfn_is_nosave(unsigned long pfn)
+{
+	unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
+	unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
+	return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
+}
+
+/**
+ *	saveable - Determine whether a page should be cloned or not.
+ *	@pfn:	The page
+ *
+ *	We save a page if it's Reserved, and not in the range of pages
+ *	statically defined as 'unsaveable', or if it isn't reserved, and
+ *	isn't part of a free chunk of pages.
+ */
+
+static int saveable(struct zone *zone, unsigned long *zone_pfn)
+{
+	unsigned long pfn = *zone_pfn + zone->zone_start_pfn;
+	struct page *page;
+
+	if (!pfn_valid(pfn))
+		return 0;
+
+	page = pfn_to_page(pfn);
+	BUG_ON(PageReserved(page) && PageNosave(page));
+	if (PageNosave(page))
+		return 0;
+	if (PageReserved(page) && pfn_is_nosave(pfn)) {
+		pr_debug("[nosave pfn 0x%lx]", pfn);
+		return 0;
+	}
+	if (PageNosaveFree(page))
+		return 0;
+
+	return 1;
+}
+
+static unsigned count_data_pages(void)
+{
+	struct zone *zone;
+	unsigned long zone_pfn;
+	unsigned n;
+
+	n = 0;
+	for_each_zone (zone) {
+		if (is_highmem(zone))
+			continue;
+		mark_free_pages(zone);
+		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
+			n += saveable(zone, &zone_pfn);
+	}
+	return n;
+}
+
+static void copy_data_pages(struct pbe *pblist)
+{
+	struct zone *zone;
+	unsigned long zone_pfn;
+	struct pbe *pbe, *p;
+
+	pbe = pblist;
+	for_each_zone (zone) {
+		if (is_highmem(zone))
+			continue;
+		mark_free_pages(zone);
+		/* This is necessary for swsusp_free() */
+		for_each_pb_page (p, pblist)
+			SetPageNosaveFree(virt_to_page(p));
+		for_each_pbe (p, pblist)
+			SetPageNosaveFree(virt_to_page(p->address));
+		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+			if (saveable(zone, &zone_pfn)) {
+				struct page *page;
+				page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
+				BUG_ON(!pbe);
+				pbe->orig_address = (unsigned long)page_address(page);
+				/* copy_page is not usable for copying task structs. */
+				memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE);
+				pbe = pbe->next;
+			}
+		}
+	}
+	BUG_ON(pbe);
+}
+
+
+/**
+ *	free_pagedir - free pages allocated with alloc_pagedir()
+ */
+
+static void free_pagedir(struct pbe *pblist)
+{
+	struct pbe *pbe;
+
+	while (pblist) {
+		pbe = (pblist + PB_PAGE_SKIP)->next;
+		ClearPageNosave(virt_to_page(pblist));
+		ClearPageNosaveFree(virt_to_page(pblist));
+		free_page((unsigned long)pblist);
+		pblist = pbe;
+	}
+}
+
+/**
+ *	fill_pb_page - Create a list of PBEs on a given memory page
+ */
+
+static inline void fill_pb_page(struct pbe *pbpage)
+{
+	struct pbe *p;
+
+	p = pbpage;
+	pbpage += PB_PAGE_SKIP;
+	do
+		p->next = p + 1;
+	while (++p < pbpage);
+}
+
+/**
+ *	create_pbe_list - Create a list of PBEs on top of a given chain
+ *	of memory pages allocated with alloc_pagedir()
+ */
+
+void create_pbe_list(struct pbe *pblist, unsigned nr_pages)
+{
+	struct pbe *pbpage, *p;
+	unsigned num = PBES_PER_PAGE;
+
+	for_each_pb_page (pbpage, pblist) {
+		if (num >= nr_pages)
+			break;
+
+		fill_pb_page(pbpage);
+		num += PBES_PER_PAGE;
+	}
+	if (pbpage) {
+		for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++)
+			p->next = p + 1;
+		p->next = NULL;
+	}
+	pr_debug("create_pbe_list(): initialized %d PBEs\n", num);
+}
+
+static void *alloc_image_page(void)
+{
+	void *res = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
+	if (res) {
+		SetPageNosave(virt_to_page(res));
+		SetPageNosaveFree(virt_to_page(res));
+	}
+	return res;
+}
+
+/**
+ *	alloc_pagedir - Allocate the page directory.
+ *
+ *	First, determine exactly how many pages we need and
+ *	allocate them.
+ *
+ *	We arrange the pages in a chain: each page is an array of PBES_PER_PAGE
+ *	struct pbe elements (pbes) and the last element in the page points
+ *	to the next page.
+ *
+ *	On each page we set up a list of struct_pbe elements.
+ */
+
+struct pbe *alloc_pagedir(unsigned nr_pages)
+{
+	unsigned num;
+	struct pbe *pblist, *pbe;
+
+	if (!nr_pages)
+		return NULL;
+
+	pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
+	pblist = alloc_image_page();
+	/* FIXME: rewrite this ugly loop */
+	for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
+        		pbe = pbe->next, num += PBES_PER_PAGE) {
+		pbe += PB_PAGE_SKIP;
+		pbe->next = alloc_image_page();
+	}
+	if (!pbe) { /* get_zeroed_page() failed */
+		free_pagedir(pblist);
+		pblist = NULL;
+        }
+	return pblist;
+}
+
+/**
+ * Free pages we allocated for suspend. Suspend pages are alocated
+ * before atomic copy, so we need to free them after resume.
+ */
+
+void swsusp_free(void)
+{
+	struct zone *zone;
+	unsigned long zone_pfn;
+
+	for_each_zone(zone) {
+		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
+			if (pfn_valid(zone_pfn + zone->zone_start_pfn)) {
+				struct page * page;
+				page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
+				if (PageNosave(page) && PageNosaveFree(page)) {
+					ClearPageNosave(page);
+					ClearPageNosaveFree(page);
+					free_page((long) page_address(page));
+				}
+			}
+	}
+}
+
+
+/**
+ *	enough_free_mem - Make sure we enough free memory to snapshot.
+ *
+ *	Returns TRUE or FALSE after checking the number of available
+ *	free pages.
+ */
+
+static int enough_free_mem(unsigned nr_pages)
+{
+	pr_debug("swsusp: available memory: %u pages\n", nr_free_pages());
+	return nr_free_pages() > (nr_pages + PAGES_FOR_IO +
+		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
+}
+
+
+static struct pbe *swsusp_alloc(unsigned nr_pages)
+{
+	struct pbe *pblist, *p;
+
+	if (!(pblist = alloc_pagedir(nr_pages))) {
+		printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
+		return NULL;
+	}
+	create_pbe_list(pblist, nr_pages);
+
+	for_each_pbe (p, pblist) {
+		p->address = (unsigned long)alloc_image_page();
+		if (!p->address) {
+			printk(KERN_ERR "suspend: Allocating image pages failed.\n");
+			swsusp_free();
+			return NULL;
+		}
+	}
+
+	return pblist;
+}
+
+asmlinkage int swsusp_save(void)
+{
+	unsigned nr_pages;
+
+	pr_debug("swsusp: critical section: \n");
+	if (save_highmem()) {
+		printk(KERN_CRIT "swsusp: Not enough free pages for highmem\n");
+		restore_highmem();
+		return -ENOMEM;
+	}
+
+	drain_local_pages();
+	nr_pages = count_data_pages();
+	printk("swsusp: Need to copy %u pages\n", nr_pages);
+
+	pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n",
+		 nr_pages,
+		 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE,
+		 PAGES_FOR_IO, nr_free_pages());
+
+	/* This is needed because of the fixed size of swsusp_info */
+	if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE)
+		return -ENOSPC;
+
+	if (!enough_free_mem(nr_pages)) {
+		printk(KERN_ERR "swsusp: Not enough free memory\n");
+		return -ENOMEM;
+	}
+
+	if (!enough_swap(nr_pages)) {
+		printk(KERN_ERR "swsusp: Not enough free swap\n");
+		return -ENOSPC;
+	}
+
+	pagedir_nosave = swsusp_alloc(nr_pages);
+	if (!pagedir_nosave)
+		return -ENOMEM;
+
+	/* During allocating of suspend pagedir, new cold pages may appear.
+	 * Kill them.
+	 */
+	drain_local_pages();
+	copy_data_pages(pagedir_nosave);
+
+	/*
+	 * End of critical section. From now on, we can write to memory,
+	 * but we should not touch disk. This specially means we must _not_
+	 * touch swap space! Except we must write out our image of course.
+	 */
+
+	nr_copy_pages = nr_pages;
+
+	printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
+	return 0;
+}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index f2bc71b9fe8b..12db1d2ad61f 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -1,11 +1,10 @@
 /*
  * linux/kernel/power/swsusp.c
  *
- * This file is to realize architecture-independent
- * machine suspend feature using pretty near only high-level routines
+ * This file provides code to write suspend image to swap and read it back.
  *
  * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
- * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
  *
  * This file is released under the GPLv2.
  *
@@ -31,6 +30,9 @@
  * Alex Badea <vampire@go.ro>:
  * Fixed runaway init
  *
+ * Andreas Steinmetz <ast@domdv.de>:
+ * Added encrypted suspend option
+ *
  * More state savers are welcome. Especially for the scsi layer...
  *
  * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
@@ -44,11 +46,7 @@
 #include <linux/utsname.h>
 #include <linux/version.h>
 #include <linux/delay.h>
-#include <linux/reboot.h>
 #include <linux/bitops.h>
-#include <linux/vt_kern.h>
-#include <linux/kbd_kern.h>
-#include <linux/keyboard.h>
 #include <linux/spinlock.h>
 #include <linux/genhd.h>
 #include <linux/kernel.h>
@@ -60,10 +58,8 @@
 #include <linux/swapops.h>
 #include <linux/bootmem.h>
 #include <linux/syscalls.h>
-#include <linux/console.h>
 #include <linux/highmem.h>
 #include <linux/bio.h>
-#include <linux/mount.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -71,18 +67,20 @@
 #include <asm/tlbflush.h>
 #include <asm/io.h>
 
-#include "power.h"
+#include <linux/random.h>
+#include <linux/crypto.h>
+#include <asm/scatterlist.h>
 
-/* References to section boundaries */
-extern const void __nosave_begin, __nosave_end;
+#include "power.h"
 
-/* Variables to be preserved over suspend */
-static int nr_copy_pages_check;
+#define CIPHER "aes"
+#define MAXKEY 32
+#define MAXIV  32
 
 extern char resume_file[];
 
 /* Local variables that should not be affected by save */
-static unsigned int nr_copy_pages __nosavedata = 0;
+unsigned int nr_copy_pages __nosavedata = 0;
 
 /* Suspend pagedir is allocated before final copy, therefore it
    must be freed after resume
@@ -98,12 +96,13 @@ static unsigned int nr_copy_pages __nosavedata = 0;
    MMU hardware.
  */
 suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
-static suspend_pagedir_t *pagedir_save;
+suspend_pagedir_t *pagedir_save;
 
 #define SWSUSP_SIG	"S1SUSPEND"
 
 static struct swsusp_header {
-	char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
+	char reserved[PAGE_SIZE - 20 - MAXKEY - MAXIV - sizeof(swp_entry_t)];
+	u8 key_iv[MAXKEY+MAXIV];
 	swp_entry_t swsusp_info;
 	char	orig_sig[10];
 	char	sig[10];
@@ -112,12 +111,6 @@ static struct swsusp_header {
 static struct swsusp_info swsusp_info;
 
 /*
- * XXX: We try to keep some more pages free so that I/O operations succeed
- * without paging. Might this be more?
- */
-#define PAGES_FOR_IO	512
-
-/*
  * Saving part...
  */
 
@@ -129,6 +122,131 @@ static struct swsusp_info swsusp_info;
 static unsigned short swapfile_used[MAX_SWAPFILES];
 static unsigned short root_swap;
 
+static int write_page(unsigned long addr, swp_entry_t * loc);
+static int bio_read_page(pgoff_t page_off, void * page);
+
+static u8 key_iv[MAXKEY+MAXIV];
+
+#ifdef CONFIG_SWSUSP_ENCRYPT
+
+static int crypto_init(int mode, void **mem)
+{
+	int error = 0;
+	int len;
+	char *modemsg;
+	struct crypto_tfm *tfm;
+
+	modemsg = mode ? "suspend not possible" : "resume not possible";
+
+	tfm = crypto_alloc_tfm(CIPHER, CRYPTO_TFM_MODE_CBC);
+	if(!tfm) {
+		printk(KERN_ERR "swsusp: no tfm, %s\n", modemsg);
+		error = -EINVAL;
+		goto out;
+	}
+
+	if(MAXKEY < crypto_tfm_alg_min_keysize(tfm)) {
+		printk(KERN_ERR "swsusp: key buffer too small, %s\n", modemsg);
+		error = -ENOKEY;
+		goto fail;
+	}
+
+	if (mode)
+		get_random_bytes(key_iv, MAXKEY+MAXIV);
+
+	len = crypto_tfm_alg_max_keysize(tfm);
+	if (len > MAXKEY)
+		len = MAXKEY;
+
+	if (crypto_cipher_setkey(tfm, key_iv, len)) {
+		printk(KERN_ERR "swsusp: key setup failure, %s\n", modemsg);
+		error = -EKEYREJECTED;
+		goto fail;
+	}
+
+	len = crypto_tfm_alg_ivsize(tfm);
+
+	if (MAXIV < len) {
+		printk(KERN_ERR "swsusp: iv buffer too small, %s\n", modemsg);
+		error = -EOVERFLOW;
+		goto fail;
+	}
+
+	crypto_cipher_set_iv(tfm, key_iv+MAXKEY, len);
+
+	*mem=(void *)tfm;
+
+	goto out;
+
+fail:	crypto_free_tfm(tfm);
+out:	return error;
+}
+
+static __inline__ void crypto_exit(void *mem)
+{
+	crypto_free_tfm((struct crypto_tfm *)mem);
+}
+
+static __inline__ int crypto_write(struct pbe *p, void *mem)
+{
+	int error = 0;
+	struct scatterlist src, dst;
+
+	src.page   = virt_to_page(p->address);
+	src.offset = 0;
+	src.length = PAGE_SIZE;
+	dst.page   = virt_to_page((void *)&swsusp_header);
+	dst.offset = 0;
+	dst.length = PAGE_SIZE;
+
+	error = crypto_cipher_encrypt((struct crypto_tfm *)mem, &dst, &src,
+					PAGE_SIZE);
+
+	if (!error)
+		error = write_page((unsigned long)&swsusp_header,
+				&(p->swap_address));
+	return error;
+}
+
+static __inline__ int crypto_read(struct pbe *p, void *mem)
+{
+	int error = 0;
+	struct scatterlist src, dst;
+
+	error = bio_read_page(swp_offset(p->swap_address), (void *)p->address);
+	if (!error) {
+		src.offset = 0;
+		src.length = PAGE_SIZE;
+		dst.offset = 0;
+		dst.length = PAGE_SIZE;
+		src.page = dst.page = virt_to_page((void *)p->address);
+
+		error = crypto_cipher_decrypt((struct crypto_tfm *)mem, &dst,
+						&src, PAGE_SIZE);
+	}
+	return error;
+}
+#else
+static __inline__ int crypto_init(int mode, void *mem)
+{
+	return 0;
+}
+
+static __inline__ void crypto_exit(void *mem)
+{
+}
+
+static __inline__ int crypto_write(struct pbe *p, void *mem)
+{
+	return write_page(p->address, &(p->swap_address));
+}
+
+static __inline__ int crypto_read(struct pbe *p, void *mem)
+{
+	return bio_read_page(swp_offset(p->swap_address), (void *)p->address);
+}
+#endif
+
 static int mark_swapfiles(swp_entry_t prev)
 {
 	int error;
@@ -140,6 +258,7 @@ static int mark_swapfiles(swp_entry_t prev)
 	    !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
 		memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
 		memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
+		memcpy(swsusp_header.key_iv, key_iv, MAXKEY+MAXIV);
 		swsusp_header.swsusp_info = prev;
 		error = rw_swap_page_sync(WRITE,
 					  swp_entry(root_swap, 0),
@@ -179,9 +298,9 @@ static int swsusp_swap_check(void) /* This is called before saving image */
 	len=strlen(resume_file);
 	root_swap = 0xFFFF;
 
-	swap_list_lock();
+	spin_lock(&swap_lock);
 	for (i=0; i<MAX_SWAPFILES; i++) {
-		if (swap_info[i].flags == 0) {
+		if (!(swap_info[i].flags & SWP_WRITEOK)) {
 			swapfile_used[i]=SWAPFILE_UNUSED;
 		} else {
 			if (!len) {
@@ -202,7 +321,7 @@ static int swsusp_swap_check(void) /* This is called before saving image */
 			}
 		}
 	}
-	swap_list_unlock();
+	spin_unlock(&swap_lock);
 	return (root_swap != 0xffff) ? 0 : -ENODEV;
 }
 
@@ -216,16 +335,16 @@ static void lock_swapdevices(void)
 {
 	int i;
 
-	swap_list_lock();
+	spin_lock(&swap_lock);
 	for (i = 0; i< MAX_SWAPFILES; i++)
 		if (swapfile_used[i] == SWAPFILE_IGNORED) {
-			swap_info[i].flags ^= 0xFF;
+			swap_info[i].flags ^= SWP_WRITEOK;
 		}
-	swap_list_unlock();
+	spin_unlock(&swap_lock);
 }
 
 /**
- *	write_swap_page - Write one page to a fresh swap location.
+ *	write_page - Write one page to a fresh swap location.
  *	@addr:	Address we're writing.
  *	@loc:	Place to store the entry we used.
  *
@@ -264,15 +383,14 @@ static int write_page(unsigned long addr, swp_entry_t * loc)
 static void data_free(void)
 {
 	swp_entry_t entry;
-	int i;
+	struct pbe * p;
 
-	for (i = 0; i < nr_copy_pages; i++) {
-		entry = (pagedir_nosave + i)->swap_address;
+	for_each_pbe(p, pagedir_nosave) {
+		entry = p->swap_address;
 		if (entry.val)
 			swap_free(entry);
 		else
 			break;
-		(pagedir_nosave + i)->swap_address = (swp_entry_t){0};
 	}
 }
 
@@ -286,6 +404,10 @@ static int data_write(void)
 	int error = 0, i = 0;
 	unsigned int mod = nr_copy_pages / 100;
 	struct pbe *p;
+	void *tfm;
+
+	if ((error = crypto_init(1, &tfm)))
+		return error;
 
 	if (!mod)
 		mod = 1;
@@ -294,11 +416,14 @@ static int data_write(void)
 	for_each_pbe (p, pagedir_nosave) {
 		if (!(i%mod))
 			printk( "\b\b\b\b%3d%%", i / mod );
-		if ((error = write_page(p->address, &(p->swap_address))))
+		if ((error = crypto_write(p, tfm))) {
+			crypto_exit(tfm);
 			return error;
+		}
 		i++;
 	}
 	printk("\b\b\b\bdone\n");
+	crypto_exit(tfm);
 	return error;
 }
 
@@ -385,7 +510,6 @@ static int write_pagedir(void)
  *	write_suspend_image - Write entire image and metadata.
  *
  */
-
 static int write_suspend_image(void)
 {
 	int error;
@@ -400,6 +524,7 @@ static int write_suspend_image(void)
 	if ((error = close_swap()))
 		goto FreePagedir;
  Done:
+	memset(key_iv, 0, MAXKEY+MAXIV);
 	return error;
  FreePagedir:
 	free_pagedir_entries();
@@ -408,354 +533,6 @@ static int write_suspend_image(void)
 	goto Done;
 }
 
-
-#ifdef CONFIG_HIGHMEM
-struct highmem_page {
-	char *data;
-	struct page *page;
-	struct highmem_page *next;
-};
-
-static struct highmem_page *highmem_copy;
-
-static int save_highmem_zone(struct zone *zone)
-{
-	unsigned long zone_pfn;
-	mark_free_pages(zone);
-	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
-		struct page *page;
-		struct highmem_page *save;
-		void *kaddr;
-		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
-
-		if (!(pfn%1000))
-			printk(".");
-		if (!pfn_valid(pfn))
-			continue;
-		page = pfn_to_page(pfn);
-		/*
-		 * This condition results from rvmalloc() sans vmalloc_32()
-		 * and architectural memory reservations. This should be
-		 * corrected eventually when the cases giving rise to this
-		 * are better understood.
-		 */
-		if (PageReserved(page)) {
-			printk("highmem reserved page?!\n");
-			continue;
-		}
-		BUG_ON(PageNosave(page));
-		if (PageNosaveFree(page))
-			continue;
-		save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
-		if (!save)
-			return -ENOMEM;
-		save->next = highmem_copy;
-		save->page = page;
-		save->data = (void *) get_zeroed_page(GFP_ATOMIC);
-		if (!save->data) {
-			kfree(save);
-			return -ENOMEM;
-		}
-		kaddr = kmap_atomic(page, KM_USER0);
-		memcpy(save->data, kaddr, PAGE_SIZE);
-		kunmap_atomic(kaddr, KM_USER0);
-		highmem_copy = save;
-	}
-	return 0;
-}
-#endif /* CONFIG_HIGHMEM */
-
-
-static int save_highmem(void)
-{
-#ifdef CONFIG_HIGHMEM
-	struct zone *zone;
-	int res = 0;
-
-	pr_debug("swsusp: Saving Highmem\n");
-	for_each_zone (zone) {
-		if (is_highmem(zone))
-			res = save_highmem_zone(zone);
-		if (res)
-			return res;
-	}
-#endif
-	return 0;
-}
-
-static int restore_highmem(void)
-{
-#ifdef CONFIG_HIGHMEM
-	printk("swsusp: Restoring Highmem\n");
-	while (highmem_copy) {
-		struct highmem_page *save = highmem_copy;
-		void *kaddr;
-		highmem_copy = save->next;
-
-		kaddr = kmap_atomic(save->page, KM_USER0);
-		memcpy(kaddr, save->data, PAGE_SIZE);
-		kunmap_atomic(kaddr, KM_USER0);
-		free_page((long) save->data);
-		kfree(save);
-	}
-#endif
-	return 0;
-}
-
-
-static int pfn_is_nosave(unsigned long pfn)
-{
-	unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
-	unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
-	return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
-}
-
-/**
- *	saveable - Determine whether a page should be cloned or not.
- *	@pfn:	The page
- *
- *	We save a page if it's Reserved, and not in the range of pages
- *	statically defined as 'unsaveable', or if it isn't reserved, and
- *	isn't part of a free chunk of pages.
- */
-
-static int saveable(struct zone * zone, unsigned long * zone_pfn)
-{
-	unsigned long pfn = *zone_pfn + zone->zone_start_pfn;
-	struct page * page;
-
-	if (!pfn_valid(pfn))
-		return 0;
-
-	page = pfn_to_page(pfn);
-	BUG_ON(PageReserved(page) && PageNosave(page));
-	if (PageNosave(page))
-		return 0;
-	if (PageReserved(page) && pfn_is_nosave(pfn)) {
-		pr_debug("[nosave pfn 0x%lx]", pfn);
-		return 0;
-	}
-	if (PageNosaveFree(page))
-		return 0;
-
-	return 1;
-}
-
-static void count_data_pages(void)
-{
-	struct zone *zone;
-	unsigned long zone_pfn;
-
-	nr_copy_pages = 0;
-
-	for_each_zone (zone) {
-		if (is_highmem(zone))
-			continue;
-		mark_free_pages(zone);
-		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
-			nr_copy_pages += saveable(zone, &zone_pfn);
-	}
-}
-
-
-static void copy_data_pages(void)
-{
-	struct zone *zone;
-	unsigned long zone_pfn;
-	struct pbe * pbe = pagedir_nosave;
-
-	pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages);
-	for_each_zone (zone) {
-		if (is_highmem(zone))
-			continue;
-		mark_free_pages(zone);
-		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
-			if (saveable(zone, &zone_pfn)) {
-				struct page * page;
-				page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
-				BUG_ON(!pbe);
-				pbe->orig_address = (long) page_address(page);
-				/* copy_page is not usable for copying task structs. */
-				memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE);
-				pbe = pbe->next;
-			}
-		}
-	}
-	BUG_ON(pbe);
-}
-
-
-/**
- *	calc_nr - Determine the number of pages needed for a pbe list.
- */
-
-static int calc_nr(int nr_copy)
-{
-	int extra = 0;
-	int mod = !!(nr_copy % PBES_PER_PAGE);
-	int diff = (nr_copy / PBES_PER_PAGE) + mod;
-
-	do {
-		extra += diff;
-		nr_copy += diff;
-		mod = !!(nr_copy % PBES_PER_PAGE);
-		diff = (nr_copy / PBES_PER_PAGE) + mod - extra;
-	} while (diff > 0);
-
-	return nr_copy;
-}
-
-/**
- *	free_pagedir - free pages allocated with alloc_pagedir()
- */
-
-static inline void free_pagedir(struct pbe *pblist)
-{
-	struct pbe *pbe;
-
-	while (pblist) {
-		pbe = (pblist + PB_PAGE_SKIP)->next;
-		free_page((unsigned long)pblist);
-		pblist = pbe;
-	}
-}
-
-/**
- *	fill_pb_page - Create a list of PBEs on a given memory page
- */
-
-static inline void fill_pb_page(struct pbe *pbpage)
-{
-	struct pbe *p;
-
-	p = pbpage;
-	pbpage += PB_PAGE_SKIP;
-	do
-		p->next = p + 1;
-	while (++p < pbpage);
-}
-
-/**
- *	create_pbe_list - Create a list of PBEs on top of a given chain
- *	of memory pages allocated with alloc_pagedir()
- */
-
-static void create_pbe_list(struct pbe *pblist, unsigned nr_pages)
-{
-	struct pbe *pbpage, *p;
-	unsigned num = PBES_PER_PAGE;
-
-	for_each_pb_page (pbpage, pblist) {
-		if (num >= nr_pages)
-			break;
-
-		fill_pb_page(pbpage);
-		num += PBES_PER_PAGE;
-	}
-	if (pbpage) {
-		for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++)
-			p->next = p + 1;
-		p->next = NULL;
-	}
-	pr_debug("create_pbe_list(): initialized %d PBEs\n", num);
-}
-
-/**
- *	alloc_pagedir - Allocate the page directory.
- *
- *	First, determine exactly how many pages we need and
- *	allocate them.
- *
- *	We arrange the pages in a chain: each page is an array of PBES_PER_PAGE
- *	struct pbe elements (pbes) and the last element in the page points
- *	to the next page.
- *
- *	On each page we set up a list of struct_pbe elements.
- */
-
-static struct pbe * alloc_pagedir(unsigned nr_pages)
-{
-	unsigned num;
-	struct pbe *pblist, *pbe;
-
-	if (!nr_pages)
-		return NULL;
-
-	pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
-	pblist = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
-	for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
-        		pbe = pbe->next, num += PBES_PER_PAGE) {
-		pbe += PB_PAGE_SKIP;
-		pbe->next = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
-	}
-	if (!pbe) { /* get_zeroed_page() failed */
-		free_pagedir(pblist);
-		pblist = NULL;
-        }
-	return pblist;
-}
-
-/**
- *	free_image_pages - Free pages allocated for snapshot
- */
-
-static void free_image_pages(void)
-{
-	struct pbe * p;
-
-	for_each_pbe (p, pagedir_save) {
-		if (p->address) {
-			ClearPageNosave(virt_to_page(p->address));
-			free_page(p->address);
-			p->address = 0;
-		}
-	}
-}
-
-/**
- *	alloc_image_pages - Allocate pages for the snapshot.
- */
-
-static int alloc_image_pages(void)
-{
-	struct pbe * p;
-
-	for_each_pbe (p, pagedir_save) {
-		p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
-		if (!p->address)
-			return -ENOMEM;
-		SetPageNosave(virt_to_page(p->address));
-	}
-	return 0;
-}
-
-void swsusp_free(void)
-{
-	BUG_ON(PageNosave(virt_to_page(pagedir_save)));
-	BUG_ON(PageNosaveFree(virt_to_page(pagedir_save)));
-	free_image_pages();
-	free_pagedir(pagedir_save);
-}
-
-
-/**
- *	enough_free_mem - Make sure we enough free memory to snapshot.
- *
- *	Returns TRUE or FALSE after checking the number of available
- *	free pages.
- */
-
-static int enough_free_mem(void)
-{
-	if (nr_free_pages() < (nr_copy_pages + PAGES_FOR_IO)) {
-		pr_debug("swsusp: Not enough free pages: Have %d\n",
-			 nr_free_pages());
-		return 0;
-	}
-	return 1;
-}
-
-
 /**
  *	enough_swap - Make sure we have enough swap to save the image.
  *
@@ -766,83 +543,14 @@ static int enough_free_mem(void)
  *	We should only consider resume_device.
  */
 
-static int enough_swap(void)
+int enough_swap(unsigned nr_pages)
 {
 	struct sysinfo i;
 
 	si_swapinfo(&i);
-	if (i.freeswap < (nr_copy_pages + PAGES_FOR_IO))  {
-		pr_debug("swsusp: Not enough swap. Need %ld\n",i.freeswap);
-		return 0;
-	}
-	return 1;
-}
-
-static int swsusp_alloc(void)
-{
-	int error;
-
-	pagedir_nosave = NULL;
-	nr_copy_pages = calc_nr(nr_copy_pages);
-
-	pr_debug("suspend: (pages needed: %d + %d free: %d)\n",
-		 nr_copy_pages, PAGES_FOR_IO, nr_free_pages());
-
-	if (!enough_free_mem())
-		return -ENOMEM;
-
-	if (!enough_swap())
-		return -ENOSPC;
-
-	if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) {
-		printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
-		return -ENOMEM;
-	}
-	create_pbe_list(pagedir_save, nr_copy_pages);
-	pagedir_nosave = pagedir_save;
-	if ((error = alloc_image_pages())) {
-		printk(KERN_ERR "suspend: Allocating image pages failed.\n");
-		swsusp_free();
-		return error;
-	}
-
-	nr_copy_pages_check = nr_copy_pages;
-	return 0;
-}
-
-static int suspend_prepare_image(void)
-{
-	int error;
-
-	pr_debug("swsusp: critical section: \n");
-	if (save_highmem()) {
-		printk(KERN_CRIT "Suspend machine: Not enough free pages for highmem\n");
-		restore_highmem();
-		return -ENOMEM;
-	}
-
-	drain_local_pages();
-	count_data_pages();
-	printk("swsusp: Need to copy %u pages\n", nr_copy_pages);
-
-	error = swsusp_alloc();
-	if (error)
-		return error;
-
-	/* During allocating of suspend pagedir, new cold pages may appear.
-	 * Kill them.
-	 */
-	drain_local_pages();
-	copy_data_pages();
-
-	/*
-	 * End of critical section. From now on, we can write to memory,
-	 * but we should not touch disk. This specially means we must _not_
-	 * touch swap space! Except we must write out our image of course.
-	 */
-
-	printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages );
-	return 0;
+	pr_debug("swsusp: available swap: %lu pages\n", i.freeswap);
+	return i.freeswap > (nr_pages + PAGES_FOR_IO +
+		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
 }
 
 
@@ -854,7 +562,7 @@ static int suspend_prepare_image(void)
 int swsusp_write(void)
 {
 	int error;
-	device_resume();
+
 	lock_swapdevices();
 	error = write_suspend_image();
 	/* This will unlock ignored swap devices since writing is finished */
@@ -864,14 +572,6 @@ int swsusp_write(void)
 }
 
 
-extern asmlinkage int swsusp_arch_suspend(void);
-extern asmlinkage int swsusp_arch_resume(void);
-
-
-asmlinkage int swsusp_save(void)
-{
-	return suspend_prepare_image();
-}
 
 int swsusp_suspend(void)
 {
@@ -886,23 +586,23 @@ int swsusp_suspend(void)
 	 * at resume time, and evil weirdness ensues.
 	 */
 	if ((error = device_power_down(PMSG_FREEZE))) {
+		printk(KERN_ERR "Some devices failed to power down, aborting suspend\n");
 		local_irq_enable();
 		return error;
 	}
 
 	if ((error = swsusp_swap_check())) {
-		printk(KERN_ERR "swsusp: FATAL: cannot find swap device, try "
-				"swapon -a!\n");
+		printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n");
+		device_power_up();
 		local_irq_enable();
 		return error;
 	}
 
 	save_processor_state();
 	if ((error = swsusp_arch_suspend()))
-		printk("Error %d suspending\n", error);
+		printk(KERN_ERR "Error %d suspending\n", error);
 	/* Restore control flow magically appears here */
 	restore_processor_state();
-	BUG_ON (nr_copy_pages_check != nr_copy_pages);
 	restore_highmem();
 	device_power_up();
 	local_irq_enable();
@@ -922,8 +622,14 @@ int swsusp_resume(void)
 	 * execution continues at place where swsusp_arch_suspend was called
          */
 	BUG_ON(!error);
+	/* The only reason why swsusp_arch_resume() can fail is memory being
+	 * very tight, so we have to free it as soon as we can to avoid
+	 * subsequent failures
+	 */
+	swsusp_free();
 	restore_processor_state();
 	restore_highmem();
+	touch_softlockup_watchdog();
 	device_power_up();
 	local_irq_enable();
 	return error;
@@ -936,54 +642,28 @@ int swsusp_resume(void)
  *
  *	We don't know which pages are usable until we allocate them.
  *
- *	Allocated but unusable (ie eaten) memory pages are linked together
- *	to create a list, so that we can free them easily
- *
- *	We could have used a type other than (void *)
- *	for this purpose, but ...
+ *	Allocated but unusable (ie eaten) memory pages are marked so that
+ *	swsusp_free() can release them
  */
-static void **eaten_memory = NULL;
-
-static inline void eat_page(void *page)
-{
-	void **c;
 
-	c = eaten_memory;
-	eaten_memory = page;
-	*eaten_memory = c;
-}
-
-static unsigned long get_usable_page(unsigned gfp_mask)
+unsigned long get_safe_page(gfp_t gfp_mask)
 {
 	unsigned long m;
 
-	m = get_zeroed_page(gfp_mask);
-	while (!PageNosaveFree(virt_to_page(m))) {
-		eat_page((void *)m);
+	do {
 		m = get_zeroed_page(gfp_mask);
-		if (!m)
-			break;
+		if (m && PageNosaveFree(virt_to_page(m)))
+			/* This is for swsusp_free() */
+			SetPageNosave(virt_to_page(m));
+	} while (m && PageNosaveFree(virt_to_page(m)));
+	if (m) {
+		/* This is for swsusp_free() */
+		SetPageNosave(virt_to_page(m));
+		SetPageNosaveFree(virt_to_page(m));
 	}
 	return m;
 }
 
-static void free_eaten_memory(void)
-{
-	unsigned long m;
-	void **c;
-	int i = 0;
-
-	c = eaten_memory;
-	while (c) {
-		m = (unsigned long)c;
-		c = *c;
-		free_page(m);
-		i++;
-	}
-	eaten_memory = NULL;
-	pr_debug("swsusp: %d unused pages freed\n", i);
-}
-
 /**
  *	check_pagedir - We ensure here that pages that the PBEs point to
  *	won't collide with pages where we're going to restore from the loaded
@@ -1001,7 +681,7 @@ static int check_pagedir(struct pbe *pblist)
 		p->address = 0UL;
 
 	for_each_pbe (p, pblist) {
-		p->address = get_usable_page(GFP_ATOMIC);
+		p->address = get_safe_page(GFP_ATOMIC);
 		if (!p->address)
 			return -ENOMEM;
 	}
@@ -1020,7 +700,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
 	unsigned long zone_pfn;
 	struct pbe *pbpage, *tail, *p;
 	void *m;
-	int rel = 0, error = 0;
+	int rel = 0;
 
 	if (!pblist) /* a sanity check */
 		return NULL;
@@ -1028,41 +708,37 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
 	pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n",
 			swsusp_info.pagedir_pages);
 
-	/* Set page flags */
+	/* Clear page flags */
 
 	for_each_zone (zone) {
         	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
-                	SetPageNosaveFree(pfn_to_page(zone_pfn +
+        		if (pfn_valid(zone_pfn + zone->zone_start_pfn))
+                		ClearPageNosaveFree(pfn_to_page(zone_pfn +
 					zone->zone_start_pfn));
 	}
 
-	/* Clear orig addresses */
+	/* Mark orig addresses */
 
 	for_each_pbe (p, pblist)
-		ClearPageNosaveFree(virt_to_page(p->orig_address));
+		SetPageNosaveFree(virt_to_page(p->orig_address));
 
 	tail = pblist + PB_PAGE_SKIP;
 
 	/* Relocate colliding pages */
 
 	for_each_pb_page (pbpage, pblist) {
-		if (!PageNosaveFree(virt_to_page((unsigned long)pbpage))) {
-			m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD);
-			if (!m) {
-				error = -ENOMEM;
-				break;
-			}
+		if (PageNosaveFree(virt_to_page((unsigned long)pbpage))) {
+			m = (void *)get_safe_page(GFP_ATOMIC | __GFP_COLD);
+			if (!m)
+				return NULL;
 			memcpy(m, (void *)pbpage, PAGE_SIZE);
 			if (pbpage == pblist)
 				pblist = (struct pbe *)m;
 			else
 				tail->next = (struct pbe *)m;
-
-			eat_page((void *)pbpage);
 			pbpage = (struct pbe *)m;
 
 			/* We have to link the PBEs again */
-
 			for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++)
 				if (p->next) /* needed to save the end */
 					p->next = p + 1;
@@ -1072,14 +748,13 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
 		tail = pbpage + PB_PAGE_SKIP;
 	}
 
-	if (error) {
-		printk("\nswsusp: Out of memory\n\n");
-		free_pagedir(pblist);
-		free_eaten_memory();
-		pblist = NULL;
+	/* This is for swsusp_free() */
+	for_each_pb_page (pbpage, pblist) {
+		SetPageNosave(virt_to_page(pbpage));
+		SetPageNosaveFree(virt_to_page(pbpage));
 	}
-	else
-		printk("swsusp: Relocated %d pages\n", rel);
+
+	printk("swsusp: Relocated %d pages\n", rel);
 
 	return pblist;
 }
@@ -1179,7 +854,8 @@ static const char * sanity_check(void)
 	if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
 		return "machine";
 #if 0
-	if(swsusp_info.cpus != num_online_cpus())
+	/* We can't use number of online CPUs when we use hotplug to remove them ;-))) */
+	if (swsusp_info.cpus != num_possible_cpus())
 		return "number of cpus";
 #endif
 	return NULL;
@@ -1212,13 +888,14 @@ static int check_sig(void)
 		return error;
 	if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
 		memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
+		memcpy(key_iv, swsusp_header.key_iv, MAXKEY+MAXIV);
+		memset(swsusp_header.key_iv, 0, MAXKEY+MAXIV);
 
 		/*
 		 * Reset swap signature now.
 		 */
 		error = bio_write_page(0, &swsusp_header);
 	} else { 
-		printk(KERN_ERR "swsusp: Suspend partition has wrong signature?\n");
 		return -EINVAL;
 	}
 	if (!error)
@@ -1239,6 +916,10 @@ static int data_read(struct pbe *pblist)
 	int error = 0;
 	int i = 0;
 	int mod = swsusp_info.image_pages / 100;
+	void *tfm;
+
+	if ((error = crypto_init(0, &tfm)))
+		return error;
 
 	if (!mod)
 		mod = 1;
@@ -1250,14 +931,15 @@ static int data_read(struct pbe *pblist)
 		if (!(i % mod))
 			printk("\b\b\b\b%3d%%", i / mod);
 
-		error = bio_read_page(swp_offset(p->swap_address),
-				  (void *)p->address);
-		if (error)
+		if ((error = crypto_read(p, tfm))) {
+			crypto_exit(tfm);
 			return error;
+		}
 
 		i++;
 	}
 	printk("\b\b\b\bdone\n");
+	crypto_exit(tfm);
 	return error;
 }
 
@@ -1290,10 +972,8 @@ static int read_pagedir(struct pbe *pblist)
 			break;
 	}
 
-	if (error)
-		free_page((unsigned long)pblist);
-
-	BUG_ON(i != swsusp_info.pagedir_pages);
+	if (!error)
+		BUG_ON(i != swsusp_info.pagedir_pages);
 
 	return error;
 }
@@ -1331,18 +1011,10 @@ static int read_suspend_image(void)
 	/* Allocate memory for the image and read the data from swap */
 
 	error = check_pagedir(pagedir_nosave);
-	free_eaten_memory();
+
 	if (!error)
 		error = data_read(pagedir_nosave);
 
-	if (error) { /* We fail cleanly */
-		for_each_pbe (p, pagedir_nosave)
-			if (p->address) {
-				free_page(p->address);
-				p->address = 0UL;
-			}
-		free_pagedir(pagedir_nosave);
-	}
 	return error;
 }
 
@@ -1385,6 +1057,7 @@ int swsusp_read(void)
 
 	error = read_suspend_image();
 	blkdev_put(resume_bdev);
+	memset(key_iv, 0, MAXKEY+MAXIV);
 
 	if (!error)
 		pr_debug("swsusp: Reading resume file was successful\n");
diff --git a/kernel/printk.c b/kernel/printk.c
index 5092397fac29..3cb9708209bc 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -10,7 +10,7 @@
  * elsewhere, in preparation for a serial line console (someday).
  * Ted Ts'o, 2/11/93.
  * Modified for sysctl support, 1/8/97, Chris Horn.
- * Fixed SMP synchronization, 08/08/99, Manfred Spraul 
+ * Fixed SMP synchronization, 08/08/99, Manfred Spraul
  *     manfreds@colorfullife.com
  * Rewrote bits to get rid of console_lock
  *	01Mar01 Andrew Morton <andrewm@uow.edu.au>
@@ -148,7 +148,7 @@ static int __init console_setup(char *str)
 	if (!strcmp(str, "ttyb"))
 		strcpy(name, "ttyS1");
 #endif
-	for(s = name; *s; s++)
+	for (s = name; *s; s++)
 		if ((*s >= '0' && *s <= '9') || *s == ',')
 			break;
 	idx = simple_strtoul(s, NULL, 10);
@@ -169,11 +169,11 @@ static int __init log_buf_len_setup(char *str)
 		size = roundup_pow_of_two(size);
 	if (size > log_buf_len) {
 		unsigned long start, dest_idx, offset;
-		char * new_log_buf;
+		char *new_log_buf;
 
 		new_log_buf = alloc_bootmem(size);
 		if (!new_log_buf) {
-			printk("log_buf_len: allocation failed\n");
+			printk(KERN_WARNING "log_buf_len: allocation failed\n");
 			goto out;
 		}
 
@@ -193,10 +193,9 @@ static int __init log_buf_len_setup(char *str)
 		log_end -= offset;
 		spin_unlock_irqrestore(&logbuf_lock, flags);
 
-		printk("log_buf_len: %d\n", log_buf_len);
+		printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len);
 	}
 out:
-
 	return 1;
 }
 
@@ -217,7 +216,7 @@ __setup("log_buf_len=", log_buf_len_setup);
  *	9 -- Return number of unread characters in the log buffer
  *     10 -- Return size of the log buffer
  */
-int do_syslog(int type, char __user * buf, int len)
+int do_syslog(int type, char __user *buf, int len)
 {
 	unsigned long i, j, limit, count;
 	int do_clear = 0;
@@ -244,7 +243,8 @@ int do_syslog(int type, char __user * buf, int len)
 			error = -EFAULT;
 			goto out;
 		}
-		error = wait_event_interruptible(log_wait, (log_start - log_end));
+		error = wait_event_interruptible(log_wait,
+							(log_start - log_end));
 		if (error)
 			goto out;
 		i = 0;
@@ -264,7 +264,7 @@ int do_syslog(int type, char __user * buf, int len)
 			error = i;
 		break;
 	case 4:		/* Read/clear last kernel messages */
-		do_clear = 1; 
+		do_clear = 1;
 		/* FALL THRU */
 	case 3:		/* Read last kernel messages */
 		error = -EINVAL;
@@ -288,11 +288,11 @@ int do_syslog(int type, char __user * buf, int len)
 		limit = log_end;
 		/*
 		 * __put_user() could sleep, and while we sleep
-		 * printk() could overwrite the messages 
+		 * printk() could overwrite the messages
 		 * we try to copy to user space. Therefore
 		 * the messages are copied in reverse. <manfreds>
 		 */
-		for(i = 0; i < count && !error; i++) {
+		for (i = 0; i < count && !error; i++) {
 			j = limit-1-i;
 			if (j + log_buf_len < log_end)
 				break;
@@ -306,10 +306,10 @@ int do_syslog(int type, char __user * buf, int len)
 		if (error)
 			break;
 		error = i;
-		if(i != count) {
+		if (i != count) {
 			int offset = count-error;
 			/* buffer overflow during copy, correct user buffer. */
-			for(i=0;i<error;i++) {
+			for (i = 0; i < error; i++) {
 				if (__get_user(c,&buf[i+offset]) ||
 				    __put_user(c,&buf[i])) {
 					error = -EFAULT;
@@ -351,7 +351,7 @@ out:
 	return error;
 }
 
-asmlinkage long sys_syslog(int type, char __user * buf, int len)
+asmlinkage long sys_syslog(int type, char __user *buf, int len)
 {
 	return do_syslog(type, buf, len);
 }
@@ -404,21 +404,19 @@ static void call_console_drivers(unsigned long start, unsigned long end)
 	cur_index = start;
 	start_print = start;
 	while (cur_index != end) {
-		if (	msg_level < 0 &&
-			((end - cur_index) > 2) &&
-			LOG_BUF(cur_index + 0) == '<' &&
-			LOG_BUF(cur_index + 1) >= '0' &&
-			LOG_BUF(cur_index + 1) <= '7' &&
-			LOG_BUF(cur_index + 2) == '>')
-		{
+		if (msg_level < 0 && ((end - cur_index) > 2) &&
+				LOG_BUF(cur_index + 0) == '<' &&
+				LOG_BUF(cur_index + 1) >= '0' &&
+				LOG_BUF(cur_index + 1) <= '7' &&
+				LOG_BUF(cur_index + 2) == '>') {
 			msg_level = LOG_BUF(cur_index + 1) - '0';
 			cur_index += 3;
 			start_print = cur_index;
 		}
 		while (cur_index != end) {
 			char c = LOG_BUF(cur_index);
-			cur_index++;
 
+			cur_index++;
 			if (c == '\n') {
 				if (msg_level < 0) {
 					/*
@@ -461,7 +459,7 @@ static void zap_locks(void)
 	static unsigned long oops_timestamp;
 
 	if (time_after_eq(jiffies, oops_timestamp) &&
-			!time_after(jiffies, oops_timestamp + 30*HZ))
+			!time_after(jiffies, oops_timestamp + 30 * HZ))
 		return;
 
 	oops_timestamp = jiffies;
@@ -488,9 +486,14 @@ static int __init printk_time_setup(char *str)
 
 __setup("time", printk_time_setup);
 
+__attribute__((weak)) unsigned long long printk_clock(void)
+{
+	return sched_clock();
+}
+
 /*
  * This is printk.  It can be called from any context.  We want it to work.
- * 
+ *
  * We try to grab the console_sem.  If we succeed, it's easy - we log the output and
  * call the console drivers.  If we fail to get the semaphore we place the output
  * into the log buffer and return.  The current holder of the console_sem will
@@ -514,6 +517,9 @@ asmlinkage int printk(const char *fmt, ...)
 	return r;
 }
 
+/* cpu currently holding logbuf_lock */
+static volatile unsigned int printk_cpu = UINT_MAX;
+
 asmlinkage int vprintk(const char *fmt, va_list args)
 {
 	unsigned long flags;
@@ -522,11 +528,15 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 	static char printk_buf[1024];
 	static int log_level_unknown = 1;
 
-	if (unlikely(oops_in_progress))
+	preempt_disable();
+	if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id())
+		/* If a crash is occurring during printk() on this CPU,
+		 * make sure we can't deadlock */
 		zap_locks();
 
 	/* This stops the holder of console_sem just where we want him */
 	spin_lock_irqsave(&logbuf_lock, flags);
+	printk_cpu = smp_processor_id();
 
 	/* Emit the output into the temporary buffer */
 	printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args);
@@ -558,7 +568,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 					loglev_char = default_message_loglevel
 						+ '0';
 				}
-				t = sched_clock();
+				t = printk_clock();
 				nanosec_rem = do_div(t, 1000000000);
 				tlen = sprintf(tbuf,
 						"<%c>[%5lu.%06lu] ",
@@ -595,6 +605,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 		 * CPU until it is officially up.  We shouldn't be calling into
 		 * random console drivers on a CPU which doesn't exist yet..
 		 */
+		printk_cpu = UINT_MAX;
 		spin_unlock_irqrestore(&logbuf_lock, flags);
 		goto out;
 	}
@@ -604,6 +615,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 		 * We own the drivers.  We can drop the spinlock and let
 		 * release_console_sem() print the text
 		 */
+		printk_cpu = UINT_MAX;
 		spin_unlock_irqrestore(&logbuf_lock, flags);
 		console_may_schedule = 0;
 		release_console_sem();
@@ -613,9 +625,11 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 		 * allows the semaphore holder to proceed and to call the
 		 * console drivers with the output which we just produced.
 		 */
+		printk_cpu = UINT_MAX;
 		spin_unlock_irqrestore(&logbuf_lock, flags);
 	}
 out:
+	preempt_enable();
 	return printed_len;
 }
 EXPORT_SYMBOL(printk);
@@ -623,13 +637,19 @@ EXPORT_SYMBOL(vprintk);
 
 #else
 
-asmlinkage long sys_syslog(int type, char __user * buf, int len)
+asmlinkage long sys_syslog(int type, char __user *buf, int len)
 {
 	return 0;
 }
 
-int do_syslog(int type, char __user * buf, int len) { return 0; }
-static void call_console_drivers(unsigned long start, unsigned long end) {}
+int do_syslog(int type, char __user *buf, int len)
+{
+	return 0;
+}
+
+static void call_console_drivers(unsigned long start, unsigned long end)
+{
+}
 
 #endif
 
@@ -835,9 +855,9 @@ EXPORT_SYMBOL(console_start);
  * print any messages that were printed by the kernel before the
  * console driver was initialized.
  */
-void register_console(struct console * console)
+void register_console(struct console *console)
 {
-	int     i;
+	int i;
 	unsigned long flags;
 
 	if (preferred_console < 0)
@@ -862,7 +882,8 @@ void register_console(struct console * console)
 	 *	See if this console matches one we selected on
 	 *	the command line.
 	 */
-	for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) {
+	for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0];
+			i++) {
 		if (strcmp(console_cmdline[i].name, console->name) != 0)
 			continue;
 		if (console->index >= 0 &&
@@ -917,9 +938,9 @@ void register_console(struct console * console)
 }
 EXPORT_SYMBOL(register_console);
 
-int unregister_console(struct console * console)
+int unregister_console(struct console *console)
 {
-        struct console *a,*b;
+        struct console *a, *b;
 	int res = 1;
 
 	acquire_console_sem();
@@ -933,10 +954,10 @@ int unregister_console(struct console * console)
 				b->next = a->next;
 				res = 0;
 				break;
-			}  
+			}
 		}
 	}
-	
+
 	/* If last console is removed, we re-enable picking the first
 	 * one that gets registered. Without that, pmac early boot console
 	 * would prevent fbcon from taking over.
@@ -978,7 +999,7 @@ void tty_write_message(struct tty_struct *tty, char *msg)
 int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
 {
 	static DEFINE_SPINLOCK(ratelimit_lock);
-	static unsigned long toks = 10*5*HZ;
+	static unsigned long toks = 10 * 5 * HZ;
 	static unsigned long last_msg;
 	static int missed;
 	unsigned long flags;
@@ -991,6 +1012,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
 		toks = ratelimit_burst * ratelimit_jiffies;
 	if (toks >= ratelimit_jiffies) {
 		int lost = missed;
+
 		missed = 0;
 		toks -= ratelimit_jiffies;
 		spin_unlock_irqrestore(&ratelimit_lock, flags);
@@ -1005,7 +1027,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
 EXPORT_SYMBOL(__printk_ratelimit);
 
 /* minimum time in jiffies between messages */
-int printk_ratelimit_jiffies = 5*HZ;
+int printk_ratelimit_jiffies = 5 * HZ;
 
 /* number of messages we send before ratelimiting */
 int printk_ratelimit_burst = 10;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 8dcb8f6288bc..863eee8bff47 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -56,6 +56,10 @@ void ptrace_untrace(task_t *child)
 			signal_wake_up(child, 1);
 		}
 	}
+	if (child->signal->flags & SIGNAL_GROUP_EXIT) {
+		sigaddset(&child->pending.signal, SIGKILL);
+		signal_wake_up(child, 1);
+	}
 	spin_unlock(&child->sighand->siglock);
 }
 
@@ -77,8 +81,7 @@ void __ptrace_unlink(task_t *child)
 		SET_LINKS(child);
 	}
 
-	if (child->state == TASK_TRACED)
-		ptrace_untrace(child);
+	ptrace_untrace(child);
 }
 
 /*
@@ -118,6 +121,33 @@ int ptrace_check_attach(struct task_struct *child, int kill)
 	return ret;
 }
 
+static int may_attach(struct task_struct *task)
+{
+	if (!task->mm)
+		return -EPERM;
+	if (((current->uid != task->euid) ||
+	     (current->uid != task->suid) ||
+	     (current->uid != task->uid) ||
+	     (current->gid != task->egid) ||
+	     (current->gid != task->sgid) ||
+	     (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
+		return -EPERM;
+	smp_rmb();
+	if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
+		return -EPERM;
+
+	return security_ptrace(current, task);
+}
+
+int ptrace_may_attach(struct task_struct *task)
+{
+	int err;
+	task_lock(task);
+	err = may_attach(task);
+	task_unlock(task);
+	return !err;
+}
+
 int ptrace_attach(struct task_struct *task)
 {
 	int retval;
@@ -127,22 +157,10 @@ int ptrace_attach(struct task_struct *task)
 		goto bad;
 	if (task == current)
 		goto bad;
-	if (!task->mm)
-		goto bad;
-	if(((current->uid != task->euid) ||
-	    (current->uid != task->suid) ||
-	    (current->uid != task->uid) ||
- 	    (current->gid != task->egid) ||
- 	    (current->gid != task->sgid) ||
- 	    (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
-		goto bad;
-	smp_rmb();
-	if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
-		goto bad;
 	/* the same process cannot be attached many times */
 	if (task->ptrace & PT_PTRACED)
 		goto bad;
-	retval = security_ptrace(current, task);
+	retval = may_attach(task);
 	if (retval)
 		goto bad;
 
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f436993bd590..c4d159a21e04 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -45,6 +45,7 @@
 #include <linux/percpu.h>
 #include <linux/notifier.h>
 #include <linux/rcupdate.h>
+#include <linux/rcuref.h>
 #include <linux/cpu.h>
 
 /* Definition for rcupdate control block. */
@@ -70,7 +71,20 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
 
 /* Fake initialization required by compiler */
 static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
-static int maxbatch = 10;
+static int maxbatch = 10000;
+
+#ifndef __HAVE_ARCH_CMPXCHG
+/*
+ * We use an array of spinlocks for the rcurefs -- similar to ones in sparc
+ * 32 bit atomic_t implementations, and a hash function similar to that
+ * for our refcounting needs.
+ * Can't help multiprocessors which donot have cmpxchg :(
+ */
+
+spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = {
+	[0 ... (RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED
+};
+#endif
 
 /**
  * call_rcu - Queue an RCU callback for invocation after a grace period.
@@ -95,6 +109,10 @@ void fastcall call_rcu(struct rcu_head *head,
 	rdp = &__get_cpu_var(rcu_data);
 	*rdp->nxttail = head;
 	rdp->nxttail = &head->next;
+
+	if (unlikely(++rdp->count > 10000))
+		set_need_resched();
+
 	local_irq_restore(flags);
 }
 
@@ -126,10 +144,25 @@ void fastcall call_rcu_bh(struct rcu_head *head,
 	rdp = &__get_cpu_var(rcu_bh_data);
 	*rdp->nxttail = head;
 	rdp->nxttail = &head->next;
+	rdp->count++;
+/*
+ *  Should we directly call rcu_do_batch() here ?
+ *  if (unlikely(rdp->count > 10000))
+ *      rcu_do_batch(rdp);
+ */
 	local_irq_restore(flags);
 }
 
 /*
+ * Return the number of RCU batches processed thus far.  Useful
+ * for debug and statistics.
+ */
+long rcu_batches_completed(void)
+{
+	return rcu_ctrlblk.completed;
+}
+
+/*
  * Invoke the completed RCU callbacks. They are expected to be in
  * a per-cpu list.
  */
@@ -143,6 +176,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
 		next = rdp->donelist = list->next;
 		list->func(list);
 		list = next;
+		rdp->count--;
 		if (++count >= maxbatch)
 			break;
 	}
@@ -476,6 +510,7 @@ void synchronize_kernel(void)
 }
 
 module_param(maxbatch, int, 0);
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
 EXPORT_SYMBOL(call_rcu);  /* WARNING: GPL-only in April 2006. */
 EXPORT_SYMBOL(call_rcu_bh);  /* WARNING: GPL-only in April 2006. */
 EXPORT_SYMBOL_GPL(synchronize_rcu);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
new file mode 100644
index 000000000000..9b58f1eff3ca
--- /dev/null
+++ b/kernel/rcutorture.c
@@ -0,0 +1,492 @@
+/*
+ * Read-Copy Update /proc-based torture test facility
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2005
+ *
+ * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ *
+ * See also:  Documentation/RCU/torture.txt
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/rcuref.h>
+#include <linux/cpu.h>
+#include <linux/random.h>
+#include <linux/delay.h>
+#include <linux/byteorder/swabb.h>
+#include <linux/stat.h>
+
+MODULE_LICENSE("GPL");
+
+static int nreaders = -1;	/* # reader threads, defaults to 4*ncpus */
+static int stat_interval = 0;	/* Interval between stats, in seconds. */
+				/*  Defaults to "only at end of test". */
+static int verbose = 0;		/* Print more debug info. */
+
+MODULE_PARM(nreaders, "i");
+MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
+MODULE_PARM(stat_interval, "i");
+MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
+MODULE_PARM(verbose, "i");
+MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
+#define TORTURE_FLAG "rcutorture: "
+#define PRINTK_STRING(s) \
+	do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0)
+#define VERBOSE_PRINTK_STRING(s) \
+	do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0)
+#define VERBOSE_PRINTK_ERRSTRING(s) \
+	do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0)
+
+static char printk_buf[4096];
+
+static int nrealreaders;
+static struct task_struct *writer_task;
+static struct task_struct **reader_tasks;
+static struct task_struct *stats_task;
+
+#define RCU_TORTURE_PIPE_LEN 10
+
+struct rcu_torture {
+	struct rcu_head rtort_rcu;
+	int rtort_pipe_count;
+	struct list_head rtort_free;
+};
+
+static int fullstop = 0;	/* stop generating callbacks at test end. */
+static LIST_HEAD(rcu_torture_freelist);
+static struct rcu_torture *rcu_torture_current = NULL;
+static long rcu_torture_current_version = 0;
+static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
+static DEFINE_SPINLOCK(rcu_torture_lock);
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
+	{ 0 };
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) =
+	{ 0 };
+static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
+atomic_t n_rcu_torture_alloc;
+atomic_t n_rcu_torture_alloc_fail;
+atomic_t n_rcu_torture_free;
+
+/*
+ * Allocate an element from the rcu_tortures pool.
+ */
+struct rcu_torture *
+rcu_torture_alloc(void)
+{
+	struct list_head *p;
+
+	spin_lock(&rcu_torture_lock);
+	if (list_empty(&rcu_torture_freelist)) {
+		atomic_inc(&n_rcu_torture_alloc_fail);
+		spin_unlock(&rcu_torture_lock);
+		return NULL;
+	}
+	atomic_inc(&n_rcu_torture_alloc);
+	p = rcu_torture_freelist.next;
+	list_del_init(p);
+	spin_unlock(&rcu_torture_lock);
+	return container_of(p, struct rcu_torture, rtort_free);
+}
+
+/*
+ * Free an element to the rcu_tortures pool.
+ */
+static void
+rcu_torture_free(struct rcu_torture *p)
+{
+	atomic_inc(&n_rcu_torture_free);
+	spin_lock(&rcu_torture_lock);
+	list_add_tail(&p->rtort_free, &rcu_torture_freelist);
+	spin_unlock(&rcu_torture_lock);
+}
+
+static void
+rcu_torture_cb(struct rcu_head *p)
+{
+	int i;
+	struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
+
+	if (fullstop) {
+		/* Test is ending, just drop callbacks on the floor. */
+		/* The next initialization will pick up the pieces. */
+		return;
+	}
+	i = rp->rtort_pipe_count;
+	if (i > RCU_TORTURE_PIPE_LEN)
+		i = RCU_TORTURE_PIPE_LEN;
+	atomic_inc(&rcu_torture_wcount[i]);
+	if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN)
+		rcu_torture_free(rp);
+	else
+		call_rcu(p, rcu_torture_cb);
+}
+
+struct rcu_random_state {
+	unsigned long rrs_state;
+	unsigned long rrs_count;
+};
+
+#define RCU_RANDOM_MULT 39916801  /* prime */
+#define RCU_RANDOM_ADD	479001701 /* prime */
+#define RCU_RANDOM_REFRESH 10000
+
+#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 }
+
+/*
+ * Crude but fast random-number generator.  Uses a linear congruential
+ * generator, with occasional help from get_random_bytes().
+ */
+static long
+rcu_random(struct rcu_random_state *rrsp)
+{
+	long refresh;
+
+	if (--rrsp->rrs_count < 0) {
+		get_random_bytes(&refresh, sizeof(refresh));
+		rrsp->rrs_state += refresh;
+		rrsp->rrs_count = RCU_RANDOM_REFRESH;
+	}
+	rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
+	return swahw32(rrsp->rrs_state);
+}
+
+/*
+ * RCU torture writer kthread.  Repeatedly substitutes a new structure
+ * for that pointed to by rcu_torture_current, freeing the old structure
+ * after a series of grace periods (the "pipeline").
+ */
+static int
+rcu_torture_writer(void *arg)
+{
+	int i;
+	long oldbatch = rcu_batches_completed();
+	struct rcu_torture *rp;
+	struct rcu_torture *old_rp;
+	static DEFINE_RCU_RANDOM(rand);
+
+	VERBOSE_PRINTK_STRING("rcu_torture_writer task started");
+	do {
+		schedule_timeout_uninterruptible(1);
+		if (rcu_batches_completed() == oldbatch)
+			continue;
+		if ((rp = rcu_torture_alloc()) == NULL)
+			continue;
+		rp->rtort_pipe_count = 0;
+		udelay(rcu_random(&rand) & 0x3ff);
+		old_rp = rcu_torture_current;
+		rcu_assign_pointer(rcu_torture_current, rp);
+		smp_wmb();
+		if (old_rp != NULL) {
+			i = old_rp->rtort_pipe_count;
+			if (i > RCU_TORTURE_PIPE_LEN)
+				i = RCU_TORTURE_PIPE_LEN;
+			atomic_inc(&rcu_torture_wcount[i]);
+			old_rp->rtort_pipe_count++;
+			call_rcu(&old_rp->rtort_rcu, rcu_torture_cb);
+		}
+		rcu_torture_current_version++;
+		oldbatch = rcu_batches_completed();
+	} while (!kthread_should_stop() && !fullstop);
+	VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
+	while (!kthread_should_stop())
+		schedule_timeout_uninterruptible(1);
+	return 0;
+}
+
+/*
+ * RCU torture reader kthread.  Repeatedly dereferences rcu_torture_current,
+ * incrementing the corresponding element of the pipeline array.  The
+ * counter in the element should never be greater than 1, otherwise, the
+ * RCU implementation is broken.
+ */
+static int
+rcu_torture_reader(void *arg)
+{
+	int completed;
+	DEFINE_RCU_RANDOM(rand);
+	struct rcu_torture *p;
+	int pipe_count;
+
+	VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
+	do {
+		rcu_read_lock();
+		completed = rcu_batches_completed();
+		p = rcu_dereference(rcu_torture_current);
+		if (p == NULL) {
+			/* Wait for rcu_torture_writer to get underway */
+			rcu_read_unlock();
+			schedule_timeout_interruptible(HZ);
+			continue;
+		}
+		udelay(rcu_random(&rand) & 0x7f);
+		preempt_disable();
+		pipe_count = p->rtort_pipe_count;
+		if (pipe_count > RCU_TORTURE_PIPE_LEN) {
+			/* Should not happen, but... */
+			pipe_count = RCU_TORTURE_PIPE_LEN;
+		}
+		++__get_cpu_var(rcu_torture_count)[pipe_count];
+		completed = rcu_batches_completed() - completed;
+		if (completed > RCU_TORTURE_PIPE_LEN) {
+			/* Should not happen, but... */
+			completed = RCU_TORTURE_PIPE_LEN;
+		}
+		++__get_cpu_var(rcu_torture_batch)[completed];
+		preempt_enable();
+		rcu_read_unlock();
+		schedule();
+	} while (!kthread_should_stop() && !fullstop);
+	VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
+	while (!kthread_should_stop())
+		schedule_timeout_uninterruptible(1);
+	return 0;
+}
+
+/*
+ * Create an RCU-torture statistics message in the specified buffer.
+ */
+static int
+rcu_torture_printk(char *page)
+{
+	int cnt = 0;
+	int cpu;
+	int i;
+	long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
+	long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
+
+	for_each_cpu(cpu) {
+		for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
+			pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
+			batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
+		}
+	}
+	for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) {
+		if (pipesummary[i] != 0)
+			break;
+	}
+	cnt += sprintf(&page[cnt], "rcutorture: ");
+	cnt += sprintf(&page[cnt],
+		       "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d",
+		       rcu_torture_current,
+		       rcu_torture_current_version,
+		       list_empty(&rcu_torture_freelist),
+		       atomic_read(&n_rcu_torture_alloc),
+		       atomic_read(&n_rcu_torture_alloc_fail),
+		       atomic_read(&n_rcu_torture_free));
+	cnt += sprintf(&page[cnt], "\nrcutorture: ");
+	if (i > 1)
+		cnt += sprintf(&page[cnt], "!!! ");
+	cnt += sprintf(&page[cnt], "Reader Pipe: ");
+	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
+		cnt += sprintf(&page[cnt], " %ld", pipesummary[i]);
+	cnt += sprintf(&page[cnt], "\nrcutorture: ");
+	cnt += sprintf(&page[cnt], "Reader Batch: ");
+	for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++)
+		cnt += sprintf(&page[cnt], " %ld", batchsummary[i]);
+	cnt += sprintf(&page[cnt], "\nrcutorture: ");
+	cnt += sprintf(&page[cnt], "Free-Block Circulation: ");
+	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
+		cnt += sprintf(&page[cnt], " %d",
+			       atomic_read(&rcu_torture_wcount[i]));
+	}
+	cnt += sprintf(&page[cnt], "\n");
+	return cnt;
+}
+
+/*
+ * Print torture statistics.  Caller must ensure that there is only
+ * one call to this function at a given time!!!  This is normally
+ * accomplished by relying on the module system to only have one copy
+ * of the module loaded, and then by giving the rcu_torture_stats
+ * kthread full control (or the init/cleanup functions when rcu_torture_stats
+ * thread is not running).
+ */
+static void
+rcu_torture_stats_print(void)
+{
+	int cnt;
+
+	cnt = rcu_torture_printk(printk_buf);
+	printk(KERN_ALERT "%s", printk_buf);
+}
+
+/*
+ * Periodically prints torture statistics, if periodic statistics printing
+ * was specified via the stat_interval module parameter.
+ *
+ * No need to worry about fullstop here, since this one doesn't reference
+ * volatile state or register callbacks.
+ */
+static int
+rcu_torture_stats(void *arg)
+{
+	VERBOSE_PRINTK_STRING("rcu_torture_stats task started");
+	do {
+		schedule_timeout_interruptible(stat_interval * HZ);
+		rcu_torture_stats_print();
+	} while (!kthread_should_stop());
+	VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping");
+	return 0;
+}
+
+static void
+rcu_torture_cleanup(void)
+{
+	int i;
+
+	fullstop = 1;
+	if (writer_task != NULL) {
+		VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
+		kthread_stop(writer_task);
+	}
+	writer_task = NULL;
+
+	if (reader_tasks != NULL) {
+		for (i = 0; i < nrealreaders; i++) {
+			if (reader_tasks[i] != NULL) {
+				VERBOSE_PRINTK_STRING(
+					"Stopping rcu_torture_reader task");
+				kthread_stop(reader_tasks[i]);
+			}
+			reader_tasks[i] = NULL;
+		}
+		kfree(reader_tasks);
+		reader_tasks = NULL;
+	}
+	rcu_torture_current = NULL;
+
+	if (stats_task != NULL) {
+		VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
+		kthread_stop(stats_task);
+	}
+	stats_task = NULL;
+
+	/* Wait for all RCU callbacks to fire.  */
+
+	for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++)
+		synchronize_rcu();
+	rcu_torture_stats_print();  /* -After- the stats thread is stopped! */
+	PRINTK_STRING("--- End of test");
+}
+
+static int
+rcu_torture_init(void)
+{
+	int i;
+	int cpu;
+	int firsterr = 0;
+
+	/* Process args and tell the world that the torturer is on the job. */
+
+	if (nreaders >= 0)
+		nrealreaders = nreaders;
+	else
+		nrealreaders = 2 * num_online_cpus();
+	printk(KERN_ALERT TORTURE_FLAG
+	       "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n",
+	       nrealreaders, stat_interval, verbose);
+	fullstop = 0;
+
+	/* Set up the freelist. */
+
+	INIT_LIST_HEAD(&rcu_torture_freelist);
+	for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) {
+		list_add_tail(&rcu_tortures[i].rtort_free,
+			      &rcu_torture_freelist);
+	}
+
+	/* Initialize the statistics so that each run gets its own numbers. */
+
+	rcu_torture_current = NULL;
+	rcu_torture_current_version = 0;
+	atomic_set(&n_rcu_torture_alloc, 0);
+	atomic_set(&n_rcu_torture_alloc_fail, 0);
+	atomic_set(&n_rcu_torture_free, 0);
+	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
+		atomic_set(&rcu_torture_wcount[i], 0);
+	for_each_cpu(cpu) {
+		for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
+			per_cpu(rcu_torture_count, cpu)[i] = 0;
+			per_cpu(rcu_torture_batch, cpu)[i] = 0;
+		}
+	}
+
+	/* Start up the kthreads. */
+
+	VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task");
+	writer_task = kthread_run(rcu_torture_writer, NULL,
+				  "rcu_torture_writer");
+	if (IS_ERR(writer_task)) {
+		firsterr = PTR_ERR(writer_task);
+		VERBOSE_PRINTK_ERRSTRING("Failed to create writer");
+		writer_task = NULL;
+		goto unwind;
+	}
+	reader_tasks = kmalloc(nrealreaders * sizeof(reader_tasks[0]),
+			       GFP_KERNEL);
+	if (reader_tasks == NULL) {
+		VERBOSE_PRINTK_ERRSTRING("out of memory");
+		firsterr = -ENOMEM;
+		goto unwind;
+	}
+	for (i = 0; i < nrealreaders; i++) {
+		VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task");
+		reader_tasks[i] = kthread_run(rcu_torture_reader, NULL,
+					      "rcu_torture_reader");
+		if (IS_ERR(reader_tasks[i])) {
+			firsterr = PTR_ERR(reader_tasks[i]);
+			VERBOSE_PRINTK_ERRSTRING("Failed to create reader");
+			reader_tasks[i] = NULL;
+			goto unwind;
+		}
+	}
+	if (stat_interval > 0) {
+		VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task");
+		stats_task = kthread_run(rcu_torture_stats, NULL,
+					"rcu_torture_stats");
+		if (IS_ERR(stats_task)) {
+			firsterr = PTR_ERR(stats_task);
+			VERBOSE_PRINTK_ERRSTRING("Failed to create stats");
+			stats_task = NULL;
+			goto unwind;
+		}
+	}
+	return 0;
+
+unwind:
+	rcu_torture_cleanup();
+	return firsterr;
+}
+
+module_init(rcu_torture_init);
+module_exit(rcu_torture_cleanup);
diff --git a/kernel/resource.c b/kernel/resource.c
index 26967e042201..92285d822de6 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -430,10 +430,9 @@ EXPORT_SYMBOL(adjust_resource);
  */
 struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name)
 {
-	struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL);
+	struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
 
 	if (res) {
-		memset(res, 0, sizeof(*res));
 		res->name = name;
 		res->start = start;
 		res->end = start + n - 1;
diff --git a/kernel/sched.c b/kernel/sched.c
index a646e4f36c41..b4f4eb613537 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -294,6 +294,10 @@ static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
 
 static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
 {
+#ifdef CONFIG_DEBUG_SPINLOCK
+	/* this is a valid case when another task releases the spinlock */
+	rq->lock.owner = current;
+#endif
 	spin_unlock_irq(&rq->lock);
 }
 
@@ -875,7 +879,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
  * smp_call_function() if an IPI is sent by the same process we are
  * waiting to become inactive.
  */
-void wait_task_inactive(task_t * p)
+void wait_task_inactive(task_t *p)
 {
 	unsigned long flags;
 	runqueue_t *rq;
@@ -966,8 +970,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 		int local_group;
 		int i;
 
+		/* Skip over this group if it has no CPUs allowed */
+		if (!cpus_intersects(group->cpumask, p->cpus_allowed))
+			goto nextgroup;
+
 		local_group = cpu_isset(this_cpu, group->cpumask);
-		/* XXX: put a cpus allowed check */
 
 		/* Tally up the load of all CPUs in the group */
 		avg_load = 0;
@@ -992,6 +999,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 			min_load = avg_load;
 			idlest = group;
 		}
+nextgroup:
 		group = group->next;
 	} while (group != sd->groups);
 
@@ -1003,13 +1011,18 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 /*
  * find_idlest_queue - find the idlest runqueue among the cpus in group.
  */
-static int find_idlest_cpu(struct sched_group *group, int this_cpu)
+static int
+find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 {
+	cpumask_t tmp;
 	unsigned long load, min_load = ULONG_MAX;
 	int idlest = -1;
 	int i;
 
-	for_each_cpu_mask(i, group->cpumask) {
+	/* Traverse only the allowed CPUs */
+	cpus_and(tmp, group->cpumask, p->cpus_allowed);
+
+	for_each_cpu_mask(i, tmp) {
 		load = source_load(i, 0);
 
 		if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -1052,7 +1065,7 @@ static int sched_balance_self(int cpu, int flag)
 		if (!group)
 			goto nextlevel;
 
-		new_cpu = find_idlest_cpu(group, cpu);
+		new_cpu = find_idlest_cpu(group, t, cpu);
 		if (new_cpu == -1 || new_cpu == cpu)
 			goto nextlevel;
 
@@ -1127,7 +1140,7 @@ static inline int wake_idle(int cpu, task_t *p)
  *
  * returns failure only if the task is already active.
  */
-static int try_to_wake_up(task_t * p, unsigned int state, int sync)
+static int try_to_wake_up(task_t *p, unsigned int state, int sync)
 {
 	int cpu, this_cpu, success = 0;
 	unsigned long flags;
@@ -1252,6 +1265,16 @@ out_activate:
 	}
 
 	/*
+	 * Tasks that have marked their sleep as noninteractive get
+	 * woken up without updating their sleep average. (i.e. their
+	 * sleep is handled in a priority-neutral manner, no priority
+	 * boost and no penalty.)
+	 */
+	if (old_state & TASK_NONINTERACTIVE)
+		__activate_task(p, rq);
+	else
+		activate_task(p, rq, cpu == this_cpu);
+	/*
 	 * Sync wakeups (i.e. those types of wakeups where the waker
 	 * has indicated that it will leave the CPU in short order)
 	 * don't trigger a preemption, if the woken up task will run on
@@ -1259,7 +1282,6 @@ out_activate:
 	 * the waker guarantees that the freshly woken up task is going
 	 * to be considered on this CPU.)
 	 */
-	activate_task(p, rq, cpu == this_cpu);
 	if (!sync || cpu != this_cpu) {
 		if (TASK_PREEMPTS_CURR(p, rq))
 			resched_task(rq->curr);
@@ -1274,7 +1296,7 @@ out:
 	return success;
 }
 
-int fastcall wake_up_process(task_t * p)
+int fastcall wake_up_process(task_t *p)
 {
 	return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
 				 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
@@ -1353,7 +1375,7 @@ void fastcall sched_fork(task_t *p, int clone_flags)
  * that must be done for every newly created context, then puts the task
  * on the runqueue and wakes it.
  */
-void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
+void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
 {
 	unsigned long flags;
 	int this_cpu, cpu;
@@ -1436,7 +1458,7 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
  * artificially, because any timeslice recovered here
  * was given away by the parent in the first place.)
  */
-void fastcall sched_exit(task_t * p)
+void fastcall sched_exit(task_t *p)
 {
 	unsigned long flags;
 	runqueue_t *rq;
@@ -1446,7 +1468,7 @@ void fastcall sched_exit(task_t * p)
 	 * the sleep_avg of the parent as well.
 	 */
 	rq = task_rq_lock(p->parent, &flags);
-	if (p->first_time_slice) {
+	if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
 		p->parent->time_slice += p->time_slice;
 		if (unlikely(p->parent->time_slice > task_timeslice(p)))
 			p->parent->time_slice = task_timeslice(p);
@@ -1478,6 +1500,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
 
 /**
  * finish_task_switch - clean up after a task-switch
+ * @rq: runqueue associated with task-switch
  * @prev: the thread we just switched away from.
  *
  * finish_task_switch must be called after the context switch, paired
@@ -1752,7 +1775,8 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
  */
 static inline
 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
-	     struct sched_domain *sd, enum idle_type idle, int *all_pinned)
+		     struct sched_domain *sd, enum idle_type idle,
+		     int *all_pinned)
 {
 	/*
 	 * We do not migrate tasks that are:
@@ -1882,10 +1906,11 @@ out:
  */
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
-		   unsigned long *imbalance, enum idle_type idle)
+		   unsigned long *imbalance, enum idle_type idle, int *sd_idle)
 {
 	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
 	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+	unsigned long max_pull;
 	int load_idx;
 
 	max_load = this_load = total_load = total_pwr = 0;
@@ -1907,6 +1932,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		avg_load = 0;
 
 		for_each_cpu_mask(i, group->cpumask) {
+			if (*sd_idle && !idle_cpu(i))
+				*sd_idle = 0;
+
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
 				load = target_load(i, load_idx);
@@ -1932,7 +1960,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		group = group->next;
 	} while (group != sd->groups);
 
-	if (!busiest || this_load >= max_load)
+	if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
 		goto out_balanced;
 
 	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
@@ -1952,8 +1980,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 * by pulling tasks to us.  Be careful of negative numbers as they'll
 	 * appear as very large values with unsigned longs.
 	 */
+
+	/* Don't want to pull so many tasks that a group would go idle */
+	max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
+
 	/* How much load to actually move to equalise the imbalance */
-	*imbalance = min((max_load - avg_load) * busiest->cpu_power,
+	*imbalance = min(max_pull * busiest->cpu_power,
 				(avg_load - this_load) * this->cpu_power)
 			/ SCHED_LOAD_SCALE;
 
@@ -2050,11 +2082,14 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 	unsigned long imbalance;
 	int nr_moved, all_pinned = 0;
 	int active_balance = 0;
+	int sd_idle = 0;
+
+	if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
+		sd_idle = 1;
 
-	spin_lock(&this_rq->lock);
 	schedstat_inc(sd, lb_cnt[idle]);
 
-	group = find_busiest_group(sd, this_cpu, &imbalance, idle);
+	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
 	if (!group) {
 		schedstat_inc(sd, lb_nobusyg[idle]);
 		goto out_balanced;
@@ -2078,19 +2113,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 		 * still unbalanced. nr_moved simply stays zero, so it is
 		 * correctly treated as an imbalance.
 		 */
-		double_lock_balance(this_rq, busiest);
+		double_rq_lock(this_rq, busiest);
 		nr_moved = move_tasks(this_rq, this_cpu, busiest,
-						imbalance, sd, idle,
-						&all_pinned);
-		spin_unlock(&busiest->lock);
+					imbalance, sd, idle, &all_pinned);
+		double_rq_unlock(this_rq, busiest);
 
 		/* All tasks on this runqueue were pinned by CPU affinity */
 		if (unlikely(all_pinned))
 			goto out_balanced;
 	}
 
-	spin_unlock(&this_rq->lock);
-
 	if (!nr_moved) {
 		schedstat_inc(sd, lb_failed[idle]);
 		sd->nr_balance_failed++;
@@ -2098,6 +2130,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
 
 			spin_lock(&busiest->lock);
+
+			/* don't kick the migration_thread, if the curr
+			 * task on busiest cpu can't be moved to this_cpu
+			 */
+			if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+				spin_unlock(&busiest->lock);
+				all_pinned = 1;
+				goto out_one_pinned;
+			}
+
 			if (!busiest->active_balance) {
 				busiest->active_balance = 1;
 				busiest->push_cpu = this_cpu;
@@ -2130,19 +2172,23 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 			sd->balance_interval *= 2;
 	}
 
+	if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+		return -1;
 	return nr_moved;
 
 out_balanced:
-	spin_unlock(&this_rq->lock);
-
 	schedstat_inc(sd, lb_balanced[idle]);
 
 	sd->nr_balance_failed = 0;
+
+out_one_pinned:
 	/* tune up the balancing interval */
 	if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
 			(sd->balance_interval < sd->max_interval))
 		sd->balance_interval *= 2;
 
+	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+		return -1;
 	return 0;
 }
 
@@ -2160,9 +2206,13 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 	runqueue_t *busiest = NULL;
 	unsigned long imbalance;
 	int nr_moved = 0;
+	int sd_idle = 0;
+
+	if (sd->flags & SD_SHARE_CPUPOWER)
+		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
-	group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
+	group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
 	if (!group) {
 		schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
 		goto out_balanced;
@@ -2176,22 +2226,30 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 
 	BUG_ON(busiest == this_rq);
 
-	/* Attempt to move tasks */
-	double_lock_balance(this_rq, busiest);
-
 	schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
-	nr_moved = move_tasks(this_rq, this_cpu, busiest,
+
+	nr_moved = 0;
+	if (busiest->nr_running > 1) {
+		/* Attempt to move tasks */
+		double_lock_balance(this_rq, busiest);
+		nr_moved = move_tasks(this_rq, this_cpu, busiest,
 					imbalance, sd, NEWLY_IDLE, NULL);
-	if (!nr_moved)
+		spin_unlock(&busiest->lock);
+	}
+
+	if (!nr_moved) {
 		schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
-	else
+		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+			return -1;
+	} else
 		sd->nr_balance_failed = 0;
 
-	spin_unlock(&busiest->lock);
 	return nr_moved;
 
 out_balanced:
 	schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+		return -1;
 	sd->nr_balance_failed = 0;
 	return 0;
 }
@@ -2316,7 +2374,11 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
 
 		if (j - sd->last_balance >= interval) {
 			if (load_balance(this_cpu, this_rq, sd, idle)) {
-				/* We've pulled tasks over so no longer idle */
+				/*
+				 * We've pulled tasks over so either we're no
+				 * longer idle, or one of our SMT siblings is
+				 * not idle.
+				 */
 				idle = NOT_IDLE;
 			}
 			sd->last_balance += interval;
@@ -2449,8 +2511,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 		cpustat->idle = cputime64_add(cpustat->idle, tmp);
 	/* Account for system time used */
 	acct_update_integrals(p);
-	/* Update rss highwater mark */
-	update_mem_hiwater(p);
 }
 
 /*
@@ -2575,6 +2635,13 @@ out:
 }
 
 #ifdef CONFIG_SCHED_SMT
+static inline void wakeup_busy_runqueue(runqueue_t *rq)
+{
+	/* If an SMT runqueue is sleeping due to priority reasons wake it up */
+	if (rq->curr == rq->idle && rq->nr_running)
+		resched_task(rq->idle);
+}
+
 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 {
 	struct sched_domain *tmp, *sd = NULL;
@@ -2608,12 +2675,7 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 	for_each_cpu_mask(i, sibling_map) {
 		runqueue_t *smt_rq = cpu_rq(i);
 
-		/*
-		 * If an SMT sibling task is sleeping due to priority
-		 * reasons wake it up now.
-		 */
-		if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
-			resched_task(smt_rq->idle);
+		wakeup_busy_runqueue(smt_rq);
 	}
 
 	for_each_cpu_mask(i, sibling_map)
@@ -2624,6 +2686,16 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 	 */
 }
 
+/*
+ * number of 'lost' timeslices this task wont be able to fully
+ * utilize, if another task runs on a sibling. This models the
+ * slowdown effect of other tasks running on siblings:
+ */
+static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
+{
+	return p->time_slice * (100 - sd->per_cpu_gain) / 100;
+}
+
 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
 {
 	struct sched_domain *tmp, *sd = NULL;
@@ -2667,6 +2739,10 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
 		runqueue_t *smt_rq = cpu_rq(i);
 		task_t *smt_curr = smt_rq->curr;
 
+		/* Kernel threads do not participate in dependent sleeping */
+		if (!p->mm || !smt_curr->mm || rt_task(p))
+			goto check_smt_task;
+
 		/*
 		 * If a user task with lower static priority than the
 		 * running task on the SMT sibling is trying to schedule,
@@ -2675,21 +2751,45 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
 		 * task from using an unfair proportion of the
 		 * physical cpu's resources. -ck
 		 */
-		if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
-			task_timeslice(p) || rt_task(smt_curr)) &&
-			p->mm && smt_curr->mm && !rt_task(p))
-				ret = 1;
+		if (rt_task(smt_curr)) {
+			/*
+			 * With real time tasks we run non-rt tasks only
+			 * per_cpu_gain% of the time.
+			 */
+			if ((jiffies % DEF_TIMESLICE) >
+				(sd->per_cpu_gain * DEF_TIMESLICE / 100))
+					ret = 1;
+		} else
+			if (smt_curr->static_prio < p->static_prio &&
+				!TASK_PREEMPTS_CURR(p, smt_rq) &&
+				smt_slice(smt_curr, sd) > task_timeslice(p))
+					ret = 1;
+
+check_smt_task:
+		if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
+			rt_task(smt_curr))
+				continue;
+		if (!p->mm) {
+			wakeup_busy_runqueue(smt_rq);
+			continue;
+		}
 
 		/*
-		 * Reschedule a lower priority task on the SMT sibling,
-		 * or wake it up if it has been put to sleep for priority
-		 * reasons.
+		 * Reschedule a lower priority task on the SMT sibling for
+		 * it to be put to sleep, or wake it up if it has been put to
+		 * sleep for priority reasons to see if it should run now.
 		 */
-		if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
-			task_timeslice(smt_curr) || rt_task(p)) &&
-			smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
-			(smt_curr == smt_rq->idle && smt_rq->nr_running))
-				resched_task(smt_curr);
+		if (rt_task(p)) {
+			if ((jiffies % DEF_TIMESLICE) >
+				(sd->per_cpu_gain * DEF_TIMESLICE / 100))
+					resched_task(smt_curr);
+		} else {
+			if (TASK_PREEMPTS_CURR(p, smt_rq) &&
+				smt_slice(p, sd) > task_timeslice(smt_curr))
+					resched_task(smt_curr);
+			else
+				wakeup_busy_runqueue(smt_rq);
+		}
 	}
 out_unlock:
 	for_each_cpu_mask(i, sibling_map)
@@ -2887,6 +2987,7 @@ switch_tasks:
 	if (next == rq->idle)
 		schedstat_inc(rq, sched_goidle);
 	prefetch(next);
+	prefetch_stack(next);
 	clear_tsk_need_resched(prev);
 	rcu_qsctr_inc(task_cpu(prev));
 
@@ -3014,7 +3115,8 @@ need_resched:
 
 #endif /* CONFIG_PREEMPT */
 
-int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
+int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
+			  void *key)
 {
 	task_t *p = curr->private;
 	return try_to_wake_up(p, mode, sync);
@@ -3056,7 +3158,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
  * @key: is directly passed to the wakeup function
  */
 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
-				int nr_exclusive, void *key)
+			int nr_exclusive, void *key)
 {
 	unsigned long flags;
 
@@ -3088,7 +3190,8 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
  *
  * On UP it can prevent extra preemption.
  */
-void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+void fastcall
+__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
 {
 	unsigned long flags;
 	int sync = 1;
@@ -3279,7 +3382,8 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
 
 EXPORT_SYMBOL(interruptible_sleep_on);
 
-long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
+long fastcall __sched
+interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
 {
 	SLEEP_ON_VAR
 
@@ -3378,8 +3482,8 @@ EXPORT_SYMBOL(set_user_nice);
  */
 int can_nice(const task_t *p, const int nice)
 {
-	/* convert nice value [19,-20] to rlimit style value [0,39] */
-	int nice_rlim = 19 - nice;
+	/* convert nice value [19,-20] to rlimit style value [1,40] */
+	int nice_rlim = 20 - nice;
 	return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
 		capable(CAP_SYS_NICE));
 }
@@ -3498,7 +3602,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
  * @policy: new policy.
  * @param: structure containing the new RT priority.
  */
-int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param)
+int sched_setscheduler(struct task_struct *p, int policy,
+		       struct sched_param *param)
 {
 	int retval;
 	int oldprio, oldpolicy = -1;
@@ -3518,7 +3623,7 @@ recheck:
 	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
 	 */
 	if (param->sched_priority < 0 ||
-	    (p->mm &&  param->sched_priority > MAX_USER_RT_PRIO-1) ||
+	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
 	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
 		return -EINVAL;
 	if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
@@ -3581,7 +3686,8 @@ recheck:
 }
 EXPORT_SYMBOL_GPL(sched_setscheduler);
 
-static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 {
 	int retval;
 	struct sched_param lparam;
@@ -3848,7 +3954,7 @@ asmlinkage long sys_sched_yield(void)
 	if (rt_task(current))
 		target = rq->active;
 
-	if (current->array->nr_active == 1) {
+	if (array->nr_active == 1) {
 		schedstat_inc(rq, yld_act_empty);
 		if (!rq->expired->nr_active)
 			schedstat_inc(rq, yld_both_empty);
@@ -3912,7 +4018,7 @@ EXPORT_SYMBOL(cond_resched);
  * operations here to prevent schedule() from being called twice (once via
  * spin_unlock(), once by hand).
  */
-int cond_resched_lock(spinlock_t * lock)
+int cond_resched_lock(spinlock_t *lock)
 {
 	int ret = 0;
 
@@ -4095,7 +4201,7 @@ static inline struct task_struct *younger_sibling(struct task_struct *p)
 	return list_entry(p->sibling.next,struct task_struct,sibling);
 }
 
-static void show_task(task_t * p)
+static void show_task(task_t *p)
 {
 	task_t *relative;
 	unsigned state;
@@ -4121,7 +4227,7 @@ static void show_task(task_t * p)
 #endif
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	{
-		unsigned long * n = (unsigned long *) (p->thread_info+1);
+		unsigned long *n = (unsigned long *) (p->thread_info+1);
 		while (!*n)
 			n++;
 		free = (unsigned long) n - (unsigned long)(p->thread_info+1);
@@ -4330,7 +4436,7 @@ out:
  * thread migration by bumping thread off CPU then 'pushing' onto
  * another runqueue.
  */
-static int migration_thread(void * data)
+static int migration_thread(void *data)
 {
 	runqueue_t *rq;
 	int cpu = (long)data;
@@ -4779,7 +4885,7 @@ static int sd_parent_degenerate(struct sched_domain *sd,
  * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
  * hold the hotplug lock.
  */
-void cpu_attach_domain(struct sched_domain *sd, int cpu)
+static void cpu_attach_domain(struct sched_domain *sd, int cpu)
 {
 	runqueue_t *rq = cpu_rq(cpu);
 	struct sched_domain *tmp;
@@ -4802,7 +4908,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu)
 }
 
 /* cpus with isolated domains */
-cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
+static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
 
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
@@ -4830,8 +4936,8 @@ __setup ("isolcpus=", isolated_cpu_setup);
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
  */
-void init_sched_build_groups(struct sched_group groups[],
-			cpumask_t span, int (*group_fn)(int cpu))
+static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
+				    int (*group_fn)(int cpu))
 {
 	struct sched_group *first = NULL, *last = NULL;
 	cpumask_t covered = CPU_MASK_NONE;
@@ -4864,12 +4970,85 @@ void init_sched_build_groups(struct sched_group groups[],
 	last->next = first;
 }
 
+#define SD_NODES_PER_DOMAIN 16
 
-#ifdef ARCH_HAS_SCHED_DOMAIN
-extern void build_sched_domains(const cpumask_t *cpu_map);
-extern void arch_init_sched_domains(const cpumask_t *cpu_map);
-extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
-#else
+#ifdef CONFIG_NUMA
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain.  Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int find_next_best_node(int node, unsigned long *used_nodes)
+{
+	int i, n, val, min_val, best_node = 0;
+
+	min_val = INT_MAX;
+
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		/* Start at @node */
+		n = (node + i) % MAX_NUMNODES;
+
+		if (!nr_cpus_node(n))
+			continue;
+
+		/* Skip already used nodes */
+		if (test_bit(n, used_nodes))
+			continue;
+
+		/* Simple min distance search */
+		val = node_distance(node, n);
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	set_bit(best_node, used_nodes);
+	return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ * @size: number of nodes to include in this span
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span.  It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+static cpumask_t sched_domain_node_span(int node)
+{
+	int i;
+	cpumask_t span, nodemask;
+	DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+
+	cpus_clear(span);
+	bitmap_zero(used_nodes, MAX_NUMNODES);
+
+	nodemask = node_to_cpumask(node);
+	cpus_or(span, span, nodemask);
+	set_bit(node, used_nodes);
+
+	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
+		int next_node = find_next_best_node(node, used_nodes);
+		nodemask = node_to_cpumask(next_node);
+		cpus_or(span, span, nodemask);
+	}
+
+	return span;
+}
+#endif
+
+/*
+ * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
+ * can switch it on easily if needed.
+ */
 #ifdef CONFIG_SCHED_SMT
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
 static struct sched_group sched_group_cpus[NR_CPUS];
@@ -4891,36 +5070,20 @@ static int cpu_to_phys_group(int cpu)
 }
 
 #ifdef CONFIG_NUMA
-
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static int cpu_to_node_group(int cpu)
-{
-	return cpu_to_node(cpu);
-}
-#endif
-
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
 /*
- * The domains setup code relies on siblings not spanning
- * multiple nodes. Make sure the architecture has a proper
- * siblings map:
+ * The init_sched_build_groups can't handle what we want to do with node
+ * groups, so roll our own. Now each node has its own list of groups which
+ * gets dynamically allocated.
  */
-static void check_sibling_maps(void)
-{
-	int i, j;
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
 
-	for_each_online_cpu(i) {
-		for_each_cpu_mask(j, cpu_sibling_map[i]) {
-			if (cpu_to_node(i) != cpu_to_node(j)) {
-				printk(KERN_INFO "warning: CPU %d siblings map "
-					"to different node - isolating "
-					"them.\n", i);
-				cpu_sibling_map[i] = cpumask_of_cpu(i);
-				break;
-			}
-		}
-	}
+static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
+
+static int cpu_to_allnodes_group(int cpu)
+{
+	return cpu_to_node(cpu);
 }
 #endif
 
@@ -4928,9 +5091,24 @@ static void check_sibling_maps(void)
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
  */
-static void build_sched_domains(const cpumask_t *cpu_map)
+void build_sched_domains(const cpumask_t *cpu_map)
 {
 	int i;
+#ifdef CONFIG_NUMA
+	struct sched_group **sched_group_nodes = NULL;
+	struct sched_group *sched_group_allnodes = NULL;
+
+	/*
+	 * Allocate the per-node list of sched groups
+	 */
+	sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
+					   GFP_ATOMIC);
+	if (!sched_group_nodes) {
+		printk(KERN_WARNING "Can not alloc sched group node list\n");
+		return;
+	}
+	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+#endif
 
 	/*
 	 * Set up domains for cpus specified by the cpu_map.
@@ -4943,11 +5121,35 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 		cpus_and(nodemask, nodemask, *cpu_map);
 
 #ifdef CONFIG_NUMA
+		if (cpus_weight(*cpu_map)
+				> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+			if (!sched_group_allnodes) {
+				sched_group_allnodes
+					= kmalloc(sizeof(struct sched_group)
+							* MAX_NUMNODES,
+						  GFP_KERNEL);
+				if (!sched_group_allnodes) {
+					printk(KERN_WARNING
+					"Can not alloc allnodes sched group\n");
+					break;
+				}
+				sched_group_allnodes_bycpu[i]
+						= sched_group_allnodes;
+			}
+			sd = &per_cpu(allnodes_domains, i);
+			*sd = SD_ALLNODES_INIT;
+			sd->span = *cpu_map;
+			group = cpu_to_allnodes_group(i);
+			sd->groups = &sched_group_allnodes[group];
+			p = sd;
+		} else
+			p = NULL;
+
 		sd = &per_cpu(node_domains, i);
-		group = cpu_to_node_group(i);
 		*sd = SD_NODE_INIT;
-		sd->span = *cpu_map;
-		sd->groups = &sched_group_nodes[group];
+		sd->span = sched_domain_node_span(cpu_to_node(i));
+		sd->parent = p;
+		cpus_and(sd->span, sd->span, *cpu_map);
 #endif
 
 		p = sd;
@@ -4972,7 +5174,7 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 
 #ifdef CONFIG_SCHED_SMT
 	/* Set up CPU (sibling) groups */
-	for_each_online_cpu(i) {
+	for_each_cpu_mask(i, *cpu_map) {
 		cpumask_t this_sibling_map = cpu_sibling_map[i];
 		cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
 		if (i != first_cpu(this_sibling_map))
@@ -4997,8 +5199,77 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 
 #ifdef CONFIG_NUMA
 	/* Set up node groups */
-	init_sched_build_groups(sched_group_nodes, *cpu_map,
-					&cpu_to_node_group);
+	if (sched_group_allnodes)
+		init_sched_build_groups(sched_group_allnodes, *cpu_map,
+					&cpu_to_allnodes_group);
+
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		/* Set up node groups */
+		struct sched_group *sg, *prev;
+		cpumask_t nodemask = node_to_cpumask(i);
+		cpumask_t domainspan;
+		cpumask_t covered = CPU_MASK_NONE;
+		int j;
+
+		cpus_and(nodemask, nodemask, *cpu_map);
+		if (cpus_empty(nodemask)) {
+			sched_group_nodes[i] = NULL;
+			continue;
+		}
+
+		domainspan = sched_domain_node_span(i);
+		cpus_and(domainspan, domainspan, *cpu_map);
+
+		sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+		sched_group_nodes[i] = sg;
+		for_each_cpu_mask(j, nodemask) {
+			struct sched_domain *sd;
+			sd = &per_cpu(node_domains, j);
+			sd->groups = sg;
+			if (sd->groups == NULL) {
+				/* Turn off balancing if we have no groups */
+				sd->flags = 0;
+			}
+		}
+		if (!sg) {
+			printk(KERN_WARNING
+			"Can not alloc domain group for node %d\n", i);
+			continue;
+		}
+		sg->cpu_power = 0;
+		sg->cpumask = nodemask;
+		cpus_or(covered, covered, nodemask);
+		prev = sg;
+
+		for (j = 0; j < MAX_NUMNODES; j++) {
+			cpumask_t tmp, notcovered;
+			int n = (i + j) % MAX_NUMNODES;
+
+			cpus_complement(notcovered, covered);
+			cpus_and(tmp, notcovered, *cpu_map);
+			cpus_and(tmp, tmp, domainspan);
+			if (cpus_empty(tmp))
+				break;
+
+			nodemask = node_to_cpumask(n);
+			cpus_and(tmp, tmp, nodemask);
+			if (cpus_empty(tmp))
+				continue;
+
+			sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+			if (!sg) {
+				printk(KERN_WARNING
+				"Can not alloc domain group for node %d\n", j);
+				break;
+			}
+			sg->cpu_power = 0;
+			sg->cpumask = tmp;
+			cpus_or(covered, covered, tmp);
+			prev->next = sg;
+			prev = sg;
+		}
+		prev->next = sched_group_nodes[i];
+	}
 #endif
 
 	/* Calculate CPU power for physical packages and nodes */
@@ -5017,14 +5288,46 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 		sd->groups->cpu_power = power;
 
 #ifdef CONFIG_NUMA
-		if (i == first_cpu(sd->groups->cpumask)) {
-			/* Only add "power" once for each physical package. */
-			sd = &per_cpu(node_domains, i);
-			sd->groups->cpu_power += power;
+		sd = &per_cpu(allnodes_domains, i);
+		if (sd->groups) {
+			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+				(cpus_weight(sd->groups->cpumask)-1) / 10;
+			sd->groups->cpu_power = power;
 		}
 #endif
 	}
 
+#ifdef CONFIG_NUMA
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		struct sched_group *sg = sched_group_nodes[i];
+		int j;
+
+		if (sg == NULL)
+			continue;
+next_sg:
+		for_each_cpu_mask(j, sg->cpumask) {
+			struct sched_domain *sd;
+			int power;
+
+			sd = &per_cpu(phys_domains, j);
+			if (j != first_cpu(sd->groups->cpumask)) {
+				/*
+				 * Only add "power" once for each
+				 * physical package.
+				 */
+				continue;
+			}
+			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+				(cpus_weight(sd->groups->cpumask)-1) / 10;
+
+			sg->cpu_power += power;
+		}
+		sg = sg->next;
+		if (sg != sched_group_nodes[i])
+			goto next_sg;
+	}
+#endif
+
 	/* Attach the domains */
 	for_each_cpu_mask(i, *cpu_map) {
 		struct sched_domain *sd;
@@ -5039,13 +5342,10 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 /*
  * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
  */
-static void arch_init_sched_domains(cpumask_t *cpu_map)
+static void arch_init_sched_domains(const cpumask_t *cpu_map)
 {
 	cpumask_t cpu_default_map;
 
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
-	check_sibling_maps();
-#endif
 	/*
 	 * Setup mask for cpus without special case scheduling requirements.
 	 * For now this just excludes isolated cpus, but could be used to
@@ -5058,10 +5358,47 @@ static void arch_init_sched_domains(cpumask_t *cpu_map)
 
 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
 {
-	/* Do nothing: everything is statically allocated. */
-}
+#ifdef CONFIG_NUMA
+	int i;
+	int cpu;
 
-#endif /* ARCH_HAS_SCHED_DOMAIN */
+	for_each_cpu_mask(cpu, *cpu_map) {
+		struct sched_group *sched_group_allnodes
+			= sched_group_allnodes_bycpu[cpu];
+		struct sched_group **sched_group_nodes
+			= sched_group_nodes_bycpu[cpu];
+
+		if (sched_group_allnodes) {
+			kfree(sched_group_allnodes);
+			sched_group_allnodes_bycpu[cpu] = NULL;
+		}
+
+		if (!sched_group_nodes)
+			continue;
+
+		for (i = 0; i < MAX_NUMNODES; i++) {
+			cpumask_t nodemask = node_to_cpumask(i);
+			struct sched_group *oldsg, *sg = sched_group_nodes[i];
+
+			cpus_and(nodemask, nodemask, *cpu_map);
+			if (cpus_empty(nodemask))
+				continue;
+
+			if (sg == NULL)
+				continue;
+			sg = sg->next;
+next_sg:
+			oldsg = sg;
+			sg = sg->next;
+			kfree(oldsg);
+			if (oldsg != sched_group_nodes[i])
+				goto next_sg;
+		}
+		kfree(sched_group_nodes);
+		sched_group_nodes_bycpu[cpu] = NULL;
+	}
+#endif
+}
 
 /*
  * Detach sched domains from a group of cpus specified in cpu_map
@@ -5263,3 +5600,47 @@ void normalize_rt_tasks(void)
 }
 
 #endif /* CONFIG_MAGIC_SYSRQ */
+
+#ifdef CONFIG_IA64
+/*
+ * These functions are only useful for the IA64 MCA handling.
+ *
+ * They can only be called when the whole system has been
+ * stopped - every CPU needs to be quiescent, and no scheduling
+ * activity can take place. Using them for anything else would
+ * be a serious bug, and as a result, they aren't even visible
+ * under any other configuration.
+ */
+
+/**
+ * curr_task - return the current task for a given cpu.
+ * @cpu: the processor in question.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+task_t *curr_task(int cpu)
+{
+	return cpu_curr(cpu);
+}
+
+/**
+ * set_curr_task - set the current task for a given cpu.
+ * @cpu: the processor in question.
+ * @p: the task pointer to set.
+ *
+ * Description: This function must only be used when non-maskable interrupts
+ * are serviced on a separate stack.  It allows the architecture to switch the
+ * notion of the current task on a cpu in a non-blocking manner.  This function
+ * must be called with all CPU's synchronized, and interrupts disabled, the
+ * and caller must save the original value of the current task (see
+ * curr_task() above) and restore that value before reenabling interrupts and
+ * re-starting the system.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+void set_curr_task(int cpu, task_t *p)
+{
+	cpu_curr(cpu) = p;
+}
+
+#endif
diff --git a/kernel/signal.c b/kernel/signal.c
index ca1186eef938..1bf3c39d6109 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -262,7 +262,7 @@ next_signal(struct sigpending *pending, sigset_t *mask)
 	return sig;
 }
 
-static struct sigqueue *__sigqueue_alloc(struct task_struct *t, unsigned int __nocast flags,
+static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
 					 int override_rlimit)
 {
 	struct sigqueue *q = NULL;
@@ -277,7 +277,6 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, unsigned int __n
 	} else {
 		INIT_LIST_HEAD(&q->list);
 		q->flags = 0;
-		q->lock = NULL;
 		q->user = get_uid(t->user);
 	}
 	return(q);
@@ -397,20 +396,8 @@ void __exit_signal(struct task_struct *tsk)
 	flush_sigqueue(&tsk->pending);
 	if (sig) {
 		/*
-		 * We are cleaning up the signal_struct here.  We delayed
-		 * calling exit_itimers until after flush_sigqueue, just in
-		 * case our thread-local pending queue contained a queued
-		 * timer signal that would have been cleared in
-		 * exit_itimers.  When that called sigqueue_free, it would
-		 * attempt to re-take the tasklist_lock and deadlock.  This
-		 * can never happen if we ensure that all queues the
-		 * timer's signal might be queued on have been flushed
-		 * first.  The shared_pending queue, and our own pending
-		 * queue are the only queues the timer could be on, since
-		 * there are no other threads left in the group and timer
-		 * signals are constrained to threads inside the group.
+		 * We are cleaning up the signal_struct here.
 		 */
-		exit_itimers(sig);
 		exit_thread_group_keys(sig);
 		kmem_cache_free(signal_cachep, sig);
 	}
@@ -418,6 +405,8 @@ void __exit_signal(struct task_struct *tsk)
 
 void exit_signal(struct task_struct *tsk)
 {
+	atomic_dec(&tsk->signal->live);
+
 	write_lock_irq(&tasklist_lock);
 	__exit_signal(tsk);
 	write_unlock_irq(&tasklist_lock);
@@ -578,7 +567,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
  		 * is to alert stop-signal processing code when another
  		 * processor has come along and cleared the flag.
  		 */
- 		tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
+ 		if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
+ 			tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
  	}
 	if ( signr &&
 	     ((info->si_code & __SI_MASK) == __SI_TIMER) &&
@@ -661,8 +651,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
 	if (!valid_signal(sig))
 		return error;
 	error = -EPERM;
-	if ((!info || ((unsigned long)info != 1 &&
-			(unsigned long)info != 2 && SI_FROMUSER(info)))
+	if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)))
 	    && ((sig != SIGCONT) ||
 		(current->signal->session != t->signal->session))
 	    && (current->euid ^ t->suid) && (current->euid ^ t->uid)
@@ -678,7 +667,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
 
 /* forward decl */
 static void do_notify_parent_cldstop(struct task_struct *tsk,
-				     struct task_struct *parent,
+				     int to_self,
 				     int why);
 
 /*
@@ -692,7 +681,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
 {
 	struct task_struct *t;
 
-	if (p->flags & SIGNAL_GROUP_EXIT)
+	if (p->signal->flags & SIGNAL_GROUP_EXIT)
 		/*
 		 * The process is in the middle of dying already.
 		 */
@@ -729,14 +718,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
 			p->signal->group_stop_count = 0;
 			p->signal->flags = SIGNAL_STOP_CONTINUED;
 			spin_unlock(&p->sighand->siglock);
-			if (p->ptrace & PT_PTRACED)
-				do_notify_parent_cldstop(p, p->parent,
-							 CLD_STOPPED);
-			else
-				do_notify_parent_cldstop(
-					p->group_leader,
-					p->group_leader->real_parent,
-							 CLD_STOPPED);
+			do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED);
 			spin_lock(&p->sighand->siglock);
 		}
 		rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
@@ -777,14 +759,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
 			p->signal->flags = SIGNAL_STOP_CONTINUED;
 			p->signal->group_exit_code = 0;
 			spin_unlock(&p->sighand->siglock);
-			if (p->ptrace & PT_PTRACED)
-				do_notify_parent_cldstop(p, p->parent,
-							 CLD_CONTINUED);
-			else
-				do_notify_parent_cldstop(
-					p->group_leader,
-					p->group_leader->real_parent,
-							 CLD_CONTINUED);
+			do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED);
 			spin_lock(&p->sighand->siglock);
 		} else {
 			/*
@@ -813,7 +788,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 	 * fast-pathed signals for kernel-internal things like SIGSTOP
 	 * or SIGKILL.
 	 */
-	if ((unsigned long)info == 2)
+	if (info == SEND_SIG_FORCED)
 		goto out_set;
 
 	/* Real-time signals must be queued if sent by sigqueue, or
@@ -825,19 +800,19 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 	   pass on the info struct.  */
 
 	q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
-					     ((unsigned long) info < 2 ||
+					     (is_si_special(info) ||
 					      info->si_code >= 0)));
 	if (q) {
 		list_add_tail(&q->list, &signals->list);
 		switch ((unsigned long) info) {
-		case 0:
+		case (unsigned long) SEND_SIG_NOINFO:
 			q->info.si_signo = sig;
 			q->info.si_errno = 0;
 			q->info.si_code = SI_USER;
 			q->info.si_pid = current->pid;
 			q->info.si_uid = current->uid;
 			break;
-		case 1:
+		case (unsigned long) SEND_SIG_PRIV:
 			q->info.si_signo = sig;
 			q->info.si_errno = 0;
 			q->info.si_code = SI_KERNEL;
@@ -848,20 +823,13 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			copy_siginfo(&q->info, info);
 			break;
 		}
-	} else {
-		if (sig >= SIGRTMIN && info && (unsigned long)info != 1
-		   && info->si_code != SI_USER)
+	} else if (!is_si_special(info)) {
+		if (sig >= SIGRTMIN && info->si_code != SI_USER)
 		/*
 		 * Queue overflow, abort.  We may abort if the signal was rt
 		 * and sent by user using something other than kill().
 		 */
 			return -EAGAIN;
-		if (((unsigned long)info > 1) && (info->si_code == SI_TIMER))
-			/*
-			 * Set up a return to indicate that we dropped 
-			 * the signal.
-			 */
-			ret = info->si_sys_private;
 	}
 
 out_set:
@@ -882,12 +850,6 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 		BUG();
 	assert_spin_locked(&t->sighand->siglock);
 
-	if (((unsigned long)info > 2) && (info->si_code == SI_TIMER))
-		/*
-		 * Set up a return to indicate that we dropped the signal.
-		 */
-		ret = info->si_sys_private;
-
 	/* Short-circuit ignored signals.  */
 	if (sig_ignored(t, sig))
 		goto out;
@@ -917,11 +879,13 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 	int ret;
 
 	spin_lock_irqsave(&t->sighand->siglock, flags);
-	if (sigismember(&t->blocked, sig) || t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) {
+	if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) {
 		t->sighand->action[sig-1].sa.sa_handler = SIG_DFL;
+	}
+	if (sigismember(&t->blocked, sig)) {
 		sigdelset(&t->blocked, sig);
-		recalc_sigpending_tsk(t);
 	}
+	recalc_sigpending_tsk(t);
 	ret = specific_send_sig_info(sig, info, t);
 	spin_unlock_irqrestore(&t->sighand->siglock, flags);
 
@@ -931,15 +895,7 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 void
 force_sig_specific(int sig, struct task_struct *t)
 {
-	unsigned long int flags;
-
-	spin_lock_irqsave(&t->sighand->siglock, flags);
-	if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN)
-		t->sighand->action[sig-1].sa.sa_handler = SIG_DFL;
-	sigdelset(&t->blocked, sig);
-	recalc_sigpending_tsk(t);
-	specific_send_sig_info(sig, (void *)2, t);
-	spin_unlock_irqrestore(&t->sighand->siglock, flags);
+	force_sig_info(sig, SEND_SIG_FORCED, t);
 }
 
 /*
@@ -950,34 +906,31 @@ force_sig_specific(int sig, struct task_struct *t)
  * as soon as they're available, so putting the signal on the shared queue
  * will be equivalent to sending it to one such thread.
  */
-#define wants_signal(sig, p, mask) 			\
-	(!sigismember(&(p)->blocked, sig)		\
-	 && !((p)->state & mask)			\
-	 && !((p)->flags & PF_EXITING)			\
-	 && (task_curr(p) || !signal_pending(p)))
-
+static inline int wants_signal(int sig, struct task_struct *p)
+{
+	if (sigismember(&p->blocked, sig))
+		return 0;
+	if (p->flags & PF_EXITING)
+		return 0;
+	if (sig == SIGKILL)
+		return 1;
+	if (p->state & (TASK_STOPPED | TASK_TRACED))
+		return 0;
+	return task_curr(p) || !signal_pending(p);
+}
 
 static void
 __group_complete_signal(int sig, struct task_struct *p)
 {
-	unsigned int mask;
 	struct task_struct *t;
 
 	/*
-	 * Don't bother traced and stopped tasks (but
-	 * SIGKILL will punch through that).
-	 */
-	mask = TASK_STOPPED | TASK_TRACED;
-	if (sig == SIGKILL)
-		mask = 0;
-
-	/*
 	 * Now find a thread we can wake up to take the signal off the queue.
 	 *
 	 * If the main thread wants the signal, it gets first crack.
 	 * Probably the least surprising to the average bear.
 	 */
-	if (wants_signal(sig, p, mask))
+	if (wants_signal(sig, p))
 		t = p;
 	else if (thread_group_empty(p))
 		/*
@@ -995,7 +948,7 @@ __group_complete_signal(int sig, struct task_struct *p)
 			t = p->signal->curr_target = p;
 		BUG_ON(t->tgid != p->tgid);
 
-		while (!wants_signal(sig, t, mask)) {
+		while (!wants_signal(sig, t)) {
 			t = next_thread(t);
 			if (t == p->signal->curr_target)
 				/*
@@ -1077,12 +1030,6 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 	assert_spin_locked(&p->sighand->siglock);
 	handle_stop_signal(sig, p);
 
-	if (((unsigned long)info > 2) && (info->si_code == SI_TIMER))
-		/*
-		 * Set up a return to indicate that we dropped the signal.
-		 */
-		ret = info->si_sys_private;
-
 	/* Short-circuit ignored signals.  */
 	if (sig_ignored(p, sig))
 		return ret;
@@ -1135,8 +1082,8 @@ void zap_other_threads(struct task_struct *p)
 		if (t != p->group_leader)
 			t->exit_signal = -1;
 
+		/* SIGKILL will be handled before any pending SIGSTOP */
 		sigaddset(&t->pending.signal, SIGKILL);
-		rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
 		signal_wake_up(t, 1);
 	}
 }
@@ -1209,6 +1156,40 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
 	return error;
 }
 
+/* like kill_proc_info(), but doesn't use uid/euid of "current" */
+int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid,
+		      uid_t uid, uid_t euid)
+{
+	int ret = -EINVAL;
+	struct task_struct *p;
+
+	if (!valid_signal(sig))
+		return ret;
+
+	read_lock(&tasklist_lock);
+	p = find_task_by_pid(pid);
+	if (!p) {
+		ret = -ESRCH;
+		goto out_unlock;
+	}
+	if ((!info || ((unsigned long)info != 1 &&
+			(unsigned long)info != 2 && SI_FROMUSER(info)))
+	    && (euid != p->suid) && (euid != p->uid)
+	    && (uid != p->suid) && (uid != p->uid)) {
+		ret = -EPERM;
+		goto out_unlock;
+	}
+	if (sig && p->sighand) {
+		unsigned long flags;
+		spin_lock_irqsave(&p->sighand->siglock, flags);
+		ret = __group_send_sig_info(sig, info, p);
+		spin_unlock_irqrestore(&p->sighand->siglock, flags);
+	}
+out_unlock:
+	read_unlock(&tasklist_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kill_proc_info_as_uid);
 
 /*
  * kill_something_info() interprets pid in interesting ways just like kill(2).
@@ -1278,10 +1259,13 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 	return ret;
 }
 
+#define __si_special(priv) \
+	((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO)
+
 int
 send_sig(int sig, struct task_struct *p, int priv)
 {
-	return send_sig_info(sig, (void*)(long)(priv != 0), p);
+	return send_sig_info(sig, __si_special(priv), p);
 }
 
 /*
@@ -1301,7 +1285,7 @@ send_group_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 void
 force_sig(int sig, struct task_struct *p)
 {
-	force_sig_info(sig, (void*)1L, p);
+	force_sig_info(sig, SEND_SIG_PRIV, p);
 }
 
 /*
@@ -1326,13 +1310,13 @@ force_sigsegv(int sig, struct task_struct *p)
 int
 kill_pg(pid_t pgrp, int sig, int priv)
 {
-	return kill_pg_info(sig, (void *)(long)(priv != 0), pgrp);
+	return kill_pg_info(sig, __si_special(priv), pgrp);
 }
 
 int
 kill_proc(pid_t pid, int sig, int priv)
 {
-	return kill_proc_info(sig, (void *)(long)(priv != 0), pid);
+	return kill_proc_info(sig, __si_special(priv), pid);
 }
 
 /*
@@ -1363,11 +1347,12 @@ void sigqueue_free(struct sigqueue *q)
 	 * pending queue.
 	 */
 	if (unlikely(!list_empty(&q->list))) {
-		read_lock(&tasklist_lock);  
-		spin_lock_irqsave(q->lock, flags);
+		spinlock_t *lock = &current->sighand->siglock;
+		read_lock(&tasklist_lock);
+		spin_lock_irqsave(lock, flags);
 		if (!list_empty(&q->list))
 			list_del_init(&q->list);
-		spin_unlock_irqrestore(q->lock, flags);
+		spin_unlock_irqrestore(lock, flags);
 		read_unlock(&tasklist_lock);
 	}
 	q->flags &= ~SIGQUEUE_PREALLOC;
@@ -1380,16 +1365,16 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 	unsigned long flags;
 	int ret = 0;
 
-	/*
-	 * We need the tasklist lock even for the specific
-	 * thread case (when we don't need to follow the group
-	 * lists) in order to avoid races with "p->sighand"
-	 * going away or changing from under us.
-	 */
 	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
-	read_lock(&tasklist_lock);  
+	read_lock(&tasklist_lock);
+
+	if (unlikely(p->flags & PF_EXITING)) {
+		ret = -1;
+		goto out_err;
+	}
+
 	spin_lock_irqsave(&p->sighand->siglock, flags);
-	
+
 	if (unlikely(!list_empty(&q->list))) {
 		/*
 		 * If an SI_TIMER entry is already queue just increment
@@ -1399,14 +1384,13 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 			BUG();
 		q->info.si_overrun++;
 		goto out;
-	} 
+	}
 	/* Short-circuit ignored signals.  */
 	if (sig_ignored(p, sig)) {
 		ret = 1;
 		goto out;
 	}
 
-	q->lock = &p->sighand->siglock;
 	list_add_tail(&q->list, &p->pending.list);
 	sigaddset(&p->pending.signal, sig);
 	if (!sigismember(&p->blocked, sig))
@@ -1414,8 +1398,10 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 
 out:
 	spin_unlock_irqrestore(&p->sighand->siglock, flags);
+out_err:
 	read_unlock(&tasklist_lock);
-	return(ret);
+
+	return ret;
 }
 
 int
@@ -1452,7 +1438,6 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 	 * We always use the shared queue for process-wide signals,
 	 * to avoid several races.
 	 */
-	q->lock = &p->sighand->siglock;
 	list_add_tail(&q->list, &p->signal->shared_pending.list);
 	sigaddset(&p->signal->shared_pending.signal, sig);
 
@@ -1542,14 +1527,20 @@ void do_notify_parent(struct task_struct *tsk, int sig)
 	spin_unlock_irqrestore(&psig->siglock, flags);
 }
 
-static void
-do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent,
-			 int why)
+static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why)
 {
 	struct siginfo info;
 	unsigned long flags;
+	struct task_struct *parent;
 	struct sighand_struct *sighand;
 
+	if (to_self)
+		parent = tsk->parent;
+	else {
+		tsk = tsk->group_leader;
+		parent = tsk->real_parent;
+	}
+
 	info.si_signo = SIGCHLD;
 	info.si_errno = 0;
 	info.si_pid = tsk->pid;
@@ -1618,8 +1609,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
 		   !(current->ptrace & PT_ATTACHED)) &&
 	    (likely(current->parent->signal != current->signal) ||
 	     !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
-		do_notify_parent_cldstop(current, current->parent,
-					 CLD_TRAPPED);
+		do_notify_parent_cldstop(current, 1, CLD_TRAPPED);
 		read_unlock(&tasklist_lock);
 		schedule();
 	} else {
@@ -1668,25 +1658,25 @@ void ptrace_notify(int exit_code)
 static void
 finish_stop(int stop_count)
 {
+	int to_self;
+
 	/*
 	 * If there are no other threads in the group, or if there is
 	 * a group stop in progress and we are the last to stop,
 	 * report to the parent.  When ptraced, every thread reports itself.
 	 */
-	if (stop_count < 0 || (current->ptrace & PT_PTRACED)) {
-		read_lock(&tasklist_lock);
-		do_notify_parent_cldstop(current, current->parent,
-					 CLD_STOPPED);
-		read_unlock(&tasklist_lock);
-	}
-	else if (stop_count == 0) {
-		read_lock(&tasklist_lock);
-		do_notify_parent_cldstop(current->group_leader,
-					 current->group_leader->real_parent,
-					 CLD_STOPPED);
-		read_unlock(&tasklist_lock);
-	}
+	if (stop_count < 0 || (current->ptrace & PT_PTRACED))
+		to_self = 1;
+	else if (stop_count == 0)
+		to_self = 0;
+	else
+		goto out;
+
+	read_lock(&tasklist_lock);
+	do_notify_parent_cldstop(current, to_self, CLD_STOPPED);
+	read_unlock(&tasklist_lock);
 
+out:
 	schedule();
 	/*
 	 * Now we don't run again until continued.
@@ -1773,7 +1763,8 @@ do_signal_stop(int signr)
 				 * stop is always done with the siglock held,
 				 * so this check has no races.
 				 */
-				if (t->state < TASK_STOPPED) {
+				if (!t->exit_state &&
+				    !(t->state & (TASK_STOPPED|TASK_TRACED))) {
 					stop_count++;
 					signal_wake_up(t, 0);
 				}
@@ -1865,9 +1856,9 @@ relock:
 			/* Let the debugger run.  */
 			ptrace_stop(signr, signr, info);
 
-			/* We're back.  Did the debugger cancel the sig?  */
+			/* We're back.  Did the debugger cancel the sig or group_exit? */
 			signr = current->exit_code;
-			if (signr == 0)
+			if (signr == 0 || current->signal->flags & SIGNAL_GROUP_EXIT)
 				continue;
 
 			current->exit_code = 0;
@@ -2228,8 +2219,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
 			recalc_sigpending();
 			spin_unlock_irq(&current->sighand->siglock);
 
-			current->state = TASK_INTERRUPTIBLE;
-			timeout = schedule_timeout(timeout);
+			timeout = schedule_timeout_interruptible(timeout);
 
 			try_to_freeze();
 			spin_lock_irq(&current->sighand->siglock);
@@ -2270,26 +2260,13 @@ sys_kill(int pid, int sig)
 	return kill_something_info(sig, &info, pid);
 }
 
-/**
- *  sys_tgkill - send signal to one specific thread
- *  @tgid: the thread group ID of the thread
- *  @pid: the PID of the thread
- *  @sig: signal to be sent
- *
- *  This syscall also checks the tgid and returns -ESRCH even if the PID
- *  exists but it's not belonging to the target process anymore. This
- *  method solves the problem of threads exiting and PIDs getting reused.
- */
-asmlinkage long sys_tgkill(int tgid, int pid, int sig)
+static int do_tkill(int tgid, int pid, int sig)
 {
-	struct siginfo info;
 	int error;
+	struct siginfo info;
 	struct task_struct *p;
 
-	/* This is only valid for single tasks */
-	if (pid <= 0 || tgid <= 0)
-		return -EINVAL;
-
+	error = -ESRCH;
 	info.si_signo = sig;
 	info.si_errno = 0;
 	info.si_code = SI_TKILL;
@@ -2298,8 +2275,7 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig)
 
 	read_lock(&tasklist_lock);
 	p = find_task_by_pid(pid);
-	error = -ESRCH;
-	if (p && (p->tgid == tgid)) {
+	if (p && (tgid <= 0 || p->tgid == tgid)) {
 		error = check_kill_permission(sig, &info, p);
 		/*
 		 * The null signal is a permissions and process existence
@@ -2313,47 +2289,40 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig)
 		}
 	}
 	read_unlock(&tasklist_lock);
+
 	return error;
 }
 
+/**
+ *  sys_tgkill - send signal to one specific thread
+ *  @tgid: the thread group ID of the thread
+ *  @pid: the PID of the thread
+ *  @sig: signal to be sent
+ *
+ *  This syscall also checks the tgid and returns -ESRCH even if the PID
+ *  exists but it's not belonging to the target process anymore. This
+ *  method solves the problem of threads exiting and PIDs getting reused.
+ */
+asmlinkage long sys_tgkill(int tgid, int pid, int sig)
+{
+	/* This is only valid for single tasks */
+	if (pid <= 0 || tgid <= 0)
+		return -EINVAL;
+
+	return do_tkill(tgid, pid, sig);
+}
+
 /*
  *  Send a signal to only one task, even if it's a CLONE_THREAD task.
  */
 asmlinkage long
 sys_tkill(int pid, int sig)
 {
-	struct siginfo info;
-	int error;
-	struct task_struct *p;
-
 	/* This is only valid for single tasks */
 	if (pid <= 0)
 		return -EINVAL;
 
-	info.si_signo = sig;
-	info.si_errno = 0;
-	info.si_code = SI_TKILL;
-	info.si_pid = current->tgid;
-	info.si_uid = current->uid;
-
-	read_lock(&tasklist_lock);
-	p = find_task_by_pid(pid);
-	error = -ESRCH;
-	if (p) {
-		error = check_kill_permission(sig, &info, p);
-		/*
-		 * The null signal is a permissions and process existence
-		 * probe.  No signal is actually delivered.
-		 */
-		if (!error && sig && p->sighand) {
-			spin_lock_irq(&p->sighand->siglock);
-			handle_stop_signal(sig, p);
-			error = specific_send_sig_info(sig, &info, p);
-			spin_unlock_irq(&p->sighand->siglock);
-		}
-	}
-	read_unlock(&tasklist_lock);
-	return error;
+	return do_tkill(0, pid, sig);
 }
 
 asmlinkage long
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b4ab6af1dea8..f766b2fc48be 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -84,7 +84,7 @@ asmlinkage void __do_softirq(void)
 	cpu = smp_processor_id();
 restart:
 	/* Reset the pending bitmask before enabling irqs */
-	local_softirq_pending() = 0;
+	set_softirq_pending(0);
 
 	local_irq_enable();
 
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
new file mode 100644
index 000000000000..75976209cea7
--- /dev/null
+++ b/kernel/softlockup.c
@@ -0,0 +1,151 @@
+/*
+ * Detect Soft Lockups
+ *
+ * started by Ingo Molnar, (C) 2005, Red Hat
+ *
+ * this code detects soft lockups: incidents in where on a CPU
+ * the kernel does not reschedule for 10 seconds or more.
+ */
+
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/notifier.h>
+#include <linux/module.h>
+
+static DEFINE_SPINLOCK(print_lock);
+
+static DEFINE_PER_CPU(unsigned long, timestamp) = 0;
+static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0;
+static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
+
+static int did_panic = 0;
+static int softlock_panic(struct notifier_block *this, unsigned long event,
+				void *ptr)
+{
+	did_panic = 1;
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block panic_block = {
+	.notifier_call = softlock_panic,
+};
+
+void touch_softlockup_watchdog(void)
+{
+	per_cpu(timestamp, raw_smp_processor_id()) = jiffies;
+}
+EXPORT_SYMBOL(touch_softlockup_watchdog);
+
+/*
+ * This callback runs from the timer interrupt, and checks
+ * whether the watchdog thread has hung or not:
+ */
+void softlockup_tick(struct pt_regs *regs)
+{
+	int this_cpu = smp_processor_id();
+	unsigned long timestamp = per_cpu(timestamp, this_cpu);
+
+	if (per_cpu(print_timestamp, this_cpu) == timestamp)
+		return;
+
+	/* Do not cause a second panic when there already was one */
+	if (did_panic)
+		return;
+
+	if (time_after(jiffies, timestamp + 10*HZ)) {
+		per_cpu(print_timestamp, this_cpu) = timestamp;
+
+		spin_lock(&print_lock);
+		printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
+			this_cpu);
+		show_regs(regs);
+		spin_unlock(&print_lock);
+	}
+}
+
+/*
+ * The watchdog thread - runs every second and touches the timestamp.
+ */
+static int watchdog(void * __bind_cpu)
+{
+	struct sched_param param = { .sched_priority = 99 };
+	int this_cpu = (long) __bind_cpu;
+
+	printk("softlockup thread %d started up.\n", this_cpu);
+
+	sched_setscheduler(current, SCHED_FIFO, &param);
+	current->flags |= PF_NOFREEZE;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	/*
+	 * Run briefly once per second - if this gets delayed for
+	 * more than 10 seconds then the debug-printout triggers
+	 * in softlockup_tick():
+	 */
+	while (!kthread_should_stop()) {
+		msleep_interruptible(1000);
+		touch_softlockup_watchdog();
+	}
+	__set_current_state(TASK_RUNNING);
+
+	return 0;
+}
+
+/*
+ * Create/destroy watchdog threads as CPUs come and go:
+ */
+static int __devinit
+cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+	int hotcpu = (unsigned long)hcpu;
+	struct task_struct *p;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		BUG_ON(per_cpu(watchdog_task, hotcpu));
+		p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
+		if (IS_ERR(p)) {
+			printk("watchdog for %i failed\n", hotcpu);
+			return NOTIFY_BAD;
+		}
+  		per_cpu(watchdog_task, hotcpu) = p;
+		kthread_bind(p, hotcpu);
+ 		break;
+	case CPU_ONLINE:
+
+		wake_up_process(per_cpu(watchdog_task, hotcpu));
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_UP_CANCELED:
+		/* Unbind so it can run.  Fall thru. */
+		kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id());
+	case CPU_DEAD:
+		p = per_cpu(watchdog_task, hotcpu);
+		per_cpu(watchdog_task, hotcpu) = NULL;
+		kthread_stop(p);
+		break;
+#endif /* CONFIG_HOTPLUG_CPU */
+ 	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __devinitdata cpu_nfb = {
+	.notifier_call = cpu_callback
+};
+
+__init void spawn_softlockup_task(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+
+	cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+	register_cpu_notifier(&cpu_nfb);
+
+	notifier_chain_register(&panic_notifier_list, &panic_block);
+}
+
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 0c3f9d8bbe17..0375fcd5921d 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -3,7 +3,10 @@
  *
  * Author: Zwane Mwaikambo <zwane@fsmlabs.com>
  *
- * Copyright (2004) Ingo Molnar
+ * Copyright (2004, 2005) Ingo Molnar
+ *
+ * This file contains the spinlock/rwlock implementations for the
+ * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them)
  */
 
 #include <linux/config.h>
@@ -17,12 +20,12 @@
  * Generic declaration of the raw read_trylock() function,
  * architectures are supposed to optimize this:
  */
-int __lockfunc generic_raw_read_trylock(rwlock_t *lock)
+int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock)
 {
-	_raw_read_lock(lock);
+	__raw_read_lock(lock);
 	return 1;
 }
-EXPORT_SYMBOL(generic_raw_read_trylock);
+EXPORT_SYMBOL(generic__raw_read_trylock);
 
 int __lockfunc _spin_trylock(spinlock_t *lock)
 {
@@ -57,7 +60,7 @@ int __lockfunc _write_trylock(rwlock_t *lock)
 }
 EXPORT_SYMBOL(_write_trylock);
 
-#ifndef CONFIG_PREEMPT
+#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP)
 
 void __lockfunc _read_lock(rwlock_t *lock)
 {
@@ -72,7 +75,7 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
 
 	local_irq_save(flags);
 	preempt_disable();
-	_raw_spin_lock_flags(lock, flags);
+	_raw_spin_lock_flags(lock, &flags);
 	return flags;
 }
 EXPORT_SYMBOL(_spin_lock_irqsave);
diff --git a/kernel/sys.c b/kernel/sys.c
index 000e81ad2c1d..2fa1ed18123c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -361,17 +361,35 @@ out_unlock:
 	return retval;
 }
 
+/**
+ *	emergency_restart - reboot the system
+ *
+ *	Without shutting down any hardware or taking any locks
+ *	reboot the system.  This is called when we know we are in
+ *	trouble so this is our best effort to reboot.  This is
+ *	safe to call in interrupt context.
+ */
 void emergency_restart(void)
 {
 	machine_emergency_restart();
 }
 EXPORT_SYMBOL_GPL(emergency_restart);
 
-void kernel_restart(char *cmd)
+/**
+ *	kernel_restart - reboot the system
+ *
+ *	Shutdown everything and perform a clean reboot.
+ *	This is not safe to call in interrupt context.
+ */
+void kernel_restart_prepare(char *cmd)
 {
 	notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
 	system_state = SYSTEM_RESTART;
 	device_shutdown();
+}
+void kernel_restart(char *cmd)
+{
+	kernel_restart_prepare(cmd);
 	if (!cmd) {
 		printk(KERN_EMERG "Restarting system.\n");
 	} else {
@@ -382,6 +400,12 @@ void kernel_restart(char *cmd)
 }
 EXPORT_SYMBOL_GPL(kernel_restart);
 
+/**
+ *	kernel_kexec - reboot the system
+ *
+ *	Move into place and start executing a preloaded standalone
+ *	executable.  If nothing was preloaded return an error.
+ */
 void kernel_kexec(void)
 {
 #ifdef CONFIG_KEXEC
@@ -390,9 +414,7 @@ void kernel_kexec(void)
 	if (!image) {
 		return;
 	}
-	notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
-	system_state = SYSTEM_RESTART;
-	device_shutdown();
+	kernel_restart_prepare(NULL);
 	printk(KERN_EMERG "Starting new kernel\n");
 	machine_shutdown();
 	machine_kexec(image);
@@ -400,23 +422,39 @@ void kernel_kexec(void)
 }
 EXPORT_SYMBOL_GPL(kernel_kexec);
 
-void kernel_halt(void)
+/**
+ *	kernel_halt - halt the system
+ *
+ *	Shutdown everything and perform a clean system halt.
+ */
+void kernel_halt_prepare(void)
 {
 	notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
 	system_state = SYSTEM_HALT;
-	device_suspend(PMSG_SUSPEND);
 	device_shutdown();
+}
+void kernel_halt(void)
+{
+	kernel_halt_prepare();
 	printk(KERN_EMERG "System halted.\n");
 	machine_halt();
 }
 EXPORT_SYMBOL_GPL(kernel_halt);
 
-void kernel_power_off(void)
+/**
+ *	kernel_power_off - power_off the system
+ *
+ *	Shutdown everything and perform a clean system power_off.
+ */
+void kernel_power_off_prepare(void)
 {
 	notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
 	system_state = SYSTEM_POWER_OFF;
-	device_suspend(PMSG_SUSPEND);
 	device_shutdown();
+}
+void kernel_power_off(void)
+{
+	kernel_power_off_prepare();
 	printk(KERN_EMERG "Power down.\n");
 	machine_power_off();
 }
@@ -1713,7 +1751,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 			  unsigned long arg4, unsigned long arg5)
 {
 	long error;
-	int sig;
 
 	error = security_task_prctl(option, arg2, arg3, arg4, arg5);
 	if (error)
@@ -1721,19 +1758,17 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 
 	switch (option) {
 		case PR_SET_PDEATHSIG:
-			sig = arg2;
-			if (!valid_signal(sig)) {
+			if (!valid_signal(arg2)) {
 				error = -EINVAL;
 				break;
 			}
-			current->pdeath_signal = sig;
+			current->pdeath_signal = arg2;
 			break;
 		case PR_GET_PDEATHSIG:
 			error = put_user(current->pdeath_signal, (int __user *)arg2);
 			break;
 		case PR_GET_DUMPABLE:
-			if (current->mm->dumpable)
-				error = 1;
+			error = current->mm->dumpable;
 			break;
 		case PR_SET_DUMPABLE:
 			if (arg2 < 0 || arg2 > 2) {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3e0bbee549ea..8e56e2495542 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -31,6 +31,7 @@
 #include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/net.h>
 #include <linux/sysrq.h>
 #include <linux/highuid.h>
 #include <linux/writeback.h>
@@ -136,9 +137,6 @@ static struct ctl_table_header root_table_header =
 
 static ctl_table kern_table[];
 static ctl_table vm_table[];
-#ifdef CONFIG_NET
-extern ctl_table net_table[];
-#endif
 static ctl_table proc_table[];
 static ctl_table fs_table[];
 static ctl_table debug_table[];
diff --git a/kernel/time.c b/kernel/time.c
index dd5ae1162a8f..245d595a13cb 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -338,30 +338,20 @@ int do_adjtimex(struct timex *txc)
 		        if (mtemp >= MINSEC) {
 			    ltemp = (time_offset / mtemp) << (SHIFT_USEC -
 							      SHIFT_UPDATE);
-			    if (ltemp < 0)
-			        time_freq -= -ltemp >> SHIFT_KH;
-			    else
-			        time_freq += ltemp >> SHIFT_KH;
+			    time_freq += shift_right(ltemp, SHIFT_KH);
 			} else /* calibration interval too short (p. 12) */
 				result = TIME_ERROR;
 		    } else {	/* PLL mode */
 		        if (mtemp < MAXSEC) {
 			    ltemp *= mtemp;
-			    if (ltemp < 0)
-			        time_freq -= -ltemp >> (time_constant +
-							time_constant +
-							SHIFT_KF - SHIFT_USEC);
-			    else
-			        time_freq += ltemp >> (time_constant +
+			    time_freq += shift_right(ltemp,(time_constant +
 						       time_constant +
-						       SHIFT_KF - SHIFT_USEC);
+						       SHIFT_KF - SHIFT_USEC));
 			} else /* calibration interval too long (p. 12) */
 				result = TIME_ERROR;
 		    }
-		    if (time_freq > time_tolerance)
-		        time_freq = time_tolerance;
-		    else if (time_freq < -time_tolerance)
-		        time_freq = -time_tolerance;
+		    time_freq = min(time_freq, time_tolerance);
+		    time_freq = max(time_freq, -time_tolerance);
 		} /* STA_PLL || STA_PPSTIME */
 	    } /* txc->modes & ADJ_OFFSET */
 	    if (txc->modes & ADJ_TICK) {
@@ -384,10 +374,7 @@ leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
 	if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
 	    txc->offset	   = save_adjust;
 	else {
-	    if (time_offset < 0)
-		txc->offset = -(-time_offset >> SHIFT_UPDATE);
-	    else
-		txc->offset = time_offset >> SHIFT_UPDATE;
+	    txc->offset = shift_right(time_offset, SHIFT_UPDATE);
 	}
 	txc->freq	   = time_freq + pps_freq;
 	txc->maxerror	   = time_maxerror;
@@ -532,6 +519,7 @@ int do_settimeofday (struct timespec *tv)
 	clock_was_set();
 	return 0;
 }
+EXPORT_SYMBOL(do_settimeofday);
 
 void do_gettimeofday (struct timeval *tv)
 {
@@ -570,6 +558,7 @@ void getnstimeofday(struct timespec *tv)
 	tv->tv_sec = x.tv_sec;
 	tv->tv_nsec = x.tv_usec * NSEC_PER_USEC;
 }
+EXPORT_SYMBOL_GPL(getnstimeofday);
 #endif
 
 #if (BITS_PER_LONG < 64)
diff --git a/kernel/timer.c b/kernel/timer.c
index f2a11887a726..fd74268d8663 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -46,6 +46,10 @@ static void time_interpolator_update(long delta_nsec);
 #define time_interpolator_update(x)
 #endif
 
+u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
+
+EXPORT_SYMBOL(jiffies_64);
+
 /*
  * per-CPU timer vector definitions:
  */
@@ -91,30 +95,6 @@ static inline void set_running_timer(tvec_base_t *base,
 #endif
 }
 
-static void check_timer_failed(struct timer_list *timer)
-{
-	static int whine_count;
-	if (whine_count < 16) {
-		whine_count++;
-		printk("Uninitialised timer!\n");
-		printk("This is just a warning.  Your computer is OK\n");
-		printk("function=0x%p, data=0x%lx\n",
-			timer->function, timer->data);
-		dump_stack();
-	}
-	/*
-	 * Now fix it up
-	 */
-	timer->magic = TIMER_MAGIC;
-}
-
-static inline void check_timer(struct timer_list *timer)
-{
-	if (timer->magic != TIMER_MAGIC)
-		check_timer_failed(timer);
-}
-
-
 static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
 {
 	unsigned long expires = timer->expires;
@@ -177,7 +157,6 @@ void fastcall init_timer(struct timer_list *timer)
 {
 	timer->entry.next = NULL;
 	timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base;
-	timer->magic = TIMER_MAGIC;
 }
 EXPORT_SYMBOL(init_timer);
 
@@ -230,7 +209,6 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
 	int ret = 0;
 
 	BUG_ON(!timer->function);
-	check_timer(timer);
 
 	base = lock_timer_base(timer, &flags);
 
@@ -283,9 +261,6 @@ void add_timer_on(struct timer_list *timer, int cpu)
   	unsigned long flags;
 
   	BUG_ON(timer_pending(timer) || !timer->function);
-
-	check_timer(timer);
-
 	spin_lock_irqsave(&base->t_base.lock, flags);
 	timer->base = &base->t_base;
 	internal_add_timer(base, timer);
@@ -316,8 +291,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
 {
 	BUG_ON(!timer->function);
 
-	check_timer(timer);
-
 	/*
 	 * This is a common optimization triggered by the
 	 * networking code - if the timer is re-modified
@@ -348,8 +321,6 @@ int del_timer(struct timer_list *timer)
 	unsigned long flags;
 	int ret = 0;
 
-	check_timer(timer);
-
 	if (timer_pending(timer)) {
 		base = lock_timer_base(timer, &flags);
 		if (timer_pending(timer)) {
@@ -412,8 +383,6 @@ out:
  */
 int del_timer_sync(struct timer_list *timer)
 {
-	check_timer(timer);
-
 	for (;;) {
 		int ret = try_to_del_timer_sync(timer);
 		if (ret >= 0)
@@ -632,134 +601,118 @@ long time_next_adjust;
  */
 static void second_overflow(void)
 {
-    long ltemp;
-
-    /* Bump the maxerror field */
-    time_maxerror += time_tolerance >> SHIFT_USEC;
-    if ( time_maxerror > NTP_PHASE_LIMIT ) {
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_status |= STA_UNSYNC;
-    }
-
-    /*
-     * Leap second processing. If in leap-insert state at
-     * the end of the day, the system clock is set back one
-     * second; if in leap-delete state, the system clock is
-     * set ahead one second. The microtime() routine or
-     * external clock driver will insure that reported time
-     * is always monotonic. The ugly divides should be
-     * replaced.
-     */
-    switch (time_state) {
-
-    case TIME_OK:
-	if (time_status & STA_INS)
-	    time_state = TIME_INS;
-	else if (time_status & STA_DEL)
-	    time_state = TIME_DEL;
-	break;
-
-    case TIME_INS:
-	if (xtime.tv_sec % 86400 == 0) {
-	    xtime.tv_sec--;
-	    wall_to_monotonic.tv_sec++;
-	    /* The timer interpolator will make time change gradually instead
-	     * of an immediate jump by one second.
-	     */
-	    time_interpolator_update(-NSEC_PER_SEC);
-	    time_state = TIME_OOP;
-	    clock_was_set();
-	    printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
+	long ltemp;
+
+	/* Bump the maxerror field */
+	time_maxerror += time_tolerance >> SHIFT_USEC;
+	if (time_maxerror > NTP_PHASE_LIMIT) {
+		time_maxerror = NTP_PHASE_LIMIT;
+		time_status |= STA_UNSYNC;
 	}
-	break;
-
-    case TIME_DEL:
-	if ((xtime.tv_sec + 1) % 86400 == 0) {
-	    xtime.tv_sec++;
-	    wall_to_monotonic.tv_sec--;
-	    /* Use of time interpolator for a gradual change of time */
-	    time_interpolator_update(NSEC_PER_SEC);
-	    time_state = TIME_WAIT;
-	    clock_was_set();
-	    printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n");
+
+	/*
+	 * Leap second processing. If in leap-insert state at the end of the
+	 * day, the system clock is set back one second; if in leap-delete
+	 * state, the system clock is set ahead one second. The microtime()
+	 * routine or external clock driver will insure that reported time is
+	 * always monotonic. The ugly divides should be replaced.
+	 */
+	switch (time_state) {
+	case TIME_OK:
+		if (time_status & STA_INS)
+			time_state = TIME_INS;
+		else if (time_status & STA_DEL)
+			time_state = TIME_DEL;
+		break;
+	case TIME_INS:
+		if (xtime.tv_sec % 86400 == 0) {
+			xtime.tv_sec--;
+			wall_to_monotonic.tv_sec++;
+			/*
+			 * The timer interpolator will make time change
+			 * gradually instead of an immediate jump by one second
+			 */
+			time_interpolator_update(-NSEC_PER_SEC);
+			time_state = TIME_OOP;
+			clock_was_set();
+			printk(KERN_NOTICE "Clock: inserting leap second "
+					"23:59:60 UTC\n");
+		}
+		break;
+	case TIME_DEL:
+		if ((xtime.tv_sec + 1) % 86400 == 0) {
+			xtime.tv_sec++;
+			wall_to_monotonic.tv_sec--;
+			/*
+			 * Use of time interpolator for a gradual change of
+			 * time
+			 */
+			time_interpolator_update(NSEC_PER_SEC);
+			time_state = TIME_WAIT;
+			clock_was_set();
+			printk(KERN_NOTICE "Clock: deleting leap second "
+					"23:59:59 UTC\n");
+		}
+		break;
+	case TIME_OOP:
+		time_state = TIME_WAIT;
+		break;
+	case TIME_WAIT:
+		if (!(time_status & (STA_INS | STA_DEL)))
+		time_state = TIME_OK;
 	}
-	break;
-
-    case TIME_OOP:
-	time_state = TIME_WAIT;
-	break;
-
-    case TIME_WAIT:
-	if (!(time_status & (STA_INS | STA_DEL)))
-	    time_state = TIME_OK;
-    }
-
-    /*
-     * Compute the phase adjustment for the next second. In
-     * PLL mode, the offset is reduced by a fixed factor
-     * times the time constant. In FLL mode the offset is
-     * used directly. In either mode, the maximum phase
-     * adjustment for each second is clamped so as to spread
-     * the adjustment over not more than the number of
-     * seconds between updates.
-     */
-    if (time_offset < 0) {
-	ltemp = -time_offset;
-	if (!(time_status & STA_FLL))
-	    ltemp >>= SHIFT_KG + time_constant;
-	if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
-	    ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
-	time_offset += ltemp;
-	time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
-    } else {
+
+	/*
+	 * Compute the phase adjustment for the next second. In PLL mode, the
+	 * offset is reduced by a fixed factor times the time constant. In FLL
+	 * mode the offset is used directly. In either mode, the maximum phase
+	 * adjustment for each second is clamped so as to spread the adjustment
+	 * over not more than the number of seconds between updates.
+	 */
 	ltemp = time_offset;
 	if (!(time_status & STA_FLL))
-	    ltemp >>= SHIFT_KG + time_constant;
-	if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
-	    ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
+		ltemp = shift_right(ltemp, SHIFT_KG + time_constant);
+	ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE);
+	ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE);
 	time_offset -= ltemp;
 	time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
-    }
-
-    /*
-     * Compute the frequency estimate and additional phase
-     * adjustment due to frequency error for the next
-     * second. When the PPS signal is engaged, gnaw on the
-     * watchdog counter and update the frequency computed by
-     * the pll and the PPS signal.
-     */
-    pps_valid++;
-    if (pps_valid == PPS_VALID) {	/* PPS signal lost */
-	pps_jitter = MAXTIME;
-	pps_stabil = MAXFREQ;
-	time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
-			 STA_PPSWANDER | STA_PPSERROR);
-    }
-    ltemp = time_freq + pps_freq;
-    if (ltemp < 0)
-	time_adj -= -ltemp >>
-	    (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
-    else
-	time_adj += ltemp >>
-	    (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
+
+	/*
+	 * Compute the frequency estimate and additional phase adjustment due
+	 * to frequency error for the next second. When the PPS signal is
+	 * engaged, gnaw on the watchdog counter and update the frequency
+	 * computed by the pll and the PPS signal.
+	 */
+	pps_valid++;
+	if (pps_valid == PPS_VALID) {	/* PPS signal lost */
+		pps_jitter = MAXTIME;
+		pps_stabil = MAXFREQ;
+		time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
+				STA_PPSWANDER | STA_PPSERROR);
+	}
+	ltemp = time_freq + pps_freq;
+	time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE));
 
 #if HZ == 100
-    /* Compensate for (HZ==100) != (1 << SHIFT_HZ).
-     * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14)
-     */
-    if (time_adj < 0)
-	time_adj -= (-time_adj >> 2) + (-time_adj >> 5);
-    else
-	time_adj += (time_adj >> 2) + (time_adj >> 5);
+	/*
+	 * Compensate for (HZ==100) != (1 << SHIFT_HZ).  Add 25% and 3.125% to
+	 * get 128.125; => only 0.125% error (p. 14)
+	 */
+	time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5);
+#endif
+#if HZ == 250
+	/*
+	 * Compensate for (HZ==250) != (1 << SHIFT_HZ).  Add 1.5625% and
+	 * 0.78125% to get 255.85938; => only 0.05% error (p. 14)
+	 */
+	time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
 #endif
 #if HZ == 1000
-    /* Compensate for (HZ==1000) != (1 << SHIFT_HZ).
-     * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
-     */
-    if (time_adj < 0)
-	time_adj -= (-time_adj >> 6) + (-time_adj >> 7);
-    else
-	time_adj += (time_adj >> 6) + (time_adj >> 7);
+	/*
+	 * Compensate for (HZ==1000) != (1 << SHIFT_HZ).  Add 1.5625% and
+	 * 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
+	 */
+	time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
 #endif
 }
 
@@ -768,23 +721,20 @@ static void update_wall_time_one_tick(void)
 {
 	long time_adjust_step, delta_nsec;
 
-	if ( (time_adjust_step = time_adjust) != 0 ) {
-	    /* We are doing an adjtime thing. 
-	     *
-	     * Prepare time_adjust_step to be within bounds.
-	     * Note that a positive time_adjust means we want the clock
-	     * to run faster.
-	     *
-	     * Limit the amount of the step to be in the range
-	     * -tickadj .. +tickadj
-	     */
-	     if (time_adjust > tickadj)
-		time_adjust_step = tickadj;
-	     else if (time_adjust < -tickadj)
-		time_adjust_step = -tickadj;
-
-	    /* Reduce by this step the amount of time left  */
-	    time_adjust -= time_adjust_step;
+	if ((time_adjust_step = time_adjust) != 0 ) {
+		/*
+		 * We are doing an adjtime thing.  Prepare time_adjust_step to
+		 * be within bounds.  Note that a positive time_adjust means we
+		 * want the clock to run faster.
+		 *
+		 * Limit the amount of the step to be in the range
+		 * -tickadj .. +tickadj
+		 */
+		time_adjust_step = min(time_adjust_step, (long)tickadj);
+		time_adjust_step = max(time_adjust_step, (long)-tickadj);
+
+		/* Reduce by this step the amount of time left  */
+		time_adjust -= time_adjust_step;
 	}
 	delta_nsec = tick_nsec + time_adjust_step * 1000;
 	/*
@@ -792,13 +742,8 @@ static void update_wall_time_one_tick(void)
 	 * advance the tick more.
 	 */
 	time_phase += time_adj;
-	if (time_phase <= -FINENSEC) {
-		long ltemp = -time_phase >> (SHIFT_SCALE - 10);
-		time_phase += ltemp << (SHIFT_SCALE - 10);
-		delta_nsec -= ltemp;
-	}
-	else if (time_phase >= FINENSEC) {
-		long ltemp = time_phase >> (SHIFT_SCALE - 10);
+	if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) {
+		long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10));
 		time_phase -= ltemp << (SHIFT_SCALE - 10);
 		delta_nsec += ltemp;
 	}
@@ -950,6 +895,7 @@ void do_timer(struct pt_regs *regs)
 {
 	jiffies_64++;
 	update_times();
+	softlockup_tick(regs);
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM
@@ -1023,7 +969,7 @@ asmlinkage long sys_getppid(void)
 	parent = me->group_leader->real_parent;
 	for (;;) {
 		pid = parent->tgid;
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
 {
 		struct task_struct *old = parent;
 
@@ -1127,8 +1073,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout)
 		if (timeout < 0)
 		{
 			printk(KERN_ERR "schedule_timeout: wrong timeout "
-			       "value %lx from %p\n", timeout,
-			       __builtin_return_address(0));
+				"value %lx from %p\n", timeout,
+				__builtin_return_address(0));
 			current->state = TASK_RUNNING;
 			goto out;
 		}
@@ -1136,12 +1082,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout)
 
 	expire = timeout + jiffies;
 
-	init_timer(&timer);
-	timer.expires = expire;
-	timer.data = (unsigned long) current;
-	timer.function = process_timeout;
-
-	add_timer(&timer);
+	setup_timer(&timer, process_timeout, (unsigned long)current);
+	__mod_timer(&timer, expire);
 	schedule();
 	del_singleshot_timer_sync(&timer);
 
@@ -1150,9 +1092,26 @@ fastcall signed long __sched schedule_timeout(signed long timeout)
  out:
 	return timeout < 0 ? 0 : timeout;
 }
-
 EXPORT_SYMBOL(schedule_timeout);
 
+/*
+ * We can use __set_current_state() here because schedule_timeout() calls
+ * schedule() unconditionally.
+ */
+signed long __sched schedule_timeout_interruptible(signed long timeout)
+{
+	__set_current_state(TASK_INTERRUPTIBLE);
+	return schedule_timeout(timeout);
+}
+EXPORT_SYMBOL(schedule_timeout_interruptible);
+
+signed long __sched schedule_timeout_uninterruptible(signed long timeout)
+{
+	__set_current_state(TASK_UNINTERRUPTIBLE);
+	return schedule_timeout(timeout);
+}
+EXPORT_SYMBOL(schedule_timeout_uninterruptible);
+
 /* Thread ID - the internal kernel "pid" */
 asmlinkage long sys_gettid(void)
 {
@@ -1169,8 +1128,7 @@ static long __sched nanosleep_restart(struct restart_block *restart)
 	if (!time_after(expire, now))
 		return 0;
 
-	current->state = TASK_INTERRUPTIBLE;
-	expire = schedule_timeout(expire - now);
+	expire = schedule_timeout_interruptible(expire - now);
 
 	ret = 0;
 	if (expire) {
@@ -1198,8 +1156,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us
 		return -EINVAL;
 
 	expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
-	current->state = TASK_INTERRUPTIBLE;
-	expire = schedule_timeout(expire);
+	expire = schedule_timeout_interruptible(expire);
 
 	ret = 0;
 	if (expire) {
@@ -1428,7 +1385,7 @@ static inline u64 time_interpolator_get_cycles(unsigned int src)
 	}
 }
 
-static inline u64 time_interpolator_get_counter(void)
+static inline u64 time_interpolator_get_counter(int writelock)
 {
 	unsigned int src = time_interpolator->source;
 
@@ -1442,6 +1399,15 @@ static inline u64 time_interpolator_get_counter(void)
 			now = time_interpolator_get_cycles(src);
 			if (lcycle && time_after(lcycle, now))
 				return lcycle;
+
+			/* When holding the xtime write lock, there's no need
+			 * to add the overhead of the cmpxchg.  Readers are
+			 * force to retry until the write lock is released.
+			 */
+			if (writelock) {
+				time_interpolator->last_cycle = now;
+				return now;
+			}
 			/* Keep track of the last timer value returned. The use of cmpxchg here
 			 * will cause contention in an SMP environment.
 			 */
@@ -1455,7 +1421,7 @@ static inline u64 time_interpolator_get_counter(void)
 void time_interpolator_reset(void)
 {
 	time_interpolator->offset = 0;
-	time_interpolator->last_counter = time_interpolator_get_counter();
+	time_interpolator->last_counter = time_interpolator_get_counter(1);
 }
 
 #define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift)
@@ -1467,7 +1433,7 @@ unsigned long time_interpolator_get_offset(void)
 		return 0;
 
 	return time_interpolator->offset +
-		GET_TI_NSECS(time_interpolator_get_counter(), time_interpolator);
+		GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator);
 }
 
 #define INTERPOLATOR_ADJUST 65536
@@ -1482,16 +1448,18 @@ static void time_interpolator_update(long delta_nsec)
 	if (!time_interpolator)
 		return;
 
-	/* The interpolator compensates for late ticks by accumulating
-         * the late time in time_interpolator->offset. A tick earlier than
-	 * expected will lead to a reset of the offset and a corresponding
-	 * jump of the clock forward. Again this only works if the
-	 * interpolator clock is running slightly slower than the regular clock
-	 * and the tuning logic insures that.
-         */
+	/*
+	 * The interpolator compensates for late ticks by accumulating the late
+	 * time in time_interpolator->offset. A tick earlier than expected will
+	 * lead to a reset of the offset and a corresponding jump of the clock
+	 * forward. Again this only works if the interpolator clock is running
+	 * slightly slower than the regular clock and the tuning logic insures
+	 * that.
+	 */
 
-	counter = time_interpolator_get_counter();
-	offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator);
+	counter = time_interpolator_get_counter(1);
+	offset = time_interpolator->offset +
+			GET_TI_NSECS(counter, time_interpolator);
 
 	if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
 		time_interpolator->offset = offset - delta_nsec;
@@ -1588,10 +1556,8 @@ void msleep(unsigned int msecs)
 {
 	unsigned long timeout = msecs_to_jiffies(msecs) + 1;
 
-	while (timeout) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		timeout = schedule_timeout(timeout);
-	}
+	while (timeout)
+		timeout = schedule_timeout_uninterruptible(timeout);
 }
 
 EXPORT_SYMBOL(msleep);
@@ -1604,10 +1570,8 @@ unsigned long msleep_interruptible(unsigned int msecs)
 {
 	unsigned long timeout = msecs_to_jiffies(msecs) + 1;
 
-	while (timeout && !signal_pending(current)) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		timeout = schedule_timeout(timeout);
-	}
+	while (timeout && !signal_pending(current))
+		timeout = schedule_timeout_interruptible(timeout);
 	return jiffies_to_msecs(timeout);
 }
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 259cf55da3c9..7cee222231bc 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -12,6 +12,8 @@
  *   Andrew Morton <andrewm@uow.edu.au>
  *   Kai Petzke <wpp@marie.physik.tu-berlin.de>
  *   Theodore Ts'o <tytso@mit.edu>
+ *
+ * Made to use alloc_percpu by Christoph Lameter <clameter@sgi.com>.
  */
 
 #include <linux/module.h>
@@ -57,7 +59,7 @@ struct cpu_workqueue_struct {
  * per-CPU workqueues:
  */
 struct workqueue_struct {
-	struct cpu_workqueue_struct cpu_wq[NR_CPUS];
+	struct cpu_workqueue_struct *cpu_wq;
 	const char *name;
 	struct list_head list; 	/* Empty if single thread */
 };
@@ -102,7 +104,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
 		if (unlikely(is_single_threaded(wq)))
 			cpu = 0;
 		BUG_ON(!list_empty(&work->entry));
-		__queue_work(wq->cpu_wq + cpu, work);
+		__queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
 		ret = 1;
 	}
 	put_cpu();
@@ -118,7 +120,7 @@ static void delayed_work_timer_fn(unsigned long __data)
 	if (unlikely(is_single_threaded(wq)))
 		cpu = 0;
 
-	__queue_work(wq->cpu_wq + cpu, work);
+	__queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
 }
 
 int fastcall queue_delayed_work(struct workqueue_struct *wq,
@@ -265,13 +267,13 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
 
 	if (is_single_threaded(wq)) {
 		/* Always use cpu 0's area. */
-		flush_cpu_workqueue(wq->cpu_wq + 0);
+		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, 0));
 	} else {
 		int cpu;
 
 		lock_cpu_hotplug();
 		for_each_online_cpu(cpu)
-			flush_cpu_workqueue(wq->cpu_wq + cpu);
+			flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
 		unlock_cpu_hotplug();
 	}
 }
@@ -279,7 +281,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
 static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
 						   int cpu)
 {
-	struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu;
+	struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
 	struct task_struct *p;
 
 	spin_lock_init(&cwq->lock);
@@ -308,13 +310,11 @@ struct workqueue_struct *__create_workqueue(const char *name,
 	struct workqueue_struct *wq;
 	struct task_struct *p;
 
-	BUG_ON(strlen(name) > 10);
-
-	wq = kmalloc(sizeof(*wq), GFP_KERNEL);
+	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
 	if (!wq)
 		return NULL;
-	memset(wq, 0, sizeof(*wq));
 
+	wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
 	wq->name = name;
 	/* We don't need the distraction of CPUs appearing and vanishing. */
 	lock_cpu_hotplug();
@@ -356,7 +356,7 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
 	unsigned long flags;
 	struct task_struct *p;
 
-	cwq = wq->cpu_wq + cpu;
+	cwq = per_cpu_ptr(wq->cpu_wq, cpu);
 	spin_lock_irqsave(&cwq->lock, flags);
 	p = cwq->thread;
 	cwq->thread = NULL;
@@ -383,6 +383,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
 		spin_unlock(&workqueue_lock);
 	}
 	unlock_cpu_hotplug();
+	free_percpu(wq->cpu_wq);
 	kfree(wq);
 }
 
@@ -461,7 +462,7 @@ int current_is_keventd(void)
 
 	BUG_ON(!keventd_wq);
 
-	cwq = keventd_wq->cpu_wq + cpu;
+	cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu);
 	if (current == cwq->thread)
 		ret = 1;
 
@@ -473,7 +474,7 @@ int current_is_keventd(void)
 /* Take the work from this (downed) CPU. */
 static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
 {
-	struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu;
+	struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
 	LIST_HEAD(list);
 	struct work_struct *work;
 
@@ -484,7 +485,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
 		printk("Taking work for %s\n", wq->name);
 		work = list_entry(list.next,struct work_struct,entry);
 		list_del(&work->entry);
-		__queue_work(wq->cpu_wq + smp_processor_id(), work);
+		__queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work);
 	}
 	spin_unlock_irq(&cwq->lock);
 }
@@ -501,7 +502,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 	case CPU_UP_PREPARE:
 		/* Create a new workqueue thread for it. */
 		list_for_each_entry(wq, &workqueues, list) {
-			if (create_workqueue_thread(wq, hotcpu) < 0) {
+			if (!create_workqueue_thread(wq, hotcpu)) {
 				printk("workqueue for %i failed\n", hotcpu);
 				return NOTIFY_BAD;
 			}
@@ -511,15 +512,18 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 	case CPU_ONLINE:
 		/* Kick off worker threads. */
 		list_for_each_entry(wq, &workqueues, list) {
-			kthread_bind(wq->cpu_wq[hotcpu].thread, hotcpu);
-			wake_up_process(wq->cpu_wq[hotcpu].thread);
+			struct cpu_workqueue_struct *cwq;
+
+			cwq = per_cpu_ptr(wq->cpu_wq, hotcpu);
+			kthread_bind(cwq->thread, hotcpu);
+			wake_up_process(cwq->thread);
 		}
 		break;
 
 	case CPU_UP_CANCELED:
 		list_for_each_entry(wq, &workqueues, list) {
 			/* Unbind so it can run. */
-			kthread_bind(wq->cpu_wq[hotcpu].thread,
+			kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread,
 				     smp_processor_id());
 			cleanup_workqueue_thread(wq, hotcpu);
 		}
author	Thomas Gleixner <tglx@mtd.linutronix.de>	2005-11-06 15:36:37 +0100
committer	Thomas Gleixner <tglx@mtd.linutronix.de>	2005-11-06 15:36:37 +0100
commit	2fc2991175bf77395e6b15fe6b2304d3bf72da40 (patch)
tree	b0ff38c09240e7c00e1577d447ebe89143d752dc /kernel
parent	8b491d750885ebe8e7d385ce4186c85957d67123 (diff)
parent	7015faa7df829876a0f931cd18aa6d7c24a1b581 (diff)